Diffstat (limited to 'source/luametatex/source/libraries')
-rw-r--r-- source/luametatex/source/libraries/avl/avl.c | 2040
-rw-r--r-- source/luametatex/source/libraries/avl/avl.h | 445
-rw-r--r-- source/luametatex/source/libraries/avl/readme.txt | 20
-rw-r--r-- source/luametatex/source/libraries/decnumber/decContext.c | 437
-rw-r--r-- source/luametatex/source/libraries/decnumber/decContext.h | 254
-rw-r--r-- source/luametatex/source/libraries/decnumber/decNumber.c | 8145
-rw-r--r-- source/luametatex/source/libraries/decnumber/decNumber.h | 182
-rw-r--r-- source/luametatex/source/libraries/decnumber/decNumberLocal.h | 757
-rw-r--r-- source/luametatex/source/libraries/hnj/hnjhyphen.c | 627
-rw-r--r-- source/luametatex/source/libraries/hnj/hnjhyphen.h | 123
-rw-r--r-- source/luametatex/source/libraries/libcerf/CHANGELOG | 118
-rw-r--r-- source/luametatex/source/libraries/libcerf/LICENSE | 22
-rw-r--r-- source/luametatex/source/libraries/libcerf/README.md | 109
-rw-r--r-- source/luametatex/source/libraries/libcerf/cerf.h | 93
-rw-r--r-- source/luametatex/source/libraries/libcerf/defs.h | 97
-rw-r--r-- source/luametatex/source/libraries/libcerf/erfcx.c | 528
-rw-r--r-- source/luametatex/source/libraries/libcerf/err_fcts.c | 438
-rw-r--r-- source/luametatex/source/libraries/libcerf/experimental.c | 178
-rw-r--r-- source/luametatex/source/libraries/libcerf/im_w_of_x.c | 519
-rw-r--r-- source/luametatex/source/libraries/libcerf/readme-luametatex.txt | 26
-rw-r--r-- source/luametatex/source/libraries/libcerf/w_of_z.c | 393
-rw-r--r-- source/luametatex/source/libraries/libcerf/width.c | 100
-rw-r--r-- source/luametatex/source/libraries/mimalloc/CMakeLists.txt | 413
-rw-r--r-- source/luametatex/source/libraries/mimalloc/LICENSE | 21
-rw-r--r-- source/luametatex/source/libraries/mimalloc/cmake/mimalloc-config-version.cmake | 19
-rw-r--r-- source/luametatex/source/libraries/mimalloc/cmake/mimalloc-config.cmake | 14
-rw-r--r-- source/luametatex/source/libraries/mimalloc/include/mimalloc-atomic.h | 338
-rw-r--r-- source/luametatex/source/libraries/mimalloc/include/mimalloc-internal.h | 1049
-rw-r--r-- source/luametatex/source/libraries/mimalloc/include/mimalloc-new-delete.h | 57
-rw-r--r-- source/luametatex/source/libraries/mimalloc/include/mimalloc-override.h | 67
-rw-r--r-- source/luametatex/source/libraries/mimalloc/include/mimalloc-types.h | 598
-rw-r--r-- source/luametatex/source/libraries/mimalloc/include/mimalloc.h | 453
-rw-r--r-- source/luametatex/source/libraries/mimalloc/readme.md | 716
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/alloc-aligned.c | 261
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/alloc-override-osx.c | 458
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/alloc-override.c | 281
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/alloc-posix.c | 181
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/alloc.c | 934
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/arena.c | 446
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/bitmap.c | 395
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/bitmap.h | 107
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/heap.c | 580
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/init.c | 693
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/options.c | 627
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/os.c | 1443
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/page-queue.c | 331
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/page.c | 869
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/random.c | 367
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/region.c | 505
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/segment-cache.c | 360
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/segment.c | 1544
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/static.c | 39
-rw-r--r-- source/luametatex/source/libraries/mimalloc/src/stats.c | 584
-rw-r--r-- source/luametatex/source/libraries/miniz/ChangeLog.md | 196
-rw-r--r-- source/luametatex/source/libraries/miniz/LICENSE | 22
-rw-r--r-- source/luametatex/source/libraries/miniz/miniz.c | 7733
-rw-r--r-- source/luametatex/source/libraries/miniz/miniz.h | 1350
-rw-r--r-- source/luametatex/source/libraries/miniz/readme.md | 34
-rw-r--r-- source/luametatex/source/libraries/miniz/readme.txt | 8
-rw-r--r-- source/luametatex/source/libraries/pplib/html.zip | bin 0 -> 280070 bytes
-rw-r--r-- source/luametatex/source/libraries/pplib/ppapi.h | 404
-rw-r--r-- source/luametatex/source/libraries/pplib/pparray.c | 145
-rw-r--r-- source/luametatex/source/libraries/pplib/pparray.h | 7
-rw-r--r-- source/luametatex/source/libraries/pplib/ppconf.h | 76
-rw-r--r-- source/luametatex/source/libraries/pplib/ppcrypt.c | 748
-rw-r--r-- source/luametatex/source/libraries/pplib/ppcrypt.h | 70
-rw-r--r-- source/luametatex/source/libraries/pplib/ppdict.c | 166
-rw-r--r-- source/luametatex/source/libraries/pplib/ppdict.h | 7
-rw-r--r-- source/luametatex/source/libraries/pplib/ppfilter.h | 10
-rw-r--r-- source/luametatex/source/libraries/pplib/ppheap.c | 40
-rw-r--r-- source/luametatex/source/libraries/pplib/ppheap.h | 46
-rw-r--r-- source/luametatex/source/libraries/pplib/pplib.h | 22
-rw-r--r-- source/luametatex/source/libraries/pplib/ppload.c | 2769
-rw-r--r-- source/luametatex/source/libraries/pplib/ppload.h | 58
-rw-r--r-- source/luametatex/source/libraries/pplib/ppstream.c | 491
-rw-r--r-- source/luametatex/source/libraries/pplib/ppstream.h | 10
-rw-r--r-- source/luametatex/source/libraries/pplib/pptest1.c | 104
-rw-r--r-- source/luametatex/source/libraries/pplib/pptest2.c | 170
-rw-r--r-- source/luametatex/source/libraries/pplib/pptest3.c | 123
-rw-r--r-- source/luametatex/source/libraries/pplib/ppxref.c | 215
-rw-r--r-- source/luametatex/source/libraries/pplib/ppxref.h | 35
-rw-r--r-- source/luametatex/source/libraries/pplib/readme.txt | 3
-rw-r--r-- source/luametatex/source/libraries/pplib/util/README.md | 8
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilbasexx.c | 1742
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilbasexx.h | 111
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilcrypt.c | 1190
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilcrypt.h | 90
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilcryptdef.h | 32
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utildecl.h | 28
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilflate.c | 322
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilflate.h | 21
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilfpred.c | 778
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilfpred.h | 23
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utiliof.c | 2993
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utiliof.h | 673
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utillog.c | 60
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utillog.h | 10
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utillzw.c | 705
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utillzw.h | 30
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmd5.c | 447
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmd5.h | 49
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmem.c | 67
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmem.h | 16
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmemallc.h | 569
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmemallh.h | 36
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmemheap.c | 1078
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmemheap.h | 188
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmemheapiof.c | 142
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmemheapiof.h | 43
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmeminfo.c | 38
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilmeminfo.h | 9
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilnumber.c | 1177
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilnumber.h | 428
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilplat.h | 31
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilsha.c | 1065
-rw-r--r-- source/luametatex/source/libraries/pplib/util/utilsha.h | 79
-rw-r--r-- source/luametatex/source/libraries/readme.txt | 25
117 files changed, 59766 insertions, 0 deletions
diff --git a/source/luametatex/source/libraries/avl/avl.c b/source/luametatex/source/libraries/avl/avl.c
new file mode 100644
index 000000000..46e0bcd50
--- /dev/null
+++ b/source/luametatex/source/libraries/avl/avl.c
@@ -0,0 +1,2040 @@
+/*
+ This small C package is made of an independent set of routines dedicated to manipulating AVL
+ trees (files avl.c, avl.h), and of an extension module for Python that builds upon it (file
+ avlmodule.c). Unlike collectionsmodule.c, the latter file contains only Python bindings: it
+ adds nothing to the underlying implementation.
+
+ license: this package, pyavl, is donated to the public domain
+ author : Richard McGraw
+ Email : dasnar@fastmail.fm
+*/
+
+/*
+    This file is reformatted a little. As there have not been any changes to the original, we
+    assume this is okay. No changes means that the code is fine and we never ran into issues.
+    Also, we always check for NULL here.
+
+    The avl code is used for hashing strings. It was also used in the backend of luatex, but in
+    luametatex we don't have that, so it is now only used in mplib. Actually, two modules are
+    used in luatex: one for metapost and one for tex, and they are somewhat different. So, I
+    took the most extensive one.
+
+ Todo: Rename some variables to avoid a compiler warning.
+ Todo: Maybe abstract the error messages and make them a callback.
+ Todo: Maybe some more if/else/local (likely branch prediction).
+ Todo: Maybe turn some common code into functions (just for fun, will make the source smaller).
+*/
+
+# include "avl.h"
+
+# ifdef AVL_SHOW_ERROR_ON
+ # define AVL_SHOW_ERROR(fmt,arg) fprintf(stderr, "! avl.c: " fmt, arg)
+# else
+ # define AVL_SHOW_ERROR(fmt,arg) (void) (fmt), (void) (arg)
+# endif
+
+const void *avl_default_item_copy(const void *item)
+{
+ return (const void *) item;
+}
+
+void *avl_default_item_dispose(void *item)
+{
+ (void) item;
+ return (void *) NULL;
+}
+
+/* integral type to encode rank and skew bits */
+
+typedef uint32_t rbal_t;
+
+/* avl_node structure */
+
+typedef struct avl_node { /* aligned */
+ struct avl_node *sub[2];
+ struct avl_node *up;
+ void *item;
+ rbal_t rbal;
+ int padding;
+} avl_node;
+
+/*
+ * avl_tree structure
+ */
+
+struct avl_tree_ { /* aligned */
+ avl_node *root;
+ avl_size_t count; /* how many nodes in tree rooted at [root] */
+ int padding; /* alignment */
+ avl_compare_func compare; /* compare items */
+ avl_item_copy_func copy;
+ avl_item_dispose_func dispose;
+ avl_alloc_func alloc; /* to allocate memory (same signature as malloc) */
+ avl_dealloc_func dealloc; /* to deallocate memory (same signature as free) */
+ void *param;
+};
+
+# define item_compare(cmp, tree, item1, item2) (*cmp)(tree->param, item1, item2)
+
+# define sub_left(a) (a)->sub[0]
+# define sub_right(a) (a)->sub[1]
+# define get_item(a) (a)->item
+
+/* RANK(a) = size of left subtree + 1 */
+
+# define rbal(a) (a)->rbal
+# define rzero(a) (rbal(a) & ~3)
+# define get_bal(a) (rbal(a) & 3)
+# define is_lskew(a) (rbal(a) & 1)
+# define is_rskew(a) (rbal(a)>>1 & 1)
+# define set_lskew(a) (rbal(a) |= 1)
+# define set_rskew(a) (rbal(a) |= 2)
+# define set_skew(a,d) (rbal(a) |= (1 << d))
+# define unset_lskew(a) (rbal(a) &= ~1)
+# define unset_rskew(a) (rbal(a) &= ~2)
+# define get_rank(a) (rbal(a) >> 2)
+# define set_rank(a,r) (rbal(a) = (r<<2 | get_bal(a)))
+# define incr_rank(a,r) (rbal(a) += r<<2)
+# define decr_rank(a,r) (rbal(a) -= r<<2)
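+
+/*
+    An illustrative sketch of this packing (not from the original sources, just the macros
+    above applied to concrete values): the two low bits hold the skew flags, the remaining
+    bits hold the rank, i.e. the size of the left subtree plus one.
+
+        avl_node n;
+        rbal(&n) = 4u;      // rank 1 (empty left subtree), no skew
+        set_rskew(&n);      // rbal == 6u : rank 1, right skew
+        incr_rank(&n, 2);   // rbal == 14u: rank 3, right skew
+        // now get_rank(&n) == 3, get_bal(&n) == 2, is_rskew(&n) == 1
+*/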
+
+# define AVL_MIN_DEPTH 0
+
+/* helper structure */
+
+typedef enum {
+ OP_BACKUP,
+ OP_DETACH,
+ OP_FREE
+} whichop_t;
+
+typedef struct ptr_handler { /* swapped and aligned */
+ void *ptr;
+ whichop_t whichop;
+ int padding;
+} ptr_handler;
+
+static void clear_node(avl_node *a)
+{
+ sub_left(a) = NULL;
+ sub_right(a) = NULL;
+ (a)->up = NULL;
+ rbal(a) = 4u;
+}
+
+/* Called by 'avl_ins', 'avl_dup', 'node_slice' */
+
+static avl_node *new_node(void *item, avl_node *up, avl_tree t)
+{
+ avl_node *a = (*t->alloc)(sizeof(avl_node));
+ if (a) {
+ sub_left(a) = NULL;
+ sub_right(a) = NULL;
+ a->up = up;
+ a->rbal = 4u;
+ a->item = (*t->copy)(item);
+ } else {
+ AVL_SHOW_ERROR("%s\n", "couldn't allocate node");
+ }
+ return a;
+}
+
+static void free_node(avl_node *a, avl_tree t)
+{
+ a->item = (*t->dispose)(a->item);
+ (*t->dealloc)(a);
+}
+
+/* function to detach node [a] from tree [t] (compiler will inline if needed) */
+
+static void detach_node(avl_node *a, avl_tree t, struct ptr_handler *h)
+{
+ clear_node(a);
+ do {
+ if (! h) {
+ /* nothing */
+ } else if (h->whichop == OP_DETACH) {
+ h->ptr = a;
+ break;
+ } else if (h->whichop == OP_BACKUP) {
+ h->ptr = (*t->copy)(a->item);
+ }
+ free_node(a, t);
+ } while (0);
+ t->count--;
+}
+
+/* Tree methods */
+
+avl_tree avl_create (
+ avl_compare_func compare,
+ avl_item_copy_func copy,
+ avl_item_dispose_func dispose,
+ avl_alloc_func alloc,
+ avl_dealloc_func dealloc,
+ void *param
+)
+{
+ avl_tree t = (*alloc)(sizeof(struct avl_tree_));
+ if (t) {
+ t->root = NULL;
+ t->count = 0;
+ t->param = param;
+ t->compare = compare;
+ t->copy = copy;
+ t->dispose = dispose;
+ t->alloc = alloc;
+ t->dealloc = dealloc;
+ } else {
+ AVL_SHOW_ERROR("%s\n", "couldn't create new handle in avl_create()");
+ }
+ return t;
+}
+
+/* Empty the tree, using rotations */
+
+static void node_empty(avl_tree t)
+{
+ avl_node *a, *p;
+ for (a = t->root; a != NULL;) {
+ p = a;
+ if (! sub_right(a)) {
+ a = sub_left(a);
+ } else {
+ while (sub_left(a)) {
+ /* rotR(a) */
+ a = sub_left(a);
+ sub_left(p) = sub_right(a);
+ sub_right(a) = p;
+ p = a;
+ }
+ a = sub_right(p);
+ }
+ free_node(p, t);
+ t->count--;
+ }
+ t->root = NULL;
+}
+
+/* [t] is an existing tree handle; this function invokes node_empty() */
+
+void avl_reset (
+ avl_tree t,
+ avl_compare_func compare,
+ avl_item_copy_func copy,
+ avl_item_dispose_func dispose,
+ avl_alloc_func alloc,
+ avl_dealloc_func dealloc
+)
+{
+ if (t) {
+ node_empty(t);
+ t->compare = compare;
+ t->copy = copy;
+ t->dispose = dispose;
+ t->alloc = alloc;
+ t->dealloc = dealloc;
+ }
+}
+
+void avl_empty(avl_tree t)
+{
+ if (t) {
+ node_empty(t);
+ }
+}
+
+/* Destroy nodes, free handle */
+
+void avl_destroy(avl_tree t)
+{
+ if (t) {
+ node_empty(t);
+ (*t->dealloc)(t);
+ }
+}
+
+avl_tree avl_dup(avl_tree t, void *param)
+{
+ if (t) {
+ avl_tree tt = avl_create(t->compare, t->copy, t->dispose, t->alloc, t->dealloc, param);
+ if (tt) {
+ tt->count = t->count;
+ if (t->root == NULL) {
+ return tt;
+ } else {
+ avl_node *a, *c, *s;
+ a = t->root;
+ tt->root = c = new_node(get_item(a), NULL, t);
+ if (c) {
+ sub_right(c) = NULL;
+ rbal(c) = rbal(a);
+ while (1) {
+ while (sub_left(a) != NULL) {
+ a = sub_left(a);
+ sub_left(c) = s = new_node(get_item(a), NULL, t);
+ if (s) {
+ s->up = c;
+ sub_right(s) = c;
+ c = s;
+ rbal(c) = rbal(a);
+ } else {
+ goto recover;
+ }
+ }
+ sub_left(c) = NULL;
+ while (sub_right(a) == NULL) {
+ s = sub_right(c);
+ sub_right(c) = NULL;
+ c = s;
+ /* Find successor of [a] in original tree */
+ do {
+ s = a;
+ a = s->up;
+ if (a == NULL) {
+ return tt;
+ }
+ }
+ while (s != sub_left(a));
+ }
+ a = sub_right(a);
+ s = new_node(get_item(a), NULL, t);
+ if (s) {
+ sub_right(s) = sub_right(c);
+ sub_right(c) = s;
+ s->up = c;
+ c = s;
+ rbal(c) = rbal(a);
+ } else {
+ goto recover;
+ }
+ }
+ /* recovery code */
+ recover:
+ while (1) {
+ s = sub_right(c);
+ sub_right(c) = NULL;
+ if (s) {
+ c = s;
+ } else {
+ break;
+ }
+ }
+ node_empty(tt);
+ abort:
+ (*t->dealloc)(tt);
+ AVL_SHOW_ERROR("%s\n", "couldn't allocate node in avl_dup()");
+ return NULL;
+ } else {
+ goto abort;
+ }
+ }
+ } else {
+ AVL_SHOW_ERROR("%s\n", "couldn't create new handle in avl_dup()");
+ }
+ }
+ return NULL;
+}
+
+avl_bool_t avl_isempty(avl_tree t)
+{
+ return t == NULL || t->root == NULL;
+}
+
+avl_size_t avl_size(avl_tree t)
+{
+ return t == NULL ? 0 : t->count;
+}
+
+static int depth(avl_node *a)
+{
+ int h = AVL_MIN_DEPTH;
+ for (; a != NULL; ++h) {
+ a = a->sub[is_rskew(a)];
+ }
+ return h;
+}
+
+static avl_node *node_first(avl_node *a)
+{
+ while (sub_left(a)) {
+ a = sub_left(a);
+ }
+ return a;
+}
+
+static avl_node *node_last(avl_node *a)
+{
+ while (sub_right(a)) {
+ a = sub_right(a);
+ }
+ return a;
+}
+
+/* [a] : non-null */
+
+static avl_node *node_next(avl_node *a)
+{
+ if (sub_right(a)) {
+ return node_first (sub_right(a));
+ } else {
+ avl_node *p;
+ do {
+ p = a;
+ a = p->up;
+ } while (a && sub_right(a) == p);
+ return a;
+ }
+}
+
+/* [a] : non-null */
+
+static avl_node *node_prev(avl_node *a)
+{
+ if (sub_left(a)) {
+ return node_last (sub_left(a));
+ } else {
+ avl_node *p;
+ do {
+ p = a;
+ a = p->up;
+ } while (a && sub_left(a) == p);
+ return a;
+ }
+}
+
+static avl_node *node_find(const void *item, avl_tree t)
+{
+ avl_node *a = t->root;
+ avl_compare_func cmp = t->compare;
+ while (a) {
+ int c = item_compare(cmp, t, item, get_item(a));
+ if (c < 0) {
+ a = sub_left(a);
+ } else if (c) {
+ a = sub_right(a);
+ } else {
+ break;
+ }
+ }
+ return a;
+}
+
+static avl_size_t get_index(avl_node *a)
+{
+ avl_size_t n = get_rank(a);
+ avl_node *p;
+ while ((p = a->up)) {
+ if (a != sub_left(p)) {
+ n += get_rank(p);
+ }
+ a = p;
+ }
+ return n;
+}
+
+/* Find item by index */
+
+static avl_node *node_find_index(avl_size_t idx, avl_tree t)
+{
+ avl_node *a = t->root;
+ if (idx == 0 || idx > t->count) {
+ return NULL;
+ } else if (idx == 1) {
+ return node_first(a);
+ } else if (idx == t->count) {
+ return node_last(a);
+ } else {
+ int c;
+ while ((c = (int) (idx - get_rank(a))) != 0) {
+ if (c < 0) {
+ a = sub_left(a);
+ } else {
+ idx = (avl_size_t) c;
+ a = sub_right(a);
+ }
+ }
+ return a;
+ }
+}
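+
+/*
+    A worked example of the rank descent above (illustration only): to find t[5] in a 7-item
+    tree whose root has rank 4, idx - rank = 5 - 4 = 1 > 0, so we descend right looking for
+    index 1 there; in the right subtree the node with rank 1 is its leftmost node, which is
+    t[5] overall.
+*/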
+
+/* Rebalance starting from node [a] where a->sub[d_] is deeper post-insertion */
+
+static avl_code_t rebalance_ins(avl_node *a, int dir, avl_tree t)
+{
+ if (a) {
+ avl_node *p;
+ while (1) {
+ incr_rank(a, (rbal_t) (!dir));
+ if (get_bal(a)) {
+ break;
+ } else {
+ set_skew(a, dir);
+ p = a->up;
+ if (p) {
+ dir = a != sub_left(p);
+ a = p;
+ } else {
+ return 2;
+ }
+ }
+ }
+ /* Now bal(a) == -1 or +1 */
+ /* Rotate if need be */
+ if (dir == 0) {
+ if (is_rskew(a))
+ unset_rskew(a);
+ else {
+ avl_node *u = a->up;
+ avl_node **r = u ? &u->sub[a != sub_left(u)] : &t->root;
+ p = a;
+ if (is_lskew(sub_left(p))) {
+ /* rotR(p) */
+ a = sub_left(p);
+ sub_left(p) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = p;
+ }
+ sub_right(a) = p;
+ unset_lskew(p);
+ rbal(p) -= rzero(a);
+ } else {
+ /* rotLR(p) */
+ a = sub_right(sub_left(p));
+ sub_right(sub_left(p)) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = sub_left(p);
+ }
+ sub_left(p)->up = a;
+ sub_left(a) = sub_left(p);
+ sub_left(p) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = p;
+ }
+ sub_right(a) = p;
+ switch (get_bal(a)) {
+ case 0: /* not skewed */
+ unset_lskew(p);
+ unset_rskew(sub_left(a));
+ break;
+ case 1: /* left skew */
+ unset_lskew(p);
+ set_rskew(p);
+ unset_rskew(sub_left(a));
+ break;
+ case 2: /* right skew */
+ unset_lskew(p);
+ unset_rskew(sub_left(a));
+ set_lskew(sub_left(a));
+ break;
+ }
+ rbal(a) += rzero(sub_left(a));
+ rbal(p) -= rzero(a);
+ }
+ rbal(a) &= ~3;
+ a->up = u;
+ p->up = a;
+ *r = a;
+ }
+ } else if (is_lskew(a)) {
+ unset_lskew(a);
+ } else {
+ avl_node *u = a->up;
+ avl_node **r = u != NULL ? &u->sub[a != sub_left(u)] : &t->root;
+ p = a;
+ if (is_rskew(sub_right(p))) {
+ /* rotL(p) */
+ a = sub_right(p);
+ sub_right(p) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = p;
+ }
+ sub_left(a) = p;
+ unset_rskew(p);
+ rbal(a) += rzero(p);
+ } else {
+ /* rotRL(p) */
+ a = sub_left(sub_right(p));
+ sub_left(sub_right(p)) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = sub_right(p);
+ }
+ sub_right(p)->up = a;
+ sub_right(a) = sub_right(p);
+ sub_right(p) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = p;
+ }
+ sub_left(a) = p;
+ switch (get_bal(a)) {
+ case 0: /* not skewed */
+ unset_rskew(p);
+ unset_lskew(sub_right(a));
+ break;
+ case 1: /* left skew */
+ unset_rskew(p);
+ unset_lskew(sub_right(a));
+ set_rskew(sub_right(a));
+ break;
+ case 2: /* right skew */
+ unset_rskew(p);
+ set_lskew(p);
+ unset_lskew(sub_right(a));
+ break;
+ }
+ rbal(sub_right(a)) -= rzero(a);
+ rbal(a) += rzero(p);
+ }
+ rbal(a) &= ~3;
+ a->up = u;
+ p->up = a;
+ *r = a;
+ }
+ /* The tree rooted at 'a' is now valid */
+ /* Finish adjusting ranks */
+ while ((p = a->up)) {
+ incr_rank(p, (rbal_t)(a == sub_left(p)));
+ a = p;
+ }
+ return 1;
+ }
+ return 2;
+}
+
+/* detach [p] : non-null; only the linkage is tweaked */
+
+static avl_code_t rebalance_del(avl_node *p, avl_tree t, void **backup)
+{
+ rbal_t bal;
+ int dir = 0;
+ avl_node *a = p->up;
+ avl_node **r = a ? &a->sub[dir = p != sub_left(a)] : &t->root;
+ avl_node *c = sub_right(p);
+ if (! c && ! sub_left(p)) {
+ *r = NULL;
+ } else if (! c || ! sub_left(p)) {
+ *r = c ? c : sub_left(p);
+ (*r)->up = a;
+ } else {
+ if (sub_left(c)) {
+ do {
+ c = sub_left(c);
+ } while (sub_left(c));
+ a = c->up;
+ dir = 0;
+ sub_left(a) = sub_right(c);
+ if (sub_right(c)) {
+ sub_right(c)->up = a;
+ }
+ sub_right(c) = sub_right(p);
+ sub_right(c)->up = c;
+ } else {
+ a = c;
+ dir = 1;
+ }
+ sub_left(c) = sub_left(p);
+ sub_left(c)->up = c;
+ c->up = p->up;
+ rbal(c) = rbal(p);
+ *r = c;
+ }
+ if (backup) {
+ *backup = (*t->copy)(p->item);
+ }
+ detach_node(p, t, NULL);
+ /* Start backtracking : subtree of [a] in direction [dir] is less deep */
+ for (;; a = (*r)->up) {
+ if (a == NULL) {
+ return 2;
+ } else {
+ decr_rank(a, (rbal_t)(!dir));
+ bal = get_bal(a);
+ if (dir == 0) {
+ if (bal == 0) {
+ set_rskew(a);
+ break;
+ }
+ if (a->up) {
+ dir = a != sub_left(a->up);
+ r = &a->up->sub[dir];
+ } else {
+ r = &t->root;
+ }
+ if (bal & 1) {
+ unset_lskew(a);
+ }
+ if (get_bal(a)) {
+ p = a;
+ bal = get_bal(sub_right(p));
+ if (! (bal & 1)) {
+ /* bal = 0 or +1 */
+ /* rotL(p) */
+ a = sub_right(p);
+ sub_right(p) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = p;
+ }
+ sub_left(a) = p;
+ if (bal) {
+ unset_rskew(p);
+ unset_rskew(a);
+ } else {
+ set_lskew(a);
+ }
+ rbal(a) += rzero(p);
+ } else {
+ /* rotRL(p) */
+ a = sub_left(sub_right(p));
+ sub_left(sub_right(p)) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = sub_right(p);
+ }
+ sub_right(p)->up = a;
+ sub_right(a) = sub_right(p);
+ sub_right(p) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = p;
+ }
+ sub_left(a) = p;
+ switch (get_bal(a)) {
+ case 0: /* not skewed */
+ unset_rskew(p);
+ unset_lskew(sub_right(a));
+ break;
+ case 1: /* left skew */
+ unset_rskew(p);
+ unset_lskew(sub_right(a));
+ set_rskew(sub_right(a));
+ break;
+ case 2: /* right skew */
+ unset_rskew(p);
+ set_lskew(p);
+ unset_lskew(sub_right(a));
+ break;
+ }
+ rbal(a) &= ~3;
+ rbal(sub_right(a)) -= rzero(a);
+ rbal(a) += rzero(p);
+ }
+ a->up = p->up;
+ p->up = a;
+ /* Done with rotation */
+ *r = a;
+ if (bal == 0) {
+ break;
+ }
+ }
+ } else {
+ /* dir == 1 */
+ if (bal == 0) {
+ set_lskew(a);
+ break;
+ }
+ if (a->up == NULL) {
+ r = &t->root;
+ } else {
+ dir = a != sub_left(a->up);
+ r = &a->up->sub[dir];
+ }
+ if (bal & 2) {
+ unset_rskew(a);
+ }
+ if (get_bal(a)) {
+ p = a;
+ bal = get_bal(sub_left(p));
+ if (! (bal & 2)) {
+ /* bal = 0 or -1 */
+ /* rotR(p) */
+ a = sub_left(p);
+ sub_left(p) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = p;
+ }
+ sub_right(a) = p;
+ if (bal) {
+ unset_lskew(p);
+ unset_lskew(a);
+ } else {
+ set_rskew(a);
+ }
+ rbal(p) -= rzero(a);
+ } else {
+ /* rotLR(p) */
+ a = sub_right(sub_left(p));
+ sub_right(sub_left(p)) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = sub_left(p);
+ }
+ sub_left(p)->up = a;
+ sub_left(a) = sub_left(p);
+ sub_left(p) = sub_right(a);
+ if (sub_right(a) != NULL) {
+ sub_right(a)->up = p;
+ }
+ sub_right(a) = p;
+ switch (get_bal(a)) {
+ case 0: /* not skewed */
+ unset_lskew(p);
+ unset_rskew(sub_left(a));
+ break;
+ case 1: /* left skew */
+ unset_lskew(p);
+ set_rskew(p);
+ unset_rskew(sub_left(a));
+ break;
+ case 2: /* right skew */
+ unset_lskew(p);
+ unset_rskew(sub_left(a));
+ set_lskew(sub_left(a));
+ break;
+ }
+ rbal(a) &= ~3;
+ rbal(a) += rzero(sub_left(a));
+ rbal(p) -= rzero(a);
+ }
+ a->up = p->up;
+ p->up = a;
+ /* Done with rotation */
+ *r = a;
+ if (bal == 0) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ /* Finish adjusting ranks */
+ while ((p = a->up)) {
+ decr_rank(p, (rbal_t)(a == sub_left(p)));
+ a = p;
+ }
+ return 1;
+}
+
+void *avl_first(avl_tree t)
+{
+ if (t && t->root) {
+ return get_item(node_first(t->root));
+ } else {
+ return NULL;
+ }
+}
+
+void *avl_last(avl_tree t)
+{
+ if (t && t->root) {
+ return get_item(node_last(t->root));
+ } else {
+ return NULL;
+ }
+}
+
+void *avl_find(const void *item, avl_tree t)
+{
+ if (t) {
+ avl_node *a = node_find(item, t);
+ return a ? get_item(a) : NULL;
+ } else {
+ return NULL;
+ }
+}
+
+/*
+ Return smallest index i in [1:len] s.t. tree[i] matches [item], or zero if
+ not found
+*/
+
+avl_size_t avl_index(const void *item, avl_tree t)
+{
+ if (item && t && t->root) {
+ avl_compare_func cmp = t->compare;
+ avl_node *a, *p;
+ avl_size_t idx = 0, n = 0;
+ for (a = t->root;;) {
+ int c = item_compare(cmp, t, item, get_item(a));
+ if (! c) {
+ idx = n + get_rank(a);
+ } else if (c > 0) {
+ n += get_rank(a);
+ }
+ p = a->sub[c > 0];
+ if (p) {
+ a = p;
+ } else {
+ return idx;
+ }
+ }
+ } else {
+ return 0;
+ }
+}
+
+/*
+    Compute (lo,hi) where lo is the smallest index s.t. t[lo] >= lo_item (or t->count+1 if
+    there is none) and hi is the greatest index s.t. t[hi] <= hi_item (or 0 if there is none)
+
+avl_code_t avl_span(const void *lo_item, const void *hi_item, avl_tree t, avl_size_t *lo_idx, avl_size_t *hi_idx)
+{
+ if (t) {
+ *lo_idx = t->count + 1;
+ *hi_idx = 0;
+ if (t->root) {
+ avl_compare_func cmp = t->compare;
+ avl_node *a;
+ avl_size_t n = 0;
+ int c = item_compare(cmp, t, lo_item, hi_item) > 0;
+ if (c > 0) {
+ const void *temp = lo_item;
+ lo_item = hi_item;
+ hi_item = temp;
+ }
+ a = t->root;
+ do {
+ c = item_compare(cmp, t, lo_item, get_item(a));
+ if (c > 0) {
+ n += get_rank(a);
+ a = sub_right(a);
+ } else {
+ *lo_idx = n + get_rank(a);
+ a = sub_left(a);
+ }
+ } while (a);
+ a = t->root;
+ do {
+ c = item_compare(cmp, t, hi_item, get_item(a));
+ if (c < 0) {
+ a = sub_left(a);
+ } else {
+ *hi_idx += get_rank(a);
+ a = sub_right(a);
+ }
+ } while (a);
+ return 0;
+ }
+ }
+ return -1;
+}
+
+/* Find the smallest item in tree [t] that is GEQ the passed item */
+
+void *avl_find_atleast(const void *item, avl_tree t)
+{
+ if (t && t->root) {
+ avl_compare_func cmp = t->compare;
+ avl_node *a = t->root;
+ void *p = NULL;
+ do {
+ int c = item_compare(cmp, t, item, get_item(a));
+ if (c > 0) {
+ a = sub_right(a);
+ } else {
+ p = get_item(a);
+ a = sub_left(a);
+ }
+ } while (a);
+ return p;
+ } else {
+ return NULL;
+ }
+}
+
+/* Find the greatest item in tree [t] that is LEQ the passed item */
+
+void *avl_find_atmost(const void *item, avl_tree t)
+{
+ if (t && t->root) {
+ avl_compare_func cmp = t->compare;
+ avl_node *a = t->root;
+ void *p = NULL;
+ do {
+ int c = item_compare(cmp, t, item, get_item(a));
+ if (c < 0) {
+ a = sub_left(a);
+ } else {
+ p = get_item(a);
+ a = sub_right(a);
+ }
+ } while (a);
+ return p;
+ } else {
+ return NULL;
+ }
+}
+
+/* Retrieve item of index [idx] in tree [t] */
+
+void *avl_find_index(avl_size_t idx, avl_tree t)
+{
+ if (t) {
+ avl_node *a = node_find_index(idx, t);
+ return a ? get_item(a) : NULL;
+ } else {
+ return NULL;
+ }
+}
+
+/* Iterative insertion */
+
+avl_code_t avl_ins (void *item, avl_tree t, avl_bool_t allow_duplicates)
+{
+ if (t) {
+ avl_compare_func cmp = t->compare;
+ avl_node **r, *a;
+ int dir = 0;
+ for (r = &t->root, a = NULL; *r != NULL; r = &a->sub[dir = dir > 0]) {
+ a = *r;
+ dir = item_compare(cmp, t, item, get_item(a));
+ if (!dir && !allow_duplicates)
+ return 0;
+ }
+ *r = new_node(item, a, t);
+ if (*r) {
+ t->count++;
+ return rebalance_ins(a, dir, t);
+ } else {
+ return -1;
+ }
+ } else {
+ return 0;
+ }
+}
+
+avl_code_t avl_del(void *item, avl_tree t, void **backup)
+{
+ if (t && t->root) {
+ avl_node *a = node_find(item, t);
+ if (a) {
+ return rebalance_del(a, t, backup);
+ } else {
+ return 0;
+ }
+ } else {
+ return 0;
+ }
+}
+
+/* helper function */
+
+static avl_code_t node_del_first(avl_tree t, struct ptr_handler *h)
+{
+ avl_node *c;
+ avl_node *p = node_first (t->root);
+ avl_node *a = p->up;
+ if (sub_right(p)) {
+ sub_right(p)->up = a;
+ }
+ if (a == NULL) {
+ t->root = sub_right(p);
+ } else {
+ sub_left(a) = sub_right(p);
+ }
+ detach_node (p, t, h);
+ /* Start backtracking : subtree of [a] in direction [0] is less deep */
+ for (;; a = c) {
+ if (a) {
+ rbal_t bal;
+ decr_rank(a, 1);
+ bal = get_bal(a);
+ if (bal == 0) {
+ set_rskew(a);
+ break;
+ } else {
+ if (bal & 1) {
+ unset_lskew(a);
+ }
+ c = a->up;
+ if (get_bal(a)) {
+ p = a;
+ bal = get_bal(sub_right(p));
+ if (! (bal & 1)) {
+ /* bal = 0 or +1 */
+ /* rotL(p) */
+ a = sub_right(p);
+ sub_right(p) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = p;
+ }
+ sub_left(a) = p;
+ if (bal) {
+ unset_rskew(p);
+ unset_rskew(a);
+ } else {
+ set_lskew(a);
+ }
+ rbal(a) += rzero(p);
+ } else {
+ /* rotRL(p) */
+ a = sub_left(sub_right(p));
+ sub_left(sub_right(p)) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = sub_right(p);
+ }
+ sub_right(p)->up = a;
+ sub_right(a) = sub_right(p);
+ sub_right(p) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = p;
+ }
+ sub_left(a) = p;
+ switch (get_bal(a)) {
+ case 0: /* not skewed */
+ unset_rskew(p);
+ unset_lskew(sub_right(a));
+ break;
+ case 1: /* left skew */
+ unset_rskew(p);
+ unset_lskew(sub_right(a));
+ set_rskew(sub_right(a));
+ break;
+ case 2: /* right skew */
+ unset_rskew(p);
+ set_lskew(p);
+ unset_lskew(sub_right(a));
+ break;
+ }
+ rbal(a) &= ~3;
+ rbal(sub_right(a)) -= rzero(a);
+ rbal(a) += rzero(p);
+ }
+ a->up = p->up;
+ p->up = a;
+ /* Done with rotation */
+ if (c) {
+ sub_left(c) = a;
+ } else {
+ t->root = a;
+ }
+ if (bal == 0) {
+ break;
+ }
+ }
+ }
+ } else {
+ return 2;
+ }
+ }
+ /* Finish adjusting ranks */
+ while ((a = a->up)) {
+ decr_rank(a, 1);
+ }
+ return 1;
+}
+
+/* helper function */
+
+static avl_code_t node_del_last(avl_tree t, struct ptr_handler *h)
+{
+
+ avl_node *c;
+ avl_node *p = node_last (t->root);
+ avl_node *a = p->up;
+ if (sub_left(p)) {
+ sub_left(p)->up = a;
+ }
+ if (a) {
+ sub_right(a) = sub_left(p);
+ } else {
+ t->root = sub_left(p);
+ }
+ detach_node(p, t, h);
+ /* Start backtracking : subtree of [a] in direction [1] is less deep */
+ for (;; a = c) {
+ if (a) {
+ rbal_t bal = get_bal(a);
+ if (bal == 0) {
+ set_lskew(a);
+ break;
+ } else {
+ if (bal & 2) {
+ unset_rskew(a);
+ }
+ c = a->up;
+ if (get_bal(a)) {
+ p = a;
+ bal = get_bal(sub_left(p));
+ if (! (bal & 2)) {
+ /* bal = 0 or -1 */
+ /* rotR(p) */
+ a = sub_left(p);
+ sub_left(p) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = p;
+ }
+ sub_right(a) = p;
+ if (bal) {
+ unset_lskew(p);
+ unset_lskew(a);
+ } else {
+ set_rskew(a);
+ }
+ rbal(p) -= rzero(a);
+ } else {
+ /* rotLR(p) */
+ a = sub_right(sub_left(p));
+ sub_right(sub_left(p)) = sub_left(a);
+ if (sub_left(a) != NULL)
+ sub_left(a)->up = sub_left(p);
+ sub_left(p)->up = a;
+ sub_left(a) = sub_left(p);
+ sub_left(p) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = p;
+ }
+ sub_right(a) = p;
+ switch (get_bal(a)) {
+ case 0: /* not skewed */
+ unset_lskew(p);
+ unset_rskew(sub_left(a));
+ break;
+ case 1: /* left skew */
+ unset_lskew(p);
+ set_rskew(p);
+ unset_rskew(sub_left(a));
+ break;
+ case 2: /* right skew */
+ unset_lskew(p);
+ unset_rskew(sub_left(a));
+ set_lskew(sub_left(a));
+ break;
+ }
+ rbal(a) &= ~3;
+ rbal(a) += rzero(sub_left(a));
+ rbal(p) -= rzero(a);
+ }
+ a->up = p->up;
+ p->up = a;
+ /* Done with rotation */
+ if (c) {
+ sub_right(c) = a;
+ } else {
+ t->root = a;
+ }
+ if (bal == 0) {
+ break;
+ }
+ }
+ }
+ } else {
+ return 2;
+ }
+ }
+ return 1;
+}
+
+/*
+    [p] : juncture node (zeroed out)
+ [n] : rank of [p] in resulting tree
+ [delta] = depth_1 - depth_0
+*/
+
+static avl_code_t join_left(avl_node *p, avl_node **r0, avl_node *r1, int delta, int n)
+{
+ avl_node *a = NULL;
+ avl_node **r = r0;
+ if (r1) {
+ while (delta < -1) {
+ a = *r;
+ delta += (int) (is_lskew(a) + 1);
+ n -= (int) get_rank(a);
+ r = &sub_right(a);
+ }
+ r1->up = p;
+ if (*r) {
+ (*r)->up = p;
+ }
+ if (delta) {
+ set_lskew(p);
+ }
+ } else {
+ while (*r != NULL) {
+ a = *r;
+ n -= (int) get_rank(a);
+ r = &sub_right(a);
+ }
+ }
+ /* at this point bal(*r) = -1 or 0 */
+ sub_left(p) = *r;
+ sub_right(p) = r1;
+ p->up = a;
+ set_rank(p, n);
+ *r = p;
+ for (;;) {
+ if (! a) {
+ return 2;
+ } else if (get_bal(a)) {
+ break;
+ } else {
+ set_rskew(a);
+ a = a->up;
+ }
+ }
+ /* Rotate if need be */
+ /* No (+2,0) rotation to do */
+ if (is_lskew(a)) {
+ unset_lskew(a);
+ } else {
+ p = a;
+ if (is_rskew(sub_right(p))) {
+ /* rotL(p) */
+ a = sub_right(p);
+ sub_right(p) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = p;
+ }
+ sub_left(a) = p;
+ unset_rskew(p);
+ rbal(a) += rzero(p);
+ } else {
+ /* rotRL(p) */
+ a = sub_left(sub_right(p));
+ sub_left(sub_right(p)) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = sub_right(p);
+ }
+ sub_right(p)->up = a;
+ sub_right(a) = sub_right(p);
+ sub_right(p) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = p;
+ }
+ sub_left(a) = p;
+ switch (get_bal(a)) {
+ case 0: /* not skewed */
+ unset_rskew(p);
+ unset_lskew(sub_right(a));
+ break;
+ case 1: /* left skew */
+ unset_rskew(p);
+ unset_lskew(sub_right(a));
+ set_rskew(sub_right(a));
+ break;
+ case 2: /* right skew */
+ unset_rskew(p);
+ set_lskew(p);
+ unset_lskew(sub_right(a));
+ break;
+ }
+ rbal(sub_right(a)) -= rzero(a);
+ rbal(a) += rzero(p);
+ }
+ rbal(a) &= ~3;
+ a->up = p->up;
+ p->up = a;
+ if (a->up) {
+ sub_right(a->up) = a;
+ } else {
+ *r0 = a;
+ }
+ }
+ return 1;
+}
+
+/*
+ [p] : juncture node
+ [n] : rank of [p] in resulting tree
+*/
+
+static avl_code_t join_right(avl_node *p, avl_node *r0, avl_node **r1, int delta, int n)
+{
+ avl_node *a = NULL;
+ avl_node **r = r1;
+ if (r0) {
+ while (delta > +1) {
+ a = *r;
+ delta -= (int) (is_rskew(a) + 1);
+ incr_rank(a, (rbal_t)n);
+ r = &sub_left(a);
+ }
+ r0->up = p;
+ if (*r != NULL) {
+ (*r)->up = p;
+ }
+ if (delta) {
+ set_rskew(p);
+ }
+ } else {
+ while (*r) {
+ a = *r;
+ incr_rank(a, (rbal_t) n);
+ r = &sub_left(a);
+ }
+ n = 1;
+ }
+ /* at this point bal(*r) = +1 or 0 */
+ sub_left(p) = r0;
+ sub_right(p) = *r;
+ set_rank(p, n);
+ p->up = a;
+ *r = p;
+ for (;;) {
+ if (! a) {
+ return 2;
+ } else if (get_bal(a)) {
+ break;
+ } else {
+ set_lskew(a);
+ a = a->up;
+ }
+ }
+ /* Rotate if need be */
+ /* No (-2,0) rotation to do */
+ if (is_rskew(a)) {
+ unset_rskew(a);
+ } else {
+ p = a;
+ if (is_lskew(sub_left(p))) {
+ /* rotR(p) */
+ a = sub_left(p);
+ sub_left(p) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = p;
+ }
+ sub_right(a) = p;
+ unset_lskew(p);
+ rbal(p) -= rzero(a);
+ } else {
+ /* rotLR(p) */
+ a = sub_right(sub_left(p));
+ sub_right(sub_left(p)) = sub_left(a);
+ if (sub_left(a)) {
+ sub_left(a)->up = sub_left(p);
+ }
+ sub_left(p)->up = a;
+ sub_left(a) = sub_left(p);
+ sub_left(p) = sub_right(a);
+ if (sub_right(a)) {
+ sub_right(a)->up = p;
+ }
+ sub_right(a) = p;
+ switch (get_bal(a)) {
+ case 0: /* not skewed */
+ unset_lskew(p);
+ unset_rskew(sub_left(a));
+ break;
+ case 1: /* left skew */
+ unset_lskew(p);
+ set_rskew(p);
+ unset_rskew(sub_left(a));
+ break;
+ case 2: /* right skew */
+ unset_lskew(p);
+ unset_rskew(sub_left(a));
+ set_lskew(sub_left(a));
+ break;
+ }
+ rbal(a) += rzero(sub_left(a));
+ rbal(p) -= rzero(a);
+ }
+ rbal(a) &= ~3;
+ a->up = p->up;
+ p->up = a;
+ if (a->up != NULL) {
+ sub_left(a->up) = a;
+ } else {
+ *r1 = a;
+ }
+ }
+ return 1;
+}
+
+avl_code_t avl_del_first(avl_tree t, void **backup)
+{
+ if (t && t->root) {
+ avl_code_t rv;
+ if (backup) {
+ ptr_handler h = { NULL, OP_BACKUP };
+ rv = node_del_first(t, &h);
+ *backup = h.ptr;
+ } else {
+ rv = node_del_first(t, NULL);
+ }
+ return rv;
+ } else {
+ return 0;
+ }
+}
+
+avl_code_t avl_del_last(avl_tree t, void **backup)
+{
+ if (t && t->root) {
+ if (backup == NULL) {
+ return node_del_last(t, NULL);
+ } else {
+ ptr_handler h = { NULL, OP_BACKUP };
+ avl_code_t rv = node_del_last(t, &h);
+ *backup = h.ptr;
+ return rv;
+ }
+ } else {
+ return 0;
+ }
+}
+
+avl_code_t avl_ins_index(void *item, avl_size_t idx, avl_tree t)
+{
+ if (idx == 0 || t == NULL || idx > t->count + 1) {
+ return 0;
+ } else {
+ avl_node *p = new_node(item, NULL, t);
+ if (p) {
+ t->count++;
+ /* Note: 'attach_node' macro increments t->count */
+ if (idx == 1) {
+ return join_right(p, (avl_node *) NULL, &t->root, /*delta= */ 0, 1);
+ } else if (idx == t->count) {
+ return join_left(p, &t->root, (avl_node *) NULL, /*delta= */ 0, (int) t->count);
+ } else {
+ avl_node *a = node_find_index(idx - 1, t);
+ int dir;
+ if (sub_right(a)) {
+ a = node_first(sub_right(a));
+ sub_left(a) = p;
+ dir = 0;
+ } else {
+ sub_right(a) = p;
+ dir = 1;
+ }
+ p->up = a;
+ return rebalance_ins(a, dir, t);
+ }
+ } else {
+ return -1;
+ }
+ }
+}
+
+avl_code_t avl_del_index(avl_size_t idx, avl_tree t, void **backup)
+{
+ if (! t) {
+ return 0;
+ } else if (idx == 0 || idx > t->count) {
+ return 0;
+ } else if (idx == 1) {
+ return avl_del_first(t, backup);
+ } else if (idx == t->count) {
+ return avl_del_last(t, backup);
+ } else {
+ avl_node *a = node_find_index(idx, t);
+ return rebalance_del(a, t, backup);
+ }
+}
+
+/* Outcome: [t0] handles the concatenation of [t0] and [t1] */
+
+void avl_cat(avl_tree t0, avl_tree t1)
+{
+ if (! t0 || ! t1 ||! t1->root) {
+ return;
+ } else if (t0->root) {
+ int delta = depth(t1->root) - depth(t0->root);
+ ptr_handler h = { NULL, OP_DETACH };
+ if (delta <= 0) {
+ if (node_del_first (t1, &h) == 2) {
+ --delta;
+ }
+ (void) join_left((avl_node *) h.ptr, &t0->root, t1->root, delta, (int) (t0->count + 1));
+ } else {
+ if (node_del_last(t0, &h) == 2) {
+ ++delta;
+ }
+ (void) join_right((avl_node *) h.ptr, t0->root, &t1->root, delta, (int) (t0->count + 1));
+ t0->root = t1->root;
+ }
+ t1->root = NULL;
+ t0->count += t1->count + 1;
+ t1->count = 0;
+ } else {
+ t0->root = t1->root;
+ t0->count = t1->count;
+ t1->root = NULL;
+ t1->count = 0;
+ }
+}
+
+/*
+ - [t0] and [t1] are existing handles
+ - See Donald Knuth, TAOCP Vol.3 "Sorting and searching"
+*/
+
+avl_code_t avl_split(const void *item, avl_tree t, avl_tree t0, avl_tree t1)
+{
+ if (t && t->root) {
+ t0->root = NULL;
+ t1->root = NULL;
+ t0->count = 0;
+ t1->count = 0;
+ avl_compare_func cmp = t->compare;
+ avl_node *a, *p, *sn; /* sn: split node */
+ int k, na, an[AVL_STACK_CAPACITY];
+ /* invariant: [na]= size of tree rooted at [a] plus one */
+ for (a = t->root, na = (int) (t->count + 1), k = 0;;) {
+ int d_ = item_compare(cmp, t, item, get_item(a));
+ if (d_) {
+ p = a->sub[d_ = d_ > 0];
+ if (p) {
+ an[k++] = na;
+ if (d_) {
+ na -= (int) get_rank(a);
+ } else {
+ na = (int) get_rank(a);
+ }
+ a = p;
+ } else {
+ return 0;
+ }
+ } else {
+ break;
+ }
+ }
+ /* record split node */
+ sn = a;
+ if (k == 0) {
+ t0->root = sub_left(a);
+ t1->root = sub_right(a);
+ if (t0->root) {
+ t0->root->up = NULL;
+ }
+ if (t1->root) {
+ t1->root->up = NULL;
+ }
+ t0->count = get_rank(a) - 1;
+ t1->count = t->count - get_rank(a);
+ } else {
+ avl_node *r[2];
+ int h[2], ha;
+ avl_size_t n[2];
+ int d_;
+ r[0] = sub_left(a);
+ r[1] = sub_right(a);
+ if (r[0]) {
+ r[0]->up = NULL;
+ }
+ if (r[1]) {
+ r[1]->up = NULL;
+ }
+ ha = depth(a);
+ h[0] = ha - (is_rskew(a) ? 2 : 1);
+ h[1] = ha - (is_lskew(a) ? 2 : 1);
+ n[0] = get_rank(a); /* size of r[0] plus one */
+ n[1] = (avl_size_t) na - n[0]; /* size of r[1] plus one */
+ for (p = a->up, d_ = a != sub_left(p);;) {
+ a = p; /* a: juncture node */
+ p = a->up;
+ if (d_ == 0) {
+ int hh = h[1];
+ int nn;
+ ha += (is_rskew(a) ? 2 : 1);
+ h[1] = ha - (is_lskew(a) ? 2 : 1);
+ nn = n[1];
+ n[1] += (avl_size_t) (an[k - 1] - (int) get_rank(a));
+ if (p) {
+ d_ = a != sub_left(p);
+ }
+ rbal(a) = 0;
+ if (h[1] >= hh) {
+ avl_node *rr = r[1];
+ r[1] = sub_right(a);
+ if (r[1]) {
+ r[1]->up = NULL;
+ }
+ h[1] += (2 == join_right(a, rr, r + 1, h[1] - hh, (int) nn));
+ } else {
+ h[1] = hh + (2 == join_left(a, r + 1, sub_right(a), h[1] - hh, (int) nn));
+ }
+ } else {
+ int hh = h[0];
+ int nn;
+ ha += (is_lskew(a) ? 2 : 1);
+ h[0] = ha - (is_rskew(a) ? 2 : 1);
+ nn = get_rank(a);
+ n[0] += nn;
+ if (p) {
+ d_ = a != sub_left(p);
+ }
+ rbal(a) = 0;
+ if (h[0] >= hh) {
+ avl_node *rr = r[0];
+ r[0] = sub_left(a);
+ if (r[0]) {
+ r[0]->up = NULL;
+ }
+ h[0] += (2 == join_left(a, r, rr, hh - h[0], (int) nn));
+ } else {
+ h[0] = hh + (2 == join_right(a, sub_left(a), r, hh - h[0], (int) nn));
+ }
+ }
+ if (--k == 0)
+ break;
+ }
+ t0->root = r[0];
+ t1->root = r[1];
+ t0->count = n[0] - 1;
+ t1->count = n[1] - 1;
+ }
+ /* Detach split node */
+ detach_node(sn, t, NULL);
+ t->root = NULL;
+ t->count = 0;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/* Inorder traversal */
+
+void avl_walk(avl_tree t, avl_item_func proc, void *param)
+{
+ if (t && t->root) {
+ avl_node *a = t->root, *p;
+ while (1) {
+ while (sub_left(a)) {
+ a = sub_left(a);
+ }
+ while (1) {
+ (*proc)(get_item(a), param);
+ if (sub_right(a)) {
+ break;
+ } else {
+ do {
+ p = a;
+ a = p->up;
+ if (! a) {
+ return;
+ }
+ }
+ while (p != sub_left(a));
+ }
+ }
+ a = sub_right(a);
+ }
+ }
+}
+
+/* recursive helper for 'avl_slice' */
+
+static int node_slice(avl_node **root, avl_node **cur, avl_tree tree, avl_size_t len)
+{
+ avl_size_t mid = len / 2;
+ if (mid == 0) {
+ if ((*root = new_node ((*cur)->item, /*parent */ NULL, tree)) == NULL) {
+ return -1;
+ } else {
+ sub_left(*root) = NULL;
+ sub_right(*root) = NULL;
+ rbal(*root) = 4;
+ *cur = node_next(*cur);
+ return 0;
+ }
+ } else if ((*root = new_node(NULL, /*parent */ NULL, tree))) {
+ avl_node *p = *root;
+ int h0, h1 = -1;
+ rbal(p) = (mid + 1) << 2;
+ if ((h0 = node_slice(&sub_left(p), cur, tree, mid)) < 0) {
+ return -1;
+ } else {
+ p->item = (*tree->copy) ((*cur)->item);
+ sub_left(p)->up = p;
+ *cur = node_next(*cur);
+ if (len -= mid + 1) {
+ if ((h1 = node_slice(&sub_right(p), cur, tree, len)) < 0) {
+ return -1;
+ } else {
+ sub_right(p)->up = p;
+ }
+ }
+ if (h0 > h1) {
+ set_lskew(p);
+ } else if (h0 < h1) {
+ set_rskew(p);
+ return 1 + h1;
+ }
+ return 1 + h0;
+ }
+ } else {
+ return -1;
+ }
+}
+
+/* Return a slice t[lo,hi) as a new tree */
+
+avl_tree avl_slice(avl_tree t, avl_size_t lo_idx, avl_size_t hi_idx, void *param)
+{
+ if (! t || (lo_idx > hi_idx) || (lo_idx > t->count)) {
+ return NULL;
+ } else {
+ if (lo_idx < 1) {
+ lo_idx = 1;
+ }
+ if (hi_idx > t->count + 1) {
+ hi_idx = t->count + 1;
+ }
+ {
+ avl_tree tt = avl_create(t->compare, t->copy, t->dispose, t->alloc, t->dealloc, param);
+ if (tt) {
+ if (lo_idx < hi_idx) {
+ avl_node *cur = node_find_index(lo_idx, t);
+ if (node_slice(&tt->root, &cur, t, tt->count = hi_idx - lo_idx) < 0) {
+ AVL_SHOW_ERROR("%s\n", "couldn't allocate node in avl_slice()");
+ node_empty(tt);
+ (*t->dealloc)(tt);
+ return NULL;
+ } else {
+ tt->root->up = NULL;
+ }
+ }
+ return tt;
+ } else {
+ AVL_SHOW_ERROR("%s\n", "couldn't allocate new handle in avl_slice()");
+ return NULL;
+ }
+ }
+ }
+}
+
+/* recursive helper for 'avl_xload' */
+
+static int node_load(avl_node **root, avl_itersource cur, void **pres, avl_tree desc, avl_size_t len)
+{
+ avl_size_t mid = len / 2;
+ if (mid == 0) {
+ if (0 != (*cur->f) (cur, pres) || (*root = new_node (*pres, /*parent */ NULL, desc)) == NULL) {
+ return -1;
+ } else {
+ sub_left(*root) = NULL;
+ sub_right(*root) = NULL;
+ rbal(*root) = 4;
+ return 0;
+ }
+ } else if ((*root = new_node (NULL, /*parent */ NULL, desc))) {
+ avl_node *p = *root;
+ int h0, h1 = -1;
+ rbal(p) = (mid + 1) << 2;
+ if ((h0 = node_load(&sub_left(p), cur, pres, desc, mid)) < 0) {
+ return -1;
+ } else if (0 != (*cur->f)(cur, pres)) {
+ return -1;
+ } else {
+ p->item = (*desc->copy)(*pres);
+ sub_left(p)->up = p;
+ if (len -= mid + 1) {
+ if ((h1 = node_load(&sub_right(p), cur, pres, desc, len)) < 0) {
+ return -1;
+ } else {
+ sub_right(p)->up = p;
+ }
+ }
+ if (h0 > h1) {
+ set_lskew(p);
+ } else if (h0 < h1) {
+ set_rskew(p);
+ return 1 + h1;
+ }
+ return 1 + h0;
+ }
+ } else {
+ return -1;
+ }
+}
+
+/* Load 'len' items from itersource */
+
+avl_tree avl_xload(avl_itersource src, void **pres, avl_size_t len, avl_config conf, void *tree_param)
+{
+ if (src) {
+ avl_tree tt = avl_create(conf->compare, conf->copy, conf->dispose, conf->alloc, conf->dealloc, tree_param);
+ if (! tt) {
+ AVL_SHOW_ERROR("%s\n", "couldn't allocate new handle in avl_load()");
+ return NULL;
+        } else if (len) {
+ if (node_load(&tt->root, src, pres, tt, tt->count = len) < 0) {
+ AVL_SHOW_ERROR("%s\n", "couldn't allocate node in avl_load()");
+ node_empty(tt);
+ (*tt->dealloc)(tt);
+ return NULL;
+ } else {
+ tt->root->up = NULL;
+ }
+ }
+ return tt;
+ } else {
+ return NULL;
+ }
+}
+
+/* ITERATORS */
+
+typedef enum {
+ AVL_ITERATOR_PRE,
+ AVL_ITERATOR_POST,
+ AVL_ITERATOR_INTREE
+} avl_status_t;
+
+struct avl_iterator_
+{
+ avl_node *pos;
+ avl_tree tree;
+ avl_status_t status;
+};
+
+# define get_root(i) i->tree->root
+# define is_pre(i) i->status == AVL_ITERATOR_PRE
+# define is_post(i) i->status == AVL_ITERATOR_POST
+# define set_pre_iterator(i) i->status = AVL_ITERATOR_PRE
+# define set_post_iterator(i) i->status = AVL_ITERATOR_POST
+# define set_in_iterator(i) i->status = AVL_ITERATOR_INTREE
+
+/*
+    Position existing iterator [iter] at the node matching [item] in its own
+    tree, if it exists; otherwise do nothing.
+*/
+
+void avl_iterator_seek(const void *item, avl_iterator iter)
+{
+ avl_node *p = node_find(item, iter->tree);
+ if (p) {
+ set_in_iterator(iter);
+ iter->pos = p;
+ }
+}
+
+void avl_iterator_seek_index(avl_size_t idx, avl_iterator iter)
+{
+ avl_node *p = node_find_index(idx, iter->tree);
+ if (p) {
+ set_in_iterator(iter);
+ iter->pos = p;
+ }
+}
+
+/* Return item pointer at current position */
+
+void *avl_iterator_cur(avl_iterator iter)
+{
+ return iter->pos ? get_item(iter->pos) : NULL;
+}
+
+avl_size_t avl_iterator_count(avl_iterator iter)
+{
+ return iter->tree->count;
+}
+
+avl_size_t avl_iterator_index(avl_iterator iter)
+{
+ if (iter->pos) {
+ return get_index(iter->pos);
+ } else if (is_pre(iter)) {
+ return 0;
+ } else {
+ return iter->tree->count + 1;
+ }
+}
+
+/* Rustic: */
+
+avl_iterator avl_iterator_new(avl_tree t, avl_ini_t ini, ...)
+{
+ va_list args;
+ avl_iterator iter = NULL;
+ va_start(args, ini);
+ if (! t) {
+ /* okay */
+ } else if ((iter = (*t->alloc) (sizeof(struct avl_iterator_)))) {
+ iter->pos = NULL;
+ iter->tree = t;
+ if (ini != AVL_ITERATOR_INI_INTREE) {
+ iter->status = (ini == AVL_ITERATOR_INI_PRE) ? AVL_ITERATOR_PRE : AVL_ITERATOR_POST;
+ } else {
+ const void *item = NULL;
+ item = va_arg(args, const void *);
+ set_pre_iterator(iter);
+ if (item == NULL) {
+ AVL_SHOW_ERROR("%s\n", "missing argument to avl_iterator_new()");
+ } else {
+ avl_iterator_seek(item, iter);
+ }
+ }
+ } else {
+ AVL_SHOW_ERROR("%s\n", "couldn't create iterator");
+ }
+ va_end(args);
+ return iter;
+}
+
+/*
+ The following used to write to memory after it was freed. Corrected by: David
+ Turner <novalis@openplans.org>
+*/
+
+void avl_iterator_kill(avl_iterator iter)
+{
+ if (iter != NULL) {
+ avl_dealloc_func dealloc = iter->tree->dealloc;
+ iter->pos = NULL;
+ iter->tree = NULL;
+ (*dealloc)(iter);
+ }
+}
+
+void *avl_iterator_next(avl_iterator iter)
+{
+ if (is_post(iter)) {
+ return NULL;
+ } else {
+ avl_node *a = iter->pos;
+ if (is_pre(iter)) {
+ a = get_root(iter);
+ if (a) {
+ a = node_first(a);
+ set_in_iterator(iter);
+ }
+ } else {
+ a = node_next(a);
+ if (! a) {
+ set_post_iterator(iter);
+ }
+ }
+ iter->pos = a;
+ return a != NULL ? get_item(a) : NULL;
+ }
+}
+
+void *avl_iterator_prev(avl_iterator iter)
+{
+ if (is_pre(iter)) {
+ return NULL;
+ } else {
+ avl_node *a = iter->pos;
+ if (is_post(iter)) {
+ a = get_root(iter);
+ if (a) {
+ a = node_last(a);
+ set_in_iterator(iter);
+ }
+ } else {
+ a = node_prev(a);
+ if (! a) {
+ set_pre_iterator(iter);
+ }
+ }
+ iter->pos = a;
+ return a ? get_item(a) : NULL;
+ }
+}
+
+/* Remove node at current position and move cursor to next position */
+
+avl_code_t avl_iterator_del(avl_iterator iter, void **backup)
+{
+ if (iter == NULL || iter->pos == NULL) {
+ return 0;
+ } else {
+ avl_node *a = iter->pos, *p;
+ p = node_next(a);
+ if (! p) {
+ set_post_iterator(iter);
+ }
+ iter->pos = p;
+ return rebalance_del(a, iter->tree, backup);
+ }
+}
diff --git a/source/luametatex/source/libraries/avl/avl.h b/source/luametatex/source/libraries/avl/avl.h
new file mode 100644
index 000000000..03a1384b7
--- /dev/null
+++ b/source/luametatex/source/libraries/avl/avl.h
@@ -0,0 +1,445 @@
+/*
+ This small C package is made of an independent set of routines dedicated to manipulating AVL
+ trees (files avl.c, avl.h), and of an extension module for Python that builds upon it (file
+ avlmodule.c). Unlike collectionsmodule.c, the latter file contains only Python bindings: it
+ adds nothing to the underlying implementation.
+
+ license: this package, pyavl, is donated to the public domain
+ author : Richard McGraw
+ email : dasnar@fastmail.fm
+*/
+
+/*
+    This file is reformatted a little. As there have not been any changes to the original, we
+    assume this is okay. No changes means that the code is fine and we never ran into issues.
+
+    The avl code is used for hashing strings. It was also used in the backend of luatex, but
+    in luametatex we don't have that.
+*/
+
+# ifndef LIBRARIES_AVL_H
+# define LIBRARIES_AVL_H
+
+# include <stdarg.h>
+# include <stdio.h>
+# include <stdlib.h>
+
+// # define avl_del mp_avl_del
+// # define avl_ins mp_avl_ins
+// # define avl_tree mp_avl_tree
+// # define avl_entry mp_avl_entry
+// # define avl_find mp_avl_find
+// # define avl_create mp_avl_create
+// # define avl_destroy mp_avl_destroy
+
+typedef enum avl_bool_t {
+ avl_false,
+ avl_true
+} avl_bool_t;
+
+# include <inttypes.h>
+
+typedef int8_t avl_code_t;
+typedef int8_t avl_bal_t;
+typedef uint32_t avl_size_t;
+
+typedef int (*avl_compare_func) (void *param, const void *lhs, const void *rhs);
+typedef void *(*avl_item_copy_func) (const void *item);
+typedef void *(*avl_item_dispose_func) (void *item);
+typedef void (*avl_item_func) (const void *item, void *param);
+typedef void *(*avl_alloc_func) (size_t);
+typedef void (*avl_dealloc_func) (void *);
+
+/* At minimum, shallow copy */
+
+const void *avl_default_item_copy (const void *);
+void *avl_default_item_dispose (void *);
+
+# define AVL_STACK_CAPACITY 32 /* for avl_split() function */
+
+typedef enum avl_ini_t {
+ AVL_ITERATOR_INI_PRE,
+ AVL_ITERATOR_INI_POST,
+ AVL_ITERATOR_INI_INTREE
+} avl_ini_t;
+
+typedef struct avl_tree_ *avl_tree;
+typedef struct avl_iterator_ *avl_iterator;
+typedef struct avl_itersource_ avl_itersource_struct;
+typedef struct avl_itersource_ *avl_itersource;
+
+struct avl_itersource_ {
+ void *p;
+ /* return nonzero on error */
+ avl_code_t(*f) (avl_itersource from, void **to);
+};
+
+typedef struct {
+ avl_compare_func compare;
+ avl_item_copy_func copy;
+ avl_item_dispose_func dispose;
+ avl_alloc_func alloc;
+ avl_dealloc_func dealloc;
+} avl_config_struct, *avl_config;
+
+/* Public Functions */
+
+/*
+ --- CREATE ---
+ Return a new tree and set its config.
+ Return NULL on allocation failure.
+ * 'alloc' defaults to malloc from stdlib
+ * 'dealloc' defaults to free from stdlib
+ * 'param' user param/refcon
+*/
+
+avl_tree avl_create(
+ avl_compare_func compare,
+ avl_item_copy_func copy,
+ avl_item_dispose_func dispose,
+ avl_alloc_func alloc,
+ avl_dealloc_func dealloc,
+ void *param
+);
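+
+/*
+    A minimal usage sketch (not part of this header; 'cmp_str' is a hypothetical callback,
+    items here are plain C strings, and <string.h> is assumed to be included):
+
+        static int cmp_str(void *param, const void *a, const void *b)
+        {
+            (void) param;
+            return strcmp((const char *) a, (const char *) b);
+        }
+
+        avl_tree t = avl_create(cmp_str,
+            (avl_item_copy_func) avl_default_item_copy,
+            avl_default_item_dispose,
+            malloc, free, NULL);
+*/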
+
+/*
+ --- RESET ---
+ Empty tree 't' as in 'avl_empty()' and modify its config.
+*/
+
+void avl_reset(
+ avl_tree t,
+ avl_compare_func compare,
+ avl_item_copy_func copy,
+ avl_item_dispose_func dispose,
+ avl_alloc_func alloc,
+ avl_dealloc_func dealloc
+);
+
+/*
+ --- EMPTY ---
+ Empty tree 't', calling its dispose_func for each item in 't'. The config is
+ untouched.
+*/
+
+void avl_empty(avl_tree t);
+
+/*
+ --- DESTROY ---
+ Empty tree 't' and free the handle.
+*/
+
+void avl_destroy(avl_tree t);
+
+/*
+ --- DUPLICATE (COPY) ---
+ Return a copy of tree 't', using its copy_func for each item in 't'. Upon
+ failure to allocate room for some item, return NULL.
+*/
+
+avl_tree avl_dup(avl_tree t, void *param);
+
+/*
+ --- EMPTYNESS ---
+ Return 'avl_true' iff tree 't' is empty (i.e. the handle is NULL or 't'
+ contains no item).
+*/
+
+avl_bool_t avl_isempty(avl_tree t);
+
+/*
+ --- SIZE ---
+ Return number of items contained in tree 't'.
+*/
+
+avl_size_t avl_size(avl_tree t);
+
+/*
+ --- FIRST (MINIMUM) ---
+ Return first item in in-order traversal of 't'. Return NULL if 't' is empty.
+*/
+
+void *avl_first(avl_tree t);
+
+/*
+ --- LAST (MAXIMUM) ---
+ Return last item in in-order traversal of 't'. Return NULL if 't' is empty.
+*/
+
+void *avl_last(avl_tree t);
+
+/*
+ --- FIND MATCHING ITEM ---
+ Find item matching 'item' parameter in tree 't'. Return NULL if it's not
+ found. If there are multiple matches, the first one that is encountered
+ during the search is returned; it may not be the one with lowest rank.
+*/
+
+void *avl_find(const void *item, avl_tree t);
+
+/*
+ --- INDEX (RANK) OF ITEM ---
+ Return smallest index 'i' s.t. 't[i]' matches 'item', or zero if 'item' is
+ not found.
+*/
+
+avl_size_t avl_index(const void *item, avl_tree t);
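+
+/*
+    For example (sketch): if 't' holds "a","b","b","c" in order, then
+    avl_index("b", t) == 2, the lower-ranked of the two matches.
+*/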
+
+/*
+ --- SPAN ITEMS ---
+    Return indices 'i,j' spanning 't[i..j]', where i is the smallest index
+    s.t. t[i] >= lo_item (or t->count+1 if none) and j is the greatest index
+    s.t. t[j] <= hi_item (or 0 if none).
+
+ If 'hi_item' is less than 'lo_item' those are swapped.
+
+ Return codes:
+ 0 success
+ -1 error: tree had no root
+ -2 error: compare failed
+*/
+
+avl_code_t avl_span(
+ const void *lo_item,
+ const void *hi_item,
+ avl_tree t,
+ avl_size_t *lo_idx,
+ avl_size_t *hi_idx
+);
+
+/*
+ --- FIND AT LEAST ---
+ Return smallest item in 't' that is GEQ 'item', or NULL.
+*/
+
+void *avl_find_atleast(const void *item, avl_tree t);
+
+/*
+ --- FIND AT MOST ---
+ Return largest item in 't' that is LEQ 'item', or NULL.
+*/
+
+void *avl_find_atmost(const void *item, avl_tree t);
+
+/*
+ --- FIND BY INDEX (RANK) ---
+ Find item in 't' by index, that is return 't[idx]'. If 'idx' is not in
+ '[1,avl_size(t)]' then return NULL. If a compare failed then return NULL.
+*/
+
+void *avl_find_index(avl_size_t idx, avl_tree t);
+
+/*
+ --- INSERTION ---
+ Insert 'item' in tree 't' with regard to its compare_func. Say
+ 'avl_ins(item,t,avl_true)' to insert 'item' in 't' even if it is there
+ already. If 'item' is a duplicate and 'allow_duplicates' is avl_false,
+ nothing is done.
+
+ Return codes:
+ -1 error: allocation of new node failed
+ -2 error: compare failed, tree unchanged
+ 0 nothing was done, no error
+ +1 operation successful
+ +2 the same and height(t) increased by one.
+*/
+
+avl_code_t avl_ins(void *item, avl_tree t, avl_bool_t allow_duplicates);
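+
+/*
+    A sketch of checking the return codes above ('item' and 't' assumed to exist):
+
+        switch (avl_ins(item, t, avl_false)) {
+            case -2: break; // compare failed, tree unchanged
+            case -1: break; // node allocation failed
+            case  0: break; // duplicate, nothing done
+            case  1: break; // inserted
+            case  2: break; // inserted, and height(t) grew by one
+        }
+*/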
+
+/*
+ --- DELETION ---
+    Remove 'item' from tree 't', calling its dispose_func. To make a backup of
+    'item' using its copy_func, say 'avl_del(item,t,backup)' where 'backup' is
+    a pointer to a pointer to item. Otherwise pass NULL for 'backup'.
+
+ Return codes:
+ 0 item not found
+ -2 error: compare failed, tree unchanged
+ +1 operation successful
+ +2 the same and height(t) decreased by one.
+*/
+
+avl_code_t avl_del(void *item, avl_tree t, void **backup);
+
+/*
+ --- DELETE FIRST ---
+ Remove first item in in-order traversal from tree 't'. Note that only one
+ item is removed. Return +1 or +2 as above.
+*/
+
+avl_code_t avl_del_first(avl_tree t, void **backup);
+
+/*
+ --- DELETE LAST ---
+ Remove last item in in-order traversal from tree 't'. Note that only one item
+ is removed. Return +1 or +2 as above.
+*/
+
+avl_code_t avl_del_last(avl_tree t, void **backup);
+
+/*
+ --- INSERT IN FRONT OF INDEX ---
+ Insert 'item' in tree 't' so that afterwards, 't[idx]=item' except if
+ 'idx<=0' or 'idx>size(t)+1'. To append 'item' to 't' regardless of order, say
+ 'avl_ins_index(item,size+1,t)'.
+*/
+
+avl_code_t avl_ins_index(void *item, avl_size_t idx, avl_tree t);
+
+/*
+ --- DELETE ITEM BY INDEX ---
+ Remove item of rank 'idx' from tree 't' and return +1 or +2 as above except
+ if 'idx' is not in '[1,avl_size(t)]' in which case return 0.
+*/
+
+avl_code_t avl_del_index(avl_size_t idx, avl_tree t, void **backup);
+
+/*
+ --- IN-PLACE CONCATENATION ---
+    Pre-condition: 't0' and 't1' are valid avl_trees. Note that the code does
+    not check whether the maximal item in 't0' is LEQ the minimal item in
+    't1'. Post-condition: 't0' handles the concatenation of 't0' and 't1',
+    which becomes empty (but its config is untouched).
+*/
+
+void avl_cat(avl_tree t0, avl_tree t1);
+
+/*
+ --- SPLITTING ---
+ Pre-condition: 't0' and 't1' are existing handles. Post-condition: items
+ in 't0' all compare LEQ 'item' and items in 't1' all compare GEQ 'item'.
+ This implementation removes one (matching) item from the tree.
+
+ Return codes:
+ 0 item not found, no-op
+ -2 compare failed, tree unchanged
+ +1 success
+*/
+
+avl_code_t avl_split(const void *item, avl_tree t, avl_tree t0, avl_tree t1);
+
+/*
+ --- IN-ORDER TRAVERSAL ---
+ Walk tree 't' in in-order, applying 'proc' at each node. The 'param' pointer
+ is passed to 'proc', like this: '(*proc) (item_at_node,param)'.
+*/
+
+void avl_walk(avl_tree t, avl_item_func proc, void *param);
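+
+/*
+   Traversal sketch (illustrative; assumes an 'avl_item_func' receives the
+   item and the extra parameter, as described above):
+
+     static void count_items(void *item, void *param) {
+         (void)item;
+         (*(int *)param)++;
+     }
+     ...
+     int n = 0;
+     avl_walk(t, count_items, &n);   // n ends up equal to avl_size(t)
+*/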
+
+/*
+ --- SLICE ---
+ Create a _new tree_ from the slice 't[lo_idx,hi_idx)', provided that
+ 'lo_idx <= hi_idx' and both indices are in range. Return NULL if the
+ indices are inconsistent, if a new tree can't be created, or if some
+ item can't be allocated.
+*/
+
+avl_tree avl_slice(avl_tree t, avl_size_t lo_idx, avl_size_t hi_idx, void *param);
+
+/*
+ ITERATORS
+
+ An iterator assigned to a tree 't' is still usable after any item is inserted
+ into 't' and after any item not located at this iterator's current position
+ is deleted. The 'avl_iterator_del()' function may be used to remove the item
+ at the iterator's current position.
+
+*/
+
+/*
+ --- ITERATOR --- SEEK
+ Find 'item' in this iterator's tree as in 'avl_find()' and make it the
+ current position.
+*/
+
+void avl_iterator_seek(const void *item, avl_iterator iter);
+
+/*
+ --- ITERATOR --- COUNT
+ Return the size of this iterator's tree.
+*/
+
+avl_size_t avl_iterator_count(avl_iterator iter);
+
+/*
+ --- ITERATOR --- SEEK BY INDEX
+ Set the current position of 'iter' to 't[idx]' where 't' is the tree that is
+ iterated over.
+*/
+
+void avl_iterator_seek_index(avl_size_t idx, avl_iterator iter);
+
+/*
+ --- ITERATOR --- CURRENT POSITION
+ Return item at current position of 'iter'.
+*/
+
+void *avl_iterator_cur(avl_iterator iter);
+
+/*
+ --- ITERATOR --- INDEX
+ Return the rank of the current item of 'iter' (computed on the fly).
+ Return 0 if 'iter' is a pre-iterator, or the size of the tree plus one
+ if it is a post-iterator.
+*/
+
+avl_size_t avl_iterator_index(avl_iterator iter);
+
+/*
+ --- ITERATOR --- CREATE
+ Return a new cursor for tree 't'. If allocation of an iterator struct is
+ impossible, return NULL. Say 'avl_iterator_new(t, ini)' with
+ 'ini==AVL_ITERATOR_INI_PRE' or 'ini==AVL_ITERATOR_INI_POST' or say
+ 'avl_iterator_new(t, AVL_ITERATOR_INI_INTREE, item_pointer)' to set the
+ iterator's current position via
+ 'avl_iterator_seek(item_pointer,the_iterator)'. In the latter case, the
+ iterator is flagged as pre-iterator if the item is not found.
+*/
+
+avl_iterator avl_iterator_new(avl_tree t, avl_ini_t ini, ...);
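+
+/*
+   Iteration sketch (illustrative; 't' is a hypothetical tree):
+
+     avl_iterator it = avl_iterator_new(t, AVL_ITERATOR_INI_PRE);
+     if (it != NULL) {
+         void *p;
+         while ((p = avl_iterator_next(it)) != NULL) {
+             // visit p in sorted order
+         }
+         avl_iterator_kill(it);
+     }
+*/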
+
+/*
+ --- ITERATOR --- KILL
+ Cleanup: free the iterator struct.
+*/
+
+void avl_iterator_kill(avl_iterator iter);
+
+/*
+ --- ITERATOR --- SUCCESSOR
+ Get next item pointer in iterator or NULL. 'iter' is flagged as post-iterator
+ if it's in post-position.
+*/
+
+void *avl_iterator_next(avl_iterator iter);
+
+/*
+ --- ITERATOR --- PREDECESSOR
+ Get previous item pointer in iterator or NULL. 'iter' is flagged as
+ pre-iterator if it's in pre-position.
+*/
+
+void *avl_iterator_prev(avl_iterator iter);
+
+/*
+ --- ITERATOR --- DELETION
+ Remove the item at the current position of iterator 'iter' from its
+ tree, if there is one. The current position is set to the next item, or
+ the iterator is flagged as post-iterator.
+*/
+
+avl_code_t avl_iterator_del(avl_iterator iter, void **backup);
+
+/*
+ --- LOAD ---
+ More general version of 'avl_slice()': build a new tree of 'len' items
+ taken from the itersource 'src', configured by 'conf'.
+*/
+
+avl_tree avl_xload(
+ avl_itersource src,
+ void **pres,
+ avl_size_t len,
+ avl_config conf,
+ void *param
+);
+
+# endif
diff --git a/source/luametatex/source/libraries/avl/readme.txt b/source/luametatex/source/libraries/avl/readme.txt
new file mode 100644
index 000000000..de5d4993e
--- /dev/null
+++ b/source/luametatex/source/libraries/avl/readme.txt
@@ -0,0 +1,20 @@
+Remark
+
+Usage of the avl library (if I remember right) showed up in pdfTeX when Hartmut added some
+functionality. It therefore also ended up being used in LuaTeX. The two files avl.c and avl.h come
+from pyavl and are in the public domain:
+
+ license: this package, pyavl, is donated to the public domain
+ author : Richard McGraw
+ email : dasnar@fastmail.fm
+
+In pdfTeX/LuaTeX the files were just there, but I could track them down to
+
+ https://github.com/pankajp/pyavl
+
+where the dates indicate that nothing has changed in the meantime. In the copies used here I added
+the information mentioned above. The files had some (experimental) code as well as optional testing
+on NULL values. As I don't expect updates (the code has been okay for quite a while) I made the
+tests mandatory and removed the experimental code.
+
+Hans Hagen \ No newline at end of file
diff --git a/source/luametatex/source/libraries/decnumber/decContext.c b/source/luametatex/source/libraries/decnumber/decContext.c
new file mode 100644
index 000000000..6db29be03
--- /dev/null
+++ b/source/luametatex/source/libraries/decnumber/decContext.c
@@ -0,0 +1,437 @@
+/* ------------------------------------------------------------------ */
+/* Decimal Context module */
+/* ------------------------------------------------------------------ */
+/* Copyright (c) IBM Corporation, 2000, 2009. All rights reserved. */
+/* */
+/* This software is made available under the terms of the */
+/* ICU License -- ICU 1.8.1 and later. */
+/* */
+/* The description and User's Guide ("The decNumber C Library") for */
+/* this software is called decNumber.pdf. This document is */
+/* available, together with arithmetic and format specifications, */
+/* testcases, and Web links, on the General Decimal Arithmetic page. */
+/* */
+/* Please send comments, suggestions, and corrections to the author: */
+/* mfc@uk.ibm.com */
+/* Mike Cowlishaw, IBM Fellow */
+/* IBM UK, PO Box 31, Birmingham Road, Warwick CV34 5JL, UK */
+/* ------------------------------------------------------------------ */
+/* This module comprises the routines for handling arithmetic */
+/* context structures. */
+/* ------------------------------------------------------------------ */
+
+#include <string.h> // for strcmp
+#include <stdio.h> // for printf if DECCHECK
+#include "decContext.h" // context and base types
+#include "decNumberLocal.h" // decNumber local types, etc.
+
+/* compile-time endian tester [assumes sizeof(Int)>1] */
+static const Int mfcone=1; // constant 1
+static const Flag *mfctop=(const Flag *)&mfcone; // -> top byte
+#define LITEND *mfctop // named flag; 1=little-endian
+
+/* ------------------------------------------------------------------ */
+/* round-for-reround digits */
+/* ------------------------------------------------------------------ */
+const uByte DECSTICKYTAB[10]={1,1,2,3,4,6,6,7,8,9}; /* used if sticky */
+
+/* ------------------------------------------------------------------ */
+/* Powers of ten (powers[n]==10**n, 0<=n<=9) */
+/* ------------------------------------------------------------------ */
+const uInt DECPOWERS[10]={1, 10, 100, 1000, 10000, 100000, 1000000,
+ 10000000, 100000000, 1000000000};
+
+/* ------------------------------------------------------------------ */
+/* decContextClearStatus -- clear bits in current status */
+/* */
+/* context is the context structure to be queried */
+/* mask indicates the bits to be cleared (the status bit that */
+/* corresponds to each 1 bit in the mask is cleared) */
+/* returns context */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+decContext *decContextClearStatus(decContext *context, uInt mask) {
+ context->status&=~mask;
+ return context;
+ } // decContextClearStatus
+
+/* ------------------------------------------------------------------ */
+/* decContextDefault -- initialize a context structure */
+/* */
+/* context is the structure to be initialized */
+/* kind selects the required set of default values, one of: */
+/* DEC_INIT_BASE -- select ANSI X3-274 defaults */
+/* DEC_INIT_DECIMAL32 -- select IEEE 754 defaults, 32-bit */
+/* DEC_INIT_DECIMAL64 -- select IEEE 754 defaults, 64-bit */
+/* DEC_INIT_DECIMAL128 -- select IEEE 754 defaults, 128-bit */
+/* For any other value a valid context is returned, but with */
+/* Invalid_operation set in the status field. */
+/* returns a context structure with the appropriate initial values. */
+/* ------------------------------------------------------------------ */
+decContext * decContextDefault(decContext *context, Int kind) {
+ // set defaults...
+ context->digits=9; // 9 digits
+ context->emax=DEC_MAX_EMAX; // 9-digit exponents
+ context->emin=DEC_MIN_EMIN; // .. balanced
+ context->round=DEC_ROUND_HALF_UP; // 0.5 rises
+ context->traps=DEC_Errors; // all but informational
+ context->status=0; // cleared
+ context->clamp=0; // no clamping
+ #if DECSUBSET
+ context->extended=0; // cleared
+ #endif
+ switch (kind) {
+ case DEC_INIT_BASE:
+ // [use defaults]
+ break;
+ case DEC_INIT_DECIMAL32:
+ context->digits=7; // digits
+ context->emax=96; // Emax
+ context->emin=-95; // Emin
+ context->round=DEC_ROUND_HALF_EVEN; // 0.5 to nearest even
+ context->traps=0; // no traps set
+ context->clamp=1; // clamp exponents
+ #if DECSUBSET
+ context->extended=1; // set
+ #endif
+ break;
+ case DEC_INIT_DECIMAL64:
+ context->digits=16; // digits
+ context->emax=384; // Emax
+ context->emin=-383; // Emin
+ context->round=DEC_ROUND_HALF_EVEN; // 0.5 to nearest even
+ context->traps=0; // no traps set
+ context->clamp=1; // clamp exponents
+ #if DECSUBSET
+ context->extended=1; // set
+ #endif
+ break;
+ case DEC_INIT_DECIMAL128:
+ context->digits=34; // digits
+ context->emax=6144; // Emax
+ context->emin=-6143; // Emin
+ context->round=DEC_ROUND_HALF_EVEN; // 0.5 to nearest even
+ context->traps=0; // no traps set
+ context->clamp=1; // clamp exponents
+ #if DECSUBSET
+ context->extended=1; // set
+ #endif
+ break;
+
+ default: // invalid Kind
+ // use defaults, and ..
+ decContextSetStatus(context, DEC_Invalid_operation); // trap
+ }
+
+ return context;} // decContextDefault
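+
+/* Typical initialization (an illustrative sketch, not part of this
+   module; 'ctx' is a caller-side name):
+
+     decContext ctx;
+     decContextDefault(&ctx, DEC_INIT_BASE); // 9 digits, HALF_UP, traps set
+     ctx.traps=0;                            // optional: poll status instead
+*/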
+
+/* ------------------------------------------------------------------ */
+/* decContextGetRounding -- return current rounding mode */
+/* */
+/* context is the context structure to be queried */
+/* returns the rounding mode */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+enum rounding decContextGetRounding(decContext *context) {
+ return context->round;
+ } // decContextGetRounding
+
+/* ------------------------------------------------------------------ */
+/* decContextGetStatus -- return current status */
+/* */
+/* context is the context structure to be queried */
+/* returns status */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+uInt decContextGetStatus(decContext *context) {
+ return context->status;
+ } // decContextGetStatus
+
+/* ------------------------------------------------------------------ */
+/* decContextRestoreStatus -- restore bits in current status */
+/* */
+/* context is the context structure to be updated */
+/* newstatus is the source for the bits to be restored */
+/* mask indicates the bits to be restored (the status bit that */
+/* corresponds to each 1 bit in the mask is set to the value of */
+/* the corresponding bit in newstatus) */
+/* returns context */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+decContext *decContextRestoreStatus(decContext *context,
+ uInt newstatus, uInt mask) {
+ context->status&=~mask; // clear the selected bits
+ context->status|=(mask&newstatus); // or in the new bits
+ return context;
+ } // decContextRestoreStatus
+
+/* ------------------------------------------------------------------ */
+/* decContextSaveStatus -- save bits in current status */
+/* */
+/* context is the context structure to be queried */
+/* mask indicates the bits to be saved (the status bits that */
+/* correspond to each 1 bit in the mask are saved) */
+/* returns the AND of the mask and the current status */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+uInt decContextSaveStatus(decContext *context, uInt mask) {
+ return context->status&mask;
+ } // decContextSaveStatus
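+
+/* Save/restore idiom (an illustrative sketch; 'ctx' is a caller-side
+   decContext pointer): preserve a flag across a sub-calculation whose
+   effect on that status bit is not wanted.
+
+     uInt saved=decContextSaveStatus(ctx, DEC_Inexact);  // remember bit
+     decContextClearStatus(ctx, DEC_Inexact);            // ... work ...
+     decContextRestoreStatus(ctx, saved, DEC_Inexact);   // put it back
+*/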
+
+/* ------------------------------------------------------------------ */
+/* decContextSetRounding -- set current rounding mode */
+/* */
+/* context is the context structure to be updated */
+/* newround is the value which will replace the current mode */
+/* returns context */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+decContext *decContextSetRounding(decContext *context,
+ enum rounding newround) {
+ context->round=newround;
+ return context;
+ } // decContextSetRounding
+
+/* ------------------------------------------------------------------ */
+/* decContextSetStatus -- set status and raise trap if appropriate */
+/* */
+/* context is the context structure to be updated */
+/* status is the DEC_ exception code */
+/* returns the context structure */
+/* */
+/* Control may never return from this routine, if there is a signal */
+/* handler and it takes a long jump. */
+/* ------------------------------------------------------------------ */
+decContext * decContextSetStatus(decContext *context, uInt status) {
+ context->status|=status;
+ if (status & context->traps) raise(SIGFPE);
+ return context;} // decContextSetStatus
+
+/* ------------------------------------------------------------------ */
+/* decContextSetStatusFromString -- set status from a string + trap */
+/* */
+/* context is the context structure to be updated */
+/* string is a string exactly equal to one that might be returned */
+/* by decContextStatusToString */
+/* */
+/* The status bit corresponding to the string is set, and a trap */
+/* is raised if appropriate. */
+/* */
+/* returns the context structure, unless the string is equal to */
+/* DEC_Condition_MU or is not recognized. In these cases NULL is */
+/* returned. */
+/* ------------------------------------------------------------------ */
+decContext * decContextSetStatusFromString(decContext *context,
+ const char *string) {
+ if (strcmp(string, DEC_Condition_CS)==0)
+ return decContextSetStatus(context, DEC_Conversion_syntax);
+ if (strcmp(string, DEC_Condition_DZ)==0)
+ return decContextSetStatus(context, DEC_Division_by_zero);
+ if (strcmp(string, DEC_Condition_DI)==0)
+ return decContextSetStatus(context, DEC_Division_impossible);
+ if (strcmp(string, DEC_Condition_DU)==0)
+ return decContextSetStatus(context, DEC_Division_undefined);
+ if (strcmp(string, DEC_Condition_IE)==0)
+ return decContextSetStatus(context, DEC_Inexact);
+ if (strcmp(string, DEC_Condition_IS)==0)
+ return decContextSetStatus(context, DEC_Insufficient_storage);
+ if (strcmp(string, DEC_Condition_IC)==0)
+ return decContextSetStatus(context, DEC_Invalid_context);
+ if (strcmp(string, DEC_Condition_IO)==0)
+ return decContextSetStatus(context, DEC_Invalid_operation);
+ #if DECSUBSET
+ if (strcmp(string, DEC_Condition_LD)==0)
+ return decContextSetStatus(context, DEC_Lost_digits);
+ #endif
+ if (strcmp(string, DEC_Condition_OV)==0)
+ return decContextSetStatus(context, DEC_Overflow);
+ if (strcmp(string, DEC_Condition_PA)==0)
+ return decContextSetStatus(context, DEC_Clamped);
+ if (strcmp(string, DEC_Condition_RO)==0)
+ return decContextSetStatus(context, DEC_Rounded);
+ if (strcmp(string, DEC_Condition_SU)==0)
+ return decContextSetStatus(context, DEC_Subnormal);
+ if (strcmp(string, DEC_Condition_UN)==0)
+ return decContextSetStatus(context, DEC_Underflow);
+ if (strcmp(string, DEC_Condition_ZE)==0)
+ return context;
+ return NULL; // Multiple status, or unknown
+ } // decContextSetStatusFromString
+
+/* ------------------------------------------------------------------ */
+/* decContextSetStatusFromStringQuiet -- set status from a string */
+/* */
+/* context is the context structure to be updated */
+/* string is a string exactly equal to one that might be returned */
+/* by decContextStatusToString */
+/* */
+/* The status bit corresponding to the string is set; no trap is */
+/* raised. */
+/* */
+/* returns the context structure, unless the string is equal to */
+/* DEC_Condition_MU or is not recognized. In these cases NULL is */
+/* returned. */
+/* ------------------------------------------------------------------ */
+decContext * decContextSetStatusFromStringQuiet(decContext *context,
+ const char *string) {
+ if (strcmp(string, DEC_Condition_CS)==0)
+ return decContextSetStatusQuiet(context, DEC_Conversion_syntax);
+ if (strcmp(string, DEC_Condition_DZ)==0)
+ return decContextSetStatusQuiet(context, DEC_Division_by_zero);
+ if (strcmp(string, DEC_Condition_DI)==0)
+ return decContextSetStatusQuiet(context, DEC_Division_impossible);
+ if (strcmp(string, DEC_Condition_DU)==0)
+ return decContextSetStatusQuiet(context, DEC_Division_undefined);
+ if (strcmp(string, DEC_Condition_IE)==0)
+ return decContextSetStatusQuiet(context, DEC_Inexact);
+ if (strcmp(string, DEC_Condition_IS)==0)
+ return decContextSetStatusQuiet(context, DEC_Insufficient_storage);
+ if (strcmp(string, DEC_Condition_IC)==0)
+ return decContextSetStatusQuiet(context, DEC_Invalid_context);
+ if (strcmp(string, DEC_Condition_IO)==0)
+ return decContextSetStatusQuiet(context, DEC_Invalid_operation);
+ #if DECSUBSET
+ if (strcmp(string, DEC_Condition_LD)==0)
+ return decContextSetStatusQuiet(context, DEC_Lost_digits);
+ #endif
+ if (strcmp(string, DEC_Condition_OV)==0)
+ return decContextSetStatusQuiet(context, DEC_Overflow);
+ if (strcmp(string, DEC_Condition_PA)==0)
+ return decContextSetStatusQuiet(context, DEC_Clamped);
+ if (strcmp(string, DEC_Condition_RO)==0)
+ return decContextSetStatusQuiet(context, DEC_Rounded);
+ if (strcmp(string, DEC_Condition_SU)==0)
+ return decContextSetStatusQuiet(context, DEC_Subnormal);
+ if (strcmp(string, DEC_Condition_UN)==0)
+ return decContextSetStatusQuiet(context, DEC_Underflow);
+ if (strcmp(string, DEC_Condition_ZE)==0)
+ return context;
+ return NULL; // Multiple status, or unknown
+ } // decContextSetStatusFromStringQuiet
+
+/* ------------------------------------------------------------------ */
+/* decContextSetStatusQuiet -- set status without trap */
+/* */
+/* context is the context structure to be updated */
+/* status is the DEC_ exception code */
+/* returns the context structure */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+decContext * decContextSetStatusQuiet(decContext *context, uInt status) {
+ context->status|=status;
+ return context;} // decContextSetStatusQuiet
+
+/* ------------------------------------------------------------------ */
+/* decContextStatusToString -- convert status flags to a string */
+/* */
+/* context is a context with valid status field */
+/* */
+/* returns a constant string describing the condition. If multiple */
+/* (or no) flags are set, a generic constant message is returned. */
+/* ------------------------------------------------------------------ */
+const char *decContextStatusToString(const decContext *context) {
+ Int status=context->status;
+
+ // test the five IEEE first, as some of the others are ambiguous when
+ // DECEXTFLAG=0
+ if (status==DEC_Invalid_operation ) return DEC_Condition_IO;
+ if (status==DEC_Division_by_zero ) return DEC_Condition_DZ;
+ if (status==DEC_Overflow ) return DEC_Condition_OV;
+ if (status==DEC_Underflow ) return DEC_Condition_UN;
+ if (status==DEC_Inexact ) return DEC_Condition_IE;
+
+ if (status==DEC_Division_impossible ) return DEC_Condition_DI;
+ if (status==DEC_Division_undefined ) return DEC_Condition_DU;
+ if (status==DEC_Rounded ) return DEC_Condition_RO;
+ if (status==DEC_Clamped ) return DEC_Condition_PA;
+ if (status==DEC_Subnormal ) return DEC_Condition_SU;
+ if (status==DEC_Conversion_syntax ) return DEC_Condition_CS;
+ if (status==DEC_Insufficient_storage ) return DEC_Condition_IS;
+ if (status==DEC_Invalid_context ) return DEC_Condition_IC;
+ #if DECSUBSET
+ if (status==DEC_Lost_digits ) return DEC_Condition_LD;
+ #endif
+ if (status==0 ) return DEC_Condition_ZE;
+ return DEC_Condition_MU; // Multiple errors
+ } // decContextStatusToString
+
+/* ------------------------------------------------------------------ */
+/* decContextTestEndian -- test whether DECLITEND is set correctly */
+/* */
+/* quiet is 1 to suppress message; 0 otherwise */
+/* returns 0 if DECLITEND is correct */
+/* 1 if DECLITEND is incorrect and should be 1 */
+/* -1 if DECLITEND is incorrect and should be 0 */
+/* */
+/* A message is displayed if the return value is not 0 and quiet==0. */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+Int decContextTestEndian(Flag quiet) {
+ Int res=0; // optimist
+ uInt dle=(uInt)DECLITEND; // unsign
+ if (dle>1) dle=1; // ensure 0 or 1
+
+ if (LITEND!=DECLITEND) {
+ if (!quiet) { // always refer to this
+ #if DECPRINT
+ const char *adj;
+ if (LITEND) adj="little";
+ else adj="big";
+ printf("Warning: DECLITEND is set to %d, but this computer appears to be %s-endian\n",
+ DECLITEND, adj);
+ #endif
+ }
+ res=(Int)LITEND-dle;
+ }
+ return res;
+ } // decContextTestEndian
+
+/* ------------------------------------------------------------------ */
+/* decContextTestSavedStatus -- test bits in saved status */
+/* */
+/* oldstatus is the status word to be tested */
+/* mask indicates the bits to be tested (the oldstatus bits that */
+/* correspond to each 1 bit in the mask are tested) */
+/* returns 1 if any of the tested bits are 1, or 0 otherwise */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+uInt decContextTestSavedStatus(uInt oldstatus, uInt mask) {
+ return (oldstatus&mask)!=0;
+ } // decContextTestSavedStatus
+
+/* ------------------------------------------------------------------ */
+/* decContextTestStatus -- test bits in current status */
+/* */
+/* context is the context structure to be updated */
+/* mask indicates the bits to be tested (the status bits that */
+/* correspond to each 1 bit in the mask are tested) */
+/* returns 1 if any of the tested bits are 1, or 0 otherwise */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+uInt decContextTestStatus(decContext *context, uInt mask) {
+ return (context->status&mask)!=0;
+ } // decContextTestStatus
+
+/* ------------------------------------------------------------------ */
+/* decContextZeroStatus -- clear all status bits */
+/* */
+/* context is the context structure to be updated */
+/* returns context */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+decContext *decContextZeroStatus(decContext *context) {
+ context->status=0;
+ return context;
+ } // decContextZeroStatus
+
diff --git a/source/luametatex/source/libraries/decnumber/decContext.h b/source/luametatex/source/libraries/decnumber/decContext.h
new file mode 100644
index 000000000..10428eb3a
--- /dev/null
+++ b/source/luametatex/source/libraries/decnumber/decContext.h
@@ -0,0 +1,254 @@
+/* ------------------------------------------------------------------ */
+/* Decimal Context module header */
+/* ------------------------------------------------------------------ */
+/* Copyright (c) IBM Corporation, 2000, 2010. All rights reserved. */
+/* */
+/* This software is made available under the terms of the */
+/* ICU License -- ICU 1.8.1 and later. */
+/* */
+/* The description and User's Guide ("The decNumber C Library") for */
+/* this software is called decNumber.pdf. This document is */
+/* available, together with arithmetic and format specifications, */
+/* testcases, and Web links, on the General Decimal Arithmetic page. */
+/* */
+/* Please send comments, suggestions, and corrections to the author: */
+/* mfc@uk.ibm.com */
+/* Mike Cowlishaw, IBM Fellow */
+/* IBM UK, PO Box 31, Birmingham Road, Warwick CV34 5JL, UK */
+/* ------------------------------------------------------------------ */
+/* */
+/* Context variables must always have valid values: */
+/* */
+/* status -- [any bits may be cleared, but not set, by user] */
+/* round -- must be one of the enumerated rounding modes */
+/* */
+/* The following variables are implied for fixed size formats (i.e., */
+/* they are ignored) but should still be set correctly in case used */
+/* with decNumber functions: */
+/* */
+/* clamp -- must be either 0 or 1 */
+/* digits -- must be in the range 1 through 999999999 */
+/* emax -- must be in the range 0 through 999999999 */
+/* emin -- must be in the range 0 through -999999999 */
+/* extended -- must be either 0 or 1 [present only if DECSUBSET] */
+/* traps -- only defined bits may be set */
+/* */
+/* ------------------------------------------------------------------ */
+
+#if !defined(DECCONTEXT)
+ #define DECCONTEXT
+ #define DECCNAME "decContext" /* Short name */
+ #define DECCFULLNAME "Decimal Context Descriptor" /* Verbose name */
+ #define DECCAUTHOR "Mike Cowlishaw" /* Who to blame */
+
+ #if !defined(int32_t)
+ #include <stdint.h> /* C99 standard integers */
+ #endif
+ #include <stdio.h> /* for printf, etc. */
+ #include <signal.h> /* for traps */
+
+ /* Extended flags setting -- set this to 0 to use only IEEE flags */
+ #if !defined(DECEXTFLAG)
+ #define DECEXTFLAG 1 /* 1=enable extended flags */
+ #endif
+
+ /* Conditional code flag -- set this to 0 for best performance */
+ #if !defined(DECSUBSET)
+ #define DECSUBSET 0 /* 1=enable subset arithmetic */
+ #endif
+
+ /* Context for operations, with associated constants */
+ enum rounding {
+ DEC_ROUND_CEILING, /* round towards +infinity */
+ DEC_ROUND_UP, /* round away from 0 */
+ DEC_ROUND_HALF_UP, /* 0.5 rounds up */
+ DEC_ROUND_HALF_EVEN, /* 0.5 rounds to nearest even */
+ DEC_ROUND_HALF_DOWN, /* 0.5 rounds down */
+ DEC_ROUND_DOWN, /* round towards 0 (truncate) */
+ DEC_ROUND_FLOOR, /* round towards -infinity */
+ DEC_ROUND_05UP, /* round for reround */
+ DEC_ROUND_MAX /* enum must be less than this */
+ };
+ #define DEC_ROUND_DEFAULT DEC_ROUND_HALF_EVEN
+
+ typedef struct decContext {
+ int32_t digits; /* working precision */
+ int32_t emax; /* maximum positive exponent */
+ int32_t emin; /* minimum negative exponent */
+ enum rounding round; /* rounding mode */
+ uint32_t traps; /* trap-enabler flags */
+ uint32_t status; /* status flags */
+ uint8_t clamp; /* flag: apply IEEE exponent clamp */
+ #if DECSUBSET
+ uint8_t extended; /* flag: special-values allowed */
+ #endif
+ } decContext;
+
+ /* Maxima and Minima for context settings */
+ #define DEC_MAX_DIGITS 999999999
+ #define DEC_MIN_DIGITS 1
+ #define DEC_MAX_EMAX 999999999
+ #define DEC_MIN_EMAX 0
+ #define DEC_MAX_EMIN 0
+ #define DEC_MIN_EMIN -999999999
+ #define DEC_MAX_MATH 999999 /* max emax, etc., for math funcs. */
+
+ /* Classifications for decimal numbers, aligned with 754 (note that */
+ /* 'normal' and 'subnormal' are meaningful only with a decContext */
+ /* or a fixed size format). */
+ enum decClass {
+ DEC_CLASS_SNAN,
+ DEC_CLASS_QNAN,
+ DEC_CLASS_NEG_INF,
+ DEC_CLASS_NEG_NORMAL,
+ DEC_CLASS_NEG_SUBNORMAL,
+ DEC_CLASS_NEG_ZERO,
+ DEC_CLASS_POS_ZERO,
+ DEC_CLASS_POS_SUBNORMAL,
+ DEC_CLASS_POS_NORMAL,
+ DEC_CLASS_POS_INF
+ };
+ /* Strings for the decClasses */
+ #define DEC_ClassString_SN "sNaN"
+ #define DEC_ClassString_QN "NaN"
+ #define DEC_ClassString_NI "-Infinity"
+ #define DEC_ClassString_NN "-Normal"
+ #define DEC_ClassString_NS "-Subnormal"
+ #define DEC_ClassString_NZ "-Zero"
+ #define DEC_ClassString_PZ "+Zero"
+ #define DEC_ClassString_PS "+Subnormal"
+ #define DEC_ClassString_PN "+Normal"
+ #define DEC_ClassString_PI "+Infinity"
+ #define DEC_ClassString_UN "Invalid"
+
+ /* Trap-enabler and Status flags (exceptional conditions), and */
+ /* their names. The top byte is reserved for internal use */
+ #if DECEXTFLAG
+ /* Extended flags */
+ #define DEC_Conversion_syntax 0x00000001
+ #define DEC_Division_by_zero 0x00000002
+ #define DEC_Division_impossible 0x00000004
+ #define DEC_Division_undefined 0x00000008
+ #define DEC_Insufficient_storage 0x00000010 /* [when malloc fails] */
+ #define DEC_Inexact 0x00000020
+ #define DEC_Invalid_context 0x00000040
+ #define DEC_Invalid_operation 0x00000080
+ #if DECSUBSET
+ #define DEC_Lost_digits 0x00000100
+ #endif
+ #define DEC_Overflow 0x00000200
+ #define DEC_Clamped 0x00000400
+ #define DEC_Rounded 0x00000800
+ #define DEC_Subnormal 0x00001000
+ #define DEC_Underflow 0x00002000
+ #else
+ /* IEEE flags only */
+ #define DEC_Conversion_syntax 0x00000010
+ #define DEC_Division_by_zero 0x00000002
+ #define DEC_Division_impossible 0x00000010
+ #define DEC_Division_undefined 0x00000010
+ #define DEC_Insufficient_storage 0x00000010 /* [when malloc fails] */
+ #define DEC_Inexact 0x00000001
+ #define DEC_Invalid_context 0x00000010
+ #define DEC_Invalid_operation 0x00000010
+ #if DECSUBSET
+ #define DEC_Lost_digits 0x00000000
+ #endif
+ #define DEC_Overflow 0x00000008
+ #define DEC_Clamped 0x00000000
+ #define DEC_Rounded 0x00000000
+ #define DEC_Subnormal 0x00000000
+ #define DEC_Underflow 0x00000004
+ #endif
+
+ /* IEEE 754 groupings for the flags */
+ /* [DEC_Clamped, DEC_Lost_digits, DEC_Rounded, and DEC_Subnormal */
+ /* are not in IEEE 754] */
+ #define DEC_IEEE_754_Division_by_zero (DEC_Division_by_zero)
+ #if DECSUBSET
+ #define DEC_IEEE_754_Inexact (DEC_Inexact | DEC_Lost_digits)
+ #else
+ #define DEC_IEEE_754_Inexact (DEC_Inexact)
+ #endif
+ #define DEC_IEEE_754_Invalid_operation (DEC_Conversion_syntax | \
+ DEC_Division_impossible | \
+ DEC_Division_undefined | \
+ DEC_Insufficient_storage | \
+ DEC_Invalid_context | \
+ DEC_Invalid_operation)
+ #define DEC_IEEE_754_Overflow (DEC_Overflow)
+ #define DEC_IEEE_754_Underflow (DEC_Underflow)
+
+ /* flags which are normally errors (result is qNaN, infinite, or 0) */
+ #define DEC_Errors (DEC_IEEE_754_Division_by_zero | \
+ DEC_IEEE_754_Invalid_operation | \
+ DEC_IEEE_754_Overflow | DEC_IEEE_754_Underflow)
+ /* flags which cause a result to become qNaN */
+ #define DEC_NaNs DEC_IEEE_754_Invalid_operation
+
+ /* flags which are normally for information only (finite results) */
+ #if DECSUBSET
+ #define DEC_Information (DEC_Clamped | DEC_Rounded | DEC_Inexact \
+ | DEC_Lost_digits)
+ #else
+ #define DEC_Information (DEC_Clamped | DEC_Rounded | DEC_Inexact)
+ #endif
+
+ /* IEEE 854 names (for compatibility with older decNumber versions) */
+ #define DEC_IEEE_854_Division_by_zero DEC_IEEE_754_Division_by_zero
+ #define DEC_IEEE_854_Inexact DEC_IEEE_754_Inexact
+ #define DEC_IEEE_854_Invalid_operation DEC_IEEE_754_Invalid_operation
+ #define DEC_IEEE_854_Overflow DEC_IEEE_754_Overflow
+ #define DEC_IEEE_854_Underflow DEC_IEEE_754_Underflow
+
+ /* Name strings for the exceptional conditions */
+ #define DEC_Condition_CS "Conversion syntax"
+ #define DEC_Condition_DZ "Division by zero"
+ #define DEC_Condition_DI "Division impossible"
+ #define DEC_Condition_DU "Division undefined"
+ #define DEC_Condition_IE "Inexact"
+ #define DEC_Condition_IS "Insufficient storage"
+ #define DEC_Condition_IC "Invalid context"
+ #define DEC_Condition_IO "Invalid operation"
+ #if DECSUBSET
+ #define DEC_Condition_LD "Lost digits"
+ #endif
+ #define DEC_Condition_OV "Overflow"
+ #define DEC_Condition_PA "Clamped"
+ #define DEC_Condition_RO "Rounded"
+ #define DEC_Condition_SU "Subnormal"
+ #define DEC_Condition_UN "Underflow"
+ #define DEC_Condition_ZE "No status"
+ #define DEC_Condition_MU "Multiple status"
+ #define DEC_Condition_Length 21 /* length of the longest string, */
+ /* including terminator */
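+
+ /* Sketch: the length constant sizes a buffer for copying the status
+    string (illustrative; 'ctx' is a caller-side context):
+
+      char msg[DEC_Condition_Length];
+      strcpy(msg, decContextStatusToString(&ctx));
+ */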
+
+ /* Initialization descriptors, used by decContextDefault */
+ #define DEC_INIT_BASE 0
+ #define DEC_INIT_DECIMAL32 32
+ #define DEC_INIT_DECIMAL64 64
+ #define DEC_INIT_DECIMAL128 128
+ /* Synonyms */
+ #define DEC_INIT_DECSINGLE DEC_INIT_DECIMAL32
+ #define DEC_INIT_DECDOUBLE DEC_INIT_DECIMAL64
+ #define DEC_INIT_DECQUAD DEC_INIT_DECIMAL128
+
+ /* decContext routines */
+ extern decContext * decContextClearStatus(decContext *, uint32_t);
+ extern decContext * decContextDefault(decContext *, int32_t);
+ extern enum rounding decContextGetRounding(decContext *);
+ extern uint32_t decContextGetStatus(decContext *);
+ extern decContext * decContextRestoreStatus(decContext *, uint32_t, uint32_t);
+ extern uint32_t decContextSaveStatus(decContext *, uint32_t);
+ extern decContext * decContextSetRounding(decContext *, enum rounding);
+ extern decContext * decContextSetStatus(decContext *, uint32_t);
+ extern decContext * decContextSetStatusFromString(decContext *, const char *);
+ extern decContext * decContextSetStatusFromStringQuiet(decContext *, const char *);
+ extern decContext * decContextSetStatusQuiet(decContext *, uint32_t);
+ extern const char * decContextStatusToString(const decContext *);
+ extern int32_t decContextTestEndian(uint8_t);
+ extern uint32_t decContextTestSavedStatus(uint32_t, uint32_t);
+ extern uint32_t decContextTestStatus(decContext *, uint32_t);
+ extern decContext * decContextZeroStatus(decContext *);
+
+#endif
diff --git a/source/luametatex/source/libraries/decnumber/decNumber.c b/source/luametatex/source/libraries/decnumber/decNumber.c
new file mode 100644
index 000000000..8b8dd0d4b
--- /dev/null
+++ b/source/luametatex/source/libraries/decnumber/decNumber.c
@@ -0,0 +1,8145 @@
+/* ------------------------------------------------------------------ */
+/* Decimal Number arithmetic module */
+/* ------------------------------------------------------------------ */
+/* Copyright (c) IBM Corporation, 2000, 2009. All rights reserved. */
+/* */
+/* This software is made available under the terms of the */
+/* ICU License -- ICU 1.8.1 and later. */
+/* */
+/* The description and User's Guide ("The decNumber C Library") for */
+/* this software is called decNumber.pdf. This document is */
+/* available, together with arithmetic and format specifications, */
+/* testcases, and Web links, on the General Decimal Arithmetic page. */
+/* */
+/* Please send comments, suggestions, and corrections to the author: */
+/* mfc@uk.ibm.com */
+/* Mike Cowlishaw, IBM Fellow */
+/* IBM UK, PO Box 31, Birmingham Road, Warwick CV34 5JL, UK */
+/* ------------------------------------------------------------------ */
+/* This module comprises the routines for arbitrary-precision General */
+/* Decimal Arithmetic as defined in the specification which may be */
+/* found on the General Decimal Arithmetic pages. It implements both */
+/* the full ('extended') arithmetic and the simpler ('subset') */
+/* arithmetic. */
+/* */
+/* Usage notes: */
+/* */
+/* 1. This code is ANSI C89 except: */
+/* */
+/* a) C99 line comments (double forward slash) are used. (Most C */
+/* compilers accept these. If yours does not, a simple script */
+/* can be used to convert them to ANSI C comments.) */
+/* */
+/* b) Types from C99 stdint.h are used. If you do not have this */
+/* header file, see the User's Guide section of the decNumber */
+/* documentation; this lists the necessary definitions. */
+/* */
+/* c) If DECDPUN>4 or DECUSE64=1, the C99 64-bit int64_t and */
+/* uint64_t types may be used. To avoid these, set DECUSE64=0 */
+/* and DECDPUN<=4 (see documentation). */
+/* */
+/* The code also conforms to C99 restrictions; in particular, */
+/* strict aliasing rules are observed. */
+/* */
+/* 2. The decNumber format which this library uses is optimized for */
+/* efficient processing of relatively short numbers; in particular */
+/* it allows the use of fixed sized structures and minimizes copy */
+/* and move operations. It does, however, support arbitrary */
+/* precision (up to 999,999,999 digits) and arbitrary exponent */
+/* range (Emax in the range 0 through 999,999,999 and Emin in the */
+/* range -999,999,999 through 0). Mathematical functions (for */
+/* example decNumberExp) as identified below are restricted more */
+/* tightly: digits, emax, and -emin in the context must be <= */
+/* DEC_MAX_MATH (999999), and their operand(s) must be within */
+/* these bounds. */
+/* */
+/* 3. Logical functions are further restricted; their operands must */
+/* be finite, positive, have an exponent of zero, and all digits */
+/* must be either 0 or 1. The result will only contain digits */
+/* which are 0 or 1 (and will have exponent=0 and a sign of 0). */
+/* */
+/* 4. Operands to operator functions are never modified unless they */
+/* are also specified to be the result number (which is always */
+/* permitted). Other than that case, operands must not overlap. */
+/* */
+/* 5. Error handling: the type of the error is ORed into the status */
+/* flags in the current context (decContext structure). The */
+/* SIGFPE signal is then raised if the corresponding trap-enabler */
+/* flag in the decContext is set (is 1). */
+/* */
+/* It is the responsibility of the caller to clear the status */
+/* flags as required. */
+/* */
+/* The result of any routine which returns a number will always */
+/* be a valid number (which may be a special value, such as an */
+/* Infinity or NaN). */
+/* */
+/* 6. The decNumber format is not an exchangeable concrete */
+/* representation as it comprises fields which may be machine- */
+/* dependent (packed or unpacked, or special length, for example). */
+/* Canonical conversions to and from strings are provided; other */
+/* conversions are available in separate modules. */
+/* */
+/* 7. Normally, input operands are assumed to be valid. Set DECCHECK */
+/* to 1 for extended operand checking (including NULL operands). */
+/* Results are undefined if a badly-formed structure (or a NULL */
+/* pointer to a structure) is provided, though with DECCHECK */
+/* enabled the operator routines are protected against exceptions. */
+/* (Except if the result pointer is NULL, which is unrecoverable.) */
+/* */
+/* However, the routines will never cause exceptions if they are */
+/* given well-formed operands, even if the value of the operands */
+/* is inappropriate for the operation and DECCHECK is not set. */
+/* (Except for SIGFPE, as and where documented.) */
+/* */
+/* 8. Subset arithmetic is available only if DECSUBSET is set to 1. */
+/* ------------------------------------------------------------------ */
+/* Implementation notes for maintenance of this module: */
+/* */
+/* 1. Storage leak protection: Routines which use malloc are not */
+/* permitted to use return for fastpath or error exits (i.e., */
+/* they follow strict structured programming conventions). */
+/* Instead they have a do{}while(0); construct surrounding the */
+/* code which is protected -- break may be used to exit this. */
+/* Other routines can safely use the return statement inline. */
+/* */
+/* Storage leak accounting can be enabled using DECALLOC. */
+/* */
+/* 2. All loops use the for(;;) construct. Any do construct does */
+/* not loop; it is for allocation protection as just described. */
+/* */
+/* 3. Setting status in the context must always be the very last */
+/* action in a routine, as non-0 status may raise a trap and hence */
+/* the call to set status may not return (if the handler uses long */
+/* jump). Therefore all cleanup must be done first. In general, */
+/* to achieve this status is accumulated and is only applied just */
+/* before return by calling decContextSetStatus (via decStatus). */
+/* */
+/* Routines which allocate storage cannot, in general, use the */
+/* 'top level' routines which could cause a non-returning */
+/* transfer of control. The decXxxxOp routines are safe (do not */
+/* call decStatus even if traps are set in the context) and should */
+/* be used instead (they are also a little faster). */
+/* */
+/* 4. Exponent checking is minimized by allowing the exponent to */
+/* grow outside its limits during calculations, provided that */
+/* the decFinalize function is called later. Multiplication and */
+/* division, and intermediate calculations in exponentiation, */
+/* require more careful checks because of the risk of 31-bit */
+/* overflow (the most negative valid exponent is -1999999997, for */
+/* a 999999999-digit number with adjusted exponent of -999999999). */
+/* */
+/* 5. Rounding is deferred until finalization of results, with any */
+/* 'off to the right' data being represented as a single digit */
+/* residue (in the range -1 through 9). This avoids any double- */
+/* rounding when more than one shortening takes place (for */
+/* example, when a result is subnormal). */
+/* */
+/* 6. The digits count is allowed to rise to a multiple of DECDPUN */
+/* during many operations, so whole Units are handled and exact */
+/* accounting of digits is not needed. The correct digits value */
+/* is found by decGetDigits, which accounts for leading zeros. */
+/* This must be called before any rounding if the number of digits */
+/* is not known exactly. */
+/* */
+/* 7. The multiply-by-reciprocal 'trick' is used for partitioning */
+/* numbers up to four digits, using appropriate constants. This */
+/* is not useful for longer numbers because overflow of 32 bits */
+/* would lead to 4 multiplies, which is almost as expensive as */
+/* a divide (unless a floating-point or 64-bit multiply is */
+/* assumed to be available). */
+/* */
+/* 8. Unusual abbreviations that may be used in the commentary: */
+/* lhs -- left hand side (operand, of an operation) */
+/* lsd -- least significant digit (of coefficient) */
+/* lsu -- least significant Unit (of coefficient) */
+/* msd -- most significant digit (of coefficient) */
+/* msi -- most significant item (in an array) */
+/* msu -- most significant Unit (of coefficient) */
+/* rhs -- right hand side (operand, of an operation) */
+/* +ve -- positive */
+/* -ve -- negative */
+/* ** -- raise to the power */
+/* ------------------------------------------------------------------ */
+
+#include <stdlib.h> // for malloc, free, etc.
+#include <stdio.h> // for printf [if needed]
+#include <string.h> // for strcpy
+#include <ctype.h> // for tolower
+#include "decNumber.h" // base number library
+#include "decNumberLocal.h" // decNumber local types, etc.
+
+/* Constants */
+// Public lookup table used by the D2U macro
+const uByte d2utable[DECMAXD2U+1]=D2UTABLE;
+
+#define DECVERB 1 // set to 1 for verbose DECCHECK
+#define powers DECPOWERS // old internal name
+
+// Local constants
+#define DIVIDE 0x80 // Divide operators
+#define REMAINDER 0x40 // ..
+#define DIVIDEINT 0x20 // ..
+#define REMNEAR 0x10 // ..
+#define COMPARE 0x01 // Compare operators
+#define COMPMAX 0x02 // ..
+#define COMPMIN 0x03 // ..
+#define COMPTOTAL 0x04 // ..
+#define COMPNAN 0x05 // .. [NaN processing]
+#define COMPSIG 0x06 // .. [signaling COMPARE]
+#define COMPMAXMAG 0x07 // ..
+#define COMPMINMAG 0x08 // ..
+
+#define DEC_sNaN 0x40000000 // local status: sNaN signal
+#define BADINT (Int)0x80000000 // most-negative Int; error indicator
+// Next two indicate an integer >= 10**6, and its parity (bottom bit)
+#define BIGEVEN (Int)0x80000002
+#define BIGODD (Int)0x80000003
+
+static Unit uarrone[1]={1}; // Unit array of 1, used for incrementing
+
+/* Granularity-dependent code */
+#if DECDPUN<=4
+ #define eInt Int // extended integer
+ #define ueInt uInt // unsigned extended integer
+ // Constant multipliers for divide-by-power-of five using reciprocal
+ // multiply, after removing powers of 2 by shifting, and final shift
+ // of 17 [we only need up to **4]
+ static const uInt multies[]={131073, 26215, 5243, 1049, 210};
+ // QUOT10 -- macro to return the quotient of unit u divided by 10**n
+ #define QUOT10(u, n) ((((uInt)(u)>>(n))*multies[n])>>17)
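+ // Worked example (sketch): QUOT10(8765, 2) = ((8765>>2)*5243)>>17
+ // = (2191*5243)>>17 = 11487413>>17 = 87 = 8765/100, exact for the
+ // small Unit values in use when DECDPUN<=4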
+#else
+ // For DECDPUN>4 non-ANSI-89 64-bit types are needed.
+ #if !DECUSE64
+ #error decNumber.c: DECUSE64 must be 1 when DECDPUN>4
+ #endif
+ #define eInt Long // extended integer
+ #define ueInt uLong // unsigned extended integer
+#endif
+
+/* Local routines */
+static decNumber * decAddOp(decNumber *, const decNumber *, const decNumber *,
+ decContext *, uByte, uInt *);
+static Flag decBiStr(const char *, const char *, const char *);
+static uInt decCheckMath(const decNumber *, decContext *, uInt *);
+static void decApplyRound(decNumber *, decContext *, Int, uInt *);
+static Int decCompare(const decNumber *lhs, const decNumber *rhs, Flag);
+static decNumber * decCompareOp(decNumber *, const decNumber *,
+ const decNumber *, decContext *,
+ Flag, uInt *);
+static void decCopyFit(decNumber *, const decNumber *, decContext *,
+ Int *, uInt *);
+static decNumber * decDecap(decNumber *, Int);
+static decNumber * decDivideOp(decNumber *, const decNumber *,
+ const decNumber *, decContext *, Flag, uInt *);
+static decNumber * decExpOp(decNumber *, const decNumber *,
+ decContext *, uInt *);
+static void decFinalize(decNumber *, decContext *, Int *, uInt *);
+static Int decGetDigits(Unit *, Int);
+static Int decGetInt(const decNumber *);
+static decNumber * decLnOp(decNumber *, const decNumber *,
+ decContext *, uInt *);
+static decNumber * decMultiplyOp(decNumber *, const decNumber *,
+ const decNumber *, decContext *,
+ uInt *);
+static decNumber * decNaNs(decNumber *, const decNumber *,
+ const decNumber *, decContext *, uInt *);
+static decNumber * decQuantizeOp(decNumber *, const decNumber *,
+ const decNumber *, decContext *, Flag,
+ uInt *);
+static void decReverse(Unit *, Unit *);
+static void decSetCoeff(decNumber *, decContext *, const Unit *,
+ Int, Int *, uInt *);
+static void decSetMaxValue(decNumber *, decContext *);
+static void decSetOverflow(decNumber *, decContext *, uInt *);
+static void decSetSubnormal(decNumber *, decContext *, Int *, uInt *);
+static Int decShiftToLeast(Unit *, Int, Int);
+static Int decShiftToMost(Unit *, Int, Int);
+static void decStatus(decNumber *, uInt, decContext *);
+static void decToString(const decNumber *, char[], Flag);
+static decNumber * decTrim(decNumber *, decContext *, Flag, Flag, Int *);
+static Int decUnitAddSub(const Unit *, Int, const Unit *, Int, Int,
+ Unit *, Int);
+static Int decUnitCompare(const Unit *, Int, const Unit *, Int, Int);
+
+#if !DECSUBSET
+/* decFinish == decFinalize when no subset arithmetic needed */
+#define decFinish(a,b,c,d) decFinalize(a,b,c,d)
+#else
+static void decFinish(decNumber *, decContext *, Int *, uInt *);
+static decNumber * decRoundOperand(const decNumber *, decContext *, uInt *);
+#endif
+
+/* Local macros */
+// masked special-values bits
+#define SPECIALARG (rhs->bits & DECSPECIAL)
+#define SPECIALARGS ((lhs->bits | rhs->bits) & DECSPECIAL)
+
+/* Diagnostic macros, etc. */
+#if DECALLOC
+// Handle malloc/free accounting. If enabled, our accountable routines
+// are used; otherwise the code just goes straight to the system malloc
+// and free routines.
+#define malloc(a) decMalloc(a)
+#define free(a) decFree(a)
+#define DECFENCE 0x5a // corruption detector
+// 'Our' malloc and free:
+static void *decMalloc(size_t);
+static void decFree(void *);
+uInt decAllocBytes=0; // count of bytes allocated
+// Note that DECALLOC code only checks for storage buffer overflow.
+// To check for memory leaks, the decAllocBytes variable must be
+// checked to be 0 at appropriate times (e.g., after the test
+// harness completes a set of tests). This checking may be unreliable
+// if the testing is done in a multi-thread environment.
+#endif
+
+# include "../../utilities/auxmemory.h"
+# define malloc lmt_memory_malloc
+# define free lmt_memory_free
+
+#if DECCHECK
+// Optional checking routines. Enabling these means that decNumber
+// and decContext operands to operator routines are checked for
+// correctness. This roughly doubles the execution time of the
+// fastest routines (and adds 600+ bytes), so should not normally be
+// used in 'production'.
+// decCheckInexact is used to check that inexact results have a full
+// complement of digits (where appropriate -- this is not the case
+// for Quantize, for example)
+#define DECUNRESU ((decNumber *)(void *)0xffffffff)
+#define DECUNUSED ((const decNumber *)(void *)0xffffffff)
+#define DECUNCONT ((decContext *)(void *)(0xffffffff))
+static Flag decCheckOperands(decNumber *, const decNumber *,
+ const decNumber *, decContext *);
+static Flag decCheckNumber(const decNumber *);
+static void decCheckInexact(const decNumber *, decContext *);
+#endif
+
+#if DECTRACE || DECCHECK
+// Optional trace/debugging routines (may or may not be used)
+void decNumberShow(const decNumber *); // displays the components of a number
+static void decDumpAr(char, const Unit *, Int);
+#endif
+
+/* ================================================================== */
+/* Conversions */
+/* ================================================================== */
+
+/* ------------------------------------------------------------------ */
+/* from-int32 -- conversion from Int or uInt */
+/* */
+/* dn is the decNumber to receive the integer */
+/* in or uin is the integer to be converted */
+/* returns dn */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberFromInt32(decNumber *dn, Int in) {
+ uInt unsig;
+ if (in>=0) unsig=in;
+ else { // negative (possibly BADINT)
+ if (in==BADINT) unsig=(uInt)1073741824*2; // special case
+ else unsig=-in; // invert
+ }
+ // in is now positive
+ decNumberFromUInt32(dn, unsig);
+ if (in<0) dn->bits=DECNEG; // sign needed
+ return dn;
+ } // decNumberFromInt32
+
+decNumber * decNumberFromUInt32(decNumber *dn, uInt uin) {
+ Unit *up; // work pointer
+ decNumberZero(dn); // clean
+ if (uin==0) return dn; // [or decGetDigits bad call]
+ for (up=dn->lsu; uin>0; up++) {
+ *up=(Unit)(uin%(DECDPUNMAX+1));
+ uin=uin/(DECDPUNMAX+1);
+ }
+ dn->digits=decGetDigits(dn->lsu, up-dn->lsu);
+ return dn;
+ } // decNumberFromUInt32
+
+/* ------------------------------------------------------------------ */
+/* to-int32 -- conversion to Int or uInt */
+/* */
+/* dn is the decNumber to convert */
+/* set is the context for reporting errors */
+/* returns the converted decNumber, or 0 if Invalid is set */
+/* */
+/* Invalid is set if the decNumber does not have exponent==0 or if */
+/* it is a NaN, Infinite, or out-of-range. */
+/* ------------------------------------------------------------------ */
+Int decNumberToInt32(const decNumber *dn, decContext *set) {
+ #if DECCHECK
+ if (decCheckOperands(DECUNRESU, DECUNUSED, dn, set)) return 0;
+ #endif
+
+ // special or too many digits, or bad exponent
+ if (dn->bits&DECSPECIAL || dn->digits>10 || dn->exponent!=0) ; // bad
+ else { // is a finite integer with 10 or fewer digits
+ Int d; // work
+ const Unit *up; // ..
+ uInt hi=0, lo; // ..
+ up=dn->lsu; // -> lsu
+ lo=*up; // get 1 to 9 digits
+ #if DECDPUN>1 // split to higher
+ hi=lo/10;
+ lo=lo%10;
+ #endif
+ up++;
+ // collect remaining Units, if any, into hi
+ for (d=DECDPUN; d<dn->digits; up++, d+=DECDPUN) hi+=*up*powers[d-1];
+ // now low has the lsd, hi the remainder
+ if (hi>214748364 || (hi==214748364 && lo>7)) { // out of range?
+ // most-negative is a reprieve
+ if (dn->bits&DECNEG && hi==214748364 && lo==8) return 0x80000000;
+ // bad -- drop through
+ }
+ else { // in-range always
+ Int i=X10(hi)+lo;
+ if (dn->bits&DECNEG) return -i;
+ return i;
+ }
+ } // integer
+ decContextSetStatus(set, DEC_Invalid_operation); // [may not return]
+ return 0;
+ } // decNumberToInt32
+
+uInt decNumberToUInt32(const decNumber *dn, decContext *set) {
+ #if DECCHECK
+ if (decCheckOperands(DECUNRESU, DECUNUSED, dn, set)) return 0;
+ #endif
+ // special or too many digits, or bad exponent, or negative (<0)
+ if (dn->bits&DECSPECIAL || dn->digits>10 || dn->exponent!=0
+ || (dn->bits&DECNEG && !ISZERO(dn))); // bad
+ else { // is a finite integer with 10 or fewer digits
+ Int d; // work
+ const Unit *up; // ..
+ uInt hi=0, lo; // ..
+ up=dn->lsu; // -> lsu
+ lo=*up; // get 1 to 9 digits
+ #if DECDPUN>1 // split to higher
+ hi=lo/10;
+ lo=lo%10;
+ #endif
+ up++;
+ // collect remaining Units, if any, into hi
+ for (d=DECDPUN; d<dn->digits; up++, d+=DECDPUN) hi+=*up*powers[d-1];
+
+ // now low has the lsd, hi the remainder
+ if (hi>429496729 || (hi==429496729 && lo>5)) ; // no reprieve possible
+ else return X10(hi)+lo;
+ } // integer
+ decContextSetStatus(set, DEC_Invalid_operation); // [may not return]
+ return 0;
+ } // decNumberToUInt32
+
+/* ------------------------------------------------------------------ */
+/* to-scientific-string -- conversion to numeric string */
+/* to-engineering-string -- conversion to numeric string */
+/* */
+/* decNumberToString(dn, string); */
+/* decNumberToEngString(dn, string); */
+/* */
+/* dn is the decNumber to convert */
+/* string is the string where the result will be laid out */
+/* */
+/* string must be at least dn->digits+14 characters long */
+/* */
+/* No error is possible, and no status can be set. */
+/* ------------------------------------------------------------------ */
+char * decNumberToString(const decNumber *dn, char *string){
+ decToString(dn, string, 0);
+ return string;
+ } // decNumberToString
+
+char * decNumberToEngString(const decNumber *dn, char *string){
+ decToString(dn, string, 1);
+ return string;
+ } // decNumberToEngString
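+
+/* Round-trip sketch (illustrative; assumes the build's DECNUMDIGITS is
+   at least ctx.digits, and sizes the buffer per the digits+14 rule
+   above):
+
+     decNumber n;
+     decContext ctx;
+     char buf[32];
+     decContextDefault(&ctx, DEC_INIT_BASE);
+     decNumberFromString(&n, "123.45", &ctx);
+     decNumberToString(&n, buf);   // yields "123.45"
+*/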
+
+/* ------------------------------------------------------------------ */
+/* to-number -- conversion from numeric string */
+/* */
+/* decNumberFromString -- convert string to decNumber */
+/* dn -- the number structure to fill */
+/* chars[] -- the string to convert ('\0' terminated) */
+/* set -- the context used for processing any error, */
+/* determining the maximum precision available */
+/* (set.digits), determining the maximum and minimum */
+/* exponent (set.emax and set.emin), determining if */
+/* extended values are allowed, and checking the */
+/* rounding mode if overflow occurs or rounding is */
+/* needed. */
+/* */
+/* The length of the coefficient and the size of the exponent are */
+/* checked by this routine, so the correct error (Underflow or */
+/* Overflow) can be reported or rounding applied, as necessary. */
+/* */
+/* If bad syntax is detected, the result will be a quiet NaN. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberFromString(decNumber *dn, const char chars[],
+ decContext *set) {
+ Int exponent=0; // working exponent [assume 0]
+ uByte bits=0; // working flags [assume +ve]
+ Unit *res; // where result will be built
+ Unit resbuff[SD2U(DECBUFFER+9)];// local buffer in case need temporary
+ // [+9 allows for ln() constants]
+ Unit *allocres=NULL; // -> allocated result, iff allocated
+ Int d=0; // count of digits found in decimal part
+ const char *dotchar=NULL; // where dot was found
+ const char *cfirst=chars; // -> first character of decimal part
+ const char *last=NULL; // -> last digit of decimal part
+ const char *c; // work
+ Unit *up; // ..
+ #if DECDPUN>1
+ Int cut, out; // ..
+ #endif
+ Int residue; // rounding residue
+ uInt status=0; // error code
+
+ #if DECCHECK
+ if (decCheckOperands(DECUNRESU, DECUNUSED, DECUNUSED, set))
+ return decNumberZero(dn);
+ #endif
+
+ do { // status & malloc protection
+ for (c=chars;; c++) { // -> input character
+ if (*c>='0' && *c<='9') { // test for Arabic digit
+ last=c;
+ d++; // count of real digits
+ continue; // still in decimal part
+ }
+ if (*c=='.' && dotchar==NULL) { // first '.'
+ dotchar=c; // record offset into decimal part
+ if (c==cfirst) cfirst++; // first digit must follow
+ continue;}
+ if (c==chars) { // first in string...
+ if (*c=='-') { // valid - sign
+ cfirst++;
+ bits=DECNEG;
+ continue;}
+ if (*c=='+') { // valid + sign
+ cfirst++;
+ continue;}
+ }
+ // *c is not a digit, or a valid +, -, or '.'
+ break;
+ } // c
+
+ if (last==NULL) { // no digits yet
+ status=DEC_Conversion_syntax;// assume the worst
+ if (*c=='\0') break; // and no more to come...
+ #if DECSUBSET
+ // if subset then infinities and NaNs are not allowed
+ if (!set->extended) break; // hopeless
+ #endif
+ // Infinities and NaNs are possible, here
+ if (dotchar!=NULL) break; // .. unless had a dot
+ decNumberZero(dn); // be optimistic
+ if (decBiStr(c, "infinity", "INFINITY")
+ || decBiStr(c, "inf", "INF")) {
+ dn->bits=bits | DECINF;
+ status=0; // is OK
+ break; // all done
+ }
+ // a NaN expected
+ // 2003.09.10 NaNs are now permitted to have a sign
+ dn->bits=bits | DECNAN; // assume simple NaN
+ if (*c=='s' || *c=='S') { // looks like an sNaN
+ c++;
+ dn->bits=bits | DECSNAN;
+ }
+ if (*c!='n' && *c!='N') break; // check caseless "NaN"
+ c++;
+ if (*c!='a' && *c!='A') break; // ..
+ c++;
+ if (*c!='n' && *c!='N') break; // ..
+ c++;
+ // now either nothing, or nnnn payload, expected
+ // -> start of integer and skip leading 0s [including plain 0]
+ for (cfirst=c; *cfirst=='0';) cfirst++;
+ if (*cfirst=='\0') { // "NaN" or "sNaN", maybe with all 0s
+ status=0; // it's good
+ break; // ..
+ }
+ // something other than 0s; set up last and d as usual [no dots]
+ for (c=cfirst;; c++, d++) {
+ if (*c<'0' || *c>'9') break; // test for Arabic digit
+ last=c;
+ }
+ if (*c!='\0') break; // not all digits
+ if (d>set->digits-1) {
+ // [NB: payload in a decNumber can be full length unless
+ // clamped, in which case can only be digits-1]
+ if (set->clamp) break;
+ if (d>set->digits) break;
+ } // too many digits?
+ // good; drop through to convert the integer to coefficient
+ status=0; // syntax is OK
+ bits=dn->bits; // for copy-back
+ } // last==NULL
+
+ else if (*c!='\0') { // more to process...
+ // had some digits; exponent is only valid sequence now
+ Flag nege; // 1=negative exponent
+ const char *firstexp; // -> first significant exponent digit
+ status=DEC_Conversion_syntax;// assume the worst
+ if (*c!='e' && *c!='E') break;
+ /* Found 'e' or 'E' -- now process explicit exponent */
+ // 1998.07.11: sign no longer required
+ nege=0;
+ c++; // to (possible) sign
+ if (*c=='-') {nege=1; c++;}
+ else if (*c=='+') c++;
+ if (*c=='\0') break;
+
+ for (; *c=='0' && *(c+1)!='\0';) c++; // strip insignificant zeros
+ firstexp=c; // save exponent digit place
+ for (; ;c++) {
+ if (*c<'0' || *c>'9') break; // not a digit
+ exponent=X10(exponent)+(Int)*c-(Int)'0';
+ } // c
+ // if not now on a '\0', *c must not be a digit
+ if (*c!='\0') break;
+
+ // (this next test must be after the syntax checks)
+ // if it was too long the exponent may have wrapped, so check
+ // carefully and set it to a certain overflow if wrap possible
+ if (c>=firstexp+9+1) {
+ if (c>firstexp+9+1 || *firstexp>'1') exponent=DECNUMMAXE*2;
+ // [up to 1999999999 is OK, for example 1E-1000000998]
+ }
+ if (nege) exponent=-exponent; // was negative
+ status=0; // is OK
+ } // stuff after digits
+
+ // Here when whole string has been inspected; syntax is good
+ // cfirst->first digit (never dot), last->last digit (ditto)
+
+ // strip leading zeros/dot [leave final 0 if all 0's]
+ if (*cfirst=='0') { // [cfirst has stepped over .]
+ for (c=cfirst; c<last; c++, cfirst++) {
+ if (*c=='.') continue; // ignore dots
+ if (*c!='0') break; // non-zero found
+ d--; // 0 stripped
+ } // c
+ #if DECSUBSET
+ // make a rapid exit for easy zeros if !extended
+ if (*cfirst=='0' && !set->extended) {
+ decNumberZero(dn); // clean result
+ break; // [could be return]
+ }
+ #endif
+ } // at least one leading 0
+
+ // Handle decimal point...
+ if (dotchar!=NULL && dotchar<last) // non-trailing '.' found?
+ exponent-=(last-dotchar); // adjust exponent
+ // [we can now ignore the .]
+
+ // OK, the digits string is good. Assemble in the decNumber, or in
+ // a temporary units array if rounding is needed
+ if (d<=set->digits) res=dn->lsu; // fits into supplied decNumber
+ else { // rounding needed
+ Int needbytes=D2U(d)*sizeof(Unit);// bytes needed
+ res=resbuff; // assume use local buffer
+ if (needbytes>(Int)sizeof(resbuff)) { // too big for local
+ allocres=(Unit *)malloc(needbytes);
+ if (allocres==NULL) {status|=DEC_Insufficient_storage; break;}
+ res=allocres;
+ }
+ }
+ // res now -> number lsu, buffer, or allocated storage for Unit array
+
+ // Place the coefficient into the selected Unit array
+ // [this is often 70% of the cost of this function when DECDPUN>1]
+ #if DECDPUN>1
+ out=0; // accumulator
+ up=res+D2U(d)-1; // -> msu
+ cut=d-(up-res)*DECDPUN; // digits in top unit
+ for (c=cfirst;; c++) { // along the digits
+ if (*c=='.') continue; // ignore '.' [don't decrement cut]
+ out=X10(out)+(Int)*c-(Int)'0';
+ if (c==last) break; // done [never get to trailing '.']
+ cut--;
+ if (cut>0) continue; // more for this unit
+ *up=(Unit)out; // write unit
+ up--; // prepare for unit below..
+ cut=DECDPUN; // ..
+ out=0; // ..
+ } // c
+ *up=(Unit)out; // write lsu
+
+ #else
+ // DECDPUN==1
+ up=res; // -> lsu
+ for (c=last; c>=cfirst; c--) { // over each character, from least
+ if (*c=='.') continue; // ignore . [don't step up]
+ *up=(Unit)((Int)*c-(Int)'0');
+ up++;
+ } // c
+ #endif
+
+ dn->bits=bits;
+ dn->exponent=exponent;
+ dn->digits=d;
+
+ // if it does not fit in the number (too long), shorten it into the number
+ if (d>set->digits) {
+ residue=0;
+ decSetCoeff(dn, set, res, d, &residue, &status);
+ // always check for overflow or subnormal and round as needed
+ decFinalize(dn, set, &residue, &status);
+ }
+ else { // no rounding, but may still have overflow or subnormal
+ // [these tests are just for performance; finalize repeats them]
+ if ((dn->exponent-1<set->emin-dn->digits)
+ || (dn->exponent-1>set->emax-set->digits)) {
+ residue=0;
+ decFinalize(dn, set, &residue, &status);
+ }
+ }
+ // decNumberShow(dn);
+ } while(0); // [for break]
+
+ if (allocres!=NULL) free(allocres); // drop any storage used
+ if (status!=0) decStatus(dn, status, set);
+ return dn;
+ } /* decNumberFromString */
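+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library): a */
+/* minimal round trip through decNumberFromString and */
+/* decNumberToString. The helper name fromstring_demo is illustrative */
+/* only; everything else is the documented API. */
+/* */
+/* #define DECNUMDIGITS 34 // before including decNumber.h */
+/* #include "decNumber.h" */
+/* #include <stdio.h> */
+/* */
+/* static void fromstring_demo(void) { */
+/* decContext set; // working context */
+/* decNumber n; // holds up to DECNUMDIGITS digits */
+/* char buf[DECNUMDIGITS+14]; // ToString needs digits+14 */
+/* decContextDefault(&set, DEC_INIT_BASE); */
+/* set.traps=0; // no traps; inspect set.status instead */
+/* set.digits=DECNUMDIGITS; // working precision */
+/* decNumberFromString(&n, "12.345E+2", &set); */
+/* decNumberToString(&n, buf); // buf is "1234.5" */
+/* printf("%s\n", buf); */
+/* decNumberFromString(&n, "junk", &set); // quiet NaN; */
+/* // set.status now includes DEC_Conversion_syntax */
+/* } */
+/* ------------------------------------------------------------------ */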
+
+/* ================================================================== */
+/* Operators */
+/* ================================================================== */
+
+/* ------------------------------------------------------------------ */
+/* decNumberAbs -- absolute value operator */
+/* */
+/* This computes C = abs(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context */
+/* */
+/* See also decNumberCopyAbs for a quiet bitwise version of this. */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+/* This has the same effect as decNumberPlus unless A is negative, */
+/* in which case it has the same effect as decNumberMinus. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberAbs(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ decNumber dzero; // for 0
+ uInt status=0; // accumulator
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ decNumberZero(&dzero); // set 0
+ dzero.exponent=rhs->exponent; // [no coefficient expansion]
+ decAddOp(res, &dzero, rhs, set, (uByte)(rhs->bits & DECNEG), &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberAbs
+
+/* ------------------------------------------------------------------ */
+/* decNumberAdd -- add two Numbers */
+/* */
+/* This computes C = A + B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X+X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+/* This just calls the routine shared with Subtract */
+decNumber * decNumberAdd(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decAddOp(res, lhs, rhs, set, 0, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberAdd
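+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library), */
+/* following the context setup shown after decNumberFromString: */
+/* */
+/* decNumber a, b, r; */
+/* decNumberFromString(&a, "1.23", &set); */
+/* decNumberFromString(&b, "4.50", &set); */
+/* decNumberAdd(&r, &a, &b, &set); // r is 5.73 */
+/* decNumberAdd(&a, &a, &a, &set); // in-place X=X+X is allowed */
+/* ------------------------------------------------------------------ */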
+
+/* ------------------------------------------------------------------ */
+/* decNumberAnd -- AND two Numbers, digitwise */
+/* */
+/* This computes C = A & B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X&X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context (used for result length and error report) */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Logical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberAnd(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ const Unit *ua, *ub; // -> operands
+ const Unit *msua, *msub; // -> operand msus
+ Unit *uc, *msuc; // -> result and its msu
+ Int msudigs; // digits in res msu
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ if (lhs->exponent!=0 || decNumberIsSpecial(lhs) || decNumberIsNegative(lhs)
+ || rhs->exponent!=0 || decNumberIsSpecial(rhs) || decNumberIsNegative(rhs)) {
+ decStatus(res, DEC_Invalid_operation, set);
+ return res;
+ }
+
+ // operands are valid
+ ua=lhs->lsu; // bottom-up
+ ub=rhs->lsu; // ..
+ uc=res->lsu; // ..
+ msua=ua+D2U(lhs->digits)-1; // -> msu of lhs
+ msub=ub+D2U(rhs->digits)-1; // -> msu of rhs
+ msuc=uc+D2U(set->digits)-1; // -> msu of result
+ msudigs=MSUDIGITS(set->digits); // [faster than remainder]
+ for (; uc<=msuc; ua++, ub++, uc++) { // Unit loop
+ Unit a, b; // extract units
+ if (ua>msua) a=0;
+ else a=*ua;
+ if (ub>msub) b=0;
+ else b=*ub;
+ *uc=0; // can now write back
+ if (a|b) { // maybe 1 bits to examine
+ Int i, j;
+ // This loop could be unrolled and/or use BIN2BCD tables
+ for (i=0; i<DECDPUN; i++) {
+ if (a&b&1) *uc=*uc+(Unit)powers[i]; // effect AND
+ j=a%10;
+ a=a/10;
+ j|=b%10;
+ b=b/10;
+ if (j>1) {
+ decStatus(res, DEC_Invalid_operation, set);
+ return res;
+ }
+ if (uc==msuc && i==msudigs-1) break; // just did final digit
+ } // each digit
+ } // both OK
+ } // each unit
+ // [here uc-1 is the msu of the result]
+ res->digits=decGetDigits(res->lsu, uc-res->lsu);
+ res->exponent=0; // integer
+ res->bits=0; // sign=0
+ return res; // [no status to set]
+ } // decNumberAnd
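+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library): the */
+/* logical operations (And, Or, Invert) accept only non-negative */
+/* finite integers whose digits are all 0 or 1; anything else gives */
+/* a NaN with Invalid_operation. Context setup as in the earlier */
+/* sketches. */
+/* */
+/* decNumber a, b, r; */
+/* decNumberFromString(&a, "1101", &set); */
+/* decNumberFromString(&b, "110", &set); */
+/* decNumberAnd(&r, &a, &b, &set); // r is 100 (digitwise AND) */
+/* decNumberFromString(&b, "12", &set); */
+/* decNumberAnd(&r, &a, &b, &set); // NaN, Invalid_operation */
+/* ------------------------------------------------------------------ */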
+
+/* ------------------------------------------------------------------ */
+/* decNumberCompare -- compare two Numbers */
+/* */
+/* This computes C = A ? B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for one digit (or NaN). */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberCompare(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decCompareOp(res, lhs, rhs, set, COMPARE, &status);
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberCompare
+
+/* ------------------------------------------------------------------ */
+/* decNumberCompareSignal -- compare, signalling on all NaNs */
+/* */
+/* This computes C = A ? B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for one digit (or NaN). */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberCompareSignal(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decCompareOp(res, lhs, rhs, set, COMPSIG, &status);
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberCompareSignal
+
+/* ------------------------------------------------------------------ */
+/* decNumberCompareTotal -- compare two Numbers, using total ordering */
+/* */
+/* This computes C = A ? B, under total ordering */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for one digit; the result will always be one of */
+/* -1, 0, or 1. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberCompareTotal(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decCompareOp(res, lhs, rhs, set, COMPTOTAL, &status);
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberCompareTotal
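+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library): the */
+/* difference between numerical comparison and total ordering. */
+/* Context setup as in the earlier sketches. */
+/* */
+/* decNumber a, b, r; */
+/* decNumberFromString(&a, "1.23", &set); */
+/* decNumberFromString(&b, "1.2300", &set); */
+/* decNumberCompare(&r, &a, &b, &set); // r is 0: equal in value */
+/* decNumberCompareTotal(&r, &a, &b, &set); // r is 1: 1.2300 */
+/* // precedes 1.23 in the total order (smaller exponent first) */
+/* ------------------------------------------------------------------ */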
+
+/* ------------------------------------------------------------------ */
+/* decNumberCompareTotalMag -- compare, total ordering of magnitudes */
+/* */
+/* This computes C = |A| ? |B|, under total ordering */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for one digit; the result will always be one of */
+/* -1, 0, or 1. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberCompareTotalMag(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ uInt needbytes; // for space calculations
+ decNumber bufa[D2N(DECBUFFER+1)];// +1 in case DECBUFFER=0
+ decNumber *allocbufa=NULL; // -> allocated bufa, iff allocated
+ decNumber bufb[D2N(DECBUFFER+1)];
+ decNumber *allocbufb=NULL; // -> allocated bufb, iff allocated
+ decNumber *a, *b; // temporary pointers
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ // if either is negative, take a copy and absolute
+ if (decNumberIsNegative(lhs)) { // lhs<0
+ a=bufa;
+ needbytes=sizeof(decNumber)+(D2U(lhs->digits)-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufa)) { // need malloc space
+ allocbufa=(decNumber *)malloc(needbytes);
+ if (allocbufa==NULL) { // hopeless -- abandon
+ status|=DEC_Insufficient_storage;
+ break;}
+ a=allocbufa; // use the allocated space
+ }
+ decNumberCopy(a, lhs); // copy content
+ a->bits&=~DECNEG; // .. and clear the sign
+ lhs=a; // use copy from here on
+ }
+ if (decNumberIsNegative(rhs)) { // rhs<0
+ b=bufb;
+ needbytes=sizeof(decNumber)+(D2U(rhs->digits)-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufb)) { // need malloc space
+ allocbufb=(decNumber *)malloc(needbytes);
+ if (allocbufb==NULL) { // hopeless -- abandon
+ status|=DEC_Insufficient_storage;
+ break;}
+ b=allocbufb; // use the allocated space
+ }
+ decNumberCopy(b, rhs); // copy content
+ b->bits&=~DECNEG; // .. and clear the sign
+ rhs=b; // use copy from here on
+ }
+ decCompareOp(res, lhs, rhs, set, COMPTOTAL, &status);
+ } while(0); // end protected
+
+ if (allocbufa!=NULL) free(allocbufa); // drop any storage used
+ if (allocbufb!=NULL) free(allocbufb); // ..
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberCompareTotalMag
+
+/* ------------------------------------------------------------------ */
+/* decNumberDivide -- divide one number by another */
+/* */
+/* This computes C = A / B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X/X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberDivide(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decDivideOp(res, lhs, rhs, set, DIVIDE, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberDivide
+
+/* ------------------------------------------------------------------ */
+/* decNumberDivideInteger -- divide and return integer quotient */
+/* */
+/* This computes C = A # B, where # is the integer divide operator */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X#X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberDivideInteger(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decDivideOp(res, lhs, rhs, set, DIVIDEINT, &status);
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberDivideInteger
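+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library), */
+/* contrasting the two divisions; context setup as in the earlier */
+/* sketches. */
+/* */
+/* decNumber a, b, r; */
+/* decNumberFromString(&a, "7", &set); */
+/* decNumberFromString(&b, "2", &set); */
+/* decNumberDivide(&r, &a, &b, &set); // r is 3.5 */
+/* decNumberDivideInteger(&r, &a, &b, &set); // r is 3 */
+/* ------------------------------------------------------------------ */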
+
+/* ------------------------------------------------------------------ */
+/* decNumberExp -- exponentiation */
+/* */
+/* This computes C = exp(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context; note that rounding mode has no effect */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Mathematical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* */
+/* Finite results will always be full precision and Inexact, except */
+/* when A is a zero or -Infinity (giving 1 or 0 respectively). */
+/* */
+/* An Inexact result is rounded using DEC_ROUND_HALF_EVEN; it will */
+/* almost always be correctly rounded, but may be up to 1 ulp in */
+/* error in rare cases. */
+/* ------------------------------------------------------------------ */
+/* This is a wrapper for decExpOp which can handle the slightly wider */
+/* (double) range needed by Ln (which has to be able to calculate */
+/* exp(-a) where a can be the tiniest number (Ntiny)). */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberExp(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ uInt status=0; // accumulator
+ #if DECSUBSET
+ decNumber *allocrhs=NULL; // non-NULL if rounded rhs allocated
+ #endif
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ // Check restrictions; these restrictions ensure that if h=8 (see
+ // decExpOp) then the result will either overflow or underflow to 0.
+ // Other math functions restrict the input range, too, for inverses.
+ // If not violated then carry out the operation.
+ if (!decCheckMath(rhs, set, &status)) do { // protect allocation
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operand and set lostDigits status, as needed
+ if (rhs->digits>set->digits) {
+ allocrhs=decRoundOperand(rhs, set, &status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ decExpOp(res, rhs, set, &status);
+ } while(0); // end protected
+
+ #if DECSUBSET
+ if (allocrhs !=NULL) free(allocrhs); // drop any storage used
+ #endif
+ // apply significant status
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberExp
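+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library); */
+/* context setup as in the earlier sketches, but with set.digits=16 */
+/* for this example: */
+/* */
+/* decNumber x, r; */
+/* decNumberFromString(&x, "1", &set); */
+/* decNumberExp(&r, &x, &set); // r is 2.718281828459045 */
+/* // set.status gains DEC_Inexact|DEC_Rounded (result not exact) */
+/* ------------------------------------------------------------------ */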
+
+/* ------------------------------------------------------------------ */
+/* decNumberFMA -- fused multiply add */
+/* */
+/* This computes D = (A * B) + C with only one rounding */
+/* */
+/* res is D, the result. D may be A or B or C (e.g., X=FMA(X,X,X)) */
+/* lhs is A */
+/* rhs is B */
+/* fhs is C [far hand side] */
+/* set is the context */
+/* */
+/* Mathematical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* */
+/* D must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberFMA(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, const decNumber *fhs,
+ decContext *set) {
+ uInt status=0; // accumulator
+ decContext dcmul; // context for the multiplication
+ uInt needbytes; // for space calculations
+ decNumber bufa[D2N(DECBUFFER*2+1)];
+ decNumber *allocbufa=NULL; // -> allocated bufa, iff allocated
+ decNumber *acc; // accumulator pointer
+ decNumber dzero; // work
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ if (decCheckOperands(res, fhs, DECUNUSED, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) { // [undefined if subset]
+ status|=DEC_Invalid_operation;
+ break;}
+ #endif
+ // Check math restrictions [these ensure no overflow or underflow]
+ if ((!decNumberIsSpecial(lhs) && decCheckMath(lhs, set, &status))
+ || (!decNumberIsSpecial(rhs) && decCheckMath(rhs, set, &status))
+ || (!decNumberIsSpecial(fhs) && decCheckMath(fhs, set, &status))) break;
+ // set up context for multiply
+ dcmul=*set;
+ dcmul.digits=lhs->digits+rhs->digits; // just enough
+ // [The above may be an over-estimate for subset arithmetic, but that's OK]
+ dcmul.emax=DEC_MAX_EMAX; // effectively unbounded ..
+ dcmul.emin=DEC_MIN_EMIN; // [thanks to Math restrictions]
+ // set up decNumber space to receive the result of the multiply
+ acc=bufa; // may fit
+ needbytes=sizeof(decNumber)+(D2U(dcmul.digits)-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufa)) { // need malloc space
+ allocbufa=(decNumber *)malloc(needbytes);
+ if (allocbufa==NULL) { // hopeless -- abandon
+ status|=DEC_Insufficient_storage;
+ break;}
+ acc=allocbufa; // use the allocated space
+ }
+ // multiply with extended range and necessary precision
+ //printf("emin=%ld\n", dcmul.emin);
+ decMultiplyOp(acc, lhs, rhs, &dcmul, &status);
+ // Only Invalid operation (from sNaN or Inf * 0) is possible in
+ // status; if either is seen then ignore fhs (in case it is
+ // another sNaN) and set acc to NaN unless we had an sNaN
+ // [decMultiplyOp leaves that to caller]
+ // Note sNaN has to go through addOp to shorten payload if
+ // necessary
+ if ((status&DEC_Invalid_operation)!=0) {
+ if (!(status&DEC_sNaN)) { // must be a true Invalid
+ decNumberZero(res); // acc not yet set
+ res->bits=DECNAN;
+ break;
+ }
+ decNumberZero(&dzero); // make 0 (any non-NaN would do)
+ fhs=&dzero; // use that
+ }
+ #if DECCHECK
+ else { // multiply was OK
+ if (status!=0) printf("Status=%08lx after FMA multiply\n", (LI)status);
+ }
+ #endif
+ // add the third operand and result -> res, and all is done
+ decAddOp(res, acc, fhs, set, 0, &status);
+ } while(0); // end protected
+
+ if (allocbufa!=NULL) free(allocbufa); // drop any storage used
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberFMA
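+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library): the */
+/* single rounding of FMA versus separate multiply-then-add. With */
+/* set.digits=2 and set.round=DEC_ROUND_HALF_EVEN (other context */
+/* setup as in the earlier sketches): */
+/* */
+/* decNumber a, b, c, r; */
+/* decNumberFromString(&a, "1.5", &set); */
+/* decNumberFromString(&b, "1.5", &set); */
+/* decNumberFromString(&c, "-2.25", &set); */
+/* decNumberFMA(&r, &a, &b, &c, &set); // r is 0.00 (exact) */
+/* decNumberMultiply(&r, &a, &b, &set); // 2.25 rounds to 2.2 */
+/* decNumberAdd(&r, &r, &c, &set); // r is -0.05 */
+/* ------------------------------------------------------------------ */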
+
+/* ------------------------------------------------------------------ */
+/* decNumberInvert -- invert a Number, digitwise */
+/* */
+/* This computes C = ~A */
+/* */
+/* res is C, the result. C may be A (e.g., X=~X) */
+/* rhs is A */
+/* set is the context (used for result length and error report) */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Logical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberInvert(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ const Unit *ua, *msua; // -> operand and its msu
+ Unit *uc, *msuc; // -> result and its msu
+ Int msudigs; // digits in res msu
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ if (rhs->exponent!=0 || decNumberIsSpecial(rhs) || decNumberIsNegative(rhs)) {
+ decStatus(res, DEC_Invalid_operation, set);
+ return res;
+ }
+ // operand is valid
+ ua=rhs->lsu; // bottom-up
+ uc=res->lsu; // ..
+ msua=ua+D2U(rhs->digits)-1; // -> msu of rhs
+ msuc=uc+D2U(set->digits)-1; // -> msu of result
+ msudigs=MSUDIGITS(set->digits); // [faster than remainder]
+ for (; uc<=msuc; ua++, uc++) { // Unit loop
+ Unit a; // extract unit
+ Int i, j; // work
+ if (ua>msua) a=0;
+ else a=*ua;
+ *uc=0; // can now write back
+ // always need to examine all bits in rhs
+ // This loop could be unrolled and/or use BIN2BCD tables
+ for (i=0; i<DECDPUN; i++) {
+ if ((~a)&1) *uc=*uc+(Unit)powers[i]; // effect INVERT
+ j=a%10;
+ a=a/10;
+ if (j>1) {
+ decStatus(res, DEC_Invalid_operation, set);
+ return res;
+ }
+ if (uc==msuc && i==msudigs-1) break; // just did final digit
+ } // each digit
+ } // each unit
+ // [here uc-1 is the msu of the result]
+ res->digits=decGetDigits(res->lsu, uc-res->lsu);
+ res->exponent=0; // integer
+ res->bits=0; // sign=0
+ return res; // [no status to set]
+ } // decNumberInvert
+
+/* ------------------------------------------------------------------ */
+/* decNumberLn -- natural logarithm */
+/* */
+/* This computes C = ln(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context; note that rounding mode has no effect */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Notable cases: */
+/* A<0 -> Invalid */
+/* A=0 -> -Infinity (Exact) */
+/* A=+Infinity -> +Infinity (Exact) */
+/* A=1 exactly -> 0 (Exact) */
+/* */
+/* Mathematical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* */
+/* An Inexact result is rounded using DEC_ROUND_HALF_EVEN; it will */
+/* almost always be correctly rounded, but may be up to 1 ulp in */
+/* error in rare cases. */
+/* ------------------------------------------------------------------ */
+/* This is a wrapper for decLnOp which can handle the slightly wider */
+/* (+11) range needed by Ln, Log10, etc. (which may have to be able */
+/* to calculate at p+e+2). */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberLn(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ uInt status=0; // accumulator
+ #if DECSUBSET
+ decNumber *allocrhs=NULL; // non-NULL if rounded rhs allocated
+ #endif
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ // Check restrictions; this is a math function; if not violated
+ // then carry out the operation.
+ if (!decCheckMath(rhs, set, &status)) do { // protect allocation
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operand and set lostDigits status, as needed
+ if (rhs->digits>set->digits) {
+ allocrhs=decRoundOperand(rhs, set, &status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ // special check in subset for rhs=0
+ if (ISZERO(rhs)) { // +/- zeros -> error
+ status|=DEC_Invalid_operation;
+ break;}
+ } // extended=0
+ #endif
+ decLnOp(res, rhs, set, &status);
+ } while(0); // end protected
+
+ #if DECSUBSET
+ if (allocrhs !=NULL) free(allocrhs); // drop any storage used
+ #endif
+ // apply significant status
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberLn
+
+/* ------------------------------------------------------------------ */
+/* decNumberLogB -- get adjusted exponent, by 754 rules */
+/* */
+/* This computes C = adjustedexponent(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context, used only for digits and status */
+/* */
+/* For an unrounded result, digits may need to be 10 (A might have */
+/* 10**9 digits and an exponent of +999999999, or one digit and an */
+/* exponent of -1999999999). */
+/* */
+/* This returns the adjusted exponent of A after (in theory) padding */
+/* with zeros on the right to set->digits digits while keeping the */
+/* same value. The exponent is not limited by emin/emax. */
+/* */
+/* Notable cases: */
+/* A<0 -> Use |A| */
+/* A=0 -> -Infinity (Division by zero) */
+/* A=Infinite -> +Infinity (Exact) */
+/* A=1 exactly -> 0 (Exact) */
+/* NaNs are propagated as usual */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberLogB(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ uInt status=0; // accumulator
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ // NaNs as usual; Infinities return +Infinity; 0->oops
+ if (decNumberIsNaN(rhs)) decNaNs(res, rhs, NULL, set, &status);
+ else if (decNumberIsInfinite(rhs)) decNumberCopyAbs(res, rhs);
+ else if (decNumberIsZero(rhs)) {
+ decNumberZero(res); // prepare for Infinity
+ res->bits=DECNEG|DECINF; // -Infinity
+ status|=DEC_Division_by_zero; // as per 754
+ }
+ else { // finite non-zero
+ Int ae=rhs->exponent+rhs->digits-1; // adjusted exponent
+ if (set->digits>=10) decNumberFromInt32(res, ae); // lay it out
+ else {
+ decNumber buft[D2N(10)]; // temporary number
+ decNumber *t=buft; // ..
+ decNumberFromInt32(t, ae); // lay it out
+ decNumberPlus(res, t, set); // round as necessary
+ }
+ }
+
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberLogB
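+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library); */
+/* context setup as in the earlier sketches. */
+/* */
+/* decNumber x, r; */
+/* decNumberFromString(&x, "250", &set); */
+/* decNumberLogB(&r, &x, &set); // r is 2 (adjusted exponent) */
+/* decNumberFromString(&x, "0.03", &set); */
+/* decNumberLogB(&r, &x, &set); // r is -2 */
+/* decNumberFromString(&x, "0", &set); */
+/* decNumberLogB(&r, &x, &set); // r is -Infinity; */
+/* // set.status gains DEC_Division_by_zero */
+/* ------------------------------------------------------------------ */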
+
+/* ------------------------------------------------------------------ */
+/* decNumberLog10 -- logarithm in base 10 */
+/* */
+/* This computes C = log10(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context; note that rounding mode has no effect */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Notable cases: */
+/* A<0 -> Invalid */
+/* A=0 -> -Infinity (Exact) */
+/* A=+Infinity -> +Infinity (Exact) */
+/* A=10**n (if n is an integer) -> n (Exact) */
+/* */
+/* Mathematical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* */
+/* An Inexact result is rounded using DEC_ROUND_HALF_EVEN; it will */
+/* almost always be correctly rounded, but may be up to 1 ulp in */
+/* error in rare cases. */
+/* ------------------------------------------------------------------ */
+/* This calculates ln(A)/ln(10) using appropriate precision. For */
+/* ln(A) this is the max(p, rhs->digits + t) + 3, where p is the */
+/* requested digits and t is the number of digits in the exponent */
+/* (maximum 6). For ln(10) it is p + 3; this is often handled by the */
+/* fastpath in decLnOp. The final division is done to the requested */
+/* precision. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberLog10(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ uInt status=0, ignore=0; // status accumulators
+ uInt needbytes; // for space calculations
+ Int p; // working precision
+ Int t; // digits in exponent of A
+
+ // buffers for a and b working decimals
+ // (adjustment calculator, same size)
+ decNumber bufa[D2N(DECBUFFER+2)];
+ decNumber *allocbufa=NULL; // -> allocated bufa, iff allocated
+ decNumber *a=bufa; // temporary a
+ decNumber bufb[D2N(DECBUFFER+2)];
+ decNumber *allocbufb=NULL; // -> allocated bufb, iff allocated
+ decNumber *b=bufb; // temporary b
+ decNumber bufw[D2N(10)]; // working 2-10 digit number
+ decNumber *w=bufw; // ..
+ #if DECSUBSET
+ decNumber *allocrhs=NULL; // non-NULL if rounded rhs allocated
+ #endif
+
+ decContext aset; // working context
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ // Check restrictions; this is a math function; if not violated
+ // then carry out the operation.
+ if (!decCheckMath(rhs, set, &status)) do { // protect malloc
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operand and set lostDigits status, as needed
+ if (rhs->digits>set->digits) {
+ allocrhs=decRoundOperand(rhs, set, &status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ // special check in subset for rhs=0
+ if (ISZERO(rhs)) { // +/- zeros -> error
+ status|=DEC_Invalid_operation;
+ break;}
+ } // extended=0
+ #endif
+
+ decContextDefault(&aset, DEC_INIT_DECIMAL64); // clean context
+
+ // handle exact powers of 10; only check if +ve finite
+ if (!(rhs->bits&(DECNEG|DECSPECIAL)) && !ISZERO(rhs)) {
+ Int residue=0; // (no residue)
+ uInt copystat=0; // clean status
+
+ // round to a single digit...
+ aset.digits=1;
+ decCopyFit(w, rhs, &aset, &residue, &copystat); // copy & shorten
+ // if exact and the digit is 1, rhs is a power of 10
+ if (!(copystat&DEC_Inexact) && w->lsu[0]==1) {
+ // the exponent, conveniently, is the power of 10; making
+ // this the result needs a little care as it might not fit,
+ // so first convert it into the working number, and then move
+ // to res
+ decNumberFromInt32(w, w->exponent);
+ residue=0;
+ decCopyFit(res, w, set, &residue, &status); // copy & round
+ decFinish(res, set, &residue, &status); // cleanup/set flags
+ break;
+ } // not a power of 10
+ } // not a candidate for exact
+
+ // simplify the information-content calculation to use 'total
+ // number of digits in a, including exponent' as compared to the
+ // requested digits, as increasing this will only rarely cost an
+ // iteration in ln(a) anyway
+ t=6; // it can never be >6
+
+ // allocate space when needed...
+ p=(rhs->digits+t>set->digits?rhs->digits+t:set->digits)+3;
+ needbytes=sizeof(decNumber)+(D2U(p)-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufa)) { // need malloc space
+ allocbufa=(decNumber *)malloc(needbytes);
+ if (allocbufa==NULL) { // hopeless -- abandon
+ status|=DEC_Insufficient_storage;
+ break;}
+ a=allocbufa; // use the allocated space
+ }
+ aset.digits=p; // as calculated
+ aset.emax=DEC_MAX_MATH; // usual bounds
+ aset.emin=-DEC_MAX_MATH; // ..
+ aset.clamp=0; // and no concrete format
+ decLnOp(a, rhs, &aset, &status); // a=ln(rhs)
+
+ // skip the division if the result so far is infinite, NaN, or
+ // zero, or there was an error; note NaN from sNaN needs copy
+ if (status&DEC_NaNs && !(status&DEC_sNaN)) break;
+ if (a->bits&DECSPECIAL || ISZERO(a)) {
+ decNumberCopy(res, a); // [will fit]
+ break;}
+
+ // for ln(10) an extra 3 digits of precision are needed
+ p=set->digits+3;
+ needbytes=sizeof(decNumber)+(D2U(p)-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufb)) { // need malloc space
+ allocbufb=(decNumber *)malloc(needbytes);
+ if (allocbufb==NULL) { // hopeless -- abandon
+ status|=DEC_Insufficient_storage;
+ break;}
+ b=allocbufb; // use the allocated space
+ }
+ decNumberZero(w); // set up 10...
+ #if DECDPUN==1
+ w->lsu[1]=1; w->lsu[0]=0; // ..
+ #else
+ w->lsu[0]=10; // ..
+ #endif
+ w->digits=2; // ..
+
+ aset.digits=p;
+ decLnOp(b, w, &aset, &ignore); // b=ln(10)
+
+ aset.digits=set->digits; // for final divide
+ decDivideOp(res, a, b, &aset, DIVIDE, &status); // into result
+ } while(0); // [for break]
+
+ if (allocbufa!=NULL) free(allocbufa); // drop any storage used
+ if (allocbufb!=NULL) free(allocbufb); // ..
+ #if DECSUBSET
+ if (allocrhs !=NULL) free(allocrhs); // ..
+ #endif
+ // apply significant status
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberLog10
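+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library): the */
+/* exact power-of-10 fastpath described above; context setup as in */
+/* the earlier sketches. */
+/* */
+/* decNumber x, r; */
+/* decNumberFromString(&x, "1000", &set); */
+/* decNumberLog10(&r, &x, &set); // r is 3, exactly (no Inexact) */
+/* decNumberFromString(&x, "2", &set); */
+/* decNumberLog10(&r, &x, &set); // r is 0.301029995... (Inexact) */
+/* ------------------------------------------------------------------ */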
+
+/* ------------------------------------------------------------------ */
+/* decNumberMax -- compare two Numbers and return the maximum */
+/* */
+/* This computes C = A ? B, returning the maximum by 754 rules */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberMax(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decCompareOp(res, lhs, rhs, set, COMPMAX, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberMax
+
+/* ------------------------------------------------------------------ */
+/* decNumberMaxMag -- compare and return the maximum by magnitude */
+/* */
+/* This computes C = A ? B, returning the larger by magnitude, by */
+/* 754 rules */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberMaxMag(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decCompareOp(res, lhs, rhs, set, COMPMAXMAG, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberMaxMag
+
+/* ------------------------------------------------------------------ */
+/* decNumberMin -- compare two Numbers and return the minimum */
+/* */
+/* This computes C = A ? B, returning the minimum by 754 rules */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberMin(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decCompareOp(res, lhs, rhs, set, COMPMIN, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberMin
+
+/* ------------------------------------------------------------------ */
+/* decNumberMinMag -- compare and return the minimum by magnitude */
+/* */
+/* This computes C = A ? B, returning the smaller by magnitude, by */
+/* 754 rules */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberMinMag(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decCompareOp(res, lhs, rhs, set, COMPMINMAG, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberMinMag
+
+/* ------------------------------------------------------------------ */
+/* decNumberMinus -- prefix minus operator */
+/* */
+/* This computes C = 0 - A */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context */
+/* */
+/* See also decNumberCopyNegate for a quiet bitwise version of this. */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+/* Simply use AddOp for the subtract, which will do the necessary. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberMinus(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ decNumber dzero;
+ uInt status=0; // accumulator
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ decNumberZero(&dzero); // make 0
+ dzero.exponent=rhs->exponent; // [no coefficient expansion]
+ decAddOp(res, &dzero, rhs, set, DECNEG, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberMinus
+
+/* ------------------------------------------------------------------ */
+/* decNumberNextMinus -- next towards -Infinity */
+/* */
+/* This computes C = A - infinitesimal, rounded towards -Infinity */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context */
+/* */
+/* This is a generalization of 754 NextDown. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberNextMinus(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ decNumber dtiny; // constant
+ decContext workset=*set; // work
+ uInt status=0; // accumulator
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ // +Infinity is the special case
+ if ((rhs->bits&(DECINF|DECNEG))==DECINF) {
+ decSetMaxValue(res, set); // is +ve
+ // there is no status to set
+ return res;
+ }
+ decNumberZero(&dtiny); // start with 0
+ dtiny.lsu[0]=1; // make number that is ..
+ dtiny.exponent=DEC_MIN_EMIN-1; // .. smaller than tiniest
+ workset.round=DEC_ROUND_FLOOR;
+ decAddOp(res, rhs, &dtiny, &workset, DECNEG, &status);
+ status&=DEC_Invalid_operation|DEC_sNaN; // only sNaN Invalid please
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberNextMinus
+
+/* ------------------------------------------------------------------ */
+/* decNumberNextPlus -- next towards +Infinity */
+/* */
+/* This computes C = A + infinitesimal, rounded towards +Infinity */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context */
+/* */
+/* This is a generalization of 754 NextUp. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberNextPlus(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ decNumber dtiny; // constant
+ decContext workset=*set; // work
+ uInt status=0; // accumulator
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ // -Infinity is the special case
+ if ((rhs->bits&(DECINF|DECNEG))==(DECINF|DECNEG)) {
+ decSetMaxValue(res, set);
+ res->bits=DECNEG; // negative
+ // there is no status to set
+ return res;
+ }
+ decNumberZero(&dtiny); // start with 0
+ dtiny.lsu[0]=1; // make number that is ..
+ dtiny.exponent=DEC_MIN_EMIN-1; // .. smaller than tiniest
+ workset.round=DEC_ROUND_CEILING;
+ decAddOp(res, rhs, &dtiny, &workset, 0, &status);
+ status&=DEC_Invalid_operation|DEC_sNaN; // only sNaN Invalid please
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberNextPlus
+
+/* ------------------------------------------------------------------ */
+/* decNumberNextToward -- next towards rhs */
+/* */
+/* This computes C = A +/- infinitesimal, rounded towards */
+/* +/-Infinity in the direction of B, as per the 754-1985 nextafter */
+/* operation (modified during the 754 revision but dropped from */
+/* 754-2008). */
+/* */
+/* res is C, the result. C may be A or B. */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* This is a generalization of 754-1985 NextAfter. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberNextToward(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ decNumber dtiny; // constant
+ decContext workset=*set; // work
+ Int result; // ..
+ uInt status=0; // accumulator
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs)) {
+ decNaNs(res, lhs, rhs, set, &status);
+ }
+ else { // Is numeric, so no chance of sNaN Invalid, etc.
+ result=decCompare(lhs, rhs, 0); // sign matters
+ if (result==BADINT) status|=DEC_Insufficient_storage; // rare
+ else { // valid compare
+ if (result==0) decNumberCopySign(res, lhs, rhs); // easy
+ else { // differ: need NextPlus or NextMinus
+ uByte sub; // add or subtract
+ if (result<0) { // lhs<rhs, do nextplus
+ // -Infinity is the special case
+ if ((lhs->bits&(DECINF|DECNEG))==(DECINF|DECNEG)) {
+ decSetMaxValue(res, set);
+ res->bits=DECNEG; // negative
+ return res; // there is no status to set
+ }
+ workset.round=DEC_ROUND_CEILING;
+ sub=0; // add, please
+ } // plus
+ else { // lhs>rhs, do nextminus
+ // +Infinity is the special case
+ if ((lhs->bits&(DECINF|DECNEG))==DECINF) {
+ decSetMaxValue(res, set);
+ return res; // there is no status to set
+ }
+ workset.round=DEC_ROUND_FLOOR;
+ sub=DECNEG; // subtract, please
+ } // minus
+ decNumberZero(&dtiny); // start with 0
+ dtiny.lsu[0]=1; // make number that is ..
+ dtiny.exponent=DEC_MIN_EMIN-1; // .. smaller than tiniest
+ decAddOp(res, lhs, &dtiny, &workset, sub, &status); // + or -
+ // turn off exceptions if the result is a normal number
+ // (including Nmin), otherwise let all status through
+ if (decNumberIsNormal(res, set)) status=0;
+ } // unequal
+ } // compare OK
+ } // numeric
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberNextToward
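+
+/* ------------------------------------------------------------------ */
+/* Usage sketch (editorial addition, not part of the library); with */
+/* set.digits=7 (other context setup as in the earlier sketches): */
+/* */
+/* decNumber x, y, r; */
+/* decNumberFromString(&x, "1", &set); */
+/* decNumberFromString(&y, "10", &set); */
+/* decNumberNextToward(&r, &x, &y, &set); // r is 1.000001 */
+/* decNumberFromString(&y, "-10", &set); */
+/* decNumberNextToward(&r, &x, &y, &set); // r is 0.9999999 */
+/* ------------------------------------------------------------------ */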
+
+/* ------------------------------------------------------------------ */
+/* decNumberOr -- OR two Numbers, digitwise */
+/* */
+/* This computes C = A | B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X|X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context (used for result length and error report) */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Logical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberOr(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ const Unit *ua, *ub; // -> operands
+ const Unit *msua, *msub; // -> operand msus
+ Unit *uc, *msuc; // -> result and its msu
+ Int msudigs; // digits in res msu
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ if (lhs->exponent!=0 || decNumberIsSpecial(lhs) || decNumberIsNegative(lhs)
+ || rhs->exponent!=0 || decNumberIsSpecial(rhs) || decNumberIsNegative(rhs)) {
+ decStatus(res, DEC_Invalid_operation, set);
+ return res;
+ }
+ // operands are valid
+ ua=lhs->lsu; // bottom-up
+ ub=rhs->lsu; // ..
+ uc=res->lsu; // ..
+ msua=ua+D2U(lhs->digits)-1; // -> msu of lhs
+ msub=ub+D2U(rhs->digits)-1; // -> msu of rhs
+ msuc=uc+D2U(set->digits)-1; // -> msu of result
+ msudigs=MSUDIGITS(set->digits); // [faster than remainder]
+ for (; uc<=msuc; ua++, ub++, uc++) { // Unit loop
+ Unit a, b; // extract units
+ if (ua>msua) a=0;
+ else a=*ua;
+ if (ub>msub) b=0;
+ else b=*ub;
+ *uc=0; // can now write back
+ if (a|b) { // maybe 1 bits to examine
+ Int i, j;
+ // This loop could be unrolled and/or use BIN2BCD tables
+ for (i=0; i<DECDPUN; i++) {
+ if ((a|b)&1) *uc=*uc+(Unit)powers[i]; // effect OR
+ j=a%10;
+ a=a/10;
+ j|=b%10;
+ b=b/10;
+ if (j>1) {
+ decStatus(res, DEC_Invalid_operation, set);
+ return res;
+ }
+ if (uc==msuc && i==msudigs-1) break; // just did final digit
+ } // each digit
+ } // non-zero
+ } // each unit
+ // [here uc-1 is the msu of the result]
+ res->digits=decGetDigits(res->lsu, uc-res->lsu);
+ res->exponent=0; // integer
+ res->bits=0; // sign=0
+ return res; // [no status to set]
+ } // decNumberOr
+
+/* ------------------------------------------------------------------ */
+/* decNumberPlus -- prefix plus operator */
+/* */
+/* This computes C = 0 + A */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context */
+/* */
+/* See also decNumberCopy for a quiet bitwise version of this. */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+/* This simply uses AddOp; Add will take fast path after preparing A. */
+/* Performance is a concern here, as this routine is often used to */
+/* check operands and apply rounding and overflow/underflow testing. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberPlus(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ decNumber dzero;
+ uInt status=0; // accumulator
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ decNumberZero(&dzero); // make 0
+ dzero.exponent=rhs->exponent; // [no coefficient expansion]
+ decAddOp(res, &dzero, rhs, set, 0, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberPlus
+
+/* ------------------------------------------------------------------ */
+/* decNumberMultiply -- multiply two Numbers */
+/* */
+/* This computes C = A x B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X*X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberMultiply(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decMultiplyOp(res, lhs, rhs, set, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberMultiply
+
+/* ------------------------------------------------------------------ */
+/* decNumberPower -- raise a number to a power */
+/* */
+/* This computes C = A ** B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X**X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Mathematical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* */
+/* However, if -1999999997<=B<=999999999 and B is an integer then the */
+/* restrictions on A and the context are relaxed to the usual bounds, */
+/* for compatibility with the earlier (integer power only) version */
+/* of this function. */
+/* */
+/* When B is an integer, the result may be exact, even if rounded. */
+/* */
+/* The final result is rounded according to the context; it will */
+/* almost always be correctly rounded, but may be up to 1 ulp in */
+/* error in rare cases. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberPower(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ #if DECSUBSET
+ decNumber *alloclhs=NULL; // non-NULL if rounded lhs allocated
+ decNumber *allocrhs=NULL; // .., rhs
+ #endif
+ decNumber *allocdac=NULL; // -> allocated acc buffer, iff used
+ decNumber *allocinv=NULL; // -> allocated 1/x buffer, iff used
+ Int reqdigits=set->digits; // requested DIGITS
+ Int n; // rhs in binary
+ Flag rhsint=0; // 1 if rhs is an integer
+ Flag useint=0; // 1 if can use integer calculation
+ Flag isoddint=0; // 1 if rhs is an integer and odd
+ Int i; // work
+ #if DECSUBSET
+ Int dropped; // ..
+ #endif
+ uInt needbytes; // buffer size needed
+ Flag seenbit; // seen a bit while powering
+ Int residue=0; // rounding residue
+ uInt status=0; // accumulators
+ uByte bits=0; // result sign if errors
+ decContext aset; // working context
+ decNumber dnOne; // work value 1...
+ // local accumulator buffer [a decNumber, with digits+elength+1 digits]
+ decNumber dacbuff[D2N(DECBUFFER+9)];
+ decNumber *dac=dacbuff; // -> result accumulator
+ // same again for possible 1/lhs calculation
+ decNumber invbuff[D2N(DECBUFFER+9)];
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) { // reduce operands and set status, as needed
+ if (lhs->digits>reqdigits) {
+ alloclhs=decRoundOperand(lhs, set, &status);
+ if (alloclhs==NULL) break;
+ lhs=alloclhs;
+ }
+ if (rhs->digits>reqdigits) {
+ allocrhs=decRoundOperand(rhs, set, &status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ // [following code does not require input rounding]
+
+ // handle NaNs and rhs Infinity (lhs infinity is harder)
+ if (SPECIALARGS) {
+ if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs)) { // NaNs
+ decNaNs(res, lhs, rhs, set, &status);
+ break;}
+ if (decNumberIsInfinite(rhs)) { // rhs Infinity
+ Flag rhsneg=rhs->bits&DECNEG; // save rhs sign
+ if (decNumberIsNegative(lhs) // lhs<0
+ && !decNumberIsZero(lhs)) // ..
+ status|=DEC_Invalid_operation;
+ else { // lhs >=0
+ decNumberZero(&dnOne); // set up 1
+ dnOne.lsu[0]=1;
+ decNumberCompare(dac, lhs, &dnOne, set); // lhs ? 1
+ decNumberZero(res); // prepare for 0/1/Infinity
+ if (decNumberIsNegative(dac)) { // lhs<1
+ if (rhsneg) res->bits|=DECINF; // +Infinity [else is +0]
+ }
+ else if (dac->lsu[0]==0) { // lhs=1
+ // 1**Infinity is inexact, so return fully-padded 1.0000
+ Int shift=set->digits-1;
+ *res->lsu=1; // was 0, make int 1
+ res->digits=decShiftToMost(res->lsu, 1, shift);
+ res->exponent=-shift; // make 1.0000...
+ status|=DEC_Inexact|DEC_Rounded; // deemed inexact
+ }
+ else { // lhs>1
+ if (!rhsneg) res->bits|=DECINF; // +Infinity [else is +0]
+ }
+ } // lhs>=0
+ break;}
+ // [lhs infinity drops through]
+ } // specials
+
+ // Original rhs may be an integer that fits and is in range
+ n=decGetInt(rhs);
+ if (n!=BADINT) { // it is an integer
+ rhsint=1; // record the fact for 1**n
+ isoddint=(Flag)n&1; // [works even if big]
+ if (n!=BIGEVEN && n!=BIGODD) // can use integer path?
+ useint=1; // looks good
+ }
+
+ if (decNumberIsNegative(lhs) // -x ..
+ && isoddint) bits=DECNEG; // .. to an odd power
+
+ // handle LHS infinity
+ if (decNumberIsInfinite(lhs)) { // [NaNs already handled]
+ uByte rbits=rhs->bits; // save
+ decNumberZero(res); // prepare
+ if (n==0) *res->lsu=1; // [-]Inf**0 => 1
+ else {
+ // -Inf**nonint -> error
+ if (!rhsint && decNumberIsNegative(lhs)) {
+ status|=DEC_Invalid_operation; // -Inf**nonint is error
+ break;}
+ if (!(rbits & DECNEG)) bits|=DECINF; // was not a **-n
+ // [otherwise will be 0 or -0]
+ res->bits=bits;
+ }
+ break;}
+
+ // similarly handle LHS zero
+ if (decNumberIsZero(lhs)) {
+ if (n==0) { // 0**0 => Error
+ #if DECSUBSET
+ if (!set->extended) { // [unless subset]
+ decNumberZero(res);
+ *res->lsu=1; // return 1
+ break;}
+ #endif
+ status|=DEC_Invalid_operation;
+ }
+ else { // 0**x
+ uByte rbits=rhs->bits; // save
+ if (rbits & DECNEG) { // was a 0**(-n)
+ #if DECSUBSET
+ if (!set->extended) { // [bad if subset]
+ status|=DEC_Invalid_operation;
+ break;}
+ #endif
+ bits|=DECINF;
+ }
+ decNumberZero(res); // prepare
+ // [otherwise will be 0 or -0]
+ res->bits=bits;
+ }
+ break;}
+
+ // here both lhs and rhs are finite; rhs==0 is handled in the
+ // integer path. Next handle the non-integer cases
+ if (!useint) { // non-integral rhs
+ // any -ve lhs is bad, as is either operand or context out of
+ // bounds
+ if (decNumberIsNegative(lhs)) {
+ status|=DEC_Invalid_operation;
+ break;}
+ if (decCheckMath(lhs, set, &status)
+ || decCheckMath(rhs, set, &status)) break; // variable status
+
+ decContextDefault(&aset, DEC_INIT_DECIMAL64); // clean context
+ aset.emax=DEC_MAX_MATH; // usual bounds
+ aset.emin=-DEC_MAX_MATH; // ..
+ aset.clamp=0; // and no concrete format
+
+ // calculate the result using exp(ln(lhs)*rhs), which can
+ // all be done into the accumulator, dac. The precision needed
+ // is enough to contain the full information in the lhs (which
+ // is the total digits, including exponent), or the requested
+ // precision, if larger, + 4; 6 is used for the exponent
+ // maximum length, and this is also used when it is shorter
+ // than the requested digits as it greatly reduces the >0.5 ulp
+ // cases at little cost (because Ln doubles digits each
+ // iteration so a few extra digits rarely causes an extra
+ // iteration)
+ aset.digits=MAXI(lhs->digits, set->digits)+6+4;
+ } // non-integer rhs
+
+ else { // rhs is in-range integer
+ if (n==0) { // x**0 = 1
+ // (0**0 was handled above)
+ decNumberZero(res); // result=1
+ *res->lsu=1; // ..
+ break;}
+ // rhs is a non-zero integer
+ if (n<0) n=-n; // use abs(n)
+
+ aset=*set; // clone the context
+ aset.round=DEC_ROUND_HALF_EVEN; // internally use balanced
+ // calculate the working DIGITS
+ aset.digits=reqdigits+(rhs->digits+rhs->exponent)+2;
+ #if DECSUBSET
+ if (!set->extended) aset.digits--; // use classic precision
+ #endif
+ // it's an error if this is more than can be handled
+ if (aset.digits>DECNUMMAXP) {status|=DEC_Invalid_operation; break;}
+ } // integer path
+
+ // aset.digits is the count of digits for the accumulator needed
+ // if accumulator is too long for local storage, then allocate
+ needbytes=sizeof(decNumber)+(D2U(aset.digits)-1)*sizeof(Unit);
+ // [needbytes also used below if 1/lhs needed]
+ if (needbytes>sizeof(dacbuff)) {
+ allocdac=(decNumber *)malloc(needbytes);
+ if (allocdac==NULL) { // hopeless -- abandon
+ status|=DEC_Insufficient_storage;
+ break;}
+ dac=allocdac; // use the allocated space
+ }
+ // here, aset is set up and accumulator is ready for use
+
+ if (!useint) { // non-integral rhs
+ // x ** y; special-case x=1 here as it will otherwise always
+ // reduce to integer 1; decLnOp has a fastpath which detects
+ // the case of x=1
+ decLnOp(dac, lhs, &aset, &status); // dac=ln(lhs)
+ // [no error possible, as lhs 0 already handled]
+ if (ISZERO(dac)) { // x==1, 1.0, etc.
+ // need to return fully-padded 1.0000 etc., but rhsint->1
+ *dac->lsu=1; // was 0, make int 1
+ if (!rhsint) { // add padding
+ Int shift=set->digits-1;
+ dac->digits=decShiftToMost(dac->lsu, 1, shift);
+ dac->exponent=-shift; // make 1.0000...
+ status|=DEC_Inexact|DEC_Rounded; // deemed inexact
+ }
+ }
+ else {
+ decMultiplyOp(dac, dac, rhs, &aset, &status); // dac=dac*rhs
+ decExpOp(dac, dac, &aset, &status); // dac=exp(dac)
+ }
+ // and drop through for final rounding
+ } // non-integer rhs
+
+ else { // carry on with integer
+ decNumberZero(dac); // acc=1
+ *dac->lsu=1; // ..
+
+ // if a negative power the constant 1 is needed, and if not subset
+ // invert the lhs now rather than inverting the result later
+ if (decNumberIsNegative(rhs)) { // was a **-n [hence digits>0]
+ decNumber *inv=invbuff; // assume use of fixed buffer
+ decNumberCopy(&dnOne, dac); // dnOne=1; [needed now or later]
+ #if DECSUBSET
+ if (set->extended) { // need to calculate 1/lhs
+ #endif
+ // divide lhs into 1, putting result in dac [dac=1/dac]
+ decDivideOp(dac, &dnOne, lhs, &aset, DIVIDE, &status);
+ // now locate or allocate space for the inverted lhs
+ if (needbytes>sizeof(invbuff)) {
+ allocinv=(decNumber *)malloc(needbytes);
+ if (allocinv==NULL) { // hopeless -- abandon
+ status|=DEC_Insufficient_storage;
+ break;}
+ inv=allocinv; // use the allocated space
+ }
+ // [inv now points to big-enough buffer or allocated storage]
+ decNumberCopy(inv, dac); // copy the 1/lhs
+ decNumberCopy(dac, &dnOne); // restore acc=1
+ lhs=inv; // .. and go forward with new lhs
+ #if DECSUBSET
+ }
+ #endif
+ }
+
+ // Raise-to-the-power loop...
+ seenbit=0; // set once a 1-bit is encountered
+ for (i=1;;i++){ // for each bit [top bit ignored]
+ // abandon if had overflow or terminal underflow
+ if (status & (DEC_Overflow|DEC_Underflow)) { // interesting?
+ if (status&DEC_Overflow || ISZERO(dac)) break;
+ }
+ // [the following two lines revealed an optimizer bug in a C++
+ // compiler, with symptom: 5**3 -> 25, when n=n+n was used]
+ n=n<<1; // move next bit to testable position
+ if (n<0) { // top bit is set
+ seenbit=1; // OK, significant bit seen
+ decMultiplyOp(dac, dac, lhs, &aset, &status); // dac=dac*x
+ }
+ if (i==31) break; // that was the last bit
+ if (!seenbit) continue; // no need to square 1
+ decMultiplyOp(dac, dac, dac, &aset, &status); // dac=dac*dac [square]
+ } /*i*/ // 32 bits
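+
+    // [Worked example: for n=5 (binary ...0101) the loop skips the 28
+    // leading 0 bits (seenbit stays 0), then at the first 1-bit sets
+    // dac=1*x and squares to give x**2; the next bit is 0 so it only
+    // squares, giving x**4; the final 1-bit multiplies to give x**5
+    // and the loop ends]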
+
+ // complete internal overflow or underflow processing
+ if (status & (DEC_Overflow|DEC_Underflow)) {
+ #if DECSUBSET
+      // If subset, and power was negative, reverse the kind of over/underflow
+ // [1/x not yet done]
+ if (!set->extended && decNumberIsNegative(rhs)) {
+ if (status & DEC_Overflow)
+ status^=DEC_Overflow | DEC_Underflow | DEC_Subnormal;
+ else { // trickier -- Underflow may or may not be set
+ status&=~(DEC_Underflow | DEC_Subnormal); // [one or both]
+ status|=DEC_Overflow;
+ }
+ }
+ #endif
+ dac->bits=(dac->bits & ~DECNEG) | bits; // force correct sign
+ // round subnormals [to set.digits rather than aset.digits]
+ // or set overflow result similarly as required
+ decFinalize(dac, set, &residue, &status);
+ decNumberCopy(res, dac); // copy to result (is now OK length)
+ break;
+ }
+
+ #if DECSUBSET
+ if (!set->extended && // subset math
+ decNumberIsNegative(rhs)) { // was a **-n [hence digits>0]
+ // so divide result into 1 [dac=1/dac]
+ decDivideOp(dac, &dnOne, dac, &aset, DIVIDE, &status);
+ }
+ #endif
+ } // rhs integer path
+
+ // reduce result to the requested length and copy to result
+ decCopyFit(res, dac, set, &residue, &status);
+ decFinish(res, set, &residue, &status); // final cleanup
+ #if DECSUBSET
+ if (!set->extended) decTrim(res, set, 0, 1, &dropped); // trailing zeros
+ #endif
+ } while(0); // end protected
+
+ if (allocdac!=NULL) free(allocdac); // drop any storage used
+ if (allocinv!=NULL) free(allocinv); // ..
+ #if DECSUBSET
+ if (alloclhs!=NULL) free(alloclhs); // ..
+ if (allocrhs!=NULL) free(allocrhs); // ..
+ #endif
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberPower
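+
+/* ------------------------------------------------------------------ */
+/* Usage sketch for decNumberPower (illustrative only; this assumes   */
+/* DECNUMDIGITS was defined large enough, e.g. 34, before including   */
+/* decNumber.h so that the stack decNumbers can hold the operands):   */
+/*                                                                    */
+/*   decContext set;                                                  */
+/*   decNumber a, b, r;                                               */
+/*   decContextDefault(&set, DEC_INIT_DECIMAL64);  // 16 digits       */
+/*   decNumberFromString(&a, "2", &set);                              */
+/*   decNumberFromString(&b, "10", &set);                             */
+/*   decNumberPower(&r, &a, &b, &set);             // r is now 1024   */
+/* ------------------------------------------------------------------ */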
+
+/* ------------------------------------------------------------------ */
+/* decNumberQuantize -- force exponent to requested value */
+/* */
+/* This computes C = op(A, B), where op adjusts the coefficient */
+/* of C (by rounding or shifting) such that the exponent (-scale) */
+/* of C has exponent of B. The numerical value of C will equal A, */
+/* except for the effects of any rounding that occurred. */
+/* */
+/* res is C, the result. C may be A or B */
+/* lhs is A, the number to adjust */
+/* rhs is B, the number with exponent to match */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Unless there is an error or the result is infinite, the exponent */
+/* after the operation is guaranteed to be equal to that of B. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberQuantize(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decQuantizeOp(res, lhs, rhs, set, 1, &status);
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberQuantize
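+
+// [Examples, per the decimal arithmetic specification: with enough
+// digits available, quantize(2.17, 0.001) gives 2.170, quantize(2.17,
+// 0.01) gives 2.17, and quantize(2.17, 1) gives 2 -- the coefficient
+// is padded or rounded so the exponent matches that of the rhs]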
+
+/* ------------------------------------------------------------------ */
+/* decNumberReduce -- remove trailing zeros */
+/* */
+/* This computes C = 0 + A, and normalizes the result */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+// Previously known as Normalize
+decNumber * decNumberNormalize(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ return decNumberReduce(res, rhs, set);
+ } // decNumberNormalize
+
+decNumber * decNumberReduce(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ #if DECSUBSET
+ decNumber *allocrhs=NULL; // non-NULL if rounded rhs allocated
+ #endif
+ uInt status=0; // as usual
+ Int residue=0; // as usual
+ Int dropped; // work
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operand and set lostDigits status, as needed
+ if (rhs->digits>set->digits) {
+ allocrhs=decRoundOperand(rhs, set, &status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ // [following code does not require input rounding]
+
+ // Infinities copy through; NaNs need usual treatment
+ if (decNumberIsNaN(rhs)) {
+ decNaNs(res, rhs, NULL, set, &status);
+ break;
+ }
+
+ // reduce result to the requested length and copy to result
+ decCopyFit(res, rhs, set, &residue, &status); // copy & round
+ decFinish(res, set, &residue, &status); // cleanup/set flags
+ decTrim(res, set, 1, 0, &dropped); // normalize in place
+ // [may clamp]
+ } while(0); // end protected
+
+ #if DECSUBSET
+ if (allocrhs !=NULL) free(allocrhs); // ..
+ #endif
+ if (status!=0) decStatus(res, status, set);// then report status
+ return res;
+ } // decNumberReduce
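+
+// [Examples: reduce(2.1) gives 2.1, reduce(1.200) gives 1.2, and
+// reduce(120.00) gives 12E+1 -- trailing zeros are dropped and the
+// exponent is raised accordingly]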
+
+/* ------------------------------------------------------------------ */
+/* decNumberRescale -- force exponent to requested value */
+/* */
+/* This computes C = op(A, B), where op adjusts the coefficient */
+/* of C (by rounding or shifting) such that the exponent (-scale) */
+/* of C has the value B. The numerical value of C will equal A, */
+/* except for the effects of any rounding that occurred. */
+/* */
+/* res is C, the result. C may be A or B */
+/* lhs is A, the number to adjust */
+/* rhs is B, the requested exponent */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Unless there is an error or the result is infinite, the exponent */
+/* after the operation is guaranteed to be equal to B. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberRescale(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decQuantizeOp(res, lhs, rhs, set, 0, &status);
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberRescale
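+
+// [Examples: rescale(2.17, -2) gives 2.17, rescale(2.17, -1) gives 2.2
+// under round-half-even, and rescale(2.17, 0) gives 2; unlike
+// quantize, the rhs here is the target exponent itself]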
+
+/* ------------------------------------------------------------------ */
+/* decNumberRemainder -- divide and return remainder */
+/* */
+/* This computes C = A % B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X%X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberRemainder(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decDivideOp(res, lhs, rhs, set, REMAINDER, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberRemainder
+
+/* ------------------------------------------------------------------ */
+/* decNumberRemainderNear -- divide and return remainder from nearest */
+/* */
+/* This computes C = A % B, where % is the IEEE remainder operator */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X%X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberRemainderNear(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ decDivideOp(res, lhs, rhs, set, REMNEAR, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberRemainderNear
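+
+// [Example of the difference: remainder(10, 6) gives 4, whereas
+// remainderNear(10, 6) gives -2, because the integer nearest to 10/6
+// is 2 and 10-(2*6) = -2]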
+
+/* ------------------------------------------------------------------ */
+/* decNumberRotate -- rotate the coefficient of a Number left/right */
+/* */
+/* This computes C = A rot B (in base ten and rotating set->digits */
+/* digits). */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=XrotX) */
+/* lhs is A */
+/* rhs is B, the number of digits to rotate (-ve to right) */
+/* set is the context */
+/* */
+/* The digits of the coefficient of A are rotated to the left (if B */
+/* is positive) or to the right (if B is negative) without adjusting */
+/* the exponent or the sign of A. If lhs->digits is less than */
+/* set->digits the coefficient is padded with zeros on the left */
+/* before the rotate. Any leading zeros in the result are removed */
+/* as usual. */
+/* */
+/* B must be an integer (q=0) and in the range -set->digits through */
+/* +set->digits. */
+/* C must have space for set->digits digits. */
+/* NaNs are propagated as usual. Infinities are unaffected (but */
+/* B must be valid). No status is set unless B is invalid or an */
+/* operand is an sNaN. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberRotate(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ Int rotate; // rhs as an Int
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ // NaNs propagate as normal
+ if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs))
+ decNaNs(res, lhs, rhs, set, &status);
+ // rhs must be an integer
+ else if (decNumberIsInfinite(rhs) || rhs->exponent!=0)
+ status=DEC_Invalid_operation;
+ else { // both numeric, rhs is an integer
+ rotate=decGetInt(rhs); // [cannot fail]
+ if (rotate==BADINT // something bad ..
+ || rotate==BIGODD || rotate==BIGEVEN // .. very big ..
+ || abs(rotate)>set->digits) // .. or out of range
+ status=DEC_Invalid_operation;
+ else { // rhs is OK
+ decNumberCopy(res, lhs);
+ // convert -ve rotate to equivalent positive rotation
+ if (rotate<0) rotate=set->digits+rotate;
+      if (rotate!=0 && rotate!=set->digits  // [else rotation is a no-op]
+          && !decNumberIsInfinite(res)) {   // [else infinite lhs: no-op]
+ // left-rotate to do; 0 < rotate < set->digits
+ uInt units, shift; // work
+ uInt msudigits; // digits in result msu
+ Unit *msu=res->lsu+D2U(res->digits)-1; // current msu
+ Unit *msumax=res->lsu+D2U(set->digits)-1; // rotation msu
+ for (msu++; msu<=msumax; msu++) *msu=0; // ensure high units=0
+ res->digits=set->digits; // now full-length
+ msudigits=MSUDIGITS(res->digits); // actual digits in msu
+
+ // rotation here is done in-place, in three steps
+ // 1. shift all to least up to one unit to unit-align final
+ // lsd [any digits shifted out are rotated to the left,
+ // abutted to the original msd (which may require split)]
+ //
+ // [if there are no whole units left to rotate, the
+ // rotation is now complete]
+ //
+ // 2. shift to least, from below the split point only, so that
+ // the final msd is in the right place in its Unit [any
+ // digits shifted out will fit exactly in the current msu,
+ // left aligned, no split required]
+ //
+ // 3. rotate all the units by reversing left part, right
+ // part, and then whole
+ //
+ // example: rotate right 8 digits (2 units + 2), DECDPUN=3.
+ //
+ // start: 00a bcd efg hij klm npq
+ //
+ // 1a 000 0ab cde fgh|ijk lmn [pq saved]
+ // 1b 00p qab cde fgh|ijk lmn
+ //
+ // 2a 00p qab cde fgh|00i jkl [mn saved]
+ // 2b mnp qab cde fgh|00i jkl
+ //
+ // 3a fgh cde qab mnp|00i jkl
+ // 3b fgh cde qab mnp|jkl 00i
+ // 3c 00i jkl mnp qab cde fgh
+
+ // Step 1: amount to shift is the partial right-rotate count
+ rotate=set->digits-rotate; // make it right-rotate
+ units=rotate/DECDPUN; // whole units to rotate
+ shift=rotate%DECDPUN; // left-over digits count
+ if (shift>0) { // not an exact number of units
+ uInt save=res->lsu[0]%powers[shift]; // save low digit(s)
+ decShiftToLeast(res->lsu, D2U(res->digits), shift);
+ if (shift>msudigits) { // msumax-1 needs >0 digits
+ uInt rem=save%powers[shift-msudigits];// split save
+ *msumax=(Unit)(save/powers[shift-msudigits]); // and insert
+ *(msumax-1)=*(msumax-1)
+ +(Unit)(rem*powers[DECDPUN-(shift-msudigits)]); // ..
+ }
+ else { // all fits in msumax
+ *msumax=*msumax+(Unit)(save*powers[msudigits-shift]); // [maybe *1]
+ }
+ } // digits shift needed
+
+ // If whole units to rotate...
+ if (units>0) { // some to do
+ // Step 2: the units to touch are the whole ones in rotate,
+ // if any, and the shift is DECDPUN-msudigits (which may be
+ // 0, again)
+ shift=DECDPUN-msudigits;
+ if (shift>0) { // not an exact number of units
+ uInt save=res->lsu[0]%powers[shift]; // save low digit(s)
+ decShiftToLeast(res->lsu, units, shift);
+ *msumax=*msumax+(Unit)(save*powers[msudigits]);
+ } // partial shift needed
+
+ // Step 3: rotate the units array using triple reverse
+ // (reversing is easy and fast)
+ decReverse(res->lsu+units, msumax); // left part
+ decReverse(res->lsu, res->lsu+units-1); // right part
+ decReverse(res->lsu, msumax); // whole
+ } // whole units to rotate
+ // the rotation may have left an undetermined number of zeros
+ // on the left, so true length needs to be calculated
+ res->digits=decGetDigits(res->lsu, msumax-res->lsu+1);
+ } // rotate needed
+ } // rhs OK
+ } // numerics
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberRotate
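+
+/* ------------------------------------------------------------------ */
+/* The triple reverse in Step 3 above is the classic in-place array   */
+/* rotation; a minimal standalone sketch (using plain ints rather     */
+/* than Units, for illustration only):                                */
+/*                                                                    */
+/*   static void rev(int *lo, int *hi) {        // reverse in place   */
+/*     for (; lo<hi; lo++, hi--) {int t=*lo; *lo=*hi; *hi=t;}         */
+/*     }                                                              */
+/*   // rotate a[0..n-1] left by k places (0 < k < n):                */
+/*   //   rev(a, a+k-1); rev(a+k, a+n-1); rev(a, a+n-1);              */
+/* ------------------------------------------------------------------ */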
+
+/* ------------------------------------------------------------------ */
+/* decNumberSameQuantum -- test for equal exponents */
+/* */
+/* res is the result number, which will contain either 0 or 1 */
+/* lhs is a number to test */
+/* rhs is the second (usually a pattern) */
+/* */
+/* No errors are possible and no context is needed. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberSameQuantum(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs) {
+ Unit ret=0; // return value
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, DECUNCONT)) return res;
+ #endif
+
+ if (SPECIALARGS) {
+ if (decNumberIsNaN(lhs) && decNumberIsNaN(rhs)) ret=1;
+ else if (decNumberIsInfinite(lhs) && decNumberIsInfinite(rhs)) ret=1;
+ // [anything else with a special gives 0]
+ }
+ else if (lhs->exponent==rhs->exponent) ret=1;
+
+ decNumberZero(res); // OK to overwrite an operand now
+ *res->lsu=ret;
+ return res;
+ } // decNumberSameQuantum
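+
+// [Examples: sameQuantum(2.17, 0.01) gives 1 (both exponents are -2),
+// sameQuantum(2.17, 0.001) gives 0, and two NaNs give 1]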
+
+/* ------------------------------------------------------------------ */
+/* decNumberScaleB -- multiply by a power of 10 */
+/* */
+/* This computes C = A x 10**B where B is an integer (q=0) with */
+/* maximum magnitude 2*(emax+digits) */
+/* */
+/* res is C, the result. C may be A or B */
+/* lhs is A, the number to adjust */
+/* rhs is B, the requested power of ten to use */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* The result may underflow or overflow. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberScaleB(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ Int reqexp; // requested exponent change [B]
+ uInt status=0; // accumulator
+ Int residue; // work
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ // Handle special values except lhs infinite
+ if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs))
+ decNaNs(res, lhs, rhs, set, &status);
+ // rhs must be an integer
+ else if (decNumberIsInfinite(rhs) || rhs->exponent!=0)
+ status=DEC_Invalid_operation;
+ else {
+ // lhs is a number; rhs is a finite with q==0
+ reqexp=decGetInt(rhs); // [cannot fail]
+ // maximum range is larger than getInt can handle, so this is
+ // more restrictive than the specification
+ if (reqexp==BADINT // something bad ..
+ || reqexp==BIGODD || reqexp==BIGEVEN // it was huge
+ || (abs(reqexp)+1)/2>(set->digits+set->emax)) // .. or out of range
+ status=DEC_Invalid_operation;
+ else { // rhs is OK
+ decNumberCopy(res, lhs); // all done if infinite lhs
+ if (!decNumberIsInfinite(res)) { // prepare to scale
+ Int exp=res->exponent; // save for overflow test
+ res->exponent+=reqexp; // adjust the exponent
+ if (((exp^reqexp)>=0) // same sign ...
+ && ((exp^res->exponent)<0)) { // .. but result had different
+ // the calculation overflowed, so force right treatment
+ if (exp<0) res->exponent=DEC_MIN_EMIN-DEC_MAX_DIGITS;
+ else res->exponent=DEC_MAX_EMAX+1;
+ }
+ residue=0;
+ decFinalize(res, set, &residue, &status); // final check
+ } // finite LHS
+ } // rhs OK
+ } // rhs finite
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberScaleB
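+
+// [Example: scaleb(7.50, 3) gives 7.50E+3 -- only the exponent is
+// adjusted, so the coefficient and digit count are unchanged unless
+// the result then overflows or underflows]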
+
+/* ------------------------------------------------------------------ */
+/* decNumberShift -- shift the coefficient of a Number left or right */
+/* */
+/* This computes C = A << B or C = A >> -B (in base ten). */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X<<X) */
+/* lhs is A */
+/* rhs is B, the number of digits to shift (-ve to right) */
+/* set is the context */
+/* */
+/* The digits of the coefficient of A are shifted to the left (if B */
+/* is positive) or to the right (if B is negative) without adjusting */
+/* the exponent or the sign of A. */
+/* */
+/* B must be an integer (q=0) and in the range -set->digits through */
+/* +set->digits. */
+/* C must have space for set->digits digits. */
+/* NaNs are propagated as usual. Infinities are unaffected (but */
+/* B must be valid). No status is set unless B is invalid or an */
+/* operand is an sNaN. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberShift(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+ Int shift; // rhs as an Int
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ // NaNs propagate as normal
+ if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs))
+ decNaNs(res, lhs, rhs, set, &status);
+ // rhs must be an integer
+ else if (decNumberIsInfinite(rhs) || rhs->exponent!=0)
+ status=DEC_Invalid_operation;
+ else { // both numeric, rhs is an integer
+ shift=decGetInt(rhs); // [cannot fail]
+ if (shift==BADINT // something bad ..
+ || shift==BIGODD || shift==BIGEVEN // .. very big ..
+ || abs(shift)>set->digits) // .. or out of range
+ status=DEC_Invalid_operation;
+ else { // rhs is OK
+ decNumberCopy(res, lhs);
+ if (shift!=0 && !decNumberIsInfinite(res)) { // something to do
+ if (shift>0) { // to left
+ if (shift==set->digits) { // removing all
+ *res->lsu=0; // so place 0
+ res->digits=1; // ..
+ }
+          else {                         // 0 < shift < set->digits
+ // first remove leading digits if necessary
+ if (res->digits+shift>set->digits) {
+ decDecap(res, res->digits+shift-set->digits);
+ // that updated res->digits; may have gone to 1 (for a
+            // single digit or for zero)
+ }
+ if (res->digits>1 || *res->lsu) // if non-zero..
+ res->digits=decShiftToMost(res->lsu, res->digits, shift);
+ } // partial left
+ } // left
+ else { // to right
+ if (-shift>=res->digits) { // discarding all
+ *res->lsu=0; // so place 0
+ res->digits=1; // ..
+ }
+ else {
+ decShiftToLeast(res->lsu, D2U(res->digits), -shift);
+ res->digits-=(-shift);
+ }
+ } // to right
+ } // non-0 non-Inf shift
+ } // rhs OK
+ } // numerics
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberShift
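+
+// [Examples, with set->digits=9: shift(34, 8) gives 400000000 (the
+// coefficient is padded to 000000034, shifted, and the excess top
+// digit falls off), and shift(123456789, -2) gives 1234567]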
+
+/* ------------------------------------------------------------------ */
+/* decNumberSquareRoot -- square root operator */
+/* */
+/* This computes C = squareroot(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context; note that rounding mode has no effect */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+/* This uses the following varying-precision algorithm in: */
+/* */
+/* Properly Rounded Variable Precision Square Root, T. E. Hull and */
+/* A. Abrham, ACM Transactions on Mathematical Software, Vol 11 #3, */
+/* pp229-237, ACM, September 1985. */
+/* */
+/* The square-root is calculated using Newton's method, after which */
+/* a check is made to ensure the result is correctly rounded. */
+/* */
+/* % [Reformatted original Numerical Turing source code follows.] */
+/* function sqrt(x : real) : real */
+/* % sqrt(x) returns the properly rounded approximation to the square */
+/* % root of x, in the precision of the calling environment, or it */
+/* % fails if x < 0. */
+/* % t e hull and a abrham, august, 1984 */
+/* if x <= 0 then */
+/* if x < 0 then */
+/* assert false */
+/* else */
+/* result 0 */
+/* end if */
+/* end if */
+/* var f := setexp(x, 0) % fraction part of x [0.1 <= x < 1] */
+/* var e := getexp(x) % exponent part of x */
+/* var approx : real */
+/* if e mod 2 = 0 then */
+/* approx := .259 + .819 * f % approx to root of f */
+/* else */
+/*       f := f/10                % adjustments                       */
+/* e := e + 1 % for odd */
+/* approx := .0819 + 2.59 * f % exponent */
+/* end if */
+/* */
+/* var p:= 3 */
+/* const maxp := currentprecision + 2 */
+/* loop */
+/* p := min(2*p - 2, maxp) % p = 4,6,10, . . . , maxp */
+/* precision p */
+/* approx := .5 * (approx + f/approx) */
+/* exit when p = maxp */
+/* end loop */
+/* */
+/* % approx is now within 1 ulp of the properly rounded square root */
+/* % of f; to ensure proper rounding, compare squares of (approx - */
+/* % 1/2 ulp) and (approx + 1/2 ulp) with f.                          */
+/* p := currentprecision */
+/* begin */
+/* precision p + 2 */
+/* const approxsubhalf := approx - setexp(.5, -p) */
+/* if mulru(approxsubhalf, approxsubhalf) > f then */
+/*       approx := approx - setexp(.1, -p + 1)                        */
+/* else */
+/* const approxaddhalf := approx + setexp(.5, -p) */
+/* if mulrd(approxaddhalf, approxaddhalf) < f then */
+/*         approx := approx + setexp(.1, -p + 1)                      */
+/* end if */
+/* end if */
+/* end */
+/* result setexp(approx, e div 2) % fix exponent */
+/* end sqrt */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberSquareRoot(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ decContext workset, approxset; // work contexts
+ decNumber dzero; // used for constant zero
+ Int maxp; // largest working precision
+ Int workp; // working precision
+ Int residue=0; // rounding residue
+ uInt status=0, ignore=0; // status accumulators
+ uInt rstatus; // ..
+ Int exp; // working exponent
+ Int ideal; // ideal (preferred) exponent
+ Int needbytes; // work
+ Int dropped; // ..
+
+ #if DECSUBSET
+ decNumber *allocrhs=NULL; // non-NULL if rounded rhs allocated
+ #endif
+ // buffer for f [needs +1 in case DECBUFFER 0]
+ decNumber buff[D2N(DECBUFFER+1)];
+ // buffer for a [needs +2 to match likely maxp]
+ decNumber bufa[D2N(DECBUFFER+2)];
+ // buffer for temporary, b [must be same size as a]
+ decNumber bufb[D2N(DECBUFFER+2)];
+ decNumber *allocbuff=NULL; // -> allocated buff, iff allocated
+ decNumber *allocbufa=NULL; // -> allocated bufa, iff allocated
+ decNumber *allocbufb=NULL; // -> allocated bufb, iff allocated
+ decNumber *f=buff; // reduced fraction
+ decNumber *a=bufa; // approximation to result
+ decNumber *b=bufb; // intermediate result
+ // buffer for temporary variable, up to 3 digits
+ decNumber buft[D2N(3)];
+ decNumber *t=buft; // up-to-3-digit constant or work
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operand and set lostDigits status, as needed
+ if (rhs->digits>set->digits) {
+ allocrhs=decRoundOperand(rhs, set, &status);
+ if (allocrhs==NULL) break;
+ // [Note: 'f' allocation below could reuse this buffer if
+ // used, but as this is rare they are kept separate for clarity.]
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ // [following code does not require input rounding]
+
+ // handle infinities and NaNs
+ if (SPECIALARG) {
+ if (decNumberIsInfinite(rhs)) { // an infinity
+ if (decNumberIsNegative(rhs)) status|=DEC_Invalid_operation;
+ else decNumberCopy(res, rhs); // +Infinity
+ }
+ else decNaNs(res, rhs, NULL, set, &status); // a NaN
+ break;
+ }
+
+ // calculate the ideal (preferred) exponent [floor(exp/2)]
+ // [It would be nicer to write: ideal=rhs->exponent>>1, but this
+ // generates a compiler warning. Generated code is the same.]
+ ideal=(rhs->exponent&~1)/2; // target
+
+ // handle zeros
+ if (ISZERO(rhs)) {
+ decNumberCopy(res, rhs); // could be 0 or -0
+ res->exponent=ideal; // use the ideal [safe]
+ // use decFinish to clamp any out-of-range exponent, etc.
+ decFinish(res, set, &residue, &status);
+ break;
+ }
+
+ // any other -x is an oops
+ if (decNumberIsNegative(rhs)) {
+ status|=DEC_Invalid_operation;
+ break;
+ }
+
+ // space is needed for three working variables
+ // f -- the same precision as the RHS, reduced to 0.01->0.99...
+ // a -- Hull's approximation -- precision, when assigned, is
+ // currentprecision+1 or the input argument precision,
+ // whichever is larger (+2 for use as temporary)
+ // b -- intermediate temporary result (same size as a)
+ // if any is too long for local storage, then allocate
+ workp=MAXI(set->digits+1, rhs->digits); // actual rounding precision
+ workp=MAXI(workp, 7); // at least 7 for low cases
+ maxp=workp+2; // largest working precision
+
+ needbytes=sizeof(decNumber)+(D2U(rhs->digits)-1)*sizeof(Unit);
+ if (needbytes>(Int)sizeof(buff)) {
+ allocbuff=(decNumber *)malloc(needbytes);
+ if (allocbuff==NULL) { // hopeless -- abandon
+ status|=DEC_Insufficient_storage;
+ break;}
+ f=allocbuff; // use the allocated space
+ }
+ // a and b both need to be able to hold a maxp-length number
+ needbytes=sizeof(decNumber)+(D2U(maxp)-1)*sizeof(Unit);
+ if (needbytes>(Int)sizeof(bufa)) { // [same applies to b]
+ allocbufa=(decNumber *)malloc(needbytes);
+ allocbufb=(decNumber *)malloc(needbytes);
+ if (allocbufa==NULL || allocbufb==NULL) { // hopeless
+ status|=DEC_Insufficient_storage;
+ break;}
+ a=allocbufa; // use the allocated spaces
+ b=allocbufb; // ..
+ }
+
+ // copy rhs -> f, save exponent, and reduce so 0.1 <= f < 1
+ decNumberCopy(f, rhs);
+ exp=f->exponent+f->digits; // adjusted to Hull rules
+ f->exponent=-(f->digits); // to range
+
+ // set up working context
+ decContextDefault(&workset, DEC_INIT_DECIMAL64);
+ workset.emax=DEC_MAX_EMAX;
+ workset.emin=DEC_MIN_EMIN;
+
+ // [Until further notice, no error is possible and status bits
+ // (Rounded, etc.) should be ignored, not accumulated.]
+
+ // Calculate initial approximation, and allow for odd exponent
+ workset.digits=workp; // p for initial calculation
+ t->bits=0; t->digits=3;
+ a->bits=0; a->digits=3;
+ if ((exp & 1)==0) { // even exponent
+ // Set t=0.259, a=0.819
+ t->exponent=-3;
+ a->exponent=-3;
+ #if DECDPUN>=3
+ t->lsu[0]=259;
+ a->lsu[0]=819;
+ #elif DECDPUN==2
+ t->lsu[0]=59; t->lsu[1]=2;
+ a->lsu[0]=19; a->lsu[1]=8;
+ #else
+ t->lsu[0]=9; t->lsu[1]=5; t->lsu[2]=2;
+ a->lsu[0]=9; a->lsu[1]=1; a->lsu[2]=8;
+ #endif
+ }
+ else { // odd exponent
+ // Set t=0.0819, a=2.59
+ f->exponent--; // f=f/10
+ exp++; // e=e+1
+ t->exponent=-4;
+ a->exponent=-2;
+ #if DECDPUN>=3
+ t->lsu[0]=819;
+ a->lsu[0]=259;
+ #elif DECDPUN==2
+ t->lsu[0]=19; t->lsu[1]=8;
+ a->lsu[0]=59; a->lsu[1]=2;
+ #else
+ t->lsu[0]=9; t->lsu[1]=1; t->lsu[2]=8;
+ a->lsu[0]=9; a->lsu[1]=5; a->lsu[2]=2;
+ #endif
+ }
+
+ decMultiplyOp(a, a, f, &workset, &ignore); // a=a*f
+ decAddOp(a, a, t, &workset, 0, &ignore); // ..+t
+ // [a is now the initial approximation for sqrt(f), calculated with
+ // currentprecision, which is also a's precision.]
+
+ // the main calculation loop
+ decNumberZero(&dzero); // make 0
+ decNumberZero(t); // set t = 0.5
+ t->lsu[0]=5; // ..
+ t->exponent=-1; // ..
+ workset.digits=3; // initial p
+ for (; workset.digits<maxp;) {
+ // set p to min(2*p - 2, maxp) [hence 3; or: 4, 6, 10, ... , maxp]
+ workset.digits=MINI(workset.digits*2-2, maxp);
+ // a = 0.5 * (a + f/a)
+ // [calculated at p then rounded to currentprecision]
+ decDivideOp(b, f, a, &workset, DIVIDE, &ignore); // b=f/a
+ decAddOp(b, b, a, &workset, 0, &ignore); // b=b+a
+ decMultiplyOp(a, b, t, &workset, &ignore); // a=b*0.5
+ } // loop
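+
+    // [Worked example: for rhs=0.25, f=0.25 and exp=0 (even), so the
+    // initial approximation is a=0.259+0.819*0.25=0.46375; the first
+    // iteration gives a=0.5*(0.46375+0.25/0.46375), about 0.50142,
+    // the second about 0.500002, converging on 0.5]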
+
+ // Here, 0.1 <= a < 1 [Hull], and a has maxp digits
+ // now reduce to length, etc.; this needs to be done with a
+ // having the correct exponent so as to handle subnormals
+ // correctly
+ approxset=*set; // get emin, emax, etc.
+ approxset.round=DEC_ROUND_HALF_EVEN;
+ a->exponent+=exp/2; // set correct exponent
+ rstatus=0; // clear status
+ residue=0; // .. and accumulator
+ decCopyFit(a, a, &approxset, &residue, &rstatus); // reduce (if needed)
+ decFinish(a, &approxset, &residue, &rstatus); // clean and finalize
+
+ // Overflow was possible if the input exponent was out-of-range,
+ // in which case quit
+ if (rstatus&DEC_Overflow) {
+ status=rstatus; // use the status as-is
+ decNumberCopy(res, a); // copy to result
+ break;
+ }
+
+ // Preserve status except Inexact/Rounded
+ status|=(rstatus & ~(DEC_Rounded|DEC_Inexact));
+
+ // Carry out the Hull correction
+ a->exponent-=exp/2; // back to 0.1->1
+
+ // a is now at final precision and within 1 ulp of the properly
+ // rounded square root of f; to ensure proper rounding, compare
+    // squares of (a - 1/2 ulp) and (a + 1/2 ulp) with f.
+ // Here workset.digits=maxp and t=0.5, and a->digits determines
+ // the ulp
+ workset.digits--; // maxp-1 is OK now
+ t->exponent=-a->digits-1; // make 0.5 ulp
+ decAddOp(b, a, t, &workset, DECNEG, &ignore); // b = a - 0.5 ulp
+ workset.round=DEC_ROUND_UP;
+ decMultiplyOp(b, b, b, &workset, &ignore); // b = mulru(b, b)
+ decCompareOp(b, f, b, &workset, COMPARE, &ignore); // b ? f, reversed
+ if (decNumberIsNegative(b)) { // f < b [i.e., b > f]
+ // this is the more common adjustment, though both are rare
+ t->exponent++; // make 1.0 ulp
+ t->lsu[0]=1; // ..
+ decAddOp(a, a, t, &workset, DECNEG, &ignore); // a = a - 1 ulp
+ // assign to approx [round to length]
+ approxset.emin-=exp/2; // adjust to match a
+ approxset.emax-=exp/2;
+ decAddOp(a, &dzero, a, &approxset, 0, &ignore);
+ }
+ else {
+ decAddOp(b, a, t, &workset, 0, &ignore); // b = a + 0.5 ulp
+ workset.round=DEC_ROUND_DOWN;
+ decMultiplyOp(b, b, b, &workset, &ignore); // b = mulrd(b, b)
+ decCompareOp(b, b, f, &workset, COMPARE, &ignore); // b ? f
+ if (decNumberIsNegative(b)) { // b < f
+ t->exponent++; // make 1.0 ulp
+ t->lsu[0]=1; // ..
+ decAddOp(a, a, t, &workset, 0, &ignore); // a = a + 1 ulp
+ // assign to approx [round to length]
+ approxset.emin-=exp/2; // adjust to match a
+ approxset.emax-=exp/2;
+ decAddOp(a, &dzero, a, &approxset, 0, &ignore);
+ }
+ }
+ // [no errors are possible in the above, and rounding/inexact during
+ // estimation are irrelevant, so status was not accumulated]
+
+ // Here, 0.1 <= a < 1 (still), so adjust back
+ a->exponent+=exp/2; // set correct exponent
+
+ // count droppable zeros [after any subnormal rounding] by
+ // trimming a copy
+ decNumberCopy(b, a);
+ decTrim(b, set, 1, 1, &dropped); // [drops trailing zeros]
+
+ // Set Inexact and Rounded. The answer can only be exact if
+ // it is short enough so that squaring it could fit in workp
+    // digits, so this is the only (relatively rare) case in which a
+    // careful check is needed
+ if (b->digits*2-1 > workp) { // cannot fit
+ status|=DEC_Inexact|DEC_Rounded;
+ }
+ else { // could be exact/unrounded
+ uInt mstatus=0; // local status
+ decMultiplyOp(b, b, b, &workset, &mstatus); // try the multiply
+ if (mstatus&DEC_Overflow) { // result just won't fit
+ status|=DEC_Inexact|DEC_Rounded;
+ }
+ else { // plausible
+ decCompareOp(t, b, rhs, &workset, COMPARE, &mstatus); // b ? rhs
+ if (!ISZERO(t)) status|=DEC_Inexact|DEC_Rounded; // not equal
+ else { // is Exact
+ // here, dropped is the count of trailing zeros in 'a'
+ // use closest exponent to ideal...
+ Int todrop=ideal-a->exponent; // most that can be dropped
+ if (todrop<0) status|=DEC_Rounded; // ideally would add 0s
+ else { // unrounded
+ // there are some to drop, but emax may not allow all
+ Int maxexp=set->emax-set->digits+1;
+ Int maxdrop=maxexp-a->exponent;
+ if (todrop>maxdrop && set->clamp) { // apply clamping
+ todrop=maxdrop;
+ status|=DEC_Clamped;
+ }
+ if (dropped<todrop) { // clamp to those available
+ todrop=dropped;
+ status|=DEC_Clamped;
+ }
+ if (todrop>0) { // have some to drop
+ decShiftToLeast(a->lsu, D2U(a->digits), todrop);
+ a->exponent+=todrop; // maintain numerical value
+ a->digits-=todrop; // new length
+ }
+ }
+ }
+ }
+ }
+
+ // double-check Underflow, as perhaps the result could not have
+ // been subnormal (initial argument too big), or it is now Exact
+ if (status&DEC_Underflow) {
+ Int ae=rhs->exponent+rhs->digits-1; // adjusted exponent
+ // check if truly subnormal
+ #if DECEXTFLAG // DEC_Subnormal too
+ if (ae>=set->emin*2) status&=~(DEC_Subnormal|DEC_Underflow);
+ #else
+ if (ae>=set->emin*2) status&=~DEC_Underflow;
+ #endif
+ // check if truly inexact
+ if (!(status&DEC_Inexact)) status&=~DEC_Underflow;
+ }
+
+ decNumberCopy(res, a); // a is now the result
+ } while(0); // end protected
+
+ if (allocbuff!=NULL) free(allocbuff); // drop any storage used
+ if (allocbufa!=NULL) free(allocbufa); // ..
+ if (allocbufb!=NULL) free(allocbufb); // ..
+ #if DECSUBSET
+ if (allocrhs !=NULL) free(allocrhs); // ..
+ #endif
+ if (status!=0) decStatus(res, status, set);// then report status
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberSquareRoot
+
+/* ------------------------------------------------------------------ */
+/* decNumberSubtract -- subtract two Numbers */
+/* */
+/* This computes C = A - B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X-X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* */
+/* C must have space for set->digits digits. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberSubtract(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ uInt status=0; // accumulator
+
+ decAddOp(res, lhs, rhs, set, DECNEG, &status);
+ if (status!=0) decStatus(res, status, set);
+ #if DECCHECK
+ decCheckInexact(res, set);
+ #endif
+ return res;
+ } // decNumberSubtract
+
+/* ------------------------------------------------------------------ */
+/* decNumberToIntegralExact -- round-to-integral-value with InExact */
+/* decNumberToIntegralValue -- round-to-integral-value */
+/* */
+/* res is the result */
+/* rhs is input number */
+/* set is the context */
+/* */
+/* res must have space for any value of rhs. */
+/* */
+/* This implements the IEEE special operators and therefore treats */
+/* special values as valid. For finite numbers it returns */
+/* rescale(rhs, 0) if rhs->exponent is <0. */
+/* Otherwise the result is rhs (so no error is possible, except for */
+/* sNaN). */
+/* */
+/* The context is used for rounding mode and status after sNaN, but */
+/* the digits setting is ignored. The Exact version will signal */
+/* Inexact if the result differs numerically from rhs; the other */
+/* never signals Inexact. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberToIntegralExact(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ decNumber dn;
+ decContext workset; // working context
+ uInt status=0; // accumulator
+
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ // handle infinities and NaNs
+ if (SPECIALARG) {
+ if (decNumberIsInfinite(rhs)) decNumberCopy(res, rhs); // an Infinity
+ else decNaNs(res, rhs, NULL, set, &status); // a NaN
+ }
+ else { // finite
+ // have a finite number; no error possible (res must be big enough)
+ if (rhs->exponent>=0) return decNumberCopy(res, rhs);
+ // that was easy, but if negative exponent there is work to do...
+ workset=*set; // clone rounding, etc.
+ workset.digits=rhs->digits; // no length rounding
+ workset.traps=0; // no traps
+ decNumberZero(&dn); // make a number with exponent 0
+ decNumberQuantize(res, rhs, &dn, &workset);
+ status|=workset.status;
+ }
+ if (status!=0) decStatus(res, status, set);
+ return res;
+ } // decNumberToIntegralExact
+
+decNumber * decNumberToIntegralValue(decNumber *res, const decNumber *rhs,
+ decContext *set) {
+ decContext workset=*set; // working context
+ workset.traps=0; // no traps
+ decNumberToIntegralExact(res, rhs, &workset);
+ // this never affects set, except for sNaNs; NaN will have been set
+ // or propagated already, so no need to call decStatus
+ set->status|=workset.status&DEC_Invalid_operation;
+ return res;
+ } // decNumberToIntegralValue
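+
+// [Examples: under round-half-even, toIntegralExact(2.5) gives 2 and
+// sets Inexact and Rounded; toIntegralValue(2.5) also gives 2 but
+// leaves those flags clear]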
+
+/* ------------------------------------------------------------------ */
+/* decNumberXor -- XOR two Numbers, digitwise */
+/* */
+/* This computes C = A ^ B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X^X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context (used for result length and error report) */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Logical function restrictions apply (see above); a NaN is */
+/* returned with Invalid_operation if a restriction is violated. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberXor(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ const Unit *ua, *ub; // -> operands
+ const Unit *msua, *msub; // -> operand msus
+ Unit *uc, *msuc; // -> result and its msu
+ Int msudigs; // digits in res msu
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ if (lhs->exponent!=0 || decNumberIsSpecial(lhs) || decNumberIsNegative(lhs)
+ || rhs->exponent!=0 || decNumberIsSpecial(rhs) || decNumberIsNegative(rhs)) {
+ decStatus(res, DEC_Invalid_operation, set);
+ return res;
+ }
+ // operands are valid
+ ua=lhs->lsu; // bottom-up
+ ub=rhs->lsu; // ..
+ uc=res->lsu; // ..
+ msua=ua+D2U(lhs->digits)-1; // -> msu of lhs
+ msub=ub+D2U(rhs->digits)-1; // -> msu of rhs
+ msuc=uc+D2U(set->digits)-1; // -> msu of result
+ msudigs=MSUDIGITS(set->digits); // [faster than remainder]
+ for (; uc<=msuc; ua++, ub++, uc++) { // Unit loop
+ Unit a, b; // extract units
+ if (ua>msua) a=0;
+ else a=*ua;
+ if (ub>msub) b=0;
+ else b=*ub;
+ *uc=0; // can now write back
+ if (a|b) { // maybe 1 bits to examine
+ Int i, j;
+ // This loop could be unrolled and/or use BIN2BCD tables
+ for (i=0; i<DECDPUN; i++) {
+ if ((a^b)&1) *uc=*uc+(Unit)powers[i]; // effect XOR
+ j=a%10;
+ a=a/10;
+ j|=b%10;
+ b=b/10;
+ if (j>1) {
+ decStatus(res, DEC_Invalid_operation, set);
+ return res;
+ }
+ if (uc==msuc && i==msudigs-1) break; // just did final digit
+ } // each digit
+ } // non-zero
+ } // each unit
+ // [here uc-1 is the msu of the result]
+ res->digits=decGetDigits(res->lsu, uc-res->lsu);
+ res->exponent=0; // integer
+ res->bits=0; // sign=0
+ return res; // [no status to set]
+ } // decNumberXor
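+
+// [Example: xor(1101, 10) gives 1111 -- the shorter operand behaves as
+// if padded with 0s on the left, and any digit other than 0 or 1 in
+// either operand makes the operation invalid]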
+
+
+/* ================================================================== */
+/* Utility routines */
+/* ================================================================== */
+
+/* ------------------------------------------------------------------ */
+/* decNumberClass -- return the decClass of a decNumber */
+/* dn -- the decNumber to test */
+/* set -- the context to use for Emin */
+/* returns the decClass enum */
+/* ------------------------------------------------------------------ */
+enum decClass decNumberClass(const decNumber *dn, decContext *set) {
+ if (decNumberIsSpecial(dn)) {
+ if (decNumberIsQNaN(dn)) return DEC_CLASS_QNAN;
+ if (decNumberIsSNaN(dn)) return DEC_CLASS_SNAN;
+ // must be an infinity
+ if (decNumberIsNegative(dn)) return DEC_CLASS_NEG_INF;
+ return DEC_CLASS_POS_INF;
+ }
+ // is finite
+ if (decNumberIsNormal(dn, set)) { // most common
+ if (decNumberIsNegative(dn)) return DEC_CLASS_NEG_NORMAL;
+ return DEC_CLASS_POS_NORMAL;
+ }
+ // is subnormal or zero
+ if (decNumberIsZero(dn)) { // most common
+ if (decNumberIsNegative(dn)) return DEC_CLASS_NEG_ZERO;
+ return DEC_CLASS_POS_ZERO;
+ }
+ if (decNumberIsNegative(dn)) return DEC_CLASS_NEG_SUBNORMAL;
+ return DEC_CLASS_POS_SUBNORMAL;
+ } // decNumberClass
+
+/* ------------------------------------------------------------------ */
+/* decNumberClassToString -- convert decClass to a string */
+/* */
+/* eclass is a valid decClass */
+/* returns a constant string describing the class (max 13+1 chars) */
+/* ------------------------------------------------------------------ */
+const char *decNumberClassToString(enum decClass eclass) {
+ if (eclass==DEC_CLASS_POS_NORMAL) return DEC_ClassString_PN;
+ if (eclass==DEC_CLASS_NEG_NORMAL) return DEC_ClassString_NN;
+ if (eclass==DEC_CLASS_POS_ZERO) return DEC_ClassString_PZ;
+ if (eclass==DEC_CLASS_NEG_ZERO) return DEC_ClassString_NZ;
+ if (eclass==DEC_CLASS_POS_SUBNORMAL) return DEC_ClassString_PS;
+ if (eclass==DEC_CLASS_NEG_SUBNORMAL) return DEC_ClassString_NS;
+ if (eclass==DEC_CLASS_POS_INF) return DEC_ClassString_PI;
+ if (eclass==DEC_CLASS_NEG_INF) return DEC_ClassString_NI;
+ if (eclass==DEC_CLASS_QNAN) return DEC_ClassString_QN;
+ if (eclass==DEC_CLASS_SNAN) return DEC_ClassString_SN;
+ return DEC_ClassString_UN; // Unknown
+ } // decNumberClassToString
+
+/* ------------------------------------------------------------------ */
+/* decNumberCopy -- copy a number */
+/* */
+/* dest is the target decNumber */
+/* src is the source decNumber */
+/* returns dest */
+/* */
+/* (dest==src is allowed and is a no-op) */
+/* All fields are updated as required. This is a utility operation, */
+/* so special values are unchanged and no error is possible. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberCopy(decNumber *dest, const decNumber *src) {
+
+ #if DECCHECK
+ if (src==NULL) return decNumberZero(dest);
+ #endif
+
+ if (dest==src) return dest; // no copy required
+
+ // Use explicit assignments here as structure assignment could copy
+ // more than just the lsu (for small DECDPUN). This would not affect
+ // the value of the results, but could disturb test harness spill
+ // checking.
+ dest->bits=src->bits;
+ dest->exponent=src->exponent;
+ dest->digits=src->digits;
+ dest->lsu[0]=src->lsu[0];
+ if (src->digits>DECDPUN) { // more Units to come
+ const Unit *smsup, *s; // work
+ Unit *d; // ..
+ // memcpy for the remaining Units would be safe as they cannot
+ // overlap. However, this explicit loop is faster in short cases.
+ d=dest->lsu+1; // -> first destination
+ smsup=src->lsu+D2U(src->digits); // -> source msu+1
+ for (s=src->lsu+1; s<smsup; s++, d++) *d=*s;
+ }
+ return dest;
+ } // decNumberCopy
+
+/* ------------------------------------------------------------------ */
+/* decNumberCopyAbs -- quiet absolute value operator */
+/* */
+/* This sets C = abs(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* */
+/* C must have space for set->digits digits. */
+/* No exception or error can occur; this is a quiet bitwise operation.*/
+/* See also decNumberAbs for a checking version of this. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberCopyAbs(decNumber *res, const decNumber *rhs) {
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, DECUNCONT)) return res;
+ #endif
+ decNumberCopy(res, rhs);
+ res->bits&=~DECNEG; // turn off sign
+ return res;
+ } // decNumberCopyAbs
+
+/* ------------------------------------------------------------------ */
+/* decNumberCopyNegate -- quiet negate value operator */
+/* */
+/* This sets C = negate(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* */
+/* C must have space for set->digits digits. */
+/* No exception or error can occur; this is a quiet bitwise operation.*/
+/* See also decNumberMinus for a checking version of this. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberCopyNegate(decNumber *res, const decNumber *rhs) {
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, DECUNCONT)) return res;
+ #endif
+ decNumberCopy(res, rhs);
+ res->bits^=DECNEG; // invert the sign
+ return res;
+ } // decNumberCopyNegate
+
+/* ------------------------------------------------------------------ */
+/* decNumberCopySign -- quiet copy and set sign operator */
+/* */
+/* This sets C = A with the sign of B */
+/* */
+/* res is C, the result. C may be A */
+/* lhs is A */
+/* rhs is B */
+/* */
+/* C must have space for set->digits digits. */
+/* No exception or error can occur; this is a quiet bitwise operation.*/
+/* ------------------------------------------------------------------ */
+decNumber * decNumberCopySign(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs) {
+ uByte sign; // rhs sign
+ #if DECCHECK
+ if (decCheckOperands(res, DECUNUSED, rhs, DECUNCONT)) return res;
+ #endif
+ sign=rhs->bits & DECNEG; // save sign bit
+ decNumberCopy(res, lhs);
+ res->bits&=~DECNEG; // clear the sign
+ res->bits|=sign; // set from rhs
+ return res;
+ } // decNumberCopySign
+
+/* ------------------------------------------------------------------ */
+/* decNumberGetBCD -- get the coefficient in BCD8 */
+/* dn is the source decNumber */
+/*   bcd is the uByte array that will receive dn->digits BCD bytes,  */
+/* most-significant at offset 0 */
+/* returns bcd */
+/* */
+/* bcd must have at least dn->digits bytes. No error is possible; if */
+/* dn is a NaN or Infinite, digits must be 1 and the coefficient 0. */
+/* ------------------------------------------------------------------ */
+uByte * decNumberGetBCD(const decNumber *dn, uByte *bcd) {
+ uByte *ub=bcd+dn->digits-1; // -> lsd
+ const Unit *up=dn->lsu; // Unit pointer, -> lsu
+
+ #if DECDPUN==1 // trivial simple copy
+ for (; ub>=bcd; ub--, up++) *ub=*up;
+ #else // chopping needed
+ uInt u=*up; // work
+ uInt cut=DECDPUN; // downcounter through unit
+ for (; ub>=bcd; ub--) {
+ *ub=(uByte)(u%10); // [*6554 trick inhibits, here]
+ u=u/10;
+ cut--;
+ if (cut>0) continue; // more in this unit
+ up++;
+ u=*up;
+ cut=DECDPUN;
+ }
+ #endif
+ return bcd;
+ } // decNumberGetBCD
+
+/* ------------------------------------------------------------------ */
+/* decNumberSetBCD -- set (replace) the coefficient from BCD8 */
+/* dn is the target decNumber */
+/*   bcd is the uByte array that will source n BCD bytes, most-      */
+/* significant at offset 0 */
+/* n is the number of digits in the source BCD array (bcd) */
+/* returns dn */
+/* */
+/* dn must have space for at least n digits. No error is possible; */
+/* if dn is a NaN, or Infinite, or is to become a zero, n must be 1 */
+/* and bcd[0] zero. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberSetBCD(decNumber *dn, const uByte *bcd, uInt n) {
+ Unit *up=dn->lsu+D2U(dn->digits)-1; // -> msu [target pointer]
+ const uByte *ub=bcd; // -> source msd
+
+ #if DECDPUN==1 // trivial simple copy
+ for (; ub<bcd+n; ub++, up--) *up=*ub;
+ #else // some assembly needed
+ // calculate how many digits in msu, and hence first cut
+ Int cut=MSUDIGITS(n); // [faster than remainder]
+ for (;up>=dn->lsu; up--) { // each Unit from msu
+ *up=0; // will take <=DECDPUN digits
+ for (; cut>0; ub++, cut--) *up=X10(*up)+*ub;
+ cut=DECDPUN; // next Unit has all digits
+ }
+ #endif
+ dn->digits=n; // set digit count
+ return dn;
+ } // decNumberSetBCD
+
+/* ------------------------------------------------------------------ */
+/* decNumberIsNormal -- test normality of a decNumber */
+/* dn is the decNumber to test */
+/* set is the context to use for Emin */
+/* returns 1 if |dn| is finite and >=Nmin, 0 otherwise */
+/* ------------------------------------------------------------------ */
+Int decNumberIsNormal(const decNumber *dn, decContext *set) {
+ Int ae; // adjusted exponent
+ #if DECCHECK
+ if (decCheckOperands(DECUNRESU, DECUNUSED, dn, set)) return 0;
+ #endif
+
+ if (decNumberIsSpecial(dn)) return 0; // not finite
+ if (decNumberIsZero(dn)) return 0; // not non-zero
+
+ ae=dn->exponent+dn->digits-1; // adjusted exponent
+ if (ae<set->emin) return 0; // is subnormal
+ return 1;
+ } // decNumberIsNormal
+
+/* ------------------------------------------------------------------ */
+/* decNumberIsSubnormal -- test subnormality of a decNumber */
+/* dn is the decNumber to test */
+/* set is the context to use for Emin */
+/* returns 1 if |dn| is finite, non-zero, and <Nmin, 0 otherwise */
+/* ------------------------------------------------------------------ */
+Int decNumberIsSubnormal(const decNumber *dn, decContext *set) {
+ Int ae; // adjusted exponent
+ #if DECCHECK
+ if (decCheckOperands(DECUNRESU, DECUNUSED, dn, set)) return 0;
+ #endif
+
+ if (decNumberIsSpecial(dn)) return 0; // not finite
+ if (decNumberIsZero(dn)) return 0; // not non-zero
+
+ ae=dn->exponent+dn->digits-1; // adjusted exponent
+ if (ae<set->emin) return 1; // is subnormal
+ return 0;
+ } // decNumberIsSubnormal
+
+/* ------------------------------------------------------------------ */
+/* decNumberTrim -- remove insignificant zeros */
+/* */
+/* dn is the number to trim */
+/* returns dn */
+/* */
+/* All fields are updated as required. This is a utility operation, */
+/* so special values are unchanged and no error is possible. The */
+/* zeros are removed unconditionally. */
+/* ------------------------------------------------------------------ */
+decNumber * decNumberTrim(decNumber *dn) {
+ Int dropped; // work
+ decContext set; // ..
+ #if DECCHECK
+ if (decCheckOperands(DECUNRESU, DECUNUSED, dn, DECUNCONT)) return dn;
+ #endif
+ decContextDefault(&set, DEC_INIT_BASE); // clamp=0
+ return decTrim(dn, &set, 0, 1, &dropped);
+ } // decNumberTrim
+
+/* ------------------------------------------------------------------ */
+/* decNumberVersion -- return the name and version of this module */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+const char * decNumberVersion(void) {
+ return DECVERSION;
+ } // decNumberVersion
+
+/* ------------------------------------------------------------------ */
+/* decNumberZero -- set a number to 0 */
+/* */
+/* dn is the number to set, with space for one digit */
+/* returns dn */
+/* */
+/* No error is possible. */
+/* ------------------------------------------------------------------ */
+// Memset is not used as it is much slower in some environments.
+decNumber * decNumberZero(decNumber *dn) {
+
+ #if DECCHECK
+ if (decCheckOperands(dn, DECUNUSED, DECUNUSED, DECUNCONT)) return dn;
+ #endif
+
+ dn->bits=0;
+ dn->exponent=0;
+ dn->digits=1;
+ dn->lsu[0]=0;
+ return dn;
+ } // decNumberZero
+
+/* ================================================================== */
+/* Local routines */
+/* ================================================================== */
+
+/* ------------------------------------------------------------------ */
+/* decToString -- lay out a number into a string */
+/* */
+/* dn is the number to lay out */
+/* string is where to lay out the number */
+/* eng is 1 if Engineering, 0 if Scientific */
+/* */
+/* string must be at least dn->digits+14 characters long */
+/* No error is possible. */
+/* */
+/* Note that this routine can generate a -0 or 0.000. These are */
+/* never generated in subset to-number or arithmetic, but can occur */
+/* in non-subset arithmetic (e.g., -1*0 or 1.234-1.234). */
+/* ------------------------------------------------------------------ */
+// If DECCHECK is enabled the string "?" is returned if a number is
+// invalid.
+static void decToString(const decNumber *dn, char *string, Flag eng) {
+ Int exp=dn->exponent; // local copy
+ Int e; // E-part value
+ Int pre; // digits before the '.'
+ Int cut; // for counting digits in a Unit
+ char *c=string; // work [output pointer]
+ const Unit *up=dn->lsu+D2U(dn->digits)-1; // -> msu [input pointer]
+ uInt u, pow; // work
+
+ #if DECCHECK
+ if (decCheckOperands(DECUNRESU, dn, DECUNUSED, DECUNCONT)) {
+ strcpy(string, "?");
+ return;}
+ #endif
+
+ if (decNumberIsNegative(dn)) { // Negatives get a minus
+ *c='-';
+ c++;
+ }
+ if (dn->bits&DECSPECIAL) { // Is a special value
+ if (decNumberIsInfinite(dn)) {
+ strcpy(c, "Inf");
+ strcpy(c+3, "inity");
+ return;}
+ // a NaN
+ if (dn->bits&DECSNAN) { // signalling NaN
+ *c='s';
+ c++;
+ }
+ strcpy(c, "NaN");
+ c+=3; // step past
+ // if not a clean non-zero coefficient, that's all there is in a
+ // NaN string
+ if (exp!=0 || (*dn->lsu==0 && dn->digits==1)) return;
+ // [drop through to add integer]
+ }
+
+ // calculate how many digits in msu, and hence first cut
+ cut=MSUDIGITS(dn->digits); // [faster than remainder]
+ cut--; // power of ten for digit
+
+ if (exp==0) { // simple integer [common fastpath]
+ for (;up>=dn->lsu; up--) { // each Unit from msu
+ u=*up; // contains DECDPUN digits to lay out
+ for (; cut>=0; c++, cut--) TODIGIT(u, cut, c, pow);
+ cut=DECDPUN-1; // next Unit has all digits
+ }
+ *c='\0'; // terminate the string
+ return;}
+
+ /* non-0 exponent -- assume plain form */
+ pre=dn->digits+exp; // digits before '.'
+ e=0; // no E
+ if ((exp>0) || (pre<-5)) { // need exponential form
+ e=exp+dn->digits-1; // calculate E value
+ pre=1; // assume one digit before '.'
+ if (eng && (e!=0)) { // engineering: may need to adjust
+ Int adj; // adjustment
+      // The C remainder operator was implementation-defined for negative
+      // numbers before C99, so a positive remainder calculation is used here
+ if (e<0) {
+ adj=(-e)%3;
+ if (adj!=0) adj=3-adj;
+ }
+ else { // e>0
+ adj=e%3;
+ }
+ e=e-adj;
+        // if dealing with zero, still produce an exponent which is a
+        // multiple of three, as expected, but with only the one zero
+        // before the E. Otherwise note the padding.
+ if (!ISZERO(dn)) pre+=adj;
+ else { // is zero
+ if (adj!=0) { // 0.00Esnn needed
+ e=e+3;
+ pre=-(2-adj);
+ }
+ } // zero
+ } // eng
+ } // need exponent
+
+ /* lay out the digits of the coefficient, adding 0s and . as needed */
+ u=*up;
+ if (pre>0) { // xxx.xxx or xx00 (engineering) form
+ Int n=pre;
+ for (; pre>0; pre--, c++, cut--) {
+ if (cut<0) { // need new Unit
+ if (up==dn->lsu) break; // out of input digits (pre>digits)
+ up--;
+ cut=DECDPUN-1;
+ u=*up;
+ }
+ TODIGIT(u, cut, c, pow);
+ }
+ if (n<dn->digits) { // more to come, after '.'
+ *c='.'; c++;
+ for (;; c++, cut--) {
+ if (cut<0) { // need new Unit
+ if (up==dn->lsu) break; // out of input digits
+ up--;
+ cut=DECDPUN-1;
+ u=*up;
+ }
+ TODIGIT(u, cut, c, pow);
+ }
+ }
+ else for (; pre>0; pre--, c++) *c='0'; // 0 padding (for engineering) needed
+ }
+ else { // 0.xxx or 0.000xxx form
+ *c='0'; c++;
+ *c='.'; c++;
+ for (; pre<0; pre++, c++) *c='0'; // add any 0's after '.'
+ for (; ; c++, cut--) {
+ if (cut<0) { // need new Unit
+ if (up==dn->lsu) break; // out of input digits
+ up--;
+ cut=DECDPUN-1;
+ u=*up;
+ }
+ TODIGIT(u, cut, c, pow);
+ }
+ }
+
+ /* Finally add the E-part, if needed. It will never be 0, has a
+ base maximum and minimum of +999999999 through -999999999, but
+     could range down to -1999999998 for subnormal numbers */
+ if (e!=0) {
+ Flag had=0; // 1=had non-zero
+ *c='E'; c++;
+ *c='+'; c++; // assume positive
+ u=e; // ..
+ if (e<0) {
+ *(c-1)='-'; // oops, need -
+ u=-e; // uInt, please
+ }
+ // lay out the exponent [_itoa or equivalent is not ANSI C]
+ for (cut=9; cut>=0; cut--) {
+ TODIGIT(u, cut, c, pow);
+ if (*c=='0' && !had) continue; // skip leading zeros
+ had=1; // had non-0
+ c++; // step for next
+ } // cut
+ }
+ *c='\0'; // terminate the string (all paths)
+ return;
+ } // decToString
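+
+// [Examples: coefficient 123 with exponent -2 lays out as 1.23; with
+// exponent 2 an exponent is needed, giving 1.23E+4 in scientific form
+// and 12.3E+3 in engineering form (E a multiple of three)]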
+
+/* ------------------------------------------------------------------ */
+/* decAddOp -- add/subtract operation */
+/* */
+/* This computes C = A + B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X+X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* negate is DECNEG if rhs should be negated, or 0 otherwise */
+/* status accumulates status for the caller */
+/* */
+/* C must have space for set->digits digits. */
+/* Inexact in status must be 0 for correct Exact zero sign in result */
+/* ------------------------------------------------------------------ */
+/* If possible, the coefficient is calculated directly into C. */
+/* However, if: */
+/* -- a digits+1 calculation is needed because the numbers are */
+/* unaligned and span more than set->digits digits */
+/* -- a carry to digits+1 digits looks possible */
+/* -- C is the same as A or B, and the result would destructively */
+/* overlap the A or B coefficient */
+/* then the result must be calculated into a temporary buffer. In */
+/* this case a local (stack) buffer is used if possible, and only if */
+/* too long for that does malloc become the final resort. */
+/* */
+/* Misalignment is handled as follows: */
+/* Apad: (AExp>BExp) Swap operands and proceed as for BExp>AExp. */
+/* BPad: Apply the padding by a combination of shifting (whole */
+/* units) and multiplication (part units). */
+/* */
+/* Addition, especially x=x+1, is speed-critical. */
+/* The static buffer is larger than might be expected to allow for */
+/* calls from higher-level functions (notably exp).                  */
+/* ------------------------------------------------------------------ */
+static decNumber * decAddOp(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set,
+ uByte negate, uInt *status) {
+ #if DECSUBSET
+ decNumber *alloclhs=NULL; // non-NULL if rounded lhs allocated
+ decNumber *allocrhs=NULL; // .., rhs
+ #endif
+ Int rhsshift; // working shift (in Units)
+ Int maxdigits; // longest logical length
+ Int mult; // multiplier
+ Int residue; // rounding accumulator
+ uByte bits; // result bits
+ Flag diffsign; // non-0 if arguments have different sign
+ Unit *acc; // accumulator for result
+ Unit accbuff[SD2U(DECBUFFER*2+20)]; // local buffer [*2+20 reduces many
+ // allocations when called from
+                                        //  other operations, notably exp]
+ Unit *allocacc=NULL; // -> allocated acc buffer, iff allocated
+ Int reqdigits=set->digits; // local copy; requested DIGITS
+ Int padding; // work
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operands and set lostDigits status, as needed
+ if (lhs->digits>reqdigits) {
+ alloclhs=decRoundOperand(lhs, set, status);
+ if (alloclhs==NULL) break;
+ lhs=alloclhs;
+ }
+ if (rhs->digits>reqdigits) {
+ allocrhs=decRoundOperand(rhs, set, status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ // [following code does not require input rounding]
+
+ // note whether signs differ [used all paths]
+ diffsign=(Flag)((lhs->bits^rhs->bits^negate)&DECNEG);
+
+ // handle infinities and NaNs
+ if (SPECIALARGS) { // a special bit set
+ if (SPECIALARGS & (DECSNAN | DECNAN)) // a NaN
+ decNaNs(res, lhs, rhs, set, status);
+ else { // one or two infinities
+ if (decNumberIsInfinite(lhs)) { // LHS is infinity
+ // two infinities with different signs is invalid
+ if (decNumberIsInfinite(rhs) && diffsign) {
+ *status|=DEC_Invalid_operation;
+ break;
+ }
+ bits=lhs->bits & DECNEG; // get sign from LHS
+ }
+ else bits=(rhs->bits^negate) & DECNEG;// RHS must be Infinity
+ bits|=DECINF;
+ decNumberZero(res);
+ res->bits=bits; // set +/- infinity
+ } // an infinity
+ break;
+ }
+
+ // Quick exit for add 0s; return the non-0, modified as need be
+ if (ISZERO(lhs)) {
+ Int adjust; // work
+ Int lexp=lhs->exponent; // save in case LHS==RES
+ bits=lhs->bits; // ..
+ residue=0; // clear accumulator
+ decCopyFit(res, rhs, set, &residue, status); // copy (as needed)
+ res->bits^=negate; // flip if rhs was negated
+ #if DECSUBSET
+ if (set->extended) { // exponents on zeros count
+ #endif
+ // exponent will be the lower of the two
+ adjust=lexp-res->exponent; // adjustment needed [if -ve]
+ if (ISZERO(res)) { // both 0: special IEEE 754 rules
+ if (adjust<0) res->exponent=lexp; // set exponent
+ // 0-0 gives +0 unless rounding to -infinity, and -0-0 gives -0
+ if (diffsign) {
+ if (set->round!=DEC_ROUND_FLOOR) res->bits=0;
+ else res->bits=DECNEG; // preserve 0 sign
+ }
+ }
+ else { // non-0 res
+ if (adjust<0) { // 0-padding needed
+ if ((res->digits-adjust)>set->digits) {
+ adjust=res->digits-set->digits; // to fit exactly
+ *status|=DEC_Rounded; // [but exact]
+ }
+ res->digits=decShiftToMost(res->lsu, res->digits, -adjust);
+ res->exponent+=adjust; // set the exponent.
+ }
+ } // non-0 res
+ #if DECSUBSET
+ } // extended
+ #endif
+ decFinish(res, set, &residue, status); // clean and finalize
+ break;}
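+    // Worked example (illustrative): adding 0E-3 to 25 copies the rhs
+    // coefficient 25 (exponent 0); adjust is then -3, so the result is
+    // padded to 25000 with exponent -3 (25.000, assuming set->digits is
+    // at least 5) -- the lower of the two exponents, as IEEE 754 requires.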
+
+ if (ISZERO(rhs)) { // [lhs is non-zero]
+ Int adjust; // work
+ Int rexp=rhs->exponent; // save in case RHS==RES
+ bits=rhs->bits; // be clean
+ residue=0; // clear accumulator
+ decCopyFit(res, lhs, set, &residue, status); // copy (as needed)
+ #if DECSUBSET
+ if (set->extended) { // exponents on zeros count
+ #endif
+ // exponent will be the lower of the two
+ // [0-0 case handled above]
+ adjust=rexp-res->exponent; // adjustment needed [if -ve]
+ if (adjust<0) { // 0-padding needed
+ if ((res->digits-adjust)>set->digits) {
+ adjust=res->digits-set->digits; // to fit exactly
+ *status|=DEC_Rounded; // [but exact]
+ }
+ res->digits=decShiftToMost(res->lsu, res->digits, -adjust);
+ res->exponent+=adjust; // set the exponent.
+ }
+ #if DECSUBSET
+ } // extended
+ #endif
+ decFinish(res, set, &residue, status); // clean and finalize
+ break;}
+
+ // [NB: both fastpath and mainpath code below assume these cases
+ // (notably 0-0) have already been handled]
+
+ // calculate the padding needed to align the operands
+ padding=rhs->exponent-lhs->exponent;
+
+ // Fastpath cases where the numbers are aligned and normal, the RHS
+ // is all in one unit, no operand rounding is needed, and no carry,
+ // lengthening, or borrow is needed
+ if (padding==0
+ && rhs->digits<=DECDPUN
+ && rhs->exponent>=set->emin // [some normals drop through]
+ && rhs->exponent<=set->emax-set->digits+1 // [could clamp]
+ && rhs->digits<=reqdigits
+ && lhs->digits<=reqdigits) {
+ Int partial=*lhs->lsu;
+ if (!diffsign) { // adding
+ partial+=*rhs->lsu;
+ if ((partial<=DECDPUNMAX) // result fits in unit
+ && (lhs->digits>=DECDPUN || // .. and no digits-count change
+ partial<(Int)powers[lhs->digits])) { // ..
+ if (res!=lhs) decNumberCopy(res, lhs); // not in place
+ *res->lsu=(Unit)partial; // [copy could have overwritten RHS]
+ break;
+ }
+ // else drop out for careful add
+ }
+ else { // signs differ
+ partial-=*rhs->lsu;
+ if (partial>0) { // no borrow needed, and non-0 result
+ if (res!=lhs) decNumberCopy(res, lhs); // not in place
+ *res->lsu=(Unit)partial;
+ // this could have reduced digits [but result>0]
+ res->digits=decGetDigits(res->lsu, D2U(res->digits));
+ break;
+ }
+ // else drop out for careful subtract
+ }
+ }
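+    // Fastpath traces (illustrative, assuming DECDPUN=3): 12+3 stores 15
+    // in place; 996+7 has partial=1003>DECDPUNMAX and drops through to
+    // the careful path; 996-7 has partial=989>0 and subtracts in place.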
+
+ // Now align (pad) the lhs or rhs so they can be added or
+ // subtracted, as necessary. If one number is much larger than
+    // the other (that is, if in plain form there is at least one
+ // digit between the lowest digit of one and the highest of the
+ // other) padding with up to DIGITS-1 trailing zeros may be
+ // needed; then apply rounding (as exotic rounding modes may be
+ // affected by the residue).
+ rhsshift=0; // rhs shift to left (padding) in Units
+ bits=lhs->bits; // assume sign is that of LHS
+ mult=1; // likely multiplier
+
+ // [if padding==0 the operands are aligned; no padding is needed]
+ if (padding!=0) {
+ // some padding needed; always pad the RHS, as any required
+ // padding can then be effected by a simple combination of
+ // shifts and a multiply
+ Flag swapped=0;
+ if (padding<0) { // LHS needs the padding
+ const decNumber *t;
+ padding=-padding; // will be +ve
+ bits=(uByte)(rhs->bits^negate); // assumed sign is now that of RHS
+ t=lhs; lhs=rhs; rhs=t;
+ swapped=1;
+ }
+
+ // If, after pad, rhs would be longer than lhs by digits+1 or
+      // more, then lhs cannot affect the answer, except as a residue,
+ // so only need to pad up to a length of DIGITS+1.
+ if (rhs->digits+padding > lhs->digits+reqdigits+1) {
+ // The RHS is sufficient
+ // for residue use the relative sign indication...
+ Int shift=reqdigits-rhs->digits; // left shift needed
+ residue=1; // residue for rounding
+ if (diffsign) residue=-residue; // signs differ
+ // copy, shortening if necessary
+ decCopyFit(res, rhs, set, &residue, status);
+ // if it was already shorter, then need to pad with zeros
+ if (shift>0) {
+ res->digits=decShiftToMost(res->lsu, res->digits, shift);
+ res->exponent-=shift; // adjust the exponent.
+ }
+ // flip the result sign if unswapped and rhs was negated
+ if (!swapped) res->bits^=negate;
+ decFinish(res, set, &residue, status); // done
+ break;}
+
+ // LHS digits may affect result
+ rhsshift=D2U(padding+1)-1; // this much by Unit shift ..
+ mult=powers[padding-(rhsshift*DECDPUN)]; // .. this by multiplication
+ } // padding needed
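+    // Illustrative: with DECDPUN=3 and padding=5, rhsshift=D2U(6)-1=1
+    // whole Unit and mult=powers[5-3]=100, so the combined effect on
+    // the rhs is a shift of 10**5, as required.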
+
+ if (diffsign) mult=-mult; // signs differ
+
+ // determine the longer operand
+ maxdigits=rhs->digits+padding; // virtual length of RHS
+ if (lhs->digits>maxdigits) maxdigits=lhs->digits;
+
+ // Decide on the result buffer to use; if possible place directly
+ // into result.
+ acc=res->lsu; // assume add direct to result
+ // If destructive overlap, or the number is too long, or a carry or
+ // borrow to DIGITS+1 might be possible, a buffer must be used.
+ // [Might be worth more sophisticated tests when maxdigits==reqdigits]
+ if ((maxdigits>=reqdigits) // is, or could be, too large
+ || (res==rhs && rhsshift>0)) { // destructive overlap
+ // buffer needed, choose it; units for maxdigits digits will be
+ // needed, +1 Unit for carry or borrow
+ Int need=D2U(maxdigits)+1;
+ acc=accbuff; // assume use local buffer
+ if (need*sizeof(Unit)>sizeof(accbuff)) {
+ // printf("malloc add %ld %ld\n", need, sizeof(accbuff));
+ allocacc=(Unit *)malloc(need*sizeof(Unit));
+ if (allocacc==NULL) { // hopeless -- abandon
+ *status|=DEC_Insufficient_storage;
+ break;}
+ acc=allocacc;
+ }
+ }
+
+ res->bits=(uByte)(bits&DECNEG); // it's now safe to overwrite..
+ res->exponent=lhs->exponent; // .. operands (even if aliased)
+
+ #if DECTRACE
+ decDumpAr('A', lhs->lsu, D2U(lhs->digits));
+ decDumpAr('B', rhs->lsu, D2U(rhs->digits));
+ printf(" :h: %ld %ld\n", rhsshift, mult);
+ #endif
+
+ // add [A+B*m] or subtract [A+B*(-m)]
+ res->digits=decUnitAddSub(lhs->lsu, D2U(lhs->digits),
+ rhs->lsu, D2U(rhs->digits),
+ rhsshift, acc, mult)
+ *DECDPUN; // [units -> digits]
+ if (res->digits<0) { // borrowed...
+ res->digits=-res->digits;
+ res->bits^=DECNEG; // flip the sign
+ }
+ #if DECTRACE
+ decDumpAr('+', acc, D2U(res->digits));
+ #endif
+
+ // If a buffer was used the result must be copied back, possibly
+ // shortening. (If no buffer was used then the result must have
+ // fit, so can't need rounding and residue must be 0.)
+ residue=0; // clear accumulator
+ if (acc!=res->lsu) {
+ #if DECSUBSET
+ if (set->extended) { // round from first significant digit
+ #endif
+ // remove leading zeros that were added due to rounding up to
+ // integral Units -- before the test for rounding.
+ if (res->digits>reqdigits)
+ res->digits=decGetDigits(acc, D2U(res->digits));
+ decSetCoeff(res, set, acc, res->digits, &residue, status);
+ #if DECSUBSET
+ }
+ else { // subset arithmetic rounds from original significant digit
+ // May have an underestimate. This only occurs when both
+          // numbers fit in DECDPUN digits and padding is applied with a
+          // negative multiplier (-10, -100...) and the top digit(s) become
+ // 0. (This only matters when using X3.274 rules where the
+ // leading zero could be included in the rounding.)
+ if (res->digits<maxdigits) {
+ *(acc+D2U(res->digits))=0; // ensure leading 0 is there
+ res->digits=maxdigits;
+ }
+ else {
+            // remove leading zeros that were added due to rounding up to
+ // integral Units (but only those in excess of the original
+ // maxdigits length, unless extended) before test for rounding.
+ if (res->digits>reqdigits) {
+ res->digits=decGetDigits(acc, D2U(res->digits));
+ if (res->digits<maxdigits) res->digits=maxdigits;
+ }
+ }
+ decSetCoeff(res, set, acc, res->digits, &residue, status);
+ // Now apply rounding if needed before removing leading zeros.
+ // This is safe because subnormals are not a possibility
+ if (residue!=0) {
+ decApplyRound(res, set, residue, status);
+ residue=0; // did what needed to be done
+ }
+ } // subset
+ #endif
+ } // used buffer
+
+ // strip leading zeros [these were left on in case of subset subtract]
+ res->digits=decGetDigits(res->lsu, D2U(res->digits));
+
+ // apply checks and rounding
+ decFinish(res, set, &residue, status);
+
+ // "When the sum of two operands with opposite signs is exactly
+ // zero, the sign of that sum shall be '+' in all rounding modes
+ // except round toward -Infinity, in which mode that sign shall be
+ // '-'." [Subset zeros also never have '-', set by decFinish.]
+ if (ISZERO(res) && diffsign
+ #if DECSUBSET
+ && set->extended
+ #endif
+ && (*status&DEC_Inexact)==0) {
+ if (set->round==DEC_ROUND_FLOOR) res->bits|=DECNEG; // sign -
+ else res->bits&=~DECNEG; // sign +
+ }
+ } while(0); // end protected
+
+ if (allocacc!=NULL) free(allocacc); // drop any storage used
+ #if DECSUBSET
+ if (allocrhs!=NULL) free(allocrhs); // ..
+ if (alloclhs!=NULL) free(alloclhs); // ..
+ #endif
+ return res;
+ } // decAddOp
+
+/* ------------------------------------------------------------------ */
+/* decDivideOp -- division operation */
+/* */
+/* This routine performs the calculations for all four division */
+/* operators (divide, divideInteger, remainder, remainderNear). */
+/* */
+/* C=A op B */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X/X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* op is DIVIDE, DIVIDEINT, REMAINDER, or REMNEAR respectively. */
+/* status is the usual accumulator */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* ------------------------------------------------------------------ */
+/* The underlying algorithm of this routine is the same as in the */
+/* 1981 S/370 implementation, that is, non-restoring long division */
+/* with bi-unit (rather than bi-digit) estimation for each unit */
+/* multiplier. In this pseudocode overview, complications for the */
+/* Remainder operators and division residues for exact rounding are */
+/* omitted for clarity. */
+/* */
+/* Prepare operands and handle special values */
+/* Test for x/0 and then 0/x */
+/* Exp =Exp1 - Exp2 */
+/* Exp =Exp +len(var1) -len(var2) */
+/* Sign=Sign1 * Sign2 */
+/* Pad accumulator (Var1) to double-length with 0's (pad1) */
+/* Pad Var2 to same length as Var1 */
+/* msu2pair/plus=1st 2 or 1 units of var2, +1 to allow for round */
+/* have=0 */
+/* Do until (have=digits+1 OR residue=0) */
+/* if exp<0 then if integer divide/residue then leave */
+/* this_unit=0 */
+/* Do forever */
+/* compare numbers */
+/* if <0 then leave inner_loop */
+/* if =0 then (* quick exit without subtract *) do */
+/* this_unit=this_unit+1; output this_unit */
+/* leave outer_loop; end */
+/* Compare lengths of numbers (mantissae): */
+/* If same then tops2=msu2pair -- {units 1&2 of var2} */
+/* else tops2=msu2plus -- {0, unit 1 of var2} */
+/* tops1=first_unit_of_Var1*10**DECDPUN +second_unit_of_var1 */
+/*        mult=tops1/tops2 -- Good and safe guess at the multiplier  */
+/* if mult=0 then mult=1 */
+/* this_unit=this_unit+mult */
+/* subtract */
+/* end inner_loop */
+/* if have\=0 | this_unit\=0 then do */
+/* output this_unit */
+/* have=have+1; end */
+/* var2=var2/10 */
+/* exp=exp-1 */
+/* end outer_loop */
+/* exp=exp+1 -- set the proper exponent */
+/* if have=0 then generate answer=0 */
+/* Return (Result is defined by Var1) */
+/* */
+/* ------------------------------------------------------------------ */
+/* Two working buffers are needed during the division; one (digits+ */
+/* 1) to accumulate the result, and the other (up to 2*digits+1) for */
+/* long subtractions. These are acc and var1 respectively. */
+/* var1 is a copy of the lhs coefficient, var2 is the rhs coefficient.*/
+/* The static buffers may be larger than might be expected to allow */
+/* for calls from higher-level functions (notably exp).              */
+/* ------------------------------------------------------------------ */
+static decNumber * decDivideOp(decNumber *res,
+ const decNumber *lhs, const decNumber *rhs,
+ decContext *set, Flag op, uInt *status) {
+ #if DECSUBSET
+ decNumber *alloclhs=NULL; // non-NULL if rounded lhs allocated
+ decNumber *allocrhs=NULL; // .., rhs
+ #endif
+ Unit accbuff[SD2U(DECBUFFER+DECDPUN+10)]; // local buffer
+ Unit *acc=accbuff; // -> accumulator array for result
+ Unit *allocacc=NULL; // -> allocated buffer, iff allocated
+ Unit *accnext; // -> where next digit will go
+ Int acclength; // length of acc needed [Units]
+ Int accunits; // count of units accumulated
+ Int accdigits; // count of digits accumulated
+
+ Unit varbuff[SD2U(DECBUFFER*2+DECDPUN)]; // buffer for var1
+ Unit *var1=varbuff; // -> var1 array for long subtraction
+ Unit *varalloc=NULL; // -> allocated buffer, iff used
+ Unit *msu1; // -> msu of var1
+
+ const Unit *var2; // -> var2 array
+ const Unit *msu2; // -> msu of var2
+ Int msu2plus; // msu2 plus one [does not vary]
+ eInt msu2pair; // msu2 pair plus one [does not vary]
+
+ Int var1units, var2units; // actual lengths
+ Int var2ulen; // logical length (units)
+ Int var1initpad=0; // var1 initial padding (digits)
+ Int maxdigits; // longest LHS or required acc length
+ Int mult; // multiplier for subtraction
+ Unit thisunit; // current unit being accumulated
+ Int residue; // for rounding
+ Int reqdigits=set->digits; // requested DIGITS
+ Int exponent; // working exponent
+ Int maxexponent=0; // DIVIDE maximum exponent if unrounded
+ uByte bits; // working sign
+ Unit *target; // work
+ const Unit *source; // ..
+ uInt const *pow; // ..
+ Int shift, cut; // ..
+ #if DECSUBSET
+ Int dropped; // work
+ #endif
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operands and set lostDigits status, as needed
+ if (lhs->digits>reqdigits) {
+ alloclhs=decRoundOperand(lhs, set, status);
+ if (alloclhs==NULL) break;
+ lhs=alloclhs;
+ }
+ if (rhs->digits>reqdigits) {
+ allocrhs=decRoundOperand(rhs, set, status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ // [following code does not require input rounding]
+
+ bits=(lhs->bits^rhs->bits)&DECNEG; // assumed sign for divisions
+
+ // handle infinities and NaNs
+ if (SPECIALARGS) { // a special bit set
+ if (SPECIALARGS & (DECSNAN | DECNAN)) { // one or two NaNs
+ decNaNs(res, lhs, rhs, set, status);
+ break;
+ }
+ // one or two infinities
+ if (decNumberIsInfinite(lhs)) { // LHS (dividend) is infinite
+ if (decNumberIsInfinite(rhs) || // two infinities are invalid ..
+ op & (REMAINDER | REMNEAR)) { // as is remainder of infinity
+ *status|=DEC_Invalid_operation;
+ break;
+ }
+ // [Note that infinity/0 raises no exceptions]
+ decNumberZero(res);
+ res->bits=bits|DECINF; // set +/- infinity
+ break;
+ }
+ else { // RHS (divisor) is infinite
+ residue=0;
+ if (op&(REMAINDER|REMNEAR)) {
+ // result is [finished clone of] lhs
+ decCopyFit(res, lhs, set, &residue, status);
+ }
+ else { // a division
+ decNumberZero(res);
+ res->bits=bits; // set +/- zero
+ // for DIVIDEINT the exponent is always 0. For DIVIDE, result
+ // is a 0 with infinitely negative exponent, clamped to minimum
+ if (op&DIVIDE) {
+ res->exponent=set->emin-set->digits+1;
+ *status|=DEC_Clamped;
+ }
+ }
+ decFinish(res, set, &residue, status);
+ break;
+ }
+ }
+
+ // handle 0 rhs (x/0)
+ if (ISZERO(rhs)) { // x/0 is always exceptional
+ if (ISZERO(lhs)) {
+ decNumberZero(res); // [after lhs test]
+ *status|=DEC_Division_undefined;// 0/0 will become NaN
+ }
+ else {
+ decNumberZero(res);
+ if (op&(REMAINDER|REMNEAR)) *status|=DEC_Invalid_operation;
+ else {
+ *status|=DEC_Division_by_zero; // x/0
+ res->bits=bits|DECINF; // .. is +/- Infinity
+ }
+ }
+ break;}
+
+ // handle 0 lhs (0/x)
+ if (ISZERO(lhs)) { // 0/x [x!=0]
+ #if DECSUBSET
+ if (!set->extended) decNumberZero(res);
+ else {
+ #endif
+ if (op&DIVIDE) {
+ residue=0;
+ exponent=lhs->exponent-rhs->exponent; // ideal exponent
+ decNumberCopy(res, lhs); // [zeros always fit]
+ res->bits=bits; // sign as computed
+ res->exponent=exponent; // exponent, too
+ decFinalize(res, set, &residue, status); // check exponent
+ }
+ else if (op&DIVIDEINT) {
+ decNumberZero(res); // integer 0
+ res->bits=bits; // sign as computed
+ }
+ else { // a remainder
+ exponent=rhs->exponent; // [save in case overwrite]
+ decNumberCopy(res, lhs); // [zeros always fit]
+ if (exponent<res->exponent) res->exponent=exponent; // use lower
+ }
+ #if DECSUBSET
+ }
+ #endif
+ break;}
+
+ // Precalculate exponent. This starts off adjusted (and hence fits
+ // in 31 bits) and becomes the usual unadjusted exponent as the
+ // division proceeds. The order of evaluation is important, here,
+ // to avoid wrap.
+ exponent=(lhs->exponent+lhs->digits)-(rhs->exponent+rhs->digits);
+
+ // If the working exponent is -ve, then some quick exits are
+ // possible because the quotient is known to be <1
+ // [for REMNEAR, it needs to be < -1, as -0.5 could need work]
+ if (exponent<0 && !(op==DIVIDE)) {
+ if (op&DIVIDEINT) {
+ decNumberZero(res); // integer part is 0
+ #if DECSUBSET
+ if (set->extended)
+ #endif
+ res->bits=bits; // set +/- zero
+ break;}
+ // fastpath remainders so long as the lhs has the smaller
+ // (or equal) exponent
+ if (lhs->exponent<=rhs->exponent) {
+ if (op&REMAINDER || exponent<-1) {
+ // It is REMAINDER or safe REMNEAR; result is [finished
+ // clone of] lhs (r = x - 0*y)
+ residue=0;
+ decCopyFit(res, lhs, set, &residue, status);
+ decFinish(res, set, &residue, status);
+ break;
+ }
+ // [unsafe REMNEAR drops through]
+ }
+ } // fastpaths
+
+ /* Long (slow) division is needed; roll up the sleeves... */
+
+ // The accumulator will hold the quotient of the division.
+ // If it needs to be too long for stack storage, then allocate.
+ acclength=D2U(reqdigits+DECDPUN); // in Units
+ if (acclength*sizeof(Unit)>sizeof(accbuff)) {
+ // printf("malloc dvacc %ld units\n", acclength);
+ allocacc=(Unit *)malloc(acclength*sizeof(Unit));
+ if (allocacc==NULL) { // hopeless -- abandon
+ *status|=DEC_Insufficient_storage;
+ break;}
+ acc=allocacc; // use the allocated space
+ }
+
+ // var1 is the padded LHS ready for subtractions.
+ // If it needs to be too long for stack storage, then allocate.
+ // The maximum units needed for var1 (long subtraction) is:
+ // Enough for
+ // (rhs->digits+reqdigits-1) -- to allow full slide to right
+ // or (lhs->digits) -- to allow for long lhs
+ // whichever is larger
+ // +1 -- for rounding of slide to right
+ // +1 -- for leading 0s
+ // +1 -- for pre-adjust if a remainder or DIVIDEINT
+ // [Note: unused units do not participate in decUnitAddSub data]
+ maxdigits=rhs->digits+reqdigits-1;
+ if (lhs->digits>maxdigits) maxdigits=lhs->digits;
+ var1units=D2U(maxdigits)+2;
+ // allocate a guard unit above msu1 for REMAINDERNEAR
+ if (!(op&DIVIDE)) var1units++;
+ if ((var1units+1)*sizeof(Unit)>sizeof(varbuff)) {
+ // printf("malloc dvvar %ld units\n", var1units+1);
+ varalloc=(Unit *)malloc((var1units+1)*sizeof(Unit));
+ if (varalloc==NULL) { // hopeless -- abandon
+ *status|=DEC_Insufficient_storage;
+ break;}
+ var1=varalloc; // use the allocated space
+ }
+
+ // Extend the lhs and rhs to full long subtraction length. The lhs
+ // is truly extended into the var1 buffer, with 0 padding, so a
+ // subtract in place is always possible. The rhs (var2) has
+ // virtual padding (implemented by decUnitAddSub).
+ // One guard unit was allocated above msu1 for rem=rem+rem in
+ // REMAINDERNEAR.
+ msu1=var1+var1units-1; // msu of var1
+ source=lhs->lsu+D2U(lhs->digits)-1; // msu of input array
+ for (target=msu1; source>=lhs->lsu; source--, target--) *target=*source;
+ for (; target>=var1; target--) *target=0;
+
+ // rhs (var2) is left-aligned with var1 at the start
+ var2ulen=var1units; // rhs logical length (units)
+ var2units=D2U(rhs->digits); // rhs actual length (units)
+ var2=rhs->lsu; // -> rhs array
+ msu2=var2+var2units-1; // -> msu of var2 [never changes]
+ // now set up the variables which will be used for estimating the
+ // multiplication factor. If these variables are not exact, add
+ // 1 to make sure that the multiplier is never overestimated.
+    msu2plus=*msu2; // its value ..
+ if (var2units>1) msu2plus++; // .. +1 if any more
+ msu2pair=(eInt)*msu2*(DECDPUNMAX+1);// top two pair ..
+ if (var2units>1) { // .. [else treat 2nd as 0]
+ msu2pair+=*(msu2-1); // ..
+ if (var2units>2) msu2pair++; // .. +1 if any more
+ }
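+    // Estimation example (illustrative, DECDPUN=3): for a var2 with
+    // *msu2==987 and *(msu2-1)==654 (and var2units>2), msu2plus is 988
+    // and msu2pair is 987655; the +1s ensure that the unit multiplier
+    // estimated below is never too large.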
+
+ // The calculation is working in units, which may have leading zeros,
+ // but the exponent was calculated on the assumption that they are
+ // both left-aligned. Adjust the exponent to compensate: add the
+ // number of leading zeros in var1 msu and subtract those in var2 msu.
+ // [This is actually done by counting the digits and negating, as
+ // lead1=DECDPUN-digits1, and similarly for lead2.]
+ for (pow=&powers[1]; *msu1>=*pow; pow++) exponent--;
+ for (pow=&powers[1]; *msu2>=*pow; pow++) exponent++;
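+    // E.g. (illustrative, DECDPUN=3): *msu1==45 (two digits, one
+    // leading zero) and *msu2==987 (three digits, none) give
+    // exponent-1 then exponent+2, a net +1, matching lead1-lead2.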
+
+ // Now, if doing an integer divide or remainder, ensure that
+ // the result will be Unit-aligned. To do this, shift the var1
+ // accumulator towards least if need be. (It's much easier to
+ // do this now than to reassemble the residue afterwards, if
+ // doing a remainder.) Also ensure the exponent is not negative.
+ if (!(op&DIVIDE)) {
+ Unit *u; // work
+ // save the initial 'false' padding of var1, in digits
+ var1initpad=(var1units-D2U(lhs->digits))*DECDPUN;
+ // Determine the shift to do.
+ if (exponent<0) cut=-exponent;
+ else cut=DECDPUN-exponent%DECDPUN;
+ decShiftToLeast(var1, var1units, cut);
+ exponent+=cut; // maintain numerical value
+ var1initpad-=cut; // .. and reduce padding
+ // clean any most-significant units which were just emptied
+ for (u=msu1; cut>=DECDPUN; cut-=DECDPUN, u--) *u=0;
+ } // align
+ else { // is DIVIDE
+ maxexponent=lhs->exponent-rhs->exponent; // save
+ // optimization: if the first iteration will just produce 0,
+ // preadjust to skip it [valid for DIVIDE only]
+ if (*msu1<*msu2) {
+ var2ulen--; // shift down
+ exponent-=DECDPUN; // update the exponent
+ }
+ }
+
+ // ---- start the long-division loops ------------------------------
+ accunits=0; // no units accumulated yet
+ accdigits=0; // .. or digits
+ accnext=acc+acclength-1; // -> msu of acc [NB: allows digits+1]
+ for (;;) { // outer forever loop
+ thisunit=0; // current unit assumed 0
+ // find the next unit
+ for (;;) { // inner forever loop
+ // strip leading zero units [from either pre-adjust or from
+ // subtract last time around]. Leave at least one unit.
+ for (; *msu1==0 && msu1>var1; msu1--) var1units--;
+
+ if (var1units<var2ulen) break; // var1 too low for subtract
+ if (var1units==var2ulen) { // unit-by-unit compare needed
+ // compare the two numbers, from msu
+ const Unit *pv1, *pv2;
+ Unit v2; // units to compare
+ pv2=msu2; // -> msu
+ for (pv1=msu1; ; pv1--, pv2--) {
+ // v1=*pv1 -- always OK
+ v2=0; // assume in padding
+ if (pv2>=var2) v2=*pv2; // in range
+ if (*pv1!=v2) break; // no longer the same
+ if (pv1==var1) break; // done; leave pv1 as is
+ }
+ // here when all inspected or a difference seen
+ if (*pv1<v2) break; // var1 too low to subtract
+ if (*pv1==v2) { // var1 == var2
+ // reach here if var1 and var2 are identical; subtraction
+ // would increase digit by one, and the residue will be 0 so
+ // the calculation is done; leave the loop with residue=0.
+ thisunit++; // as though subtracted
+ *var1=0; // set var1 to 0
+ var1units=1; // ..
+ break; // from inner
+ } // var1 == var2
+ // *pv1>v2. Prepare for real subtraction; the lengths are equal
+ // Estimate the multiplier (there's always a msu1-1)...
+ // Bring in two units of var2 to provide a good estimate.
+ mult=(Int)(((eInt)*msu1*(DECDPUNMAX+1)+*(msu1-1))/msu2pair);
+ } // lengths the same
+ else { // var1units > var2ulen, so subtraction is safe
+ // The var2 msu is one unit towards the lsu of the var1 msu,
+ // so only one unit for var2 can be used.
+ mult=(Int)(((eInt)*msu1*(DECDPUNMAX+1)+*(msu1-1))/msu2plus);
+ }
+ if (mult==0) mult=1; // must always be at least 1
+ // subtraction needed; var1 is > var2
+ thisunit=(Unit)(thisunit+mult); // accumulate
+ // subtract var1-var2, into var1; only the overlap needs
+ // processing, as this is an in-place calculation
+ shift=var2ulen-var2units;
+ #if DECTRACE
+ decDumpAr('1', &var1[shift], var1units-shift);
+ decDumpAr('2', var2, var2units);
+ printf("m=%ld\n", -mult);
+ #endif
+ decUnitAddSub(&var1[shift], var1units-shift,
+ var2, var2units, 0,
+ &var1[shift], -mult);
+ #if DECTRACE
+ decDumpAr('#', &var1[shift], var1units-shift);
+ #endif
+ // var1 now probably has leading zeros; these are removed at the
+ // top of the inner loop.
+ } // inner loop
+
+ // The next unit has been calculated in full; unless it's a
+ // leading zero, add to acc
+ if (accunits!=0 || thisunit!=0) { // is first or non-zero
+ *accnext=thisunit; // store in accumulator
+ // account exactly for the new digits
+ if (accunits==0) {
+ accdigits++; // at least one
+ for (pow=&powers[1]; thisunit>=*pow; pow++) accdigits++;
+ }
+ else accdigits+=DECDPUN;
+ accunits++; // update count
+ accnext--; // ready for next
+ if (accdigits>reqdigits) break; // have enough digits
+ }
+
+ // if the residue is zero, the operation is done (unless divide
+ // or divideInteger and still not enough digits yet)
+ if (*var1==0 && var1units==1) { // residue is 0
+ if (op&(REMAINDER|REMNEAR)) break;
+ if ((op&DIVIDE) && (exponent<=maxexponent)) break;
+ // [drop through if divideInteger]
+ }
+ // also done enough if calculating remainder or integer
+ // divide and just did the last ('units') unit
+ if (exponent==0 && !(op&DIVIDE)) break;
+
+ // to get here, var1 is less than var2, so divide var2 by the per-
+ // Unit power of ten and go for the next digit
+ var2ulen--; // shift down
+ exponent-=DECDPUN; // update the exponent
+ } // outer loop
+
+ // ---- division is complete ---------------------------------------
+ // here: acc has at least reqdigits+1 of good results (or fewer
+ // if early stop), starting at accnext+1 (its lsu)
+ // var1 has any residue at the stopping point
+    //       accunits is the number of units collected in acc
+ if (accunits==0) { // acc is 0
+ accunits=1; // show have a unit ..
+ accdigits=1; // ..
+ *accnext=0; // .. whose value is 0
+ }
+ else accnext++; // back to last placed
+ // accnext now -> lowest unit of result
+
+ residue=0; // assume no residue
+ if (op&DIVIDE) {
+ // record the presence of any residue, for rounding
+ if (*var1!=0 || var1units>1) residue=1;
+ else { // no residue
+ // Had an exact division; clean up spurious trailing 0s.
+ // There will be at most DECDPUN-1, from the final multiply,
+ // and then only if the result is non-0 (and even) and the
+ // exponent is 'loose'.
+ #if DECDPUN>1
+ Unit lsu=*accnext;
+ if (!(lsu&0x01) && (lsu!=0)) {
+ // count the trailing zeros
+ Int drop=0;
+ for (;; drop++) { // [will terminate because lsu!=0]
+ if (exponent>=maxexponent) break; // don't chop real 0s
+ #if DECDPUN<=4
+ if ((lsu-QUOT10(lsu, drop+1)
+ *powers[drop+1])!=0) break; // found non-0 digit
+ #else
+ if (lsu%powers[drop+1]!=0) break; // found non-0 digit
+ #endif
+ exponent++;
+ }
+ if (drop>0) {
+ accunits=decShiftToLeast(accnext, accunits, drop);
+ accdigits=decGetDigits(accnext, accunits);
+ accunits=D2U(accdigits);
+ // [exponent was adjusted in the loop]
+ }
+ } // neither odd nor 0
+ #endif
+ } // exact divide
+ } // divide
+ else /* op!=DIVIDE */ {
+ // check for coefficient overflow
+ if (accdigits+exponent>reqdigits) {
+ *status|=DEC_Division_impossible;
+ break;
+ }
+ if (op & (REMAINDER|REMNEAR)) {
+ // [Here, the exponent will be 0, because var1 was adjusted
+ // appropriately.]
+ Int postshift; // work
+ Flag wasodd=0; // integer was odd
+ Unit *quotlsu; // for save
+ Int quotdigits; // ..
+
+ bits=lhs->bits; // remainder sign is always as lhs
+
+ // Fastpath when residue is truly 0 is worthwhile [and
+ // simplifies the code below]
+ if (*var1==0 && var1units==1) { // residue is 0
+ Int exp=lhs->exponent; // save min(exponents)
+ if (rhs->exponent<exp) exp=rhs->exponent;
+ decNumberZero(res); // 0 coefficient
+ #if DECSUBSET
+ if (set->extended)
+ #endif
+ res->exponent=exp; // .. with proper exponent
+ res->bits=(uByte)(bits&DECNEG); // [cleaned]
+ decFinish(res, set, &residue, status); // might clamp
+ break;
+ }
+ // note if the quotient was odd
+ if (*accnext & 0x01) wasodd=1; // acc is odd
+ quotlsu=accnext; // save in case need to reinspect
+ quotdigits=accdigits; // ..
+
+ // treat the residue, in var1, as the value to return, via acc
+ // calculate the unused zero digits. This is the smaller of:
+ // var1 initial padding (saved above)
+ // var2 residual padding, which happens to be given by:
+ postshift=var1initpad+exponent-lhs->exponent+rhs->exponent;
+ // [the 'exponent' term accounts for the shifts during divide]
+ if (var1initpad<postshift) postshift=var1initpad;
+
+ // shift var1 the requested amount, and adjust its digits
+ var1units=decShiftToLeast(var1, var1units, postshift);
+ accnext=var1;
+ accdigits=decGetDigits(var1, var1units);
+ accunits=D2U(accdigits);
+
+ exponent=lhs->exponent; // exponent is smaller of lhs & rhs
+ if (rhs->exponent<exponent) exponent=rhs->exponent;
+
+ // Now correct the result if doing remainderNear; if it
+ // (looking just at coefficients) is > rhs/2, or == rhs/2 and
+ // the integer was odd then the result should be rem-rhs.
+ if (op&REMNEAR) {
+ Int compare, tarunits; // work
+ Unit *up; // ..
+ // calculate remainder*2 into the var1 buffer (which has
+ // 'headroom' of an extra unit and hence enough space)
+ // [a dedicated 'double' loop would be faster, here]
+ tarunits=decUnitAddSub(accnext, accunits, accnext, accunits,
+ 0, accnext, 1);
+ // decDumpAr('r', accnext, tarunits);
+
+ // Here, accnext (var1) holds tarunits Units with twice the
+ // remainder's coefficient, which must now be compared to the
+ // RHS. The remainder's exponent may be smaller than the RHS's.
+ compare=decUnitCompare(accnext, tarunits, rhs->lsu, D2U(rhs->digits),
+ rhs->exponent-exponent);
+ if (compare==BADINT) { // deep trouble
+ *status|=DEC_Insufficient_storage;
+ break;}
+
+ // now restore the remainder by dividing by two; the lsu
+ // is known to be even.
+ for (up=accnext; up<accnext+tarunits; up++) {
+ Int half; // half to add to lower unit
+ half=*up & 0x01;
+ *up/=2; // [shift]
+ if (!half) continue;
+ *(up-1)+=(DECDPUNMAX+1)/2;
+ }
+ // [accunits still describes the original remainder length]
+
+ if (compare>0 || (compare==0 && wasodd)) { // adjustment needed
+ Int exp, expunits, exprem; // work
+ // This is effectively causing round-up of the quotient,
+ // so if it was the rare case where it was full and all
+ // nines, it would overflow and hence division-impossible
+ // should be raised
+ Flag allnines=0; // 1 if quotient all nines
+ if (quotdigits==reqdigits) { // could be borderline
+ for (up=quotlsu; ; up++) {
+ if (quotdigits>DECDPUN) {
+ if (*up!=DECDPUNMAX) break;// non-nines
+ }
+ else { // this is the last Unit
+ if (*up==powers[quotdigits]-1) allnines=1;
+ break;
+ }
+ quotdigits-=DECDPUN; // checked those digits
+ } // up
+ } // borderline check
+ if (allnines) {
+ *status|=DEC_Division_impossible;
+ break;}
+
+ // rem-rhs is needed; the sign will invert. Again, var1
+ // can safely be used for the working Units array.
+ exp=rhs->exponent-exponent; // RHS padding needed
+ // Calculate units and remainder from exponent.
+ expunits=exp/DECDPUN;
+ exprem=exp%DECDPUN;
+ // subtract [A+B*(-m)]; the result will always be negative
+ accunits=-decUnitAddSub(accnext, accunits,
+ rhs->lsu, D2U(rhs->digits),
+ expunits, accnext, -(Int)powers[exprem]);
+ accdigits=decGetDigits(accnext, accunits); // count digits exactly
+ accunits=D2U(accdigits); // and recalculate the units for copy
+ // [exponent is as for original remainder]
+ bits^=DECNEG; // flip the sign
+ }
+ } // REMNEAR
+ } // REMAINDER or REMNEAR
+ } // not DIVIDE
+
+ // Set exponent and bits
+ res->exponent=exponent;
+ res->bits=(uByte)(bits&DECNEG); // [cleaned]
+
+ // Now the coefficient.
+ decSetCoeff(res, set, accnext, accdigits, &residue, status);
+
+ decFinish(res, set, &residue, status); // final cleanup
+
+ #if DECSUBSET
+ // If a divide then strip trailing zeros if subset [after round]
+ if (!set->extended && (op==DIVIDE)) decTrim(res, set, 0, 1, &dropped);
+ #endif
+ } while(0); // end protected
+
+ if (varalloc!=NULL) free(varalloc); // drop any storage used
+ if (allocacc!=NULL) free(allocacc); // ..
+ #if DECSUBSET
+ if (allocrhs!=NULL) free(allocrhs); // ..
+ if (alloclhs!=NULL) free(alloclhs); // ..
+ #endif
+ return res;
+ } // decDivideOp
+
+/* ------------------------------------------------------------------ */
+/* decMultiplyOp -- multiplication operation */
+/* */
+/* This routine performs the multiplication C=A x B. */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X*X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* status is the usual accumulator */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* ------------------------------------------------------------------ */
+/* 'Classic' multiplication is used rather than Karatsuba, as the */
+/* latter would give only a minor improvement for the short numbers */
+/* expected to be handled most (and uses much more memory). */
+/* */
+/* There are two major paths here: the general-purpose ('old code') */
+/* path which handles all DECDPUN values, and a fastpath version */
+/* which is used if 64-bit ints are available, DECDPUN<=4, and more */
+/* than two calls to decUnitAddSub would be made. */
+/* */
+/* The fastpath version lumps units together into 8-digit or 9-digit */
+/* chunks, and also uses a lazy carry strategy to minimise expensive */
+/* 64-bit divisions. The chunks are then broken apart again into */
+/* units for continuing processing. Despite this overhead, the */
+/* fastpath can speed up some 16-digit operations by 10x (and much */
+/* more for higher-precision calculations). */
+/* */
+/* A buffer always has to be used for the accumulator; in the */
+/* fastpath, buffers are also always needed for the chunked copies   */
+/* of the operand coefficients.                                      */
+/* Static buffers are larger than needed just for multiply, to allow */
+/* for calls from other operations (notably exp). */
+/* ------------------------------------------------------------------ */
+#define FASTMUL (DECUSE64 && DECDPUN<5)
+static decNumber * decMultiplyOp(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set,
+ uInt *status) {
+ Int accunits; // Units of accumulator in use
+ Int exponent; // work
+ Int residue=0; // rounding residue
+ uByte bits; // result sign
+ Unit *acc; // -> accumulator Unit array
+ Int needbytes; // size calculator
+ void *allocacc=NULL; // -> allocated accumulator, iff allocated
+ Unit accbuff[SD2U(DECBUFFER*4+1)]; // buffer (+1 for DECBUFFER==0,
+ // *4 for calls from other operations)
+ const Unit *mer, *mermsup; // work
+ Int madlength; // Units in multiplicand
+ Int shift; // Units to shift multiplicand by
+
+ #if FASTMUL
+ // if DECDPUN is 1 or 3 work in base 10**9, otherwise
+ // (DECDPUN is 2 or 4) then work in base 10**8
+ #if DECDPUN & 1 // odd
+ #define FASTBASE 1000000000 // base
+ #define FASTDIGS 9 // digits in base
+ #define FASTLAZY 18 // carry resolution point [1->18]
+ #else
+ #define FASTBASE 100000000
+ #define FASTDIGS 8
+ #define FASTLAZY 1844 // carry resolution point [1->1844]
+ #endif
+ // three buffers are used, two for chunked copies of the operands
+ // (base 10**8 or base 10**9) and one base 2**64 accumulator with
+ // lazy carry evaluation
+ uInt zlhibuff[(DECBUFFER*2+1)/8+1]; // buffer (+1 for DECBUFFER==0)
+ uInt *zlhi=zlhibuff; // -> lhs array
+ uInt *alloclhi=NULL; // -> allocated buffer, iff allocated
+ uInt zrhibuff[(DECBUFFER*2+1)/8+1]; // buffer (+1 for DECBUFFER==0)
+ uInt *zrhi=zrhibuff; // -> rhs array
+ uInt *allocrhi=NULL; // -> allocated buffer, iff allocated
+ uLong zaccbuff[(DECBUFFER*2+1)/4+2]; // buffer (+1 for DECBUFFER==0)
+ // [allocacc is shared for both paths, as only one will run]
+ uLong *zacc=zaccbuff; // -> accumulator array for exact result
+ #if DECDPUN==1
+ Int zoff; // accumulator offset
+ #endif
+ uInt *lip, *rip; // item pointers
+ uInt *lmsi, *rmsi; // most significant items
+ Int ilhs, irhs, iacc; // item counts in the arrays
+ Int lazy; // lazy carry counter
+ uLong lcarry; // uLong carry
+ uInt carry; // carry (NB not uLong)
+ Int count; // work
+ const Unit *cup; // ..
+ Unit *up; // ..
+ uLong *lp; // ..
+ Int p; // ..
+ #endif
+
+ #if DECSUBSET
+ decNumber *alloclhs=NULL; // -> allocated buffer, iff allocated
+ decNumber *allocrhs=NULL; // -> allocated buffer, iff allocated
+ #endif
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ // precalculate result sign
+ bits=(uByte)((lhs->bits^rhs->bits)&DECNEG);
+
+ // handle infinities and NaNs
+ if (SPECIALARGS) { // a special bit set
+ if (SPECIALARGS & (DECSNAN | DECNAN)) { // one or two NaNs
+ decNaNs(res, lhs, rhs, set, status);
+ return res;}
+ // one or two infinities; Infinity * 0 is invalid
+ if (((lhs->bits & DECINF)==0 && ISZERO(lhs))
+ ||((rhs->bits & DECINF)==0 && ISZERO(rhs))) {
+ *status|=DEC_Invalid_operation;
+ return res;}
+ decNumberZero(res);
+ res->bits=bits|DECINF; // infinity
+ return res;}
+
+ // For best speed, as in DMSRCN [the original Rexx numerics
+ // module], use the shorter number as the multiplier (rhs) and
+ // the longer as the multiplicand (lhs) to minimise the number of
+ // adds (partial products)
+ if (lhs->digits<rhs->digits) { // swap...
+ const decNumber *hold=lhs;
+ lhs=rhs;
+ rhs=hold;
+ }
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operands and set lostDigits status, as needed
+ if (lhs->digits>set->digits) {
+ alloclhs=decRoundOperand(lhs, set, status);
+ if (alloclhs==NULL) break;
+ lhs=alloclhs;
+ }
+ if (rhs->digits>set->digits) {
+ allocrhs=decRoundOperand(rhs, set, status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ // [following code does not require input rounding]
+
+ #if FASTMUL // fastpath can be used
+ // use the fast path if there are enough digits in the shorter
+ // operand to make the setup and takedown worthwhile
+ #define NEEDTWO (DECDPUN*2) // within two decUnitAddSub calls
+ if (rhs->digits>NEEDTWO) { // use fastpath...
+ // calculate the number of elements in each array
+ ilhs=(lhs->digits+FASTDIGS-1)/FASTDIGS; // [ceiling]
+ irhs=(rhs->digits+FASTDIGS-1)/FASTDIGS; // ..
+ iacc=ilhs+irhs;
+
+ // allocate buffers if required, as usual
+ needbytes=ilhs*sizeof(uInt);
+ if (needbytes>(Int)sizeof(zlhibuff)) {
+ alloclhi=(uInt *)malloc(needbytes);
+ zlhi=alloclhi;}
+ needbytes=irhs*sizeof(uInt);
+ if (needbytes>(Int)sizeof(zrhibuff)) {
+ allocrhi=(uInt *)malloc(needbytes);
+ zrhi=allocrhi;}
+
+ // Allocating the accumulator space needs a special case when
+ // DECDPUN=1 because when converting the accumulator to Units
+ // after the multiplication each 8-byte item becomes 9 1-byte
+ // units. Therefore iacc extra bytes are needed at the front
+ // (rounded up to a multiple of 8 bytes), and the uLong
+ // accumulator starts offset the appropriate number of units
+ // to the right to avoid overwrite during the unchunking.
+ needbytes=iacc*sizeof(uLong);
+ #if DECDPUN==1
+ zoff=(iacc+7)/8; // items to offset by
+ needbytes+=zoff*8;
+ #endif
+ if (needbytes>(Int)sizeof(zaccbuff)) {
+ allocacc=(uLong *)malloc(needbytes);
+ zacc=(uLong *)allocacc;}
+ if (zlhi==NULL||zrhi==NULL||zacc==NULL) {
+ *status|=DEC_Insufficient_storage;
+ break;}
+
+ acc=(Unit *)zacc; // -> target Unit array
+ #if DECDPUN==1
+ zacc+=zoff; // start uLong accumulator to right
+ #endif
+
+ // assemble the chunked copies of the left and right sides
+ for (count=lhs->digits, cup=lhs->lsu, lip=zlhi; count>0; lip++)
+ for (p=0, *lip=0; p<FASTDIGS && count>0;
+ p+=DECDPUN, cup++, count-=DECDPUN)
+ *lip+=*cup*powers[p];
+ lmsi=lip-1; // save -> msi
+ for (count=rhs->digits, cup=rhs->lsu, rip=zrhi; count>0; rip++)
+ for (p=0, *rip=0; p<FASTDIGS && count>0;
+ p+=DECDPUN, cup++, count-=DECDPUN)
+ *rip+=*cup*powers[p];
+ rmsi=rip-1; // save -> msi
+
+ // zero the accumulator
+ for (lp=zacc; lp<zacc+iacc; lp++) *lp=0;
+
+ /* Start the multiplication */
+ // Resolving carries can dominate the cost of accumulating the
+ // partial products, so this is only done when necessary.
+ // Each uLong item in the accumulator can hold values up to
+ // 2**64-1, and each partial product can be as large as
+ // (10**FASTDIGS-1)**2. When FASTDIGS=9, this can be added to
+ // itself 18.4 times in a uLong without overflowing, so during
+ // the main calculation resolution is carried out every 18th
+ // add -- every 162 digits. Similarly, when FASTDIGS=8, the
+ // partial products can be added to themselves 1844.6 times in
+ // a uLong without overflowing, so intermediate carry
+ // resolution occurs only every 14752 digits. Hence for common
+ // short numbers usually only the one final carry resolution
+ // occurs.
+ // (The count is set via FASTLAZY to simplify experiments to
+ // measure the value of this approach: a 35% improvement on a
+ // [34x34] multiply.)
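+      // The bounds above can be checked directly (added arithmetic):
+      // floor((2**64-1)/(10**9-1)**2)=18 and
+      // floor((2**64-1)/(10**8-1)**2)=1844, hence the FASTLAZY values.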
+ lazy=FASTLAZY; // carry delay count
+ for (rip=zrhi; rip<=rmsi; rip++) { // over each item in rhs
+ lp=zacc+(rip-zrhi); // where to add the lhs
+ for (lip=zlhi; lip<=lmsi; lip++, lp++) { // over each item in lhs
+ *lp+=(uLong)(*lip)*(*rip); // [this should in-line]
+ } // lip loop
+ lazy--;
+ if (lazy>0 && rip!=rmsi) continue;
+ lazy=FASTLAZY; // reset delay count
+ // spin up the accumulator resolving overflows
+ for (lp=zacc; lp<zacc+iacc; lp++) {
+ if (*lp<FASTBASE) continue; // it fits
+ lcarry=*lp/FASTBASE; // top part [slow divide]
+ // lcarry can exceed 2**32-1, so check again; this check
+ // and occasional extra divide (slow) is well worth it, as
+ // it allows FASTLAZY to be increased to 18 rather than 4
+ // in the FASTDIGS=9 case
+ if (lcarry<FASTBASE) carry=(uInt)lcarry; // [usual]
+ else { // two-place carry [fairly rare]
+ uInt carry2=(uInt)(lcarry/FASTBASE); // top top part
+ *(lp+2)+=carry2; // add to item+2
+ *lp-=((uLong)FASTBASE*FASTBASE*carry2); // [slow]
+ carry=(uInt)(lcarry-((uLong)FASTBASE*carry2)); // [inline]
+ }
+ *(lp+1)+=carry; // add to item above [inline]
+ *lp-=((uLong)FASTBASE*carry); // [inline]
+ } // carry resolution
+ } // rip loop
+
+ // The multiplication is complete; time to convert back into
+ // units. This can be done in-place in the accumulator and in
+ // 32-bit operations, because carries were resolved after the
+ // final add. This needs N-1 divides and multiplies for
+ // each item in the accumulator (which will become up to N
+ // units, where 2<=N<=9).
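+      // E.g. (illustrative, DECDPUN=3, FASTDIGS=9): the item 123456789
+      // unchunks to the three Units 789, 456, 123, using two
+      // divide/multiply steps and a final direct store.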
+ for (lp=zacc, up=acc; lp<zacc+iacc; lp++) {
+ uInt item=(uInt)*lp; // decapitate to uInt
+ for (p=0; p<FASTDIGS-DECDPUN; p+=DECDPUN, up++) {
+ uInt part=item/(DECDPUNMAX+1);
+ *up=(Unit)(item-(part*(DECDPUNMAX+1)));
+ item=part;
+ } // p
+ *up=(Unit)item; up++; // [final needs no division]
+ } // lp
+ accunits=up-acc; // count of units
+ }
+ else { // here to use units directly, without chunking ['old code']
+ #endif
+
+ // if accumulator will be too long for local storage, then allocate
+ acc=accbuff; // -> assume buffer for accumulator
+ needbytes=(D2U(lhs->digits)+D2U(rhs->digits))*sizeof(Unit);
+ if (needbytes>(Int)sizeof(accbuff)) {
+ allocacc=(Unit *)malloc(needbytes);
+ if (allocacc==NULL) {*status|=DEC_Insufficient_storage; break;}
+ acc=(Unit *)allocacc; // use the allocated space
+ }
+
+ /* Now the main long multiplication loop */
+ // Unlike the equivalent in the IBM Java implementation, there
+ // is no advantage in calculating from msu to lsu. So, do it
+ // by the book, as it were.
+ // Each iteration calculates ACC=ACC+MULTAND*MULT
+ accunits=1; // accumulator starts at '0'
+ *acc=0; // .. (lsu=0)
+ shift=0; // no multiplicand shift at first
+ madlength=D2U(lhs->digits); // this won't change
+ mermsup=rhs->lsu+D2U(rhs->digits); // -> msu+1 of multiplier
+
+ for (mer=rhs->lsu; mer<mermsup; mer++) {
+ // Here, *mer is the next Unit in the multiplier to use
+ // If non-zero [optimization] add it...
+ if (*mer!=0) accunits=decUnitAddSub(&acc[shift], accunits-shift,
+ lhs->lsu, madlength, 0,
+ &acc[shift], *mer)
+ + shift;
+ else { // extend acc with a 0; it will be used shortly
+ *(acc+accunits)=0; // [this avoids length of <=0 later]
+ accunits++;
+ }
+ // multiply multiplicand by 10**DECDPUN for next Unit to left
+ shift++; // add this for 'logical length'
+ } // n
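+      // Worked example (illustrative, DECDPUN=1): for 123*45 the loop
+      // first forms acc=123*5=615, then adds 123*4=492 at a one-unit
+      // shift (615+4920=5535) -- the schoolbook partial products.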
+ #if FASTMUL
+ } // unchunked units
+ #endif
+ // common end-path
+ #if DECTRACE
+ decDumpAr('*', acc, accunits); // Show exact result
+ #endif
+
+ // acc now contains the exact result of the multiplication,
+ // possibly with a leading zero unit; build the decNumber from
+ // it, noting if any residue
+ res->bits=bits; // set sign
+ res->digits=decGetDigits(acc, accunits); // count digits exactly
+
+ // There can be a 31-bit wrap in calculating the exponent.
+ // This can only happen if both input exponents are negative and
+ // both their magnitudes are large. If there was a wrap, set a
+ // safe very negative exponent, from which decFinalize() will
+ // raise a hard underflow shortly.
+ exponent=lhs->exponent+rhs->exponent; // calculate exponent
+ if (lhs->exponent<0 && rhs->exponent<0 && exponent>0)
+ exponent=-2*DECNUMMAXE; // force underflow
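+    // E.g. (illustrative): exponents of -1999999997 on both operands
+    // sum to -3999999994, which wraps positive in 32-bit arithmetic;
+    // the guard above then substitutes the safe -2*DECNUMMAXE.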
+ res->exponent=exponent; // OK to overwrite now
+
+
+ // Set the coefficient. If any rounding, residue records
+ decSetCoeff(res, set, acc, res->digits, &residue, status);
+ decFinish(res, set, &residue, status); // final cleanup
+ } while(0); // end protected
+
+ if (allocacc!=NULL) free(allocacc); // drop any storage used
+ #if DECSUBSET
+ if (allocrhs!=NULL) free(allocrhs); // ..
+ if (alloclhs!=NULL) free(alloclhs); // ..
+ #endif
+ #if FASTMUL
+ if (allocrhi!=NULL) free(allocrhi); // ..
+ if (alloclhi!=NULL) free(alloclhi); // ..
+ #endif
+ return res;
+ } // decMultiplyOp
+
+/* ------------------------------------------------------------------ */
+/* decExpOp -- effect exponentiation */
+/* */
+/* This computes C = exp(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context; note that rounding mode has no effect */
+/* */
+/* C must have space for set->digits digits. status is updated but */
+/* not set. */
+/* */
+/* Restrictions: */
+/* */
+/* digits, emax, and -emin in the context must be less than */
+/* 2*DEC_MAX_MATH (1999998), and the rhs must be within these */
+/* bounds or a zero. This is an internal routine, so these */
+/* restrictions are contractual and not enforced. */
+/* */
+/* A finite result is rounded using DEC_ROUND_HALF_EVEN; it will */
+/* almost always be correctly rounded, but may be up to 1 ulp in */
+/* error in rare cases. */
+/* */
+/* Finite results will always be full precision and Inexact, except */
+/* when A is a zero or -Infinity (giving 1 or 0 respectively). */
+/* ------------------------------------------------------------------ */
+/* The approach used here is similar to the algorithm described in   */
+/* */
+/* Variable Precision Exponential Function, T. E. Hull and */
+/* A. Abrham, ACM Transactions on Mathematical Software, Vol 12 #2, */
+/* pp79-91, ACM, June 1986. */
+/* */
+/* with the main difference being that the iterations in the series */
+/* evaluation are terminated dynamically (which does not require the */
+/* extra variable-precision variables which are expensive in this */
+/* context). */
+/* */
+/* The error analysis in Hull & Abrham's paper applies except for the */
+/* round-off error accumulation during the series evaluation. This */
+/* code does not precalculate the number of iterations and so cannot */
+/* use Horner's scheme. Instead, the accumulation is done at double- */
+/* precision, which ensures that the additions of the terms are exact */
+/* and do not accumulate round-off (and any round-off errors in the */
+/* terms themselves move 'to the right' faster than they can */
+/* accumulate). This code also extends the calculation by allowing, */
+/* in the spirit of other decNumber operators, the input to be more */
+/* precise than the result (the precision used is based on the more */
+/* precise of the input or requested result). */
+/* */
+/* Implementation notes: */
+/* */
+/* 1. This is separated out as decExpOp so it can be called from */
+/* other Mathematical functions (notably Ln) with a wider range */
+/* than normal. In particular, it can handle the slightly wider */
+/* (double) range needed by Ln (which has to be able to calculate */
+/*    exp(-x) where x can be the tiniest number (Ntiny)).            */
+/* */
+/* 2. Normalizing x to be <=0.1 (instead of <=1) reduces loop */
+/*    iterations by approximately a third with additional (although  */
+/* diminishing) returns as the range is reduced to even smaller */
+/* fractions. However, h (the power of 10 used to correct the */
+/* result at the end, see below) must be kept <=8 as otherwise */
+/* the final result cannot be computed. Hence the leverage is a */
+/* sliding value (8-h), where potentially the range is reduced */
+/* more for smaller values. */
+/* */
+/* The leverage that can be applied in this way is severely */
+/* limited by the cost of the raise-to-the power at the end, */
+/* which dominates when the number of iterations is small (less */
+/* than ten) or when rhs is short. As an example, the adjustment */
+/* x**10,000,000 needs 31 multiplications, all but one full-width. */
+/* */
+/* 3. The restrictions (especially precision) could be raised with */
+/* care, but the full decNumber range seems very hard within the */
+/* 32-bit limits. */
+/* */
+/* 4. The working precisions for the static buffers are twice the */
+/* obvious size to allow for calls from decNumberPower. */
+/* ------------------------------------------------------------------ */
+decNumber * decExpOp(decNumber *res, const decNumber *rhs,
+ decContext *set, uInt *status) {
+ uInt ignore=0; // working status
+ Int h; // adjusted exponent for 0.xxxx
+ Int p; // working precision
+ Int residue; // rounding residue
+ uInt needbytes; // for space calculations
+ const decNumber *x=rhs; // (may point to safe copy later)
+ decContext aset, tset, dset; // working contexts
+ Int comp; // work
+
+ // the argument is often copied to normalize it, so (unusually) it
+ // is treated like other buffers, using DECBUFFER, +1 in case
+ // DECBUFFER is 0
+ decNumber bufr[D2N(DECBUFFER*2+1)];
+ decNumber *allocrhs=NULL; // non-NULL if rhs buffer allocated
+
+ // the working precision will be no more than set->digits+8+1
+ // so for on-stack buffers DECBUFFER+9 is used, +1 in case DECBUFFER
+ // is 0 (and twice that for the accumulator)
+
+ // buffer for t, term (working precision plus)
+ decNumber buft[D2N(DECBUFFER*2+9+1)];
+ decNumber *allocbuft=NULL; // -> allocated buft, iff allocated
+ decNumber *t=buft; // term
+ // buffer for a, accumulator (working precision * 2), at least 9
+ decNumber bufa[D2N(DECBUFFER*4+18+1)];
+ decNumber *allocbufa=NULL; // -> allocated bufa, iff allocated
+ decNumber *a=bufa; // accumulator
+ // decNumber for the divisor term; this needs at most 9 digits
+ // and so can be fixed size [16 so can use standard context]
+ decNumber bufd[D2N(16)];
+ decNumber *d=bufd; // divisor
+ decNumber numone; // constant 1
+
+ #if DECCHECK
+ Int iterations=0; // for later sanity check
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ if (SPECIALARG) { // handle infinities and NaNs
+ if (decNumberIsInfinite(rhs)) { // an infinity
+ if (decNumberIsNegative(rhs)) // -Infinity -> +0
+ decNumberZero(res);
+ else decNumberCopy(res, rhs); // +Infinity -> self
+ }
+ else decNaNs(res, rhs, NULL, set, status); // a NaN
+ break;}
+
+ if (ISZERO(rhs)) { // zeros -> exact 1
+ decNumberZero(res); // make clean 1
+ *res->lsu=1; // ..
+ break;} // [no status to set]
+
+ // e**x when 0 < x < 0.66 is < 1+3x/2, hence can fast-path
+ // positive and negative tiny cases which will result in inexact
+ // 1. This also allows the later add-accumulate to always be
+ // exact (because its length will never be more than twice the
+ // working precision).
+ // The comparator (tiny) needs just one digit, so use the
+ // decNumber d for it (reused as the divisor, etc., below); its
+ // exponent is such that if x is positive it will have
+ // set->digits-1 zeros between the decimal point and the digit,
+ // which is 4, and if x is negative one more zero there as the
+ // more precise result will be of the form 0.9999999 rather than
+ // 1.0000001. Hence, tiny will be 0.0000004 if digits=7 and x>0
+    // or 0.00000004 if digits=7 and x<0. If the rhs is no larger
+    // than this then the result will be 1.000000.
+ decNumberZero(d); // clean
+ *d->lsu=4; // set 4 ..
+ d->exponent=-set->digits; // * 10**(-d)
+ if (decNumberIsNegative(rhs)) d->exponent--; // negative case
+ comp=decCompare(d, rhs, 1); // signless compare
+ if (comp==BADINT) {
+ *status|=DEC_Insufficient_storage;
+ break;}
+    if (comp>=0) { // rhs <= d, so rhs is tiny
+ Int shift=set->digits-1;
+ decNumberZero(res); // set 1
+ *res->lsu=1; // ..
+ res->digits=decShiftToMost(res->lsu, 1, shift);
+ res->exponent=-shift; // make 1.0000...
+ *status|=DEC_Inexact | DEC_Rounded; // .. inexactly
+ break;} // tiny
+
+ // set up the context to be used for calculating a, as this is
+ // used on both paths below
+ decContextDefault(&aset, DEC_INIT_DECIMAL64);
+ // accumulator bounds are as requested (could underflow)
+ aset.emax=set->emax; // usual bounds
+ aset.emin=set->emin; // ..
+ aset.clamp=0; // and no concrete format
+
+ // calculate the adjusted (Hull & Abrham) exponent (where the
+ // decimal point is just to the left of the coefficient msd)
+ h=rhs->exponent+rhs->digits;
+ // if h>8 then 10**h cannot be calculated safely; however, when
+ // h=8 then exp(|rhs|) will be at least exp(1E+7) which is at
+ // least 6.59E+4342944, so (due to the restriction on Emax/Emin)
+ // overflow (or underflow to 0) is guaranteed -- so this case can
+ // be handled by simply forcing the appropriate excess
+ if (h>8) { // overflow/underflow
+ // set up here so Power call below will over or underflow to
+ // zero; set accumulator to either 2 or 0.02
+ // [stack buffer for a is always big enough for this]
+ decNumberZero(a);
+ *a->lsu=2; // not 1 but < exp(1)
+ if (decNumberIsNegative(rhs)) a->exponent=-2; // make 0.02
+ h=8; // clamp so 10**h computable
+ p=9; // set a working precision
+ }
+ else { // h<=8
+ Int maxlever=(rhs->digits>8?1:0);
+ // [could/should increase this for precisions >40 or so, too]
+
+ // if h is 8, cannot normalize to a lower upper limit because
+ // the final result will not be computable (see notes above),
+ // but leverage can be applied whenever h is less than 8.
+      // Apply as much as possible, up to MAXLEVER digits, which
+ // sets the tradeoff against the cost of the later a**(10**h).
+ // As h is increased, the working precision below also
+ // increases to compensate for the "constant digits at the
+ // front" effect.
+ Int lever=MINI(8-h, maxlever); // leverage attainable
+ Int use=-rhs->digits-lever; // exponent to use for RHS
+ h+=lever; // apply leverage selected
+ if (h<0) { // clamp
+ use+=h; // [may end up subnormal]
+ h=0;
+ }
+ // Take a copy of RHS if it needs normalization (true whenever x>=1)
+ if (rhs->exponent!=use) {
+ decNumber *newrhs=bufr; // assume will fit on stack
+ needbytes=sizeof(decNumber)+(D2U(rhs->digits)-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufr)) { // need malloc space
+ allocrhs=(decNumber *)malloc(needbytes);
+ if (allocrhs==NULL) { // hopeless -- abandon
+ *status|=DEC_Insufficient_storage;
+ break;}
+ newrhs=allocrhs; // use the allocated space
+ }
+ decNumberCopy(newrhs, rhs); // copy to safe space
+ newrhs->exponent=use; // normalize; now <1
+ x=newrhs; // ready for use
+ // decNumberShow(x);
+ }
+
+ // Now use the usual power series to evaluate exp(x). The
+ // series starts as 1 + x + x^2/2 ... so prime ready for the
+ // third term by setting the term variable t=x, the accumulator
+ // a=1, and the divisor d=2.
+
+ // First determine the working precision. From Hull & Abrham
+ // this is set->digits+h+2. However, if x is 'over-precise' we
+ // need to allow for all its digits to potentially participate
+ // (consider an x where all the excess digits are 9s) so in
+ // this case use x->digits+h+2
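+      // (informative example: with set->digits=7 and h=2 -- that is,
+      // |rhs| in [10,100) -- this gives p=7+2+2=11 when x is not
+      // over-precise)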
+ p=MAXI(x->digits, set->digits)+h+2; // [h<=8]
+
+ // a and t are variable precision, and depend on p, so space
+ // must be allocated for them if necessary
+
+ // the accumulator needs to be able to hold 2p digits so that
+ // the additions on the second and subsequent iterations are
+ // sufficiently exact.
+ needbytes=sizeof(decNumber)+(D2U(p*2)-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufa)) { // need malloc space
+ allocbufa=(decNumber *)malloc(needbytes);
+ if (allocbufa==NULL) { // hopeless -- abandon
+ *status|=DEC_Insufficient_storage;
+ break;}
+ a=allocbufa; // use the allocated space
+ }
+ // the term needs to be able to hold p digits (which is
+ // guaranteed to be larger than x->digits, so the initial copy
+ // is safe); it may also be used for the raise-to-power
+ // calculation below, which needs an extra two digits
+ needbytes=sizeof(decNumber)+(D2U(p+2)-1)*sizeof(Unit);
+ if (needbytes>sizeof(buft)) { // need malloc space
+ allocbuft=(decNumber *)malloc(needbytes);
+ if (allocbuft==NULL) { // hopeless -- abandon
+ *status|=DEC_Insufficient_storage;
+ break;}
+ t=allocbuft; // use the allocated space
+ }
+
+ decNumberCopy(t, x); // term=x
+ decNumberZero(a); *a->lsu=1; // accumulator=1
+ decNumberZero(d); *d->lsu=2; // divisor=2
+ decNumberZero(&numone); *numone.lsu=1; // constant 1 for increment
+
+ // set up the contexts for calculating a, t, and d
+ decContextDefault(&tset, DEC_INIT_DECIMAL64);
+ dset=tset;
+ // accumulator bounds are set above, set precision now
+ aset.digits=p*2; // double
+ // term bounds avoid any underflow or overflow
+ tset.digits=p;
+ tset.emin=DEC_MIN_EMIN; // [emax is plenty]
+ // [dset.digits=16, etc., are sufficient]
+
+ // finally ready to roll
+ for (;;) {
+ #if DECCHECK
+ iterations++;
+ #endif
+ // only the status from the accumulation is interesting
+ // [but it should remain unchanged after first add]
+ decAddOp(a, a, t, &aset, 0, status); // a=a+t
+ decMultiplyOp(t, t, x, &tset, &ignore); // t=t*x
+ decDivideOp(t, t, d, &tset, DIVIDE, &ignore); // t=t/d
+ // the iteration ends when the term cannot affect the result,
+ // if rounded to p digits, which is when its value is smaller
+ // than the accumulator by p+1 digits. There must also be
+ // full precision in a.
+ if (((a->digits+a->exponent)>=(t->digits+t->exponent+p+1))
+ && (a->digits>=p)) break;
+ decAddOp(d, d, &numone, &dset, 0, &ignore); // d=d+1
+ } // iterate
+
+ #if DECCHECK
+ // just a sanity check; comment out test to show always
+ if (iterations>p+3)
+ printf("Exp iterations=%ld, status=%08lx, p=%ld, d=%ld\n",
+ (LI)iterations, (LI)*status, (LI)p, (LI)x->digits);
+ #endif
+ } // h<=8
+
+ // apply postconditioning: a=a**(10**h) -- this is calculated
+ // at a slightly higher precision than Hull & Abrham suggest
+ if (h>0) {
+ Int seenbit=0; // set once a 1-bit is seen
+ Int i; // counter
+ Int n=powers[h]; // always positive
+ aset.digits=p+2; // sufficient precision
+ // avoid the overhead and many extra digits of decNumberPower
+ // as all that is needed is the short 'multipliers' loop; here
+ // accumulate the answer into t
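+      // [informally: this is left-to-right square-and-multiply; each
+      // shift of n exposes the next bit at the sign position. For
+      // h=2, n=100=1100100 in binary, so the loop costs six squarings
+      // plus three multiplications by a]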
+ decNumberZero(t); *t->lsu=1; // acc=1
+ for (i=1;;i++){ // for each bit [top bit ignored]
+ // abandon if have had overflow or terminal underflow
+ if (*status & (DEC_Overflow|DEC_Underflow)) { // interesting?
+ if (*status&DEC_Overflow || ISZERO(t)) break;}
+ n=n<<1; // move next bit to testable position
+ if (n<0) { // top bit is set
+ seenbit=1; // OK, have a significant bit
+          decMultiplyOp(t, t, a, &aset, status); // acc=acc*a
+ }
+ if (i==31) break; // that was the last bit
+ if (!seenbit) continue; // no need to square 1
+ decMultiplyOp(t, t, t, &aset, status); // acc=acc*acc [square]
+ } /*i*/ // 32 bits
+ // decNumberShow(t);
+ a=t; // and carry on using t instead of a
+ }
+
+ // Copy and round the result to res
+ residue=1; // indicate dirt to right ..
+ if (ISZERO(a)) residue=0; // .. unless underflowed to 0
+ aset.digits=set->digits; // [use default rounding]
+ decCopyFit(res, a, &aset, &residue, status); // copy & shorten
+ decFinish(res, set, &residue, status); // cleanup/set flags
+ } while(0); // end protected
+
+ if (allocrhs !=NULL) free(allocrhs); // drop any storage used
+ if (allocbufa!=NULL) free(allocbufa); // ..
+ if (allocbuft!=NULL) free(allocbuft); // ..
+ // [status is handled by caller]
+ return res;
+ } // decExpOp
+
+/* ------------------------------------------------------------------ */
+/* Initial-estimate natural logarithm table */
+/* */
+/* LNnn -- 90-entry 16-bit table for values from .10 through .99. */
+/* The result is a 4-digit encode of the coefficient (c=the */
+/* top 14 bits encoding 0-9999) and a 2-digit encode of the */
+/* exponent (e=the bottom 2 bits encoding 0-3) */
+/* */
+/* The resulting value is given by: */
+/* */
+/* v = -c * 10**(-e-3) */
+/* */
+/* where e and c are extracted from entry k = LNnn[x-10] */
+/* where x is truncated (NB) into the range 10 through 99, */
+/* and then c = k>>2 and e = k&3. */
+/* ------------------------------------------------------------------ */
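+// Informative example: k=LNnn[50-10]=27329 decodes as c=27329>>2=6832
+// and e=27329&3=1, so v=-6832E-4=-0.6832, which is ln(0.505) to four
+// digits (each entry approximates ln at the midpoint of its
+// truncation interval)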
+const uShort LNnn[90]={9016, 8652, 8316, 8008, 7724, 7456, 7208,
+ 6972, 6748, 6540, 6340, 6148, 5968, 5792, 5628, 5464, 5312,
+ 5164, 5020, 4884, 4748, 4620, 4496, 4376, 4256, 4144, 4032,
+ 39233, 38181, 37157, 36157, 35181, 34229, 33297, 32389, 31501, 30629,
+ 29777, 28945, 28129, 27329, 26545, 25777, 25021, 24281, 23553, 22837,
+ 22137, 21445, 20769, 20101, 19445, 18801, 18165, 17541, 16925, 16321,
+ 15721, 15133, 14553, 13985, 13421, 12865, 12317, 11777, 11241, 10717,
+ 10197, 9685, 9177, 8677, 8185, 7697, 7213, 6737, 6269, 5801,
+ 5341, 4889, 4437, 39930, 35534, 31186, 26886, 22630, 18418, 14254,
+ 10130, 6046, 20055};
+
+/* ------------------------------------------------------------------ */
+/* decLnOp -- effect natural logarithm */
+/* */
+/* This computes C = ln(A) */
+/* */
+/* res is C, the result. C may be A */
+/* rhs is A */
+/* set is the context; note that rounding mode has no effect */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Notable cases: */
+/* A<0 -> Invalid */
+/* A=0 -> -Infinity (Exact) */
+/* A=+Infinity -> +Infinity (Exact) */
+/* A=1 exactly -> 0 (Exact) */
+/* */
+/* Restrictions (as for Exp): */
+/* */
+/* digits, emax, and -emin in the context must be less than */
+/* DEC_MAX_MATH+11 (1000010), and the rhs must be within these */
+/* bounds or a zero. This is an internal routine, so these */
+/* restrictions are contractual and not enforced. */
+/* */
+/* A finite result is rounded using DEC_ROUND_HALF_EVEN; it will */
+/* almost always be correctly rounded, but may be up to 1 ulp in */
+/* error in rare cases. */
+/* ------------------------------------------------------------------ */
+/* The result is calculated using Newton's method, with each */
+/* iteration calculating a' = a + x * exp(-a) - 1. See, for example, */
+/* Epperson 1989. */
+/* */
+/* The iteration ends when the adjustment x*exp(-a)-1 is tiny enough. */
+/* This has to be calculated at the sum of the precision of x and the */
+/* working precision. */
+/* */
+/* Implementation notes: */
+/* */
+/* 1. This is separated out as decLnOp so it can be called from */
+/* other Mathematical functions (e.g., Log 10) with a wider range */
+/* than normal. In particular, it can handle the slightly wider */
+/* (+9+2) range needed by a power function. */
+/* */
+/* 2. The speed of this function is about 10x slower than exp, as */
+/* it typically needs 4-6 iterations for short numbers, and the */
+/* extra precision needed adds a squaring effect, twice. */
+/* */
+/* 3. Fastpaths are included for ln(10) and ln(2), up to length 40, */
+/* as these are common requests. ln(10) is used by log10(x). */
+/* */
+/* 4. An iteration might be saved by widening the LNnn table, and */
+/* would certainly save at least one if it were made ten times */
+/* bigger, too (for truncated fractions 0.100 through 0.999). */
+/* However, for most practical evaluations, at least four or five */
+/*    iterations will be needed -- so this would only speed up by    */
+/* 20-25% and that probably does not justify increasing the table */
+/* size. */
+/* */
+/* 5. The static buffers are larger than might be expected to allow */
+/* for calls from decNumberPower. */
+/* ------------------------------------------------------------------ */
+decNumber * decLnOp(decNumber *res, const decNumber *rhs,
+ decContext *set, uInt *status) {
+ uInt ignore=0; // working status accumulator
+ uInt needbytes; // for space calculations
+ Int residue; // rounding residue
+ Int r; // rhs=f*10**r [see below]
+ Int p; // working precision
+ Int pp; // precision for iteration
+ Int t; // work
+
+ // buffers for a (accumulator, typically precision+2) and b
+ // (adjustment calculator, same size)
+ decNumber bufa[D2N(DECBUFFER+12)];
+ decNumber *allocbufa=NULL; // -> allocated bufa, iff allocated
+ decNumber *a=bufa; // accumulator/work
+ decNumber bufb[D2N(DECBUFFER*2+2)];
+  decNumber *allocbufb=NULL; // -> allocated bufb, iff allocated
+ decNumber *b=bufb; // adjustment/work
+
+ decNumber numone; // constant 1
+ decNumber cmp; // work
+ decContext aset, bset; // working contexts
+
+ #if DECCHECK
+ Int iterations=0; // for later sanity check
+ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ if (SPECIALARG) { // handle infinities and NaNs
+ if (decNumberIsInfinite(rhs)) { // an infinity
+ if (decNumberIsNegative(rhs)) // -Infinity -> error
+ *status|=DEC_Invalid_operation;
+ else decNumberCopy(res, rhs); // +Infinity -> self
+ }
+ else decNaNs(res, rhs, NULL, set, status); // a NaN
+ break;}
+
+ if (ISZERO(rhs)) { // +/- zeros -> -Infinity
+ decNumberZero(res); // make clean
+ res->bits=DECINF|DECNEG; // set - infinity
+ break;} // [no status to set]
+
+ // Non-zero negatives are bad...
+ if (decNumberIsNegative(rhs)) { // -x -> error
+ *status|=DEC_Invalid_operation;
+ break;}
+
+ // Here, rhs is positive, finite, and in range
+
+ // lookaside fastpath code for ln(2) and ln(10) at common lengths
+ if (rhs->exponent==0 && set->digits<=40) {
+ #if DECDPUN==1
+ if (rhs->lsu[0]==0 && rhs->lsu[1]==1 && rhs->digits==2) { // ln(10)
+ #else
+ if (rhs->lsu[0]==10 && rhs->digits==2) { // ln(10)
+ #endif
+ aset=*set; aset.round=DEC_ROUND_HALF_EVEN;
+ #define LN10 "2.302585092994045684017991454684364207601"
+ decNumberFromString(res, LN10, &aset);
+ *status|=(DEC_Inexact | DEC_Rounded); // is inexact
+ break;}
+ if (rhs->lsu[0]==2 && rhs->digits==1) { // ln(2)
+ aset=*set; aset.round=DEC_ROUND_HALF_EVEN;
+ #define LN2 "0.6931471805599453094172321214581765680755"
+ decNumberFromString(res, LN2, &aset);
+ *status|=(DEC_Inexact | DEC_Rounded);
+ break;}
+ } // integer and short
+
+ // Determine the working precision. This is normally the
+ // requested precision + 2, with a minimum of 9. However, if
+ // the rhs is 'over-precise' then allow for all its digits to
+ // potentially participate (consider an rhs where all the excess
+ // digits are 9s) so in this case use rhs->digits+2.
+ p=MAXI(rhs->digits, MAXI(set->digits, 7))+2;
+
+ // Allocate space for the accumulator and the high-precision
+ // adjustment calculator, if necessary. The accumulator must
+ // be able to hold p digits, and the adjustment up to
+ // rhs->digits+p digits. They are also made big enough for 16
+ // digits so that they can be used for calculating the initial
+ // estimate.
+ needbytes=sizeof(decNumber)+(D2U(MAXI(p,16))-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufa)) { // need malloc space
+ allocbufa=(decNumber *)malloc(needbytes);
+ if (allocbufa==NULL) { // hopeless -- abandon
+ *status|=DEC_Insufficient_storage;
+ break;}
+ a=allocbufa; // use the allocated space
+ }
+ pp=p+rhs->digits;
+ needbytes=sizeof(decNumber)+(D2U(MAXI(pp,16))-1)*sizeof(Unit);
+ if (needbytes>sizeof(bufb)) { // need malloc space
+ allocbufb=(decNumber *)malloc(needbytes);
+ if (allocbufb==NULL) { // hopeless -- abandon
+ *status|=DEC_Insufficient_storage;
+ break;}
+ b=allocbufb; // use the allocated space
+ }
+
+ // Prepare an initial estimate in acc. Calculate this by
+ // considering the coefficient of x to be a normalized fraction,
+ // f, with the decimal point at far left and multiplied by
+ // 10**r. Then, rhs=f*10**r and 0.1<=f<1, and
+ // ln(x) = ln(f) + ln(10)*r
+ // Get the initial estimate for ln(f) from a small lookup
+ // table (see above) indexed by the first two digits of f,
+ // truncated.
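+    // (informative example: for rhs=0.105, r=0 and the top two
+    // digits give t=LNnn[10-10]=9016, so c=2254 and e=0; the
+    // estimate is then -2.254, against ln(0.105)=-2.2538...)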
+
+ decContextDefault(&aset, DEC_INIT_DECIMAL64); // 16-digit extended
+ r=rhs->exponent+rhs->digits; // 'normalised' exponent
+ decNumberFromInt32(a, r); // a=r
+ decNumberFromInt32(b, 2302585); // b=ln(10) (2.302585)
+ b->exponent=-6; // ..
+ decMultiplyOp(a, a, b, &aset, &ignore); // a=a*b
+ // now get top two digits of rhs into b by simple truncate and
+ // force to integer
+ residue=0; // (no residue)
+ aset.digits=2; aset.round=DEC_ROUND_DOWN;
+ decCopyFit(b, rhs, &aset, &residue, &ignore); // copy & shorten
+ b->exponent=0; // make integer
+ t=decGetInt(b); // [cannot fail]
+ if (t<10) t=X10(t); // adjust single-digit b
+ t=LNnn[t-10]; // look up ln(b)
+ decNumberFromInt32(b, t>>2); // b=ln(b) coefficient
+ b->exponent=-(t&3)-3; // set exponent
+ b->bits=DECNEG; // ln(0.10)->ln(0.99) always -ve
+ aset.digits=16; aset.round=DEC_ROUND_HALF_EVEN; // restore
+ decAddOp(a, a, b, &aset, 0, &ignore); // acc=a+b
+ // the initial estimate is now in a, with up to 4 digits correct.
+ // When rhs is at or near Nmax the estimate will be low, so we
+ // will approach it from below, avoiding overflow when calling exp.
+
+ decNumberZero(&numone); *numone.lsu=1; // constant 1 for adjustment
+
+ // accumulator bounds are as requested (could underflow, but
+ // cannot overflow)
+ aset.emax=set->emax;
+ aset.emin=set->emin;
+ aset.clamp=0; // no concrete format
+ // set up a context to be used for the multiply and subtract
+ bset=aset;
+ bset.emax=DEC_MAX_MATH*2; // use double bounds for the
+ bset.emin=-DEC_MAX_MATH*2; // adjustment calculation
+ // [see decExpOp call below]
+ // for each iteration double the number of digits to calculate,
+ // up to a maximum of p
+ pp=9; // initial precision
+ // [initially 9 as then the sequence starts 7+2, 16+2, and
+ // 34+2, which is ideal for standard-sized numbers]
+ aset.digits=pp; // working context
+ bset.digits=pp+rhs->digits; // wider context
+ for (;;) { // iterate
+ #if DECCHECK
+ iterations++;
+ if (iterations>24) break; // consider 9 * 2**24
+ #endif
+ // calculate the adjustment (exp(-a)*x-1) into b. This is a
+ // catastrophic subtraction but it really is the difference
+ // from 1 that is of interest.
+ // Use the internal entry point to Exp as it allows the double
+ // range for calculating exp(-a) when a is the tiniest subnormal.
+ a->bits^=DECNEG; // make -a
+ decExpOp(b, a, &bset, &ignore); // b=exp(-a)
+ a->bits^=DECNEG; // restore sign of a
+ // now multiply by rhs and subtract 1, at the wider precision
+ decMultiplyOp(b, b, rhs, &bset, &ignore); // b=b*rhs
+ decAddOp(b, b, &numone, &bset, DECNEG, &ignore); // b=b-1
+
+ // the iteration ends when the adjustment cannot affect the
+ // result by >=0.5 ulp (at the requested digits), which
+ // is when its value is smaller than the accumulator by
+ // set->digits+1 digits (or it is zero) -- this is a looser
+ // requirement than for Exp because all that happens to the
+ // accumulator after this is the final rounding (but note that
+ // there must also be full precision in a, or a=0).
+
+ if (decNumberIsZero(b) ||
+ (a->digits+a->exponent)>=(b->digits+b->exponent+set->digits+1)) {
+ if (a->digits==p) break;
+ if (decNumberIsZero(a)) {
+ decCompareOp(&cmp, rhs, &numone, &aset, COMPARE, &ignore); // rhs=1 ?
+ if (cmp.lsu[0]==0) a->exponent=0; // yes, exact 0
+ else *status|=(DEC_Inexact | DEC_Rounded); // no, inexact
+ break;
+ }
+ // force padding if adjustment has gone to 0 before full length
+ if (decNumberIsZero(b)) b->exponent=a->exponent-p;
+ }
+
+ // not done yet ...
+ decAddOp(a, a, b, &aset, 0, &ignore); // a=a+b for next estimate
+ if (pp==p) continue; // precision is at maximum
+ // lengthen the next calculation
+ pp=pp*2; // double precision
+ if (pp>p) pp=p; // clamp to maximum
+ aset.digits=pp; // working context
+ bset.digits=pp+rhs->digits; // wider context
+ } // Newton's iteration
+
+ #if DECCHECK
+ // just a sanity check; remove the test to show always
+ if (iterations>24)
+ printf("Ln iterations=%ld, status=%08lx, p=%ld, d=%ld\n",
+ (LI)iterations, (LI)*status, (LI)p, (LI)rhs->digits);
+ #endif
+
+ // Copy and round the result to res
+ residue=1; // indicate dirt to right
+ if (ISZERO(a)) residue=0; // .. unless underflowed to 0
+ aset.digits=set->digits; // [use default rounding]
+ decCopyFit(res, a, &aset, &residue, status); // copy & shorten
+ decFinish(res, set, &residue, status); // cleanup/set flags
+ } while(0); // end protected
+
+ if (allocbufa!=NULL) free(allocbufa); // drop any storage used
+ if (allocbufb!=NULL) free(allocbufb); // ..
+ // [status is handled by caller]
+ return res;
+ } // decLnOp
+
+/* ------------------------------------------------------------------ */
+/* decQuantizeOp -- force exponent to requested value */
+/* */
+/* This computes C = op(A, B), where op adjusts the coefficient */
+/* of C (by rounding or shifting) such that the exponent (-scale) */
+/* of C has the value B or matches the exponent of B. */
+/* The numerical value of C will equal A, except for the effects of */
+/* any rounding that occurred. */
+/* */
+/* res is C, the result. C may be A or B */
+/* lhs is A, the number to adjust */
+/* rhs is B, the requested exponent */
+/* set is the context */
+/* quant is 1 for quantize or 0 for rescale */
+/* status is the status accumulator (this can be called without */
+/* risk of control loss) */
+/* */
+/* C must have space for set->digits digits. */
+/* */
+/* Unless there is an error or the result is infinite, the exponent */
+/* after the operation is guaranteed to be that requested. */
+/* ------------------------------------------------------------------ */
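+// Informative examples, following the usual decimal arithmetic
+// semantics: quantize(2.17, 0.001) => 2.170 (pad with zeros);
+// quantize(2.17, 1E-1) => 2.2 (round); quantize(217, 1E+1) => 2.2E+2
+// (round; exponent raised)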
+static decNumber * decQuantizeOp(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set,
+ Flag quant, uInt *status) {
+ #if DECSUBSET
+ decNumber *alloclhs=NULL; // non-NULL if rounded lhs allocated
+ decNumber *allocrhs=NULL; // .., rhs
+ #endif
+ const decNumber *inrhs=rhs; // save original rhs
+ Int reqdigits=set->digits; // requested DIGITS
+ Int reqexp; // requested exponent [-scale]
+ Int residue=0; // rounding residue
+ Int etiny=set->emin-(reqdigits-1);
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operands and set lostDigits status, as needed
+ if (lhs->digits>reqdigits) {
+ alloclhs=decRoundOperand(lhs, set, status);
+ if (alloclhs==NULL) break;
+ lhs=alloclhs;
+ }
+ if (rhs->digits>reqdigits) { // [this only checks lostDigits]
+ allocrhs=decRoundOperand(rhs, set, status);
+ if (allocrhs==NULL) break;
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ // [following code does not require input rounding]
+
+ // Handle special values
+ if (SPECIALARGS) {
+ // NaNs get usual processing
+ if (SPECIALARGS & (DECSNAN | DECNAN))
+ decNaNs(res, lhs, rhs, set, status);
+ // one infinity but not both is bad
+ else if ((lhs->bits ^ rhs->bits) & DECINF)
+ *status|=DEC_Invalid_operation;
+ // both infinity: return lhs
+ else decNumberCopy(res, lhs); // [nop if in place]
+ break;
+ }
+
+ // set requested exponent
+ if (quant) reqexp=inrhs->exponent; // quantize -- match exponents
+ else { // rescale -- use value of rhs
+ // Original rhs must be an integer that fits and is in range,
+ // which could be from -1999999997 to +999999999, thanks to
+ // subnormals
+ reqexp=decGetInt(inrhs); // [cannot fail]
+ }
+
+ #if DECSUBSET
+ if (!set->extended) etiny=set->emin; // no subnormals
+ #endif
+
+ if (reqexp==BADINT // bad (rescale only) or ..
+ || reqexp==BIGODD || reqexp==BIGEVEN // very big (ditto) or ..
+ || (reqexp<etiny) // < lowest
+ || (reqexp>set->emax)) { // > emax
+ *status|=DEC_Invalid_operation;
+ break;}
+
+ // the RHS has been processed, so it can be overwritten now if necessary
+ if (ISZERO(lhs)) { // zero coefficient unchanged
+ decNumberCopy(res, lhs); // [nop if in place]
+ res->exponent=reqexp; // .. just set exponent
+ #if DECSUBSET
+ if (!set->extended) res->bits=0; // subset specification; no -0
+ #endif
+ }
+ else { // non-zero lhs
+ Int adjust=reqexp-lhs->exponent; // digit adjustment needed
+ // if adjusted coefficient will definitely not fit, give up now
+ if ((lhs->digits-adjust)>reqdigits) {
+ *status|=DEC_Invalid_operation;
+ break;
+ }
+
+ if (adjust>0) { // increasing exponent
+ // this will decrease the length of the coefficient by adjust
+ // digits, and must round as it does so
+ decContext workset; // work
+ workset=*set; // clone rounding, etc.
+ workset.digits=lhs->digits-adjust; // set requested length
+ // [note that the latter can be <1, here]
+ decCopyFit(res, lhs, &workset, &residue, status); // fit to result
+ decApplyRound(res, &workset, residue, status); // .. and round
+ residue=0; // [used]
+ // If just rounded a 999s case, exponent will be off by one;
+ // adjust back (after checking space), if so.
+ if (res->exponent>reqexp) {
+ // re-check needed, e.g., for quantize(0.9999, 0.001) under
+ // set->digits==3
+ if (res->digits==reqdigits) { // cannot shift by 1
+ *status&=~(DEC_Inexact | DEC_Rounded); // [clean these]
+ *status|=DEC_Invalid_operation;
+ break;
+ }
+ res->digits=decShiftToMost(res->lsu, res->digits, 1); // shift
+ res->exponent--; // (re)adjust the exponent.
+ }
+ #if DECSUBSET
+ if (ISZERO(res) && !set->extended) res->bits=0; // subset; no -0
+ #endif
+ } // increase
+ else /* adjust<=0 */ { // decreasing or = exponent
+ // this will increase the length of the coefficient by -adjust
+ // digits, by adding zero or more trailing zeros; this is
+ // already checked for fit, above
+ decNumberCopy(res, lhs); // [it will fit]
+ // if padding needed (adjust<0), add it now...
+ if (adjust<0) {
+ res->digits=decShiftToMost(res->lsu, res->digits, -adjust);
+ res->exponent+=adjust; // adjust the exponent
+ }
+ } // decrease
+ } // non-zero
+
+ // Check for overflow [do not use Finalize in this case, as an
+ // overflow here is a "don't fit" situation]
+ if (res->exponent>set->emax-res->digits+1) { // too big
+ *status|=DEC_Invalid_operation;
+ break;
+ }
+ else {
+ decFinalize(res, set, &residue, status); // set subnormal flags
+ *status&=~DEC_Underflow; // suppress Underflow [as per 754]
+ }
+ } while(0); // end protected
+
+ #if DECSUBSET
+ if (allocrhs!=NULL) free(allocrhs); // drop any storage used
+ if (alloclhs!=NULL) free(alloclhs); // ..
+ #endif
+ return res;
+ } // decQuantizeOp
+
+/* ------------------------------------------------------------------ */
+/* decCompareOp -- compare, min, or max two Numbers */
+/* */
+/* This computes C = A ? B and carries out one of seven operations:  */
+/* COMPARE -- returns the signum (as a number) giving the */
+/* result of a comparison unless one or both */
+/* operands is a NaN (in which case a NaN results) */
+/* COMPSIG -- as COMPARE except that a quiet NaN raises */
+/* Invalid operation. */
+/* COMPMAX -- returns the larger of the operands, using the */
+/* 754 maxnum operation */
+/* COMPMAXMAG -- ditto, comparing absolute values */
+/* COMPMIN -- the 754 minnum operation */
+/* COMPMINMAG -- ditto, comparing absolute values */
+/*     COMPTOTAL  -- returns the signum (as a number) giving the     */
+/* result of a comparison using 754 total ordering */
+/* */
+/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
+/* lhs is A */
+/* rhs is B */
+/* set is the context */
+/* op is the operation flag */
+/* status is the usual accumulator */
+/* */
+/* C must have space for one digit for COMPARE or set->digits for */
+/* COMPMAX, COMPMIN, COMPMAXMAG, or COMPMINMAG. */
+/* ------------------------------------------------------------------ */
+/* The emphasis here is on speed for common cases, and avoiding */
+/* coefficient comparison if possible. */
+/* ------------------------------------------------------------------ */
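+// Informative example: under COMPTOTAL the numerically equal 1.00
+// and 1.0 compare as 1.00 < 1.0 (for positives, the smaller exponent
+// orders first); for -1.00 and -1.0 the ordering is inverted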
+decNumber * decCompareOp(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set,
+ Flag op, uInt *status) {
+ #if DECSUBSET
+ decNumber *alloclhs=NULL; // non-NULL if rounded lhs allocated
+ decNumber *allocrhs=NULL; // .., rhs
+ #endif
+ Int result=0; // default result value
+ uByte merged; // work
+
+ #if DECCHECK
+ if (decCheckOperands(res, lhs, rhs, set)) return res;
+ #endif
+
+ do { // protect allocated storage
+ #if DECSUBSET
+ if (!set->extended) {
+ // reduce operands and set lostDigits status, as needed
+ if (lhs->digits>set->digits) {
+ alloclhs=decRoundOperand(lhs, set, status);
+ if (alloclhs==NULL) {result=BADINT; break;}
+ lhs=alloclhs;
+ }
+ if (rhs->digits>set->digits) {
+ allocrhs=decRoundOperand(rhs, set, status);
+ if (allocrhs==NULL) {result=BADINT; break;}
+ rhs=allocrhs;
+ }
+ }
+ #endif
+ // [following code does not require input rounding]
+
+ // If total ordering then handle differing signs 'up front'
+ if (op==COMPTOTAL) { // total ordering
+ if (decNumberIsNegative(lhs) & !decNumberIsNegative(rhs)) {
+ result=-1;
+ break;
+ }
+ if (!decNumberIsNegative(lhs) & decNumberIsNegative(rhs)) {
+ result=+1;
+ break;
+ }
+ }
+
+ // handle NaNs specially; let infinities drop through
+ // This assumes sNaN (even just one) leads to NaN.
+ merged=(lhs->bits | rhs->bits) & (DECSNAN | DECNAN);
+ if (merged) { // a NaN bit set
+ if (op==COMPARE); // result will be NaN
+ else if (op==COMPSIG) // treat qNaN as sNaN
+ *status|=DEC_Invalid_operation | DEC_sNaN;
+ else if (op==COMPTOTAL) { // total ordering, always finite
+ // signs are known to be the same; compute the ordering here
+ // as if the signs are both positive, then invert for negatives
+ if (!decNumberIsNaN(lhs)) result=-1;
+ else if (!decNumberIsNaN(rhs)) result=+1;
+ // here if both NaNs
+ else if (decNumberIsSNaN(lhs) && decNumberIsQNaN(rhs)) result=-1;
+ else if (decNumberIsQNaN(lhs) && decNumberIsSNaN(rhs)) result=+1;
+ else { // both NaN or both sNaN
+ // now it just depends on the payload
+ result=decUnitCompare(lhs->lsu, D2U(lhs->digits),
+ rhs->lsu, D2U(rhs->digits), 0);
+ // [Error not possible, as these are 'aligned']
+ } // both same NaNs
+ if (decNumberIsNegative(lhs)) result=-result;
+ break;
+ } // total order
+
+ else if (merged & DECSNAN); // sNaN -> qNaN
+ else { // here if MIN or MAX and one or two quiet NaNs
+ // min or max -- 754 rules ignore single NaN
+ if (!decNumberIsNaN(lhs) || !decNumberIsNaN(rhs)) {
+ // just one NaN; force choice to be the non-NaN operand
+ op=COMPMAX;
+ if (lhs->bits & DECNAN) result=-1; // pick rhs
+ else result=+1; // pick lhs
+ break;
+ }
+ } // max or min
+ op=COMPNAN; // use special path
+ decNaNs(res, lhs, rhs, set, status); // propagate NaN
+ break;
+ }
+ // have numbers
+ if (op==COMPMAXMAG || op==COMPMINMAG) result=decCompare(lhs, rhs, 1);
+ else result=decCompare(lhs, rhs, 0); // sign matters
+ } while(0); // end protected
+
+ if (result==BADINT) *status|=DEC_Insufficient_storage; // rare
+ else {
+    if (op==COMPARE || op==COMPSIG || op==COMPTOTAL) { // returning signum
+ if (op==COMPTOTAL && result==0) {
+ // operands are numerically equal or same NaN (and same sign,
+ // tested first); if identical, leave result 0
+ if (lhs->exponent!=rhs->exponent) {
+ if (lhs->exponent<rhs->exponent) result=-1;
+ else result=+1;
+ if (decNumberIsNegative(lhs)) result=-result;
+ } // lexp!=rexp
+ } // total-order by exponent
+ decNumberZero(res); // [always a valid result]
+ if (result!=0) { // must be -1 or +1
+ *res->lsu=1;
+ if (result<0) res->bits=DECNEG;
+ }
+ }
+ else if (op==COMPNAN); // special, drop through
+ else { // MAX or MIN, non-NaN result
+ Int residue=0; // rounding accumulator
+ // choose the operand for the result
+ const decNumber *choice;
+ if (result==0) { // operands are numerically equal
+ // choose according to sign then exponent (see 754)
+ uByte slhs=(lhs->bits & DECNEG);
+ uByte srhs=(rhs->bits & DECNEG);
+ #if DECSUBSET
+ if (!set->extended) { // subset: force left-hand
+ op=COMPMAX;
+ result=+1;
+ }
+ else
+ #endif
+ if (slhs!=srhs) { // signs differ
+ if (slhs) result=-1; // rhs is max
+ else result=+1; // lhs is max
+ }
+ else if (slhs && srhs) { // both negative
+ if (lhs->exponent<rhs->exponent) result=+1;
+ else result=-1;
+ // [if equal, use lhs, technically identical]
+ }
+ else { // both positive
+ if (lhs->exponent>rhs->exponent) result=+1;
+ else result=-1;
+ // [ditto]
+ }
+ } // numerically equal
+ // here result will be non-0; reverse if looking for MIN
+ if (op==COMPMIN || op==COMPMINMAG) result=-result;
+ choice=(result>0 ? lhs : rhs); // choose
+ // copy chosen to result, rounding if need be
+ decCopyFit(res, choice, set, &residue, status);
+ decFinish(res, set, &residue, status);
+ }
+ }
+ #if DECSUBSET
+ if (allocrhs!=NULL) free(allocrhs); // free any storage used
+ if (alloclhs!=NULL) free(alloclhs); // ..
+ #endif
+ return res;
+ } // decCompareOp
+
+/* ------------------------------------------------------------------ */
+/* decCompare -- compare two decNumbers by numerical value */
+/* */
+/* This routine compares A ? B without altering them. */
+/* */
+/* Arg1 is A, a decNumber which is not a NaN */
+/* Arg2 is B, a decNumber which is not a NaN */
+/* Arg3 is 1 for a sign-independent compare, 0 otherwise */
+/* */
+/* returns -1, 0, or 1 for A<B, A==B, or A>B, or BADINT if failure */
+/* (the only possible failure is an allocation error) */
+/* ------------------------------------------------------------------ */
+static Int decCompare(const decNumber *lhs, const decNumber *rhs,
+ Flag abs) {
+ Int result; // result value
+ Int sigr; // rhs signum
+ Int compare; // work
+
+ result=1; // assume signum(lhs)
+ if (ISZERO(lhs)) result=0;
+ if (abs) {
+ if (ISZERO(rhs)) return result; // LHS wins or both 0
+ // RHS is non-zero
+ if (result==0) return -1; // LHS is 0; RHS wins
+ // [here, both non-zero, result=1]
+ }
+ else { // signs matter
+ if (result && decNumberIsNegative(lhs)) result=-1;
+ sigr=1; // compute signum(rhs)
+ if (ISZERO(rhs)) sigr=0;
+ else if (decNumberIsNegative(rhs)) sigr=-1;
+ if (result > sigr) return +1; // L > R, return 1
+ if (result < sigr) return -1; // L < R, return -1
+ if (result==0) return 0; // both 0
+ }
+
+ // signums are the same; both are non-zero
+ if ((lhs->bits | rhs->bits) & DECINF) { // one or more infinities
+ if (decNumberIsInfinite(rhs)) {
+      if (decNumberIsInfinite(lhs)) result=0; // both infinite
+ else result=-result; // only rhs infinite
+ }
+ return result;
+ }
+ // must compare the coefficients, allowing for exponents
+ if (lhs->exponent>rhs->exponent) { // LHS exponent larger
+ // swap sides, and sign
+ const decNumber *temp=lhs;
+ lhs=rhs;
+ rhs=temp;
+ result=-result;
+ }
+ compare=decUnitCompare(lhs->lsu, D2U(lhs->digits),
+ rhs->lsu, D2U(rhs->digits),
+ rhs->exponent-lhs->exponent);
+ if (compare!=BADINT) compare*=result; // comparison succeeded
+ return compare;
+ } // decCompare
+
+/* ------------------------------------------------------------------ */
+/* decUnitCompare -- compare two >=0 integers in Unit arrays */
+/* */
+/* This routine compares A ? B*10**E where A and B are unit arrays */
+/* A is a plain integer */
+/* B has an exponent of E (which must be non-negative) */
+/* */
+/* Arg1 is A first Unit (lsu) */
+/* Arg2 is A length in Units */
+/* Arg3 is B first Unit (lsu) */
+/* Arg4 is B length in Units */
+/* Arg5 is E (0 if the units are aligned) */
+/* */
+/* returns -1, 0, or 1 for A<B, A==B, or A>B, or BADINT if failure */
+/* (the only possible failure is an allocation error, which can */
+/* only occur if E!=0) */
+/* ------------------------------------------------------------------ */
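+// Informative example: with DECDPUN=3, comparing A={123} against
+// B={12} with E=1 computes acc=A+B*(-10)={3}; the result is
+// positive, so +1 is returned (123 > 120)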
+static Int decUnitCompare(const Unit *a, Int alength,
+ const Unit *b, Int blength, Int exp) {
+ Unit *acc; // accumulator for result
+ Unit accbuff[SD2U(DECBUFFER*2+1)]; // local buffer
+ Unit *allocacc=NULL; // -> allocated acc buffer, iff allocated
+ Int accunits, need; // units in use or needed for acc
+ const Unit *l, *r, *u; // work
+ Int expunits, exprem, result; // ..
+
+ if (exp==0) { // aligned; fastpath
+ if (alength>blength) return 1;
+ if (alength<blength) return -1;
+ // same number of units in both -- need unit-by-unit compare
+ l=a+alength-1;
+ r=b+alength-1;
+ for (;l>=a; l--, r--) {
+ if (*l>*r) return 1;
+ if (*l<*r) return -1;
+ }
+ return 0; // all units match
+ } // aligned
+
+  // Unaligned. If one is more than one unit longer than the other,
+  // allowing approximately for the padding, the result is immediate
+ if (alength>blength+(Int)D2U(exp)) return 1;
+ if (alength+1<blength+(Int)D2U(exp)) return -1;
+
+ // Need to do a real subtract. For this, a result buffer is needed
+ // even though only the sign is of interest. Its length needs
+ // to be the larger of alength and padded blength, +2
+ need=blength+D2U(exp); // maximum real length of B
+ if (need<alength) need=alength;
+ need+=2;
+ acc=accbuff; // assume use local buffer
+ if (need*sizeof(Unit)>sizeof(accbuff)) {
+ allocacc=(Unit *)malloc(need*sizeof(Unit));
+ if (allocacc==NULL) return BADINT; // hopeless -- abandon
+ acc=allocacc;
+ }
+ // Calculate units and remainder from exponent.
+ expunits=exp/DECDPUN;
+ exprem=exp%DECDPUN;
+ // subtract [A+B*(-m)]
+ accunits=decUnitAddSub(a, alength, b, blength, expunits, acc,
+ -(Int)powers[exprem]);
+ // [UnitAddSub result may have leading zeros, even on zero]
+ if (accunits<0) result=-1; // negative result
+ else { // non-negative result
+ // check units of the result before freeing any storage
+ for (u=acc; u<acc+accunits-1 && *u==0;) u++;
+ result=(*u==0 ? 0 : +1);
+ }
+ // clean up and return the result
+ if (allocacc!=NULL) free(allocacc); // drop any storage used
+ return result;
+ } // decUnitCompare
+
+/* ------------------------------------------------------------------ */
+/* decUnitAddSub -- add or subtract two >=0 integers in Unit arrays */
+/* */
+/* This routine performs the calculation: */
+/* */
+/* C=A+(B*M) */
+/* */
+/* Where M is in the range -DECDPUNMAX through +DECDPUNMAX. */
+/* */
+/* A may be shorter or longer than B. */
+/* */
+/* Leading zeros are not removed after a calculation. The result is */
+/* either the same length as the longer of A and B (adding any */
+/* shift), or one Unit longer than that (if a Unit carry occurred). */
+/* */
+/* A and B content are not altered unless C is also A or B. */
+/* C may be the same array as A or B, but only if no zero padding is */
+/* requested (that is, C may be B only if bshift==0). */
+/* C is filled from the lsu; only those units necessary to complete */
+/* the calculation are referenced. */
+/* */
+/* Arg1 is A first Unit (lsu) */
+/* Arg2 is A length in Units */
+/* Arg3 is B first Unit (lsu) */
+/* Arg4 is B length in Units */
+/* Arg5 is B shift in Units (>=0; pads with 0 units if positive) */
+/* Arg6 is C first Unit (lsu) */
+/* Arg7 is M, the multiplier */
+/* */
+/* returns the count of Units written to C, which will be non-zero */
+/* and negated if the result is negative. That is, the sign of the */
+/* returned Int is the sign of the result (positive for zero) and */
+/* the absolute value of the Int is the count of Units. */
+/* */
+/* It is the caller's responsibility to make sure that C size is */
+/* safe, allowing space if necessary for a one-Unit carry. */
+/* */
+/* This routine is severely performance-critical; *any* change here */
+/* must be measured (timed) to assure no performance degradation. */
+/* In particular, trickery here tends to be counter-productive, as */
+/* increased complexity of code hurts register optimizations on */
+/* register-poor architectures. Avoiding divisions is nearly */
+/* always a Good Idea, however. */
+/* */
+/* Special thanks to Rick McGuire (IBM Cambridge, MA) and Dave Clark */
+/* (IBM Warwick, UK) for some of the ideas used in this routine. */
+/* ------------------------------------------------------------------ */
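+// Informative examples, with DECDPUN=3: A={999}, B={1}, M=+1,
+// bshift=0 gives C={0,1} (value 1000) and returns 2 (a carried
+// unit); A={5}, B={7}, M=-1 gives C={2} and returns -1 (one unit,
+// negated because the true result is -2)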
+static Int decUnitAddSub(const Unit *a, Int alength,
+ const Unit *b, Int blength, Int bshift,
+ Unit *c, Int m) {
+ const Unit *alsu=a; // A lsu [need to remember it]
+ Unit *clsu=c; // C ditto
+ Unit *minC; // low water mark for C
+ Unit *maxC; // high water mark for C
+ eInt carry=0; // carry integer (could be Long)
+ Int add; // work
+ #if DECDPUN<=4 // myriadal, millenary, etc.
+ Int est; // estimated quotient
+ #endif
+
+ #if DECTRACE
+ if (alength<1 || blength<1)
+ printf("decUnitAddSub: alen blen m %ld %ld [%ld]\n", alength, blength, m);
+ #endif
+
+ maxC=c+alength; // A is usually the longer
+ minC=c+blength; // .. and B the shorter
+ if (bshift!=0) { // B is shifted; low As copy across
+ minC+=bshift;
+ // if in place [common], skip copy unless there's a gap [rare]
+ if (a==c && bshift<=alength) {
+ c+=bshift;
+ a+=bshift;
+ }
+ else for (; c<clsu+bshift; a++, c++) { // copy needed
+ if (a<alsu+alength) *c=*a;
+ else *c=0;
+ }
+ }
+ if (minC>maxC) { // swap
+ Unit *hold=minC;
+ minC=maxC;
+ maxC=hold;
+ }
+
+ // For speed, do the addition as two loops; the first where both A
+ // and B contribute, and the second (if necessary) where only one or
+ // other of the numbers contribute.
+ // Carry handling is the same (i.e., duplicated) in each case.
+ for (; c<minC; c++) {
+ carry+=*a;
+ a++;
+ carry+=((eInt)*b)*m; // [special-casing m=1/-1
+ b++; // here is not a win]
+ // here carry is new Unit of digits; it could be +ve or -ve
+ if ((ueInt)carry<=DECDPUNMAX) { // fastpath 0-DECDPUNMAX
+ *c=(Unit)carry;
+ carry=0;
+ continue;
+ }
+ #if DECDPUN==4 // use divide-by-multiply
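+    // [the constant 53687 is just under 2**29/10000, so est is the
+    // true quotient or at most one short; the short case is fixed up
+    // below. The DECDPUN==3 path uses 16777, just under 2**24/1000,
+    // in the same way]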
+ if (carry>=0) {
+ est=(((ueInt)carry>>11)*53687)>>18;
+ *c=(Unit)(carry-est*(DECDPUNMAX+1)); // remainder
+ carry=est; // likely quotient [89%]
+ if (*c<DECDPUNMAX+1) continue; // estimate was correct
+ carry++;
+ *c-=DECDPUNMAX+1;
+ continue;
+ }
+ // negative case
+ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); // make positive
+ est=(((ueInt)carry>>11)*53687)>>18;
+ *c=(Unit)(carry-est*(DECDPUNMAX+1));
+ carry=est-(DECDPUNMAX+1); // correctly negative
+ if (*c<DECDPUNMAX+1) continue; // was OK
+ carry++;
+ *c-=DECDPUNMAX+1;
+ #elif DECDPUN==3
+ if (carry>=0) {
+ est=(((ueInt)carry>>3)*16777)>>21;
+ *c=(Unit)(carry-est*(DECDPUNMAX+1)); // remainder
+ carry=est; // likely quotient [99%]
+ if (*c<DECDPUNMAX+1) continue; // estimate was correct
+ carry++;
+ *c-=DECDPUNMAX+1;
+ continue;
+ }
+ // negative case
+ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); // make positive
+ est=(((ueInt)carry>>3)*16777)>>21;
+ *c=(Unit)(carry-est*(DECDPUNMAX+1));
+ carry=est-(DECDPUNMAX+1); // correctly negative
+ if (*c<DECDPUNMAX+1) continue; // was OK
+ carry++;
+ *c-=DECDPUNMAX+1;
+ #elif DECDPUN<=2
+ // Can use QUOT10 as carry <= 4 digits
+ if (carry>=0) {
+ est=QUOT10(carry, DECDPUN);
+ *c=(Unit)(carry-est*(DECDPUNMAX+1)); // remainder
+ carry=est; // quotient
+ continue;
+ }
+ // negative case
+ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); // make positive
+ est=QUOT10(carry, DECDPUN);
+ *c=(Unit)(carry-est*(DECDPUNMAX+1));
+ carry=est-(DECDPUNMAX+1); // correctly negative
+ #else
+ // remainder operator is undefined if negative, so must test
+ if ((ueInt)carry<(DECDPUNMAX+1)*2) { // fastpath carry +1
+ *c=(Unit)(carry-(DECDPUNMAX+1)); // [helps additions]
+ carry=1;
+ continue;
+ }
+ if (carry>=0) {
+ *c=(Unit)(carry%(DECDPUNMAX+1));
+ carry=carry/(DECDPUNMAX+1);
+ continue;
+ }
+ // negative case
+ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); // make positive
+ *c=(Unit)(carry%(DECDPUNMAX+1));
+ carry=carry/(DECDPUNMAX+1)-(DECDPUNMAX+1);
+ #endif
+ } // c
+
+ // now may have one or other to complete
+ // [pretest to avoid loop setup/shutdown]
+ if (c<maxC) for (; c<maxC; c++) {
+ if (a<alsu+alength) { // still in A
+ carry+=*a;
+ a++;
+ }
+ else { // inside B
+ carry+=((eInt)*b)*m;
+ b++;
+ }
+ // here carry is new Unit of digits; it could be +ve or -ve and
+ // magnitude up to DECDPUNMAX squared
+ if ((ueInt)carry<=DECDPUNMAX) { // fastpath 0-DECDPUNMAX
+ *c=(Unit)carry;
+ carry=0;
+ continue;
+ }
+ // result for this unit is negative or >DECDPUNMAX
+ #if DECDPUN==4 // use divide-by-multiply
+ if (carry>=0) {
+ est=(((ueInt)carry>>11)*53687)>>18;
+ *c=(Unit)(carry-est*(DECDPUNMAX+1)); // remainder
+ carry=est; // likely quotient [79.7%]
+ if (*c<DECDPUNMAX+1) continue; // estimate was correct
+ carry++;
+ *c-=DECDPUNMAX+1;
+ continue;
+ }
+ // negative case
+ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); // make positive
+ est=(((ueInt)carry>>11)*53687)>>18;
+ *c=(Unit)(carry-est*(DECDPUNMAX+1));
+ carry=est-(DECDPUNMAX+1); // correctly negative
+ if (*c<DECDPUNMAX+1) continue; // was OK
+ carry++;
+ *c-=DECDPUNMAX+1;
+ #elif DECDPUN==3
+ if (carry>=0) {
+ est=(((ueInt)carry>>3)*16777)>>21;
+ *c=(Unit)(carry-est*(DECDPUNMAX+1)); // remainder
+ carry=est; // likely quotient [99%]
+ if (*c<DECDPUNMAX+1) continue; // estimate was correct
+ carry++;
+ *c-=DECDPUNMAX+1;
+ continue;
+ }
+ // negative case
+ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); // make positive
+ est=(((ueInt)carry>>3)*16777)>>21;
+ *c=(Unit)(carry-est*(DECDPUNMAX+1));
+ carry=est-(DECDPUNMAX+1); // correctly negative
+ if (*c<DECDPUNMAX+1) continue; // was OK
+ carry++;
+ *c-=DECDPUNMAX+1;
+ #elif DECDPUN<=2
+ if (carry>=0) {
+ est=QUOT10(carry, DECDPUN);
+ *c=(Unit)(carry-est*(DECDPUNMAX+1)); // remainder
+ carry=est; // quotient
+ continue;
+ }
+ // negative case
+ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); // make positive
+ est=QUOT10(carry, DECDPUN);
+ *c=(Unit)(carry-est*(DECDPUNMAX+1));
+ carry=est-(DECDPUNMAX+1); // correctly negative
+ #else
+    if ((ueInt)carry<(DECDPUNMAX+1)*2) { // fastpath carry 1
+ *c=(Unit)(carry-(DECDPUNMAX+1));
+ carry=1;
+ continue;
+ }
+ // remainder operator is undefined if negative, so must test
+ if (carry>=0) {
+ *c=(Unit)(carry%(DECDPUNMAX+1));
+ carry=carry/(DECDPUNMAX+1);
+ continue;
+ }
+ // negative case
+ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); // make positive
+ *c=(Unit)(carry%(DECDPUNMAX+1));
+ carry=carry/(DECDPUNMAX+1)-(DECDPUNMAX+1);
+ #endif
+ } // c
+
+ // OK, all A and B processed; might still have carry or borrow
+ // return number of Units in the result, negated if a borrow
+ if (carry==0) return c-clsu; // no carry, so no more to do
+ if (carry>0) { // positive carry
+ *c=(Unit)carry; // place as new unit
+ c++; // ..
+ return c-clsu;
+ }
+ // -ve carry: it's a borrow; complement needed
+ add=1; // temporary carry...
+ for (c=clsu; c<maxC; c++) {
+ add=DECDPUNMAX+add-*c;
+ if (add<=DECDPUNMAX) {
+ *c=(Unit)add;
+ add=0;
+ }
+ else {
+ *c=0;
+ add=1;
+ }
+ }
+ // add an extra unit iff it would be non-zero
+ #if DECTRACE
+ printf("UAS borrow: add %ld, carry %ld\n", add, carry);
+ #endif
+ if ((add-carry-1)!=0) {
+ *c=(Unit)(add-carry-1);
+ c++; // interesting, include it
+ }
+ return clsu-c; // -ve result indicates borrowed
+ } // decUnitAddSub
+
+/* ------------------------------------------------------------------ */
+/* decTrim -- trim trailing zeros or normalize */
+/* */
+/* dn is the number to trim or normalize */
+/* set is the context to use to check for clamp */
+/* all is 1 to remove all trailing zeros, 0 for just fraction ones */
+/*   noclamp is 1 for an unconditional (unclamped) trim               */
+/* dropped returns the number of discarded trailing zeros */
+/* returns dn */
+/* */
+/* If clamp is set in the context then the number of zeros trimmed */
+/* may be limited if the exponent is high. */
+/* All fields are updated as required. This is a utility operation, */
+/* so special values are unchanged and no error is possible. */
+/* ------------------------------------------------------------------ */
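+// Informative example: trimming 1.200 (coefficient 1200, exponent
+// -3) with all=0 drops the two fractional zeros, giving 1.2 and
+// *dropped=2; with all=1 (and no clamp limit), 120E+1 would also
+// lose its integral zero, becoming 12E+2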
+static decNumber * decTrim(decNumber *dn, decContext *set, Flag all,
+ Flag noclamp, Int *dropped) {
+ Int d, exp; // work
+ uInt cut; // ..
+ Unit *up; // -> current Unit
+
+ #if DECCHECK
+ if (decCheckOperands(dn, DECUNUSED, DECUNUSED, DECUNCONT)) return dn;
+ #endif
+
+ *dropped=0; // assume no zeros dropped
+ if ((dn->bits & DECSPECIAL) // fast exit if special ..
+ || (*dn->lsu & 0x01)) return dn; // .. or odd
+ if (ISZERO(dn)) { // .. or 0
+ dn->exponent=0; // (sign is preserved)
+ return dn;
+ }
+
+ // have a finite number which is even
+ exp=dn->exponent;
+ cut=1; // digit (1-DECDPUN) in Unit
+ up=dn->lsu; // -> current Unit
+ for (d=0; d<dn->digits-1; d++) { // [don't strip the final digit]
+ // slice by powers
+ #if DECDPUN<=4
+ uInt quot=QUOT10(*up, cut);
+ if ((*up-quot*powers[cut])!=0) break; // found non-0 digit
+ #else
+ if (*up%powers[cut]!=0) break; // found non-0 digit
+ #endif
+ // have a trailing 0
+ if (!all) { // trimming
+ // [if exp>0 then all trailing 0s are significant for trim]
+ if (exp<=0) { // if digit might be significant
+ if (exp==0) break; // then quit
+ exp++; // next digit might be significant
+ }
+ }
+ cut++; // next power
+ if (cut>DECDPUN) { // need new Unit
+ up++;
+ cut=1;
+ }
+ } // d
+ if (d==0) return dn; // none to drop
+
+ // may need to limit drop if clamping
+ if (set->clamp && !noclamp) {
+ Int maxd=set->emax-set->digits+1-dn->exponent;
+ if (maxd<=0) return dn; // nothing possible
+ if (d>maxd) d=maxd;
+ }
+
+ // effect the drop
+ decShiftToLeast(dn->lsu, D2U(dn->digits), d);
+ dn->exponent+=d; // maintain numerical value
+ dn->digits-=d; // new length
+ *dropped=d; // report the count
+ return dn;
+ } // decTrim
+
+/* ------------------------------------------------------------------ */
+/* decReverse -- reverse a Unit array in place */
+/* */
+/* ulo is the start of the array */
+/* uhi is the end of the array (highest Unit to include) */
+/* */
+/* The units ulo through uhi are reversed in place (if the number */
+/* of units is odd, the middle one is untouched). Note that the */
+/* digit(s) in each unit are unaffected. */
+/* ------------------------------------------------------------------ */
+static void decReverse(Unit *ulo, Unit *uhi) {
+ Unit temp;
+ for (; ulo<uhi; ulo++, uhi--) {
+ temp=*ulo;
+ *ulo=*uhi;
+ *uhi=temp;
+ }
+ return;
+ } // decReverse
+
+/* ------------------------------------------------------------------ */
+/* decShiftToMost -- shift digits in array towards most significant */
+/* */
+/* uar is the array */
+/* digits is the count of digits in use in the array */
+/* shift is the number of zeros to pad with (least significant); */
+/* it must be zero or positive */
+/* */
+/* returns the new length of the integer in the array, in digits */
+/* */
+/* No overflow is permitted (that is, the uar array must be known to */
+/* be large enough to hold the result, after shifting). */
+/* ------------------------------------------------------------------ */
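+// Informative example: with DECDPUN=3, a single unit holding 42
+// (digits=2) shifted by 1 takes the single-unit fastpath: the unit
+// becomes 420 and 3 is returned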
+static Int decShiftToMost(Unit *uar, Int digits, Int shift) {
+ Unit *target, *source, *first; // work
+ Int cut; // odd 0's to add
+ uInt next; // work
+
+ if (shift==0) return digits; // [fastpath] nothing to do
+ if ((digits+shift)<=DECDPUN) { // [fastpath] single-unit case
+ *uar=(Unit)(*uar*powers[shift]);
+ return digits+shift;
+ }
+
+ next=0; // all paths
+ source=uar+D2U(digits)-1; // where msu comes from
+ target=source+D2U(shift); // where upper part of first cut goes
+ cut=DECDPUN-MSUDIGITS(shift); // where to slice
+ if (cut==0) { // unit-boundary case
+ for (; source>=uar; source--, target--) *target=*source;
+ }
+ else {
+ first=uar+D2U(digits+shift)-1; // where msu of source will end up
+ for (; source>=uar; source--, target--) {
+ // split the source Unit and accumulate remainder for next
+ #if DECDPUN<=4
+ uInt quot=QUOT10(*source, cut);
+ uInt rem=*source-quot*powers[cut];
+ next+=quot;
+ #else
+ uInt rem=*source%powers[cut];
+ next+=*source/powers[cut];
+ #endif
+ if (target<=first) *target=(Unit)next; // write to target iff valid
+ next=rem*powers[DECDPUN-cut]; // save remainder for next Unit
+ }
+ } // shift-move
+
+ // propagate any partial unit to one below and clear the rest
+ for (; target>=uar; target--) {
+ *target=(Unit)next;
+ next=0;
+ }
+ return digits+shift;
+ } // decShiftToMost
+
+/* ------------------------------------------------------------------ */
+/* decShiftToLeast -- shift digits in array towards least significant */
+/* */
+/* uar is the array */
+/* units is length of the array, in units */
+/* shift is the number of digits to remove from the lsu end; it */
+/*     must be zero or positive and no more than units*DECDPUN.      */
+/* */
+/* returns the new length of the integer in the array, in units */
+/* */
+/* Removed digits are discarded (lost). Units not required to hold */
+/* the final result are unchanged. */
+/* ------------------------------------------------------------------ */
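+// Informative example: with DECDPUN=3, the array {456,123} (value
+// 123456) shifted by 2 becomes {234,1} (value 1234) and 2 is
+// returned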
+static Int decShiftToLeast(Unit *uar, Int units, Int shift) {
+ Unit *target, *up; // work
+ Int cut, count; // work
+ Int quot, rem; // for division
+
+ if (shift==0) return units; // [fastpath] nothing to do
+ if (shift==units*DECDPUN) { // [fastpath] little to do
+ *uar=0; // all digits cleared gives zero
+ return 1; // leaves just the one
+ }
+
+ target=uar; // both paths
+ cut=MSUDIGITS(shift);
+ if (cut==DECDPUN) { // unit-boundary case; easy
+ up=uar+D2U(shift);
+ for (; up<uar+units; target++, up++) *target=*up;
+ return target-uar;
+ }
+
+ // messier
+ up=uar+D2U(shift-cut); // source; correct to whole Units
+ count=units*DECDPUN-shift; // the maximum new length
+ #if DECDPUN<=4
+ quot=QUOT10(*up, cut);
+ #else
+ quot=*up/powers[cut];
+ #endif
+ for (; ; target++) {
+ *target=(Unit)quot;
+ count-=(DECDPUN-cut);
+ if (count<=0) break;
+ up++;
+ quot=*up;
+ #if DECDPUN<=4
+ quot=QUOT10(quot, cut);
+ rem=*up-quot*powers[cut];
+ #else
+ rem=quot%powers[cut];
+ quot=quot/powers[cut];
+ #endif
+ *target=(Unit)(*target+rem*powers[DECDPUN-cut]);
+ count-=cut;
+ if (count<=0) break;
+ }
+ return target-uar+1;
+ } // decShiftToLeast
+
+#if DECSUBSET
+/* ------------------------------------------------------------------ */
+/* decRoundOperand -- round an operand [used for subset only] */
+/* */
+/* dn is the number to round (dn->digits is > set->digits) */
+/* set is the relevant context */
+/* status is the status accumulator */
+/* */
+/* returns an allocated decNumber with the rounded result. */
+/* */
+/* lostDigits and other status may be set by this. */
+/* */
+/* Since the input is an operand, it must not be modified. */
+/* Instead, return an allocated decNumber, rounded as required. */
+/* It is the caller's responsibility to free the allocated storage. */
+/* */
+/* If no storage is available then the result cannot be used, so NULL */
+/* is returned. */
+/* ------------------------------------------------------------------ */
+static decNumber *decRoundOperand(const decNumber *dn, decContext *set,
+ uInt *status) {
+ decNumber *res; // result structure
+ uInt newstatus=0; // status from round
+ Int residue=0; // rounding accumulator
+
+ // Allocate storage for the returned decNumber, big enough for the
+ // length specified by the context
+ res=(decNumber *)malloc(sizeof(decNumber)
+ +(D2U(set->digits)-1)*sizeof(Unit));
+ if (res==NULL) {
+ *status|=DEC_Insufficient_storage;
+ return NULL;
+ }
+ decCopyFit(res, dn, set, &residue, &newstatus);
+ decApplyRound(res, set, residue, &newstatus);
+
+ // If that set Inexact then "lost digits" is raised...
+ if (newstatus & DEC_Inexact) newstatus|=DEC_Lost_digits;
+ *status|=newstatus;
+ return res;
+ } // decRoundOperand
+#endif
+
+/* ------------------------------------------------------------------ */
+/* decCopyFit -- copy a number, truncating the coefficient if needed */
+/* */
+/* dest is the target decNumber */
+/* src is the source decNumber */
+/* set is the context [used for length (digits) and rounding mode] */
+/* residue is the residue accumulator */
+/* status contains the current status to be updated */
+/* */
+/* (dest==src is allowed and will be a no-op if fits) */
+/* All fields are updated as required. */
+/* ------------------------------------------------------------------ */
+static void decCopyFit(decNumber *dest, const decNumber *src,
+ decContext *set, Int *residue, uInt *status) {
+ dest->bits=src->bits;
+ dest->exponent=src->exponent;
+ decSetCoeff(dest, set, src->lsu, src->digits, residue, status);
+ } // decCopyFit
+
+/* ------------------------------------------------------------------ */
+/* decSetCoeff -- set the coefficient of a number */
+/* */
+/* dn is the number whose coefficient array is to be set. */
+/* It must have space for set->digits digits */
+/* set is the context [for size] */
+/* lsu -> lsu of the source coefficient [may be dn->lsu] */
+/* len is digits in the source coefficient [may be dn->digits] */
+/* residue is the residue accumulator. This has values as in */
+/* decApplyRound, and will be unchanged unless the */
+/* target size is less than len. In this case, the */
+/* coefficient is truncated and the residue is updated to */
+/* reflect the previous residue and the dropped digits. */
+/* status is the status accumulator, as usual */
+/* */
+/* The coefficient may already be in the number, or it can be an */
+/* external intermediate array. If it is in the number, lsu must == */
+/* dn->lsu and len must == dn->digits. */
+/* */
+/* Note that the coefficient length (len) may be < set->digits, and */
+/* in this case this merely copies the coefficient (or is a no-op */
+/* if dn->lsu==lsu). */
+/* */
+/* Note also that (only internally, from decQuantizeOp and */
+/* decSetSubnormal) the value of set->digits may be less than one, */
+/* indicating a round to left. This routine handles that case */
+/* correctly; caller ensures space. */
+/* */
+/* dn->digits, dn->lsu (and as required), and dn->exponent are */
+/* updated as necessary. dn->bits (sign) is unchanged. */
+/* */
+/* DEC_Rounded status is set if any digits are discarded. */
+/* DEC_Inexact status is set if any non-zero digits are discarded, or */
+/* incoming residue was non-0 (implies rounded) */
+/* ------------------------------------------------------------------ */
+// mapping array: maps 0-9 to canonical residues, so that a residue
+// can be adjusted in the range [-1, +1] and achieve correct rounding
+// 0 1 2 3 4 5 6 7 8 9
+static const uByte resmap[10]={0, 3, 3, 3, 3, 5, 7, 7, 7, 7};
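+// For example (an illustrative trace, not part of the algorithm's
+// contract): if the guard (first discarded) digit is 4 and some
+// later discarded digit is non-zero, the residue accumulates to
+// 1+resmap[4]=4, which decApplyRound treats exactly like a true
+// trailing fraction of 0.4xx -- non-zero but below half.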
+static void decSetCoeff(decNumber *dn, decContext *set, const Unit *lsu,
+ Int len, Int *residue, uInt *status) {
+ Int discard; // number of digits to discard
+ uInt cut; // cut point in Unit
+ const Unit *up; // work
+ Unit *target; // ..
+ Int count; // ..
+ #if DECDPUN<=4
+ uInt temp; // ..
+ #endif
+
+ discard=len-set->digits; // digits to discard
+ if (discard<=0) { // no digits are being discarded
+ if (dn->lsu!=lsu) { // copy needed
+ // copy the coefficient array to the result number; no shift needed
+ count=len; // avoids D2U
+ up=lsu;
+ for (target=dn->lsu; count>0; target++, up++, count-=DECDPUN)
+ *target=*up;
+ dn->digits=len; // set the new length
+ }
+ // dn->exponent and residue are unchanged, record any inexactitude
+ if (*residue!=0) *status|=(DEC_Inexact | DEC_Rounded);
+ return;
+ }
+
+ // some digits must be discarded ...
+ dn->exponent+=discard; // maintain numerical value
+ *status|=DEC_Rounded; // accumulate Rounded status
+ if (*residue>1) *residue=1; // previous residue now to right, so reduce
+
+ if (discard>len) { // everything, +1, is being discarded
+ // guard digit is 0
+ // residue is all the number [NB could be all 0s]
+ if (*residue<=0) { // not already positive
+ count=len; // avoids D2U
+ for (up=lsu; count>0; up++, count-=DECDPUN) if (*up!=0) { // found non-0
+ *residue=1;
+ break; // no need to check any others
+ }
+ }
+ if (*residue!=0) *status|=DEC_Inexact; // record inexactitude
+ *dn->lsu=0; // coefficient will now be 0
+ dn->digits=1; // ..
+ return;
+ } // total discard
+
+ // partial discard [most common case]
+ // here, at least the first (most significant) discarded digit exists
+
+ // spin up the number, noting residue during the spin, until get to
+ // the Unit with the first discarded digit. When reach it, extract
+ // it and remember its position
+ count=0;
+ for (up=lsu;; up++) {
+ count+=DECDPUN;
+ if (count>=discard) break; // full ones all checked
+ if (*up!=0) *residue=1;
+ } // up
+
+ // here up -> Unit with first discarded digit
+ cut=discard-(count-DECDPUN)-1;
+ if (cut==DECDPUN-1) { // unit-boundary case (fast)
+ Unit half=(Unit)powers[DECDPUN]>>1;
+ // set residue directly
+ if (*up>=half) {
+ if (*up>half) *residue=7;
+ else *residue+=5; // add sticky bit
+ }
+ else { // <half
+ if (*up!=0) *residue=3; // [else is 0, leave as sticky bit]
+ }
+ if (set->digits<=0) { // special for Quantize/Subnormal :-(
+ *dn->lsu=0; // .. result is 0
+ dn->digits=1; // ..
+ }
+ else { // shift to least
+ count=set->digits; // now digits to end up with
+ dn->digits=count; // set the new length
+ up++; // move to next
+ // on unit boundary, so shift-down copy loop is simple
+ for (target=dn->lsu; count>0; target++, up++, count-=DECDPUN)
+ *target=*up;
+ }
+ } // unit-boundary case
+
+ else { // discard digit is in low digit(s), and not top digit
+ uInt discard1; // first discarded digit
+ uInt quot, rem; // for divisions
+ if (cut==0) quot=*up; // is at bottom of unit
+ else /* cut>0 */ { // it's not at bottom of unit
+ #if DECDPUN<=4
+ quot=QUOT10(*up, cut);
+ rem=*up-quot*powers[cut];
+ #else
+ rem=*up%powers[cut];
+ quot=*up/powers[cut];
+ #endif
+ if (rem!=0) *residue=1;
+ }
+ // discard digit is now at bottom of quot
+ #if DECDPUN<=4
+ temp=(quot*6554)>>16; // fast /10
+ // Vowels algorithm here not a win (9 instructions)
+ discard1=quot-X10(temp);
+ quot=temp;
+ #else
+ discard1=quot%10;
+ quot=quot/10;
+ #endif
+ // here, discard1 is the guard digit, and residue is everything
+ // else [use mapping array to accumulate residue safely]
+ *residue+=resmap[discard1];
+ cut++; // update cut
+ // here: up -> Unit of the array with bottom digit
+ // cut is the division point for each Unit
+ // quot holds the uncut high-order digits for the current unit
+ if (set->digits<=0) { // special for Quantize/Subnormal :-(
+ *dn->lsu=0; // .. result is 0
+ dn->digits=1; // ..
+ }
+ else { // shift to least needed
+ count=set->digits; // now digits to end up with
+ dn->digits=count; // set the new length
+ // shift-copy the coefficient array to the result number
+ for (target=dn->lsu; ; target++) {
+ *target=(Unit)quot;
+ count-=(DECDPUN-cut);
+ if (count<=0) break;
+ up++;
+ quot=*up;
+ #if DECDPUN<=4
+ quot=QUOT10(quot, cut);
+ rem=*up-quot*powers[cut];
+ #else
+ rem=quot%powers[cut];
+ quot=quot/powers[cut];
+ #endif
+ *target=(Unit)(*target+rem*powers[DECDPUN-cut]);
+ count-=cut;
+ if (count<=0) break;
+ } // shift-copy loop
+ } // shift to least
+ } // not unit boundary
+
+ if (*residue!=0) *status|=DEC_Inexact; // record inexactitude
+ return;
+ } // decSetCoeff
+
+/* ------------------------------------------------------------------ */
+/* decApplyRound -- apply pending rounding to a number */
+/* */
+/* dn is the number, with space for set->digits digits */
+/* set is the context [for size and rounding mode] */
+/* residue indicates pending rounding, being any accumulated */
+/* guard and sticky information. It may be: */
+/* 6-9: rounding digit is >5 */
+/* 5: rounding digit is exactly half-way */
+/* 1-4: rounding digit is <5 and >0 */
+/* 0: the coefficient is exact */
+/* -1: as 1, but the hidden digits are subtractive, that */
+/* is, of the opposite sign to dn. In this case the */
+/* coefficient must be non-0. This case occurs when */
+/* subtracting a small number (which can be reduced to */
+/* a sticky bit); see decAddOp. */
+/* status is the status accumulator, as usual */
+/* */
+/* This routine applies rounding while keeping the length of the */
+/* coefficient constant. The exponent and status are unchanged */
+/* except if: */
+/* */
+/* -- the coefficient was increased and is all nines (in which */
+/* case Overflow could occur, and is handled directly here so */
+/* the caller does not need to re-test for overflow) */
+/* */
+/* -- the coefficient was decreased and becomes all nines (in which */
+/* case Underflow could occur, and is also handled directly). */
+/* */
+/* All fields in dn are updated as required. */
+/* */
+/* ------------------------------------------------------------------ */
+static void decApplyRound(decNumber *dn, decContext *set, Int residue,
+ uInt *status) {
+ Int bump; // 1 if coefficient needs to be incremented
+ // -1 if coefficient needs to be decremented
+
+ if (residue==0) return; // nothing to apply
+
+ bump=0; // assume a smooth ride
+
+ // now decide whether, and how, to round, depending on mode
+ switch (set->round) {
+ case DEC_ROUND_05UP: { // round zero or five up (for reround)
+ // This is the same as DEC_ROUND_DOWN unless there is a
+ // positive residue and the lsd of dn is 0 or 5, in which case
+ // it is bumped; when residue is <0, the number is therefore
+ // bumped down unless the final digit was 1 or 6 (in which
+ // case it is bumped down and then up -- a no-op)
+ Int lsd5=*dn->lsu%5; // get lsd and quintate
+ if (residue<0 && lsd5!=1) bump=-1;
+ else if (residue>0 && lsd5==0) bump=1;
+ // [bump==1 could be applied directly; use common path for clarity]
+ break;} // r-05
+
+ case DEC_ROUND_DOWN: {
+ // no change, except if negative residue
+ if (residue<0) bump=-1;
+ break;} // r-d
+
+ case DEC_ROUND_HALF_DOWN: {
+ if (residue>5) bump=1;
+ break;} // r-h-d
+
+ case DEC_ROUND_HALF_EVEN: {
+ if (residue>5) bump=1; // >0.5 goes up
+ else if (residue==5) { // exactly 0.5000...
+ // 0.5 goes up iff [new] lsd is odd
+ if (*dn->lsu & 0x01) bump=1;
+ }
+ break;} // r-h-e
+
+ case DEC_ROUND_HALF_UP: {
+ if (residue>=5) bump=1;
+ break;} // r-h-u
+
+ case DEC_ROUND_UP: {
+ if (residue>0) bump=1;
+ break;} // r-u
+
+ case DEC_ROUND_CEILING: {
+ // same as _UP for positive numbers, and as _DOWN for negatives
+ // [negative residue cannot occur on 0]
+ if (decNumberIsNegative(dn)) {
+ if (residue<0) bump=-1;
+ }
+ else {
+ if (residue>0) bump=1;
+ }
+ break;} // r-c
+
+ case DEC_ROUND_FLOOR: {
+ // same as _UP for negative numbers, and as _DOWN for positive
+ // [negative residue cannot occur on 0]
+ if (!decNumberIsNegative(dn)) {
+ if (residue<0) bump=-1;
+ }
+ else {
+ if (residue>0) bump=1;
+ }
+ break;} // r-f
+
+ default: { // e.g., DEC_ROUND_MAX
+ *status|=DEC_Invalid_context;
+ #if DECTRACE || (DECCHECK && DECVERB)
+ printf("Unknown rounding mode: %d\n", set->round);
+ #endif
+ break;}
+ } // switch
+
+ // now bump the number, up or down, if need be
+ if (bump==0) return; // no action required
+
+ // Simply use decUnitAddSub unless bumping up and the number is
+ // all nines. In this special case set to 100... explicitly
+ // and adjust the exponent by one (as otherwise could overflow
+ // the array)
+ // Similarly handle all-nines result if bumping down.
+ if (bump>0) {
+ Unit *up; // work
+ uInt count=dn->digits; // digits to be checked
+ for (up=dn->lsu; ; up++) {
+ if (count<=DECDPUN) {
+ // this is the last Unit (the msu)
+ if (*up!=powers[count]-1) break; // not still 9s
+ // here if it, too, is all nines
+ *up=(Unit)powers[count-1]; // here 999 -> 100 etc.
+ for (up=up-1; up>=dn->lsu; up--) *up=0; // others all to 0
+ dn->exponent++; // and bump exponent
+ // [which, very rarely, could cause Overflow...]
+ if ((dn->exponent+dn->digits)>set->emax+1) {
+ decSetOverflow(dn, set, status);
+ }
+ return; // done
+ }
+ // a full unit to check, with more to come
+ if (*up!=DECDPUNMAX) break; // not still 9s
+ count-=DECDPUN;
+ } // up
+ } // bump>0
+ else { // -1
+ // here checking for a pre-bump of 1000... (leading 1, all
+ // other digits zero)
+ Unit *up, *sup; // work
+ uInt count=dn->digits; // digits to be checked
+ for (up=dn->lsu; ; up++) {
+ if (count<=DECDPUN) {
+ // this is the last Unit (the msu)
+ if (*up!=powers[count-1]) break; // not 100..
+ // here if have the 1000... case
+ sup=up; // save msu pointer
+ *up=(Unit)powers[count]-1; // here 100 in msu -> 999
+ // others all to all-nines, too
+ for (up=up-1; up>=dn->lsu; up--) *up=(Unit)powers[DECDPUN]-1;
+ dn->exponent--; // and bump exponent
+
+ // iff the number was at the subnormal boundary (exponent=etiny)
+ // then the exponent is now out of range, so it will in fact get
+ // clamped to etiny and the final 9 dropped.
+ // printf(">> emin=%d exp=%d sdig=%d\n", set->emin,
+ // dn->exponent, set->digits);
+ if (dn->exponent+1==set->emin-set->digits+1) {
+ if (count==1 && dn->digits==1) *sup=0; // here 9 -> 0[.9]
+ else {
+ *sup=(Unit)powers[count-1]-1; // here 999.. in msu -> 99..
+ dn->digits--;
+ }
+ dn->exponent++;
+ *status|=DEC_Underflow | DEC_Subnormal | DEC_Inexact | DEC_Rounded;
+ }
+ return; // done
+ }
+
+ // a full unit to check, with more to come
+ if (*up!=0) break; // not still 0s
+ count-=DECDPUN;
+ } // up
+
+ } // bump<0
+
+ // Actual bump needed. Do it.
+ decUnitAddSub(dn->lsu, D2U(dn->digits), uarrone, 1, 0, dn->lsu, bump);
+ } // decApplyRound
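+
+// A worked example, assuming set->digits=3: a coefficient of 234
+// with residue 5 (exactly half-way) is left as 234 by HALF_EVEN
+// (the lsd, 4, is even) but bumped to 235 by HALF_UP; with residue
+// 3 (non-zero, below half) both modes leave 234 unchanged.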
+
+#if DECSUBSET
+/* ------------------------------------------------------------------ */
+/* decFinish -- finish processing a number */
+/* */
+/* dn is the number */
+/* set is the context */
+/* residue is the rounding accumulator (as in decApplyRound) */
+/* status is the accumulator */
+/* */
+/* This finishes off the current number by: */
+/* 1. If not extended: */
+/* a. Converting a zero result to clean '0' */
+/* b. Reducing positive exponents to 0, if the value fits in digits */
+/* 2. Checking for overflow and subnormals (always) */
+/* Note this is just Finalize when no subset arithmetic. */
+/* All fields are updated as required. */
+/* ------------------------------------------------------------------ */
+static void decFinish(decNumber *dn, decContext *set, Int *residue,
+ uInt *status) {
+ if (!set->extended) {
+ if ISZERO(dn) { // value is zero
+ dn->exponent=0; // clean exponent ..
+ dn->bits=0; // .. and sign
+ return; // no error possible
+ }
+ if (dn->exponent>=0) { // non-negative exponent
+ // >0; reduce to integer if possible
+ if (set->digits >= (dn->exponent+dn->digits)) {
+ dn->digits=decShiftToMost(dn->lsu, dn->digits, dn->exponent);
+ dn->exponent=0;
+ }
+ }
+ } // !extended
+
+ decFinalize(dn, set, residue, status);
+ } // decFinish
+#endif
+
+/* ------------------------------------------------------------------ */
+/* decFinalize -- final check, clamp, and round of a number */
+/* */
+/* dn is the number */
+/* set is the context */
+/* residue is the rounding accumulator (as in decApplyRound) */
+/* status is the status accumulator */
+/* */
+/* This finishes off the current number by checking for subnormal */
+/* results, applying any pending rounding, checking for overflow, */
+/* and applying any clamping. */
+/* Underflow and overflow conditions are raised as appropriate. */
+/* All fields are updated as required. */
+/* ------------------------------------------------------------------ */
+static void decFinalize(decNumber *dn, decContext *set, Int *residue,
+ uInt *status) {
+ Int shift; // shift needed if clamping
+ Int tinyexp=set->emin-dn->digits+1; // precalculate subnormal boundary
+
+ // Must be careful, here, when checking the exponent as the
+ // adjusted exponent could overflow 31 bits [because it may already
+ // be up to twice the expected].
+
+ // First test for subnormal. This must be done before any final
+ // round as the result could be rounded to Nmin or 0.
+ if (dn->exponent<=tinyexp) { // prefilter
+ Int comp;
+ decNumber nmin;
+ // A very nasty case here is dn == Nmin and residue<0
+ if (dn->exponent<tinyexp) {
+ // Go handle subnormals; this will apply round if needed.
+ decSetSubnormal(dn, set, residue, status);
+ return;
+ }
+ // Equals case: only subnormal if dn=Nmin and negative residue
+ decNumberZero(&nmin);
+ nmin.lsu[0]=1;
+ nmin.exponent=set->emin;
+ comp=decCompare(dn, &nmin, 1); // (signless compare)
+ if (comp==BADINT) { // oops
+ *status|=DEC_Insufficient_storage; // abandon...
+ return;
+ }
+ if (*residue<0 && comp==0) { // neg residue and dn==Nmin
+ decApplyRound(dn, set, *residue, status); // might force down
+ decSetSubnormal(dn, set, residue, status);
+ return;
+ }
+ }
+
+ // now apply any pending round (this could raise overflow).
+ if (*residue!=0) decApplyRound(dn, set, *residue, status);
+
+ // Check for overflow [redundant in the 'rare' case] or clamp
+ if (dn->exponent<=set->emax-set->digits+1) return; // neither needed
+
+ // here when might have an overflow or clamp to do
+ if (dn->exponent>set->emax-dn->digits+1) { // too big
+ decSetOverflow(dn, set, status);
+ return;
+ }
+ // here when the result is normal but in clamp range
+ if (!set->clamp) return;
+
+ // here when need to apply the IEEE exponent clamp (fold-down)
+ shift=dn->exponent-(set->emax-set->digits+1);
+
+ // shift coefficient (if non-zero)
+ if (!ISZERO(dn)) {
+ dn->digits=decShiftToMost(dn->lsu, dn->digits, shift);
+ }
+ dn->exponent-=shift; // adjust the exponent to match
+ *status|=DEC_Clamped; // and record the dirty deed
+ return;
+ } // decFinalize
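+
+// A worked example of the fold-down, assuming digits=5, emax=99,
+// and clamp=1: a result of 1E+99 has exponent 99, above the clamp
+// limit emax-digits+1=95 but not an overflow, so the coefficient is
+// padded to 10000 and the exponent reduced to 95 (the value is
+// unchanged), with DEC_Clamped set.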
+
+/* ------------------------------------------------------------------ */
+/* decSetOverflow -- set number to proper overflow value */
+/* */
+/* dn is the number (used for sign [only] and result) */
+/* set is the context [used for the rounding mode, etc.] */
+/* status contains the current status to be updated */
+/* */
+/* This sets the sign of a number and sets its value to either */
+/* Infinity or the maximum finite value, depending on the sign of */
+/* dn and the rounding mode, following IEEE 754 rules. */
+/* ------------------------------------------------------------------ */
+static void decSetOverflow(decNumber *dn, decContext *set, uInt *status) {
+ Flag needmax=0; // result is maximum finite value
+ uByte sign=dn->bits&DECNEG; // clean and save sign bit
+
+ if (ISZERO(dn)) { // zero does not overflow magnitude
+ Int emax=set->emax; // limit value
+ if (set->clamp) emax-=set->digits-1; // lower if clamping
+ if (dn->exponent>emax) { // clamp required
+ dn->exponent=emax;
+ *status|=DEC_Clamped;
+ }
+ return;
+ }
+
+ decNumberZero(dn);
+ switch (set->round) {
+ case DEC_ROUND_DOWN: {
+ needmax=1; // never Infinity
+ break;} // r-d
+ case DEC_ROUND_05UP: {
+ needmax=1; // never Infinity
+ break;} // r-05
+ case DEC_ROUND_CEILING: {
+ if (sign) needmax=1; // Infinity if non-negative
+ break;} // r-c
+ case DEC_ROUND_FLOOR: {
+ if (!sign) needmax=1; // Infinity if negative
+ break;} // r-f
+ default: break; // Infinity in all other cases
+ }
+ if (needmax) {
+ decSetMaxValue(dn, set);
+ dn->bits=sign; // set sign
+ }
+ else dn->bits=sign|DECINF; // Value is +/-Infinity
+ *status|=DEC_Overflow | DEC_Inexact | DEC_Rounded;
+ } // decSetOverflow
+
+/* ------------------------------------------------------------------ */
+/* decSetMaxValue -- set number to +Nmax (maximum normal value) */
+/* */
+/* dn is the number to set */
+/* set is the context [used for digits and emax] */
+/* */
+/* This sets the number to the maximum positive value. */
+/* ------------------------------------------------------------------ */
+static void decSetMaxValue(decNumber *dn, decContext *set) {
+ Unit *up; // work
+ Int count=set->digits; // nines to add
+ dn->digits=count;
+ // fill in all nines to set maximum value
+ for (up=dn->lsu; ; up++) {
+ if (count>DECDPUN) *up=DECDPUNMAX; // unit full o'nines
+ else { // this is the msu
+ *up=(Unit)(powers[count]-1);
+ break;
+ }
+ count-=DECDPUN; // filled those digits
+ } // up
+ dn->bits=0; // + sign
+ dn->exponent=set->emax-set->digits+1;
+ } // decSetMaxValue
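+
+// For example, with digits=5 and emax=99 this builds Nmax as the
+// coefficient 99999 with exponent 95, that is, 9.9999E+99.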
+
+/* ------------------------------------------------------------------ */
+/* decSetSubnormal -- process value whose exponent is <Emin */
+/* */
+/* dn is the number (used as input as well as output; it may have */
+/* an allowed subnormal value, which may need to be rounded) */
+/* set is the context [used for the rounding mode] */
+/* residue is any pending residue */
+/* status contains the current status to be updated */
+/* */
+/* If subset mode, set result to zero and set Underflow flags. */
+/* */
+/* Value may be zero with a low exponent; this does not set Subnormal */
+/* but the exponent will be clamped to Etiny. */
+/* */
+/* Otherwise ensure exponent is not out of range, and round as */
+/* necessary. Underflow is set if the result is Inexact. */
+/* ------------------------------------------------------------------ */
+static void decSetSubnormal(decNumber *dn, decContext *set, Int *residue,
+ uInt *status) {
+ decContext workset; // work
+ Int etiny, adjust; // ..
+
+ #if DECSUBSET
+ // simple set to zero and 'hard underflow' for subset
+ if (!set->extended) {
+ decNumberZero(dn);
+ // always full overflow
+ *status|=DEC_Underflow | DEC_Subnormal | DEC_Inexact | DEC_Rounded;
+ return;
+ }
+ #endif
+
+ // Full arithmetic -- allow subnormals, rounded to minimum exponent
+ // (Etiny) if needed
+ etiny=set->emin-(set->digits-1); // smallest allowed exponent
+
+ if ISZERO(dn) { // value is zero
+ // residue can never be non-zero here
+ #if DECCHECK
+ if (*residue!=0) {
+ printf("++ Subnormal 0 residue %ld\n", (LI)*residue);
+ *status|=DEC_Invalid_operation;
+ }
+ #endif
+ if (dn->exponent<etiny) { // clamp required
+ dn->exponent=etiny;
+ *status|=DEC_Clamped;
+ }
+ return;
+ }
+
+ *status|=DEC_Subnormal; // have a non-zero subnormal
+ adjust=etiny-dn->exponent; // calculate digits to remove
+ if (adjust<=0) { // not out of range; unrounded
+ // residue can never be non-zero here, except in the Nmin-residue
+ // case (which is a subnormal result), so can take fast-path here
+ // it may already be inexact (from setting the coefficient)
+ if (*status&DEC_Inexact) *status|=DEC_Underflow;
+ return;
+ }
+
+ // adjust>0, so need to rescale the result so exponent becomes Etiny
+ // [this code is similar to that in rescale]
+ workset=*set; // clone rounding, etc.
+ workset.digits=dn->digits-adjust; // set requested length
+ workset.emin-=adjust; // and adjust emin to match
+ // [note that the latter can be <1, here, similar to Rescale case]
+ decSetCoeff(dn, &workset, dn->lsu, dn->digits, residue, status);
+ decApplyRound(dn, &workset, *residue, status);
+
+ // Use 754 default rule: Underflow is set iff Inexact
+ // [independent of whether trapped]
+ if (*status&DEC_Inexact) *status|=DEC_Underflow;
+
+ // if rounded up a 999s case, exponent will be off by one; adjust
+ // back if so [it will fit, because it was shortened earlier]
+ if (dn->exponent>etiny) {
+ dn->digits=decShiftToMost(dn->lsu, dn->digits, 1);
+ dn->exponent--; // (re)adjust the exponent.
+ }
+
+ // if rounded to zero, it is by definition clamped...
+ if (ISZERO(dn)) *status|=DEC_Clamped;
+ } // decSetSubnormal
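+
+// A worked example, assuming digits=5, emin=-95 (so Etiny=-99), and
+// HALF_EVEN rounding: 12345E-101 is subnormal, and is rescaled to
+// three digits at Etiny to give 123E-99 (the guard digit, 4, is
+// below half), with Subnormal, Underflow, Inexact, and Rounded set.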
+
+/* ------------------------------------------------------------------ */
+/* decCheckMath - check entry conditions for a math function */
+/* */
+/* This checks the context and the operand */
+/* */
+/* rhs is the operand to check */
+/* set is the context to check */
+/* status is unchanged if both are good */
+/* */
+/* returns non-zero if status is changed, 0 otherwise */
+/* */
+/* Restrictions enforced: */
+/* */
+/* digits, emax, and -emin in the context must be less than */
+/* DEC_MAX_MATH (999999), and the rhs operand must be within these */
+/* bounds if non-zero. Invalid_context or Invalid_operation */
+/* (respectively) is set in the status if a restriction is violated. */
+/* ------------------------------------------------------------------ */
+static uInt decCheckMath(const decNumber *rhs, decContext *set,
+ uInt *status) {
+ uInt save=*status; // record
+ if (set->digits>DEC_MAX_MATH
+ || set->emax>DEC_MAX_MATH
+ || -set->emin>DEC_MAX_MATH) *status|=DEC_Invalid_context;
+ else if ((rhs->digits>DEC_MAX_MATH
+ || rhs->exponent+rhs->digits>DEC_MAX_MATH+1
+ || rhs->exponent+rhs->digits<2*(1-DEC_MAX_MATH))
+ && !ISZERO(rhs)) *status|=DEC_Invalid_operation;
+ return (*status!=save);
+ } // decCheckMath
+
+/* ------------------------------------------------------------------ */
+/* decGetInt -- get integer from a number */
+/* */
+/* dn is the number [which will not be altered] */
+/* */
+/* returns one of: */
+/* BADINT if there is a non-zero fraction */
+/* the converted integer */
+/* BIGEVEN if the integer is even and magnitude > 2*10**9 */
+/* BIGODD if the integer is odd and magnitude > 2*10**9 */
+/* */
+/* This checks and gets a whole number from the input decNumber. */
+/* The sign can be determined from dn by the caller when BIGEVEN or */
+/* BIGODD is returned. */
+/* ------------------------------------------------------------------ */
+static Int decGetInt(const decNumber *dn) {
+ Int theInt; // result accumulator
+ const Unit *up; // work
+ Int got; // digits (real or not) processed
+ Int ilength=dn->digits+dn->exponent; // integral length
+ Flag neg=decNumberIsNegative(dn); // 1 if -ve
+
+ // The number must be an integer that fits in 10 digits
+ // Assert, here, that 10 is enough for any rescale Etiny
+ #if DEC_MAX_EMAX > 999999999
+ #error GetInt may need updating [for Emax]
+ #endif
+ #if DEC_MIN_EMIN < -999999999
+ #error GetInt may need updating [for Emin]
+ #endif
+ if (ISZERO(dn)) return 0; // zeros are OK, with any exponent
+
+ up=dn->lsu; // ready for lsu
+ theInt=0; // ready to accumulate
+ if (dn->exponent>=0) { // relatively easy
+ // no fractional part [usual]; allow for positive exponent
+ got=dn->exponent;
+ }
+ else { // -ve exponent; some fractional part to check and discard
+ Int count=-dn->exponent; // digits to discard
+ // spin up whole units until reach the Unit with the unit digit
+ for (; count>=DECDPUN; up++) {
+ if (*up!=0) return BADINT; // non-zero Unit to discard
+ count-=DECDPUN;
+ }
+ if (count==0) got=0; // [a multiple of DECDPUN]
+ else { // [not multiple of DECDPUN]
+ Int rem; // work
+ // slice off fraction digits and check for non-zero
+ #if DECDPUN<=4
+ theInt=QUOT10(*up, count);
+ rem=*up-theInt*powers[count];
+ #else
+ rem=*up%powers[count]; // slice off discards
+ theInt=*up/powers[count];
+ #endif
+ if (rem!=0) return BADINT; // non-zero fraction
+ // it looks good
+ got=DECDPUN-count; // number of digits so far
+ up++; // ready for next
+ }
+ }
+ // now it's known there's no fractional part
+
+ // tricky code now, to accumulate up to 9.3 digits
+ if (got==0) {theInt=*up; got+=DECDPUN; up++;} // ensure lsu is there
+
+ if (ilength<11) {
+ Int save=theInt;
+ // collect any remaining unit(s)
+ for (; got<ilength; up++) {
+ theInt+=*up*powers[got];
+ got+=DECDPUN;
+ }
+ if (ilength==10) { // need to check for wrap
+ if (theInt/(Int)powers[got-DECDPUN]!=(Int)*(up-1)) ilength=11;
+ // [that test also disallows the BADINT result case]
+ else if (neg && theInt>1999999997) ilength=11;
+ else if (!neg && theInt>999999999) ilength=11;
+ if (ilength==11) theInt=save; // restore correct low bit
+ }
+ }
+
+ if (ilength>10) { // too big
+ if (theInt&1) return BIGODD; // bottom bit 1
+ return BIGEVEN; // bottom bit 0
+ }
+
+ if (neg) theInt=-theInt; // apply sign
+ return theInt;
+ } // decGetInt
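+
+// For example (assuming DECDPUN=3): 123E+0 returns 123; 123E-1
+// (that is, 12.3) returns BADINT because the fraction 0.3 is
+// non-zero; and 12345678901 exceeds ten digits, so only its parity
+// is reported, here as BIGODD.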
+
+/* ------------------------------------------------------------------ */
+/* decDecap -- decapitate the coefficient of a number */
+/* */
+/* dn is the number to be decapitated */
+/* drop is the number of digits to be removed from the left of dn; */
+/* this must be <= dn->digits (if equal, the coefficient is */
+/* set to 0) */
+/* */
+/* Returns dn; dn->digits will be <= the initial digits less drop */
+/* (after removing drop digits there may be leading zero digits */
+/* which will also be removed). Only dn->lsu and dn->digits change. */
+/* ------------------------------------------------------------------ */
+static decNumber *decDecap(decNumber *dn, Int drop) {
+ Unit *msu; // -> target cut point
+ Int cut; // work
+ if (drop>=dn->digits) { // losing the whole thing
+ #if DECCHECK
+ if (drop>dn->digits)
+ printf("decDecap called with drop>digits [%ld>%ld]\n",
+ (LI)drop, (LI)dn->digits);
+ #endif
+ dn->lsu[0]=0;
+ dn->digits=1;
+ return dn;
+ }
+ msu=dn->lsu+D2U(dn->digits-drop)-1; // -> likely msu
+ cut=MSUDIGITS(dn->digits-drop); // digits to be in use in msu
+ if (cut!=DECDPUN) *msu%=powers[cut]; // clear left digits
+ // that may have left leading zero digits, so do a proper count...
+ dn->digits=decGetDigits(dn->lsu, msu-dn->lsu+1);
+ return dn;
+ } // decDecap
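+
+// For example (assuming DECDPUN=3): decapitating 10045 by one digit
+// leaves 0045, and the recount reduces dn->digits to 2 (45).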
+
+/* ------------------------------------------------------------------ */
+/* decBiStr -- compare string with pairwise options */
+/* */
+/* targ is the string to compare */
+/* str1 is one of the strings to compare against (length may be 0) */
+/* str2 is the other; it must be the same length as str1 */
+/* */
+/* returns 1 if strings compare equal, (that is, it is the same */
+/* length as str1 and str2, and each character of targ is in either */
+/* str1 or str2 in the corresponding position), or 0 otherwise */
+/* */
+/* This is used for generic caseless compare, including the awkward */
+/* case of the Turkish dotted and dotless Is. Use as (for example): */
+/* if (decBiStr(test, "mike", "MIKE")) ... */
+/* ------------------------------------------------------------------ */
+static Flag decBiStr(const char *targ, const char *str1, const char *str2) {
+ for (;;targ++, str1++, str2++) {
+ if (*targ!=*str1 && *targ!=*str2) return 0;
+ // *targ has a match in one (or both, if terminator)
+ if (*targ=='\0') break;
+ } // forever
+ return 1;
+ } // decBiStr
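+
+// For example, decBiStr("iNFinity", "infinity", "INFINITY") returns
+// 1, while decBiStr("inf", "infinity", "INFINITY") returns 0 (targ
+// terminates early). Because raw characters are compared, the
+// result does not depend on the locale, unlike a tolower()-based
+// compare.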
+
+/* ------------------------------------------------------------------ */
+/* decNaNs -- handle NaN operand or operands */
+/* */
+/* res is the result number */
+/* lhs is the first operand */
+/* rhs is the second operand, or NULL if none */
+/* context is used to limit payload length */
+/* status contains the current status */
+/* returns res in case convenient */
+/* */
+/* Called when one or both operands is a NaN, and propagates the */
+/* appropriate result to res. When an sNaN is found, it is changed */
+/* to a qNaN and Invalid operation is set. */
+/* ------------------------------------------------------------------ */
+static decNumber * decNaNs(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set,
+ uInt *status) {
+ // This decision tree ends up with LHS being the source pointer,
+ // and status updated if need be
+ if (lhs->bits & DECSNAN)
+ *status|=DEC_Invalid_operation | DEC_sNaN;
+ else if (rhs==NULL);
+ else if (rhs->bits & DECSNAN) {
+ lhs=rhs;
+ *status|=DEC_Invalid_operation | DEC_sNaN;
+ }
+ else if (lhs->bits & DECNAN);
+ else lhs=rhs;
+
+ // propagate the payload
+ if (lhs->digits<=set->digits) decNumberCopy(res, lhs); // easy
+ else { // too long
+ const Unit *ul;
+ Unit *ur, *uresp1;
+ // copy safe number of units, then decapitate
+ res->bits=lhs->bits; // need sign etc.
+ uresp1=res->lsu+D2U(set->digits);
+ for (ur=res->lsu, ul=lhs->lsu; ur<uresp1; ur++, ul++) *ur=*ul;
+ res->digits=D2U(set->digits)*DECDPUN;
+ // maybe still too long
+ if (res->digits>set->digits) decDecap(res, res->digits-set->digits);
+ }
+
+ res->bits&=~DECSNAN; // convert any sNaN to NaN, while
+ res->bits|=DECNAN; // .. preserving sign
+ res->exponent=0; // clean exponent
+ // [coefficient was copied/decapitated]
+ return res;
+ } // decNaNs
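+
+// For example, an operation on the signaling NaN 'sNaN123' returns
+// the quiet 'NaN123' and sets Invalid_operation; the payload (123)
+// is kept unless longer than set->digits, in which case it is
+// decapitated on the left.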
+
+/* ------------------------------------------------------------------ */
+/* decStatus -- apply non-zero status */
+/* */
+/* dn is the number to set if error */
+/* status contains the current status (not yet in context) */
+/* set is the context */
+/* */
+/* If the status is an error status, the number is set to a NaN, */
+/* unless the error was an overflow, divide-by-zero, or underflow, */
+/* in which case the number will have already been set. */
+/* */
+/* The context status is then updated with the new status. Note that */
+/* this may raise a signal, so control may never return from this */
+/* routine (hence resources must be recovered before it is called). */
+/* ------------------------------------------------------------------ */
+static void decStatus(decNumber *dn, uInt status, decContext *set) {
+ if (status & DEC_NaNs) { // error status -> NaN
+ // if cause was an sNaN, clear and propagate [NaN is already set up]
+ if (status & DEC_sNaN) status&=~DEC_sNaN;
+ else {
+ decNumberZero(dn); // other error: clean throughout
+ dn->bits=DECNAN; // and make a quiet NaN
+ }
+ }
+ decContextSetStatus(set, status); // [may not return]
+ return;
+ } // decStatus
+
+/* ------------------------------------------------------------------ */
+/* decGetDigits -- count digits in a Units array */
+/* */
+/* uar is the Unit array holding the number (this is often an */
+/* accumulator of some sort) */
+/* len is the length of the array in units [>=1] */
+/* */
+/* returns the number of (significant) digits in the array */
+/* */
+/* All leading zeros are excluded, except the last if the array has */
+/* only zero Units. */
+/* ------------------------------------------------------------------ */
+// This may be called twice during some operations.
+static Int decGetDigits(Unit *uar, Int len) {
+ Unit *up=uar+(len-1); // -> msu
+ Int digits=(len-1)*DECDPUN+1; // possible digits excluding msu
+ #if DECDPUN>4
+ uInt const *pow; // work
+ #endif
+ // (at least 1 in final msu)
+ #if DECCHECK
+ if (len<1) printf("decGetDigits called with len<1 [%ld]\n", (LI)len);
+ #endif
+
+ for (; up>=uar; up--) {
+ if (*up==0) { // unit is all 0s
+ if (digits==1) break; // a zero has one digit
+ digits-=DECDPUN; // adjust for 0 unit
+ continue;}
+ // found the first (most significant) non-zero Unit
+ #if DECDPUN>1 // not done yet
+ if (*up<10) break; // is 1-9
+ digits++;
+ #if DECDPUN>2 // not done yet
+ if (*up<100) break; // is 10-99
+ digits++;
+ #if DECDPUN>3 // not done yet
+ if (*up<1000) break; // is 100-999
+ digits++;
+ #if DECDPUN>4 // count the rest ...
+ for (pow=&powers[4]; *up>=*pow; pow++) digits++;
+ #endif
+ #endif
+ #endif
+ #endif
+ break;
+ } // up
+ return digits;
+ } // decGetDigits
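+
+// For example (assuming DECDPUN=3): the two-Unit array {456, 123}
+// holds 123456 and returns 6, while {456, 0} returns 3 because the
+// all-zero msu is excluded.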
+
+#if DECTRACE || DECCHECK
+/* ------------------------------------------------------------------ */
+/* decNumberShow -- display a number [debug aid] */
+/* dn is the number to show */
+/* */
+/* Shows: sign, exponent, coefficient (msu first), digits */
+/* or: sign, special-value */
+/* ------------------------------------------------------------------ */
+// this is public so other modules can use it
+void decNumberShow(const decNumber *dn) {
+ const Unit *up; // work
+ uInt u, d; // ..
+ Int cut; // ..
+ char isign='+'; // main sign
+ if (dn==NULL) {
+ printf("NULL\n");
+ return;}
+ if (decNumberIsNegative(dn)) isign='-';
+ printf(" >> %c ", isign);
+ if (dn->bits&DECSPECIAL) { // Is a special value
+ if (decNumberIsInfinite(dn)) printf("Infinity");
+ else { // a NaN
+ if (dn->bits&DECSNAN) printf("sNaN"); // signalling NaN
+ else printf("NaN");
+ }
+ // if coefficient and exponent are 0, no more to do
+ if (dn->exponent==0 && dn->digits==1 && *dn->lsu==0) {
+ printf("\n");
+ return;}
+ // drop through to report other information
+ printf(" ");
+ }
+
+ // now carefully display the coefficient
+ up=dn->lsu+D2U(dn->digits)-1; // msu
+ printf("%ld", (LI)*up);
+ for (up=up-1; up>=dn->lsu; up--) {
+ u=*up;
+ printf(":");
+ for (cut=DECDPUN-1; cut>=0; cut--) {
+ d=u/powers[cut];
+ u-=d*powers[cut];
+ printf("%ld", (LI)d);
+ } // cut
+ } // up
+ if (dn->exponent!=0) {
+ char esign='+';
+ if (dn->exponent<0) esign='-';
+ printf(" E%c%ld", esign, (LI)abs(dn->exponent));
+ }
+ printf(" [%ld]\n", (LI)dn->digits);
+ } // decNumberShow
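+
+// For example, -123.45 held with DECDPUN=3 (units 345 and 12) is
+// shown as: >> - 12:345 E-2 [5]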
+#endif
+
+#if DECTRACE || DECCHECK
+/* ------------------------------------------------------------------ */
+/* decDumpAr -- display a unit array [debug/check aid] */
+/* name is a single-character tag name */
+/* ar is the array to display */
+/* len is the length of the array in Units */
+/* ------------------------------------------------------------------ */
+static void decDumpAr(char name, const Unit *ar, Int len) {
+ Int i;
+ const char *spec;
+ #if DECDPUN==9
+ spec="%09d ";
+ #elif DECDPUN==8
+ spec="%08d ";
+ #elif DECDPUN==7
+ spec="%07d ";
+ #elif DECDPUN==6
+ spec="%06d ";
+ #elif DECDPUN==5
+ spec="%05d ";
+ #elif DECDPUN==4
+ spec="%04d ";
+ #elif DECDPUN==3
+ spec="%03d ";
+ #elif DECDPUN==2
+ spec="%02d ";
+ #else
+ spec="%d ";
+ #endif
+ printf(" :%c: ", name);
+ for (i=len-1; i>=0; i--) {
+ if (i==len-1) printf("%ld ", (LI)ar[i]);
+ else printf(spec, ar[i]);
+ }
+ printf("\n");
+ return;}
+#endif
+
+#if DECCHECK
+/* ------------------------------------------------------------------ */
+/* decCheckOperands -- check operand(s) to a routine */
+/* res is the result structure (not checked; it will be set to */
+/* quiet NaN if error found (and it is not NULL)) */
+/* lhs is the first operand (may be DECUNRESU) */
+/* rhs is the second (may be DECUNUSED) */
+/* set is the context (may be DECUNCONT) */
+/* returns 0 if both operands, and the context are clean, or 1 */
+/* otherwise (in which case the context will show an error, */
+/* unless NULL). Note that res is not cleaned; caller should */
+/* handle this so res=NULL case is safe. */
+/* The caller is expected to abandon immediately if 1 is returned. */
+/* ------------------------------------------------------------------ */
+static Flag decCheckOperands(decNumber *res, const decNumber *lhs,
+ const decNumber *rhs, decContext *set) {
+ Flag bad=0;
+ if (set==NULL) { // oops; hopeless
+ #if DECTRACE || DECVERB
+ printf("Reference to context is NULL.\n");
+ #endif
+ bad=1;
+ return 1;}
+ else if (set!=DECUNCONT
+ && (set->digits<1 || set->round>=DEC_ROUND_MAX)) {
+ bad=1;
+ #if DECTRACE || DECVERB
+ printf("Bad context [digits=%ld round=%ld].\n",
+ (LI)set->digits, (LI)set->round);
+ #endif
+ }
+ else {
+ if (res==NULL) {
+ bad=1;
+ #if DECTRACE
+ // this one not DECVERB as standard tests include NULL
+ printf("Reference to result is NULL.\n");
+ #endif
+ }
+ if (!bad && lhs!=DECUNUSED) bad=(decCheckNumber(lhs));
+ if (!bad && rhs!=DECUNUSED) bad=(decCheckNumber(rhs));
+ }
+ if (bad) {
+ if (set!=DECUNCONT) decContextSetStatus(set, DEC_Invalid_operation);
+ if (res!=DECUNRESU && res!=NULL) {
+ decNumberZero(res);
+ res->bits=DECNAN; // qNaN
+ }
+ }
+ return bad;
+ } // decCheckOperands
+
+/* ------------------------------------------------------------------ */
+/* decCheckNumber -- check a number */
+/* dn is the number to check */
+/* returns 0 if the number is clean, or 1 otherwise */
+/* */
+/* The number is considered valid if it could be a result from some */
+/* operation in some valid context. */
+/* ------------------------------------------------------------------ */
+static Flag decCheckNumber(const decNumber *dn) {
+ const Unit *up; // work
+ uInt maxuint; // ..
+ Int ae, d, digits; // ..
+ Int emin, emax; // ..
+
+ if (dn==NULL) { // hopeless
+ #if DECTRACE
+ // this one not DECVERB as standard tests include NULL
+ printf("Reference to decNumber is NULL.\n");
+ #endif
+ return 1;}
+
+ // check special values
+ if (dn->bits & DECSPECIAL) {
+ if (dn->exponent!=0) {
+ #if DECTRACE || DECVERB
+ printf("Exponent %ld (not 0) for a special value [%02x].\n",
+ (LI)dn->exponent, dn->bits);
+ #endif
+ return 1;}
+
+ // 2003.09.08: NaNs may now have coefficients, so next tests Inf only
+ if (decNumberIsInfinite(dn)) {
+ if (dn->digits!=1) {
+ #if DECTRACE || DECVERB
+ printf("Digits %ld (not 1) for an infinity.\n", (LI)dn->digits);
+ #endif
+ return 1;}
+ if (*dn->lsu!=0) {
+ #if DECTRACE || DECVERB
+ printf("LSU %ld (not 0) for an infinity.\n", (LI)*dn->lsu);
+ #endif
+ decDumpAr('I', dn->lsu, D2U(dn->digits));
+ return 1;}
+ } // Inf
+ // 2002.12.26: negative NaNs can now appear through proposed IEEE
+ // concrete formats (decimal64, etc.).
+ return 0;
+ }
+
+ // check the coefficient
+ if (dn->digits<1 || dn->digits>DECNUMMAXP) {
+ #if DECTRACE || DECVERB
+ printf("Digits %ld in number.\n", (LI)dn->digits);
+ #endif
+ return 1;}
+
+ d=dn->digits;
+
+ for (up=dn->lsu; d>0; up++) {
+ if (d>DECDPUN) maxuint=DECDPUNMAX;
+ else { // reached the msu
+ maxuint=powers[d]-1;
+ if (dn->digits>1 && *up<powers[d-1]) {
+ #if DECTRACE || DECVERB
+ printf("Leading 0 in number.\n");
+ decNumberShow(dn);
+ #endif
+ return 1;}
+ }
+ if (*up>maxuint) {
+ #if DECTRACE || DECVERB
+ printf("Bad Unit [%08lx] in %ld-digit number at offset %ld [maxuint %ld].\n",
+ (LI)*up, (LI)dn->digits, (LI)(up-dn->lsu), (LI)maxuint);
+ #endif
+ return 1;}
+ d-=DECDPUN;
+ }
+
+ // check the exponent. Note that input operands can have exponents
+ // which are out of the set->emin/set->emax and set->digits range
+ // (just as they can have more digits than set->digits).
+ ae=dn->exponent+dn->digits-1; // adjusted exponent
+ emax=DECNUMMAXE;
+ emin=DECNUMMINE;
+ digits=DECNUMMAXP;
+ if (ae<emin-(digits-1)) {
+ #if DECTRACE || DECVERB
+ printf("Adjusted exponent underflow [%ld].\n", (LI)ae);
+ decNumberShow(dn);
+ #endif
+ return 1;}
+ if (ae>+emax) {
+ #if DECTRACE || DECVERB
+ printf("Adjusted exponent overflow [%ld].\n", (LI)ae);
+ decNumberShow(dn);
+ #endif
+ return 1;}
+
+ return 0; // it's OK
+ } // decCheckNumber
+
+/* ------------------------------------------------------------------ */
+/* decCheckInexact -- check a normal finite inexact result has digits */
+/* dn is the number to check */
+/* set is the context (for status and precision) */
+/* sets Invalid operation, etc., if some digits are missing */
+/* [this check is not made for DECSUBSET compilation or when */
+/* subnormal is not set] */
+/* ------------------------------------------------------------------ */
+static void decCheckInexact(const decNumber *dn, decContext *set) {
+ #if !DECSUBSET && DECEXTFLAG
+ if ((set->status & (DEC_Inexact|DEC_Subnormal))==DEC_Inexact
+ && (set->digits!=dn->digits) && !(dn->bits & DECSPECIAL)) {
+ #if DECTRACE || DECVERB
+ printf("Insufficient digits [%ld] on normal Inexact result.\n",
+ (LI)dn->digits);
+ decNumberShow(dn);
+ #endif
+ decContextSetStatus(set, DEC_Invalid_operation);
+ }
+ #else
+ // next is a noop for quiet compiler
+ if (dn!=NULL && dn->digits==0) set->status|=DEC_Invalid_operation;
+ #endif
+ return;
+ } // decCheckInexact
+#endif
+
+#if DECALLOC
+#undef malloc
+#undef free
+/* ------------------------------------------------------------------ */
+/* decMalloc -- accountable allocation routine */
+/* n is the number of bytes to allocate */
+/* */
+/* Semantics is the same as the stdlib malloc routine, but bytes */
+/* allocated are accounted for globally, and corruption fences are */
+/* added before and after the 'actual' storage. */
+/* ------------------------------------------------------------------ */
+/* This routine allocates storage with an extra twelve bytes; 8 are */
+/* at the start and hold: */
+/* 0-3 the original length requested */
+/* 4-7 buffer corruption detection fence (DECFENCE, x4) */
+/* The 4 bytes at the end also hold a corruption fence (DECFENCE, x4) */
+/* ------------------------------------------------------------------ */
+static void *decMalloc(size_t n) {
+ uInt size=n+12; // true size
+ void *alloc; // -> allocated storage
+ uByte *b, *b0; // work
+ uInt uiwork; // for macros
+
+ alloc=malloc(size); // -> allocated storage
+ if (alloc==NULL) return NULL; // out of storage
+ b0=(uByte *)alloc; // as bytes
+ decAllocBytes+=n; // account for storage
+ UBFROMUI(alloc, n); // save n
+ // printf(" alloc ++ dAB: %ld (%ld)\n", (LI)decAllocBytes, (LI)n);
+ for (b=b0+4; b<b0+8; b++) *b=DECFENCE;
+ for (b=b0+n+8; b<b0+n+12; b++) *b=DECFENCE;
+ return b0+8; // -> play area
+ } // decMalloc
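+
+// For example, decMalloc(5) acquires seventeen bytes: 0-3 hold the
+// length (5), 4-7 the leading fence, 8-12 the caller's five bytes,
+// and 13-16 the trailing fence; the pointer returned addresses
+// byte 8.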
+
+/* ------------------------------------------------------------------ */
+/* decFree -- accountable free routine */
+/* alloc is the storage to free */
+/* */
+/* Semantics is the same as the stdlib free routine, except that */
+/* the global storage accounting is updated and the fences are */
+/* checked to ensure that no routine has written 'out of bounds'. */
+/* ------------------------------------------------------------------ */
+/* This routine first checks that the fences have not been corrupted. */
+/* It then frees the storage using the 'true' storage address (that */
+/* is, 8 bytes before the address handed to the caller). */
+/* ------------------------------------------------------------------ */
+static void decFree(void *alloc) {
+ uInt n; // original length
+ uByte *b, *b0; // work
+ uInt uiwork; // for macros
+
+ if (alloc==NULL) return; // allowed; it's a nop
+ b0=(uByte *)alloc; // as bytes
+ b0-=8; // -> true start of storage
+ n=UBTOUI(b0); // lift length
+ for (b=b0+4; b<b0+8; b++) if (*b!=DECFENCE)
+ printf("=== Corrupt byte [%02x] at offset %ld from %ld ===\n", *b,
+ (LI)(b-b0-8), (LI)b0);
+ for (b=b0+n+8; b<b0+n+12; b++) if (*b!=DECFENCE)
+ printf("=== Corrupt byte [%02x] at offset +%ld from %ld, n=%ld ===\n", *b,
+ (LI)(b-b0-8), (LI)b0, (LI)n);
+ free(b0); // drop the storage
+ decAllocBytes-=n; // account for storage
+ // printf(" free -- dAB: %d (%d)\n", decAllocBytes, -n);
+ } // decFree
+#define malloc(a) decMalloc(a)
+#define free(a) decFree(a)
+#endif
diff --git a/source/luametatex/source/libraries/decnumber/decNumber.h b/source/luametatex/source/libraries/decnumber/decNumber.h
new file mode 100644
index 000000000..2981c73e0
--- /dev/null
+++ b/source/luametatex/source/libraries/decnumber/decNumber.h
@@ -0,0 +1,182 @@
+/* ------------------------------------------------------------------ */
+/* Decimal Number arithmetic module header */
+/* ------------------------------------------------------------------ */
+/* Copyright (c) IBM Corporation, 2000, 2010. All rights reserved. */
+/* */
+/* This software is made available under the terms of the */
+/* ICU License -- ICU 1.8.1 and later. */
+/* */
+/* The description and User's Guide ("The decNumber C Library") for */
+/* this software is called decNumber.pdf. This document is */
+/* available, together with arithmetic and format specifications, */
+/* testcases, and Web links, on the General Decimal Arithmetic page. */
+/* */
+/* Please send comments, suggestions, and corrections to the author: */
+/* mfc@uk.ibm.com */
+/* Mike Cowlishaw, IBM Fellow */
+/* IBM UK, PO Box 31, Birmingham Road, Warwick CV34 5JL, UK */
+/* ------------------------------------------------------------------ */
+
+#if !defined(DECNUMBER)
+ #define DECNUMBER
+ #define DECNAME "decNumber" /* Short name */
+ #define DECFULLNAME "Decimal Number Module" /* Verbose name */
+ #define DECAUTHOR "Mike Cowlishaw" /* Who to blame */
+
+ #if !defined(DECCONTEXT)
+ #include "decContext.h"
+ #endif
+
+ /* Bit settings for decNumber.bits */
+ #define DECNEG 0x80 /* Sign; 1=negative, 0=positive or zero */
+ #define DECINF 0x40 /* 1=Infinity */
+ #define DECNAN 0x20 /* 1=NaN */
+ #define DECSNAN 0x10 /* 1=sNaN */
+ /* The remaining bits are reserved; they must be 0 */
+ #define DECSPECIAL (DECINF|DECNAN|DECSNAN) /* any special value */
+
+ /* Define the decNumber data structure. The size and shape of the */
+ /* units array in the structure is determined by the following */
+ /* constant. This must not be changed without recompiling the */
+ /* decNumber library modules. */
+
+ #define DECDPUN 3 /* DECimal Digits Per UNit [must be >0 */
+ /* and <10; 3 or powers of 2 are best]. */
+
+ /* DECNUMDIGITS is the default number of digits that can be held in */
+ /* the structure. If undefined, 1 is assumed and it is assumed */
+ /* that the structure will be immediately followed by extra space, */
+ /* as required. DECNUMDIGITS is always >0. */
+ #if !defined(DECNUMDIGITS)
+ #define DECNUMDIGITS 1
+ #endif
+
+ /* The size (integer data type) of each unit is determined by the */
+ /* number of digits it will hold. */
+ #if DECDPUN<=2
+ #define decNumberUnit uint8_t
+ #elif DECDPUN<=4
+ #define decNumberUnit uint16_t
+ #else
+ #define decNumberUnit uint32_t
+ #endif
+ /* The number of units needed is ceil(DECNUMDIGITS/DECDPUN) */
+ #define DECNUMUNITS ((DECNUMDIGITS+DECDPUN-1)/DECDPUN)
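+ /* [e.g., DECNUMDIGITS 34 with DECDPUN 3 gives DECNUMUNITS of */
+ /* (34+2)/3 = 12 units, which can hold up to 36 digits] */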
+
+ /* The data structure... */
+ typedef struct decNumber {
+ int32_t digits; /* Count of digits in the coefficient; >0 */
+ int32_t exponent; /* Unadjusted exponent, unbiased, in */
+ /* range: -1999999997 through 999999999 */
+ uint8_t bits; /* Indicator bits (see above) */
+ /* Coefficient, from least significant unit */
+ decNumberUnit lsu[DECNUMUNITS];
+ } decNumber;
+
+ /* Notes: */
+ /* 1. If digits is > DECDPUN then there will be one or more */
+ /* decNumberUnits immediately following the first element of lsu.*/
+ /* These contain the remaining (more significant) digits of the */
+ /* number, and may be in the lsu array, or may be guaranteed by */
+ /* some other mechanism (such as being contained in another */
+ /* structure, or being overlaid on dynamically allocated */
+ /* storage). */
+ /* */
+ /* Each integer of the coefficient (except potentially the last) */
+ /* contains DECDPUN digits (e.g., a value in the range 0 through */
+ /* 99999999 if DECDPUN is 8, or 0 through 999 if DECDPUN is 3). */
+ /* */
+ /* 2. A decNumber converted to a string may need up to digits+14 */
+ /* characters. The worst cases (non-exponential and exponential */
+ /* formats) are -0.00000{9...}# and -9.{9...}E+999999999# */
+ /* (where # is '\0') */
+
+
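+ /* A minimal usage sketch (precision and values arbitrary; */
+ /* decContextDefault and DEC_INIT_BASE are from decContext.h): */
+ /* */
+ /* #define DECNUMDIGITS 34 // set before including this header */
+ /* #include "decNumber.h" */
+ /* decNumber a, b, r; */
+ /* decContext set; */
+ /* char out[DECNUMDIGITS+14]; // worst case; see note 2 above */
+ /* decContextDefault(&set, DEC_INIT_BASE); */
+ /* set.traps=0; // report via status, not signals */
+ /* set.digits=DECNUMDIGITS; */
+ /* decNumberFromString(&a, "1.23", &set); */
+ /* decNumberFromString(&b, "4.5", &set); */
+ /* decNumberAdd(&r, &a, &b, &set); // r is now 5.73 */
+ /* decNumberToString(&r, out); */
+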
+ /* ---------------------------------------------------------------- */
+ /* decNumber public functions and macros */
+ /* ---------------------------------------------------------------- */
+ /* Conversions */
+ decNumber * decNumberFromInt32(decNumber *, int32_t);
+ decNumber * decNumberFromUInt32(decNumber *, uint32_t);
+ decNumber * decNumberFromString(decNumber *, const char *, decContext *);
+ char * decNumberToString(const decNumber *, char *);
+ char * decNumberToEngString(const decNumber *, char *);
+ uint32_t decNumberToUInt32(const decNumber *, decContext *);
+ int32_t decNumberToInt32(const decNumber *, decContext *);
+ uint8_t * decNumberGetBCD(const decNumber *, uint8_t *);
+ decNumber * decNumberSetBCD(decNumber *, const uint8_t *, uint32_t);
+
+ /* Operators and elementary functions */
+ decNumber * decNumberAbs(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberAdd(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberAnd(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberCompare(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberCompareSignal(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberCompareTotal(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberCompareTotalMag(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberDivide(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberDivideInteger(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberExp(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberFMA(decNumber *, const decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberInvert(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberLn(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberLogB(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberLog10(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberMax(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberMaxMag(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberMin(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberMinMag(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberMinus(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberMultiply(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberNormalize(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberOr(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberPlus(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberPower(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberQuantize(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberReduce(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberRemainder(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberRemainderNear(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberRescale(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberRotate(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberSameQuantum(decNumber *, const decNumber *, const decNumber *);
+ decNumber * decNumberScaleB(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberShift(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberSquareRoot(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberSubtract(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberToIntegralExact(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberToIntegralValue(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberXor(decNumber *, const decNumber *, const decNumber *, decContext *);
+
+ /* Utilities */
+ enum decClass decNumberClass(const decNumber *, decContext *);
+ const char * decNumberClassToString(enum decClass);
+ decNumber * decNumberCopy(decNumber *, const decNumber *);
+ decNumber * decNumberCopyAbs(decNumber *, const decNumber *);
+ decNumber * decNumberCopyNegate(decNumber *, const decNumber *);
+ decNumber * decNumberCopySign(decNumber *, const decNumber *, const decNumber *);
+ decNumber * decNumberNextMinus(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberNextPlus(decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberNextToward(decNumber *, const decNumber *, const decNumber *, decContext *);
+ decNumber * decNumberTrim(decNumber *);
+ const char * decNumberVersion(void);
+ decNumber * decNumberZero(decNumber *);
+
+ /* Functions for testing decNumbers (normality depends on context) */
+ int32_t decNumberIsNormal(const decNumber *, decContext *);
+ int32_t decNumberIsSubnormal(const decNumber *, decContext *);
+
+ /* Macros for testing decNumber *dn */
+ #define decNumberIsCanonical(dn) (1) /* All decNumbers are saintly */
+ #define decNumberIsFinite(dn) (((dn)->bits&DECSPECIAL)==0)
+ #define decNumberIsInfinite(dn) (((dn)->bits&DECINF)!=0)
+ #define decNumberIsNaN(dn) (((dn)->bits&(DECNAN|DECSNAN))!=0)
+ #define decNumberIsNegative(dn) (((dn)->bits&DECNEG)!=0)
+ #define decNumberIsQNaN(dn) (((dn)->bits&(DECNAN))!=0)
+ #define decNumberIsSNaN(dn) (((dn)->bits&(DECSNAN))!=0)
+ #define decNumberIsSpecial(dn) (((dn)->bits&DECSPECIAL)!=0)
+ #define decNumberIsZero(dn) (*(dn)->lsu==0 \
+ && (dn)->digits==1 \
+ && (((dn)->bits&DECSPECIAL)==0))
+ #define decNumberRadix(dn) (10)
+
+#endif
diff --git a/source/luametatex/source/libraries/decnumber/decNumberLocal.h b/source/luametatex/source/libraries/decnumber/decNumberLocal.h
new file mode 100644
index 000000000..bf874ae44
--- /dev/null
+++ b/source/luametatex/source/libraries/decnumber/decNumberLocal.h
@@ -0,0 +1,757 @@
+/* ------------------------------------------------------------------ */
+/* decNumber package local type, tuning, and macro definitions */
+/* ------------------------------------------------------------------ */
+/* Copyright (c) IBM Corporation, 2000, 2010. All rights reserved. */
+/* */
+/* This software is made available under the terms of the */
+/* ICU License -- ICU 1.8.1 and later. */
+/* */
+/* The description and User's Guide ("The decNumber C Library") for */
+/* this software is called decNumber.pdf. This document is */
+/* available, together with arithmetic and format specifications, */
+/* testcases, and Web links, on the General Decimal Arithmetic page. */
+/* */
+/* Please send comments, suggestions, and corrections to the author: */
+/* mfc@uk.ibm.com */
+/* Mike Cowlishaw, IBM Fellow */
+/* IBM UK, PO Box 31, Birmingham Road, Warwick CV34 5JL, UK */
+/* ------------------------------------------------------------------ */
+/* This header file is included by all modules in the decNumber */
+/* library, and contains local type definitions, tuning parameters, */
+/* etc. It should not need to be used by application programs. */
+/* decNumber.h or one of decDouble (etc.) must be included first. */
+/* ------------------------------------------------------------------ */
+
+#if !defined(DECNUMBERLOC)
+ #define DECNUMBERLOC
+ #define DECVERSION "decNumber 3.68" /* Package Version [16 max.] */
+ #define DECNLAUTHOR "Mike Cowlishaw" /* Who to blame */
+
+ #include <stdlib.h> /* for abs */
+ #include <string.h> /* for memset, strcpy */
+
+ /* Conditional code flag -- set this to match hardware platform */
+ #if !defined(DECLITEND)
+ #define DECLITEND 1 /* 1=little-endian, 0=big-endian */
+ #endif
+
+ /* Conditional code flag -- set this to 1 for best performance */
+ #if !defined(DECUSE64)
+ #define DECUSE64 1 /* 1=use int64s, 0=int32 & smaller only */
+ #endif
+
+ /* Conditional code flag -- set this to 0 to exclude printf calls */
+ #if !defined(DECPRINT)
+ #define DECPRINT 1 /* 1=allow printf calls; 0=no printf */
+ #endif
+
+ /* Conditional check flags -- set these to 0 for best performance */
+ #if !defined(DECCHECK)
+ #define DECCHECK 0 /* 1 to enable robust checking */
+ #endif
+ #if !defined(DECALLOC)
+ #define DECALLOC 0 /* 1 to enable memory accounting */
+ #endif
+ #if !defined(DECTRACE)
+ #define DECTRACE 0 /* 1 to trace certain internals, etc. */
+ #endif
+
+ /* Tuning parameter for decNumber (arbitrary precision) module */
+ #if !defined(DECBUFFER)
+ #define DECBUFFER 36 /* Size basis for local buffers. This */
+ /* should be a common maximum precision */
+ /* rounded up to a multiple of 4; must */
+ /* be zero or positive. */
+ #endif
+
+
+ /* ---------------------------------------------------------------- */
+ /* Check parameter dependencies */
+ /* ---------------------------------------------------------------- */
+ #if DECCHECK & !DECPRINT
+ #error DECCHECK needs DECPRINT to be useful
+ #endif
+ #if DECALLOC & !DECPRINT
+ #error DECALLOC needs DECPRINT to be useful
+ #endif
+ #if DECTRACE & !DECPRINT
+ #error DECTRACE needs DECPRINT to be useful
+ #endif
+
+ /* ---------------------------------------------------------------- */
+ /* Definitions for all modules (general-purpose) */
+ /* ---------------------------------------------------------------- */
+
+ /* Local names for common types -- for safety, decNumber modules do */
+ /* not use int or long directly. */
+ #define Flag uint8_t
+ #define Byte int8_t
+ #define uByte uint8_t
+ #define Short int16_t
+ #define uShort uint16_t
+ #define Int int32_t
+ #define uInt uint32_t
+ #define Unit decNumberUnit
+ #if DECUSE64
+ #define Long int64_t
+ #define uLong uint64_t
+ #endif
+
+ /* Development-use definitions */
+ typedef long int LI; /* for printf arguments only */
+ #define DECNOINT 0 /* 1 to check no internal use of 'int' */
+ /* or stdint types */
+ #if DECNOINT
+ /* if these interfere with your C includes, do not set DECNOINT */
+ #define int ? /* enable to ensure that plain C 'int' */
+ #define long ?? /* .. or 'long' types are not used */
+ #endif
+
+ /* Shared lookup tables */
+ extern const uByte DECSTICKYTAB[10]; /* re-round digits if sticky */
+ extern const uInt DECPOWERS[10]; /* powers of ten table */
+ /* The following are included from decDPD.h */
+ extern const uShort DPD2BIN[1024]; /* DPD -> 0-999 */
+ extern const uShort BIN2DPD[1000]; /* 0-999 -> DPD */
+ extern const uInt DPD2BINK[1024]; /* DPD -> 0-999000 */
+ extern const uInt DPD2BINM[1024]; /* DPD -> 0-999000000 */
+ extern const uByte DPD2BCD8[4096]; /* DPD -> ddd + len */
+ extern const uByte BIN2BCD8[4000]; /* 0-999 -> ddd + len */
+ extern const uShort BCD2DPD[2458]; /* 0-0x999 -> DPD (0x999=2457)*/
+
+ /* LONGMUL32HI -- set w=(u*v)>>32, where w, u, and v are uInts */
+ /* (that is, sets w to be the high-order word of the 64-bit result; */
+ /* the low-order word is simply u*v.) */
+ /* This version is derived from Knuth via Hacker's Delight; */
+ /* it seems to optimize better than some others tried */
+ #define LONGMUL32HI(w, u, v) { \
+ uInt u0, u1, v0, v1, w0, w1, w2, t; \
+ u0=u & 0xffff; u1=u>>16; \
+ v0=v & 0xffff; v1=v>>16; \
+ w0=u0*v0; \
+ t=u1*v0 + (w0>>16); \
+ w1=t & 0xffff; w2=t>>16; \
+ w1=u0*v1 + w1; \
+ (w)=u1*v1 + w2 + (w1>>16);}
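+
+  /* Hedged illustration (not part of the library; u, v, w are      */
+  /* example locals only):                                          */
+  /*   uInt u=0xffffffff, v=0xffffffff, w;                          */
+  /*   LONGMUL32HI(w, u, v);                                        */
+  /* gives w==0xfffffffe, while the low word (uInt)(u*v) is 1       */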
+
+ /* ROUNDUP -- round an integer up to a multiple of n */
+ #define ROUNDUP(i, n) ((((i)+(n)-1)/n)*n)
+ #define ROUNDUP4(i) (((i)+3)&~3) /* special for n=4 */
+
+ /* ROUNDDOWN -- round an integer down to a multiple of n */
+ #define ROUNDDOWN(i, n) (((i)/n)*n)
+ #define ROUNDDOWN4(i) ((i)&~3) /* special for n=4 */
+
+ /* References to multi-byte sequences under different sizes; these */
+ /* require locally declared variables, but do not violate strict */
+ /* aliasing or alignment (as did the UINTAT simple cast to uInt). */
+ /* Variables needed are uswork, uiwork, etc. [so do not use at same */
+ /* level in an expression, e.g., UBTOUI(x)==UBTOUI(y) may fail]. */
+
+ /* Return a uInt, etc., from bytes starting at a char* or uByte* */
+ #define UBTOUS(b) (memcpy((void *)&uswork, b, 2), uswork)
+ #define UBTOUI(b) (memcpy((void *)&uiwork, b, 4), uiwork)
+
+ /* Store a uInt, etc., into bytes starting at a char* or uByte*. */
+ /* Returns i, evaluated, for convenience; has to use uiwork because */
+ /* i may be an expression. */
+ #define UBFROMUS(b, i) (uswork=(i), memcpy(b, (void *)&uswork, 2), uswork)
+ #define UBFROMUI(b, i) (uiwork=(i), memcpy(b, (void *)&uiwork, 4), uiwork)
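+
+  /* Hedged illustration of safe use: uiwork must be a local, and   */
+  /* the two loads must be split into separate statements:          */
+  /*   uInt uiwork;                                                 */
+  /*   uInt a=UBTOUI(x);                                            */
+  /*   if (a==UBTOUI(y)) ...                                        */
+  /* whereas UBTOUI(x)==UBTOUI(y) in one expression may fail        */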
+
+ /* X10 and X100 -- multiply integer i by 10 or 100 */
+ /* [shifts are usually faster than multiply; could be conditional] */
+ #define X10(i) (((i)<<1)+((i)<<3))
+ #define X100(i) (((i)<<2)+((i)<<5)+((i)<<6))
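+  /* (illustration: X10(7) expands to (7<<1)+(7<<3), i.e. 14+56=70) */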
+
+ /* MAXI and MINI -- general max & min (not in ANSI) for integers */
+ #define MAXI(x,y) ((x)<(y)?(y):(x))
+ #define MINI(x,y) ((x)>(y)?(y):(x))
+
+ /* Useful constants */
+ #define BILLION 1000000000 /* 10**9 */
+ /* CHARMASK: 0x30303030 for ASCII/UTF8; 0xF0F0F0F0 for EBCDIC */
+ #define CHARMASK ((((((((uInt)'0')<<8)+'0')<<8)+'0')<<8)+'0')
+
+
+ /* ---------------------------------------------------------------- */
+  /* Definitions for arbitrary-precision modules (only valid after  */
+ /* decNumber.h has been included) */
+ /* ---------------------------------------------------------------- */
+
+ /* Limits and constants */
+ #define DECNUMMAXP 999999999 /* maximum precision code can handle */
+ #define DECNUMMAXE 999999999 /* maximum adjusted exponent ditto */
+ #define DECNUMMINE -999999999 /* minimum adjusted exponent ditto */
+ #if (DECNUMMAXP != DEC_MAX_DIGITS)
+ #error Maximum digits mismatch
+ #endif
+ #if (DECNUMMAXE != DEC_MAX_EMAX)
+ #error Maximum exponent mismatch
+ #endif
+ #if (DECNUMMINE != DEC_MIN_EMIN)
+ #error Minimum exponent mismatch
+ #endif
+
+ /* Set DECDPUNMAX -- the maximum integer that fits in DECDPUN */
+ /* digits, and D2UTABLE -- the initializer for the D2U table */
+ #if DECDPUN==1
+ #define DECDPUNMAX 9
+ #define D2UTABLE {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17, \
+ 18,19,20,21,22,23,24,25,26,27,28,29,30,31,32, \
+ 33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, \
+ 48,49}
+ #elif DECDPUN==2
+ #define DECDPUNMAX 99
+ #define D2UTABLE {0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10, \
+ 11,11,12,12,13,13,14,14,15,15,16,16,17,17,18, \
+ 18,19,19,20,20,21,21,22,22,23,23,24,24,25}
+ #elif DECDPUN==3
+ #define DECDPUNMAX 999
+ #define D2UTABLE {0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7, \
+ 8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13, \
+ 13,14,14,14,15,15,15,16,16,16,17}
+ #elif DECDPUN==4
+ #define DECDPUNMAX 9999
+ #define D2UTABLE {0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6, \
+ 6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11, \
+ 11,11,11,12,12,12,12,13}
+ #elif DECDPUN==5
+ #define DECDPUNMAX 99999
+ #define D2UTABLE {0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5, \
+ 5,5,5,5,6,6,6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9, \
+ 9,9,10,10,10,10}
+ #elif DECDPUN==6
+ #define DECDPUNMAX 999999
+ #define D2UTABLE {0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4, \
+ 4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8, \
+ 8,8,8,8,8,9}
+ #elif DECDPUN==7
+ #define DECDPUNMAX 9999999
+ #define D2UTABLE {0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3, \
+ 4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7, \
+ 7,7,7,7,7,7}
+ #elif DECDPUN==8
+ #define DECDPUNMAX 99999999
+ #define D2UTABLE {0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3, \
+ 3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6, \
+ 6,6,6,6,6,7}
+ #elif DECDPUN==9
+ #define DECDPUNMAX 999999999
+ #define D2UTABLE {0,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3, \
+ 3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5, \
+ 5,5,6,6,6,6}
+ #elif defined(DECDPUN)
+ #error DECDPUN must be in the range 1-9
+ #endif
+
+ /* ----- Shared data (in decNumber.c) ----- */
+ /* Public lookup table used by the D2U macro (see below) */
+ #define DECMAXD2U 49
+ extern const uByte d2utable[DECMAXD2U+1];
+
+ /* ----- Macros ----- */
+ /* ISZERO -- return true if decNumber dn is a zero */
+ /* [performance-critical in some situations] */
+ #define ISZERO(dn) decNumberIsZero(dn) /* now just a local name */
+
+ /* D2U -- return the number of Units needed to hold d digits */
+ /* (runtime version, with table lookaside for small d) */
+ #if DECDPUN==8
+ #define D2U(d) ((unsigned)((d)<=DECMAXD2U?d2utable[d]:((d)+7)>>3))
+ #elif DECDPUN==4
+ #define D2U(d) ((unsigned)((d)<=DECMAXD2U?d2utable[d]:((d)+3)>>2))
+ #else
+ #define D2U(d) ((d)<=DECMAXD2U?d2utable[d]:((d)+DECDPUN-1)/DECDPUN)
+ #endif
+ /* SD2U -- static D2U macro (for compile-time calculation) */
+ #define SD2U(d) (((d)+DECDPUN-1)/DECDPUN)
+
+ /* MSUDIGITS -- returns digits in msu, from digits, calculated */
+ /* using D2U */
+ #define MSUDIGITS(d) ((d)-(D2U(d)-1)*DECDPUN)
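+  /* Worked illustration (assuming DECDPUN==3): 7 digits need       */
+  /* D2U(7)==3 Units, and MSUDIGITS(7)==7-(3-1)*3==1, so the msu    */
+  /* holds a single digit                                           */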
+
+ /* D2N -- return the number of decNumber structs that would be */
+ /* needed to contain that number of digits (and the initial */
+ /* decNumber struct) safely. Note that one Unit is included in the */
+ /* initial structure. Used for allocating space that is aligned on */
+ /* a decNumber struct boundary. */
+ #define D2N(d) \
+ ((((SD2U(d)-1)*sizeof(Unit))+sizeof(decNumber)*2-1)/sizeof(decNumber))
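+
+  /* Hedged allocation sketch: a work area able to hold a 100-digit */
+  /* number could be declared as                                    */
+  /*   decNumber buf[D2N(100)];                                     */
+  /* and then used through a plain pointer: decNumber *dn=buf;      */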
+
+ /* TODIGIT -- macro to remove the leading digit from the unsigned */
+ /* integer u at column cut (counting from the right, LSD=0) and */
+ /* place it as an ASCII character into the character pointed to by */
+ /* c. Note that cut must be <= 9, and the maximum value for u is */
+ /* 2,000,000,000 (as is needed for negative exponents of */
+ /* subnormals). The unsigned integer pow is used as a temporary */
+ /* variable. */
+ #define TODIGIT(u, cut, c, pow) { \
+ *(c)='0'; \
+ pow=DECPOWERS[cut]*2; \
+ if ((u)>pow) { \
+ pow*=4; \
+ if ((u)>=pow) {(u)-=pow; *(c)+=8;} \
+ pow/=2; \
+ if ((u)>=pow) {(u)-=pow; *(c)+=4;} \
+ pow/=2; \
+ } \
+ if ((u)>=pow) {(u)-=pow; *(c)+=2;} \
+ pow/=2; \
+ if ((u)>=pow) {(u)-=pow; *(c)+=1;} \
+ }
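+
+  /* Worked illustration (example values): with u=2718 and cut=3,   */
+  /* TODIGIT(u, 3, c, pow) stores '2' in *c and leaves u==718; a    */
+  /* second call with cut=2 then yields '7' and leaves u==18        */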
+
+ /* ---------------------------------------------------------------- */
+ /* Definitions for fixed-precision modules (only valid after */
+ /* decSingle.h, decDouble.h, or decQuad.h has been included) */
+ /* ---------------------------------------------------------------- */
+
+ /* bcdnum -- a structure describing a format-independent finite */
+ /* number, whose coefficient is a string of bcd8 uBytes */
+ typedef struct bcdnum {
+ uByte *msd; /* -> most significant digit */
+ uByte *lsd; /* -> least ditto */
+ uInt sign; /* 0=positive, DECFLOAT_Sign=negative */
+ Int exponent; /* Unadjusted signed exponent (q), or */
+ /* DECFLOAT_NaN etc. for a special */
+ } bcdnum;
+
+  /* Tests whether an exponent (or a bcdnum exponent) encodes a special, etc. */
+ #define EXPISSPECIAL(exp) ((exp)>=DECFLOAT_MinSp)
+ #define EXPISINF(exp) (exp==DECFLOAT_Inf)
+ #define EXPISNAN(exp) (exp==DECFLOAT_qNaN || exp==DECFLOAT_sNaN)
+ #define NUMISSPECIAL(num) (EXPISSPECIAL((num)->exponent))
+
+ /* Refer to a 32-bit word or byte in a decFloat (df) by big-endian */
+ /* (array) notation (the 0 word or byte contains the sign bit), */
+ /* automatically adjusting for endianness; similarly address a word */
+ /* in the next-wider format (decFloatWider, or dfw) */
+ #define DECWORDS (DECBYTES/4)
+ #define DECWWORDS (DECWBYTES/4)
+ #if DECLITEND
+ #define DFBYTE(df, off) ((df)->bytes[DECBYTES-1-(off)])
+ #define DFWORD(df, off) ((df)->words[DECWORDS-1-(off)])
+ #define DFWWORD(dfw, off) ((dfw)->words[DECWWORDS-1-(off)])
+ #else
+ #define DFBYTE(df, off) ((df)->bytes[off])
+ #define DFWORD(df, off) ((df)->words[off])
+ #define DFWWORD(dfw, off) ((dfw)->words[off])
+ #endif
+
+ /* Tests for sign or specials, directly on DECFLOATs */
+ #define DFISSIGNED(df) ((DFWORD(df, 0)&0x80000000)!=0)
+ #define DFISSPECIAL(df) ((DFWORD(df, 0)&0x78000000)==0x78000000)
+ #define DFISINF(df) ((DFWORD(df, 0)&0x7c000000)==0x78000000)
+ #define DFISNAN(df) ((DFWORD(df, 0)&0x7c000000)==0x7c000000)
+ #define DFISQNAN(df) ((DFWORD(df, 0)&0x7e000000)==0x7c000000)
+ #define DFISSNAN(df) ((DFWORD(df, 0)&0x7e000000)==0x7e000000)
+
+ /* Shared lookup tables */
+ extern const uInt DECCOMBMSD[64]; /* Combination field -> MSD */
+ extern const uInt DECCOMBFROM[48]; /* exp+msd -> Combination */
+
+ /* Private generic (utility) routine */
+ #if DECCHECK || DECTRACE
+ extern void decShowNum(const bcdnum *, const char *);
+ #endif
+
+ /* Format-dependent macros and constants */
+ #if defined(DECPMAX)
+
+ /* Useful constants */
+ #define DECPMAX9 (ROUNDUP(DECPMAX, 9)/9) /* 'Pmax' in 10**9s */
+ /* Top words for a zero */
+ #define SINGLEZERO 0x22500000
+ #define DOUBLEZERO 0x22380000
+ #define QUADZERO 0x22080000
+ /* [ZEROWORD is defined to be one of these in the DFISZERO macro] */
+
+ /* Format-dependent common tests: */
+ /* DFISZERO -- test for (any) zero */
+ /* DFISCCZERO -- test for coefficient continuation being zero */
+ /* DFISCC01 -- test for coefficient contains only 0s and 1s */
+ /* DFISINT -- test for finite and exponent q=0 */
+ /* DFISUINT01 -- test for sign=0, finite, exponent q=0, and */
+ /* MSD=0 or 1 */
+ /* ZEROWORD is also defined here. */
+ /* */
+ /* In DFISZERO the first test checks the least-significant word */
+ /* (most likely to be non-zero); the penultimate tests MSD and */
+ /* DPDs in the signword, and the final test excludes specials and */
+ /* MSD>7. DFISINT similarly has to allow for the two forms of */
+ /* MSD codes. DFISUINT01 only has to allow for one form of MSD */
+ /* code. */
+ #if DECPMAX==7
+ #define ZEROWORD SINGLEZERO
+ /* [test macros not needed except for Zero] */
+ #define DFISZERO(df) ((DFWORD(df, 0)&0x1c0fffff)==0 \
+ && (DFWORD(df, 0)&0x60000000)!=0x60000000)
+ #elif DECPMAX==16
+ #define ZEROWORD DOUBLEZERO
+ #define DFISZERO(df) ((DFWORD(df, 1)==0 \
+ && (DFWORD(df, 0)&0x1c03ffff)==0 \
+ && (DFWORD(df, 0)&0x60000000)!=0x60000000))
+ #define DFISINT(df) ((DFWORD(df, 0)&0x63fc0000)==0x22380000 \
+ ||(DFWORD(df, 0)&0x7bfc0000)==0x6a380000)
+ #define DFISUINT01(df) ((DFWORD(df, 0)&0xfbfc0000)==0x22380000)
+ #define DFISCCZERO(df) (DFWORD(df, 1)==0 \
+ && (DFWORD(df, 0)&0x0003ffff)==0)
+ #define DFISCC01(df) ((DFWORD(df, 0)&~0xfffc9124)==0 \
+ && (DFWORD(df, 1)&~0x49124491)==0)
+ #elif DECPMAX==34
+ #define ZEROWORD QUADZERO
+ #define DFISZERO(df) ((DFWORD(df, 3)==0 \
+ && DFWORD(df, 2)==0 \
+ && DFWORD(df, 1)==0 \
+ && (DFWORD(df, 0)&0x1c003fff)==0 \
+ && (DFWORD(df, 0)&0x60000000)!=0x60000000))
+ #define DFISINT(df) ((DFWORD(df, 0)&0x63ffc000)==0x22080000 \
+ ||(DFWORD(df, 0)&0x7bffc000)==0x6a080000)
+ #define DFISUINT01(df) ((DFWORD(df, 0)&0xfbffc000)==0x22080000)
+ #define DFISCCZERO(df) (DFWORD(df, 3)==0 \
+ && DFWORD(df, 2)==0 \
+ && DFWORD(df, 1)==0 \
+ && (DFWORD(df, 0)&0x00003fff)==0)
+
+ #define DFISCC01(df) ((DFWORD(df, 0)&~0xffffc912)==0 \
+ && (DFWORD(df, 1)&~0x44912449)==0 \
+ && (DFWORD(df, 2)&~0x12449124)==0 \
+ && (DFWORD(df, 3)&~0x49124491)==0)
+ #endif
+
+ /* Macros to test if a certain 10 bits of a uInt or pair of uInts */
+ /* are a canonical declet [higher or lower bits are ignored]. */
+ /* declet is at offset 0 (from the right) in a uInt: */
+ #define CANONDPD(dpd) (((dpd)&0x300)==0 || ((dpd)&0x6e)!=0x6e)
+ /* declet is at offset k (a multiple of 2) in a uInt: */
+ #define CANONDPDOFF(dpd, k) (((dpd)&(0x300<<(k)))==0 \
+ || ((dpd)&(((uInt)0x6e)<<(k)))!=(((uInt)0x6e)<<(k)))
+ /* declet is at offset k (a multiple of 2) in a pair of uInts: */
+ /* [the top 2 bits will always be in the more-significant uInt] */
+ #define CANONDPDTWO(hi, lo, k) (((hi)&(0x300>>(32-(k))))==0 \
+ || ((hi)&(0x6e>>(32-(k))))!=(0x6e>>(32-(k))) \
+ || ((lo)&(((uInt)0x6e)<<(k)))!=(((uInt)0x6e)<<(k)))
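+
+    /* Illustration (derived from the macros above): CANONDPD(0x06e) */
+    /* is true because the top two bits are zero, while              */
+    /* CANONDPD(0x16e) is false: bits 0x300 are non-zero and bits    */
+    /* 0x6e are all set, marking a non-canonical declet              */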
+
+ /* Macro to test whether a full-length (length DECPMAX) BCD8 */
+ /* coefficient, starting at uByte u, is all zeros */
+ /* Test just the LSWord first, then the remainder as a sequence */
+ /* of tests in order to avoid same-level use of UBTOUI */
+ #if DECPMAX==7
+ #define ISCOEFFZERO(u) ( \
+ UBTOUI((u)+DECPMAX-4)==0 \
+ && UBTOUS((u)+DECPMAX-6)==0 \
+ && *(u)==0)
+ #elif DECPMAX==16
+ #define ISCOEFFZERO(u) ( \
+ UBTOUI((u)+DECPMAX-4)==0 \
+ && UBTOUI((u)+DECPMAX-8)==0 \
+ && UBTOUI((u)+DECPMAX-12)==0 \
+ && UBTOUI(u)==0)
+ #elif DECPMAX==34
+ #define ISCOEFFZERO(u) ( \
+ UBTOUI((u)+DECPMAX-4)==0 \
+ && UBTOUI((u)+DECPMAX-8)==0 \
+ && UBTOUI((u)+DECPMAX-12)==0 \
+ && UBTOUI((u)+DECPMAX-16)==0 \
+ && UBTOUI((u)+DECPMAX-20)==0 \
+ && UBTOUI((u)+DECPMAX-24)==0 \
+ && UBTOUI((u)+DECPMAX-28)==0 \
+ && UBTOUI((u)+DECPMAX-32)==0 \
+ && UBTOUS(u)==0)
+ #endif
+
+ /* Macros and masks for the sign, exponent continuation, and MSD */
+ /* Get the sign as DECFLOAT_Sign or 0 */
+ #define GETSIGN(df) (DFWORD(df, 0)&0x80000000)
+ /* Get the exponent continuation from a decFloat *df as an Int */
+ #define GETECON(df) ((Int)((DFWORD((df), 0)&0x03ffffff)>>(32-6-DECECONL)))
+ /* Ditto, from the next-wider format */
+ #define GETWECON(df) ((Int)((DFWWORD((df), 0)&0x03ffffff)>>(32-6-DECWECONL)))
+ /* Get the biased exponent similarly */
+ #define GETEXP(df) ((Int)(DECCOMBEXP[DFWORD((df), 0)>>26]+GETECON(df)))
+ /* Get the unbiased exponent similarly */
+ #define GETEXPUN(df) ((Int)GETEXP(df)-DECBIAS)
+ /* Get the MSD similarly (as uInt) */
+ #define GETMSD(df) (DECCOMBMSD[DFWORD((df), 0)>>26])
+
+ /* Compile-time computes of the exponent continuation field masks */
+ /* full exponent continuation field: */
+ #define ECONMASK ((0x03ffffff>>(32-6-DECECONL))<<(32-6-DECECONL))
+ /* same, not including its first digit (the qNaN/sNaN selector): */
+ #define ECONNANMASK ((0x01ffffff>>(32-6-DECECONL))<<(32-6-DECECONL))
+
+ /* Macros to decode the coefficient in a finite decFloat *df into */
+ /* a BCD string (uByte *bcdin) of length DECPMAX uBytes. */
+
+ /* In-line sequence to convert least significant 10 bits of uInt */
+ /* dpd to three BCD8 digits starting at uByte u. Note that an */
+ /* extra byte is written to the right of the three digits because */
+ /* four bytes are moved at a time for speed; the alternative */
+ /* macro moves exactly three bytes (usually slower). */
+ #define dpd2bcd8(u, dpd) memcpy(u, &DPD2BCD8[((dpd)&0x3ff)*4], 4)
+ #define dpd2bcd83(u, dpd) memcpy(u, &DPD2BCD8[((dpd)&0x3ff)*4], 3)
+
+ /* Decode the declets. After extracting each one, it is decoded */
+ /* to BCD8 using a table lookup (also used for variable-length */
+    /* decode). Each DPD decode is 3 bytes BCD8 plus a one-byte    */
+    /* length (which is not used here). Fixed-length 4-byte moves  */
+ /* are fast, however, almost everywhere, and so are used except */
+ /* for the final three bytes (to avoid overrun). The code below */
+ /* is 36 instructions for Doubles and about 70 for Quads, even */
+ /* on IA32. */
+
+ /* Two macros are defined for each format: */
+ /* GETCOEFF extracts the coefficient of the current format */
+ /* GETWCOEFF extracts the coefficient of the next-wider format. */
+ /* The latter is a copy of the next-wider GETCOEFF using DFWWORD. */
+
+ #if DECPMAX==7
+ #define GETCOEFF(df, bcd) { \
+ uInt sourhi=DFWORD(df, 0); \
+ *(bcd)=(uByte)DECCOMBMSD[sourhi>>26]; \
+ dpd2bcd8(bcd+1, sourhi>>10); \
+ dpd2bcd83(bcd+4, sourhi);}
+ #define GETWCOEFF(df, bcd) { \
+ uInt sourhi=DFWWORD(df, 0); \
+ uInt sourlo=DFWWORD(df, 1); \
+ *(bcd)=(uByte)DECCOMBMSD[sourhi>>26]; \
+ dpd2bcd8(bcd+1, sourhi>>8); \
+ dpd2bcd8(bcd+4, (sourhi<<2) | (sourlo>>30)); \
+ dpd2bcd8(bcd+7, sourlo>>20); \
+ dpd2bcd8(bcd+10, sourlo>>10); \
+ dpd2bcd83(bcd+13, sourlo);}
+
+ #elif DECPMAX==16
+ #define GETCOEFF(df, bcd) { \
+ uInt sourhi=DFWORD(df, 0); \
+ uInt sourlo=DFWORD(df, 1); \
+ *(bcd)=(uByte)DECCOMBMSD[sourhi>>26]; \
+ dpd2bcd8(bcd+1, sourhi>>8); \
+ dpd2bcd8(bcd+4, (sourhi<<2) | (sourlo>>30)); \
+ dpd2bcd8(bcd+7, sourlo>>20); \
+ dpd2bcd8(bcd+10, sourlo>>10); \
+ dpd2bcd83(bcd+13, sourlo);}
+ #define GETWCOEFF(df, bcd) { \
+ uInt sourhi=DFWWORD(df, 0); \
+ uInt sourmh=DFWWORD(df, 1); \
+ uInt sourml=DFWWORD(df, 2); \
+ uInt sourlo=DFWWORD(df, 3); \
+ *(bcd)=(uByte)DECCOMBMSD[sourhi>>26]; \
+ dpd2bcd8(bcd+1, sourhi>>4); \
+ dpd2bcd8(bcd+4, ((sourhi)<<6) | (sourmh>>26)); \
+ dpd2bcd8(bcd+7, sourmh>>16); \
+ dpd2bcd8(bcd+10, sourmh>>6); \
+ dpd2bcd8(bcd+13, ((sourmh)<<4) | (sourml>>28)); \
+ dpd2bcd8(bcd+16, sourml>>18); \
+ dpd2bcd8(bcd+19, sourml>>8); \
+ dpd2bcd8(bcd+22, ((sourml)<<2) | (sourlo>>30)); \
+ dpd2bcd8(bcd+25, sourlo>>20); \
+ dpd2bcd8(bcd+28, sourlo>>10); \
+ dpd2bcd83(bcd+31, sourlo);}
+
+ #elif DECPMAX==34
+ #define GETCOEFF(df, bcd) { \
+ uInt sourhi=DFWORD(df, 0); \
+ uInt sourmh=DFWORD(df, 1); \
+ uInt sourml=DFWORD(df, 2); \
+ uInt sourlo=DFWORD(df, 3); \
+ *(bcd)=(uByte)DECCOMBMSD[sourhi>>26]; \
+ dpd2bcd8(bcd+1, sourhi>>4); \
+ dpd2bcd8(bcd+4, ((sourhi)<<6) | (sourmh>>26)); \
+ dpd2bcd8(bcd+7, sourmh>>16); \
+ dpd2bcd8(bcd+10, sourmh>>6); \
+ dpd2bcd8(bcd+13, ((sourmh)<<4) | (sourml>>28)); \
+ dpd2bcd8(bcd+16, sourml>>18); \
+ dpd2bcd8(bcd+19, sourml>>8); \
+ dpd2bcd8(bcd+22, ((sourml)<<2) | (sourlo>>30)); \
+ dpd2bcd8(bcd+25, sourlo>>20); \
+ dpd2bcd8(bcd+28, sourlo>>10); \
+ dpd2bcd83(bcd+31, sourlo);}
+
+ #define GETWCOEFF(df, bcd) {??} /* [should never be used] */
+ #endif
+
+ /* Macros to decode the coefficient in a finite decFloat *df into */
+ /* a base-billion uInt array, with the least-significant */
+ /* 0-999999999 'digit' at offset 0. */
+
+ /* Decode the declets. After extracting each one, it is decoded */
+ /* to binary using a table lookup. Three tables are used; one */
+ /* the usual DPD to binary, the other two pre-multiplied by 1000 */
+ /* and 1000000 to avoid multiplication during decode. These */
+ /* tables can also be used for multiplying up the MSD as the DPD */
+ /* code for 0 through 9 is the identity. */
+ #define DPD2BIN0 DPD2BIN /* for prettier code */
+
+ #if DECPMAX==7
+ #define GETCOEFFBILL(df, buf) { \
+ uInt sourhi=DFWORD(df, 0); \
+ (buf)[0]=DPD2BIN0[sourhi&0x3ff] \
+ +DPD2BINK[(sourhi>>10)&0x3ff] \
+ +DPD2BINM[DECCOMBMSD[sourhi>>26]];}
+
+ #elif DECPMAX==16
+ #define GETCOEFFBILL(df, buf) { \
+ uInt sourhi, sourlo; \
+ sourlo=DFWORD(df, 1); \
+ (buf)[0]=DPD2BIN0[sourlo&0x3ff] \
+ +DPD2BINK[(sourlo>>10)&0x3ff] \
+ +DPD2BINM[(sourlo>>20)&0x3ff]; \
+ sourhi=DFWORD(df, 0); \
+ (buf)[1]=DPD2BIN0[((sourhi<<2) | (sourlo>>30))&0x3ff] \
+ +DPD2BINK[(sourhi>>8)&0x3ff] \
+ +DPD2BINM[DECCOMBMSD[sourhi>>26]];}
+
+ #elif DECPMAX==34
+ #define GETCOEFFBILL(df, buf) { \
+ uInt sourhi, sourmh, sourml, sourlo; \
+ sourlo=DFWORD(df, 3); \
+ (buf)[0]=DPD2BIN0[sourlo&0x3ff] \
+ +DPD2BINK[(sourlo>>10)&0x3ff] \
+ +DPD2BINM[(sourlo>>20)&0x3ff]; \
+ sourml=DFWORD(df, 2); \
+ (buf)[1]=DPD2BIN0[((sourml<<2) | (sourlo>>30))&0x3ff] \
+ +DPD2BINK[(sourml>>8)&0x3ff] \
+ +DPD2BINM[(sourml>>18)&0x3ff]; \
+ sourmh=DFWORD(df, 1); \
+ (buf)[2]=DPD2BIN0[((sourmh<<4) | (sourml>>28))&0x3ff] \
+ +DPD2BINK[(sourmh>>6)&0x3ff] \
+ +DPD2BINM[(sourmh>>16)&0x3ff]; \
+ sourhi=DFWORD(df, 0); \
+ (buf)[3]=DPD2BIN0[((sourhi<<6) | (sourmh>>26))&0x3ff] \
+ +DPD2BINK[(sourhi>>4)&0x3ff] \
+ +DPD2BINM[DECCOMBMSD[sourhi>>26]];}
+
+ #endif
+
+ /* Macros to decode the coefficient in a finite decFloat *df into */
+ /* a base-thousand uInt array (of size DECLETS+1, to allow for */
+ /* the MSD), with the least-significant 0-999 'digit' at offset 0.*/
+
+ /* Decode the declets. After extracting each one, it is decoded */
+ /* to binary using a table lookup. */
+ #if DECPMAX==7
+ #define GETCOEFFTHOU(df, buf) { \
+ uInt sourhi=DFWORD(df, 0); \
+ (buf)[0]=DPD2BIN[sourhi&0x3ff]; \
+ (buf)[1]=DPD2BIN[(sourhi>>10)&0x3ff]; \
+ (buf)[2]=DECCOMBMSD[sourhi>>26];}
+
+ #elif DECPMAX==16
+ #define GETCOEFFTHOU(df, buf) { \
+ uInt sourhi, sourlo; \
+ sourlo=DFWORD(df, 1); \
+ (buf)[0]=DPD2BIN[sourlo&0x3ff]; \
+ (buf)[1]=DPD2BIN[(sourlo>>10)&0x3ff]; \
+ (buf)[2]=DPD2BIN[(sourlo>>20)&0x3ff]; \
+ sourhi=DFWORD(df, 0); \
+ (buf)[3]=DPD2BIN[((sourhi<<2) | (sourlo>>30))&0x3ff]; \
+ (buf)[4]=DPD2BIN[(sourhi>>8)&0x3ff]; \
+ (buf)[5]=DECCOMBMSD[sourhi>>26];}
+
+ #elif DECPMAX==34
+ #define GETCOEFFTHOU(df, buf) { \
+ uInt sourhi, sourmh, sourml, sourlo; \
+ sourlo=DFWORD(df, 3); \
+ (buf)[0]=DPD2BIN[sourlo&0x3ff]; \
+ (buf)[1]=DPD2BIN[(sourlo>>10)&0x3ff]; \
+ (buf)[2]=DPD2BIN[(sourlo>>20)&0x3ff]; \
+ sourml=DFWORD(df, 2); \
+ (buf)[3]=DPD2BIN[((sourml<<2) | (sourlo>>30))&0x3ff]; \
+ (buf)[4]=DPD2BIN[(sourml>>8)&0x3ff]; \
+ (buf)[5]=DPD2BIN[(sourml>>18)&0x3ff]; \
+ sourmh=DFWORD(df, 1); \
+ (buf)[6]=DPD2BIN[((sourmh<<4) | (sourml>>28))&0x3ff]; \
+ (buf)[7]=DPD2BIN[(sourmh>>6)&0x3ff]; \
+ (buf)[8]=DPD2BIN[(sourmh>>16)&0x3ff]; \
+ sourhi=DFWORD(df, 0); \
+ (buf)[9]=DPD2BIN[((sourhi<<6) | (sourmh>>26))&0x3ff]; \
+ (buf)[10]=DPD2BIN[(sourhi>>4)&0x3ff]; \
+ (buf)[11]=DECCOMBMSD[sourhi>>26];}
+ #endif
+
+
+ /* Macros to decode the coefficient in a finite decFloat *df and */
+ /* add to a base-thousand uInt array (as for GETCOEFFTHOU). */
+  /* After the addition the most significant 'digit' in the array  */
+  /* might have a value larger than 10 (with a maximum of 19).     */
+ #if DECPMAX==7
+ #define ADDCOEFFTHOU(df, buf) { \
+ uInt sourhi=DFWORD(df, 0); \
+ (buf)[0]+=DPD2BIN[sourhi&0x3ff]; \
+ if (buf[0]>999) {buf[0]-=1000; buf[1]++;} \
+ (buf)[1]+=DPD2BIN[(sourhi>>10)&0x3ff]; \
+ if (buf[1]>999) {buf[1]-=1000; buf[2]++;} \
+ (buf)[2]+=DECCOMBMSD[sourhi>>26];}
+
+ #elif DECPMAX==16
+ #define ADDCOEFFTHOU(df, buf) { \
+ uInt sourhi, sourlo; \
+ sourlo=DFWORD(df, 1); \
+ (buf)[0]+=DPD2BIN[sourlo&0x3ff]; \
+ if (buf[0]>999) {buf[0]-=1000; buf[1]++;} \
+ (buf)[1]+=DPD2BIN[(sourlo>>10)&0x3ff]; \
+ if (buf[1]>999) {buf[1]-=1000; buf[2]++;} \
+ (buf)[2]+=DPD2BIN[(sourlo>>20)&0x3ff]; \
+ if (buf[2]>999) {buf[2]-=1000; buf[3]++;} \
+ sourhi=DFWORD(df, 0); \
+ (buf)[3]+=DPD2BIN[((sourhi<<2) | (sourlo>>30))&0x3ff]; \
+ if (buf[3]>999) {buf[3]-=1000; buf[4]++;} \
+ (buf)[4]+=DPD2BIN[(sourhi>>8)&0x3ff]; \
+ if (buf[4]>999) {buf[4]-=1000; buf[5]++;} \
+ (buf)[5]+=DECCOMBMSD[sourhi>>26];}
+
+ #elif DECPMAX==34
+ #define ADDCOEFFTHOU(df, buf) { \
+ uInt sourhi, sourmh, sourml, sourlo; \
+ sourlo=DFWORD(df, 3); \
+ (buf)[0]+=DPD2BIN[sourlo&0x3ff]; \
+ if (buf[0]>999) {buf[0]-=1000; buf[1]++;} \
+ (buf)[1]+=DPD2BIN[(sourlo>>10)&0x3ff]; \
+ if (buf[1]>999) {buf[1]-=1000; buf[2]++;} \
+ (buf)[2]+=DPD2BIN[(sourlo>>20)&0x3ff]; \
+ if (buf[2]>999) {buf[2]-=1000; buf[3]++;} \
+ sourml=DFWORD(df, 2); \
+ (buf)[3]+=DPD2BIN[((sourml<<2) | (sourlo>>30))&0x3ff]; \
+ if (buf[3]>999) {buf[3]-=1000; buf[4]++;} \
+ (buf)[4]+=DPD2BIN[(sourml>>8)&0x3ff]; \
+ if (buf[4]>999) {buf[4]-=1000; buf[5]++;} \
+ (buf)[5]+=DPD2BIN[(sourml>>18)&0x3ff]; \
+ if (buf[5]>999) {buf[5]-=1000; buf[6]++;} \
+ sourmh=DFWORD(df, 1); \
+ (buf)[6]+=DPD2BIN[((sourmh<<4) | (sourml>>28))&0x3ff]; \
+ if (buf[6]>999) {buf[6]-=1000; buf[7]++;} \
+ (buf)[7]+=DPD2BIN[(sourmh>>6)&0x3ff]; \
+ if (buf[7]>999) {buf[7]-=1000; buf[8]++;} \
+ (buf)[8]+=DPD2BIN[(sourmh>>16)&0x3ff]; \
+ if (buf[8]>999) {buf[8]-=1000; buf[9]++;} \
+ sourhi=DFWORD(df, 0); \
+ (buf)[9]+=DPD2BIN[((sourhi<<6) | (sourmh>>26))&0x3ff]; \
+ if (buf[9]>999) {buf[9]-=1000; buf[10]++;} \
+ (buf)[10]+=DPD2BIN[(sourhi>>4)&0x3ff]; \
+ if (buf[10]>999) {buf[10]-=1000; buf[11]++;} \
+ (buf)[11]+=DECCOMBMSD[sourhi>>26];}
+ #endif
+
+
+ /* Set a decFloat to the maximum positive finite number (Nmax) */
+ #if DECPMAX==7
+ #define DFSETNMAX(df) \
+ {DFWORD(df, 0)=0x77f3fcff;}
+ #elif DECPMAX==16
+ #define DFSETNMAX(df) \
+ {DFWORD(df, 0)=0x77fcff3f; \
+ DFWORD(df, 1)=0xcff3fcff;}
+ #elif DECPMAX==34
+ #define DFSETNMAX(df) \
+ {DFWORD(df, 0)=0x77ffcff3; \
+ DFWORD(df, 1)=0xfcff3fcf; \
+ DFWORD(df, 2)=0xf3fcff3f; \
+ DFWORD(df, 3)=0xcff3fcff;}
+ #endif
+
+ /* [end of format-dependent macros and constants] */
+ #endif
+
+#else
+ #error decNumberLocal included more than once
+#endif
diff --git a/source/luametatex/source/libraries/hnj/hnjhyphen.c b/source/luametatex/source/libraries/hnj/hnjhyphen.c
new file mode 100644
index 000000000..ad9d87683
--- /dev/null
+++ b/source/luametatex/source/libraries/hnj/hnjhyphen.c
@@ -0,0 +1,627 @@
+/*
+ See license.txt in the root of this project.
+*/
+
+/*
+
+    This file is derived from libhnj, which is dual licensed under LGPL and MPL. Boilerplate
+ for both licenses follows.
+
+ LibHnj - a library for high quality hyphenation and justification
+
+ (C) 1998 Raph Levien,
+ (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
+ (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
+
+ This library is free software; you can redistribute it and/or modify it under the terms of the
+ GNU Library General Public License as published by the Free Software Foundation; either version
+ 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ the GNU Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public License along with this
+ library; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307 USA.
+
+ The contents of this file are subject to the Mozilla Public License Version 1.0 (the "MPL");
+ you may not use this file except in compliance with the MPL. You may obtain a copy of the MPL
+ at http://www.mozilla.org/MPL/
+
+ Software distributed under the MPL is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ KIND, either express or implied. See the MPL for the specific language governing rights and
+ limitations under the MPL.
+
+    Remark: I'm not sure whether anything fundamental was changed when this library was adapted
+    for use in LuaTeX. However, for instance, error reporting has been hooked into the Lua(Meta)TeX
+    error reporting mechanisms, and a bit of reformatting was done. This module won't change.
+    The code has also been adapted a little in order to fit in with the rest (function names etc.)
+    because it is more exposed. We use the alternative memory allocator.
+
+*/
+
+/*tex We need the warning subsystem, so: */
+
+# include "luametatex.h"
+
+/*tex A few helpers (from |hnjalloc|): */
+
+static void *hnj_malloc(int size)
+{
+ void *p = lmt_memory_malloc((size_t) size);
+ if (! p) {
+ tex_formatted_error("hyphenation", "allocating %d bytes failed\n", size);
+ }
+ return p;
+}
+
+static void *hnj_realloc(void *p, int size)
+{
+ void *n = lmt_memory_realloc(p, (size_t) size);
+ if (! n) {
+ tex_formatted_error("hyphenation", "reallocating %d bytes failed\n", size);
+ }
+ return n;
+}
+
+static void hnj_free(void *p)
+{
+ lmt_memory_free(p);
+}
+
+static unsigned char *hnj_strdup(const unsigned char *s)
+{
+ size_t l = strlen((const char *) s);
+ unsigned char *n = hnj_malloc((int) l + 1);
+ memcpy(n, s, l);
+ n[l] = 0;
+ return n;
+}
+
+/*tex
+
+    Combine two right-aligned number patterns: 04000 + 020 becomes 04020. This also works for
+    utf8 sequences, because the substring is identical to the last |strlen(subexpr)| bytes of
+    |expr| except for the (single byte) hyphenation encoders.
+
+*/
+
+static char *combine_patterns(char *expr, const char *subexpr)
+{
+ size_t l1 = strlen(expr);
+ size_t l2 = strlen(subexpr);
+ size_t off = l1 - l2;
+ for (unsigned j = 0; j < l2; j++) {
+ if (expr[off + j] < subexpr[j]) {
+ expr[off + j] = subexpr[j];
+ }
+ }
+ return expr;
+}
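+
+/*tex
+
+    A hedged usage sketch (the values are illustrative only):
+
+    \starttyping
+    char expr[] = "04000";
+    combine_patterns(expr, "020");
+    \stoptyping
+
+    after which |expr| holds |04020|: each position keeps the larger of the two digits.
+
+*/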
+
+/*tex Some original code: */
+
+static hjn_hashiterator *new_hashiterator(hjn_hashtable *h)
+{
+ hjn_hashiterator *i = hnj_malloc(sizeof(hjn_hashiterator));
+ i->e = h->entries;
+ i->cur = NULL;
+ i->ndx = -1;
+ return i;
+}
+
+static int nexthashstealpattern(hjn_hashiterator *i, unsigned char **word, char **pattern)
+{
+ while (i->cur == NULL) {
+ if (i->ndx >= HNJ_HASH_SIZE - 1) {
+ return 0;
+ } else {
+ i->cur = i->e[++i->ndx];
+ }
+ }
+ *word = i->cur->key;
+ *pattern = i->cur->u.hyppat;
+ i->cur->u.hyppat = NULL;
+ i->cur = i->cur->next;
+ return 1;
+}
+
+static int nexthash(hjn_hashiterator *i, unsigned char **word)
+{
+ while (! i->cur) {
+ if (i->ndx >= HNJ_HASH_SIZE - 1) {
+ return 0;
+ } else {
+ i->cur = i->e[++i->ndx];
+ }
+ }
+ *word = i->cur->key;
+ i->cur = i->cur->next;
+ return 1;
+}
+
+static int eachhash(hjn_hashiterator *i, unsigned char **word, char **pattern)
+{
+ while (! i->cur) {
+ if (i->ndx >= HNJ_HASH_SIZE - 1) {
+ return 0;
+ } else {
+ i->cur = i->e[++i->ndx];
+ }
+ }
+ *word = i->cur->key;
+ *pattern = i->cur->u.hyppat;
+ i->cur = i->cur->next;
+ return 1;
+}
+
+static void delete_hashiterator(hjn_hashiterator *i)
+{
+ hnj_free(i);
+}
+
+/*tex A |char*| hash function from ASU, adapted from |Gtk+|: */
+
+static unsigned int string_hash(const unsigned char *s)
+{
+ const unsigned char *p = s;
+ unsigned int h = 0, g;
+ for (; *p != '\0'; p += 1) {
+ h = (h << 4) + *p;
+ g = h & 0xf0000000;
+ if (g) {
+ h = h ^ (g >> 24);
+ h = h ^ g;
+ }
+ }
+ return h;
+}
+
+/*tex This assumes that key is not already present! */
+
+static void state_insert(hjn_hashtable *hashtab, unsigned char *key, int state)
+{
+ int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
+ hjn_hashentry* e = hnj_malloc(sizeof(hjn_hashentry));
+ e->next = hashtab->entries[i];
+ e->key = key;
+ e->u.state = state;
+ hashtab->entries[i] = e;
+}
+
+/*tex Unlike |state_insert|, this handles a key that is already present: the old pattern is replaced (with a warning when tracing). */
+
+static void insert_pattern(hjn_hashtable *hashtab, unsigned char *key, char *hyppat, int trace)
+{
+ hjn_hashentry *e;
+ int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
+ for (e = hashtab->entries[i]; e; e = e->next) {
+ if (strcmp((char *) e->key, (char *) key) == 0) {
+ if (e->u.hyppat) {
+ if (trace && hyppat && strcmp((char *) e->u.hyppat, (char *) hyppat) != 0) {
+ tex_formatted_warning("hyphenation", "a conflicting pattern '%s' has been ignored", hyppat);
+ }
+ hnj_free(e->u.hyppat);
+ }
+ e->u.hyppat = hyppat;
+ hnj_free(key);
+ return;
+ }
+ }
+ e = hnj_malloc(sizeof(hjn_hashentry));
+ e->next = hashtab->entries[i];
+ e->key = key;
+ e->u.hyppat = hyppat;
+ hashtab->entries[i] = e;
+}
+
+/*tex We return |state| if found, otherwise |-1|. */
+
+static int state_lookup(hjn_hashtable *hashtab, const unsigned char *key)
+{
+ int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
+ for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) {
+ if (! strcmp((const char *) key, (const char *) e->key)) {
+ return e->u.state;
+ }
+ }
+ return -1;
+}
+
+/*tex We return the pattern if found, otherwise |NULL|. The 256 byte key buffer should be enough. */
+
+static char *lookup_pattern(hjn_hashtable * hashtab, const unsigned char *chars, int l)
+{
+ int i;
+ unsigned char key[256];
+ strncpy((char *) key, (const char *) chars, (size_t) l);
+ key[l] = 0;
+ i = (int) (string_hash(key) % HNJ_HASH_SIZE);
+ for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) {
+ if (! strcmp((char *) key, (char *) e->key)) {
+ return e->u.hyppat;
+ }
+ }
+ return NULL;
+}
+
+/*tex Get the state number, allocating a new state if necessary. */
+
+static int hnj_get_state(hjn_dictionary *dict, const unsigned char *str, int *state_num)
+{
+ *state_num = state_lookup(dict->state_num, str);
+ if (*state_num >= 0) {
+ return *state_num;
+ } else {
+ state_insert(dict->state_num, hnj_strdup(str), dict->num_states);
+ /*tex The predicate is true if |dict->num_states| is a power of two: */
+ if (! (dict->num_states & (dict->num_states - 1))) {
+ dict->states = hnj_realloc(dict->states, (int) ((dict->num_states << 1) * (int) sizeof(hjn_state)));
+ }
+ dict->states[dict->num_states] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL };
+ return dict->num_states++;
+ }
+}
+
+/*tex
+
+ Add a transition from state1 to state2 through ch - assumes that the transition does not
+ already exist.
+
+*/
+
+static void hnj_add_trans(hjn_dictionary *dict, int state1, int state2, int chr)
+{
+ /*tex
+
+ This test was a bit too strict, it is quite normal for old patterns to have chars in the
+ range 0-31 or 127-159 (inclusive). To ease the transition, let's only disallow |nul| for
+ now, which probably is a requirement of the code anyway.
+
+ */
+ if (chr) {
+ int num_trans = dict->states[state1].num_trans;
+ if (num_trans == 0) {
+ dict->states[state1].trans = hnj_malloc(sizeof(hjn_transition));
+ } else {
+ /*tex
+
+ The old version did:
+
+ \starttyping
+ } else if (!(num_trans & (num_trans - 1))) {
+ ... = hnj_realloc(dict->states[state1].trans,
+ (int) ((num_trans << 1) * sizeof(HyphenTrans)));
+ \stoptyping
+
+            but that is incredibly nasty when adding patterns one at a time. Controlled growth
+            would be nicer than the current +1, but no one complained in a decade, so this is
+            good enough.
+
+ */
+ dict->states[state1].trans = hnj_realloc(dict->states[state1].trans, (int) ((num_trans + 1) * sizeof(hjn_transition)));
+ }
+ dict->states[state1].trans[num_trans].uni_ch = chr;
+ dict->states[state1].trans[num_trans].new_state = state2;
+ dict->states[state1].num_trans++;
+ } else {
+        tex_normal_error("hyphenation","a nul character is not permitted");
+ }
+}
+
+/*tex
+
+ We did change the semantics a bit here: |hnj_hyphen_load| used to operate on a file, but now
+ the argument is a string buffer.
+
+*/
+
+/* define tex_isspace(c) (c == ' ' || c == '\t') */
+# define tex_isspace(c) (c == ' ')
+
+static const unsigned char *next_pattern(size_t* length, const unsigned char** buf)
+{
+ const unsigned char *here, *rover = *buf;
+ while (*rover && tex_isspace(*rover)) {
+ rover++;
+ }
+ here = rover;
+ while (*rover) {
+ if (tex_isspace(*rover)) {
+ *length = (size_t) (rover - here);
+ *buf = rover;
+ return here;
+ } else {
+ rover++;
+ }
+ }
+ *length = (size_t) (rover - here);
+ *buf = rover;
+ return *length ? here : NULL;
+}
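+
+/*tex
+
+    A hedged usage sketch (the buffer content is illustrative only): each call returns a
+    pointer to the next space-separated pattern together with its byte length, until |NULL|:
+
+    \starttyping
+    const unsigned char *buf = (const unsigned char *) "a1b 1c2";
+    size_t l = 0;
+    const unsigned char *p;
+    while ((p = next_pattern(&l, &buf)) != NULL) {
+        printf("%.*s\n", (int) l, p);
+    }
+    \stoptyping
+
+*/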
+
+static void init_hash(hjn_hashtable **h)
+{
+ if (! *h) {
+ *h = hnj_malloc(sizeof(hjn_hashtable));
+ for (int i = 0; i < HNJ_HASH_SIZE; i++) {
+ (*h)->entries[i] = NULL;
+ }
+ }
+}
+
+static void clear_state_hash(hjn_hashtable **h)
+{
+ if (*h) {
+ for (int i = 0; i < HNJ_HASH_SIZE; i++) {
+ hjn_hashentry *e, *next;
+ for (e = (*h)->entries[i]; e; e = next) {
+ next = e->next;
+ hnj_free(e->key);
+ hnj_free(e);
+ }
+ }
+ hnj_free(*h);
+ *h = NULL;
+ }
+}
+
+static void clear_pattern_hash(hjn_hashtable **h)
+{
+ if (*h) {
+ for (int i = 0; i < HNJ_HASH_SIZE; i++) {
+ hjn_hashentry *e, *next;
+ for (e = (*h)->entries[i]; e; e = next) {
+ next = e->next;
+ hnj_free(e->key);
+ if (e->u.hyppat) {
+ hnj_free(e->u.hyppat);
+ }
+ hnj_free(e);
+ }
+ }
+ hnj_free(*h);
+ *h = NULL;
+ }
+}
+
+static void init_dictionary(hjn_dictionary *dict)
+{
+ dict->num_states = 1;
+ dict->pat_length = 0;
+ dict->states = hnj_malloc(sizeof(hjn_state));
+ dict->states[0] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL };
+ dict->patterns = NULL;
+ dict->merged = NULL;
+ dict->state_num = NULL;
+ init_hash(&dict->patterns);
+}
+
+static void clear_dictionary(hjn_dictionary *dict)
+{
+ for (int state_num = 0; state_num < dict->num_states; state_num++) {
+ hjn_state *hstate = &dict->states[state_num];
+ if (hstate->match) {
+ hnj_free(hstate->match);
+ }
+ if (hstate->trans) {
+ hnj_free(hstate->trans);
+ }
+ }
+ hnj_free(dict->states);
+ clear_pattern_hash(&dict->patterns);
+ clear_pattern_hash(&dict->merged);
+ clear_state_hash(&dict->state_num);
+}
+
+hjn_dictionary *hnj_dictionary_new(void)
+{
+ hjn_dictionary *dict = hnj_malloc(sizeof(hjn_dictionary));
+ init_dictionary(dict);
+ return dict;
+}
+
+void hnj_dictionary_clear(hjn_dictionary *dict)
+{
+ clear_dictionary(dict);
+ init_dictionary(dict);
+}
+
+void hnj_dictionary_free(hjn_dictionary *dict)
+{
+ clear_dictionary(dict);
+ hnj_free(dict);
+}
+
+unsigned char *hnj_dictionary_tostring(hjn_dictionary *dict)
+{
+ unsigned char *word;
+ char *pattern;
+ unsigned char *buf = hnj_malloc(dict->pat_length);
+ unsigned char *cur = buf;
+ hjn_hashiterator *v = new_hashiterator(dict->patterns);
+ while (eachhash(v, &word, &pattern)) {
+ int i = 0;
+ int e = 0;
+ while (word[e + i]) {
+ if (pattern[i] != '0') {
+ *cur++ = (unsigned char) pattern[i];
+ }
+ *cur++ = word[e + i++];
+ while (is_utf8_follow(word[e + i])) {
+ *cur++ = word[i + e++];
+ }
+ }
+ if (pattern[i] != '0') {
+ *cur++ = (unsigned char) pattern[i];
+ }
+ *cur++ = ' ';
+ }
+ delete_hashiterator(v);
+ *cur = 0;
+ return buf;
+}
+
+/*tex
+
+ In hyphenation patterns we use signed bytes where |0|, or actually any negative number,
+ indicates end:
+
+ \starttyping
+ prio(1+),startpos,length,len1,[replace],len2,[replace]
+    \stoptyping
+
+ A basic example is:
+
+ \starttyping
+ p n 0 0 0
+    \stoptyping
+
+ for a hyphenation point between characters.
+
+*/
+
+void hnj_dictionary_load(hjn_dictionary *dict, const unsigned char *f, int trace)
+{
+ int state_num, last_state;
+ int ch;
+ int found;
+ hjn_hashiterator *v;
+ unsigned char *word;
+ char *pattern;
+ size_t l = 0;
+ const unsigned char *format;
+ const unsigned char *begin = f;
+ while ((format = next_pattern(&l, &f)) != NULL) {
+ if (l > 0 && l < 255) {
+ int i, j, e1;
+ for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
+ if (format[i] >= '0' && format[i] <= '9') {
+ j++;
+ }
+ if (is_utf8_follow(format[i])) {
+ e1++;
+ }
+ }
+ /*tex
+ Here |l-e1| is the number of {\em characters} not {\em bytes}, |l-j| the number of
+ pattern bytes and |l-e1-j| the number of pattern characters.
+ */
+ {
+ unsigned char *pat = (unsigned char *) hnj_malloc((1 + (int) l - j));
+ char *org = (char *) hnj_malloc(2 + (int) l - e1 - j);
+ /*tex Remove hyphenation encoders (digits) from pat. */
+ org[0] = '0';
+ for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
+ unsigned char c = format[i];
+ if (is_utf8_follow(c)) {
+ pat[j + e1++] = c;
+ } else if (c < '0' || c > '9') {
+ pat[e1 + j++] = c;
+ org[j] = '0';
+ } else {
+ org[j] = (char) c;
+ }
+ }
+ pat[e1 + j] = 0;
+ org[j + 1] = 0;
+ insert_pattern(dict->patterns, pat, org, trace);
+ }
+ } else {
+ tex_normal_warning("hyphenation", "a pattern of more than 254 bytes is ignored");
+ }
+ }
+ /*tex We add 2 bytes for spurious spaces. */
+ dict->pat_length += (int) ((f - begin) + 2);
+ init_hash(&dict->merged);
+ v = new_hashiterator(dict->patterns);
+ while (nexthash(v, &word)) {
+ int wordsize = (int) strlen((char *) word);
+ for (int l1 = 1; l1 <= wordsize; l1++) {
+ if (is_utf8_follow(word[l1])) {
+                /*tex Do not clip a utf8 sequence. */
+ } else {
+ for (int j1 = 1; j1 <= l1; j1++) {
+ int i1 = l1 - j1;
+ if (is_utf8_follow(word[i1])) {
+                        /*tex Do not start halfway through a utf8 sequence. */
+ } else {
+ char *subpat_pat = lookup_pattern(dict->patterns, word + i1, j1);
+ if (subpat_pat) {
+ char *newpat_pat = lookup_pattern(dict->merged, word, l1);
+ if (! newpat_pat) {
+ char *neworg;
+ unsigned char *newword = (unsigned char *) hnj_malloc((size_t) (l1 + 1));
+ int e1 = 0;
+ strncpy((char *) newword, (char *) word, (size_t) l1);
+ newword[l1] = 0;
+ for (i1 = 0; i1 < l1; i1++) {
+ if (is_utf8_follow(newword[i1])) {
+ e1++;
+ }
+ }
+ neworg = hnj_malloc((size_t) (l1 + 2 - e1));
+ /*tex Fill with right amount of zeros: */
+ sprintf(neworg, "%0*d", l1 + 1 - e1, 0);
+ insert_pattern(dict->merged, newword, combine_patterns(neworg, subpat_pat), trace);
+ } else {
+ combine_patterns(newpat_pat, subpat_pat);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ delete_hashiterator(v);
+ init_hash(&dict->state_num);
+ state_insert(dict->state_num, hnj_strdup((const unsigned char *) ""), 0);
+ v = new_hashiterator(dict->merged);
+ while (nexthashstealpattern(v, &word, &pattern)) {
+ static unsigned char mask[] = { 0x3F, 0x1F, 0xF, 0x7 };
+ int j1 = (int) strlen((char *) word);
+ state_num = hnj_get_state(dict, word, &found);
+ dict->states[state_num].match = pattern;
+ /*tex Now, put in the prefix transitions. */
+ while (found < 0) {
+ j1--;
+ last_state = state_num;
+ ch = word[j1];
+ if (ch >= 0x80) { /* why not is_utf8_follow(ch) here */
+ int m;
+ int i1 = 1;
+ while (is_utf8_follow(word[j1 - i1])) {
+ i1++;
+ }
+ ch = word[j1 - i1] & mask[i1];
+ m = j1 - i1;
+ while (i1--) {
+ ch = (ch << 6) + (0x3F & word[j1 - i1]);
+ }
+ j1 = m;
+ }
+ word[j1] = '\0';
+ state_num = hnj_get_state(dict, word, &found);
+ hnj_add_trans(dict, state_num, last_state, ch);
+ }
+ }
+ delete_hashiterator(v);
+ clear_pattern_hash(&dict->merged);
+ /*tex Put in the fallback states. */
+ for (int i = 0; i < HNJ_HASH_SIZE; i++) {
+ for (hjn_hashentry *e = dict->state_num->entries[i]; e; e = e->next) {
+ /*tex Do not do |state == 0| otherwise things get confused. */
+ if (e->u.state) {
+ for (int j = 1; 1; j++) {
+ state_num = state_lookup(dict->state_num, e->key + j);
+ if (state_num >= 0) {
+ break;
+ }
+ }
+ dict->states[e->u.state].fallback_state = state_num;
+ }
+ }
+ }
+ clear_state_hash(&dict->state_num);
+}
diff --git a/source/luametatex/source/libraries/hnj/hnjhyphen.h b/source/luametatex/source/libraries/hnj/hnjhyphen.h
new file mode 100644
index 000000000..1f176f3e9
--- /dev/null
+++ b/source/luametatex/source/libraries/hnj/hnjhyphen.h
@@ -0,0 +1,123 @@
+/*
+ See license.txt in the root of this project.
+*/
+
+/*
+
+    The code is derived from LibHnj, which is dual licensed under LGPL and MPL. Boilerplate for
+ both licenses follows.
+
+*/
+
+/*
+
+ LibHnj - a library for high quality hyphenation and justification
+
+ Copyright (C) 1998 Raph Levien, (C) 2001 ALTLinux, Moscow
+
+ This library is free software; you can redistribute it and/or modify it under the terms of the
+ GNU Library General Public License as published by the Free Software Foundation; either version
+ 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ the GNU Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public License along with this
+ library; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307 USA.
+
+*/
+
+/*
+ The contents of this file are subject to the Mozilla Public License Version 1.0 (the "MPL");
+ you may not use this file except in compliance with the MPL. You may obtain a copy of the MPL
+ at http://www.mozilla.org/MPL/
+
+ Software distributed under the MPL is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ KIND, either express or implied. See the MPL for the specific language governing rights and
+ limitations under the MPL.
+
+ */
+
+# ifndef LMT_HNJHYPHEN_H
+# define LMT_HNJHYPHEN_H
+
+/*tex
+
+ First some type definitions and a little bit of a hash table implementation. This simply maps
+ strings to state numbers. In \LUATEX\ we have node related code in |hnjhyphen.c| but in
+ \LUAMETATEX\ we moved that to |texlanguage.c| so we need to make some type definitions public.
+
+*/
+
+# define HNJ_MAXPATHS 40960
+# define HNJ_HASH_SIZE 31627
+# define HNJ_MAX_CHARS 256
+# define HNJ_MAX_NAME 24
+
+typedef struct _hjn_hashtable hjn_hashtable;
+typedef struct _hjn_hashentry hjn_hashentry;
+typedef struct _hjn_hashiterator hjn_hashiterator;
+typedef union _hjn_hashvalue hjn_hashvalue;
+
+/*tex A cheap, but effective, hack. */
+
+struct _hjn_hashtable {
+ hjn_hashentry *entries[HNJ_HASH_SIZE];
+};
+
+union _hjn_hashvalue {
+ char *hyppat;
+ int state;
+ int padding;
+};
+
+struct _hjn_hashentry {
+ hjn_hashentry *next;
+ unsigned char *key;
+ hjn_hashvalue u;
+};
+
+struct _hjn_hashiterator {
+ hjn_hashentry **e;
+ hjn_hashentry *cur;
+ int ndx;
+ int padding;
+};
+
+/*tex The state machine. */
+
+typedef struct _hjn_transition hjn_transition;
+typedef struct _hjn_state hjn_state;
+typedef struct _hjn_dictionary hjn_dictionary;
+
+struct _hjn_transition {
+ int uni_ch;
+ int new_state;
+};
+
+struct _hjn_state {
+ char *match;
+ int fallback_state;
+ int num_trans;
+ hjn_transition *trans;
+};
+
+struct _hjn_dictionary {
+ int num_states;
+ int pat_length;
+ char cset[HNJ_MAX_NAME];
+ hjn_state *states;
+ hjn_hashtable *patterns;
+ hjn_hashtable *merged;
+ hjn_hashtable *state_num;
+};
+
+extern hjn_dictionary *hnj_dictionary_new (void);
+extern void hnj_dictionary_load (hjn_dictionary *dict, const unsigned char *fn, int trace);
+extern void hnj_dictionary_free (hjn_dictionary *dict);
+extern void hnj_dictionary_clear (hjn_dictionary *dict);
+extern unsigned char *hnj_dictionary_tostring (hjn_dictionary *dict);
+
+# endif
diff --git a/source/luametatex/source/libraries/libcerf/CHANGELOG b/source/luametatex/source/libraries/libcerf/CHANGELOG
new file mode 100644
index 000000000..9ac940088
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/CHANGELOG
@@ -0,0 +1,118 @@
+== Revision history of libcerf, maintained by Joachim Wuttke ==
+
+Homepage moved to https://jugit.fz-juelich.de/mlz/libcerf, 17mar19
+
+libcerf-1.13, released 28feb19:
+ - Further adjustments for compilation under Windows
+
+libcerf-1.12, released 7feb19:
+  - Require CMake 3.6, comment out code that requires 3.13.
+ - Relative paths in CMake sources, for use as subproject.
+  - When compiling as C++, #include <complex>, not <complex.h>;
+ revise the entire C-vs-CPP machinery.
+ - Remove tests with different inf or nan results on different systems or under
+ different compilers.
+
+libcerf-1.11, released 28dec18:
+ - Cover voigt by test_voigt.
+ - Implement new function voigt_hwhm.
+ - Restore libcerf.pc.
+ - Add INSTALL instructions, and other minor adjustments for use of libcerf in C++ projects.
+ - Support 'ctest', which runs the numeric accuracy tests from test1.c.
+ - Rename type cmplx into _cerf_cmplx to avoid name clash with Gnuplot pre 5.3.
+
+libcerf-1.8 [2oct18], libcerf-1.9 [16oct18] and libcerf-1.10 [20dec18]
+ MUST NOT BE USED
+ - A bug introduced in v1.8 had broken the normalization of the Voigt function.
+ - The git history leading to v1.10 has been rewritten, starting anew from v1.7
+
+libcerf-1.7, released 26sep18:
+  - Option -DCERF_CPP allows choosing C++ compilation, which is useful
+ because MS VisualStudio supports C++14, but not yet C99, and in
+ particular does not support complex.h under C.
+
+libcerf-1.6, released 20sep18:
+ - Migrated from automake to CMake.
+ - Corrected typos in man pages.
+
+libcerf-1.5, released 12oct16:
+ - Removed unused inline function (detected by clang-1.3., reported by Luke Benes)
+
+libcerf-1.4, released 27aug14:
+ - HTML version of man pages no longer installs to man/html.
+ - More concise man pages.
+ - Delete a few unused include's.
+ - Autotools script corrected (suggestions by Christoph Junghans).
+
+libcerf-1.3, released 17jul13:
+ - Now supporting pkg-config (suggested by Mojca Miklavec).
+
+libcerf-1.2, released 16jul13:
+ - Test programs no longer install to $bindir (reported by Mojca Miklavec).
+
+libcerf-1.1, released 12may13:
+ - Added Fortran binding by Antonio Cervellino.
+
+libcerf-1.0, released 31jan13 by Joachim Wuttke:
+ - Based on http://ab-initio.mit.edu/Faddeeva as of 28jan13.
+ - Verified accuracy using double-exponential transform.
+ - Simplified function names;
+ use leading 'c' for complex functions (except in w_of_z).
+ - Added function voigt(x,sigma,gamma).
+ - Added configure.ac, Makefile.am &c to allow for autotools standard
+ installation (commands ./configure, make, sudo make install).
+  - Split source code into directories lib/ and test/.
+ - Eliminated unused alternate code (!USE_CONTINUED_FRACTION).
+ - Eliminated relerr arguments.
+ - Replaced "complex" by "_Complex" for C++ compatibility.
+ - Wrote man pages w_of_z(3), dawson(3), voigt(3), cerf(3), erfcx(3), erfi(3).
+ - Created project home page http://apps.jcns.fz-juelich.de/libcerf.
+ - Registered project "libcerf" at sourceforge.net.
+
+== Revision history of Faddeeva.cc by Steven G. Johnson ==
+
+Project at http://ab-initio.mit.edu/Faddeeva
+
+ 4 October 2012: Initial public release (SGJ)
+ 5 October 2012: Revised (SGJ) to fix spelling error,
+ start summation for large x at round(x/a) (> 1)
+ rather than ceil(x/a) as in the original
+ paper, which should slightly improve performance
+ (and, apparently, slightly improves accuracy)
+ 19 October 2012: Revised (SGJ) to fix bugs for large x, large -y,
+ and 15<x<26. Performance improvements. Prototype
+ now supplies default value for relerr.
+ 24 October 2012: Switch to continued-fraction expansion for
+ sufficiently large z, for performance reasons.
+ Also, avoid spurious overflow for |z| > 1e154.
+ Set relerr argument to min(relerr,0.1).
+ 27 October 2012: Enhance accuracy in Re[w(z)] taken by itself,
+ by switching to Alg. 916 in a region near
+ the real-z axis where continued fractions
+ have poor relative accuracy in Re[w(z)]. Thanks
+ to M. Zaghloul for the tip.
+ 29 October 2012: Replace SLATEC-derived erfcx routine with
+ completely rewritten code by me, using a very
+ different algorithm which is much faster.
+ 30 October 2012: Implemented special-case code for real z
+ (where real part is exp(-x^2) and imag part is
+ Dawson integral), using algorithm similar to erfx.
+ Export ImFaddeeva_w function to make Dawson's
+ integral directly accessible.
+ 3 November 2012: Provide implementations of erf, erfc, erfcx,
+ and Dawson functions in Faddeeva:: namespace,
+ in addition to Faddeeva::w. Provide header
+ file Faddeeva.hh.
+ 4 November 2012: Slightly faster erf for real arguments.
+ Updated MATLAB and Octave plugins.
+27 November 2012: Support compilation with either C++ or
+ plain C (using C99 complex numbers).
+ For real x, use standard-library erf(x)
+ and erfc(x) if available (for C99 or C++11).
+ #include "config.h" if HAVE_CONFIG_H is #defined.
+15 December 2012: Portability fixes (copysign, Inf/NaN creation),
+ use CMPLX/__builtin_complex if available in C,
+ slight accuracy improvements to erf and dawson
+ functions near the origin. Use gnulib functions
+ if GNULIB_NAMESPACE is defined.
+18 December 2012: Slight tweaks (remove recomputation of x*x in Dawson)
diff --git a/source/luametatex/source/libraries/libcerf/LICENSE b/source/luametatex/source/libraries/libcerf/LICENSE
new file mode 100644
index 000000000..30979bbd8
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/LICENSE
@@ -0,0 +1,22 @@
+/* Copyright (c) 2012 Massachusetts Institute of Technology
+ * Copyright (c) 2013 Forschungszentrum Jülich GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/source/luametatex/source/libraries/libcerf/README.md b/source/luametatex/source/libraries/libcerf/README.md
new file mode 100644
index 000000000..e825f8f99
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/README.md
@@ -0,0 +1,109 @@
+# libcerf
+
+This is the home page of **libcerf**, a self-contained numeric library that provides an efficient and accurate implementation of complex error functions, along with Dawson, Faddeeva, and Voigt functions.
+
+# User Documentation
+
+## Synopsis
+
+In the following, "complex" stands for the C99 data type "double _Complex":
+
+ * complex [cerf](http://apps.jcns.fz-juelich.de/man/cerf.html) (complex): The complex error function erf(z).
+ * complex [cerfc](http://apps.jcns.fz-juelich.de/man/cerf.html) (complex): The complex complementary error function erfc(z) = 1 - erf(z).
+ * complex [cerfcx](http://apps.jcns.fz-juelich.de/man/erfcx.html) (complex z): The underflow-compensating function erfcx(z) = exp(z^2) erfc(z).
+ * double [erfcx](http://apps.jcns.fz-juelich.de/man/erfcx.html) (double x): The same for real x.
+ * complex [cerfi](http://apps.jcns.fz-juelich.de/man/erfi.html) (complex z): The imaginary error function erfi(z) = -i erf(iz).
+ * double [erfi](http://apps.jcns.fz-juelich.de/man/erfi.html) (double x): The same for real x.
+ * complex [w_of_z](http://apps.jcns.fz-juelich.de/man/w_of_z.html) (complex z): Faddeeva's scaled complex error function w(z) = exp(-z^2) erfc(-iz).
+ * double [im_w_of_x](http://apps.jcns.fz-juelich.de/man/w_of_z.html) (double x): The same for real x, returning the purely imaginary result as a real number.
+ * complex [cdawson](http://apps.jcns.fz-juelich.de/man/dawson.html) (complex z): Dawson's integral D(z) = sqrt(pi)/2 * exp(-z^2) * erfi(z).
+ * double [dawson](http://apps.jcns.fz-juelich.de/man/dawson.html) (double x): The same for real x.
+ * double [voigt](http://apps.jcns.fz-juelich.de/man/voigt.html) (double x, double sigma, double gamma): The convolution of a Gaussian and a Lorentzian.
+ * double [voigt_hwhm](http://apps.jcns.fz-juelich.de/man/voigt_hwhm.html) (double sigma, double gamma): The half width at half maximum of the Voigt profile.
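+
+A minimal usage sketch (a hedged illustration: it assumes the installed header is named `cerf.h` and the library links with `-lcerf`; the calls match the synopsis above):
+
+    #include <stdio.h>
+    #include <cerf.h>   /* assumed install name of the libcerf header */
+
+    int main(void) {
+        printf("erfcx(1)     = %g\n", erfcx(1.0));           /* exp(1) * erfc(1) */
+        printf("dawson(1)    = %g\n", dawson(1.0));          /* Dawson's integral */
+        printf("im_w_of_x(1) = %g\n", im_w_of_x(1.0));       /* Im w(x) for real x */
+        printf("voigt(0,1,1) = %g\n", voigt(0.0, 1.0, 1.0)); /* Voigt profile at its peak */
+        return 0;
+    }
+
+Compile with, for example, `cc example.c -lcerf`.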
+
+## Accuracy
+
+By construction, it is expected that the relative accuracy is generally better than 1E-13. This has been confirmed by comparison with high-precision Maple computations and with a *long double* computation using Fourier transform representation and double-exponential transform.
+
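+One way to spot-check this on a given platform (a sketch; it uses the defining identity erfcx(x) = exp(x^2) erfc(x), which is safe to evaluate naively only for moderate x):
+
+    #include <cerf.h>
+    #include <math.h>
+    #include <stdio.h>
+
+    int main(void) {
+        for (double x = 0.1; x < 5.0; x += 0.7) {
+            double ref = exp(x*x) * erfc(x);  /* naive reference value */
+            printf("x = %4.1f, relative deviation = %.2e\n", x, fabs(erfcx(x) - ref) / ref);
+        }
+        return 0;
+    }
+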
+## Copyright and Citation
+
+Copyright (C) [Steven G. Johnson](http://math.mit.edu/~stevenj), Massachusetts Institute of Technology, 2012; [Joachim Wuttke](http://www.fz-juelich.de/SharedDocs/Personen/JCNS/EN/Wuttke_J.html), Forschungszentrum Jülich, 2013.
+
+License: [MIT License](http://opensource.org/licenses/MIT)
+
+When using libcerf in scientific work, please cite as follows:
+ * S. G. Johnson, A. Cervellino, J. Wuttke: libcerf, numeric library for complex error functions, version [...], http://apps.jcns.fz-juelich.de/libcerf
+
+Please send bug reports to the authors, or submit them through the GitLab issue tracker.
+
+## Further references
+
+Most function evaluations in this library rely on Faddeeva's function w(z).
+
+This function has been reimplemented from scratch by [Steven G. Johnson](http://math.mit.edu/~stevenj);
+project web site http://ab-initio.mit.edu/Faddeeva. The implementation partly relies on algorithms from the following publications:
+ * Walter Gautschi, *Efficient computation of the complex error function,* SIAM J. Numer. Anal. 7, 187 (1970).
+ * G. P. M. Poppe and C. M. J. Wijers, *More efficient computation of the complex error function,* ACM Trans. Math. Soft. 16, 38 (1990).
+ * Mofreh R. Zaghloul and Ahmed N. Ali, *Algorithm 916: Computing the Faddeyeva and Voigt Functions,* ACM Trans. Math. Soft. 38, 15 (2011).
+
+# Installation
+
+## From source
+
+Download location: http://apps.jcns.fz-juelich.de/src/libcerf/
+
+Build and installation are based on CMake; an out-of-source build is enforced.
+After unpacking the source, go to the source directory and do:
+
+ mkdir build
+ cd build
+ cmake ..
+ make
+ make install
+
+To test, run the programs in directory test/.
+
+The library has been developed using gcc-4.7. Reports about successful compilation with older versions of gcc are welcome. For correct support of complex numbers, at least gcc-4.3 seems to be required. Compilation with gcc-4.2 works after removing the "-Werror" flag from *configure*.
+
+## Binary packages
+
+ * Linux:
+ * [rpm package](https://build.opensuse.org/package/show/science/libcerf) by Christoph Junghans
+ * [Gentoo package](http://packages.gentoo.org/package/sci-libs/libcerf) by Christoph Junghans
+ * [Debian package](https://packages.debian.org/jessie/libs/libcerf1) by Eugen Wintersberger
+ * OS X:
+ * [MacPorts::libcerf](http://www.macports.org/ports.php?by=name&substr=libcerf), by Mojca Miklavec
+ * [Homebrew/homebrew-science/libcerf.rb](https://formulae.brew.sh/formula/libcerf), by Roman Garnett
+
+# Code structure
+
+The code consists of
+- the library's C source (directory lib/),
+- test code (directory test/),
+- manual pages (directory man/),
+- build utilities (aclocal.m4, build-aux/, config*, m4/, Makefile*).
+
+## Compilation
+
+The library libcerf is written in C. It can be compiled as C code (default) or as C++ code (with option -DCERF_CPP=ON). Compilation as C++ is useful especially under MS Windows because, as of 2018, the C compiler of Visual Studio does not support C99, nor any newer language standard, and is unable to cope with complex numbers.
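+
+For example, to build as C++ with the same out-of-source sequence as above:
+
+    mkdir build
+    cd build
+    cmake -DCERF_CPP=ON ..
+    make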
+
+Otherwise, the library is self-contained, and installation should be
+straightforward, using the usual command sequence
+
+ ./configure
+ make
+ sudo make install
+
+The command ./configure takes various options that are explained in the
+file INSTALL.
+
+## Language bindings
+
+For use with other programming languages, libcerf should be either linked directly, or provided with a trivial wrapper. Such language bindings are added to the libcerf package as contributed by their authors.
+
+The following bindings are available:
+ * **fortran**, by Antonio Cervellino (Paul Scherrer Institut)
+
+Further contributions will be highly welcome.
+
+Please report bugs to the package maintainer.
diff --git a/source/luametatex/source/libraries/libcerf/cerf.h b/source/luametatex/source/libraries/libcerf/cerf.h
new file mode 100644
index 000000000..3c280b597
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/cerf.h
@@ -0,0 +1,93 @@
+/* Library libcerf:
+ * Compute complex error functions, based on a new implementation of
+ * Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
+ *
+ * File cerf.h:
+ * Declare exported functions.
+ *
+ * Copyright:
+ * (C) 2012 Massachusetts Institute of Technology
+ * (C) 2013 Forschungszentrum Jülich GmbH
+ *
+ * Licence:
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author
+ * Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer
+ *
+ * Website:
+ * http://apps.jcns.fz-juelich.de/libcerf
+ *
+ * Revision history:
+ * ../CHANGELOG
+ *
+ * Man pages:
+ * w_of_z(3), dawson(3), voigt(3), cerf(3), erfcx(3), erfi(3)
+ */
+
+ /*
+
+ This file is patched by Mojca Miklavec and Hans Hagen for use in LuaMetaTeX, where we use
+ only C and also want to compile with the Microsoft compiler. So, when updating this library,
+ one has to check for changes. Not that we expect many, as this is a rather stable library.
+
+ In the other files there are a few macros used that deal with the multiplication and addition
+ of complex and real numbers. Of course the original code is kept as-is.
+
+ */
+
+# ifndef __CERF_H
+# define __CERF_H
+
+# include <complex.h>
+
+# if (_MSC_VER)
+ # define _cerf_cmplx _Dcomplex
+# else
+ typedef double _Complex _cerf_cmplx;
+# endif
+
+# define EXPORT
+
+extern _cerf_cmplx w_of_z (_cerf_cmplx z); /* compute w(z) = exp(-z^2) erfc(-iz), Faddeeva's scaled complex error function */
+extern double im_w_of_x (double x); /* special case Im[w(x)] of real x */
+extern double re_w_of_z (double x, double y);
+extern double im_w_of_z (double x, double y);
+
+extern _cerf_cmplx cerf (_cerf_cmplx z); /* compute erf(z), the error function of complex arguments */
+extern _cerf_cmplx cerfc (_cerf_cmplx z); /* compute erfc(z) = 1 - erf(z), the complementary error function */
+
+extern _cerf_cmplx cerfcx (_cerf_cmplx z); /* compute erfcx(z) = exp(z^2) erfc(z), an underflow-compensated version of erfc */
+extern double erfcx (double x); /* special case for real x */
+
+extern _cerf_cmplx cerfi (_cerf_cmplx z); /* compute erfi(z) = -i erf(iz), the imaginary error function */
+extern double erfi (double x); /* special case for real x */
+
+extern _cerf_cmplx cdawson (_cerf_cmplx z); /* compute dawson(z) = sqrt(pi)/2 * exp(-z^2) * erfi(z), Dawson's integral */
+extern double dawson (double x); /* special case for real x */
+
+extern double voigt (double x, double sigma, double gamma); /* compute voigt(x,...), the convolution of a Gaussian and a Lorentzian */
+extern double voigt_hwhm (double sigma, double gamma, int *error); /* compute the half width at half maximum of the Voigt profile */
+
+extern double cerf_experimental_imw (double x, double y);
+extern double cerf_experimental_rew (double x, double y);
+
+#endif
diff --git a/source/luametatex/source/libraries/libcerf/defs.h b/source/luametatex/source/libraries/libcerf/defs.h
new file mode 100644
index 000000000..8bc6e3af6
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/defs.h
@@ -0,0 +1,97 @@
+/* Library libcerf:
+ * compute complex error functions,
+ * along with Dawson, Faddeeva and Voigt functions
+ *
+ * File defs.h:
+ * Language-dependent includes.
+ *
+ * Copyright:
+ * (C) 2012 Massachusetts Institute of Technology
+ * (C) 2013 Forschungszentrum Jülich GmbH
+ *
+ * Licence:
+ * MIT Licence.
+ * See ../COPYING
+ *
+ * Authors:
+ * Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author
+ * Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer
+ *
+ * Website:
+ * http://apps.jcns.fz-juelich.de/libcerf
+ */
+
+/*
+
+ This file is patched by Mojca Miklavec and Hans Hagen for use in LuaMetaTeX, where we use
+ only C and also want to compile with the Microsoft compiler. So, when updating this library,
+ one has to check for changes. Not that we expect many, as this is a rather stable library.
+
+ In the other files there are a few macros used that deal with the multiplication and addition
+ of complex and real numbers. Of course the original code is kept as-is.
+
+*/
+
+# ifndef __CERF_C_H
+# define __CERF_C_H
+
+# define _GNU_SOURCE // enable GNU libc NAN extension if possible
+
+/*
+ Constructing complex numbers like 0+i*NaN is problematic in C99
+ without the C11 CMPLX macro, because 0.+I*NAN may give NaN+i*NAN if
+ I is a complex (rather than imaginary) constant. For some reason,
+ however, it works fine in (pre-4.7) gcc if I define Inf and NaN as
+ 1/0 and 0/0 (and only if I compile with optimization -O1 or more),
+ but not if I use the INFINITY or NAN macros.
+*/
+
+/*
+ __builtin_complex was introduced in gcc 4.7, but the C11 CMPLX
+ macro may not be defined unless we are using a recent (2012) version
+ of glibc and compile with -std=c11... note that icc lies about being
+ gcc and probably doesn't have this builtin(?), so exclude icc
+ explicitly.
+*/
+
+# if (_MSC_VER)
+ # define C(a,b) _Cbuild((double)(a), (double)(b))
+ # define Inf INFINITY
+ # define NaN NAN
+# else
+ # define C(a,b) ((a) + I*(b))
+ # define Inf (1./0.)
+ # define NaN (0./0.)
+# endif
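+
+/*
+    Example (an illustration, not from the upstream sources): C(0.0, NaN)
+    yields 0 + i*NaN on both branches, whereas the naive expression
+    0.0 + I*NAN may evaluate to NaN + i*NaN when I is a complex rather
+    than an imaginary constant, as discussed above.
+*/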
+
+# include <complex.h>
+
+# if (_MSC_VER)
+
+ # define _cerf_cmplx _Dcomplex
+
+ static _Dcomplex complex_neg (_Dcomplex x) { return _Cmulcr(x, -1.0); }
+ static _Dcomplex complex_add_cc(_Dcomplex x, _Dcomplex y) { return _Cbuild(creal(x) + creal(y), cimag(x) + cimag(y)); }
+ static _Dcomplex complex_add_rc(double x, _Dcomplex y) { return _Cbuild(x + creal(y), cimag(y)); } /* real + complex: only the real part shifts */
+ static _Dcomplex complex_sub_cc(_Dcomplex x, _Dcomplex y) { return _Cbuild(creal(x) - creal(y), cimag(x) - cimag(y)); }
+ static _Dcomplex complex_sub_rc(double x, _Dcomplex y) { return _Cbuild(x - creal(y), -cimag(y)); } /* real - complex: the imaginary part is negated */
+ static _Dcomplex complex_mul_cc(_Dcomplex x, _Dcomplex y) { return _Cmulcc(x, y); }
+ static _Dcomplex complex_mul_rc(double x, _Dcomplex y) { return _Cmulcr(y, x); }
+ static _Dcomplex complex_mul_cr(_Dcomplex x, double y) { return _Cmulcr(x, y); }
+
+# else
+
+ typedef double _Complex _cerf_cmplx;
+
+ # define complex_neg(x) (-(x))
+ # define complex_add_cc(x,y) ((x)+(y))
+ # define complex_add_rc(x,y) ((x)+(y))
+ # define complex_sub_cc(x,y) ((x)-(y))
+ # define complex_sub_rc(x,y) ((x)-(y))
+ # define complex_mul_cc(x,y) ((x)*(y))
+ # define complex_mul_rc(x,y) ((x)*(y))
+ # define complex_mul_cr(x,y) ((x)*(y))
+
+# endif
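+
+/*
+    Illustration (not from the upstream sources): with these helpers, an
+    expression such as 1 + 2*z, which requires native complex arithmetic,
+    is written portably as
+
+        complex_add_rc(1.0, complex_mul_rc(2.0, z))
+
+    and expands to plain operators under gcc/clang, or to _Cbuild/_Cmulcr
+    calls under the Microsoft compiler.
+*/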
+
+# endif
diff --git a/source/luametatex/source/libraries/libcerf/erfcx.c b/source/luametatex/source/libraries/libcerf/erfcx.c
new file mode 100644
index 000000000..259ef911a
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/erfcx.c
@@ -0,0 +1,528 @@
+/* Library libcerf:
+ * Compute complex error functions, based on a new implementation of
+ * Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
+ *
+ * File erfcx.c:
+ * Compute erfcx(x) = exp(x^2) erfc(x) function, for real x,
+ * using a novel algorithm that is much faster than DERFC of SLATEC.
+ * This function is used in the computation of Faddeeva, Dawson, and
+ * other complex error functions.
+ *
+ * Copyright:
+ * (C) 2012 Massachusetts Institute of Technology
+ * (C) 2013 Forschungszentrum Jülich GmbH
+ *
+ * Licence:
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author
+ * Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer
+ *
+ * Website:
+ * http://apps.jcns.fz-juelich.de/libcerf
+ *
+ * Revision history:
+ * ../CHANGELOG
+ *
+ * Manual page:
+ * man 3 erfcx
+ */
+
+#include "cerf.h"
+#include <math.h>
+#include "defs.h" // defines _cerf_cmplx, NaN, C, cexp, ...
+
+/******************************************************************************/
+/* Lookup-table for Chebyshev polynomials for smaller |x| */
+/******************************************************************************/
+
+static double erfcx_y100(double y100)
+{
+ // Steven G. Johnson, October 2012.
+
+ // Given y100=100*y, where y = 4/(4+x) for x >= 0, compute erfc(x).
+
+ // Uses a look-up table of 100 different Chebyshev polynomials
+ // for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated
+ // with the help of Maple and a little shell script. This allows
+ // the Chebyshev polynomials to be of significantly lower degree (about 1/4)
+ // compared to fitting the whole [0,1] interval with a single polynomial.
+
+ switch ((int) y100) {
+ case 0: {
+ double t = 2*y100 - 1;
+ return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 1: {
+ double t = 2*y100 - 3;
+ return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 2: {
+ double t = 2*y100 - 5;
+ return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 3: {
+ double t = 2*y100 - 7;
+ return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 4: {
+ double t = 2*y100 - 9;
+ return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 5: {
+ double t = 2*y100 - 11;
+ return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 6: {
+ double t = 2*y100 - 13;
+ return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 7: {
+ double t = 2*y100 - 15;
+ return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 8: {
+ double t = 2*y100 - 17;
+ return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 9: {
+ double t = 2*y100 - 19;
+ return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 10: {
+ double t = 2*y100 - 21;
+ return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 11: {
+ double t = 2*y100 - 23;
+ return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 12: {
+ double t = 2*y100 - 25;
+ return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 13: {
+ double t = 2*y100 - 27;
+ return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 14: {
+ double t = 2*y100 - 29;
+ return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 15: {
+ double t = 2*y100 - 31;
+ return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 16: {
+ double t = 2*y100 - 33;
+ return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 17: {
+ double t = 2*y100 - 35;
+ return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 18: {
+ double t = 2*y100 - 37;
+ return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 19: {
+ double t = 2*y100 - 39;
+ return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 20: {
+ double t = 2*y100 - 41;
+ return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 21: {
+ double t = 2*y100 - 43;
+ return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 22: {
+ double t = 2*y100 - 45;
+ return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 23: {
+ double t = 2*y100 - 47;
+ return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 24: {
+ double t = 2*y100 - 49;
+ return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 25: {
+ double t = 2*y100 - 51;
+ return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 26: {
+ double t = 2*y100 - 53;
+ return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 27: {
+ double t = 2*y100 - 55;
+ return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 28: {
+ double t = 2*y100 - 57;
+ return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 29: {
+ double t = 2*y100 - 59;
+ return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 30: {
+ double t = 2*y100 - 61;
+ return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t;
+ }
+ case 31: {
+ double t = 2*y100 - 63;
+ return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 32: {
+ double t = 2*y100 - 65;
+ return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 33: {
+ double t = 2*y100 - 67;
+ return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 34: {
+ double t = 2*y100 - 69;
+ return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 35: {
+ double t = 2*y100 - 71;
+ return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 36: {
+ double t = 2*y100 - 73;
+ return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 37: {
+ double t = 2*y100 - 75;
+ return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 38: {
+ double t = 2*y100 - 77;
+ return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 39: {
+ double t = 2*y100 - 79;
+ return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 40: {
+ double t = 2*y100 - 81;
+ return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 41: {
+ double t = 2*y100 - 83;
+ return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 42: {
+ double t = 2*y100 - 85;
+ return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 43: {
+ double t = 2*y100 - 87;
+ return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 44: {
+ double t = 2*y100 - 89;
+ return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 45: {
+ double t = 2*y100 - 91;
+ return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 46: {
+ double t = 2*y100 - 93;
+ return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 47: {
+ double t = 2*y100 - 95;
+ return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 48: {
+ double t = 2*y100 - 97;
+ return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 49: {
+ double t = 2*y100 - 99;
+ return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 50: {
+ double t = 2*y100 - 101;
+ return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 51: {
+ double t = 2*y100 - 103;
+ return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 52: {
+ double t = 2*y100 - 105;
+ return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 53: {
+ double t = 2*y100 - 107;
+ return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 54: {
+ double t = 2*y100 - 109;
+ return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 55: {
+ double t = 2*y100 - 111;
+ return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 56: {
+ double t = 2*y100 - 113;
+ return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 57: {
+ double t = 2*y100 - 115;
+ return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 58: {
+ double t = 2*y100 - 117;
+ return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 59: {
+ double t = 2*y100 - 119;
+ return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 60: {
+ double t = 2*y100 - 121;
+ return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 61: {
+ double t = 2*y100 - 123;
+ return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 62: {
+ double t = 2*y100 - 125;
+ return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 63: {
+ double t = 2*y100 - 127;
+ return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 64: {
+ double t = 2*y100 - 129;
+ return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 65: {
+ double t = 2*y100 - 131;
+ return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 66: {
+ double t = 2*y100 - 133;
+ return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 67: {
+ double t = 2*y100 - 135;
+ return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 68: {
+ double t = 2*y100 - 137;
+ return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 69: {
+ double t = 2*y100 - 139;
+ return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 70: {
+ double t = 2*y100 - 141;
+ return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 71: {
+ double t = 2*y100 - 143;
+ return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 72: {
+ double t = 2*y100 - 145;
+ return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 73: {
+ double t = 2*y100 - 147;
+ return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 74: {
+ double t = 2*y100 - 149;
+ return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 75: {
+ double t = 2*y100 - 151;
+ return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 76: {
+ double t = 2*y100 - 153;
+ return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 77: {
+ double t = 2*y100 - 155;
+ return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 78: {
+ double t = 2*y100 - 157;
+ return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 79: {
+ double t = 2*y100 - 159;
+ return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 80: {
+ double t = 2*y100 - 161;
+ return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 81: {
+ double t = 2*y100 - 163;
+ return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 82: {
+ double t = 2*y100 - 165;
+ return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 83: {
+ double t = 2*y100 - 167;
+ return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 84: {
+ double t = 2*y100 - 169;
+ return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 85: {
+ double t = 2*y100 - 171;
+ return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 86: {
+ double t = 2*y100 - 173;
+ return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 87: {
+ double t = 2*y100 - 175;
+ return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 88: {
+ double t = 2*y100 - 177;
+ return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 89: {
+ double t = 2*y100 - 179;
+ return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 90: {
+ double t = 2*y100 - 181;
+ return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 91: {
+ double t = 2*y100 - 183;
+ return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 92: {
+ double t = 2*y100 - 185;
+ return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 93: {
+ double t = 2*y100 - 187;
+ return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 94: {
+ double t = 2*y100 - 189;
+ return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 95: {
+ double t = 2*y100 - 191;
+ return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 96: {
+ double t = 2*y100 - 193;
+ return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 97: {
+ double t = 2*y100 - 195;
+ return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 98: {
+ double t = 2*y100 - 197;
+ return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 99: {
+ double t = 2*y100 - 199;
+ return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t;
+ }
+ }
+ // we only get here if y = 1, i.e. |x| < 4*eps, in which case
+ // erfcx is within 1e-15 of 1.
+ return 1.0;
+} // erfcx_y100
+
+/******************************************************************************/
+/* Library function erfcx */
+/******************************************************************************/
+
+double erfcx(double x)
+{
+ // Steven G. Johnson, October 2012.
+
+ // This function combines a few different ideas.
+
+ // First, for x > 50, it uses a continued-fraction expansion (same as
+ // for the Faddeeva function, but with algebraic simplifications for z=i*x).
+
+ // Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations,
+ // but with two twists:
+ //
+ // a) It maps x to y = 4 / (4+x) in [0,1]. This simple transformation,
+ // inspired by a similar transformation in the octave-forge/specfun
+ // erfcx by Soren Hauberg, results in much faster Chebyshev convergence
+ // than other simple transformations I have examined.
+ //
+ // b) Instead of using a single Chebyshev polynomial for the entire
+ // [0,1] y interval, we break the interval up into 100 equal
+ // subintervals, with a switch/lookup table, and use much lower
+ // degree Chebyshev polynomials in each subinterval. This greatly
+ // improves performance in my tests.
+ //
+ // For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x),
+ // with the usual checks for overflow etcetera.
+
+ // Performance-wise, it seems to be substantially faster than either
+ // the SLATEC DERFC function [or an erfcx function derived therefrom]
+ // or Cody's CALERF function (from netlib.org/specfun), while
+ // retaining near machine precision in accuracy.
+
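+    // Worked example of the dispatch below: for x = 4, y = 4/(4+x) = 0.5, so
+    // erfcx_y100(400/(4+x)) = erfcx_y100(50) selects case 50 above, the
+    // low-degree Chebyshev fit for the subinterval y in [0.50, 0.51].
+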
+ if (x >= 0) {
+ if (x > 50) {
+ // continued-fraction expansion is faster
+ const double ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
+ if (x > 5e7) {
+ // 1-term expansion, important to avoid overflow
+ return ispi / x;
+ } else {
+ // 5-term expansion (rely on compiler for CSE), simplified from: ispi / (x+0.5/(x+1/(x+1.5/(x+2/x))))
+ return ispi * ((x*x) * (x*x+4.5) + 2) / (x * ((x*x) * (x*x+5) + 3.75));
+ }
+ }
+ return erfcx_y100(400/(4+x));
+ } else {
+ return x < -26.7 ? HUGE_VAL : (x < -6.1 ? 2*exp(x*x) : 2*exp(x*x) - erfcx_y100(400/(4-x)));
+ }
+
+} // erfcx
diff --git a/source/luametatex/source/libraries/libcerf/err_fcts.c b/source/luametatex/source/libraries/libcerf/err_fcts.c
new file mode 100644
index 000000000..9c0c7aed9
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/err_fcts.c
@@ -0,0 +1,438 @@
+/* Library libcerf:
+ * Compute complex error functions, based on a new implementation of
+ * Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
+ *
+ * File err_fcts.c:
+ * Compute Dawson, Voigt, and several error functions,
+ * based on erfcx, im_w_of_x, w_of_z as implemented in separate files.
+ *
+ * Given w(z), the error functions are mostly straightforward
+ * to compute, except for certain regions where we have to
+ * switch to Taylor expansions to avoid cancellation errors
+ * [e.g. near the origin for erf(z)].
+ *
+ * Copyright:
+ * (C) 2012 Massachusetts Institute of Technology
+ * (C) 2013 Forschungszentrum Jülich GmbH
+ *
+ * Licence:
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author
+ * Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer
+ *
+ * Website:
+ * http://apps.jcns.fz-juelich.de/libcerf
+ *
+ * Revision history:
+ * ../CHANGELOG
+ *
+ * Man pages:
+ * cerf(3), dawson(3), voigt(3)
+ */
+
+#include "cerf.h"
+#include <math.h>
+#include "defs.h" // defines _cerf_cmplx, NaN, C, cexp, ...
+
+const double spi2 = 0.8862269254527580136490837416705725913990; // sqrt(pi)/2
+const double s2pi = 2.5066282746310005024157652848110; // sqrt(2*pi)
+const double pi = 3.141592653589793238462643383279503;
+
+/******************************************************************************/
+/* Simple wrappers: cerfcx, cerfi, erfi, dawson */
+/******************************************************************************/
+
+_cerf_cmplx cerfcx(_cerf_cmplx z)
+{
+ // Compute erfcx(z) = exp(z^2) erfc(z),
+ // the complex underflow-compensated complementary error function,
+ // trivially related to Faddeeva's w_of_z.
+
+ return w_of_z(C(-cimag(z), creal(z)));
+}
+
+_cerf_cmplx cerfi(_cerf_cmplx z)
+{
+ // Compute erfi(z) = -i erf(iz),
+ // the rotated complex error function.
+
+ _cerf_cmplx e = cerf(C(-cimag(z),creal(z)));
+ return C(cimag(e), -creal(e));
+}
+
+double erfi(double x)
+{
+ // Compute erfi(x) = -i erf(ix),
+ // the imaginary error function.
+
+ return x*x > 720 ? (x > 0 ? Inf : -Inf) : exp(x*x) * im_w_of_x(x);
+}
+
+double dawson(double x)
+{
+
+ // Compute dawson(x) = sqrt(pi)/2 * exp(-x^2) * erfi(x),
+ // Dawson's integral for a real argument.
+
+ return spi2 * im_w_of_x(x);
+}
+
+double re_w_of_z( double x, double y )
+{
+ return creal( w_of_z( C(x,y) ) );
+}
+
+double im_w_of_z( double x, double y )
+{
+ return cimag( w_of_z( C(x,y) ) );
+}
+
+/******************************************************************************/
+/* voigt */
+/******************************************************************************/
+
+double voigt( double x, double sigma, double gamma )
+{
+ // Joachim Wuttke, January 2013.
+
+ // Compute Voigt's convolution of a Gaussian
+ // G(x,sigma) = 1/sqrt(2*pi)/|sigma| * exp(-x^2/2/sigma^2)
+ // and a Lorentzian
+ // L(x,gamma) = |gamma| / pi / ( x^2 + gamma^2 ),
+ // namely
+ // voigt(x,sigma,gamma) =
+ // \int_{-infty}^{infty} dx' G(x',sigma) L(x-x',gamma)
+ // using the relation
+ // voigt(x,sigma,gamma) = Re{ w(z) } / sqrt(2*pi) / |sigma|
+ // with
+ // z = (x+i*|gamma|) / sqrt(2) / |sigma|.
+
+ // Reference: Abramowitz&Stegun (1964), formula (7.4.13).
+
+ double gam = gamma < 0 ? -gamma : gamma;
+ double sig = sigma < 0 ? -sigma : sigma;
+
+ if ( gam==0 ) {
+ if ( sig==0 ) {
+ // It's kind of a delta function
+ return x ? 0 : Inf;
+ } else {
+ // It's a pure Gaussian
+ return exp( -x*x/2/(sig*sig) ) / s2pi / sig;
+ }
+ } else {
+ if ( sig==0 ) {
+ // It's a pure Lorentzian
+ return gam / pi / (x*x + gam*gam);
+ } else {
+ // Regular case, both parameters are nonzero
+ _cerf_cmplx z = complex_mul_cr(C(x, gam), 1. / sqrt(2) / sig);
+ return creal( w_of_z(z) ) / s2pi / sig;
+ // TODO: correct and activate the following:
+// double w = sqrt(gam*gam+sig*sig); // to work in reduced units
+// _cerf_cmplx z = C(x/w,gam/w) / sqrt(2) / (sig/w);
+// return creal( w_of_z(z) ) / s2pi / (sig/w);
+ }
+ }
+}
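+
+/*
+    Usage illustration (not from the upstream sources): a spectroscopy caller
+    evaluating a line shape centered at some hypothetical x0 would write
+
+        double v = voigt(x - x0, sigma, gamma);
+
+    since both convolved components are normalized, the result integrates
+    to 1 over x.
+*/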
+
+/******************************************************************************/
+/* cerf */
+/******************************************************************************/
+
+_cerf_cmplx cerf(_cerf_cmplx z)
+{
+
+ // Steven G. Johnson, October 2012.
+
+ // Compute erf(z), the complex error function,
+ // using w_of_z except for certain regions.
+
+ double x = creal(z), y = cimag(z);
+
+ if (y == 0)
+ return C(erf(x), y); // preserve sign of 0
+ if (x == 0) // handle separately for speed & handling of y = Inf or NaN
+ return C(x, // preserve sign of 0
+ /* handle y -> Inf limit manually, since
+ exp(y^2) -> Inf but Im[w(y)] -> 0, so
+ IEEE will give us a NaN when it should be Inf */
+ y*y > 720 ? (y > 0 ? Inf : -Inf)
+ : exp(y*y) * im_w_of_x(y));
+
+ double mRe_z2 = (y - x) * (x + y); // Re(-z^2), being careful of overflow
+ double mIm_z2 = -2*x*y; // Im(-z^2)
+ if (mRe_z2 < -750) // underflow
+ return (x >= 0 ? C(1.0, 0.0) : C(-1.0, 0.0));
+
+ /* Handle positive and negative x via different formulas,
+ using the mirror symmetries of w, to avoid overflow/underflow
+ problems from multiplying exponentially large and small quantities. */
+ if (x >= 0) {
+ if (x < 8e-2) {
+ if (fabs(y) < 1e-2)
+ goto taylor;
+ else if (fabs(mIm_z2) < 5e-3 && x < 5e-3)
+ goto taylor_erfi;
+ }
+ /* don't use complex exp function, since that will produce spurious NaN
+ values when multiplying w in an overflow situation. */
+ return complex_sub_rc(1.0, complex_mul_rc(exp(mRe_z2), complex_mul_cc(C(cos(mIm_z2), sin(mIm_z2)), w_of_z(C(-y, x)))));
+ }
+ else { // x < 0
+ if (x > -8e-2) { // duplicate from above to avoid fabs(x) call
+ if (fabs(y) < 1e-2)
+ goto taylor;
+ else if (fabs(mIm_z2) < 5e-3 && x > -5e-3)
+ goto taylor_erfi;
+ }
+ else if (isnan(x))
+ return C(NaN, y == 0 ? 0 : NaN);
+ /* don't use complex exp function, since that will produce spurious NaN
+ values when multiplying w in an overflow situation. */
+ return complex_add_rc(-1.0, complex_mul_rc(exp(mRe_z2), complex_mul_cc(C(cos(mIm_z2), sin(mIm_z2)), w_of_z(C(y, -x)))));
+
+ }
+
+ // Use Taylor series for small |z|, to avoid cancellation inaccuracy
+ // erf(z) = 2/sqrt(pi) * z * (1 - z^2/3 + z^4/10 - z^6/42 + z^8/216 + ...)
+taylor:
+ {
+ _cerf_cmplx mz2 = C(mRe_z2, mIm_z2); // -z^2
+ return
+ complex_mul_cc(z, complex_add_rc(1.1283791670955125739,
+ complex_mul_cc(mz2, complex_add_rc(0.37612638903183752464,
+ complex_mul_cc(mz2, complex_add_rc(0.11283791670955125739,
+ complex_mul_cc(mz2, complex_add_rc(0.026866170645131251760,
+ complex_mul_cr(mz2, 0.0052239776254421878422)))))))));
+
+
+ }
+
+ /* for small |x| and small |xy|,
+ use Taylor series to avoid cancellation inaccuracy:
+ erf(x+iy) = erf(iy)
+ + 2*exp(y^2)/sqrt(pi) *
+ [ x * (1 - x^2 * (1+2y^2)/3 + x^4 * (3+12y^2+4y^4)/30 + ...
+ - i * x^2 * y * (1 - x^2 * (3+2y^2)/6 + ...) ]
+ where:
+ erf(iy) = exp(y^2) * Im[w(y)]
+ */
+taylor_erfi:
+ {
+ double x2 = x*x, y2 = y*y;
+ double expy2 = exp(y2);
+ return C
+ (expy2 * x * (1.1283791670955125739
+ - x2 * (0.37612638903183752464
+ + 0.75225277806367504925*y2)
+ + x2*x2 * (0.11283791670955125739
+ + y2 * (0.45135166683820502956
+ + 0.15045055561273500986*y2))),
+ expy2 * (im_w_of_x(y)
+ - x2*y * (1.1283791670955125739
+ - x2 * (0.56418958354775628695
+ + 0.37612638903183752464*y2))));
+ }
+} // cerf
+
+/******************************************************************************/
+/* cerfc */
+/******************************************************************************/
+
+_cerf_cmplx cerfc(_cerf_cmplx z)
+{
+ // Steven G. Johnson, October 2012.
+
+ // Compute erfc(z) = 1 - erf(z), the complex complementary error function,
+ // using w_of_z except for certain regions.
+
+ double x = creal(z), y = cimag(z);
+
+ if (x == 0.)
+ return C(1,
+ /* handle y -> Inf limit manually, since
+ exp(y^2) -> Inf but Im[w(y)] -> 0, so
+ IEEE will give us a NaN when it should be Inf */
+ y*y > 720 ? (y > 0 ? -Inf : Inf)
+ : -exp(y*y) * im_w_of_x(y));
+ if (y == 0.) {
+ if (x*x > 750) // underflow
+ return C(x >= 0 ? 0.0 : 2.0,
+ -y); // preserve sign of 0
+ return C(x >= 0 ? exp(-x*x) * erfcx(x)
+ : 2. - exp(-x*x) * erfcx(-x),
+ -y); // preserve sign of zero
+ }
+
+ double mRe_z2 = (y - x) * (x + y); // Re(-z^2), being careful of overflow
+ double mIm_z2 = -2*x*y; // Im(-z^2)
+ if (mRe_z2 < -750) // underflow
+ return C((x >= 0 ? 0.0 : 2.0), 0.0);
+
+ if (x >= 0)
+ return cexp(complex_mul_cc(C(mRe_z2, mIm_z2), w_of_z(C(-y,x))));
+ else
+ return complex_sub_rc(2.0, complex_mul_cc(cexp(C(mRe_z2, mIm_z2)), w_of_z(C(y, -x))));
+} // cerfc
+
+/******************************************************************************/
+/* cdawson */
+/******************************************************************************/
+
+_cerf_cmplx cdawson(_cerf_cmplx z)
+{
+
+ // Steven G. Johnson, October 2012.
+
+ // Compute Dawson(z) = sqrt(pi)/2 * exp(-z^2) * erfi(z),
+ // Dawson's integral for a complex argument,
+ // using w_of_z except for certain regions.
+
+ double x = creal(z), y = cimag(z);
+
+ // handle axes separately for speed & proper handling of x or y = Inf or NaN
+ if (y == 0)
+ return C(spi2 * im_w_of_x(x),
+ -y); // preserve sign of 0
+ if (x == 0) {
+ double y2 = y*y;
+ if (y2 < 2.5e-5) { // Taylor expansion
+ return C(x, // preserve sign of 0
+ y * (1.
+ + y2 * (0.6666666666666666666666666666666666666667
+ + y2 * 0.26666666666666666666666666666666666667)));
+ }
+ return C(x, // preserve sign of 0
+ spi2 * (y >= 0
+ ? exp(y2) - erfcx(y)
+ : erfcx(-y) - exp(y2)));
+ }
+
+ double mRe_z2 = (y - x) * (x + y); // Re(-z^2), being careful of overflow
+ double mIm_z2 = -2*x*y; // Im(-z^2)
+ _cerf_cmplx mz2 = C(mRe_z2, mIm_z2); // -z^2
+
+ /* Handle positive and negative x via different formulas,
+ using the mirror symmetries of w, to avoid overflow/underflow
+ problems from multiplying exponentially large and small quantities. */
+ if (y >= 0) {
+ if (y < 5e-3) {
+ if (fabs(x) < 5e-3)
+ goto taylor;
+ else if (fabs(mIm_z2) < 5e-3)
+ goto taylor_realaxis;
+ }
+ _cerf_cmplx res = complex_sub_cc(cexp(mz2), w_of_z(z));
+ return complex_mul_rc(spi2, C(-cimag(res), creal(res)));
+ }
+ else { // y < 0
+ if (y > -5e-3) { // duplicate from above to avoid fabs(x) call
+ if (fabs(x) < 5e-3)
+ goto taylor;
+ else if (fabs(mIm_z2) < 5e-3)
+ goto taylor_realaxis;
+ }
+ else if (isnan(y))
+ return C(x == 0 ? 0 : NaN, NaN);
+ {
+ _cerf_cmplx res = complex_sub_cc(w_of_z(complex_neg(z)), cexp(mz2));
+ return complex_mul_rc(spi2, C(-cimag(res), creal(res)));
+ }
+ }
+
+ // Use Taylor series for small |z|, to avoid cancellation inaccuracy
+ // dawson(z) = z - 2/3 z^3 + 4/15 z^5 + ...
+taylor:
+ return complex_mul_cc(z, complex_add_rc(1.,
+ complex_mul_cc(mz2, complex_add_rc(0.6666666666666666666666666666666666666667,
+ complex_mul_cr(mz2, 0.2666666666666666666666666666666666666667)))));
+ /* for small |y| and small |xy|,
+ use Taylor series to avoid cancellation inaccuracy:
+ dawson(x + iy)
+ = D + y^2 (D + x - 2Dx^2)
+ + y^4 (D/2 + 5x/6 - 2Dx^2 - x^3/3 + 2Dx^4/3)
+ + iy [ (1-2Dx) + 2/3 y^2 (1 - 3Dx - x^2 + 2Dx^3)
+ + y^4/15 (4 - 15Dx - 9x^2 + 20Dx^3 + 2x^4 - 4Dx^5) ] + ...
+ where D = dawson(x)
+
+ However, for large |x|, 2Dx -> 1 which gives cancellation problems in
+ this series (many of the leading terms cancel). So, for large |x|,
+ we need to substitute a continued-fraction expansion for D.
+
+ dawson(x) = 0.5 / (x-0.5/(x-1/(x-1.5/(x-2/(x-2.5/(x...))))))
+
+ The 6 terms shown here seem to be the minimum needed to be
+ accurate as soon as the simpler Taylor expansion above starts
+ breaking down. Using this 6-term expansion, factoring out the
+ denominator, and simplifying with Maple, we obtain:
+
+ Re dawson(x + iy) * (-15 + 90x^2 - 60x^4 + 8x^6) / x
+ = 33 - 28x^2 + 4x^4 + y^2 (18 - 4x^2) + 4 y^4
+ Im dawson(x + iy) * (-15 + 90x^2 - 60x^4 + 8x^6) / y
+ = -15 + 24x^2 - 4x^4 + 2/3 y^2 (6x^2 - 15) - 4 y^4
+
+ Finally, for |x| > 5e7, we can use a simpler 1-term continued-fraction
+ expansion for the real part, and a 2-term expansion for the imaginary
+ part. (This avoids overflow problems for huge |x|.) This yields:
+
+ Re dawson(x + iy) = [1 + y^2 (1 + y^2/2 - (xy)^2/3)] / (2x)
+ Im dawson(x + iy) = y [ -1 - 2/3 y^2 + y^4/15 (2x^2 - 4) ] / (2x^2 - 1)
+
+ */
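+ /* For illustration only: the 6-term continued fraction above can be
+ evaluated bottom-up as
+ cf = x;
+ cf = x - 2.5/cf; cf = x - 2.0/cf; cf = x - 1.5/cf;
+ cf = x - 1.0/cf; cf = x - 0.5/cf;
+ D = 0.5/cf; // approximates dawson(x) for large |x|
+ Factoring the common denominator out of this expression gives the closed
+ forms for Re and Im dawson(x+iy) used below. */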
+taylor_realaxis:
+ {
+ double x2 = x*x;
+ if (x2 > 1600) { // |x| > 40
+ double y2 = y*y;
+ if (x2 > 25e14) {// |x| > 5e7
+ double xy2 = (x*y)*(x*y);
+ return C((0.5 + y2 * (0.5 + 0.25*y2
+ - 0.16666666666666666667*xy2)) / x,
+ y * (-1 + y2 * (-0.66666666666666666667
+ + 0.13333333333333333333*xy2
+ - 0.26666666666666666667*y2))
+ / (2*x2 - 1));
+ }
+ return complex_mul_rc((1. / (-15 + x2 * (90 + x2 * (-60 + 8 * x2)))),
+ C(x * (33 + x2 * (-28 + 4 * x2)
+ + +y2 * (18 - 4 * x2 + 4 * y2)),
+ +y * (-15 + x2 * (24 - 4 * x2)
+ + +y2 * (4 * x2 - 10 - 4 * y2))));
+ }
+ else {
+ double D = spi2 * im_w_of_x(x);
+ double y2 = y*y;
+ return C
+ (D + y2 * (D + x - 2*D*x2)
+ + y2*y2 * (D * (0.5 - x2 * (2 - 0.66666666666666666667*x2))
+ + x * (0.83333333333333333333
+ - 0.33333333333333333333 * x2)),
+ y * (1 - 2*D*x
+ + y2 * 0.66666666666666666667 * (1 - x2 - D*x * (3 - 2*x2))
+ + y2*y2 * (0.26666666666666666667 -
+ x2 * (0.6 - 0.13333333333333333333 * x2)
+ - D*x * (1 - x2 * (1.3333333333333333333
+ - 0.26666666666666666667 * x2)))));
+ }
+ }
+} // cdawson
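+
+/* Illustrative only, disabled: for real arguments cdawson reduces to
+ spi2 * im_w_of_x(x), per the identity im_w_of_x(x) = 2*dawson(x)/sqrt(pi);
+ the helper name is made up and <assert.h> is assumed. */
+#if 0
+static void check_cdawson_on_real_axis(double x)
+{
+ _cerf_cmplx d = cdawson(C(x, 0.0));
+ double ref = 0.88622692545275801365 * im_w_of_x(x); // sqrt(pi)/2
+ assert(fabs(creal(d) - ref) <= 1e-13 * fabs(ref));
+}
+#endif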
diff --git a/source/luametatex/source/libraries/libcerf/experimental.c b/source/luametatex/source/libraries/libcerf/experimental.c
new file mode 100644
index 000000000..f5ba9477e
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/experimental.c
@@ -0,0 +1,178 @@
+/******************************************************************************/
+/* Experimental code */
+/******************************************************************************/
+
+/*
+ Compute w_of_z via Fourier integration using the Ooura-Mori transform.
+ Agreement with Johnson's code usually < 1E-15, so far always < 1E-13.
+ Todo:
+ - sign for negative x or y
+ - determine application limits
+ - more systematic comparison with Johnson's code
+ - comparison with Abrarov&Quine
+ */
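+/*
+ In outline (matching the code below): the Fourier integral is sampled at
+ Ooura-Mori double-exponential nodes. For u = k*h one sets
+ chi(u) = 2*p*sinh(u) + 2*q*u,
+ a_hk = (pi/h) * u / (1 - exp(-chi(u))),
+ and the weight is the derivative d/du[ u/(1-exp(-chi)) ] times a sine or
+ cosine factor chosen by 'kind'. If the trapezoid sum has not stabilised,
+ N is doubled (and h recomputed) and the sum is retried, up to
+ max_iter_int times.
+*/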
+
+#define max_iter_int 10
+#define num_range 5
+#define PI 3.14159265358979323846L /* pi */
+#define SQR(x) ((x)*(x))
+#include <errno.h>
+
+double cerf_experimental_integration( int kind, double x, double y )
+// kind: 0 cos, 1 sin transform (the precomputed arrays ak[2][..] and bk[2][..] depend on this)
+{
+ // unused parameters
+ static int mu = 0;
+ int intgr_debug = 0;
+ static double intgr_delta=2.2e-16, intgr_eps=5.5e-20;
+
+ if( x<0 || y<0 ) {
+ fprintf( stderr, "negative arguments not yet implemented\n" );
+ exit( EDOM );
+ }
+
+ double w = sqrt(2)*x;
+ double gamma = sqrt(2)*y;
+
+ int iter;
+ int kaux;
+ int isig;
+ int N;
+ int j; // range
+ long double S=0; // trapezoid sum
+ long double S_last; // - in last iteration
+ long double s; // term contributing to S
+ long double T; // sum of abs(s)
+ // precomputed coefficients
+ static int firstCall=1;
+ static int iterDone[2][num_range]; // Nm,Np,ak,bk are precomputed up to this
+ static int Nm[num_range][max_iter_int];
+ static int Np[num_range][max_iter_int];
+ static long double *ak[2][num_range][max_iter_int];
+ static long double *bk[2][num_range][max_iter_int];
+ // auxiliary for computing ak and bk
+ long double u;
+ long double e;
+ long double tk;
+ long double chi;
+ long double dchi;
+ long double h;
+ long double k;
+ long double f;
+ long double ahk;
+ long double chk;
+ long double dhk;
+ double p;
+ double q;
+ const double Smin=2e-20; // to assess worst truncation error
+
+ // dynamic initialization upon first call
+ if ( firstCall ) {
+ for ( j=0; j<num_range; ++ j ) {
+ iterDone[0][j] = -1;
+ iterDone[1][j] = -1;
+ }
+ firstCall = 0;
+ }
+
+ // determine range, set p,q
+ j=1; p=1.4; q=0.6;
+
+ // iterative integration
+ if( intgr_debug & 4 )
+ N = 100;
+ else
+ N = 40;
+ for ( iter=0; iter<max_iter_int; ++iter ) {
+ // static initialisation of Nm, Np, ak, bk for given 'iter'
+ if ( iter>iterDone[kind][j] ) {
+ if ( N>1e6 )
+ return -3; // integral limits overflow
+ Nm[j][iter] = N;
+ Np[j][iter] = N;
+ if ( !( ak[kind][j][iter]=malloc((sizeof(long double))*
+ (Nm[j][iter]+1+Np[j][iter])) ) ||
+ !( bk[kind][j][iter]=malloc((sizeof(long double))*
+ (Nm[j][iter]+1+Np[j][iter])) ) ) {
+ fprintf( stderr, "Workspace allocation failed\n" );
+ exit( ENOMEM );
+ }
+ h = logl( logl( 42*N/intgr_delta/Smin ) / p ) / N; // 42=(pi+1)*10
+ isig=1-2*(Nm[j][iter]&1);
+ for ( kaux=-Nm[j][iter]; kaux<=Np[j][iter]; ++kaux ) {
+ k = kaux;
+ if( !kind )
+ k -= 0.5;
+ u = k*h;
+ chi = 2*p*sinhl(u) + 2*q*u;
+ dchi = 2*p*coshl(u) + 2*q;
+ if ( u==0 ) {
+ if ( k!=0 )
+ return -4; // integration variable underflow
+ // special treatment to bridge singularity at u=0
+ ahk = PI/h/dchi;
+ dhk = 0.5;
+ chk = sin( ahk );
+ } else {
+ if ( -chi>DBL_MAX_EXP/2 )
+ return -5; // integral transformation overflow
+ e = expl( -chi );
+ ahk = PI/h * u/(1-e);
+ dhk = 1/(1-e) - u*e*dchi/SQR(1-e);
+ chk = e>1 ?
+ ( kind ? sinl( PI*k/(1-e) ) : cosl( PI*k/(1-e) ) ) :
+ isig * sinl( PI*k*e/(1-e) );
+ }
+ ak[kind][j][iter][kaux+Nm[j][iter]] = ahk;
+ bk[kind][j][iter][kaux+Nm[j][iter]] = dhk * chk;
+ isig = -isig;
+ }
+ iterDone[kind][j] = iter;
+ }
+ // integrate according to trapezoidal rule
+ S_last = S;
+ S = 0;
+ T = 0;
+ for ( kaux=-Nm[j][iter]; kaux<=Np[j][iter]; ++kaux ) {
+ tk = ak[kind][j][iter][kaux+Nm[j][iter]] / w;
+ f = expl(-tk*gamma-SQR(tk)/2); // Fourier kernel
+ if ( mu )
+ f /= tk; // TODO
+ s = bk[kind][j][iter][kaux+Nm[j][iter]] * f;
+ S += s;
+ T += fabsl(s);
+ if( intgr_debug & 2 )
+ printf( "%2i %6i %12.4Lg %12.4Lg"
+ " %12.4Lg %12.4Lg %12.4Lg %12.4Lg\n",
+ iter, kaux, ak[kind][j][iter][kaux+Nm[j][iter]],
+ bk[kind][j][iter][kaux+Nm[j][iter]], f, s, S, T );
+ }
+ if( intgr_debug & 1 )
+ printf( "%23.17Le %23.17Le\n", S, T );
+ // intgr_num_of_terms += Np[j][iter]-(-Nm[j][iter])+1;
+ // termination criteria
+ if ( intgr_debug & 4 )
+ return -1; // we want to inspect just one sum
+ else if ( S < 0 )
+ return -6; // cancelling terms lead to negative S
+ else if ( intgr_eps*T > intgr_delta*fabs(S) )
+ return -2; // cancellation
+ else if ( iter && fabs(S-S_last) + intgr_eps*T < intgr_delta*fabs(S) )
+ return S*sqrt(2*PI)/w; // success
+ // factor 2 from int_-infty^+infty = 2 * int_0^+infty
+ // factor pi/w from formula 48 in kww paper
+ // factor 1/sqrt(2*pi) from Gaussian
+ N *= 2; // retry with more points
+ }
+ return -9; // not converged
+}
+
+double cerf_experimental_imw( double x, double y )
+{
+ return cerf_experimental_integration( 1, x, y );
+}
+
+double cerf_experimental_rew( double x, double y )
+{
+ return cerf_experimental_integration( 0, x, y );
+}
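+
+/* Illustrative usage, disabled: the two wrappers above approximate the real
+ and imaginary parts of w(x+iy); <stdio.h> is assumed. */
+#if 0
+static void print_experimental_w(double x, double y)
+{
+ printf("w(%g+%gi) ~ %.15g + %.15gi (experimental)\n",
+ x, y, cerf_experimental_rew(x, y), cerf_experimental_imw(x, y));
+}
+#endif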
diff --git a/source/luametatex/source/libraries/libcerf/im_w_of_x.c b/source/luametatex/source/libraries/libcerf/im_w_of_x.c
new file mode 100644
index 000000000..505c8c3fe
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/im_w_of_x.c
@@ -0,0 +1,519 @@
+/* Library libcerf:
+ * Compute complex error functions, based on a new implementation of
+ * Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
+ *
+ * File im_w_of_x.c:
+ * Compute scaled Dawson integral im_w_of_x(x) = 2*dawson(x)/sqrt(pi),
+ * equivalent to the imaginary part of the Faddeeva function w(x) for real x.
+ *
+ * Copyright:
+ * (C) 2012 Massachusetts Institute of Technology
+ * (C) 2013 Forschungszentrum Jülich GmbH
+ *
+ * Licence:
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author
+ * Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer
+ *
+ * Website:
+ * http://apps.jcns.fz-juelich.de/libcerf
+ *
+ * Revision history:
+ * ../CHANGELOG
+ *
+ * Manual page:
+ * man 3 im_w_of_x
+ */
+
+#include "cerf.h"
+#include <math.h>
+#include "defs.h" // defines _cerf_cmplx, NaN, C, cexp, ...
+
+/******************************************************************************/
+/* Lookup-table for Chebyshev polynomials for smaller |x| */
+/******************************************************************************/
+
+static double w_im_y100(double y100, double x)
+{
+ // Steven G. Johnson, October 2012.
+
+ // Given y100=100*y, where y = 1/(1+x) for x >= 0, compute w_im(x).
+
+ // Uses a look-up table of 100 different Chebyshev polynomials
+ // for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated
+ // with the help of Maple and a little shell script.
+ // This allows the Chebyshev polynomials to be of significantly lower
+ // degree (about 1/30) compared to fitting the whole [0,1] interval
+ // with a single polynomial.
+
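+ // Concretely, (int) y100 selects the centile of y = 1/(1+x), and each case
+ // k rescales to t = 2*y100 - (2*k+1), which runs over [-1,1) within that
+ // centile. For example, x = 1 gives y = 0.5 and y100 = 50, so case 50 is used.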
+ switch ((int) y100) {
+ case 0: {
+ double t = 2*y100 - 1;
+ return 0.28351593328822191546e-2 + (0.28494783221378400759e-2 + (0.14427470563276734183e-4 + (0.10939723080231588129e-6 + (0.92474307943275042045e-9 + (0.89128907666450075245e-11 + 0.92974121935111111110e-13 * t) * t) * t) * t) * t) * t;
+ }
+ case 1: {
+ double t = 2*y100 - 3;
+ return 0.85927161243940350562e-2 + (0.29085312941641339862e-2 + (0.15106783707725582090e-4 + (0.11716709978531327367e-6 + (0.10197387816021040024e-8 + (0.10122678863073360769e-10 + 0.10917479678400000000e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 2: {
+ double t = 2*y100 - 5;
+ return 0.14471159831187703054e-1 + (0.29703978970263836210e-2 + (0.15835096760173030976e-4 + (0.12574803383199211596e-6 + (0.11278672159518415848e-8 + (0.11547462300333495797e-10 + 0.12894535335111111111e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 3: {
+ double t = 2*y100 - 7;
+ return 0.20476320420324610618e-1 + (0.30352843012898665856e-2 + (0.16617609387003727409e-4 + (0.13525429711163116103e-6 + (0.12515095552507169013e-8 + (0.13235687543603382345e-10 + 0.15326595042666666667e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 4: {
+ double t = 2*y100 - 9;
+ return 0.26614461952489004566e-1 + (0.31034189276234947088e-2 + (0.17460268109986214274e-4 + (0.14582130824485709573e-6 + (0.13935959083809746345e-8 + (0.15249438072998932900e-10 + 0.18344741882133333333e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 5: {
+ double t = 2*y100 - 11;
+ return 0.32892330248093586215e-1 + (0.31750557067975068584e-2 + (0.18369907582308672632e-4 + (0.15761063702089457882e-6 + (0.15577638230480894382e-8 + (0.17663868462699097951e-10 + (0.22126732680711111111e-12 + 0.30273474177737853668e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 6: {
+ double t = 2*y100 - 13;
+ return 0.39317207681134336024e-1 + (0.32504779701937539333e-2 + (0.19354426046513400534e-4 + (0.17081646971321290539e-6 + (0.17485733959327106250e-8 + (0.20593687304921961410e-10 + (0.26917401949155555556e-12 + 0.38562123837725712270e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 7: {
+ double t = 2*y100 - 15;
+ return 0.45896976511367738235e-1 + (0.33300031273110976165e-2 + (0.20423005398039037313e-4 + (0.18567412470376467303e-6 + (0.19718038363586588213e-8 + (0.24175006536781219807e-10 + (0.33059982791466666666e-12 + 0.49756574284439426165e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 8: {
+ double t = 2*y100 - 17;
+ return 0.52640192524848962855e-1 + (0.34139883358846720806e-2 + (0.21586390240603337337e-4 + (0.20247136501568904646e-6 + (0.22348696948197102935e-8 + (0.28597516301950162548e-10 + (0.41045502119111111110e-12 + 0.65151614515238361946e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 9: {
+ double t = 2*y100 - 19;
+ return 0.59556171228656770456e-1 + (0.35028374386648914444e-2 + (0.22857246150998562824e-4 + (0.22156372146525190679e-6 + (0.25474171590893813583e-8 + (0.34122390890697400584e-10 + (0.51593189879111111110e-12 + 0.86775076853908006938e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 10: {
+ double t = 2*y100 - 21;
+ return 0.66655089485108212551e-1 + (0.35970095381271285568e-2 + (0.24250626164318672928e-4 + (0.24339561521785040536e-6 + (0.29221990406518411415e-8 + (0.41117013527967776467e-10 + (0.65786450716444444445e-12 + 0.11791885745450623331e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 11: {
+ double t = 2*y100 - 23;
+ return 0.73948106345519174661e-1 + (0.36970297216569341748e-2 + (0.25784588137312868792e-4 + (0.26853012002366752770e-6 + (0.33763958861206729592e-8 + (0.50111549981376976397e-10 + (0.85313857496888888890e-12 + 0.16417079927706899860e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 12: {
+ double t = 2*y100 - 25;
+ return 0.81447508065002963203e-1 + (0.38035026606492705117e-2 + (0.27481027572231851896e-4 + (0.29769200731832331364e-6 + (0.39336816287457655076e-8 + (0.61895471132038157624e-10 + (0.11292303213511111111e-11 + 0.23558532213703884304e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 13: {
+ double t = 2*y100 - 27;
+ return 0.89166884027582716628e-1 + (0.39171301322438946014e-2 + (0.29366827260422311668e-4 + (0.33183204390350724895e-6 + (0.46276006281647330524e-8 + (0.77692631378169813324e-10 + (0.15335153258844444444e-11 + 0.35183103415916026911e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 14: {
+ double t = 2*y100 - 29;
+ return 0.97121342888032322019e-1 + (0.40387340353207909514e-2 + (0.31475490395950776930e-4 + (0.37222714227125135042e-6 + (0.55074373178613809996e-8 + (0.99509175283990337944e-10 + (0.21552645758222222222e-11 + 0.55728651431872687605e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 15: {
+ double t = 2*y100 - 31;
+ return 0.10532778218603311137e0 + (0.41692873614065380607e-2 + (0.33849549774889456984e-4 + (0.42064596193692630143e-6 + (0.66494579697622432987e-8 + (0.13094103581931802337e-9 + (0.31896187409777777778e-11 + 0.97271974184476560742e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 16: {
+ double t = 2*y100 - 33;
+ return 0.11380523107427108222e0 + (0.43099572287871821013e-2 + (0.36544324341565929930e-4 + (0.47965044028581857764e-6 + (0.81819034238463698796e-8 + (0.17934133239549647357e-9 + (0.50956666166186293627e-11 + (0.18850487318190638010e-12 + 0.79697813173519853340e-14 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 17: {
+ double t = 2*y100 - 35;
+ return 0.12257529703447467345e0 + (0.44621675710026986366e-2 + (0.39634304721292440285e-4 + (0.55321553769873381819e-6 + (0.10343619428848520870e-7 + (0.26033830170470368088e-9 + (0.87743837749108025357e-11 + (0.34427092430230063401e-12 + 0.10205506615709843189e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 18: {
+ double t = 2*y100 - 37;
+ return 0.13166276955656699478e0 + (0.46276970481783001803e-2 + (0.43225026380496399310e-4 + (0.64799164020016902656e-6 + (0.13580082794704641782e-7 + (0.39839800853954313927e-9 + (0.14431142411840000000e-10 + 0.42193457308830027541e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 19: {
+ double t = 2*y100 - 39;
+ return 0.14109647869803356475e0 + (0.48088424418545347758e-2 + (0.47474504753352150205e-4 + (0.77509866468724360352e-6 + (0.18536851570794291724e-7 + (0.60146623257887570439e-9 + (0.18533978397305276318e-10 + (0.41033845938901048380e-13 - 0.46160680279304825485e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 20: {
+ double t = 2*y100 - 41;
+ return 0.15091057940548936603e0 + (0.50086864672004685703e-2 + (0.52622482832192230762e-4 + (0.95034664722040355212e-6 + (0.25614261331144718769e-7 + (0.80183196716888606252e-9 + (0.12282524750534352272e-10 + (-0.10531774117332273617e-11 - 0.86157181395039646412e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 21: {
+ double t = 2*y100 - 43;
+ return 0.16114648116017010770e0 + (0.52314661581655369795e-2 + (0.59005534545908331315e-4 + (0.11885518333915387760e-5 + (0.33975801443239949256e-7 + (0.82111547144080388610e-9 + (-0.12357674017312854138e-10 + (-0.24355112256914479176e-11 - 0.75155506863572930844e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 22: {
+ double t = 2*y100 - 45;
+ return 0.17185551279680451144e0 + (0.54829002967599420860e-2 + (0.67013226658738082118e-4 + (0.14897400671425088807e-5 + (0.40690283917126153701e-7 + (0.44060872913473778318e-9 + (-0.52641873433280000000e-10 - 0.30940587864543343124e-11 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 23: {
+ double t = 2*y100 - 47;
+ return 0.18310194559815257381e0 + (0.57701559375966953174e-2 + (0.76948789401735193483e-4 + (0.18227569842290822512e-5 + (0.41092208344387212276e-7 + (-0.44009499965694442143e-9 + (-0.92195414685628803451e-10 + (-0.22657389705721753299e-11 + 0.10004784908106839254e-12 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 24: {
+ double t = 2*y100 - 49;
+ return 0.19496527191546630345e0 + (0.61010853144364724856e-2 + (0.88812881056342004864e-4 + (0.21180686746360261031e-5 + (0.30652145555130049203e-7 + (-0.16841328574105890409e-8 + (-0.11008129460612823934e-9 + (-0.12180794204544515779e-12 + 0.15703325634590334097e-12 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 25: {
+ double t = 2*y100 - 51;
+ return 0.20754006813966575720e0 + (0.64825787724922073908e-2 + (0.10209599627522311893e-3 + (0.22785233392557600468e-5 + (0.73495224449907568402e-8 + (-0.29442705974150112783e-8 + (-0.94082603434315016546e-10 + (0.23609990400179321267e-11 + 0.14141908654269023788e-12 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 26: {
+ double t = 2*y100 - 53;
+ return 0.22093185554845172146e0 + (0.69182878150187964499e-2 + (0.11568723331156335712e-3 + (0.22060577946323627739e-5 + (-0.26929730679360840096e-7 + (-0.38176506152362058013e-8 + (-0.47399503861054459243e-10 + (0.40953700187172127264e-11 + 0.69157730376118511127e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 27: {
+ double t = 2*y100 - 55;
+ return 0.23524827304057813918e0 + (0.74063350762008734520e-2 + (0.12796333874615790348e-3 + (0.18327267316171054273e-5 + (-0.66742910737957100098e-7 + (-0.40204740975496797870e-8 + (0.14515984139495745330e-10 + (0.44921608954536047975e-11 - 0.18583341338983776219e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 28: {
+ double t = 2*y100 - 57;
+ return 0.25058626331812744775e0 + (0.79377285151602061328e-2 + (0.13704268650417478346e-3 + (0.11427511739544695861e-5 + (-0.10485442447768377485e-6 + (-0.34850364756499369763e-8 + (0.72656453829502179208e-10 + (0.36195460197779299406e-11 - 0.84882136022200714710e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 29: {
+ double t = 2*y100 - 59;
+ return 0.26701724900280689785e0 + (0.84959936119625864274e-2 + (0.14112359443938883232e-3 + (0.17800427288596909634e-6 + (-0.13443492107643109071e-6 + (-0.23512456315677680293e-8 + (0.11245846264695936769e-9 + (0.19850501334649565404e-11 - 0.11284666134635050832e-12 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 30: {
+ double t = 2*y100 - 61;
+ return 0.28457293586253654144e0 + (0.90581563892650431899e-2 + (0.13880520331140646738e-3 + (-0.97262302362522896157e-6 + (-0.15077100040254187366e-6 + (-0.88574317464577116689e-9 + (0.12760311125637474581e-9 + (0.20155151018282695055e-12 - 0.10514169375181734921e-12 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 31: {
+ double t = 2*y100 - 63;
+ return 0.30323425595617385705e0 + (0.95968346790597422934e-2 + (0.12931067776725883939e-3 + (-0.21938741702795543986e-5 + (-0.15202888584907373963e-6 + (0.61788350541116331411e-9 + (0.11957835742791248256e-9 + (-0.12598179834007710908e-11 - 0.75151817129574614194e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 32: {
+ double t = 2*y100 - 65;
+ return 0.32292521181517384379e0 + (0.10082957727001199408e-1 + (0.11257589426154962226e-3 + (-0.33670890319327881129e-5 + (-0.13910529040004008158e-6 + (0.19170714373047512945e-8 + (0.94840222377720494290e-10 + (-0.21650018351795353201e-11 - 0.37875211678024922689e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 33: {
+ double t = 2*y100 - 67;
+ return 0.34351233557911753862e0 + (0.10488575435572745309e-1 + (0.89209444197248726614e-4 + (-0.43893459576483345364e-5 + (-0.11488595830450424419e-6 + (0.28599494117122464806e-8 + (0.61537542799857777779e-10 - 0.24935749227658002212e-11 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 34: {
+ double t = 2*y100 - 69;
+ return 0.36480946642143669093e0 + (0.10789304203431861366e-1 + (0.60357993745283076834e-4 + (-0.51855862174130669389e-5 + (-0.83291664087289801313e-7 + (0.33898011178582671546e-8 + (0.27082948188277716482e-10 + (-0.23603379397408694974e-11 + 0.19328087692252869842e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 35: {
+ double t = 2*y100 - 71;
+ return 0.38658679935694939199e0 + (0.10966119158288804999e-1 + (0.27521612041849561426e-4 + (-0.57132774537670953638e-5 + (-0.48404772799207914899e-7 + (0.35268354132474570493e-8 + (-0.32383477652514618094e-11 + (-0.19334202915190442501e-11 + 0.32333189861286460270e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 36: {
+ double t = 2*y100 - 73;
+ return 0.40858275583808707870e0 + (0.11006378016848466550e-1 + (-0.76396376685213286033e-5 + (-0.59609835484245791439e-5 + (-0.13834610033859313213e-7 + (0.33406952974861448790e-8 + (-0.26474915974296612559e-10 + (-0.13750229270354351983e-11 + 0.36169366979417390637e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 37: {
+ double t = 2*y100 - 75;
+ return 0.43051714914006682977e0 + (0.10904106549500816155e-1 + (-0.43477527256787216909e-4 + (-0.59429739547798343948e-5 + (0.17639200194091885949e-7 + (0.29235991689639918688e-8 + (-0.41718791216277812879e-10 + (-0.81023337739508049606e-12 + 0.33618915934461994428e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 38: {
+ double t = 2*y100 - 77;
+ return 0.45210428135559607406e0 + (0.10659670756384400554e-1 + (-0.78488639913256978087e-4 + (-0.56919860886214735936e-5 + (0.44181850467477733407e-7 + (0.23694306174312688151e-8 + (-0.49492621596685443247e-10 + (-0.31827275712126287222e-12 + 0.27494438742721623654e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 39: {
+ double t = 2*y100 - 79;
+ return 0.47306491195005224077e0 + (0.10279006119745977570e-1 + (-0.11140268171830478306e-3 + (-0.52518035247451432069e-5 + (0.64846898158889479518e-7 + (0.17603624837787337662e-8 + (-0.51129481592926104316e-10 + (0.62674584974141049511e-13 + 0.20055478560829935356e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 40: {
+ double t = 2*y100 - 81;
+ return 0.49313638965719857647e0 + (0.97725799114772017662e-2 + (-0.14122854267291533334e-3 + (-0.46707252568834951907e-5 + (0.79421347979319449524e-7 + (0.11603027184324708643e-8 + (-0.48269605844397175946e-10 + (0.32477251431748571219e-12 + 0.12831052634143527985e-13 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 41: {
+ double t = 2*y100 - 83;
+ return 0.51208057433416004042e0 + (0.91542422354009224951e-2 + (-0.16726530230228647275e-3 + (-0.39964621752527649409e-5 + (0.88232252903213171454e-7 + (0.61343113364949928501e-9 + (-0.42516755603130443051e-10 + (0.47910437172240209262e-12 + 0.66784341874437478953e-14 * t) * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 42: {
+ double t = 2*y100 - 85;
+ return 0.52968945458607484524e0 + (0.84400880445116786088e-2 + (-0.18908729783854258774e-3 + (-0.32725905467782951931e-5 + (0.91956190588652090659e-7 + (0.14593989152420122909e-9 + (-0.35239490687644444445e-10 + 0.54613829888448694898e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 43: {
+ double t = 2*y100 - 87;
+ return 0.54578857454330070965e0 + (0.76474155195880295311e-2 + (-0.20651230590808213884e-3 + (-0.25364339140543131706e-5 + (0.91455367999510681979e-7 + (-0.23061359005297528898e-9 + (-0.27512928625244444444e-10 + 0.54895806008493285579e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 44: {
+ double t = 2*y100 - 89;
+ return 0.56023851910298493910e0 + (0.67938321739997196804e-2 + (-0.21956066613331411760e-3 + (-0.18181127670443266395e-5 + (0.87650335075416845987e-7 + (-0.51548062050366615977e-9 + (-0.20068462174044444444e-10 + 0.50912654909758187264e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 45: {
+ double t = 2*y100 - 91;
+ return 0.57293478057455721150e0 + (0.58965321010394044087e-2 + (-0.22841145229276575597e-3 + (-0.11404605562013443659e-5 + (0.81430290992322326296e-7 + (-0.71512447242755357629e-9 + (-0.13372664928000000000e-10 + 0.44461498336689298148e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 46: {
+ double t = 2*y100 - 93;
+ return 0.58380635448407827360e0 + (0.49717469530842831182e-2 + (-0.23336001540009645365e-3 + (-0.51952064448608850822e-6 + (0.73596577815411080511e-7 + (-0.84020916763091566035e-9 + (-0.76700972702222222221e-11 + 0.36914462807972467044e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 47: {
+ double t = 2*y100 - 95;
+ return 0.59281340237769489597e0 + (0.40343592069379730568e-2 + (-0.23477963738658326185e-3 + (0.34615944987790224234e-7 + (0.64832803248395814574e-7 + (-0.90329163587627007971e-9 + (-0.30421940400000000000e-11 + 0.29237386653743536669e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 48: {
+ double t = 2*y100 - 97;
+ return 0.59994428743114271918e0 + (0.30976579788271744329e-2 + (-0.23308875765700082835e-3 + (0.51681681023846925160e-6 + (0.55694594264948268169e-7 + (-0.91719117313243464652e-9 + (0.53982743680000000000e-12 + 0.22050829296187771142e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 49: {
+ double t = 2*y100 - 99;
+ return 0.60521224471819875444e0 + (0.21732138012345456060e-2 + (-0.22872428969625997456e-3 + (0.92588959922653404233e-6 + (0.46612665806531930684e-7 + (-0.89393722514414153351e-9 + (0.31718550353777777778e-11 + 0.15705458816080549117e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 50: {
+ double t = 2*y100 - 101;
+ return 0.60865189969791123620e0 + (0.12708480848877451719e-2 + (-0.22212090111534847166e-3 + (0.12636236031532793467e-5 + (0.37904037100232937574e-7 + (-0.84417089968101223519e-9 + (0.49843180828444444445e-11 + 0.10355439441049048273e-12 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 51: {
+ double t = 2*y100 - 103;
+ return 0.61031580103499200191e0 + (0.39867436055861038223e-3 + (-0.21369573439579869291e-3 + (0.15339402129026183670e-5 + (0.29787479206646594442e-7 + (-0.77687792914228632974e-9 + (0.61192452741333333334e-11 + 0.60216691829459295780e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 52: {
+ double t = 2*y100 - 105;
+ return 0.61027109047879835868e0 + (-0.43680904508059878254e-3 + (-0.20383783788303894442e-3 + (0.17421743090883439959e-5 + (0.22400425572175715576e-7 + (-0.69934719320045128997e-9 + (0.67152759655111111110e-11 + 0.26419960042578359995e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 53: {
+ double t = 2*y100 - 107;
+ return 0.60859639489217430521e0 + (-0.12305921390962936873e-2 + (-0.19290150253894682629e-3 + (0.18944904654478310128e-5 + (0.15815530398618149110e-7 + (-0.61726850580964876070e-9 + 0.68987888999111111110e-11 * t) * t) * t) * t) * t) * t;
+ }
+ case 54: {
+ double t = 2*y100 - 109;
+ return 0.60537899426486075181e0 + (-0.19790062241395705751e-2 + (-0.18120271393047062253e-3 + (0.19974264162313241405e-5 + (0.10055795094298172492e-7 + (-0.53491997919318263593e-9 + (0.67794550295111111110e-11 - 0.17059208095741511603e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 55: {
+ double t = 2*y100 - 111;
+ return 0.60071229457904110537e0 + (-0.26795676776166354354e-2 + (-0.16901799553627508781e-3 + (0.20575498324332621581e-5 + (0.51077165074461745053e-8 + (-0.45536079828057221858e-9 + (0.64488005516444444445e-11 - 0.29311677573152766338e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 56: {
+ double t = 2*y100 - 113;
+ return 0.59469361520112714738e0 + (-0.33308208190600993470e-2 + (-0.15658501295912405679e-3 + (0.20812116912895417272e-5 + (0.93227468760614182021e-9 + (-0.38066673740116080415e-9 + (0.59806790359111111110e-11 - 0.36887077278950440597e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 57: {
+ double t = 2*y100 - 115;
+ return 0.58742228631775388268e0 + (-0.39321858196059227251e-2 + (-0.14410441141450122535e-3 + (0.20743790018404020716e-5 + (-0.25261903811221913762e-8 + (-0.31212416519526924318e-9 + (0.54328422462222222221e-11 - 0.40864152484979815972e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 58: {
+ double t = 2*y100 - 117;
+ return 0.57899804200033018447e0 + (-0.44838157005618913447e-2 + (-0.13174245966501437965e-3 + (0.20425306888294362674e-5 + (-0.53330296023875447782e-8 + (-0.25041289435539821014e-9 + (0.48490437205333333334e-11 - 0.42162206939169045177e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 59: {
+ double t = 2*y100 - 119;
+ return 0.56951968796931245974e0 + (-0.49864649488074868952e-2 + (-0.11963416583477567125e-3 + (0.19906021780991036425e-5 + (-0.75580140299436494248e-8 + (-0.19576060961919820491e-9 + (0.42613011928888888890e-11 - 0.41539443304115604377e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 60: {
+ double t = 2*y100 - 121;
+ return 0.55908401930063918964e0 + (-0.54413711036826877753e-2 + (-0.10788661102511914628e-3 + (0.19229663322982839331e-5 + (-0.92714731195118129616e-8 + (-0.14807038677197394186e-9 + (0.36920870298666666666e-11 - 0.39603726688419162617e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 61: {
+ double t = 2*y100 - 123;
+ return 0.54778496152925675315e0 + (-0.58501497933213396670e-2 + (-0.96582314317855227421e-4 + (0.18434405235069270228e-5 + (-0.10541580254317078711e-7 + (-0.10702303407788943498e-9 + (0.31563175582222222222e-11 - 0.36829748079110481422e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 62: {
+ double t = 2*y100 - 125;
+ return 0.53571290831682823999e0 + (-0.62147030670760791791e-2 + (-0.85782497917111760790e-4 + (0.17553116363443470478e-5 + (-0.11432547349815541084e-7 + (-0.72157091369041330520e-10 + (0.26630811607111111111e-11 - 0.33578660425893164084e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 63: {
+ double t = 2*y100 - 127;
+ return 0.52295422962048434978e0 + (-0.65371404367776320720e-2 + (-0.75530164941473343780e-4 + (0.16613725797181276790e-5 + (-0.12003521296598910761e-7 + (-0.42929753689181106171e-10 + (0.22170894940444444444e-11 - 0.30117697501065110505e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 64: {
+ double t = 2*y100 - 129;
+ return 0.50959092577577886140e0 + (-0.68197117603118591766e-2 + (-0.65852936198953623307e-4 + (0.15639654113906716939e-5 + (-0.12308007991056524902e-7 + (-0.18761997536910939570e-10 + (0.18198628922666666667e-11 - 0.26638355362285200932e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 65: {
+ double t = 2*y100 - 131;
+ return 0.49570040481823167970e0 + (-0.70647509397614398066e-2 + (-0.56765617728962588218e-4 + (0.14650274449141448497e-5 + (-0.12393681471984051132e-7 + (0.92904351801168955424e-12 + (0.14706755960177777778e-11 - 0.23272455351266325318e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 66: {
+ double t = 2*y100 - 133;
+ return 0.48135536250935238066e0 + (-0.72746293327402359783e-2 + (-0.48272489495730030780e-4 + (0.13661377309113939689e-5 + (-0.12302464447599382189e-7 + (0.16707760028737074907e-10 + (0.11672928324444444444e-11 - 0.20105801424709924499e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 67: {
+ double t = 2*y100 - 135;
+ return 0.46662374675511439448e0 + (-0.74517177649528487002e-2 + (-0.40369318744279128718e-4 + (0.12685621118898535407e-5 + (-0.12070791463315156250e-7 + (0.29105507892605823871e-10 + (0.90653314645333333334e-12 - 0.17189503312102982646e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 68: {
+ double t = 2*y100 - 137;
+ return 0.45156879030168268778e0 + (-0.75983560650033817497e-2 + (-0.33045110380705139759e-4 + (0.11732956732035040896e-5 + (-0.11729986947158201869e-7 + (0.38611905704166441308e-10 + (0.68468768305777777779e-12 - 0.14549134330396754575e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 69: {
+ double t = 2*y100 - 139;
+ return 0.43624909769330896904e0 + (-0.77168291040309554679e-2 + (-0.26283612321339907756e-4 + (0.10811018836893550820e-5 + (-0.11306707563739851552e-7 + (0.45670446788529607380e-10 + (0.49782492549333333334e-12 - 0.12191983967561779442e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 70: {
+ double t = 2*y100 - 141;
+ return 0.42071877443548481181e0 + (-0.78093484015052730097e-2 + (-0.20064596897224934705e-4 + (0.99254806680671890766e-6 + (-0.10823412088884741451e-7 + (0.50677203326904716247e-10 + (0.34200547594666666666e-12 - 0.10112698698356194618e-13 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 71: {
+ double t = 2*y100 - 143;
+ return 0.40502758809710844280e0 + (-0.78780384460872937555e-2 + (-0.14364940764532853112e-4 + (0.90803709228265217384e-6 + (-0.10298832847014466907e-7 + (0.53981671221969478551e-10 + (0.21342751381333333333e-12 - 0.82975901848387729274e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 72: {
+ double t = 2*y100 - 145;
+ return 0.38922115269731446690e0 + (-0.79249269708242064120e-2 + (-0.91595258799106970453e-5 + (0.82783535102217576495e-6 + (-0.97484311059617744437e-8 + (0.55889029041660225629e-10 + (0.10851981336888888889e-12 - 0.67278553237853459757e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 73: {
+ double t = 2*y100 - 147;
+ return 0.37334112915460307335e0 + (-0.79519385109223148791e-2 + (-0.44219833548840469752e-5 + (0.75209719038240314732e-6 + (-0.91848251458553190451e-8 + (0.56663266668051433844e-10 + (0.23995894257777777778e-13 - 0.53819475285389344313e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 74: {
+ double t = 2*y100 - 149;
+ return 0.35742543583374223085e0 + (-0.79608906571527956177e-2 + (-0.12530071050975781198e-6 + (0.68088605744900552505e-6 + (-0.86181844090844164075e-8 + (0.56530784203816176153e-10 + (-0.43120012248888888890e-13 - 0.42372603392496813810e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 75: {
+ double t = 2*y100 - 151;
+ return 0.34150846431979618536e0 + (-0.79534924968773806029e-2 + (0.37576885610891515813e-5 + (0.61419263633090524326e-6 + (-0.80565865409945960125e-8 + (0.55684175248749269411e-10 + (-0.95486860764444444445e-13 - 0.32712946432984510595e-14 * t) * t) * t) * t) * t) * t) * t;
+ }
+ case 76: {
+ double t = 2*y100 - 153;
+ return 0.32562129649136346824e0 + (-0.79313448067948884309e-2 + (0.72539159933545300034e-5 + (0.55195028297415503083e-6 + (-0.75063365335570475258e-8 + (0.54281686749699595941e-10 - 0.13545424295111111111e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 77: {
+ double t = 2*y100 - 155;
+ return 0.30979191977078391864e0 + (-0.78959416264207333695e-2 + (0.10389774377677210794e-4 + (0.49404804463196316464e-6 + (-0.69722488229411164685e-8 + (0.52469254655951393842e-10 - 0.16507860650666666667e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 78: {
+ double t = 2*y100 - 157;
+ return 0.29404543811214459904e0 + (-0.78486728990364155356e-2 + (0.13190885683106990459e-4 + (0.44034158861387909694e-6 + (-0.64578942561562616481e-8 + (0.50354306498006928984e-10 - 0.18614473550222222222e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 79: {
+ double t = 2*y100 - 159;
+ return 0.27840427686253660515e0 + (-0.77908279176252742013e-2 + (0.15681928798708548349e-4 + (0.39066226205099807573e-6 + (-0.59658144820660420814e-8 + (0.48030086420373141763e-10 - 0.20018995173333333333e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 80: {
+ double t = 2*y100 - 161;
+ return 0.26288838011163800908e0 + (-0.77235993576119469018e-2 + (0.17886516796198660969e-4 + (0.34482457073472497720e-6 + (-0.54977066551955420066e-8 + (0.45572749379147269213e-10 - 0.20852924954666666667e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 81: {
+ double t = 2*y100 - 163;
+ return 0.24751539954181029717e0 + (-0.76480877165290370975e-2 + (0.19827114835033977049e-4 + (0.30263228619976332110e-6 + (-0.50545814570120129947e-8 + (0.43043879374212005966e-10 - 0.21228012028444444444e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 82: {
+ double t = 2*y100 - 165;
+ return 0.23230087411688914593e0 + (-0.75653060136384041587e-2 + (0.21524991113020016415e-4 + (0.26388338542539382413e-6 + (-0.46368974069671446622e-8 + (0.40492715758206515307e-10 - 0.21238627815111111111e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 83: {
+ double t = 2*y100 - 167;
+ return 0.21725840021297341931e0 + (-0.74761846305979730439e-2 + (0.23000194404129495243e-4 + (0.22837400135642906796e-6 + (-0.42446743058417541277e-8 + (0.37958104071765923728e-10 - 0.20963978568888888889e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 84: {
+ double t = 2*y100 - 169;
+ return 0.20239979200788191491e0 + (-0.73815761980493466516e-2 + (0.24271552727631854013e-4 + (0.19590154043390012843e-6 + (-0.38775884642456551753e-8 + (0.35470192372162901168e-10 - 0.20470131678222222222e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 85: {
+ double t = 2*y100 - 171;
+ return 0.18773523211558098962e0 + (-0.72822604530339834448e-2 + (0.25356688567841293697e-4 + (0.16626710297744290016e-6 + (-0.35350521468015310830e-8 + (0.33051896213898864306e-10 - 0.19811844544000000000e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 86: {
+ double t = 2*y100 - 173;
+ return 0.17327341258479649442e0 + (-0.71789490089142761950e-2 + (0.26272046822383820476e-4 + (0.13927732375657362345e-6 + (-0.32162794266956859603e-8 + (0.30720156036105652035e-10 - 0.19034196304000000000e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 87: {
+ double t = 2*y100 - 175;
+ return 0.15902166648328672043e0 + (-0.70722899934245504034e-2 + (0.27032932310132226025e-4 + (0.11474573347816568279e-6 + (-0.29203404091754665063e-8 + (0.28487010262547971859e-10 - 0.18174029063111111111e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 88: {
+ double t = 2*y100 - 177;
+ return 0.14498609036610283865e0 + (-0.69628725220045029273e-2 + (0.27653554229160596221e-4 + (0.92493727167393036470e-7 + (-0.26462055548683583849e-8 + (0.26360506250989943739e-10 - 0.17261211260444444444e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 89: {
+ double t = 2*y100 - 179;
+ return 0.13117165798208050667e0 + (-0.68512309830281084723e-2 + (0.28147075431133863774e-4 + (0.72351212437979583441e-7 + (-0.23927816200314358570e-8 + (0.24345469651209833155e-10 - 0.16319736960000000000e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 90: {
+ double t = 2*y100 - 181;
+ return 0.11758232561160626306e0 + (-0.67378491192463392927e-2 + (0.28525664781722907847e-4 + (0.54156999310046790024e-7 + (-0.21589405340123827823e-8 + (0.22444150951727334619e-10 - 0.15368675584000000000e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 91: {
+ double t = 2*y100 - 183;
+ return 0.10422112945361673560e0 + (-0.66231638959845581564e-2 + (0.28800551216363918088e-4 + (0.37758983397952149613e-7 + (-0.19435423557038933431e-8 + (0.20656766125421362458e-10 - 0.14422990012444444444e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 92: {
+ double t = 2*y100 - 185;
+ return 0.91090275493541084785e-1 + (-0.65075691516115160062e-2 + (0.28982078385527224867e-4 + (0.23014165807643012781e-7 + (-0.17454532910249875958e-8 + (0.18981946442680092373e-10 - 0.13494234691555555556e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 93: {
+ double t = 2*y100 - 187;
+ return 0.78191222288771379358e-1 + (-0.63914190297303976434e-2 + (0.29079759021299682675e-4 + (0.97885458059415717014e-8 + (-0.15635596116134296819e-8 + (0.17417110744051331974e-10 - 0.12591151763555555556e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 94: {
+ double t = 2*y100 - 189;
+ return 0.65524757106147402224e-1 + (-0.62750311956082444159e-2 + (0.29102328354323449795e-4 + (-0.20430838882727954582e-8 + (-0.13967781903855367270e-8 + (0.15958771833747057569e-10 - 0.11720175765333333333e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 95: {
+ double t = 2*y100 - 191;
+ return 0.53091065838453612773e-1 + (-0.61586898417077043662e-2 + (0.29057796072960100710e-4 + (-0.12597414620517987536e-7 + (-0.12440642607426861943e-8 + (0.14602787128447932137e-10 - 0.10885859114666666667e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 96: {
+ double t = 2*y100 - 193;
+ return 0.40889797115352738582e-1 + (-0.60426484889413678200e-2 + (0.28953496450191694606e-4 + (-0.21982952021823718400e-7 + (-0.11044169117553026211e-8 + (0.13344562332430552171e-10 - 0.10091231402844444444e-12 * t) * t) * t) * t) * t) * t;
+ }
+ case 97: case 98:
+ case 99: case 100: { // use Taylor expansion for small x (|x| <= 0.0309...)
+ // (2/sqrt(pi)) * (x - 2/3 x^3 + 4/15 x^5 - 8/105 x^7 + 16/945 x^9)
+ double x2 = x*x;
+ return x * (1.1283791670955125739
+ - x2 * (0.75225277806367504925
+ - x2 * (0.30090111122547001970
+ - x2 * (0.085971746064420005629
+ - x2 * 0.016931216931216931217))));
+ }
+ }
+ /* Since 0 <= y100 < 101, this is only reached if x is NaN,
+ in which case we should return NaN. */
+ return NaN;
+} // w_im_y100
+
+/******************************************************************************/
+/* Library function im_w_of_x */
+/******************************************************************************/
+
+double im_w_of_x(double x)
+{
+
+ // Steven G. Johnson, October 2012.
+
+ // Uses methods similar to the erfcx calculation:
+ // continued fractions for large |x|,
+ // a lookup table of Chebyshev polynomials for smaller |x|,
+ // and finally a Taylor expansion for |x|<0.01.
+
+ const double ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
+
+ if (x >= 0) {
+ if (x > 45) {
+ // continued-fraction expansion is faster
+ if (x > 5e7) {
+ // 1-term expansion, important to avoid overflow
+ return ispi / x;
+ } else {
+ // 5-term expansion (rely on compiler for CSE), simplified from: ispi / (x-0.5/(x-1/(x-1.5/(x-2/x))))
+ return ispi*((x*x) * (x*x-4.5) + 2) / (x * ((x*x) * (x*x-5) + 3.75));
+ }
+ } else {
+ return w_im_y100(100/(1+x), x);
+ }
+ } else {
+ // = -im_w_of_x(-x)
+ if (x < -45) {
+ // continued-fraction expansion is faster
+ if (x < -5e7) {
+ // 1-term expansion, important to avoid overflow
+ return ispi / x;
+ } else {
+ // 5-term expansion (rely on compiler for CSE), simplified from: ispi / (x-0.5/(x-1/(x-1.5/(x-2/x))))
+ return ispi*((x*x) * (x*x-4.5) + 2) / (x * ((x*x) * (x*x-5) + 3.75));
+ }
+ } else {
+ return -w_im_y100(100/(1-x), -x);
+ }
+ }
+}
+
+// im_w_of_x
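+
+/* Illustrative only, disabled: for |x| > 5e7 the code above returns the
+ 1-term asymptote im_w_of_x(x) ~ 1/(sqrt(pi)*x); the helper name is made up
+ and <assert.h> is assumed. */
+#if 0
+static void check_im_w_asymptote(void)
+{
+ double x = 1e8;
+ double approx = 0.56418958354775628694807945156 / x;
+ assert(fabs(im_w_of_x(x) - approx) <= 1e-6 * approx);
+}
+#endif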
diff --git a/source/luametatex/source/libraries/libcerf/readme-luametatex.txt b/source/luametatex/source/libraries/libcerf/readme-luametatex.txt
new file mode 100644
index 000000000..bb552f263
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/readme-luametatex.txt
@@ -0,0 +1,26 @@
+LS,
+
+In the following files you can find the comment below. We don't want to bother or burden the
+original authors with our problems. The cerf code is mostly used in MetaFun macros (by Alan
+Braslau). The c.h and cpp.h files are gone.
+
+ defs.h
+ cerf.h
+
+---------------------------------------------------------------------------------------------
+This file is patched by Mojca Miklavec and Hans Hagen for usage in LuaMetaTeX where we use
+only C and also want to compile with the Microsoft compiler. So, when updating this library
+one has to check for changes. Not that we expect many, as this is a rather stable library.
+
+In the other files there are a few macros used that deal with the multiplication and addition
+of complex and real numbers. Of course the original code is kept as-is.
+---------------------------------------------------------------------------------------------
+
+So, when updating the library you need to diff for the changes that are needed in order to
+compile the files with the Microsoft compiler.
+
+At some point I might patch the files so that we can intercept error messages in a way that
+permits recovery and also plugs them into our normal message handlers. Maybe I should also
+merge the code into just one file because it doesn't change.
+
+Hans
diff --git a/source/luametatex/source/libraries/libcerf/w_of_z.c b/source/luametatex/source/libraries/libcerf/w_of_z.c
new file mode 100644
index 000000000..33778979c
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/w_of_z.c
@@ -0,0 +1,393 @@
+/* Library libcerf:
+ * Compute complex error functions, based on a new implementation of
+ * Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
+ *
+ * File w_of_z.c:
+ * Computation of Faddeeva's complex scaled error function,
+ * w(z) = exp(-z^2) * erfc(-i*z),
+ * nameless function (7.1.3) of Abramowitz&Stegun (1964),
+ * also known as the plasma dispersion function.
+ *
+ * This implementation uses a combination of different algorithms.
+ * See man 3 w_of_z for references.
+ *
+ * Copyright:
+ * (C) 2012 Massachusetts Institute of Technology
+ * (C) 2013 Forschungszentrum Jülich GmbH
+ *
+ * Licence:
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author
+ * Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer
+ *
+ * Website:
+ * http://apps.jcns.fz-juelich.de/libcerf
+ *
+ * Revision history:
+ * ../CHANGELOG
+ *
+ * Man page:
+ * w_of_z(3)
+ */
+
+/*
+
+ Todo: use local declarations (older compilers) (HH).
+
+*/
+
+/*
+ Computes various error functions (erf, erfc, erfi, erfcx),
+ including the Dawson integral, in the complex plane, based
+ on algorithms for the computation of the Faddeeva function
+ w(z) = exp(-z^2) * erfc(-i*z).
+ Given w(z), the error functions are mostly straightforward
+ to compute, except for certain regions where we have to
+ switch to Taylor expansions to avoid cancellation errors
+ [e.g. near the origin for erf(z)].
+
+*/
+
+#include "cerf.h"
+#include <float.h>
+#include <math.h>
+#include "defs.h" // defines _cerf_cmplx, NaN, C, cexp, ...
+
+// for analysing the algorithm:
+EXPORT int faddeeva_algorithm;
+EXPORT int faddeeva_nofterms;
+
+/******************************************************************************/
+/* auxiliary functions */
+/******************************************************************************/
+
+static inline double sinc(double x, double sinx)
+{
+ // return sinc(x) = sin(x)/x, given both x and sin(x)
+ // [since we only use this in cases where sin(x) has already been computed]
+ return fabs(x) < 1e-4 ? 1 - (0.1666666666666666666667)*x*x : sinx / x;
+}
+
+static inline double sinh_taylor(double x)
+{
+ // sinh(x) via Taylor series, accurate to machine precision for |x| < 1e-2
+ return x * (1 + (x*x) * (0.1666666666666666666667 + 0.00833333333333333333333 * (x*x)));
+}
+
+static inline double sqr(double x) { return x*x; }
+
+/******************************************************************************/
+/* precomputed table of expa2n2[n-1] = exp(-a2*n*n) */
+/* for double-precision a2 = 0.26865... in w_of_z, below. */
+/******************************************************************************/
+
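+// For example, the first entry is exp(-a2) = exp(-0.268657...) ~ 7.644e-01;
+// successive entries decay like a Gaussian in n, down to ~3.3e-304 at n = 51.
+// The final 0.0 stands in for the underflowed n = 52 term and also guards
+// reads past the end of the array.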
+static const double expa2n2[] = {
+ 7.64405281671221563e-01,
+ 3.41424527166548425e-01,
+ 8.91072646929412548e-02,
+ 1.35887299055460086e-02,
+ 1.21085455253437481e-03,
+ 6.30452613933449404e-05,
+ 1.91805156577114683e-06,
+ 3.40969447714832381e-08,
+ 3.54175089099469393e-10,
+ 2.14965079583260682e-12,
+ 7.62368911833724354e-15,
+ 1.57982797110681093e-17,
+ 1.91294189103582677e-20,
+ 1.35344656764205340e-23,
+ 5.59535712428588720e-27,
+ 1.35164257972401769e-30,
+ 1.90784582843501167e-34,
+ 1.57351920291442930e-38,
+ 7.58312432328032845e-43,
+ 2.13536275438697082e-47,
+ 3.51352063787195769e-52,
+ 3.37800830266396920e-57,
+ 1.89769439468301000e-62,
+ 6.22929926072668851e-68,
+ 1.19481172006938722e-73,
+ 1.33908181133005953e-79,
+ 8.76924303483223939e-86,
+ 3.35555576166254986e-92,
+ 7.50264110688173024e-99,
+ 9.80192200745410268e-106,
+ 7.48265412822268959e-113,
+ 3.33770122566809425e-120,
+ 8.69934598159861140e-128,
+ 1.32486951484088852e-135,
+ 1.17898144201315253e-143,
+ 6.13039120236180012e-152,
+ 1.86258785950822098e-160,
+ 3.30668408201432783e-169,
+ 3.43017280887946235e-178,
+ 2.07915397775808219e-187,
+ 7.36384545323984966e-197,
+ 1.52394760394085741e-206,
+ 1.84281935046532100e-216,
+ 1.30209553802992923e-226,
+ 5.37588903521080531e-237,
+ 1.29689584599763145e-247,
+ 1.82813078022866562e-258,
+ 1.50576355348684241e-269,
+ 7.24692320799294194e-281,
+ 2.03797051314726829e-292,
+ 3.34880215927873807e-304,
+ 0.0 // underflow (also prevents reads past array end, below)
+}; // expa2n2
+
+/******************************************************************************/
+/* w_of_z, Faddeeva's scaled complex error function */
+/******************************************************************************/
+
+_cerf_cmplx w_of_z(_cerf_cmplx z)
+{
+ faddeeva_nofterms = 0;
+
+ // Steven G. Johnson, October 2012.
+
+ if (creal(z) == 0.0) {
+ // Purely imaginary input, purely real output.
+ // However, use creal(z) to give correct sign of 0 in cimag(w).
+ return C(erfcx(cimag(z)), creal(z));
+ }
+ if (cimag(z) == 0) {
+ // Purely real input, complex output.
+ return C(exp(-sqr(creal(z))), im_w_of_x(creal(z)));
+ }
+
+ const double relerr = DBL_EPSILON;
+ const double a = 0.518321480430085929872; // pi / sqrt(-log(eps*0.5))
+ const double c = 0.329973702884629072537; // (2/pi) * a;
+ const double a2 = 0.268657157075235951582; // a^2
+
+ const double x = fabs(creal(z));
+ const double y = cimag(z);
+ const double ya = fabs(y);
+
+ _cerf_cmplx ret = C(0., 0.); // return value
+
+ double sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0, sum5 = 0;
+
+ if (ya > 7 || (x > 6 // continued fraction is faster
+ /* As pointed out by M. Zaghloul, the continued
+ fraction seems to give a large relative error in
+ Re w(z) for |x| ~ 6 and small |y|, so use
+ algorithm 816 in this region: */
+ && (ya > 0.1 || (x > 8 && ya > 1e-10) || x > 28))) {
+
+ faddeeva_algorithm = 100;
+
+ /* Poppe & Wijers suggest using a number of terms
+ nu = 3 + 1442 / (26*rho + 77)
+ where rho = sqrt((x/x0)^2 + (y/y0)^2) where x0=6.3, y0=4.4.
+ (They only use this expansion for rho >= 1, but rho a little less
+ than 1 seems okay too.)
+ Instead, I did my own fit to a slightly different function
+ that avoids the hypotenuse calculation, using NLopt to minimize
+ the sum of the squares of the errors in nu with the constraint
+ that the estimated nu be >= minimum nu to attain machine precision.
+ I also separate the regions where nu == 2 and nu == 1. */
+ const double ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
+ double xs = y < 0 ? -creal(z) : creal(z); // compute for -z if y < 0
+ if (x + ya > 4000) { // nu <= 2
+ if (x + ya > 1e7) { // nu == 1, w(z) = i/sqrt(pi) / z
+ // scale to avoid overflow
+ if (x > ya) {
+ faddeeva_algorithm += 1;
+ double yax = ya / xs;
+ double denom = ispi / (xs + yax*ya);
+ ret = C(denom*yax, denom);
+ }
+ else if (isinf(ya)) {
+ faddeeva_algorithm += 2;
+ return ((isnan(x) || y < 0)
+ ? C(NaN,NaN) : C(0,0));
+ }
+ else {
+ faddeeva_algorithm += 3;
+ double xya = xs / ya;
+ double denom = ispi / (xya*xs + ya);
+ ret = C(denom, denom*xya);
+ }
+ }
+ else { // nu == 2, w(z) = i/sqrt(pi) * z / (z*z - 0.5)
+ faddeeva_algorithm += 4;
+ double dr = xs*xs - ya*ya - 0.5, di = 2*xs*ya;
+ double denom = ispi / (dr*dr + di*di);
+ ret = C(denom * (xs*di-ya*dr), denom * (xs*dr+ya*di));
+ }
+ }
+ else { // compute nu(z) estimate and do general continued fraction
+ faddeeva_algorithm += 5;
+ const double c0=3.9, c1=11.398, c2=0.08254, c3=0.1421, c4=0.2023; // fit
+ double nu = floor(c0 + c1 / (c2*x + c3*ya + c4));
+ double wr = xs, wi = ya;
+ for (nu = 0.5 * (nu - 1); nu > 0.4; nu -= 0.5) {
+ // w <- z - nu/w:
+ double denom = nu / (wr*wr + wi*wi);
+ wr = xs - wr * denom;
+ wi = ya + wi * denom;
+ }
+ { // w(z) = i/sqrt(pi) / w:
+ double denom = ispi / (wr*wr + wi*wi);
+ ret = C(denom*wi, denom*wr);
+ }
+ }
+ if (y < 0) {
+ faddeeva_algorithm += 10;
+ // use w(z) = 2.0*exp(-z*z) - w(-z),
+ // but be careful of overflow in exp(-z*z)
+ // = exp(-(xs*xs-ya*ya) -2*i*xs*ya)
+ return complex_sub_cc(complex_mul_rc(2.0,cexp(C((ya-xs)*(xs+ya), 2*xs*y))), ret);
+ }
+ else
+ return ret;
+ }
+
+ /* Note: The test that seems to be suggested in the paper is x <
+ sqrt(-log(DBL_MIN)), about 26.6, since otherwise exp(-x^2)
+ underflows to zero and sum1,sum2,sum4 are zero. However, long
+ before this occurs, the sum1,sum2,sum4 contributions are
+ negligible in double precision; I find that this happens for x >
+ about 6, for all y. On the other hand, I find that the case
+ where we compute all of the sums is faster (at least with the
+ precomputed expa2n2 table) until about x=10. Furthermore, if we
+ try to compute all of the sums for x > 20, I find that we
+ sometimes run into numerical problems because underflow/overflow
+ problems start to appear in the various coefficients of the sums,
+ below. Therefore, we use x < 10 here. */
+ else if (x < 10) {
+
+ faddeeva_algorithm = 200;
+
+ double prod2ax = 1, prodm2ax = 1;
+ double expx2;
+
+ if (isnan(y)) {
+ faddeeva_algorithm += 99;
+ return C(y,y);
+ }
+
+ if (x < 5e-4) { // compute sum4 and sum5 together as sum5-sum4
+ // This special case is needed for accuracy.
+ faddeeva_algorithm += 1;
+ const double x2 = x*x;
+ expx2 = 1 - x2 * (1 - 0.5*x2); // exp(-x*x) via Taylor
+ // compute exp(2*a*x) and exp(-2*a*x) via Taylor, to double precision
+ const double ax2 = 1.036642960860171859744*x; // 2*a*x
+ const double exp2ax =
+ 1 + ax2 * (1 + ax2 * (0.5 + 0.166666666666666666667*ax2));
+ const double expm2ax =
+ 1 - ax2 * (1 - ax2 * (0.5 - 0.166666666666666666667*ax2));
+ for (int n = 1; ; ++n) {
+ ++faddeeva_nofterms;
+ const double coef = expa2n2[n-1] * expx2 / (a2*(n*n) + y*y);
+ prod2ax *= exp2ax;
+ prodm2ax *= expm2ax;
+ sum1 += coef;
+ sum2 += coef * prodm2ax;
+ sum3 += coef * prod2ax;
+
+ // really = sum5 - sum4
+ sum5 += coef * (2*a) * n * sinh_taylor((2*a)*n*x);
+
+ // test convergence via sum3
+ if (coef * prod2ax < relerr * sum3) break;
+ }
+ }
+ else { // x > 5e-4, compute sum4 and sum5 separately
+ faddeeva_algorithm += 2;
+ expx2 = exp(-x*x);
+ const double exp2ax = exp((2*a)*x), expm2ax = 1 / exp2ax;
+ for (int n = 1; ; ++n) {
+ ++faddeeva_nofterms;
+ const double coef = expa2n2[n-1] * expx2 / (a2*(n*n) + y*y);
+ prod2ax *= exp2ax;
+ prodm2ax *= expm2ax;
+ sum1 += coef;
+ sum2 += coef * prodm2ax;
+ sum4 += (coef * prodm2ax) * (a*n);
+ sum3 += coef * prod2ax;
+ sum5 += (coef * prod2ax) * (a*n);
+ // test convergence via sum5, since this sum has the slowest decay
+ if ((coef * prod2ax) * (a*n) < relerr * sum5) break;
+ }
+ }
+ const double expx2erfcxy = // avoid spurious overflow for large negative y
+ y > -6 // for y < -6, erfcx(y) = 2*exp(y*y) to double precision
+ ? expx2*erfcx(y) : 2*exp(y*y-x*x);
+ if (y > 5) { // imaginary terms cancel
+ faddeeva_algorithm += 10;
+ const double sinxy = sin(x*y);
+ ret = C((expx2erfcxy - c*y*sum1) * cos(2*x*y) + (c*x*expx2) * sinxy * sinc(x*y, sinxy), 0.0);
+ }
+ else {
+ faddeeva_algorithm += 20;
+ double xs = creal(z);
+ const double sinxy = sin(xs*y);
+ const double sin2xy = sin(2*xs*y), cos2xy = cos(2*xs*y);
+ const double coef1 = expx2erfcxy - c*y*sum1;
+ const double coef2 = c*xs*expx2;
+ ret = C(coef1 * cos2xy + coef2 * sinxy * sinc(xs*y, sinxy),
+ coef2 * sinc(2*xs*y, sin2xy) - coef1 * sin2xy);
+ }
+ }
+ else { // x large: only sum3 & sum5 contribute (see above note)
+
+ faddeeva_algorithm = 300;
+
+ if (isnan(x))
+ return C(x,x);
+ if (isnan(y))
+ return C(y,y);
+
+ ret = C(exp(-x*x),0.0); // |y| < 1e-10, so we only need exp(-x*x) term
+ // (round instead of ceil as in original paper; note that x/a > 1 here)
+ double n0 = floor(x/a + 0.5); // sum in both directions, starting at n0
+ double dx = a*n0 - x;
+ sum3 = exp(-dx*dx) / (a2*(n0*n0) + y*y);
+ sum5 = a*n0 * sum3;
+ double exp1 = exp(4*a*dx), exp1dn = 1;
+ int dn;
+ for (dn = 1; n0 - dn > 0; ++dn) { // loop over n0-dn and n0+dn terms
+ double np = n0 + dn, nm = n0 - dn;
+ double tp = exp(-sqr(a*dn+dx));
+ double tm = tp * (exp1dn *= exp1); // trick to get tm from tp
+ tp /= (a2*(np*np) + y*y);
+ tm /= (a2*(nm*nm) + y*y);
+ sum3 += tp + tm;
+ sum5 += a * (np * tp + nm * tm);
+ if (a * (np * tp + nm * tm) < relerr * sum5) goto finish;
+ }
+ while (1) { // loop over n0+dn terms only (since n0-dn <= 0)
+ double np = n0 + dn++;
+ double tp = exp(-sqr(a*dn+dx)) / (a2*(np*np) + y*y);
+ sum3 += tp;
+ sum5 += a * np * tp;
+ if (a * np * tp < relerr * sum5) goto finish;
+ }
+ }
+finish:
+ return complex_add_cc(ret, C((0.5*c)*y*(sum2+sum3),(0.5*c)*copysign(sum5-sum4, creal(z))));
+} // w_of_z
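
The continued-fraction branch above applies the downward recurrence w <- z - nu/w and then w(z) = i/sqrt(pi) / w. A minimal standalone sketch of that recurrence in C99 complex arithmetic (here the term count nterms is a caller-supplied assumption, whereas the code above fits it from x and ya):

    #include <complex.h>
    #include <stdio.h>

    /* Illustration only: the same downward recurrence as in w_of_z above,
       written with C99 complex doubles instead of separate real/imag parts. */
    static double complex w_cf(double complex z, int nterms)
    {
        const double ispi = 0.56418958354775628694807945156; /* 1/sqrt(pi) */
        double complex w = z;
        for (double nu = 0.5 * (nterms - 1); nu > 0.4; nu -= 0.5)
            w = z - nu / w;      /* w <- z - nu/w */
        return I * ispi / w;     /* w(z) = i/sqrt(pi) / w */
    }

    int main(void)
    {
        /* For large |z| the result should approach the one-term
           asymptote w(z) ~ i/(sqrt(pi)*z). */
        double complex z = 12.0 + 7.0*I;
        double complex w = w_cf(z, 11);
        printf("w(z) = %.12g + %.12gi\n", creal(w), cimag(w));
        return 0;
    }
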
diff --git a/source/luametatex/source/libraries/libcerf/width.c b/source/luametatex/source/libraries/libcerf/width.c
new file mode 100644
index 000000000..a844377ff
--- /dev/null
+++ b/source/luametatex/source/libraries/libcerf/width.c
@@ -0,0 +1,100 @@
+/* Library libcerf:
+ * Compute complex error functions, based on a new implementation of
+ * Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
+ *
+ * File width.c:
+ *   Compute voigt_hwhm, using Newton's iteration.
+ *
+ * Copyright:
+ * (C) 2018 Forschungszentrum Jülich GmbH
+ *
+ * Licence:
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Joachim Wuttke, Forschungszentrum Jülich, 2018
+ *
+ * Website:
+ * http://apps.jcns.fz-juelich.de/libcerf
+ *
+ * Revision history:
+ * ../CHANGELOG
+ *
+ * Man pages:
+ * voigt_fwhm(3)
+ */
+
+/*
+
+ This file is patched by Hans Hagen for usage in LuaMetaTeX where we don't want to exit on an
+ error so we intercept it.
+
+*/
+
+#include "cerf.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+double dvoigt(double x, double sigma, double gamma, double v0)
+{
+ return voigt(x, sigma, gamma)/v0 - .5;
+}
+
+double voigt_hwhm(double sigma, double gamma, int *error)
+{
+ *error = 0;
+ if (sigma == 0 && gamma == 0) {
+ return 0;
+ } else if (isnan(sigma) || isnan(gamma)) {
+ *error = 1;
+ return 0; // return NAN;
+ } else {
+ // start from an excellent approximation [Olivero & Longbothum, J Quant Spec Rad Transf 1977]:
+ const double eps = 1e-14;
+ const double hwhm0 = .5*(1.06868*gamma+sqrt(0.86743*gamma*gamma+4*2*log(2)*sigma*sigma));
+ const double del = eps*hwhm0;
+ double ret = hwhm0;
+ const double v0 = voigt(0, sigma, gamma);
+ for (int i=0; i<300; ++i) {
+ double val = dvoigt(ret, sigma, gamma, v0);
+ if (fabs(val) < 1e-15) {
+ return ret;
+ } else {
+ double step = -del/(dvoigt(ret+del, sigma, gamma, v0)/val-1);
+ double nxt = ret + step;
+ if (nxt < ret/3) {
+ *error = 2; // fprintf(stderr, "voigt_fwhm terminated because of huge deviation from 1st approx\n");
+ nxt = ret/3;
+ } else if (nxt > 2*ret) {
+ *error = 2; // fprintf(stderr, "voigt_fwhm terminated because of huge deviation from 1st approx\n");
+ nxt = 2*ret;
+ }
+ if (fabs(ret-nxt) < del) {
+ return nxt;
+ } else {
+ ret = nxt;
+ }
+ }
+ }
+ *error = 3; // fprintf(stderr, "voigt_fwhm failed: Newton's iteration did not converge with sigma = %f and gamma = %f\n", sigma, gamma); exit(-1);
+ return 0;
+ }
+}
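
Because this patched voigt_hwhm reports failure through the added int* out-parameter instead of printing and exiting, callers are expected to check it. A minimal caller sketch against the signature above (the expected value in the comment follows from the Gaussian limit, not from a library guarantee):

    #include <stdio.h>
    #include "cerf.h" /* the patched header declaring voigt_hwhm(double, double, int *) */

    int main(void)
    {
        int err = 0;
        /* Pure Gaussian (gamma = 0): the half width at half maximum
           should converge to sigma*sqrt(2*log(2)), about 1.17741. */
        double hwhm = voigt_hwhm(1.0, 0.0, &err);
        if (err != 0) {
            fprintf(stderr, "voigt_hwhm failed with error code %d\n", err);
            return 1;
        }
        printf("hwhm = %.12g\n", hwhm);
        return 0;
    }

Per the code above, error code 1 means NaN input, 2 a Newton step that deviated wildly from the first approximation, and 3 failure to converge within 300 iterations.
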
diff --git a/source/luametatex/source/libraries/mimalloc/CMakeLists.txt b/source/luametatex/source/libraries/mimalloc/CMakeLists.txt
new file mode 100644
index 000000000..8127e0965
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/CMakeLists.txt
@@ -0,0 +1,413 @@
+cmake_minimum_required(VERSION 3.0)
+project(libmimalloc C CXX)
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
+
+option(MI_SECURE "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF)
+option(MI_DEBUG_FULL "Use full internal heap invariant checking in DEBUG mode (expensive)" OFF)
+option(MI_PADDING "Enable padding to detect heap block overflow (used only in DEBUG mode)" ON)
+option(MI_OVERRIDE "Override the standard malloc interface (e.g. define entry points for malloc() etc)" ON)
+option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF)
+option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF)
+option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF)
+option(MI_SEE_ASM "Generate assembly files" OFF)
+option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON)
+option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON)
+option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF)
+option(MI_BUILD_SHARED "Build shared library" ON)
+option(MI_BUILD_STATIC "Build static library" ON)
+option(MI_BUILD_OBJECT "Build object library" ON)
+option(MI_BUILD_TESTS "Build test executables" ON)
+option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF)
+option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF)
+option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF)
+
+# deprecated options
+option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
+option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version (deprecated)" OFF)
+option(MI_USE_LIBATOMIC "Explicitly link with -latomic (on older systems) (deprecated and detected automatically)" OFF)
+
+include(GNUInstallDirs)
+include("cmake/mimalloc-config-version.cmake")
+
+set(mi_sources
+ src/stats.c
+ src/random.c
+ src/os.c
+ src/bitmap.c
+ src/arena.c
+ src/segment-cache.c
+ src/segment.c
+ src/page.c
+ src/alloc.c
+ src/alloc-aligned.c
+ src/alloc-posix.c
+ src/heap.c
+ src/options.c
+ src/init.c)
+
+
+# -----------------------------------------------------------------------------
+# Convenience: set default build type depending on the build directory
+# -----------------------------------------------------------------------------
+
+message(STATUS "")
+if (NOT CMAKE_BUILD_TYPE)
+ if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$" OR MI_DEBUG_FULL)
+ message(STATUS "No build type selected, default to: Debug")
+ set(CMAKE_BUILD_TYPE "Debug")
+ else()
+ message(STATUS "No build type selected, default to: Release")
+ set(CMAKE_BUILD_TYPE "Release")
+ endif()
+endif()
+
+if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$")
+ message(STATUS "Default to secure build")
+ set(MI_SECURE "ON")
+endif()
+
+
+# -----------------------------------------------------------------------------
+# Process options
+# -----------------------------------------------------------------------------
+
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel")
+ set(MI_USE_CXX "ON")
+endif()
+
+if(MI_OVERRIDE)
+ message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
+ if(APPLE)
+ if(MI_OSX_ZONE)
+ # use zone's on macOS
+ message(STATUS " Use malloc zone to override malloc (MI_OSX_ZONE=ON)")
+ list(APPEND mi_sources src/alloc-override-osx.c)
+ list(APPEND mi_defines MI_OSX_ZONE=1)
+ if (NOT MI_OSX_INTERPOSE)
+ message(STATUS " WARNING: zone overriding usually also needs interpose (use -DMI_OSX_INTERPOSE=ON)")
+ endif()
+ endif()
+ if(MI_OSX_INTERPOSE)
+ # use interpose on macOS
+ message(STATUS " Use interpose to override malloc (MI_OSX_INTERPOSE=ON)")
+ list(APPEND mi_defines MI_OSX_INTERPOSE=1)
+ if (NOT MI_OSX_ZONE)
+ message(STATUS " WARNING: interpose usually also needs zone overriding (use -DMI_OSX_INTERPOSE=ON)")
+ endif()
+ endif()
+ if(MI_USE_CXX AND MI_OSX_INTERPOSE)
+ message(STATUS " WARNING: if dynamically overriding malloc/free, it is more reliable to build mimalloc as C code (use -DMI_USE_CXX=OFF)")
+ endif()
+ endif()
+endif()
+
+if(MI_SECURE)
+ message(STATUS "Set full secure build (MI_SECURE=ON)")
+ list(APPEND mi_defines MI_SECURE=4)
+endif()
+
+if(MI_SEE_ASM)
+ message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)")
+ list(APPEND mi_cflags -save-temps)
+endif()
+
+if(MI_CHECK_FULL)
+ message(STATUS "The MI_CHECK_FULL option is deprecated, use MI_DEBUG_FULL instead")
+ set(MI_DEBUG_FULL "ON")
+endif()
+
+if (MI_SKIP_COLLECT_ON_EXIT)
+ message(STATUS "Skip collecting memory on program exit (MI_SKIP_COLLECT_ON_EXIT=ON)")
+ list(APPEND mi_defines MI_SKIP_COLLECT_ON_EXIT=1)
+endif()
+
+if(MI_DEBUG_FULL)
+ message(STATUS "Set debug level to full internal invariant checking (MI_DEBUG_FULL=ON)")
+ list(APPEND mi_defines MI_DEBUG=3) # full invariant checking
+endif()
+
+if(NOT MI_PADDING)
+ message(STATUS "Disable padding of heap blocks in debug mode (MI_PADDING=OFF)")
+ list(APPEND mi_defines MI_PADDING=0)
+endif()
+
+if(MI_XMALLOC)
+ message(STATUS "Enable abort() calls on memory allocation failure (MI_XMALLOC=ON)")
+ list(APPEND mi_defines MI_XMALLOC=1)
+endif()
+
+if(MI_SHOW_ERRORS)
+ message(STATUS "Enable printing of error and warning messages by default (MI_SHOW_ERRORS=ON)")
+ list(APPEND mi_defines MI_SHOW_ERRORS=1)
+endif()
+
+if(MI_DEBUG_TSAN)
+ if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+ message(STATUS "Build with thread sanitizer (MI_DEBUG_TSAN=ON)")
+ list(APPEND mi_defines MI_TSAN=1)
+ list(APPEND mi_cflags -fsanitize=thread -g -O1)
+ list(APPEND CMAKE_EXE_LINKER_FLAGS -fsanitize=thread)
+ else()
+ message(WARNING "Can only use thread sanitizer with clang (MI_DEBUG_TSAN=ON but ignored)")
+ endif()
+endif()
+
+if(MI_DEBUG_UBSAN)
+ if(CMAKE_BUILD_TYPE MATCHES "Debug")
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ message(STATUS "Build with undefined-behavior sanitizer (MI_DEBUG_UBSAN=ON)")
+ list(APPEND mi_cflags -fsanitize=undefined -g -fno-sanitize-recover=undefined)
+ list(APPEND CMAKE_EXE_LINKER_FLAGS -fsanitize=undefined)
+ if (NOT MI_USE_CXX)
+ message(STATUS "(switch to use C++ due to MI_DEBUG_UBSAN)")
+ set(MI_USE_CXX "ON")
+ endif()
+ else()
+ message(WARNING "Can only use undefined-behavior sanitizer with clang++ (MI_DEBUG_UBSAN=ON but ignored)")
+ endif()
+ else()
+ message(WARNING "Can only use thread sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})")
+ endif()
+endif()
+
+if(MI_USE_CXX)
+ message(STATUS "Use the C++ compiler to compile (MI_USE_CXX=ON)")
+ set_source_files_properties(${mi_sources} PROPERTIES LANGUAGE CXX )
+ set_source_files_properties(src/static.c test/test-api.c test/test-api-fill.c test/test-stress.c PROPERTIES LANGUAGE CXX )
+ if(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang")
+ list(APPEND mi_cflags -Wno-deprecated)
+ endif()
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+ list(APPEND mi_cflags -Kc++)
+ endif()
+endif()
+
+# Compiler flags
+if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
+ list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden)
+ if(NOT MI_USE_CXX)
+ list(APPEND mi_cflags -Wstrict-prototypes)
+ endif()
+ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang")
+ list(APPEND mi_cflags -Wpedantic -Wno-static-in-inline)
+ endif()
+endif()
+
+if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+ list(APPEND mi_cflags -Wall -fvisibility=hidden)
+endif()
+
+if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku")
+ if(MI_LOCAL_DYNAMIC_TLS)
+ list(APPEND mi_cflags -ftls-model=local-dynamic)
+ else()
+ list(APPEND mi_cflags -ftls-model=initial-exec)
+ endif()
+ if(MI_OVERRIDE)
+ list(APPEND mi_cflags -fno-builtin-malloc)
+ endif()
+endif()
+
+if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914)
+ list(APPEND mi_cflags /Zc:__cplusplus)
+endif()
+
+# extra needed libraries
+if(WIN32)
+ list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt)
+else()
+ find_library(MI_LIBPTHREAD pthread)
+ if (MI_LIBPTHREAD)
+ list(APPEND mi_libraries ${MI_LIBPTHREAD})
+ endif()
+ find_library(MI_LIBRT rt)
+ if(MI_LIBRT)
+ list(APPEND mi_libraries ${MI_LIBRT})
+ endif()
+ find_library(MI_LIBATOMIC atomic)
+ if (MI_LIBATOMIC OR MI_USE_LIBATOMIC)
+ list(APPEND mi_libraries atomic)
+ endif()
+endif()
+
+# -----------------------------------------------------------------------------
+# Install and output names
+# -----------------------------------------------------------------------------
+
+# dynamic/shared library and symlinks always go to /usr/local/lib equivalent
+set(mi_install_libdir "${CMAKE_INSTALL_LIBDIR}")
+
+# static libraries and object files, includes, and cmake config files
+# are either installed at top level, or use versioned directories for side-by-side installation (default)
+if (MI_INSTALL_TOPLEVEL)
+ set(mi_install_objdir "${CMAKE_INSTALL_LIBDIR}")
+ set(mi_install_incdir "${CMAKE_INSTALL_INCLUDEDIR}")
+ set(mi_install_cmakedir "${CMAKE_INSTALL_LIBDIR}/cmake/mimalloc")
+else()
+ set(mi_install_objdir "${CMAKE_INSTALL_LIBDIR}/mimalloc-${mi_version}") # for static library and object files
+ set(mi_install_incdir "${CMAKE_INSTALL_INCLUDEDIR}/mimalloc-${mi_version}") # for includes
+ set(mi_install_cmakedir "${CMAKE_INSTALL_LIBDIR}/cmake/mimalloc-${mi_version}") # for cmake package info
+endif()
+
+if(MI_SECURE)
+ set(mi_basename "mimalloc-secure")
+else()
+ set(mi_basename "mimalloc")
+endif()
+
+string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC)
+if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel|none)$"))
+ set(mi_basename "${mi_basename}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version
+endif()
+if(MI_BUILD_SHARED)
+ list(APPEND mi_build_targets "shared")
+endif()
+if(MI_BUILD_STATIC)
+ list(APPEND mi_build_targets "static")
+endif()
+if(MI_BUILD_OBJECT)
+ list(APPEND mi_build_targets "object")
+endif()
+if(MI_BUILD_TESTS)
+ list(APPEND mi_build_targets "tests")
+endif()
+
+message(STATUS "")
+message(STATUS "Library base name: ${mi_basename}")
+message(STATUS "Version : ${mi_version}")
+message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}")
+if(MI_USE_CXX)
+ message(STATUS "C++ Compiler : ${CMAKE_CXX_COMPILER}")
+else()
+ message(STATUS "C Compiler : ${CMAKE_C_COMPILER}")
+endif()
+message(STATUS "Compiler flags : ${mi_cflags}")
+message(STATUS "Compiler defines : ${mi_defines}")
+message(STATUS "Link libraries : ${mi_libraries}")
+message(STATUS "Build targets : ${mi_build_targets}")
+message(STATUS "")
+
+# -----------------------------------------------------------------------------
+# Main targets
+# -----------------------------------------------------------------------------
+
+# shared library
+if(MI_BUILD_SHARED)
+ add_library(mimalloc SHARED ${mi_sources})
+ set_target_properties(mimalloc PROPERTIES VERSION ${mi_version} SOVERSION ${mi_version_major} OUTPUT_NAME ${mi_basename} )
+ target_compile_definitions(mimalloc PRIVATE ${mi_defines} MI_SHARED_LIB MI_SHARED_LIB_EXPORT)
+ target_compile_options(mimalloc PRIVATE ${mi_cflags})
+ target_link_libraries(mimalloc PUBLIC ${mi_libraries})
+ target_include_directories(mimalloc PUBLIC
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+ $<INSTALL_INTERFACE:${mi_install_incdir}>
+ )
+ if(WIN32)
+ # On Windows, copy the mimalloc redirection DLL too.
+ if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+ set(MIMALLOC_REDIRECT_SUFFIX "32")
+ else()
+ set(MIMALLOC_REDIRECT_SUFFIX "")
+ endif()
+
+ target_link_libraries(mimalloc PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.lib)
+ add_custom_command(TARGET mimalloc POST_BUILD
+ COMMAND "${CMAKE_COMMAND}" -E copy "${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" $<TARGET_FILE_DIR:mimalloc>
+ COMMENT "Copy mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll to output directory")
+ install(FILES "$<TARGET_FILE_DIR:mimalloc>/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" DESTINATION ${mi_install_libdir})
+ endif()
+
+ install(TARGETS mimalloc EXPORT mimalloc DESTINATION ${mi_install_libdir} LIBRARY)
+ install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir})
+endif()
+
+# static library
+if (MI_BUILD_STATIC)
+ add_library(mimalloc-static STATIC ${mi_sources})
+ set_property(TARGET mimalloc-static PROPERTY POSITION_INDEPENDENT_CODE ON)
+ target_compile_definitions(mimalloc-static PRIVATE ${mi_defines} MI_STATIC_LIB)
+ target_compile_options(mimalloc-static PRIVATE ${mi_cflags})
+ target_link_libraries(mimalloc-static PUBLIC ${mi_libraries})
+ target_include_directories(mimalloc-static PUBLIC
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+ $<INSTALL_INTERFACE:${mi_install_incdir}>
+ )
+ if(WIN32)
+ # When building both static and shared libraries on Windows, a static library should use a
+ # different output name to avoid the conflict with the import library of a shared one.
+ string(REPLACE "mimalloc" "mimalloc-static" mi_output_name ${mi_basename})
+ set_target_properties(mimalloc-static PROPERTIES OUTPUT_NAME ${mi_output_name})
+ else()
+ set_target_properties(mimalloc-static PROPERTIES OUTPUT_NAME ${mi_basename})
+ endif()
+
+ install(TARGETS mimalloc-static EXPORT mimalloc DESTINATION ${mi_install_objdir} LIBRARY)
+ install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir})
+endif()
+
+# install include files
+install(FILES include/mimalloc.h DESTINATION ${mi_install_incdir})
+install(FILES include/mimalloc-override.h DESTINATION ${mi_install_incdir})
+install(FILES include/mimalloc-new-delete.h DESTINATION ${mi_install_incdir})
+install(FILES cmake/mimalloc-config.cmake DESTINATION ${mi_install_cmakedir})
+install(FILES cmake/mimalloc-config-version.cmake DESTINATION ${mi_install_cmakedir})
+
+
+# single object file for more predictable static overriding
+if (MI_BUILD_OBJECT)
+ add_library(mimalloc-obj OBJECT src/static.c)
+ set_property(TARGET mimalloc-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
+ target_compile_definitions(mimalloc-obj PRIVATE ${mi_defines})
+ target_compile_options(mimalloc-obj PRIVATE ${mi_cflags})
+ target_include_directories(mimalloc-obj PUBLIC
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+ $<INSTALL_INTERFACE:${mi_install_incdir}>
+ )
+
+ # the following seems to lead to cmake warnings/errors on some systems, disable for now :-(
+ # install(TARGETS mimalloc-obj EXPORT mimalloc DESTINATION ${mi_install_objdir})
+
+ # the FILES expression can also be: $<TARGET_OBJECTS:mimalloc-obj>
+ # but that fails on cmake versions below 3.10, so we leave it as is for now
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mimalloc-obj.dir/src/static.c${CMAKE_C_OUTPUT_EXTENSION}
+ DESTINATION ${mi_install_objdir}
+ RENAME ${mi_basename}${CMAKE_C_OUTPUT_EXTENSION} )
+endif()
+
+# -----------------------------------------------------------------------------
+# API surface testing
+# -----------------------------------------------------------------------------
+
+if (MI_BUILD_TESTS)
+ enable_testing()
+
+ foreach(TEST_NAME api api-fill stress)
+ add_executable(mimalloc-test-${TEST_NAME} test/test-${TEST_NAME}.c)
+ target_compile_definitions(mimalloc-test-${TEST_NAME} PRIVATE ${mi_defines})
+ target_compile_options(mimalloc-test-${TEST_NAME} PRIVATE ${mi_cflags})
+ target_include_directories(mimalloc-test-${TEST_NAME} PRIVATE include)
+ target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc ${mi_libraries})
+
+ add_test(NAME test-${TEST_NAME} COMMAND mimalloc-test-${TEST_NAME})
+ endforeach()
+endif()
+
+# -----------------------------------------------------------------------------
+# Set override properties
+# -----------------------------------------------------------------------------
+if (MI_OVERRIDE)
+ if (MI_BUILD_SHARED)
+ target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE)
+ endif()
+ if(NOT WIN32)
+ # It is only possible to override malloc on Windows when building as a DLL.
+ if (MI_BUILD_STATIC)
+ target_compile_definitions(mimalloc-static PRIVATE MI_MALLOC_OVERRIDE)
+ endif()
+ if (MI_BUILD_OBJECT)
+ target_compile_definitions(mimalloc-obj PRIVATE MI_MALLOC_OVERRIDE)
+ endif()
+ endif()
+endif()
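
For orientation, what this build produces is a library exposing the malloc-like mi_ API declared in mimalloc.h (added later in this patch). A minimal sketch of a program linked against it, assuming an installed mimalloc with headers and library on the default search paths:

    #include <stdio.h>
    #include <mimalloc.h>

    int main(void)
    {
        /* mi_malloc/mi_free mirror malloc/free; mi_version() returns the
           version encoded by the cmake files below (e.g. 206 for 2.0.6). */
        void *p = mi_malloc(1024);
        if (p == NULL)
            return 1;
        printf("mimalloc version %d\n", mi_version());
        mi_free(p);
        return 0;
    }

Built, for example, with cc test.c -lmimalloc when MI_BUILD_SHARED installed the library into the default linker path.
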
diff --git a/source/luametatex/source/libraries/mimalloc/LICENSE b/source/luametatex/source/libraries/mimalloc/LICENSE
new file mode 100644
index 000000000..670b668a0
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/source/luametatex/source/libraries/mimalloc/cmake/mimalloc-config-version.cmake b/source/luametatex/source/libraries/mimalloc/cmake/mimalloc-config-version.cmake
new file mode 100644
index 000000000..8063afe6b
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/cmake/mimalloc-config-version.cmake
@@ -0,0 +1,19 @@
+set(mi_version_major 2)
+set(mi_version_minor 0)
+set(mi_version_patch 6)
+set(mi_version ${mi_version_major}.${mi_version_minor})
+
+set(PACKAGE_VERSION ${mi_version})
+if(PACKAGE_FIND_VERSION_MAJOR)
+ if("${PACKAGE_FIND_VERSION_MAJOR}" EQUAL "${mi_version_major}")
+ if ("${PACKAGE_FIND_VERSION_MINOR}" EQUAL "${mi_version_minor}")
+ set(PACKAGE_VERSION_EXACT TRUE)
+ elseif("${PACKAGE_FIND_VERSION_MINOR}" LESS "${mi_version_minor}")
+ set(PACKAGE_VERSION_COMPATIBLE TRUE)
+ else()
+ set(PACKAGE_VERSION_UNSUITABLE TRUE)
+ endif()
+ else()
+ set(PACKAGE_VERSION_UNSUITABLE TRUE)
+ endif()
+endif()
diff --git a/source/luametatex/source/libraries/mimalloc/cmake/mimalloc-config.cmake b/source/luametatex/source/libraries/mimalloc/cmake/mimalloc-config.cmake
new file mode 100644
index 000000000..8a28e37e7
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/cmake/mimalloc-config.cmake
@@ -0,0 +1,14 @@
+include(${CMAKE_CURRENT_LIST_DIR}/mimalloc.cmake)
+get_filename_component(MIMALLOC_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}" PATH) # one up from the cmake dir, e.g. /usr/local/lib/cmake/mimalloc-2.0
+get_filename_component(MIMALLOC_VERSION_DIR "${CMAKE_CURRENT_LIST_DIR}" NAME)
+string(REPLACE "/lib/cmake" "/lib" MIMALLOC_LIBRARY_DIR "${MIMALLOC_CMAKE_DIR}")
+if("${MIMALLOC_VERSION_DIR}" EQUAL "mimalloc")
+ # top level install
+ string(REPLACE "/lib/cmake" "/include" MIMALLOC_INCLUDE_DIR "${MIMALLOC_CMAKE_DIR}")
+ set(MIMALLOC_OBJECT_DIR "${MIMALLOC_LIBRARY_DIR}")
+else()
+ # versioned
+ string(REPLACE "/lib/cmake/" "/include/" MIMALLOC_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}")
+ string(REPLACE "/lib/cmake/" "/lib/" MIMALLOC_OBJECT_DIR "${CMAKE_CURRENT_LIST_DIR}")
+endif()
+set(MIMALLOC_TARGET_DIR "${MIMALLOC_LIBRARY_DIR}") # legacy
diff --git a/source/luametatex/source/libraries/mimalloc/include/mimalloc-atomic.h b/source/luametatex/source/libraries/mimalloc/include/mimalloc-atomic.h
new file mode 100644
index 000000000..7ad5da585
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/include/mimalloc-atomic.h
@@ -0,0 +1,338 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_ATOMIC_H
+#define MIMALLOC_ATOMIC_H
+
+// --------------------------------------------------------------------------------------------
+// Atomics
+// We need to be portable between C, C++, and MSVC.
+// We base the primitives on the C/C++ atomics and create a minimal wrapper for MSVC in C compilation mode.
+// This is why we try to use only `uintptr_t` and `<type>*` as atomic types.
+// To gain better insight into the range of used atomics, we use explicitly named memory order operations
+// instead of passing the memory order as a parameter.
+// -----------------------------------------------------------------------------------------------
+
+#if defined(__cplusplus)
+// Use C++ atomics
+#include <atomic>
+#define _Atomic(tp) std::atomic<tp>
+#define mi_atomic(name) std::atomic_##name
+#define mi_memory_order(name) std::memory_order_##name
+#if !defined(ATOMIC_VAR_INIT) || (__cplusplus >= 202002L) // c++20, see issue #571
+ #define MI_ATOMIC_VAR_INIT(x) x
+#else
+ #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x)
+#endif
+#elif defined(_MSC_VER)
+// Use MSVC C wrapper for C11 atomics
+#define _Atomic(tp) tp
+#define MI_ATOMIC_VAR_INIT(x) x
+#define mi_atomic(name) mi_atomic_##name
+#define mi_memory_order(name) mi_memory_order_##name
+#else
+// Use C11 atomics
+#include <stdatomic.h>
+#define mi_atomic(name) atomic_##name
+#define mi_memory_order(name) memory_order_##name
+#define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x)
+#endif
+
+// Various defines for all used memory orders in mimalloc
+#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail) \
+ mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail)
+
+#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail) \
+ mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail)
+
+#define mi_atomic_load_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_load_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_store_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
+#define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+#define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
+#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+
+#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
+
+#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1)
+#define mi_atomic_decrement_relaxed(p) mi_atomic_sub_relaxed(p,(uintptr_t)1)
+#define mi_atomic_increment_acq_rel(p) mi_atomic_add_acq_rel(p,(uintptr_t)1)
+#define mi_atomic_decrement_acq_rel(p) mi_atomic_sub_acq_rel(p,(uintptr_t)1)
+
+static inline void mi_atomic_yield(void);
+static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add);
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
+
+
+#if defined(__cplusplus) || !defined(_MSC_VER)
+
+// In C++/C11 atomics we have polymorphic atomics so we can use the typed `ptr` variants (where `tp` is the type of the atomic value)
+// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well
+#define mi_atomic_load_ptr_acquire(tp,p) mi_atomic_load_acquire(p)
+#define mi_atomic_load_ptr_relaxed(tp,p) mi_atomic_load_relaxed(p)
+
+// In C++ we need to add casts to help resolve templates if NULL is passed
+#if defined(__cplusplus)
+#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release(p,(tp*)x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed(p,(tp*)x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,(tp*)des)
+#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,(tp*)x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,(tp*)x)
+#else
+#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release(p,x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed(p,x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,des)
+#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,x)
+#endif
+
+// These are used by the statistics
+static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
+ return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
+}
+static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
+ int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
+ while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
+}
+
+// Used by timers
+#define mi_atomic_loadi64_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+
+
+
+#elif defined(_MSC_VER)
+
+// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics.
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <intrin.h>
+#ifdef _WIN64
+typedef LONG64 msc_intptr_t;
+#define MI_64(f) f##64
+#else
+typedef LONG msc_intptr_t;
+#define MI_64(f) f
+#endif
+
+typedef enum mi_memory_order_e {
+ mi_memory_order_relaxed,
+ mi_memory_order_consume,
+ mi_memory_order_acquire,
+ mi_memory_order_release,
+ mi_memory_order_acq_rel,
+ mi_memory_order_seq_cst
+} mi_memory_order;
+
+static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)*p, uintptr_t add, mi_memory_order mo) {
+ (void)(mo);
+ return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
+}
+static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) {
+ (void)(mo);
+ return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub));
+}
+static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+ (void)(mo);
+ return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
+}
+static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+ (void)(mo);
+ return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
+}
+static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
+ (void)(mo1); (void)(mo2);
+ uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected));
+ if (read == *expected) {
+ return true;
+ }
+ else {
+ *expected = read;
+ return false;
+ }
+}
+static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
+ return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2);
+}
+static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintptr_t exchange, mi_memory_order mo) {
+ (void)(mo);
+ return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
+}
+static inline void mi_atomic_thread_fence(mi_memory_order mo) {
+ (void)(mo);
+ _Atomic(uintptr_t) x = 0;
+ mi_atomic_exchange_explicit(&x, 1, mo);
+}
+static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) {
+ (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+ return *p;
+#else
+ uintptr_t x = *p;
+ if (mo > mi_memory_order_relaxed) {
+ while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
+ }
+ return x;
+#endif
+}
+static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+ (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+ *p = x;
+#else
+ mi_atomic_exchange_explicit(p, x, mo);
+#endif
+}
+static inline int64_t mi_atomic_loadi64_explicit(_Atomic(int64_t)*p, mi_memory_order mo) {
+ (void)(mo);
+#if defined(_M_X64)
+ return *p;
+#else
+ int64_t old = *p;
+ int64_t x = old;
+ while ((old = InterlockedCompareExchange64(p, x, old)) != x) {
+ x = old;
+ }
+ return x;
+#endif
+}
+static inline void mi_atomic_storei64_explicit(_Atomic(int64_t)*p, int64_t x, mi_memory_order mo) {
+ (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+ *p = x;
+#else
+ InterlockedExchange64(p, x);
+#endif
+}
+
+// These are used by the statistics
+static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int64_t add) {
+#ifdef _WIN64
+ return (int64_t)mi_atomic_addi((int64_t*)p, add);
+#else
+ int64_t current;
+ int64_t sum;
+ do {
+ current = *p;
+ sum = current + add;
+ } while (_InterlockedCompareExchange64(p, sum, current) != current);
+ return current;
+#endif
+}
+static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) {
+ int64_t current;
+ do {
+ current = *p;
+ } while (current < x && _InterlockedCompareExchange64(p, x, current) != current);
+}
+
+// The pointer macros cast to `uintptr_t`.
+#define mi_atomic_load_ptr_acquire(tp,p) (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p))
+#define mi_atomic_load_ptr_relaxed(tp,p) (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p))
+#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
+#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
+
+#define mi_atomic_loadi64_acquire(p) mi_atomic(loadi64_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p) mi_atomic(loadi64_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(relaxed))
+
+
+#endif
+
+
+// Atomically add a signed value; returns the previous value.
+static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) {
+ return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add);
+}
+
+// Atomically subtract a signed value; returns the previous value.
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
+ return (intptr_t)mi_atomic_addi(p, -sub);
+}
+
+// Yield
+#if defined(__cplusplus)
+#include <thread>
+static inline void mi_atomic_yield(void) {
+ std::this_thread::yield();
+}
+#elif defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline void mi_atomic_yield(void) {
+ YieldProcessor();
+}
+#elif defined(__SSE2__)
+#include <emmintrin.h>
+static inline void mi_atomic_yield(void) {
+ _mm_pause();
+}
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+ (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \
+ defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))
+#if defined(__x86_64__) || defined(__i386__)
+static inline void mi_atomic_yield(void) {
+ __asm__ volatile ("pause" ::: "memory");
+}
+#elif defined(__aarch64__)
+static inline void mi_atomic_yield(void) {
+ __asm__ volatile("wfe");
+}
+#elif (defined(__arm__) && __ARM_ARCH >= 7)
+static inline void mi_atomic_yield(void) {
+ __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+static inline void mi_atomic_yield(void) {
+ __asm__ __volatile__ ("or 27,27,27" ::: "memory");
+}
+#elif defined(__armel__) || defined(__ARMEL__)
+static inline void mi_atomic_yield(void) {
+ __asm__ volatile ("nop" ::: "memory");
+}
+#endif
+#elif defined(__sun)
+// Fallback for other archs
+#include <synch.h>
+static inline void mi_atomic_yield(void) {
+ smt_pause();
+}
+#elif defined(__wasi__)
+#include <sched.h>
+static inline void mi_atomic_yield(void) {
+ sched_yield();
+}
+#else
+#include <unistd.h>
+static inline void mi_atomic_yield(void) {
+ sleep(0);
+}
+#endif
+
+
+#endif // MIMALLOC_ATOMIC_H
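
A sketch of how these wrappers compose in practice, mirroring the CAS-loop style of mi_atomic_maxi64_relaxed above but for a uintptr_t high-water mark (illustration only; it assumes this header is on the include path):

    #include <stdint.h>
    #include "mimalloc-atomic.h"

    static _Atomic(uintptr_t) peak = MI_ATOMIC_VAR_INIT(0);

    /* Monotonically raise `peak` to at least `x`. A weak CAS may fail
       spuriously; on failure `current` is refreshed and the loop retries,
       stopping early once another thread has stored a larger value. */
    static void update_peak(uintptr_t x)
    {
        uintptr_t current = mi_atomic_load_relaxed(&peak);
        while (current < x && !mi_atomic_cas_weak_release(&peak, &current, x)) {
            /* retry with the refreshed `current` */
        }
    }
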
diff --git a/source/luametatex/source/libraries/mimalloc/include/mimalloc-internal.h b/source/luametatex/source/libraries/mimalloc/include/mimalloc-internal.h
new file mode 100644
index 000000000..d691eca58
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/include/mimalloc-internal.h
@@ -0,0 +1,1049 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_INTERNAL_H
+#define MIMALLOC_INTERNAL_H
+
+#include "mimalloc-types.h"
+
+#if (MI_DEBUG>0)
+#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__)
+#else
+#define mi_trace_message(...)
+#endif
+
+#define MI_CACHE_LINE 64
+#if defined(_MSC_VER)
+#pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths)
+#pragma warning(disable:26812) // unscoped enum warning
+#define mi_decl_noinline __declspec(noinline)
+#define mi_decl_thread __declspec(thread)
+#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE))
+#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
+#define mi_decl_noinline __attribute__((noinline))
+#define mi_decl_thread __thread
+#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE)))
+#else
+#define mi_decl_noinline
+#define mi_decl_thread __thread // hope for the best :-)
+#define mi_decl_cache_align
+#endif
+
+#if defined(__EMSCRIPTEN__) && !defined(__wasi__)
+#define __wasi__
+#endif
+
+#if defined(__cplusplus)
+#define mi_decl_externc extern "C"
+#else
+#define mi_decl_externc
+#endif
+
+#if !defined(_WIN32) && !defined(__wasi__)
+#define MI_USE_PTHREADS
+#include <pthread.h>
+#endif
+
+// "options.c"
+void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
+void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
+void _mi_warning_message(const char* fmt, ...);
+void _mi_verbose_message(const char* fmt, ...);
+void _mi_trace_message(const char* fmt, ...);
+void _mi_options_init(void);
+void _mi_error_message(int err, const char* fmt, ...);
+
+// random.c
+void _mi_random_init(mi_random_ctx_t* ctx);
+void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
+uintptr_t _mi_random_next(mi_random_ctx_t* ctx);
+uintptr_t _mi_heap_random_next(mi_heap_t* heap);
+uintptr_t _mi_os_random_weak(uintptr_t extra_seed);
+static inline uintptr_t _mi_random_shuffle(uintptr_t x);
+
+// init.c
+extern mi_decl_cache_align mi_stats_t _mi_stats_main;
+extern mi_decl_cache_align const mi_page_t _mi_page_empty;
+bool _mi_is_main_thread(void);
+size_t _mi_current_thread_count(void);
+bool _mi_preloading(void); // true while the C runtime is not ready
+
+// os.c
+size_t _mi_os_page_size(void);
+void _mi_os_init(void); // called from process init
+void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data
+void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data
+
+bool _mi_os_protect(void* addr, size_t size);
+bool _mi_os_unprotect(void* addr, size_t size);
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats);
+bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
+bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
+// bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+size_t _mi_os_good_alloc_size(size_t size);
+bool _mi_os_has_overcommit(void);
+
+// arena.c
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
+void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
+void _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed, mi_os_tld_t* tld);
+
+// "segment-cache.c"
+void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
+bool _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld);
+void _mi_segment_cache_collect(bool force, mi_os_tld_t* tld);
+void _mi_segment_map_allocated_at(const mi_segment_t* segment);
+void _mi_segment_map_freed_at(const mi_segment_t* segment);
+
+// "segment.c"
+mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);
+void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld);
+void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
+bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld);
+void _mi_segment_thread_collect(mi_segments_tld_t* tld);
+void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
+
+uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page
+void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
+void _mi_abandoned_await_readers(void);
+void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld);
+
+
+
+// "page.c"
+void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc;
+
+void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks
+void _mi_page_unfull(mi_page_t* page);
+void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page
+void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread...
+void _mi_heap_delayed_free(mi_heap_t* heap);
+void _mi_heap_collect_retired(mi_heap_t* heap, bool force);
+
+void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
+size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append);
+void _mi_deferred_free(mi_heap_t* heap, bool force);
+
+void _mi_page_free_collect(mi_page_t* page,bool force);
+void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments
+
+size_t _mi_bin_size(uint8_t bin); // for stats
+uint8_t _mi_bin(size_t size); // for stats
+
+// "heap.c"
+void _mi_heap_destroy_pages(mi_heap_t* heap);
+void _mi_heap_collect_abandon(mi_heap_t* heap);
+void _mi_heap_set_default_direct(mi_heap_t* heap);
+
+// "stats.c"
+void _mi_stats_done(mi_stats_t* stats);
+
+mi_msecs_t _mi_clock_now(void);
+mi_msecs_t _mi_clock_end(mi_msecs_t start);
+mi_msecs_t _mi_clock_start(void);
+
+// "alloc.c"
+void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_malloc_generic`
+void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept;
+void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept;
+mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p);
+bool _mi_free_delayed_block(mi_block_t* block);
+void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size);
+
+#if MI_DEBUG>1
+bool _mi_page_is_valid(mi_page_t* page);
+#endif
+
+
+// ------------------------------------------------------
+// Branches
+// ------------------------------------------------------
+
+#if defined(__GNUC__) || defined(__clang__)
+#define mi_unlikely(x) __builtin_expect(!!(x),false)
+#define mi_likely(x) __builtin_expect(!!(x),true)
+#else
+#define mi_unlikely(x) (x)
+#define mi_likely(x) (x)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+
+/* -----------------------------------------------------------
+ Error codes passed to `_mi_fatal_error`
+ All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
+ For portability define undefined error codes using common Unix codes:
+ <https://www-numi.fnal.gov/offline_software/srt_public_context/WebDocs/Errors/unix_system_errors.html>
+----------------------------------------------------------- */
+#include <errno.h>
+#ifndef EAGAIN // double free
+#define EAGAIN (11)
+#endif
+#ifndef ENOMEM // out of memory
+#define ENOMEM (12)
+#endif
+#ifndef EFAULT // corrupted free-list or meta-data
+#define EFAULT (14)
+#endif
+#ifndef EINVAL // trying to free an invalid pointer
+#define EINVAL (22)
+#endif
+#ifndef EOVERFLOW // count*size overflow
+#define EOVERFLOW (75)
+#endif
+
+
+/* -----------------------------------------------------------
+ Inlined definitions
+----------------------------------------------------------- */
+#define MI_UNUSED(x) (void)(x)
+#if (MI_DEBUG>0)
+#define MI_UNUSED_RELEASE(x)
+#else
+#define MI_UNUSED_RELEASE(x) MI_UNUSED(x)
+#endif
+
+#define MI_INIT4(x) x(),x(),x(),x()
+#define MI_INIT8(x) MI_INIT4(x),MI_INIT4(x)
+#define MI_INIT16(x) MI_INIT8(x),MI_INIT8(x)
+#define MI_INIT32(x) MI_INIT16(x),MI_INIT16(x)
+#define MI_INIT64(x) MI_INIT32(x),MI_INIT32(x)
+#define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x)
+#define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x)
+
+
+// Is `x` a power of two? (0 is considered a power of two)
+static inline bool _mi_is_power_of_two(uintptr_t x) {
+ return ((x & (x - 1)) == 0);
+}
+
+// Align upwards
+static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
+ mi_assert_internal(alignment != 0);
+ uintptr_t mask = alignment - 1;
+ if ((alignment & mask) == 0) { // power of two?
+ return ((sz + mask) & ~mask);
+ }
+ else {
+ return (((sz + mask)/alignment)*alignment);
+ }
+}
+
+// Align downwards
+static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
+ mi_assert_internal(alignment != 0);
+ uintptr_t mask = alignment - 1;
+ if ((alignment & mask) == 0) { // power of two?
+ return (sz & ~mask);
+ }
+ else {
+ return ((sz / alignment) * alignment);
+ }
+}
+
+// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
+static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
+ mi_assert_internal(divider != 0);
+ return (divider == 0 ? size : ((size + divider - 1) / divider));
+}
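
Worked examples of the helpers above: the power-of-two test (alignment & mask) selects the cheap mask path, and only non-power-of-two alignments pay for a division (standalone sketch, values checked by hand):

    #include <assert.h>
    #include <stdint.h>

    /* Same arithmetic as _mi_align_up above, without the internal asserts. */
    static uintptr_t align_up(uintptr_t sz, size_t alignment)
    {
        uintptr_t mask = alignment - 1;
        if ((alignment & mask) == 0)                   /* power of two */
            return (sz + mask) & ~mask;
        return ((sz + mask) / alignment) * alignment;  /* general case */
    }

    int main(void)
    {
        assert(align_up(17, 16) == 32);  /* next multiple of 16 */
        assert(align_up(32, 16) == 32);  /* already aligned */
        assert(align_up(17, 24) == 24);  /* non-power-of-two path */
        assert((17 + 24 - 1) / 24 == 1); /* _mi_divide_up(17, 24) */
        return 0;
    }
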
+
+// Is memory zero initialized?
+static inline bool mi_mem_is_zero(void* p, size_t size) {
+ for (size_t i = 0; i < size; i++) {
+ if (((uint8_t*)p)[i] != 0) return false;
+ }
+ return true;
+}
+
+
+// Align a byte size to a size in _machine words_,
+// i.e. byte size == `wsize*sizeof(void*)`.
+static inline size_t _mi_wsize_from_size(size_t size) {
+ mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t));
+ return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
+}
+
+// Overflow detecting multiply
+#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#include <limits.h> // UINT_MAX, ULONG_MAX
+#if defined(_CLOCK_T) // for Illumos
+#undef _CLOCK_T
+#endif
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+ #if (SIZE_MAX == ULONG_MAX)
+ return __builtin_umull_overflow(count, size, (unsigned long *)total);
+ #elif (SIZE_MAX == UINT_MAX)
+ return __builtin_umul_overflow(count, size, (unsigned int *)total);
+ #else
+ return __builtin_umulll_overflow(count, size, (unsigned long long *)total);
+ #endif
+}
+#else /* __builtin_umul_overflow is unavailable */
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+ #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX)
+ *total = count * size;
+ return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW)
+ && size > 0 && (SIZE_MAX / size) < count);
+}
+#endif
+
+// Safe multiply `count*size` into `total`; return `true` on overflow.
+static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) {
+ if (count==1) { // quick check for the case where count is one (common for C++ allocators)
+ *total = size;
+ return false;
+ }
+ else if (mi_unlikely(mi_mul_overflow(count, size, total))) {
+ _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size);
+ *total = SIZE_MAX;
+ return true;
+ }
+ else return false;
+}
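
MI_MUL_NO_OVERFLOW is 2^(half the bits of size_t), i.e. sqrt(SIZE_MAX+1), so a product of two factors both below it cannot overflow, and the fallback only pays for a division when a factor is large. A standalone check of that fallback (illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* The portable fallback from above, outside the mimalloc namespace. */
    static int mul_overflow(size_t count, size_t size, size_t *total)
    {
        const size_t no_ovf = (size_t)1 << (4 * sizeof(size_t));
        *total = count * size; /* may wrap; the return value says whether it did */
        return (size >= no_ovf || count >= no_ovf)
            && size > 0 && (SIZE_MAX / size) < count;
    }

    int main(void)
    {
        size_t total;
        printf("%d\n", mul_overflow(SIZE_MAX / 2, 3, &total)); /* 1: wraps */
        printf("%d\n", mul_overflow(1000, 1000, &total));      /* 0: fits */
        return 0;
    }
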
+
+
+/* ----------------------------------------------------------------------------------------
+The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
+On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
+__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
+that the storage will always be available (allocated on the thread stacks).
+On some platforms though we cannot use that when overriding `malloc` since the underlying
+TLS implementation (or the loader) will itself call `malloc` on a first access and recurse.
+We try to circumvent this in an efficient way:
+- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
+ loader itself calls `malloc` even before the modules are initialized.
+- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
+- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323)
+------------------------------------------------------------------------------------------- */
+
+extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap
+extern bool _mi_process_is_initialized;
+mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap
+
+#if defined(MI_MALLOC_OVERRIDE)
+#if defined(__APPLE__) // macOS
+#define MI_TLS_SLOT 89 // seems unused?
+// #define MI_TLS_RECURSE_GUARD 1
+// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
+#elif defined(__OpenBSD__)
+// use end bytes of a name; goes wrong if anyone uses names > 23 characters (pthread specifies 16)
+// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
+#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24)
+// #elif defined(__DragonFly__)
+// #warning "mimalloc is not working correctly on DragonFly yet."
+// #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
+#elif defined(__ANDROID__)
+// See issue #381
+#define MI_TLS_PTHREAD
+#endif
+#endif
+
+#if defined(MI_TLS_SLOT)
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept; // forward declaration
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
+ pthread_t self = pthread_self();
+ #if defined(__DragonFly__)
+ if (self==NULL) {
+ mi_heap_t* pheap_main = _mi_heap_main_get();
+ return &pheap_main;
+ }
+ #endif
+ return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
+}
+#elif defined(MI_TLS_PTHREAD)
+extern pthread_key_t _mi_heap_default_key;
+#endif
+
+// Default heap to allocate from (if not using TLS- or pthread slots).
+// Do not use this directly; use it through `mi_heap_get_default()` (or the unchecked `mi_get_default_heap`).
+// This thread local variable is only used when none of MI_TLS_SLOT, MI_TLS_PTHREAD, or MI_TLS_PTHREAD_SLOT_OFS is defined.
+// However, on the Apple M1 we do use the address of this variable as the unique thread-id (issue #356).
+extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
+
+static inline mi_heap_t* mi_get_default_heap(void) {
+#if defined(MI_TLS_SLOT)
+ mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
+ if (mi_unlikely(heap == NULL)) {
+ #ifdef __GNUC__
+ __asm(""); // prevent conditional load of the address of _mi_heap_empty
+ #endif
+ heap = (mi_heap_t*)&_mi_heap_empty;
+ }
+ return heap;
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+ mi_heap_t* heap = *mi_tls_pthread_heap_slot();
+ return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD)
+ mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
+ return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#else
+ #if defined(MI_TLS_RECURSE_GUARD)
+ if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
+ #endif
+ return _mi_heap_default;
+#endif
+}
+
+static inline bool mi_heap_is_default(const mi_heap_t* heap) {
+ return (heap == mi_get_default_heap());
+}
+
+static inline bool mi_heap_is_backing(const mi_heap_t* heap) {
+ return (heap->tld->heap_backing == heap);
+}
+
+static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
+ mi_assert_internal(heap != NULL);
+ return (heap != &_mi_heap_empty);
+}
+
+static inline uintptr_t _mi_ptr_cookie(const void* p) {
+ extern mi_heap_t _mi_heap_main;
+ mi_assert_internal(_mi_heap_main.cookie != 0);
+ return ((uintptr_t)p ^ _mi_heap_main.cookie);
+}
+
+/* -----------------------------------------------------------
+ Pages
+----------------------------------------------------------- */
+
+static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) {
+ mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
+ const size_t idx = _mi_wsize_from_size(size);
+ mi_assert_internal(idx < MI_PAGES_DIRECT);
+ return heap->pages_free_direct[idx];
+}
+
+// Get the page belonging to a certain size class
+static inline mi_page_t* _mi_get_free_small_page(size_t size) {
+ return _mi_heap_get_free_small_page(mi_get_default_heap(), size);
+}
+
+// Segment that contains the pointer
+static inline mi_segment_t* _mi_ptr_segment(const void* p) {
+ // mi_assert_internal(p != NULL);
+ return (mi_segment_t*)((uintptr_t)p & ~MI_SEGMENT_MASK);
+}
+
+static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) {
+  mi_assert_internal(s->slice_offset == 0 && s->slice_count > 0);
+ return (mi_page_t*)(s);
+}
+
+static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {
+  mi_assert_internal(p->slice_offset == 0 && p->slice_count > 0);
+ return (mi_slice_t*)(p);
+}
+
+// Segment belonging to a page
+static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
+ mi_segment_t* segment = _mi_ptr_segment(page);
+ mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
+ return segment;
+}
+
+static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) {
+ mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset);
+ mi_assert_internal(start >= _mi_ptr_segment(slice)->slices);
+ mi_assert_internal(start->slice_offset == 0);
+ mi_assert_internal(start + start->slice_count > slice);
+ return start;
+}
+
+// Get the page containing the pointer
+static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
+ ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
+ mi_assert_internal(diff >= 0 && diff < (ptrdiff_t)MI_SEGMENT_SIZE);
+ size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT;
+ mi_assert_internal(idx < segment->slice_entries);
+ mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx];
+ mi_slice_t* slice = mi_slice_first(slice0); // adjust to the block that holds the page data
+ mi_assert_internal(slice->slice_offset == 0);
+ mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries);
+ return mi_slice_to_page(slice);
+}
+
+// Quick page start for initialized pages
+static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
+ return _mi_segment_page_start(segment, page, page_size);
+}
+
+// Get the page containing the pointer
+static inline mi_page_t* _mi_ptr_page(void* p) {
+ return _mi_segment_page_of(_mi_ptr_segment(p), p);
+}
+
+// Get the block size of a page (special case for huge objects)
+static inline size_t mi_page_block_size(const mi_page_t* page) {
+ const size_t bsize = page->xblock_size;
+ mi_assert_internal(bsize > 0);
+ if (mi_likely(bsize < MI_HUGE_BLOCK_SIZE)) {
+ return bsize;
+ }
+ else {
+ size_t psize;
+ _mi_segment_page_start(_mi_page_segment(page), page, &psize);
+ return psize;
+ }
+}
+
+// Get the usable block size of a page without fixed padding.
+// This may still include internal padding due to alignment and rounding up size classes.
+static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
+ return mi_page_block_size(page) - MI_PADDING_SIZE;
+}
+
+// size of a segment
+static inline size_t mi_segment_size(mi_segment_t* segment) {
+ return segment->segment_slices * MI_SEGMENT_SLICE_SIZE;
+}
+
+static inline uint8_t* mi_segment_end(mi_segment_t* segment) {
+ return (uint8_t*)segment + mi_segment_size(segment);
+}
+
+// Thread free access
+static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
+ return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3);
+}
+
+static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) {
+ return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3);
+}
+
+// Heap access
+static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
+ return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap));
+}
+
+static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
+ mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
+ mi_atomic_store_release(&page->xheap,(uintptr_t)heap);
+}
+
+// Thread free flag helpers
+static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
+ return (mi_block_t*)(tf & ~0x03);
+}
+static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) {
+ return (mi_delayed_t)(tf & 0x03);
+}
+static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) {
+ return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed);
+}
+static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) {
+ return mi_tf_make(mi_tf_block(tf),delayed);
+}
+static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) {
+ return mi_tf_make(block, mi_tf_delayed(tf));
+}
+
+// are all blocks in a page freed?
+// note: needs an up-to-date `used` count (as the `xthread_free` list may not be empty); see `_mi_page_collect_free`.
+static inline bool mi_page_all_free(const mi_page_t* page) {
+ mi_assert_internal(page != NULL);
+ return (page->used == 0);
+}
+
+// are there any available blocks?
+static inline bool mi_page_has_any_available(const mi_page_t* page) {
+ mi_assert_internal(page != NULL && page->reserved > 0);
+ return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
+}
+
+// are there immediately available blocks, i.e. blocks on the `free` list?
+static inline bool mi_page_immediate_available(const mi_page_t* page) {
+ mi_assert_internal(page != NULL);
+ return (page->free != NULL);
+}
+
+// is more than 7/8th of a page in use?
+static inline bool mi_page_mostly_used(const mi_page_t* page) {
+ if (page==NULL) return true;
+ uint16_t frac = page->reserved / 8U;
+ return (page->reserved - page->used <= frac);
+}
+
+static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) {
+ return &((mi_heap_t*)heap)->pages[_mi_bin(size)];
+}
+
+
+
+//-----------------------------------------------------------
+// Page flags
+//-----------------------------------------------------------
+static inline bool mi_page_is_in_full(const mi_page_t* page) {
+ return page->flags.x.in_full;
+}
+
+static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
+ page->flags.x.in_full = in_full;
+}
+
+static inline bool mi_page_has_aligned(const mi_page_t* page) {
+ return page->flags.x.has_aligned;
+}
+
+static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
+ page->flags.x.has_aligned = has_aligned;
+}
+
+
+/* -------------------------------------------------------------------
+Encoding/Decoding the free list next pointers
+
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
+with a secret key `k1`, as `p^k1`. This prevents overwriting with known
+values but might still be too weak: if the attacker can guess
+the pointer `p` this can reveal `k1` (since `p^k1^p == k1`).
+Moreover, if multiple blocks can be read as well, the attacker can
+xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
+about the pointers (and subsequently `k1`).
+
+Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
+Since these operations are not associative, the above approaches do not
+work well anymore, even if `p` can be guessed. For example,
+for the read case we can subtract two entries to discard the `+k1` term,
+but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
+We include the left-rotation since xor and addition are otherwise linear
+in the lowest bit. Finally, both keys are unique per page which reduces
+the re-use of keys by a large factor.
+
+We also pass a separate `null` value to be used as `NULL`, since otherwise
+`(k2<<<k1)+k1` (the encoding of a zero pointer) would appear (too) often as a sentinel value.
+------------------------------------------------------------------- */
+
+static inline bool mi_is_in_same_segment(const void* p, const void* q) {
+ return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
+}
+
+static inline bool mi_is_in_same_page(const void* p, const void* q) {
+ mi_segment_t* segment = _mi_ptr_segment(p);
+ if (_mi_ptr_segment(q) != segment) return false;
+ // assume q may be invalid // return (_mi_segment_page_of(segment, p) == _mi_segment_page_of(segment, q));
+ mi_page_t* page = _mi_segment_page_of(segment, p);
+ size_t psize;
+ uint8_t* start = _mi_segment_page_start(segment, page, &psize);
+ return (start <= (uint8_t*)q && (uint8_t*)q < start + psize);
+}
+
+static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) {
+ shift %= MI_INTPTR_BITS;
+ return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift))));
+}
+static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
+ shift %= MI_INTPTR_BITS;
+ return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift))));
+}
+
+static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
+ void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]);
+ return (mi_unlikely(p==null) ? NULL : p);
+}
+
+static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) {
+ uintptr_t x = (uintptr_t)(mi_unlikely(p==NULL) ? null : p);
+ return mi_rotl(x ^ keys[1], keys[0]) + keys[0];
+}
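+
+// Compiled-out round-trip sketch of the encoding above (illustrative only;
+// real keys come from `page->keys`, the values below are made up):
+#if 0
+static void mi_ptr_encode_roundtrip_example(void) {
+  const uintptr_t keys[2] = { 0x9e3779b97f4a7c15UL, 0xbf58476d1ce4e5b9UL };
+  const void* null = (void*)0x1000;              // per-page sentinel standing in for NULL
+  void* p = (void*)0x7fff12345670UL;
+  mi_encoded_t x = mi_ptr_encode(null, p, keys); // ((p^k2)<<<k1)+k1
+  void* q = mi_ptr_decode(null, x, keys);        // inverse: ((x-k1)>>>k1)^k2
+  mi_assert_internal(q == p);
+}
+#endif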
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) {
+ #ifdef MI_ENCODE_FREELIST
+ return (mi_block_t*)mi_ptr_decode(null, block->next, keys);
+ #else
+ MI_UNUSED(keys); MI_UNUSED(null);
+ return (mi_block_t*)block->next;
+ #endif
+}
+
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) {
+ #ifdef MI_ENCODE_FREELIST
+ block->next = mi_ptr_encode(null, next, keys);
+ #else
+ MI_UNUSED(keys); MI_UNUSED(null);
+ block->next = (mi_encoded_t)next;
+ #endif
+}
+
+static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
+ #ifdef MI_ENCODE_FREELIST
+ mi_block_t* next = mi_block_nextx(page,block,page->keys);
+ // check for free list corruption: is `next` at least in the same page?
+ // TODO: check if `next` is `page->block_size` aligned?
+ if (mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next))) {
+ _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next);
+ next = NULL;
+ }
+ return next;
+ #else
+ MI_UNUSED(page);
+ return mi_block_nextx(page,block,NULL);
+ #endif
+}
+
+static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
+ #ifdef MI_ENCODE_FREELIST
+ mi_block_set_nextx(page,block,next, page->keys);
+ #else
+ MI_UNUSED(page);
+ mi_block_set_nextx(page,block,next,NULL);
+ #endif
+}
+
+
+// -------------------------------------------------------------------
+// commit mask
+// -------------------------------------------------------------------
+
+static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ cm->mask[i] = 0;
+ }
+}
+
+static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ cm->mask[i] = ~((size_t)0);
+ }
+}
+
+static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ if (cm->mask[i] != 0) return false;
+ }
+ return true;
+}
+
+static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ if (cm->mask[i] != ~((size_t)0)) return false;
+ }
+ return true;
+}
+
+// defined in `segment.c`:
+size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total);
+size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx);
+
+#define mi_commit_mask_foreach(cm,idx,count) \
+ idx = 0; \
+ while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) {
+
+#define mi_commit_mask_foreach_end() \
+ idx += count; \
+ }
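+
+// Compiled-out usage sketch of the foreach macros above: summing the committed
+// bytes of a mask, similar in spirit to `_mi_commit_mask_committed_size`
+// (MI_COMMIT_SIZE is defined in `mimalloc-types.h`):
+#if 0
+static size_t mi_commit_mask_committed_bytes_example(const mi_commit_mask_t* cm) {
+  size_t idx;
+  size_t count;
+  size_t committed = 0;
+  mi_commit_mask_foreach(cm, idx, count)
+    committed += count * MI_COMMIT_SIZE;  // each run covers `count` commit chunks
+  mi_commit_mask_foreach_end()
+  return committed;
+}
+#endif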
+
+
+
+
+// -------------------------------------------------------------------
+// Fast "random" shuffle
+// -------------------------------------------------------------------
+
+static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
+ if (x==0) { x = 17; } // ensure we don't get stuck in generating zeros
+#if (MI_INTPTR_SIZE==8)
+ // by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
+ x ^= x >> 30;
+ x *= 0xbf58476d1ce4e5b9UL;
+ x ^= x >> 27;
+ x *= 0x94d049bb133111ebUL;
+ x ^= x >> 31;
+#elif (MI_INTPTR_SIZE==4)
+ // by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
+ x ^= x >> 16;
+ x *= 0x7feb352dUL;
+ x ^= x >> 15;
+ x *= 0x846ca68bUL;
+ x ^= x >> 16;
+#endif
+ return x;
+}
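+
+// Illustrative, compiled-out sketch: chaining the shuffle to spread one seed
+// into several derived values (the real key generation lives in `random.c`):
+#if 0
+static void mi_random_shuffle_chain_example(uintptr_t seed) {
+  uintptr_t k1 = _mi_random_shuffle(seed); // first derived value
+  uintptr_t k2 = _mi_random_shuffle(k1);   // each output seeds the next
+  (void)k1; (void)k2;
+}
+#endif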
+
+// -------------------------------------------------------------------
+// Optimize numa node access for the common case (= one node)
+// -------------------------------------------------------------------
+
+int _mi_os_numa_node_get(mi_os_tld_t* tld);
+size_t _mi_os_numa_node_count_get(void);
+
+extern _Atomic(size_t) _mi_numa_node_count;
+static inline int _mi_os_numa_node(mi_os_tld_t* tld) {
+ if (mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1)) return 0;
+ else return _mi_os_numa_node_get(tld);
+}
+static inline size_t _mi_os_numa_node_count(void) {
+ const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count);
+ if (mi_likely(count>0)) return count;
+ else return _mi_os_numa_node_count_get();
+}
+
+
+// -------------------------------------------------------------------
+// Getting the thread id should be performant as it is called in the
+// fast path of `_mi_free` and we specialize for various platforms.
+// We only require `_mi_thread_id()` to return a unique id for each thread.
+// -------------------------------------------------------------------
+#if defined(_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+ // Windows: works on Intel and ARM in both 32- and 64-bit
+ return (uintptr_t)NtCurrentTeb();
+}
+
+// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
+// both the OS and libc implementation so we use specific tests for each main platform.
+// If you test on another platform and it works, please send a PR :-)
+// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
+#elif defined(__GNUC__) && ( \
+ (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
+ || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \
+ || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
+ || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+ || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+ )
+
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
+ void* res;
+ const size_t ofs = (slot*sizeof(void*));
+ #if defined(__i386__)
+ __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS
+ #elif defined(__APPLE__) && defined(__x86_64__)
+ __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS
+ #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+ __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI
+ #elif defined(__x86_64__)
+ __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS
+ #elif defined(__arm__)
+ void** tcb; MI_UNUSED(ofs);
+ __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+ res = tcb[slot];
+ #elif defined(__aarch64__)
+ void** tcb; MI_UNUSED(ofs);
+ #if defined(__APPLE__) // M1, issue #343
+ __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
+ #else
+ __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+ #endif
+ res = tcb[slot];
+ #endif
+ return res;
+}
+
+// Setting a TLS slot is only used on macOS for now
+static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+ const size_t ofs = (slot*sizeof(void*));
+ #if defined(__i386__)
+ __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS
+ #elif defined(__APPLE__) && defined(__x86_64__)
+ __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS
+ #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+ __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI
+ #elif defined(__x86_64__)
+ __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS
+ #elif defined(__arm__)
+ void** tcb; MI_UNUSED(ofs);
+ __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+ tcb[slot] = value;
+ #elif defined(__aarch64__)
+ void** tcb; MI_UNUSED(ofs);
+ #if defined(__APPLE__) // M1, issue #343
+ __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
+ #else
+ __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+ #endif
+ tcb[slot] = value;
+ #endif
+}
+
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+ #if defined(__BIONIC__)
+ // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
+ // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
+ return (uintptr_t)mi_tls_slot(1);
+ #else
+ // in all our other targets, slot 0 is the thread id
+ // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
+ // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
+ return (uintptr_t)mi_tls_slot(0);
+ #endif
+}
+
+#else
+
+// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+ return (uintptr_t)&_mi_heap_default;
+}
+
+#endif
+
+
+// -----------------------------------------------------------------------
+// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero)
+// -----------------------------------------------------------------------
+
+#if defined(__GNUC__)
+
+#include <limits.h> // LONG_MAX
+#define MI_HAVE_FAST_BITSCAN
+static inline size_t mi_clz(uintptr_t x) {
+ if (x==0) return MI_INTPTR_BITS;
+#if (INTPTR_MAX == LONG_MAX)
+ return __builtin_clzl(x);
+#else
+ return __builtin_clzll(x);
+#endif
+}
+static inline size_t mi_ctz(uintptr_t x) {
+ if (x==0) return MI_INTPTR_BITS;
+#if (INTPTR_MAX == LONG_MAX)
+ return __builtin_ctzl(x);
+#else
+ return __builtin_ctzll(x);
+#endif
+}
+
+#elif defined(_MSC_VER)
+
+#include <limits.h> // LONG_MAX
+#define MI_HAVE_FAST_BITSCAN
+static inline size_t mi_clz(uintptr_t x) {
+ if (x==0) return MI_INTPTR_BITS;
+ unsigned long idx;
+#if (INTPTR_MAX == LONG_MAX)
+ _BitScanReverse(&idx, x);
+#else
+ _BitScanReverse64(&idx, x);
+#endif
+ return ((MI_INTPTR_BITS - 1) - idx);
+}
+static inline size_t mi_ctz(uintptr_t x) {
+ if (x==0) return MI_INTPTR_BITS;
+ unsigned long idx;
+#if (INTPTR_MAX == LONG_MAX)
+ _BitScanForward(&idx, x);
+#else
+ _BitScanForward64(&idx, x);
+#endif
+ return idx;
+}
+
+#else
+static inline size_t mi_ctz32(uint32_t x) {
+ // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
+ static const unsigned char debruijn[32] = {
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+ };
+ if (x==0) return 32;
+ return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27];
+}
+static inline size_t mi_clz32(uint32_t x) {
+ // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
+ static const uint8_t debruijn[32] = {
+ 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
+ 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
+ };
+ if (x==0) return 32;
+ x |= x >> 1;
+ x |= x >> 2;
+ x |= x >> 4;
+ x |= x >> 8;
+ x |= x >> 16;
+ return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27];
+}
+
+static inline size_t mi_clz(uintptr_t x) {
+ if (x==0) return MI_INTPTR_BITS;
+#if (MI_INTPTR_BITS <= 32)
+ return mi_clz32((uint32_t)x);
+#else
+ size_t count = mi_clz32((uint32_t)(x >> 32));
+ if (count < 32) return count;
+ return (32 + mi_clz32((uint32_t)x));
+#endif
+}
+static inline size_t mi_ctz(uintptr_t x) {
+ if (x==0) return MI_INTPTR_BITS;
+#if (MI_INTPTR_BITS <= 32)
+ return mi_ctz32((uint32_t)x);
+#else
+ size_t count = mi_ctz32((uint32_t)x);
+ if (count < 32) return count;
+ return (32 + mi_ctz32((uint32_t)(x>>32)));
+#endif
+}
+
+#endif
+
+// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero)
+static inline size_t mi_bsr(uintptr_t x) {
+ return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x));
+}
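+
+// For example (illustrative): mi_bsr(1) == 0 and mi_bsr(12) == 3 (12 is 0b1100),
+// while mi_bsr(0) == MI_INTPTR_BITS by definition.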
+
+
+// ---------------------------------------------------------------------------------
+// Provide our own `_mi_memcpy` for potential performance optimizations.
+//
+// For now, only on Windows with msvc/clang-cl do we optimize to `rep movsb` when
+// we happen to run on x86/x64 CPUs that have "fast short rep movsb" (FSRM) support
+// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)). See also issue #201 and PR #253.
+// ---------------------------------------------------------------------------------
+
+#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+#include <string.h>
+extern bool _mi_cpu_has_fsrm;
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+ if (_mi_cpu_has_fsrm) {
+ __movsb((unsigned char*)dst, (const unsigned char*)src, n);
+ }
+ else {
+ memcpy(dst, src, n); // todo: use noinline?
+ }
+}
+#else
+#include <string.h>
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+ memcpy(dst, src, n);
+}
+#endif
+
+
+// -------------------------------------------------------------------------------
+// `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned.
+// This is used for example in `mi_realloc`.
+// -------------------------------------------------------------------------------
+
+#if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)
+// On GCC/Clang we provide a hint that the pointers are word aligned.
+#include <string.h>
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+ mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+ void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
+ const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE);
+ _mi_memcpy(adst, asrc, n);
+}
+#else
+// Default fallback on `_mi_memcpy`
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+ mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+ _mi_memcpy(dst, src, n);
+}
+#endif
+
+
+#endif
diff --git a/source/luametatex/source/libraries/mimalloc/include/mimalloc-new-delete.h b/source/luametatex/source/libraries/mimalloc/include/mimalloc-new-delete.h
new file mode 100644
index 000000000..2749a0be9
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/include/mimalloc-new-delete.h
@@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_NEW_DELETE_H
+#define MIMALLOC_NEW_DELETE_H
+
+// ----------------------------------------------------------------------------
+// This header provides convenient overrides for the new and
+// delete operations in C++.
+//
+// This header should be included in only one source file!
+//
+// On Windows, or when linking dynamically with mimalloc, these
+// can be more performant than the standard new-delete operations.
+// See <https://en.cppreference.com/w/cpp/memory/new/operator_new>
+// ---------------------------------------------------------------------------
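+//
+// A typical use (illustrative): include it in exactly one translation unit,
+// e.g. the one that defines `main`:
+//
+//   // main.cpp
+//   #include <mimalloc-new-delete.h>  // routes new/delete to mi_new/mi_free
+//   int main() { int* p = new int(42); delete p; return 0; }
+//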
+#if defined(__cplusplus)
+ #include <new>
+ #include <mimalloc.h>
+
+ void operator delete(void* p) noexcept { mi_free(p); };
+ void operator delete[](void* p) noexcept { mi_free(p); };
+
+ void operator delete (void* p, const std::nothrow_t&) noexcept { mi_free(p); }
+ void operator delete[](void* p, const std::nothrow_t&) noexcept { mi_free(p); }
+
+ void* operator new(std::size_t n) noexcept(false) { return mi_new(n); }
+ void* operator new[](std::size_t n) noexcept(false) { return mi_new(n); }
+
+ void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); }
+ void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); }
+
+ #if (__cplusplus >= 201402L || _MSC_VER >= 1916)
+ void operator delete (void* p, std::size_t n) noexcept { mi_free_size(p,n); };
+ void operator delete[](void* p, std::size_t n) noexcept { mi_free_size(p,n); };
+ #endif
+
+ #if (__cplusplus > 201402L || defined(__cpp_aligned_new))
+ void operator delete (void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+ void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+ void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
+ void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
+ void operator delete (void* p, std::align_val_t al, const std::nothrow_t& tag) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+ void operator delete[](void* p, std::align_val_t al, const std::nothrow_t& tag) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+
+ void* operator new (std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
+ void* operator new[](std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
+ void* operator new (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
+ void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
+ #endif
+#endif
+
+#endif // MIMALLOC_NEW_DELETE_H
diff --git a/source/luametatex/source/libraries/mimalloc/include/mimalloc-override.h b/source/luametatex/source/libraries/mimalloc/include/mimalloc-override.h
new file mode 100644
index 000000000..c63b0b91a
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/include/mimalloc-override.h
@@ -0,0 +1,67 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_OVERRIDE_H
+#define MIMALLOC_OVERRIDE_H
+
+/* ----------------------------------------------------------------------------
+This header can be used to statically redirect malloc/free and new/delete
+to the mimalloc variants. This can be useful if one can include this file in
+each source file of a project (but be careful, when using external code, not
+to accidentally mix pointers from different allocators).
+-----------------------------------------------------------------------------*/
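+
+// For example (illustrative), as the first include of every source file:
+//
+//   #include <mimalloc-override.h>
+//   /* ... all malloc/calloc/free calls below now map to mimalloc ... */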
+
+#include <mimalloc.h>
+
+// Standard C allocation
+#define malloc(n) mi_malloc(n)
+#define calloc(n,c) mi_calloc(n,c)
+#define realloc(p,n) mi_realloc(p,n)
+#define free(p) mi_free(p)
+
+#define strdup(s) mi_strdup(s)
+#define strndup(s,n) mi_strndup(s,n)
+#define realpath(f,n) mi_realpath(f,n)
+
+// Microsoft extensions
+#define _expand(p,n) mi_expand(p,n)
+#define _msize(p) mi_usable_size(p)
+#define _recalloc(p,n,c) mi_recalloc(p,n,c)
+
+#define _strdup(s) mi_strdup(s)
+#define _strndup(s,n) mi_strndup(s,n)
+#define _wcsdup(s) (wchar_t*)mi_wcsdup((const unsigned short*)(s))
+#define _mbsdup(s) mi_mbsdup(s)
+#define _dupenv_s(b,n,v) mi_dupenv_s(b,n,v)
+#define _wdupenv_s(b,n,v) mi_wdupenv_s((unsigned short*)(b),n,(const unsigned short*)(v))
+
+// Various Posix and Unix variants
+#define reallocf(p,n) mi_reallocf(p,n)
+#define malloc_size(p) mi_usable_size(p)
+#define malloc_usable_size(p) mi_usable_size(p)
+#define cfree(p) mi_free(p)
+
+#define valloc(n) mi_valloc(n)
+#define pvalloc(n) mi_pvalloc(n)
+#define reallocarray(p,s,n) mi_reallocarray(p,s,n)
+#define reallocarr(p,s,n) mi_reallocarr(p,s,n)
+#define memalign(a,n) mi_memalign(a,n)
+#define aligned_alloc(a,n) mi_aligned_alloc(a,n)
+#define posix_memalign(p,a,n) mi_posix_memalign(p,a,n)
+#define _posix_memalign(p,a,n) mi_posix_memalign(p,a,n)
+
+// Microsoft aligned variants
+#define _aligned_malloc(n,a) mi_malloc_aligned(n,a)
+#define _aligned_realloc(p,n,a) mi_realloc_aligned(p,n,a)
+#define _aligned_recalloc(p,s,n,a) mi_aligned_recalloc(p,s,n,a)
+#define _aligned_msize(p,a,o) mi_usable_size(p)
+#define _aligned_free(p) mi_free(p)
+#define _aligned_offset_malloc(n,a,o) mi_malloc_aligned_at(n,a,o)
+#define _aligned_offset_realloc(p,n,a,o) mi_realloc_aligned_at(p,n,a,o)
+#define _aligned_offset_recalloc(p,s,n,a,o) mi_recalloc_aligned_at(p,s,n,a,o)
+
+#endif // MIMALLOC_OVERRIDE_H
diff --git a/source/luametatex/source/libraries/mimalloc/include/mimalloc-types.h b/source/luametatex/source/libraries/mimalloc/include/mimalloc-types.h
new file mode 100644
index 000000000..fb75ea464
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/include/mimalloc-types.h
@@ -0,0 +1,598 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_TYPES_H
+#define MIMALLOC_TYPES_H
+
+#include <stddef.h> // ptrdiff_t
+#include <stdint.h> // uintptr_t, uint16_t, etc
+#include "mimalloc-atomic.h" // _Atomic
+
+#ifdef _MSC_VER
+#pragma warning(disable:4214) // bitfield is not int
+#endif
+
+// Minimal alignment necessary. On most platforms 16 bytes are needed
+// due to SSE registers for example. This must be at least `sizeof(void*)`
+#ifndef MI_MAX_ALIGN_SIZE
+#define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t)
+#endif
+
+// ------------------------------------------------------
+// Variants
+// ------------------------------------------------------
+
+// Define NDEBUG in the release version to disable assertions.
+// #define NDEBUG
+
+// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
+// #define MI_STAT 1
+
+// Define MI_SECURE to enable security mitigations
+// #define MI_SECURE 1 // guard page around metadata
+// #define MI_SECURE 2 // guard page around each mimalloc page
+// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
+// #define MI_SECURE 4 // checks for double free. (may be more expensive)
+
+#if !defined(MI_SECURE)
+#define MI_SECURE 0
+#endif
+
+// Define MI_DEBUG for debug mode
+// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free.
+// #define MI_DEBUG 2 // + internal assertion checks
+// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
+#if !defined(MI_DEBUG)
+#if !defined(NDEBUG) || defined(_DEBUG)
+#define MI_DEBUG 2
+#else
+#define MI_DEBUG 0
+#endif
+#endif
+
+// Reserve extra padding at the end of each block to be more resilient against heap block overflows.
+// The padding can detect byte-precise buffer overflow on free.
+#if !defined(MI_PADDING) && (MI_DEBUG>=1)
+#define MI_PADDING 1
+#endif
+
+
+// Encoded free lists allow detection of corrupted free lists
+// and can detect buffer overflows, modify after free, and double `free`s.
+#if (MI_SECURE>=3 || MI_DEBUG>=1 || MI_PADDING > 0)
+#define MI_ENCODE_FREELIST 1
+#endif
+
+
+// ------------------------------------------------------
+// Platform specific values
+// ------------------------------------------------------
+
+// ------------------------------------------------------
+// Size of a pointer.
+// We assume that `sizeof(void*)==sizeof(intptr_t)`
+// which holds on all platforms we know of.
+//
+// However, the C standard only requires that:
+// p == (void*)((intptr_t)p))
+// but we also need:
+// i == (intptr_t)((void*)i)
+// or otherwise one might define an intptr_t type that is larger than a pointer...
+// ------------------------------------------------------
+
+#if INTPTR_MAX > INT64_MAX
+# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example)
+#elif INTPTR_MAX == INT64_MAX
+# define MI_INTPTR_SHIFT (3)
+#elif INTPTR_MAX == INT32_MAX
+# define MI_INTPTR_SHIFT (2)
+#else
+#error platform pointers must be 32, 64, or 128 bits
+#endif
+
+#if SIZE_MAX == UINT64_MAX
+# define MI_SIZE_SHIFT (3)
+typedef int64_t mi_ssize_t;
+#elif SIZE_MAX == UINT32_MAX
+# define MI_SIZE_SHIFT (2)
+typedef int32_t mi_ssize_t;
+#else
+#error platform objects must be 32 or 64 bits
+#endif
+
+#if (SIZE_MAX/2) > LONG_MAX
+# define MI_ZU(x) x##ULL
+# define MI_ZI(x) x##LL
+#else
+# define MI_ZU(x) x##UL
+# define MI_ZI(x) x##L
+#endif
+
+#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
+#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
+
+#define MI_SIZE_SIZE (1<<MI_SIZE_SHIFT)
+#define MI_SIZE_BITS (MI_SIZE_SIZE*8)
+
+#define MI_KiB (MI_ZU(1024))
+#define MI_MiB (MI_KiB*MI_KiB)
+#define MI_GiB (MI_MiB*MI_KiB)
+
+
+// ------------------------------------------------------
+// Main internal data-structures
+// ------------------------------------------------------
+
+// Main tuning parameters for segment and page sizes
+// Sizes for 64-bit (usually divide by two for 32-bit)
+#define MI_SEGMENT_SLICE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB (32KiB on 32-bit)
+
+#if MI_INTPTR_SIZE > 4
+#define MI_SEGMENT_SHIFT (10 + MI_SEGMENT_SLICE_SHIFT) // 64MiB
+#else
+#define MI_SEGMENT_SHIFT ( 7 + MI_SEGMENT_SLICE_SHIFT) // 4MiB on 32-bit
+#endif
+
+#define MI_SMALL_PAGE_SHIFT (MI_SEGMENT_SLICE_SHIFT) // 64KiB
+#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB
+
+
+// Derived constants
+#define MI_SEGMENT_SIZE (MI_ZU(1)<<MI_SEGMENT_SHIFT)
+#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
+#define MI_SEGMENT_MASK (MI_SEGMENT_SIZE - 1)
+#define MI_SEGMENT_SLICE_SIZE (MI_ZU(1)<< MI_SEGMENT_SLICE_SHIFT)
+#define MI_SLICES_PER_SEGMENT (MI_SEGMENT_SIZE / MI_SEGMENT_SLICE_SIZE) // 1024
+
+#define MI_SMALL_PAGE_SIZE (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
+#define MI_MEDIUM_PAGE_SIZE (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
+
+#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/4) // 8KiB on 64-bit
+#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128KiB on 64-bit
+#define MI_MEDIUM_OBJ_WSIZE_MAX (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
+#define MI_LARGE_OBJ_SIZE_MAX (MI_SEGMENT_SIZE/2) // 32MiB on 64-bit
+#define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
+
+// Maximum number of size classes. (spaced exponentially in 12.5% increments)
+#define MI_BIN_HUGE (73U)
+
+#if (MI_MEDIUM_OBJ_WSIZE_MAX >= 655360)
+#error "mimalloc internal: define more bins"
+#endif
+#if (MI_ALIGNMENT_MAX > MI_SEGMENT_SIZE/2)
+#error "mimalloc internal: the max aligned boundary is too large for the segment size"
+#endif
+#if (MI_ALIGNMENT_MAX % MI_SEGMENT_SLICE_SIZE != 0)
+#error "mimalloc internal: the max aligned boundary must be an integral multiple of the segment slice size"
+#endif
+
+// Maximum slice offset (15)
+#define MI_MAX_SLICE_OFFSET ((MI_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)
+
+// Used as a special value to encode block sizes in 32 bits.
+#define MI_HUGE_BLOCK_SIZE ((uint32_t)(2*MI_GiB))
+
+// blocks up to this size are always allocated aligned
+#define MI_MAX_ALIGN_GUARANTEE (8*MI_MAX_ALIGN_SIZE)
+
+
+
+
+// ------------------------------------------------------
+// Mimalloc pages contain allocated blocks
+// ------------------------------------------------------
+
+// The free lists use encoded next fields
+// (Only actually encodes when MI_ENCODE_FREELIST is defined.)
+typedef uintptr_t mi_encoded_t;
+
+// thread id's
+typedef size_t mi_threadid_t;
+
+// free lists contain blocks
+typedef struct mi_block_s {
+ mi_encoded_t next;
+} mi_block_t;
+
+
+// The delayed flags are used for efficient multi-threaded free-ing
+typedef enum mi_delayed_e {
+ MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list
+ MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap
+ MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
+ MI_NEVER_DELAYED_FREE = 3 // sticky, only resets on page reclaim
+} mi_delayed_t;
+
+
+// The `in_full` and `has_aligned` page flags are put in a union to efficiently
+// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
+#if !MI_TSAN
+typedef union mi_page_flags_s {
+ uint8_t full_aligned;
+ struct {
+ uint8_t in_full : 1;
+ uint8_t has_aligned : 1;
+ } x;
+} mi_page_flags_t;
+#else
+// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
+typedef union mi_page_flags_s {
+ uint16_t full_aligned;
+ struct {
+ uint8_t in_full;
+ uint8_t has_aligned;
+ } x;
+} mi_page_flags_t;
+#endif
+
+// Thread free list.
+// We use the bottom 2 bits of the pointer for mi_delayed_t flags
+typedef uintptr_t mi_thread_free_t;
+
+// A page contains blocks of one specific size (`block_size`).
+// Each page has three lists of free blocks:
+// `free` for blocks that can be allocated,
+// `local_free` for freed blocks that are not yet available to `mi_malloc`, and
+// `thread_free` for blocks freed by other threads.
+// The `local_free` and `thread_free` lists are migrated to the `free` list
+// when it is exhausted. The separate `local_free` list is necessary to
+// implement a monotonic heartbeat. The `thread_free` list is needed for
+// avoiding atomic operations in the common case.
+//
+//
+// `used - |thread_free|` == actual blocks that are in use (alive)
+// `used - |thread_free| + |free| + |local_free| == capacity`
+//
+// We don't count `freed` (as |free|) but use `used` to reduce
+// the number of memory accesses in the `mi_page_all_free` function(s).
+//
+// Notes:
+// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`)
+// - Using `uint16_t` does not seem to slow things down
+// - The size is 8 words on 64-bit which helps the page index calculations
+// (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10
+// and 12 are still good for address calculation)
+// - To limit the structure size, the `xblock_size` is 32-bits only; for
+// blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size
+// - `thread_free` uses the bottom bits as a delayed-free flags to optimize
+// concurrent frees where only the first concurrent free adds to the owning
+// heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`).
+// The invariant is that no-delayed-free is only set if there is
+//   at least one block that will be added, or has already been added, to
+// the owning heap `thread_delayed_free` list. This guarantees that pages
+// will be freed correctly even if only other threads free blocks.
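+//
+//   Example (illustrative): with `capacity == 8`, two blocks on `free`, one on
+//   `local_free`, and one on `thread_free`, the invariant gives `used == 6`,
+//   of which `used - |thread_free| == 5` blocks are actually alive.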
+typedef struct mi_page_s {
+ // "owned" by the segment
+ uint32_t slice_count; // slices in this page (0 if not a page)
+ uint32_t slice_offset; // distance from the actual page data slice (0 if a page)
+ uint8_t is_reset : 1; // `true` if the page memory was reset
+ uint8_t is_committed : 1; // `true` if the page virtual memory is committed
+ uint8_t is_zero_init : 1; // `true` if the page was zero initialized
+
+ // layout like this to optimize access in `mi_malloc` and `mi_free`
+ uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear`
+ uint16_t reserved; // number of blocks reserved in memory
+ mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits)
+ uint8_t is_zero : 1; // `true` if the blocks in the free list are zero initialized
+ uint8_t retire_expire : 7; // expiration count for retired blocks
+
+ mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
+ #ifdef MI_ENCODE_FREELIST
+ uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`)
+ #endif
+ uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`)
+ uint32_t xblock_size; // size available in each block (always `>0`)
+
+ mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
+ _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
+ _Atomic(uintptr_t) xheap;
+
+ struct mi_page_s* next; // next page owned by this thread with the same `block_size`
+ struct mi_page_s* prev; // previous page owned by this thread with the same `block_size`
+
+ // 64-bit 9 words, 32-bit 12 words, (+2 for secure)
+ #if MI_INTPTR_SIZE==8
+ uintptr_t padding[1];
+ #endif
+} mi_page_t;
+
+
+
+typedef enum mi_page_kind_e {
+ MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment
+ MI_PAGE_MEDIUM, // medium blocks go into medium pages inside a segment
+ MI_PAGE_LARGE, // larger blocks go into a page of just one block
+  MI_PAGE_HUGE,    // huge blocks (> MI_LARGE_OBJ_SIZE_MAX) are put into a single page in a single segment.
+} mi_page_kind_t;
+
+typedef enum mi_segment_kind_e {
+ MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside.
+  MI_SEGMENT_HUGE,   // > MI_LARGE_OBJ_SIZE_MAX segment with just one huge page inside.
+} mi_segment_kind_t;
+
+// ------------------------------------------------------
+// A segment holds a commit mask where a bit is set if
+// the corresponding MI_COMMIT_SIZE area is committed.
+// The MI_COMMIT_SIZE must be a multiple of the slice
+// size. If they are equal we get the most fine-grained
+// decommit (but setting it higher can be more efficient).
+// MI_MINIMAL_COMMIT_SIZE is the minimal amount that is
+// committed in one go, which can be set higher than
+// MI_COMMIT_SIZE for efficiency (while the decommit mask
+// is still tracked in fine-grained MI_COMMIT_SIZE chunks).
+// ------------------------------------------------------
+
+#define MI_MINIMAL_COMMIT_SIZE (2*MI_MiB)
+#define MI_COMMIT_SIZE (MI_SEGMENT_SLICE_SIZE) // 64KiB
+#define MI_COMMIT_MASK_BITS (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)
+#define MI_COMMIT_MASK_FIELD_BITS MI_SIZE_BITS
+#define MI_COMMIT_MASK_FIELD_COUNT (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS)
+
+#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS))
+#error "the segment size must be exactly divisible by the (commit size * size_t bits)"
+#endif
+
+typedef struct mi_commit_mask_s {
+ size_t mask[MI_COMMIT_MASK_FIELD_COUNT];
+} mi_commit_mask_t;
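+
+// Worked example (illustrative, 64-bit defaults): MI_SEGMENT_SIZE is 64MiB and
+// MI_COMMIT_SIZE is 64KiB, so MI_COMMIT_MASK_BITS == 1024; with
+// MI_COMMIT_MASK_FIELD_BITS == 64 this gives MI_COMMIT_MASK_FIELD_COUNT == 16
+// size_t fields per mask.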
+
+typedef mi_page_t mi_slice_t;
+typedef int64_t mi_msecs_t;
+
+
+// Segments are large blocks of memory (64MiB on 64-bit, see
+// MI_SEGMENT_SHIFT) allocated from the OS. Inside segments we
+// allocate fixed-size _pages_ that contain blocks.
+typedef struct mi_segment_s {
+ size_t memid; // memory id for arena allocation
+ bool mem_is_pinned; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)
+ bool mem_is_large; // in large/huge os pages?
+ bool mem_is_committed; // `true` if the whole segment is eagerly committed
+
+ bool allow_decommit;
+ mi_msecs_t decommit_expire;
+ mi_commit_mask_t decommit_mask;
+ mi_commit_mask_t commit_mask;
+
+ _Atomic(struct mi_segment_s*) abandoned_next;
+
+ // from here is zero initialized
+ struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`)
+
+ size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
+  size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force a reclaim if it stays there too long)
+ size_t used; // count of pages in use
+ uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`
+
+ size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT`
+  size_t segment_info_slices; // initial slices used for the segment info and possible guard pages.
+
+ // layout like this to optimize access in `mi_free`
+ mi_segment_kind_t kind;
+ _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment
+ size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT`
+ mi_slice_t slices[MI_SLICES_PER_SEGMENT];
+} mi_segment_t;
+
+
+// ------------------------------------------------------
+// Heaps
+// Provide first-class heaps to allocate from.
+// A heap just owns a set of pages for allocation and
+// can only allocate/reallocate from the thread that created it.
+// Freeing blocks can be done from any thread though.
+// Per thread, the segments are shared among its heaps.
+// Per thread, there is always a default heap that is
+// used for allocation; it is initialized to statically
+// point to an empty heap to avoid initialization checks
+// in the fast path.
+// ------------------------------------------------------
+
+// Thread local data
+typedef struct mi_tld_s mi_tld_t;
+
+// Pages of a certain block size are held in a queue.
+typedef struct mi_page_queue_s {
+ mi_page_t* first;
+ mi_page_t* last;
+ size_t block_size;
+} mi_page_queue_t;
+
+#define MI_BIN_FULL (MI_BIN_HUGE+1)
+
+// Random context
+typedef struct mi_random_cxt_s {
+ uint32_t input[16];
+ uint32_t output[16];
+ int output_available;
+} mi_random_ctx_t;
+
+
+// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
+#if (MI_PADDING)
+typedef struct mi_padding_s {
+ uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
+ uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
+} mi_padding_t;
+#define MI_PADDING_SIZE (sizeof(mi_padding_t))
+#define MI_PADDING_WSIZE ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE)
+#else
+#define MI_PADDING_SIZE 0
+#define MI_PADDING_WSIZE 0
+#endif
+
+#define MI_PAGES_DIRECT (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1)
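+// (For example, on 64-bit with MI_PADDING enabled: 128 + 1 + 1 == 130 direct
+// page entries.)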
+
+
+// A heap owns a set of pages.
+struct mi_heap_s {
+ mi_tld_t* tld;
+  mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points to a page with possibly free blocks in the corresponding queue for that size.
+ mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin")
+ _Atomic(mi_block_t*) thread_delayed_free;
+  mi_threadid_t thread_id; // thread this heap belongs to
+ uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`)
+ uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list
+ mi_random_ctx_t random; // random number context used for secure allocation
+ size_t page_count; // total number of pages in the `pages` queues.
+ size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues)
+ size_t page_retired_max; // largest retired index into the `pages` array.
+ mi_heap_t* next; // list of heaps per thread
+ bool no_reclaim; // `true` if this heap should not reclaim abandoned pages
+};
+
+
+
+// ------------------------------------------------------
+// Debug
+// ------------------------------------------------------
+
+#if !defined(MI_DEBUG_UNINIT)
+#define MI_DEBUG_UNINIT (0xD0)
+#endif
+#if !defined(MI_DEBUG_FREED)
+#define MI_DEBUG_FREED (0xDF)
+#endif
+#if !defined(MI_DEBUG_PADDING)
+#define MI_DEBUG_PADDING (0xDE)
+#endif
+
+#if (MI_DEBUG)
+// use our own assertion to print without memory allocation
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
+#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
+#else
+#define mi_assert(x)
+#endif
+
+#if (MI_DEBUG>1)
+#define mi_assert_internal mi_assert
+#else
+#define mi_assert_internal(x)
+#endif
+
+#if (MI_DEBUG>2)
+#define mi_assert_expensive mi_assert
+#else
+#define mi_assert_expensive(x)
+#endif
+
+// ------------------------------------------------------
+// Statistics
+// ------------------------------------------------------
+
+#ifndef MI_STAT
+#if (MI_DEBUG>0)
+#define MI_STAT 2
+#else
+#define MI_STAT 0
+#endif
+#endif
+
+typedef struct mi_stat_count_s {
+ int64_t allocated;
+ int64_t freed;
+ int64_t peak;
+ int64_t current;
+} mi_stat_count_t;
+
+typedef struct mi_stat_counter_s {
+ int64_t total;
+ int64_t count;
+} mi_stat_counter_t;
+
+typedef struct mi_stats_s {
+ mi_stat_count_t segments;
+ mi_stat_count_t pages;
+ mi_stat_count_t reserved;
+ mi_stat_count_t committed;
+ mi_stat_count_t reset;
+ mi_stat_count_t page_committed;
+ mi_stat_count_t segments_abandoned;
+ mi_stat_count_t pages_abandoned;
+ mi_stat_count_t threads;
+ mi_stat_count_t normal;
+ mi_stat_count_t huge;
+ mi_stat_count_t large;
+ mi_stat_count_t malloc;
+ mi_stat_count_t segments_cache;
+ mi_stat_counter_t pages_extended;
+ mi_stat_counter_t mmap_calls;
+ mi_stat_counter_t commit_calls;
+ mi_stat_counter_t page_no_retire;
+ mi_stat_counter_t searches;
+ mi_stat_counter_t normal_count;
+ mi_stat_counter_t huge_count;
+ mi_stat_counter_t large_count;
+#if MI_STAT>1
+ mi_stat_count_t normal_bins[MI_BIN_HUGE+1];
+#endif
+} mi_stats_t;
+
+
+void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
+void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
+void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
+
+#if (MI_STAT)
+#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount)
+#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount)
+#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
+#else
+#define mi_stat_increase(stat,amount) (void)0
+#define mi_stat_decrease(stat,amount) (void)0
+#define mi_stat_counter_increase(stat,amount) (void)0
+#endif
+
+#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
+#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount)
+#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount)
+
+// ------------------------------------------------------
+// Thread Local data
+// ------------------------------------------------------
+
+// A "span" is is an available range of slices. The span queues keep
+// track of slice spans of at most the given `slice_count` (but more than the previous size class).
+typedef struct mi_span_queue_s {
+ mi_slice_t* first;
+ mi_slice_t* last;
+ size_t slice_count;
+} mi_span_queue_t;
+
+#define MI_SEGMENT_BIN_MAX (35) // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT)
+
+// OS thread local data
+typedef struct mi_os_tld_s {
+ size_t region_idx; // start point for next allocation
+ mi_stats_t* stats; // points to tld stats
+} mi_os_tld_t;
+
+
+// Segments thread local data
+typedef struct mi_segments_tld_s {
+ mi_span_queue_t spans[MI_SEGMENT_BIN_MAX+1]; // free slice spans inside segments
+  size_t count; // current number of segments
+ size_t peak_count; // peak number of segments
+ size_t current_size; // current size of all segments
+ size_t peak_size; // peak size of all segments
+ mi_stats_t* stats; // points to tld stats
+ mi_os_tld_t* os; // points to os stats
+} mi_segments_tld_t;
+
+// Thread local data
+struct mi_tld_s {
+ unsigned long long heartbeat; // monotonic heartbeat count
+ bool recurse; // true if deferred was called; used to prevent infinite recursion.
+ mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted)
+ mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates)
+ mi_segments_tld_t segments; // segment tld
+ mi_os_tld_t os; // os tld
+ mi_stats_t stats; // statistics
+};
+
+#endif
diff --git a/source/luametatex/source/libraries/mimalloc/include/mimalloc.h b/source/luametatex/source/libraries/mimalloc/include/mimalloc.h
new file mode 100644
index 000000000..c752ac247
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/include/mimalloc.h
@@ -0,0 +1,453 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_H
+#define MIMALLOC_H
+
+#define MI_MALLOC_VERSION 206 // major + 2 digits minor
+
+// ------------------------------------------------------
+// Compiler specific attributes
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+ #if (__cplusplus >= 201103L) || (_MSC_VER > 1900) // C++11
+ #define mi_attr_noexcept noexcept
+ #else
+ #define mi_attr_noexcept throw()
+ #endif
+#else
+ #define mi_attr_noexcept
+#endif
+
+#if defined(__cplusplus) && (__cplusplus >= 201703)
+ #define mi_decl_nodiscard [[nodiscard]]
+#elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) // includes clang, icc, and clang-cl
+ #define mi_decl_nodiscard __attribute__((warn_unused_result))
+#elif (_MSC_VER >= 1700)
+ #define mi_decl_nodiscard _Check_return_
+#else
+ #define mi_decl_nodiscard
+#endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+ #if !defined(MI_SHARED_LIB)
+ #define mi_decl_export
+ #elif defined(MI_SHARED_LIB_EXPORT)
+ #define mi_decl_export __declspec(dllexport)
+ #else
+ #define mi_decl_export __declspec(dllimport)
+ #endif
+ #if defined(__MINGW32__)
+ #define mi_decl_restrict
+ #define mi_attr_malloc __attribute__((malloc))
+ #else
+ #if (_MSC_VER >= 1900) && !defined(__EDG__)
+ #define mi_decl_restrict __declspec(allocator) __declspec(restrict)
+ #else
+ #define mi_decl_restrict __declspec(restrict)
+ #endif
+ #define mi_attr_malloc
+ #endif
+ #define mi_cdecl __cdecl
+ #define mi_attr_alloc_size(s)
+ #define mi_attr_alloc_size2(s1,s2)
+ #define mi_attr_alloc_align(p)
+#elif defined(__GNUC__) // includes clang and icc
+ #if defined(MI_SHARED_LIB) && defined(MI_SHARED_LIB_EXPORT)
+ #define mi_decl_export __attribute__((visibility("default")))
+ #else
+ #define mi_decl_export
+ #endif
+ #define mi_cdecl // leads to warnings... __attribute__((cdecl))
+ #define mi_decl_restrict
+ #define mi_attr_malloc __attribute__((malloc))
+ #if (defined(__clang_major__) && (__clang_major__ < 4)) || (__GNUC__ < 5)
+ #define mi_attr_alloc_size(s)
+ #define mi_attr_alloc_size2(s1,s2)
+ #define mi_attr_alloc_align(p)
+ #elif defined(__INTEL_COMPILER)
+ #define mi_attr_alloc_size(s) __attribute__((alloc_size(s)))
+ #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2)))
+ #define mi_attr_alloc_align(p)
+ #else
+ #define mi_attr_alloc_size(s) __attribute__((alloc_size(s)))
+ #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2)))
+ #define mi_attr_alloc_align(p) __attribute__((alloc_align(p)))
+ #endif
+#else
+ #define mi_cdecl
+ #define mi_decl_export
+ #define mi_decl_restrict
+ #define mi_attr_malloc
+ #define mi_attr_alloc_size(s)
+ #define mi_attr_alloc_size2(s1,s2)
+ #define mi_attr_alloc_align(p)
+#endif
+
+// ------------------------------------------------------
+// Includes
+// ------------------------------------------------------
+
+#include <stddef.h> // size_t
+#include <stdbool.h> // bool
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------------------------------------------
+// Standard malloc interface
+// ------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_export void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2);
+
+mi_decl_export void mi_free(void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc;
+
+// ------------------------------------------------------
+// Extended functionality
+// ------------------------------------------------------
+#define MI_SMALL_WSIZE_MAX (128)
+#define MI_SMALL_SIZE_MAX (MI_SMALL_WSIZE_MAX*sizeof(void*))
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+mi_decl_nodiscard mi_decl_export void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2);
+
+mi_decl_nodiscard mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept;
+
+
+// ------------------------------------------------------
+// Internals
+// ------------------------------------------------------
+
+typedef void (mi_cdecl mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg);
+mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free, void* arg) mi_attr_noexcept;
+
+typedef void (mi_cdecl mi_output_fun)(const char* msg, void* arg);
+mi_decl_export void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+typedef void (mi_cdecl mi_error_fun)(int err, void* arg);
+mi_decl_export void mi_register_error(mi_error_fun* fun, void* arg);
+
+mi_decl_export void mi_collect(bool force) mi_attr_noexcept;
+mi_decl_export int mi_version(void) mi_attr_noexcept;
+mi_decl_export void mi_stats_reset(void) mi_attr_noexcept;
+mi_decl_export void mi_stats_merge(void) mi_attr_noexcept;
+mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept; // backward compatibility: `out` is ignored and should be NULL
+mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+mi_decl_export void mi_process_init(void) mi_attr_noexcept;
+mi_decl_export void mi_thread_init(void) mi_attr_noexcept;
+mi_decl_export void mi_thread_done(void) mi_attr_noexcept;
+mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs,
+ size_t* current_rss, size_t* peak_rss,
+ size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept;
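+
+// Example (a minimal sketch; `print_msg` and `flush_deferred` are hypothetical
+// user functions, not declared by this header):
+//
+//   static void print_msg(const char* msg, void* arg) {
+//     fputs(msg, stderr);   // route mimalloc messages to stderr
+//   }
+//   static void flush_deferred(bool force, unsigned long long heartbeat, void* arg) {
+//     /* free queued objects here; mimalloc invokes this at safe points */
+//   }
+//   ...
+//   mi_register_output(&print_msg, NULL);
+//   mi_register_deferred_free(&flush_deferred, NULL);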
+
+// -------------------------------------------------------------------------------------
+// Aligned allocation
+// Note that `alignment` always follows `size` for consistency with unaligned
+// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`.
+// -------------------------------------------------------------------------------------
+#define MI_ALIGNMENT_MAX (1024*1024UL) // maximum supported alignment is 1MiB
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
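+
+// Example (sketch): `size` precedes `alignment`, unlike `aligned_alloc`:
+//
+//   void* buf = mi_malloc_aligned(1000, 64);   // 1000 bytes at a 64-byte boundary
+//   ...
+//   mi_free(buf);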
+
+
+// -------------------------------------------------------------------------------------
+// Heaps: first-class, but a heap can only allocate from the thread that created it.
+// -------------------------------------------------------------------------------------
+
+struct mi_heap_s;
+typedef struct mi_heap_s mi_heap_t;
+
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new(void);
+mi_decl_export void mi_heap_delete(mi_heap_t* heap);
+mi_decl_export void mi_heap_destroy(mi_heap_t* heap);
+mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap);
+mi_decl_export mi_heap_t* mi_heap_get_default(void);
+mi_decl_export mi_heap_t* mi_heap_get_backing(void);
+mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3);
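+
+// Example (a minimal sketch of first-class heap usage; `n` is hypothetical):
+//
+//   mi_heap_t* heap = mi_heap_new();                 // heap tied to this thread
+//   int* xs = (int*)mi_heap_malloc(heap, n * sizeof(int));
+//   ...
+//   mi_heap_destroy(heap);                           // frees all its blocks at once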
+
+
+// --------------------------------------------------------------------------------
+// Zero initialized re-allocation.
+// Only valid on memory that was originally allocated with zero initialization too.
+// e.g. `mi_calloc`, `mi_zalloc`, `mi_zalloc_aligned` etc.
+// see <https://github.com/microsoft/mimalloc/issues/63#issuecomment-508272992>
+// --------------------------------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc(void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(2,3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3,4);
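+
+// Example (sketch): zero-initialized reallocation must start from memory that
+// was allocated zero-initialized (here via `mi_zalloc`; `count` is hypothetical):
+//
+//   int* xs = (int*)mi_zalloc(count * sizeof(int));
+//   xs = (int*)mi_rezalloc(xs, 2 * count * sizeof(int));  // grown part is zeroed too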
+
+
+// ------------------------------------------------------
+// Analysis
+// ------------------------------------------------------
+
+mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p);
+mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p);
+mi_decl_export bool mi_check_owned(const void* p);
+
+// An area of heap space contains blocks of a single size.
+typedef struct mi_heap_area_s {
+ void* blocks; // start of the area containing heap blocks
+ size_t reserved; // bytes reserved for this area (virtual)
+ size_t committed; // current committed bytes of this area
+ size_t used; // number of allocated blocks
+ size_t block_size; // size in bytes of each block
+ size_t full_block_size; // size in bytes of a full block including padding and metadata.
+} mi_heap_area_t;
+
+typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg);
+
+mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg);
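+
+// Example (sketch): counting allocated blocks with a visitor; `count_block` is
+// a hypothetical user function matching `mi_block_visit_fun`:
+//
+//   static bool count_block(const mi_heap_t* heap, const mi_heap_area_t* area,
+//                           void* block, size_t block_size, void* arg) {
+//     if (block != NULL) { (*(size_t*)arg)++; }
+//     return true;  // continue visiting
+//   }
+//   ...
+//   size_t n = 0;
+//   mi_heap_visit_blocks(heap, true /* visit all blocks */, &count_block, &n);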
+
+// Experimental
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept;
+
+mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
+mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
+
+mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
+mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept;
+
+mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept;
+
+// deprecated
+mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
+
+
+// ------------------------------------------------------
+// Convenience
+// ------------------------------------------------------
+
+#define mi_malloc_tp(tp) ((tp*)mi_malloc(sizeof(tp)))
+#define mi_zalloc_tp(tp) ((tp*)mi_zalloc(sizeof(tp)))
+#define mi_calloc_tp(tp,n) ((tp*)mi_calloc(n,sizeof(tp)))
+#define mi_mallocn_tp(tp,n) ((tp*)mi_mallocn(n,sizeof(tp)))
+#define mi_reallocn_tp(p,tp,n) ((tp*)mi_reallocn(p,n,sizeof(tp)))
+#define mi_recalloc_tp(p,tp,n) ((tp*)mi_recalloc(p,n,sizeof(tp)))
+
+#define mi_heap_malloc_tp(hp,tp) ((tp*)mi_heap_malloc(hp,sizeof(tp)))
+#define mi_heap_zalloc_tp(hp,tp) ((tp*)mi_heap_zalloc(hp,sizeof(tp)))
+#define mi_heap_calloc_tp(hp,tp,n) ((tp*)mi_heap_calloc(hp,n,sizeof(tp)))
+#define mi_heap_mallocn_tp(hp,tp,n) ((tp*)mi_heap_mallocn(hp,n,sizeof(tp)))
+#define mi_heap_reallocn_tp(hp,p,tp,n) ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp)))
+#define mi_heap_recalloc_tp(hp,p,tp,n) ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp)))
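+
+// Example (sketch; `point_t` is a hypothetical user type):
+//
+//   point_t* p  = mi_malloc_tp(point_t);       // one uninitialized point_t
+//   point_t* ps = mi_calloc_tp(point_t, 16);   // 16 zero-initialized point_t's
+//   mi_free(ps); mi_free(p);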
+
+
+// ------------------------------------------------------
+// Options
+// ------------------------------------------------------
+
+typedef enum mi_option_e {
+ // stable options
+ mi_option_show_errors,
+ mi_option_show_stats,
+ mi_option_verbose,
+ // some of the following options are experimental
+ // (deprecated options are kept for binary backward compatibility with v1.x versions)
+ mi_option_eager_commit,
+ mi_option_deprecated_eager_region_commit,
+ mi_option_deprecated_reset_decommits,
+ mi_option_large_os_pages, // use large (2MiB) OS pages, implies eager commit
+ mi_option_reserve_huge_os_pages, // reserve N huge OS pages (1GiB) at startup
+ mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node
+ mi_option_reserve_os_memory, // reserve specified amount of OS memory at startup
+ mi_option_deprecated_segment_cache,
+ mi_option_page_reset,
+ mi_option_abandoned_page_decommit,
+ mi_option_deprecated_segment_reset,
+ mi_option_eager_commit_delay,
+ mi_option_decommit_delay,
+ mi_option_use_numa_nodes, // 0 = use available numa nodes, otherwise use at most N nodes.
+ mi_option_limit_os_alloc, // 1 = do not use OS memory for allocation (but only reserved arenas)
+ mi_option_os_tag,
+ mi_option_max_errors,
+ mi_option_max_warnings,
+ mi_option_max_segment_reclaim,
+ mi_option_allow_decommit,
+ mi_option_segment_decommit_delay,
+ mi_option_decommit_extend_delay,
+ _mi_option_last
+} mi_option_t;
+
+
+mi_decl_nodiscard mi_decl_export bool mi_option_is_enabled(mi_option_t option);
+mi_decl_export void mi_option_enable(mi_option_t option);
+mi_decl_export void mi_option_disable(mi_option_t option);
+mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable);
+mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable);
+
+mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option);
+mi_decl_nodiscard mi_decl_export long mi_option_get_clamp(mi_option_t option, long min, long max);
+mi_decl_export void mi_option_set(mi_option_t option, long value);
+mi_decl_export void mi_option_set_default(mi_option_t option, long value);
+
+
+// -------------------------------------------------------------------------------------------------------
+// "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions.
+// (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.)
+// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our heap before free-ing.
+// -------------------------------------------------------------------------------------------------------
+
+mi_decl_export void mi_cfree(void* p) mi_attr_noexcept;
+mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_good_size(size_t size) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept;
+
+mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
+
+mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+mi_decl_nodiscard mi_decl_export int mi_reallocarr(void* p, size_t count, size_t size) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept;
+mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept;
+
+mi_decl_export void mi_free_size(void* p, size_t size) mi_attr_noexcept;
+mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_export void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept;
+
+// The `mi_new` wrappers implement C++ semantics on out-of-memory instead of directly returning `NULL`:
+// they call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception.
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(1, 2);
+mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3);
+
+#ifdef __cplusplus
+}
+#endif
+
+// ---------------------------------------------------------------------------------------------
+// Implement the C++ std::allocator interface for use in STL containers.
+// (note: see `mimalloc-new-delete.h` for overriding the new/delete operators globally)
+// ---------------------------------------------------------------------------------------------
+#ifdef __cplusplus
+
+#include <cstddef> // std::size_t
+#include <cstdint> // PTRDIFF_MAX
+#if (__cplusplus >= 201103L) || (_MSC_VER > 1900) // C++11
+#include <type_traits> // std::true_type
+#include <utility> // std::forward
+#endif
+
+template<class T> struct mi_stl_allocator {
+ typedef T value_type;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+ typedef value_type& reference;
+ typedef value_type const& const_reference;
+ typedef value_type* pointer;
+ typedef value_type const* const_pointer;
+ template <class U> struct rebind { typedef mi_stl_allocator<U> other; };
+
+ mi_stl_allocator() mi_attr_noexcept = default;
+ mi_stl_allocator(const mi_stl_allocator&) mi_attr_noexcept = default;
+ template<class U> mi_stl_allocator(const mi_stl_allocator<U>&) mi_attr_noexcept { }
+ mi_stl_allocator select_on_container_copy_construction() const { return *this; }
+ void deallocate(T* p, size_type) { mi_free(p); }
+
+ #if (__cplusplus >= 201703L) // C++17
+ mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_new_n(count, sizeof(T))); }
+ mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
+ #else
+ mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_new_n(count, sizeof(value_type))); }
+ #endif
+
+ #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900)) // C++11
+ using propagate_on_container_copy_assignment = std::true_type;
+ using propagate_on_container_move_assignment = std::true_type;
+ using propagate_on_container_swap = std::true_type;
+ using is_always_equal = std::true_type;
+ template <class U, class ...Args> void construct(U* p, Args&& ...args) { ::new(p) U(std::forward<Args>(args)...); }
+ template <class U> void destroy(U* p) mi_attr_noexcept { p->~U(); }
+ #else
+ void construct(pointer p, value_type const& val) { ::new(p) value_type(val); }
+ void destroy(pointer p) { p->~value_type(); }
+ #endif
+
+ size_type max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); }
+ pointer address(reference x) const { return &x; }
+ const_pointer address(const_reference x) const { return &x; }
+};
+
+template<class T1,class T2> bool operator==(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return true; }
+template<class T1,class T2> bool operator!=(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return false; }
+#endif // __cplusplus
+
+#endif
diff --git a/source/luametatex/source/libraries/mimalloc/readme.md b/source/luametatex/source/libraries/mimalloc/readme.md
new file mode 100644
index 000000000..6142dbc5e
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/readme.md
@@ -0,0 +1,716 @@
+
+<img align="left" width="100" height="100" src="doc/mimalloc-logo.png"/>
+
+[<img align="right" src="https://dev.azure.com/Daan0324/mimalloc/_apis/build/status/microsoft.mimalloc?branchName=dev"/>](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
+
+# mimalloc
+
+&nbsp;
+
+mimalloc (pronounced "me-malloc")
+is a general purpose allocator with excellent [performance](#performance) characteristics.
+Initially developed by Daan Leijen for the run-time systems of the
+[Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
+
+Latest release tag: `v2.0.6` (2022-04-14).
+Latest stable tag: `v1.7.6` (2022-02-14).
+
+mimalloc is a drop-in replacement for `malloc` and can be used in other programs
+without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
+```
+> LD_PRELOAD=/usr/lib/libmimalloc.so myprogram
+```
+It also has an easy way to override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include:
+
+- __small and consistent__: the library is about 8k LOC using simple and
+ consistent data structures. This makes it very suitable
+ to integrate and adapt in other projects. For runtime systems it
+ provides hooks for a monotonic _heartbeat_ and deferred freeing (for
+ bounded worst-case times with reference counting).
+- __free list sharding__: instead of one big free list (per size class) we have
+ many smaller lists per "mimalloc page" which reduces fragmentation and
+ increases locality --
+ things that are allocated close in time get allocated close in memory.
+ (A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system).
+- __free list multi-sharding__: the big idea! Not only do we shard the free list
+ per mimalloc page, but for each page we have multiple free lists. In particular, there
+ is one list for thread-local `free` operations, and another one for concurrent `free`
+ operations. Free-ing from another thread can now be a single CAS without needing
+ sophisticated coordination between threads. Since there will be
+ thousands of separate free lists, contention is naturally distributed over the heap,
+ and the chance of contending on a single location will be low -- this is quite
+ similar to randomized algorithms like skip lists where adding
+ a random oracle removes the need for a more complex algorithm.
+- __eager page reset__: when a "page" becomes empty (with increased chance
+ due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged")
+ reducing (real) memory pressure and fragmentation, especially in long running
+ programs.
+- __secure__: _mimalloc_ can be built in secure mode, adding guard pages,
+ randomized allocation, encrypted free lists, etc. to protect against various
+ heap vulnerabilities. The performance penalty is usually around 10% on average
+ over our benchmarks.
+- __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions.
+ A heap can be destroyed at once instead of deallocating each object separately.
+- __bounded__: it does not suffer from _blowup_ \[1\], has bounded worst-case allocation
+ times (_wcat_), bounded space overhead (~0.2% meta-data, with low internal fragmentation),
+ and has no internal points of contention using only atomic operations.
+- __fast__: In our benchmarks (see [below](#performance)),
+ _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc),
+ and often uses less memory. A nice property
+ is that it does consistently well over a wide range of benchmarks. There is also good huge OS page
+ support for larger server programs.
+
+The [documentation](https://microsoft.github.io/mimalloc) gives a full overview of the API.
+You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results.
+
+Enjoy!
+
+### Branches
+
+* `master`: latest stable release (based on `dev-slice`).
+* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's.
+* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev`.
+
+### Releases
+
+Note: the `v2.x` version has a new algorithm for managing internal mimalloc pages that tends to reduce memory usage
+ and fragmentation compared to mimalloc `v1.x` (especially for large workloads). It should otherwise have similar performance
+ (see [below](#performance)); please report if you observe any significant performance regression.
+
+* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation
+ even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix
+ warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object
+ allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes.
+
+* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on
+ Windows 11, fix compilation with musl, potentially reduced
+ committed memory, add `bin/minject` for Windows,
+ improved wasm support, faster aligned allocation,
+ various small fixes.
+
+* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
+ M1), improved performance for v2 for large objects, Python integration improvements, more standard
+ installation directories, various small fixes.
+
+* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix
+  thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-Windows 8, various small fixes.
+
+* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental).
+
+* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages.
+
+* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
+ improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
+
+* [Older release notes](#older-release-notes)
+
+Special thanks to:
+
+* [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his many contributions, and making
+ mimalloc work better on many less common operating systems, like Haiku, Dragonfly, etc.
+* Mary Feofanova (@mary3000), Evgeniy Moiseenko, and Manuel Pöter (@mpoeter) for making mimalloc TSAN checkable, and finding
+ memory model bugs using the [genMC] model checker.
+* Weipeng Liu (@pongba), Zhuowei Li, Junhua Wang, and Jakub Szymanski, for their early support of mimalloc and deployment
+ at large scale services, leading to many improvements in the mimalloc algorithms for large workloads.
+* Jason Gibson (@jasongibson) for exhaustive testing on large scale workloads and server environments, and finding complex bugs
+ in (early versions of) `mimalloc`.
+* Manuel Pöter (@mpoeter) and Sam Gross (@colesbury) for finding an ABA concurrency issue in abandoned segment reclamation. Sam also created the [no GIL](https://github.com/colesbury/nogil) Python fork which
+ uses mimalloc internally.
+
+
+[genMC]: https://plv.mpi-sws.org/genmc/
+
+### Usage
+
+mimalloc is used in various large scale low-latency services and programs, for example:
+
+<a href="https://www.bing.com"><img height="50" align="left" src="https://upload.wikimedia.org/wikipedia/commons/e/e9/Bing_logo.svg"></a>
+<a href="https://azure.microsoft.com/"><img height="50" align="left" src="https://upload.wikimedia.org/wikipedia/commons/a/a8/Microsoft_Azure_Logo.svg"></a>
+<a href="https://deathstrandingpc.505games.com"><img height="100" src="doc/ds-logo.png"></a>
+<a href="https://docs.unrealengine.com/4.26/en-US/WhatsNew/Builds/ReleaseNotes/4_25/"><img height="100" src="doc/unreal-logo.svg"></a>
+<a href="https://cab.spbu.ru/software/spades/"><img height="100" src="doc/spades-logo.png"></a>
+
+
+# Building
+
+## Windows
+
+Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build.
+The `mimalloc` project builds a static library (in `out/msvc-x64`), while the
+`mimalloc-override` project builds a DLL for overriding malloc
+in the entire program.
+
+## macOS, Linux, BSD, etc.
+
+We use [`cmake`](https://cmake.org)<sup>1</sup> as the build system:
+
+```
+> mkdir -p out/release
+> cd out/release
+> cmake ../..
+> make
+```
+This builds the library as a shared (dynamic)
+library (`.so` or `.dylib`), a static library (`.a`), and
+as a single object file (`.o`).
+
+`> sudo make install` (installs the library and header files in `/usr/local/lib` and `/usr/local/include`)
+
+You can build the debug version which does many internal checks and
+maintains detailed statistics as:
+
+```
+> mkdir -p out/debug
+> cd out/debug
+> cmake -DCMAKE_BUILD_TYPE=Debug ../..
+> make
+```
+This will name the shared library as `libmimalloc-debug.so`.
+
+Finally, you can build a _secure_ version that uses guard pages, encrypted
+free lists, etc., as:
+```
+> mkdir -p out/secure
+> cd out/secure
+> cmake -DMI_SECURE=ON ../..
+> make
+```
+This will name the shared library as `libmimalloc-secure.so`.
+Use `ccmake`<sup>2</sup> instead of `cmake`
+to see and customize all the available build options.
+
+Notes:
+1. Install CMake: `sudo apt-get install cmake`
+2. Install CCMake: `sudo apt-get install cmake-curses-gui`
+
+
+## Single source
+
+You can also directly build the single `src/static.c` file as part of your project without
+needing `cmake` at all. Make sure to also add the mimalloc `include` directory to the include path.
+
+
+# Using the library
+
+The preferred usage is including `<mimalloc.h>`, linking with
+the shared- or static library, and using the `mi_malloc` API exclusively for allocation. For example,
+```
+> gcc -o myprogram -lmimalloc myfile.c
+```
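+For instance, `myfile.c` could look like this (a minimal sketch; error handling
+kept to the bare minimum):
+```
+#include <mimalloc.h>
+#include <stdio.h>
+
+int main(void) {
+  int* xs = (int*)mi_mallocn(10, sizeof(int));   // overflow-checked count*size allocation
+  if (xs == NULL) return 1;
+  for (int i = 0; i < 10; i++) xs[i] = i;
+  printf("usable size: %zu bytes\n", mi_usable_size(xs));
+  mi_free(xs);
+  return 0;
+}
+```
+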
+
+mimalloc uses only safe OS calls (`mmap` and `VirtualAlloc`) and can co-exist
+with other allocators linked to the same program.
+If you use `cmake`, you can simply use:
+```
+find_package(mimalloc 1.4 REQUIRED)
+```
+in your `CMakeLists.txt` to find a locally installed mimalloc. Then use either:
+```
+target_link_libraries(myapp PUBLIC mimalloc)
+```
+to link with the shared (dynamic) library, or:
+```
+target_link_libraries(myapp PUBLIC mimalloc-static)
+```
+to link with the static library. See `test/CMakeLists.txt` for an example.
+
+For best performance in C++ programs, it is also recommended to override the
+global `new` and `delete` operators. For convenience, mimalloc provides
+[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project.
+In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator`
+interface.
+
+You can pass environment variables to print verbose messages (`MIMALLOC_VERBOSE=1`)
+and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version):
+```
+> env MIMALLOC_SHOW_STATS=1 ./cfrac 175451865205073170563711388363
+
+175451865205073170563711388363 = 374456281610909315237213 * 468551
+
+heap stats: peak total freed unit
+normal 2: 16.4 kb 17.5 mb 17.5 mb 16 b ok
+normal 3: 16.3 kb 15.2 mb 15.2 mb 24 b ok
+normal 4: 64 b 4.6 kb 4.6 kb 32 b ok
+normal 5: 80 b 118.4 kb 118.4 kb 40 b ok
+normal 6: 48 b 48 b 48 b 48 b ok
+normal 17: 960 b 960 b 960 b 320 b ok
+
+heap stats: peak total freed unit
+ normal: 33.9 kb 32.8 mb 32.8 mb 1 b ok
+ huge: 0 b 0 b 0 b 1 b ok
+ total: 33.9 kb 32.8 mb 32.8 mb 1 b ok
+malloc requested: 32.8 mb
+
+ committed: 58.2 kb 58.2 kb 58.2 kb 1 b ok
+ reserved: 2.0 mb 2.0 mb 2.0 mb 1 b ok
+ reset: 0 b 0 b 0 b 1 b ok
+ segments: 1 1 1
+-abandoned: 0
+ pages: 6 6 6
+-abandoned: 0
+ mmaps: 3
+ mmap fast: 0
+ mmap slow: 1
+ threads: 0
+ elapsed: 2.022s
+ process: user: 1.781s, system: 0.016s, faults: 756, reclaims: 0, rss: 2.7 mb
+```
+
+The above model of using the `mi_` prefixed API is not always possible
+in existing programs that already use the standard malloc interface;
+another option is to override the standard malloc interface
+completely and redirect all calls to the _mimalloc_ library instead.
+
+## Environment Options
+
+You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html);
+a short sketch follows the list below), or via environment variables:
+- `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
+- `MIMALLOC_VERBOSE=1`: show verbose messages.
+- `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages.
+- `MIMALLOC_PAGE_RESET=0`: by default, mimalloc will reset (or purge) OS pages that are not in use, to signal to the OS
+  that the underlying physical memory can be reused. This can reduce memory fragmentation in long running (server)
+  programs. By setting it to `0` this will no longer be done, which can improve performance for batch-like programs.
+  As an alternative, `MIMALLOC_RESET_DELAY=<msecs>` can be set higher (100ms by default) to make the page
+  reset occur less frequently instead of turning it off completely.
+- `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected
+ at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than
+ the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA
+ nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed).
+- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly
+ improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
+ to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
+ the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that
+ can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible).
+ <!--
+ - `MIMALLOC_EAGER_REGION_COMMIT=1`: on Windows, commit large (256MiB) regions eagerly. These regions
+ show in the working set even though usually just a small part is committed to physical memory. This is why it
+ is turned off by default on Windows, as it does not look good in the task manager. However, turning it on has no
+ real drawbacks and may improve performance a little.
+ -->
+- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB _huge_ OS pages. This reserves the huge pages at
+ startup and sometimes this can give a large (latency) performance improvement on big workloads.
+ Usually it is better to not use
+ `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
+ contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at
+ startup only once).
+ Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]).
+ With huge OS pages, it may be beneficial to set the setting
+ `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB)
+ of a thread to not allocate in the huge OS pages; this prevents threads that are short lived
+ and allocate just a little to take up space in the huge OS page area (which cannot be reset).
+ The huge pages are usually allocated evenly among NUMA nodes.
+ We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all
+ the huge pages at a specific numa node instead.
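+
+Most of these options can also be set from code (a minimal sketch using the
+options API declared in `mimalloc.h`):
+```
+#include <mimalloc.h>
+
+void configure_allocator(void) {
+  mi_option_set(mi_option_reserve_huge_os_pages, 2);   // like MIMALLOC_RESERVE_HUGE_OS_PAGES=2
+  mi_option_set_enabled(mi_option_show_stats, true);   // like MIMALLOC_SHOW_STATS=1
+}
+```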
+
+Use caution when using `fork` in combination with either large or huge OS pages: on a fork, the OS uses copy-on-write
+for all pages in the original process including the huge OS pages. When any memory is now written in that area, the
+OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the memory usage to grow in large increments.
+
+[linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5
+[windows-huge]: https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows?view=sql-server-2017
+
+## Secure Mode
+
+_mimalloc_ can be built in secure mode by using the `-DMI_SECURE=ON` flag in `cmake`. This build enables various mitigations
+to make mimalloc more robust against exploits. In particular:
+
+- All internal mimalloc pages are surrounded by guard pages and the heap metadata is behind a guard page as well (so a buffer overflow
+ exploit cannot reach into the metadata).
+- All free list pointers are
+  [encoded](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396)
+  with per-page keys, which are used both to prevent overwrites with a known pointer and to detect heap corruption.
+- Double frees are detected (and ignored).
+- The free lists are initialized in a random order and allocation randomly chooses between extension and reuse within a page to
+  mitigate against attacks that rely on a predictable allocation order. Similarly, the larger heap blocks allocated by mimalloc
+  from the OS are also address randomized.
+
+As always, evaluate with care as part of an overall security strategy as all of the above are mitigations but not guarantees.
+
+## Debug Mode
+
+When _mimalloc_ is built using debug mode, various checks are done at runtime to catch development errors.
+
+- Statistics are maintained in detail for each object size. They can be shown using `MIMALLOC_SHOW_STATS=1` at runtime.
+- All objects have padding at the end to detect (byte precise) heap block overflows.
+- Double frees and freeing of invalid heap pointers are detected.
+- Corrupted free-lists and some forms of use-after-free are detected.
+
+
+# Overriding Standard Malloc
+
+Overriding the standard `malloc` (and `new`) can be done either _dynamically_ or _statically_.
+
+## Dynamic override
+
+This is the recommended way to override the standard malloc interface.
+
+### Override on Linux, BSD
+
+On these ELF-based systems we preload the mimalloc shared
+library so all calls to the standard `malloc` interface are
+resolved to the _mimalloc_ library.
+```
+> env LD_PRELOAD=/usr/lib/libmimalloc.so myprogram
+```
+
+You can set extra environment variables to check that mimalloc is running,
+like:
+```
+> env MIMALLOC_VERBOSE=1 LD_PRELOAD=/usr/lib/libmimalloc.so myprogram
+```
+or run with the debug version to get detailed statistics:
+```
+> env MIMALLOC_SHOW_STATS=1 LD_PRELOAD=/usr/lib/libmimalloc-debug.so myprogram
+```
+
+### Override on MacOS
+
+On macOS we can also preload the mimalloc shared
+library so all calls to the standard `malloc` interface are
+resolved to the _mimalloc_ library.
+```
+> env DYLD_INSERT_LIBRARIES=/usr/lib/libmimalloc.dylib myprogram
+```
+
+Note that certain security restrictions may apply when doing this from
+the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash).
+
+
+### Override on Windows
+
+<span id="override_on_windows">Overriding on Windows</span> is robust and has the
+particular advantage to be able to redirect all malloc/free calls that go through
+the (dynamic) C runtime allocator, including those from other DLL's or libraries.
+
+The overriding on Windows requires that you link your program explicitly with
+the mimalloc DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
+Also, the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be put
+in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency).
+The redirection DLL ensures that all calls to the C runtime malloc API get redirected to
+mimalloc (in `mimalloc-override.dll`).
+
+To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some
+call to the mimalloc API in the `main` function, like `mi_version()`
+(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project
+for an example on how to use this. For best performance on Windows with C++, it
+is also recommended to override the `new`/`delete` operations (by including
+[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) in a single(!) source file in your project).
+
+The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic
+overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected.
+
+(Note: in principle, it is possible to even patch existing executables without any recompilation
+if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll`
+into the import table (and put `mimalloc-redirect.dll` in the same folder).
+Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388).)
+
+
+## Static override
+
+On Unix-like systems, you can also statically link with _mimalloc_ to override the standard
+malloc interface. The recommended way is to link the final program with the
+_mimalloc_ single object file (`mimalloc-override.o`). We use
+an object file instead of a library file as linkers give preference to
+that over archives to resolve symbols. To ensure that the standard
+malloc interface resolves to the _mimalloc_ library, link it as the first
+object file. For example:
+```
+> gcc -o myprogram mimalloc-override.o myfile1.c ...
+```
+
+Another way to override statically, and one that works on all platforms, is to
+link statically to mimalloc (as shown in the introduction) and include a
+header file in each source file that re-defines `malloc` etc. to `mi_malloc`.
+This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h); see the sketch below. This only works reliably if all sources are
+under your control; otherwise mixing of pointers from different heaps may occur!
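+
+For example (a sketch; the include re-defines the standard names to their `mi_` versions):
+```
+#include <stdlib.h>
+#include <mimalloc-override.h>   // after this, malloc/free map to mi_malloc/mi_free
+
+int main(void) {
+  void* p = malloc(100);   // resolves to mi_malloc(100)
+  free(p);                 // resolves to mi_free(p)
+  return 0;
+}
+```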
+
+
+# Performance
+
+Last update: 2021-01-30
+
+We tested _mimalloc_ against many other top allocators over a wide
+range of benchmarks, ranging from various real world programs to
+synthetic benchmarks that see how the allocator behaves under more
+extreme circumstances. In our benchmark suite, _mimalloc_ outperforms other leading
+allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), and has a similar memory footprint. A nice property is that it
+does consistently well over the wide range of benchmarks.
+
+General memory allocators are interesting as there exists no algorithm that is
+optimal -- for a given allocator one can usually construct a workload
+where it does not do so well. The goal is thus to find an allocation
+strategy that performs well over a wide range of benchmarks without
+suffering from (too much) underperformance in less common situations.
+
+As always, interpret these results with care since some benchmarks test synthetic
+or uncommon situations that may never apply to your workloads. For example, most
+allocators do not do well on `xmalloc-testN` but that includes even the best
+industrial allocators like _jemalloc_ and _tcmalloc_ that are used in some of
+the world's largest systems (like Chrome or FreeBSD).
+
+Also, the benchmarks here do not measure the behaviour on very large and long-running server workloads,
+or worst-case latencies of allocation. Much work has gone into `mimalloc` to work well on such
+workloads (for example, to reduce virtual memory fragmentation on long-running services)
+but such optimizations are not always reflected in the current benchmark suite.
+
+We show here only an overview -- for
+more specific details and further benchmarks we refer to the
+[technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action).
+The benchmark suite is automated and available separately
+as [mimalloc-bench](https://github.com/daanx/mimalloc-bench).
+
+
+## Benchmark Results on a 16-core AMD 5950x (Zen3)
+
+Testing on the 16-core AMD 5950x processor at 3.4GHz (4.9GHz boost),
+with 32GiB memory at 3600MHz, running Ubuntu 20.04 with glibc 2.31 and GCC 9.3.0.
+
+We measure three versions of _mimalloc_: the main version `mi` (tag:v1.7.0),
+the new v2.0 beta version as `xmi` (tag:v2.0.0), and the main version in secure mode as `smi` (tag:v1.7.0).
+
+The other allocators are
+Google's [_tcmalloc_](https://github.com/gperftools/gperftools) (`tc`, tag:gperftools-2.8.1) used in Chrome,
+Facebook's [_jemalloc_](https://github.com/jemalloc/jemalloc) (`je`, tag:5.2.1) by Jason Evans used in Firefox and FreeBSD,
+the Intel thread building blocks [allocator](https://github.com/intel/tbb) (`tbb`, tag:v2020.3),
+[rpmalloc](https://github.com/mjansson/rpmalloc) (`rp`,tag:1.4.1) by Mattias Jansson,
+the original scalable [_Hoard_](https://github.com/emeryberger/Hoard) (git:d880f72) allocator by Emery Berger \[1],
+the memory compacting [_Mesh_](https://github.com/plasma-umass/Mesh) (git:67ff31a) allocator by
+Bobby Powers _et al_ \[8],
+and finally the default system allocator (`glibc`, 2.31) (based on _PtMalloc2_).
+
+<img width="90%" src="doc/bench-2021/bench-amd5950x-2021-01-30-a.svg"/>
+<img width="90%" src="doc/bench-2021/bench-amd5950x-2021-01-30-b.svg"/>
+
+Any benchmarks ending in `N` run on all 32 logical cores in parallel.
+Results are averaged over 10 runs and reported relative
+to mimalloc (where 1.2 means it took 1.2&times; longer to run).
+The legend also contains the _overall relative score_ between the
+allocators where 100 points is the maximum if an allocator is fastest on
+all benchmarks.
+
+The single threaded _cfrac_ benchmark by Dave Barrett is an implementation of
+continued fraction factorization which uses many small short-lived allocations.
+All allocators do well on such common usage, where _mimalloc_ is just a tad
+faster than _tcmalloc_ and
+_jemalloc_.
+
+The _leanN_ program is interesting as a large realistic and
+concurrent workload of the [Lean](https://github.com/leanprover/lean)
+theorem prover compiling its own standard library, and there is a 13%
+speedup over _tcmalloc_. This is
+quite significant: if Lean spends 20% of its time in the
+allocator that means that _mimalloc_ is 1.6&times; faster than _tcmalloc_
+here. (This is surprising as that is not measured in a pure
+allocation benchmark like _alloc-test_. We conjecture that we see this
+outsized improvement here because _mimalloc_ has better locality in
+the allocation which improves performance for the *other* computations
+in a program as well).
+
+The single threaded _redis_ benchmark again shows that most allocators do well on such workloads.
+
+The _larsonN_ server benchmark by Larson and Krishnan \[2] allocates and frees between threads. They observed this
+behavior (which they call _bleeding_) in actual server applications, and the benchmark simulates this.
+Here, _mimalloc_ is quite a bit faster than _tcmalloc_ and _jemalloc_ probably due to the object migration between different threads.
+
+The _mstressN_ workload performs many allocations and re-allocations,
+and migrates objects between threads (as in _larsonN_). However, it also
+creates and destroys the _N_ worker threads a few times keeping some objects
+alive beyond the life time of the allocating thread. We observed this
+behavior in many larger server applications.
+
+The [_rptestN_](https://github.com/mjansson/rpmalloc-benchmark) benchmark
+by Mattias Jansson is an allocator test originally designed
+for _rpmalloc_, and tries to simulate realistic allocation patterns over
+multiple threads. Here the differences between allocators become more apparent.
+
+The second benchmark set tests specific aspects of the allocators and
+shows even more extreme differences between them.
+
+The _alloc-test_, by
+[OLogN Technologies AG](http://ithare.com/testing-memory-allocators-ptmalloc2-tcmalloc-hoard-jemalloc-while-trying-to-simulate-real-world-loads/), is a very allocation intensive benchmark doing millions of
+allocations in various size classes. The test is scaled such that when an
+allocator performs almost identically on _alloc-test1_ and _alloc-testN_ it
+means that it scales linearly.
+
+The _sh6bench_ and _sh8bench_ benchmarks were
+developed by [MicroQuill](http://www.microquill.com/) as part of SmartHeap.
+In _sh6bench_ _mimalloc_ does much
+better than the others (more than 2.5&times; faster than _jemalloc_).
+We cannot explain this well but believe it is
+caused in part by the "reverse" free-ing pattern in _sh6bench_.
+The _sh8bench_ is a variation with object migration
+between threads; whereas _tcmalloc_ did well on _sh6bench_, the addition of object migration causes it to be 10&times; slower than before.
+
+The _xmalloc-testN_ benchmark by Lever and Boreham \[5] and Christian Eder simulates an asymmetric workload where
+some threads only allocate, and others only free -- they observed this pattern in
+larger server applications. Here we see that
+the _mimalloc_ technique of having non-contended sharded thread free
+lists pays off as it outperforms others by a very large margin. Only _rpmalloc_, _tbb_, and _glibc_ also scale well on this benchmark.
+
+The _cache-scratch_ benchmark by Emery Berger \[1] was introduced with
+the Hoard allocator to test for _passive-false_ sharing of cache lines.
+With a single thread all allocators
+perform the same, but when running with multiple threads the potential allocator
+induced false sharing of the cache lines can cause large run-time differences.
+Crundal \[6] describes in detail why the false cache line sharing occurs in the _tcmalloc_ design, and also discusses how this
+can be avoided with some small implementation changes.
+Only the _tbb_, _rpmalloc_ and _mesh_ allocators also avoid the
+cache line sharing completely, while _Hoard_ and _glibc_ seem to mitigate
+the effects. Kukanov and Voss \[7] describe in detail
+how the design of _tbb_ avoids the false cache line sharing.
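+
+As a rough illustration (our own simplified sketch, not the actual
+_cache-scratch_ source, which also frees and re-allocates the objects on the
+worker threads), the pattern under test looks like the following: two small
+objects are allocated on one thread and then written to intensively by two
+different threads. If the allocator placed both objects in the same cache
+line, the cores keep invalidating each other's caches:
+
+```c
+#include <pthread.h>
+#include <stdlib.h>
+
+enum { ITERS = 100000000 };
+
+// Each worker repeatedly writes a single byte of "its" object.
+static void* writer(void* arg) {
+  volatile char* p = (volatile char*)arg;
+  for (long i = 0; i < ITERS; i++) {
+    *p += 1;
+  }
+  return NULL;
+}
+
+int main(void) {
+  // Two small objects allocated back-to-back on the main thread; an allocator
+  // that packs them into one 64-byte cache line induces false sharing below.
+  char* a = malloc(8);
+  char* b = malloc(8);
+  pthread_t t1, t2;
+  pthread_create(&t1, NULL, writer, a);
+  pthread_create(&t2, NULL, writer, b);
+  pthread_join(t1, NULL);
+  pthread_join(t2, NULL);
+  free(a); free(b);
+  return 0;
+}
+```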
+
+
+## On a 36-core Intel Xeon
+
+For completeness, here are the results on a big Amazon
+[c5.18xlarge](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized) instance
+consisting of a 2&times;18-core Intel Xeon (Cascade Lake) at 3.4GHz (boost 3.5GHz)
+with 144GiB ECC memory, running Ubuntu 20.04 with glibc 2.31, GCC 9.3.0, and
+Clang 10.0.0. This time, the mimalloc allocators (mi, xmi, and smi) were
+compiled with the Clang compiler instead of GCC.
+The results are similar to the AMD results, but it is interesting to
+see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks.
+
+<img width="90%" src="doc/bench-2021/bench-c5-18xlarge-2021-01-30-a.svg"/>
+<img width="90%" src="doc/bench-2021/bench-c5-18xlarge-2021-01-30-b.svg"/>
+
+
+## Peak Working Set
+
+The following figure shows the peak working set (rss) of the allocators
+on the benchmarks (on the c5.18xlarge instance).
+
+<img width="90%" src="doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-a.svg"/>
+<img width="90%" src="doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-b.svg"/>
+
+Note that the _xmalloc-testN_ memory usage should be disregarded, as it
+allocates more memory the faster the program runs. Similarly, the memory usage of
+_larsonN_, _mstressN_, _rptestN_, and _sh8bench_ can vary depending on scheduling and
+speed. Nevertheless, we hope to improve the memory usage on _mstressN_
+and _rptestN_ (note that _cfrac_, _larsonN_, and _sh8bench_ have a small working set, which skews the results).
+
+<!--
+# Previous Benchmarks
+
+Todo: should we create a separate page for this?
+
+## Benchmark Results on 36-core Intel: 2020-01-20
+
+Testing on a big Amazon EC2 compute instance
+([c5.18xlarge](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized))
+consisting of a 72 processor Intel Xeon at 3GHz
+with 144GiB ECC memory, running Ubuntu 18.04.1 with glibc 2.27 and GCC 7.4.0.
+The measured allocators are _mimalloc_ (xmi, tag:v1.4.0, page reset enabled)
+and its secure build as _smi_,
+Google's [_tcmalloc_](https://github.com/gperftools/gperftools) (tc, tag:gperftools-2.7) used in Chrome,
+Facebook's [_jemalloc_](https://github.com/jemalloc/jemalloc) (je, tag:5.2.1) by Jason Evans used in Firefox and FreeBSD,
+the Intel thread building blocks [allocator](https://github.com/intel/tbb) (tbb, tag:2020),
+[rpmalloc](https://github.com/mjansson/rpmalloc) (rp,tag:1.4.0) by Mattias Jansson,
+the original scalable [_Hoard_](https://github.com/emeryberger/Hoard) (tag:3.13) allocator by Emery Berger \[1],
+the memory compacting [_Mesh_](https://github.com/plasma-umass/Mesh) (git:51222e7) allocator by
+Bobby Powers _et al_ \[8],
+and finally the default system allocator (glibc, 2.27) (based on _PtMalloc2_).
+
+<img width="90%" src="doc/bench-2020/bench-c5-18xlarge-2020-01-20-a.svg"/>
+<img width="90%" src="doc/bench-2020/bench-c5-18xlarge-2020-01-20-b.svg"/>
+
+The following figure shows the peak working set (rss) of the allocators
+on the benchmarks (on the c5.18xlarge instance).
+
+<img width="90%" src="doc/bench-2020/bench-c5-18xlarge-2020-01-20-rss-a.svg"/>
+<img width="90%" src="doc/bench-2020/bench-c5-18xlarge-2020-01-20-rss-b.svg"/>
+
+
+## On 24-core AMD Epyc, 2020-01-16
+
+For completeness, here are the results on a
+[r5a.12xlarge](https://aws.amazon.com/ec2/instance-types/#Memory_Optimized) instance
+having a 48 processor AMD Epyc 7000 at 2.5GHz with 384GiB of memory.
+The results are similar to the Intel results but it is interesting to
+see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks.
+
+<img width="90%" src="doc/bench-2020/bench-r5a-12xlarge-2020-01-16-a.svg"/>
+<img width="90%" src="doc/bench-2020/bench-r5a-12xlarge-2020-01-16-b.svg"/>
+
+-->
+
+
+# References
+
+- \[1] Emery D. Berger, Kathryn S. McKinley, Robert D. Blumofe, and Paul R. Wilson.
+ _Hoard: A Scalable Memory Allocator for Multithreaded Applications_
+ the Ninth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-IX). Cambridge, MA, November 2000.
+ [pdf](http://www.cs.utexas.edu/users/mckinley/papers/asplos-2000.pdf)
+
+- \[2] P. Larson and M. Krishnan. _Memory allocation for long-running server applications_.
+ In ISMM, Vancouver, B.C., Canada, 1998. [pdf](http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.45.1947&rep=rep1&type=pdf)
+
+- \[3] D. Grunwald, B. Zorn, and R. Henderson.
+ _Improving the cache locality of memory allocation_. In R. Cartwright, editor,
+ Proceedings of the Conference on Programming Language Design and Implementation, pages 177–186, New York, NY, USA, June 1993. [pdf](http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.43.6621&rep=rep1&type=pdf)
+
+- \[4] J. Barnes and P. Hut. _A hierarchical O(n*log(n)) force-calculation algorithm_. Nature, 324:446-449, 1986.
+
+- \[5] C. Lever and D. Boreham. _Malloc() Performance in a Multithreaded Linux Environment._
+ In USENIX Annual Technical Conference, Freenix Session. San Diego, CA. Jun. 2000.
+ Available at <https://github.com/kuszmaul/SuperMalloc/tree/master/tests>
+
+- \[6] Timothy Crundal. _Reducing Active-False Sharing in TCMalloc_. 2016. CS16S1 project at the Australian National University. [pdf](http://courses.cecs.anu.edu.au/courses/CSPROJECTS/16S1/Reports/Timothy_Crundal_Report.pdf)
+
+- \[7] Alexey Kukanov and Michael J. Voss.
+ _The Foundations for Scalable Multi-Core Software in Intel Threading Building Blocks._
+ Intel Technology Journal 11 (4). 2007.
+
+- \[8] Bobby Powers, David Tench, Emery D. Berger, and Andrew McGregor.
+ _Mesh: Compacting Memory Management for C/C++._
+ In Proceedings of the 40th ACM SIGPLAN Conference on Programming Language Design and Implementation (PLDI'19), June 2019, pages 333–346.
+
+<!--
+- \[9] Paul Liétar, Theodore Butler, Sylvan Clebsch, Sophia Drossopoulou, Juliana Franco, Matthew J Parkinson,
+ Alex Shamis, Christoph M Wintersteiger, and David Chisnall.
+ _Snmalloc: A Message Passing Allocator._
+ In Proceedings of the 2019 ACM SIGPLAN International Symposium on Memory Management, 122–135. ACM. 2019.
+-->
+
+# Contributing
+
+This project welcomes contributions and suggestions. Most contributions require you to agree to a
+Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
+the rights to use your contribution. For details, visit https://cla.microsoft.com.
+
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
+a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
+provided by the bot. You will only need to do this once across all repos using our CLA.
+
+
+# Older Release Notes
+
+* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved
+ handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call.
+* 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations,
+ support for IllumOS and Haiku, NUMA support for Vista/XP, improved NUMA detection for AMD Ryzen, ubsan support.
+* 2020-05-05, `v1.6.3`: stable release 1.6: improved behavior in out-of-memory situations, improved malloc zones on macOS,
+ build PIC static libraries by default, add option to abort on out-of-memory, line buffered statistics.
+* 2020-04-20, `v1.6.2`: stable release 1.6: fix compilation on Android, MingW, Raspberry, and Conda,
+ stability fix for Windows 7, fix multiple mimalloc instances in one executable, fix `strnlen` overload,
+ fix aligned debug padding.
+* 2020-02-17, `v1.6.1`: stable release 1.6: minor updates (build with clang-cl, fix alignment issue for small objects).
+* 2020-02-09, `v1.6.0`: stable release 1.6: fixed potential memory leak, improved overriding
+ and thread local support on FreeBSD, NetBSD, DragonFly, and macOS. New byte-precise
+ heap block overflow detection in debug mode (besides the double-free detection and free-list
+ corruption detection). Add `nodiscard` attribute to most allocation functions.
+ Enable `MIMALLOC_PAGE_RESET` by default. New reclamation strategy for abandoned heap pages
+ for better memory footprint.
+* 2020-02-09, `v1.5.0`: stable release 1.5: improved free performance, small bug fixes.
+* 2020-01-22, `v1.4.0`: stable release 1.4: improved performance for delayed OS page reset,
+more eager concurrent free, addition of STL allocator, fixed potential memory leak.
+* 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger
+free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode.
+* 2019-12-22, `v1.2.2`: stable release 1.2: minor updates.
+* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows.
+* 2019-10-07, `v1.1.0`: stable release 1.1.
+* 2019-09-01, `v1.0.8`: pre-release 8: more robust Windows dynamic overriding, initial huge page support.
+* 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements.
+
diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc-aligned.c b/source/luametatex/source/libraries/mimalloc/src/alloc-aligned.c
new file mode 100644
index 000000000..fce0fd749
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/alloc-aligned.c
@@ -0,0 +1,261 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h> // memset
+
+// ------------------------------------------------------
+// Aligned Allocation
+// ------------------------------------------------------
+
+// Fallback primitive aligned allocation -- split out for better codegen
+static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
+{
+ mi_assert_internal(size <= PTRDIFF_MAX);
+ mi_assert_internal(alignment!=0 && _mi_is_power_of_two(alignment) && alignment <= MI_ALIGNMENT_MAX);
+
+ const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
+ const size_t padsize = size + MI_PADDING_SIZE;
+
+ // use regular allocation if it is guaranteed to fit the alignment constraints
+ if (offset==0 && alignment<=padsize && padsize<=MI_MAX_ALIGN_GUARANTEE && (padsize&align_mask)==0) {
+ void* p = _mi_heap_malloc_zero(heap, size, zero);
+ mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0);
+ return p;
+ }
+
+ // otherwise over-allocate
+ void* p = _mi_heap_malloc_zero(heap, size + alignment - 1, zero);
+ if (p == NULL) return NULL;
+
+ // .. and align within the allocation
+ uintptr_t adjust = alignment - (((uintptr_t)p + offset) & align_mask);
+ mi_assert_internal(adjust <= alignment);
+ void* aligned_p = (adjust == alignment ? p : (void*)((uintptr_t)p + adjust));
+ if (aligned_p != p) mi_page_set_has_aligned(_mi_ptr_page(p), true);
+ mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
+ mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p));
+ return aligned_p;
+}
+
+// Primitive aligned allocation
+static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
+{
+ // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size.
+ mi_assert(alignment > 0);
+ if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) { // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
+ #if MI_DEBUG > 0
+ _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment);
+ #endif
+ return NULL;
+ }
+ if (mi_unlikely(alignment > MI_ALIGNMENT_MAX)) { // we cannot align at a boundary larger than this (or otherwise we cannot find segment headers)
+ #if MI_DEBUG > 0
+ _mi_error_message(EOVERFLOW, "aligned allocation has a maximum alignment of %zu (size %zu, alignment %zu)\n", MI_ALIGNMENT_MAX, size, alignment);
+ #endif
+ return NULL;
+ }
+ if (mi_unlikely(size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
+ #if MI_DEBUG > 0
+ _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment);
+ #endif
+ return NULL;
+ }
+ const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
+ const size_t padsize = size + MI_PADDING_SIZE; // note: cannot overflow due to earlier size > PTRDIFF_MAX check
+
+ // try first if there happens to be a small block available with just the right alignment
+ if (mi_likely(padsize <= MI_SMALL_SIZE_MAX)) {
+ mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize);
+ const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0;
+ if (mi_likely(page->free != NULL && is_aligned))
+ {
+ #if MI_STAT>1
+ mi_heap_stat_increase(heap, malloc, size);
+ #endif
+ void* p = _mi_page_malloc(heap, page, padsize); // TODO: inline _mi_page_malloc
+ mi_assert_internal(p != NULL);
+ mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
+ if (zero) { _mi_block_zero_init(page, p, size); }
+ return p;
+ }
+ }
+ // fallback
+ return mi_heap_malloc_zero_aligned_at_fallback(heap, size, alignment, offset, zero);
+}
+
+
+// ------------------------------------------------------
+// Optimized mi_heap_malloc_aligned / mi_malloc_aligned
+// ------------------------------------------------------
+
+mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false);
+}
+
+mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+ #if !MI_PADDING
+ // without padding, any small sized allocation is naturally aligned (see also `_mi_segment_page_start`)
+ if (!_mi_is_power_of_two(alignment)) return NULL;
+ if (mi_likely(_mi_is_power_of_two(size) && size >= alignment && size <= MI_SMALL_SIZE_MAX))
+ #else
+ // with padding, we can only guarantee this for fixed alignments
+ if (mi_likely((alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2)))
+ && size <= MI_SMALL_SIZE_MAX))
+ #endif
+ {
+ // fast path for common alignment and size
+ return mi_heap_malloc_small(heap, size);
+ }
+ else {
+ return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
+ }
+}
+
+// ------------------------------------------------------
+// Aligned Allocation
+// ------------------------------------------------------
+
+mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true);
+}
+
+mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+ return mi_heap_zalloc_aligned_at(heap, size, alignment, 0);
+}
+
+mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+ size_t total;
+ if (mi_count_size_overflow(count, size, &total)) return NULL;
+ return mi_heap_zalloc_aligned_at(heap, total, alignment, offset);
+}
+
+mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+ return mi_heap_calloc_aligned_at(heap,count,size,alignment,0);
+}
+
+mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+}
+
+mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+ return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment);
+}
+
+mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+}
+
+mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+ return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment);
+}
+
+mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset);
+}
+
+mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+ return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment);
+}
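+
+// Example (illustrative): the `_at` variants guarantee alignment at a given
+// offset inside the block rather than at its start, e.g. for a small header
+// followed by an aligned payload:
+//
+//   void* p = mi_malloc_aligned_at(100, 64, 16); // (p + 16) is 64-byte aligned
+//   mi_free(p);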
+
+
+// ------------------------------------------------------
+// Aligned re-allocation
+// ------------------------------------------------------
+
+static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
+ mi_assert(alignment > 0);
+ if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
+ if (p == NULL) return mi_heap_malloc_zero_aligned_at(heap,newsize,alignment,offset,zero);
+ size_t size = mi_usable_size(p);
+ if (newsize <= size && newsize >= (size - (size / 2))
+ && (((uintptr_t)p + offset) % alignment) == 0) {
+ return p; // reallocation still fits, is aligned and not more than 50% waste
+ }
+ else {
+ void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset);
+ if (newp != NULL) {
+ if (zero && newsize > size) {
+ const mi_page_t* page = _mi_ptr_page(newp);
+ if (page->is_zero) {
+ // already zero initialized
+ mi_assert_expensive(mi_mem_is_zero(newp,newsize));
+ }
+ else {
+ // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
+ size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
+ memset((uint8_t*)newp + start, 0, newsize - start);
+ }
+ }
+ _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
+ mi_free(p); // only free if successful
+ }
+ return newp;
+ }
+}
+
+static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept {
+ mi_assert(alignment > 0);
+ if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
+ size_t offset = ((uintptr_t)p % alignment); // use offset of previous allocation (p can be NULL)
+ return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero);
+}
+
+void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false);
+}
+
+void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+ return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false);
+}
+
+void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true);
+}
+
+void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+ return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true);
+}
+
+void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+ size_t total;
+ if (mi_count_size_overflow(newcount, size, &total)) return NULL;
+ return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset);
+}
+
+void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+ size_t total;
+ if (mi_count_size_overflow(newcount, size, &total)) return NULL;
+ return mi_heap_rezalloc_aligned(heap, p, total, alignment);
+}
+
+void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+}
+
+void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+ return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+}
+
+void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+}
+
+void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+ return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+}
+
+void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+ return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset);
+}
+
+void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+ return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment);
+}
+
diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc-override-osx.c b/source/luametatex/source/libraries/mimalloc/src/alloc-override-osx.c
new file mode 100644
index 000000000..41d0a386e
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/alloc-override-osx.c
@@ -0,0 +1,458 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#if defined(MI_MALLOC_OVERRIDE)
+
+#if !defined(__APPLE__)
+#error "this file should only be included on macOS"
+#endif
+
+/* ------------------------------------------------------
+ Override system malloc on macOS
+ This is done through the malloc zone interface.
+ It seems to be most robust in combination with interposing,
+ since otherwise we may get zone errors because there could
+ already be allocations done by the time we take over the
+ zone.
+------------------------------------------------------ */
+
+#include <AvailabilityMacros.h>
+#include <malloc/malloc.h>
+#include <string.h> // memset
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
+// only available from OSX 10.6
+extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import));
+#endif
+
+/* ------------------------------------------------------
+ malloc zone members
+------------------------------------------------------ */
+
+static size_t zone_size(malloc_zone_t* zone, const void* p) {
+ MI_UNUSED(zone);
+ if (!mi_is_in_heap_region(p)){ return 0; } // not our pointer, bail out
+ return mi_usable_size(p);
+}
+
+static void* zone_malloc(malloc_zone_t* zone, size_t size) {
+ MI_UNUSED(zone);
+ return mi_malloc(size);
+}
+
+static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) {
+ MI_UNUSED(zone);
+ return mi_calloc(count, size);
+}
+
+static void* zone_valloc(malloc_zone_t* zone, size_t size) {
+ MI_UNUSED(zone);
+ return mi_malloc_aligned(size, _mi_os_page_size());
+}
+
+static void zone_free(malloc_zone_t* zone, void* p) {
+ MI_UNUSED(zone);
+ mi_cfree(p);
+}
+
+static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {
+ MI_UNUSED(zone);
+ return mi_realloc(p, newsize);
+}
+
+static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) {
+ MI_UNUSED(zone);
+ return mi_malloc_aligned(size,alignment);
+}
+
+static void zone_destroy(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+ // todo: ignore for now?
+}
+
+static unsigned zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, unsigned count) {
+ size_t i;
+ for (i = 0; i < count; i++) {
+ ps[i] = zone_malloc(zone, size);
+ if (ps[i] == NULL) break;
+ }
+ return i;
+}
+
+static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) {
+ for(size_t i = 0; i < count; i++) {
+ zone_free(zone, ps[i]);
+ ps[i] = NULL;
+ }
+}
+
+static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) {
+ MI_UNUSED(zone); MI_UNUSED(size);
+ mi_collect(false);
+ return 0;
+}
+
+static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) {
+ MI_UNUSED(size);
+ zone_free(zone,p);
+}
+
+static boolean_t zone_claimed_address(malloc_zone_t* zone, void* p) {
+ MI_UNUSED(zone);
+ return mi_is_in_heap_region(p);
+}
+
+
+/* ------------------------------------------------------
+ Introspection members
+------------------------------------------------------ */
+
+static kern_return_t intro_enumerator(task_t task, void* p,
+ unsigned type_mask, vm_address_t zone_address,
+ memory_reader_t reader,
+ vm_range_recorder_t recorder)
+{
+ // todo: enumerate all memory
+ MI_UNUSED(task); MI_UNUSED(p); MI_UNUSED(type_mask); MI_UNUSED(zone_address);
+ MI_UNUSED(reader); MI_UNUSED(recorder);
+ return KERN_SUCCESS;
+}
+
+static size_t intro_good_size(malloc_zone_t* zone, size_t size) {
+ MI_UNUSED(zone);
+ return mi_good_size(size);
+}
+
+static boolean_t intro_check(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+ return true;
+}
+
+static void intro_print(malloc_zone_t* zone, boolean_t verbose) {
+ MI_UNUSED(zone); MI_UNUSED(verbose);
+ mi_stats_print(NULL);
+}
+
+static void intro_log(malloc_zone_t* zone, void* p) {
+ MI_UNUSED(zone); MI_UNUSED(p);
+ // todo?
+}
+
+static void intro_force_lock(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+ // todo?
+}
+
+static void intro_force_unlock(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+ // todo?
+}
+
+static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
+ MI_UNUSED(zone);
+ // todo...
+ stats->blocks_in_use = 0;
+ stats->size_in_use = 0;
+ stats->max_size_in_use = 0;
+ stats->size_allocated = 0;
+}
+
+static boolean_t intro_zone_locked(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+ return false;
+}
+
+
+/* ------------------------------------------------------
+ At process start, override the default allocator
+------------------------------------------------------ */
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#endif
+
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wc99-extensions"
+#endif
+
+static malloc_introspection_t mi_introspect = {
+ .enumerator = &intro_enumerator,
+ .good_size = &intro_good_size,
+ .check = &intro_check,
+ .print = &intro_print,
+ .log = &intro_log,
+ .force_lock = &intro_force_lock,
+ .force_unlock = &intro_force_unlock,
+#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
+ .statistics = &intro_statistics,
+ .zone_locked = &intro_zone_locked,
+#endif
+};
+
+static malloc_zone_t mi_malloc_zone = {
+ // note: even with designators, the order is important for C++ compilation
+ //.reserved1 = NULL,
+ //.reserved2 = NULL,
+ .size = &zone_size,
+ .malloc = &zone_malloc,
+ .calloc = &zone_calloc,
+ .valloc = &zone_valloc,
+ .free = &zone_free,
+ .realloc = &zone_realloc,
+ .destroy = &zone_destroy,
+ .zone_name = "mimalloc",
+ .batch_malloc = &zone_batch_malloc,
+ .batch_free = &zone_batch_free,
+ .introspect = &mi_introspect,
+#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
+ #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14)
+ .version = 10,
+ #else
+ .version = 9,
+ #endif
+ // switch to version 9+ on OSX 10.6 to support memalign.
+ .memalign = &zone_memalign,
+ .free_definite_size = &zone_free_definite_size,
+ .pressure_relief = &zone_pressure_relief,
+ #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14)
+ .claimed_address = &zone_claimed_address,
+ #endif
+#else
+ .version = 4,
+#endif
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#if defined(MI_OSX_INTERPOSE) && defined(MI_SHARED_LIB_EXPORT)
+
+// ------------------------------------------------------
+// Override malloc_xxx and malloc_zone_xxx api's to use only
+// our mimalloc zone. Since even the loader uses malloc
+// on macOS, this ensures that all allocations go through
+// mimalloc (as all calls are interposed).
+// The main `malloc`, `free`, etc calls are interposed in `alloc-override.c`,
+// Here, we also override macOS specific API's like
+// `malloc_zone_calloc` etc. see <https://github.com/aosm/libmalloc/blob/master/man/malloc_zone_malloc.3>
+// ------------------------------------------------------
+
+static inline malloc_zone_t* mi_get_default_zone(void)
+{
+ static bool init;
+ if (mi_unlikely(!init)) {
+ init = true;
+ malloc_zone_register(&mi_malloc_zone); // by calling register we avoid a zone error on free (see <http://eatmyrandom.blogspot.com/2010/03/mallocfree-interception-on-mac-os-x.html>)
+ }
+ return &mi_malloc_zone;
+}
+
+mi_decl_externc int malloc_jumpstart(uintptr_t cookie);
+mi_decl_externc void _malloc_fork_prepare(void);
+mi_decl_externc void _malloc_fork_parent(void);
+mi_decl_externc void _malloc_fork_child(void);
+
+
+static malloc_zone_t* mi_malloc_create_zone(vm_size_t size, unsigned flags) {
+ MI_UNUSED(size); MI_UNUSED(flags);
+ return mi_get_default_zone();
+}
+
+static malloc_zone_t* mi_malloc_default_zone (void) {
+ return mi_get_default_zone();
+}
+
+static malloc_zone_t* mi_malloc_default_purgeable_zone(void) {
+ return mi_get_default_zone();
+}
+
+static void mi_malloc_destroy_zone(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+ // nothing.
+}
+
+static kern_return_t mi_malloc_get_all_zones (task_t task, memory_reader_t mr, vm_address_t** addresses, unsigned* count) {
+ MI_UNUSED(task); MI_UNUSED(mr);
+ if (addresses != NULL) *addresses = NULL;
+ if (count != NULL) *count = 0;
+ return KERN_SUCCESS;
+}
+
+static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) {
+ return (zone == NULL ? mi_malloc_zone.zone_name : zone->zone_name);
+}
+
+static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) {
+ MI_UNUSED(zone); MI_UNUSED(name);
+}
+
+static int mi_malloc_jumpstart(uintptr_t cookie) {
+ MI_UNUSED(cookie);
+ return 1; // or 0 for no error?
+}
+
+static void mi__malloc_fork_prepare(void) {
+ // nothing
+}
+static void mi__malloc_fork_parent(void) {
+ // nothing
+}
+static void mi__malloc_fork_child(void) {
+ // nothing
+}
+
+static void mi_malloc_printf(const char* fmt, ...) {
+ MI_UNUSED(fmt);
+}
+
+static bool zone_check(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+ return true;
+}
+
+static malloc_zone_t* zone_from_ptr(const void* p) {
+ MI_UNUSED(p);
+ return mi_get_default_zone();
+}
+
+static void zone_log(malloc_zone_t* zone, void* p) {
+ MI_UNUSED(zone); MI_UNUSED(p);
+}
+
+static void zone_print(malloc_zone_t* zone, bool b) {
+ MI_UNUSED(zone); MI_UNUSED(b);
+}
+
+static void zone_print_ptr_info(void* p) {
+ MI_UNUSED(p);
+}
+
+static void zone_register(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+}
+
+static void zone_unregister(malloc_zone_t* zone) {
+ MI_UNUSED(zone);
+}
+
+// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
+// See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
+struct mi_interpose_s {
+ const void* replacement;
+ const void* target;
+};
+#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
+#define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun)
+#define MI_INTERPOSE_ZONE(fun) MI_INTERPOSE_FUN(malloc_##fun,fun)
+__attribute__((used)) static const struct mi_interpose_s _mi_zone_interposes[] __attribute__((section("__DATA, __interpose"))) =
+{
+
+ MI_INTERPOSE_MI(malloc_create_zone),
+ MI_INTERPOSE_MI(malloc_default_purgeable_zone),
+ MI_INTERPOSE_MI(malloc_default_zone),
+ MI_INTERPOSE_MI(malloc_destroy_zone),
+ MI_INTERPOSE_MI(malloc_get_all_zones),
+ MI_INTERPOSE_MI(malloc_get_zone_name),
+ MI_INTERPOSE_MI(malloc_jumpstart),
+ MI_INTERPOSE_MI(malloc_printf),
+ MI_INTERPOSE_MI(malloc_set_zone_name),
+ MI_INTERPOSE_MI(_malloc_fork_child),
+ MI_INTERPOSE_MI(_malloc_fork_parent),
+ MI_INTERPOSE_MI(_malloc_fork_prepare),
+
+ MI_INTERPOSE_ZONE(zone_batch_free),
+ MI_INTERPOSE_ZONE(zone_batch_malloc),
+ MI_INTERPOSE_ZONE(zone_calloc),
+ MI_INTERPOSE_ZONE(zone_check),
+ MI_INTERPOSE_ZONE(zone_free),
+ MI_INTERPOSE_ZONE(zone_from_ptr),
+ MI_INTERPOSE_ZONE(zone_log),
+ MI_INTERPOSE_ZONE(zone_malloc),
+ MI_INTERPOSE_ZONE(zone_memalign),
+ MI_INTERPOSE_ZONE(zone_print),
+ MI_INTERPOSE_ZONE(zone_print_ptr_info),
+ MI_INTERPOSE_ZONE(zone_realloc),
+ MI_INTERPOSE_ZONE(zone_register),
+ MI_INTERPOSE_ZONE(zone_unregister),
+ MI_INTERPOSE_ZONE(zone_valloc)
+};
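+
+// Example (illustrative): with these interpose entries in place, the override
+// is activated by preloading the shared library, e.g.
+//
+//   env DYLD_INSERT_LIBRARIES=/path/to/libmimalloc.dylib myprogram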
+
+
+#else
+
+// ------------------------------------------------------
+// hook into the zone api's without interposing
+// This is the official way of adding an allocator but
+// it seems less robust than using interpose.
+// ------------------------------------------------------
+
+static inline malloc_zone_t* mi_get_default_zone(void)
+{
+ // The first returned zone is the real default
+ malloc_zone_t** zones = NULL;
+ unsigned count = 0;
+ kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count);
+ if (ret == KERN_SUCCESS && count > 0) {
+ return zones[0];
+ }
+ else {
+ // fallback
+ return malloc_default_zone();
+ }
+}
+
+#if defined(__clang__)
+__attribute__((constructor(0)))
+#else
+__attribute__((constructor)) // seems not supported by g++-11 on the M1
+#endif
+static void _mi_macos_override_malloc() {
+ malloc_zone_t* purgeable_zone = NULL;
+
+ #if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
+ // force the purgeable zone to exist to avoid strange bugs
+ if (malloc_default_purgeable_zone) {
+ purgeable_zone = malloc_default_purgeable_zone();
+ }
+ #endif
+
+ // Register our zone.
+ // thomcc: I think this is still needed to put us in the zone list.
+ malloc_zone_register(&mi_malloc_zone);
+ // Unregister the default zone, this makes our zone the new default
+ // as that was the last registered.
+ malloc_zone_t *default_zone = mi_get_default_zone();
+ // thomcc: Unsure if the next test is *always* false or just false in the
+ // cases I've tried. I'm also unsure if the code inside is needed at all.
+ if (default_zone != &mi_malloc_zone) {
+ malloc_zone_unregister(default_zone);
+
+ // Reregister the default zone so free and realloc in that zone keep working.
+ malloc_zone_register(default_zone);
+ }
+
+ // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs
+ // earlier than the default zone.
+ if (purgeable_zone != NULL) {
+ malloc_zone_unregister(purgeable_zone);
+ malloc_zone_register(purgeable_zone);
+ }
+
+}
+#endif // MI_OSX_INTERPOSE
+
+#endif // MI_MALLOC_OVERRIDE
diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc-override.c b/source/luametatex/source/libraries/mimalloc/src/alloc-override.c
new file mode 100644
index 000000000..e29cb4b23
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/alloc-override.c
@@ -0,0 +1,281 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#if !defined(MI_IN_ALLOC_C)
+#error "this file should be included from 'alloc.c' (so aliases can work)"
+#endif
+
+#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL))
+#error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)"
+#endif
+
+#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32))
+
+#if defined(__APPLE__)
+#include <AvailabilityMacros.h>
+mi_decl_externc void vfree(void* p);
+mi_decl_externc size_t malloc_size(const void* p);
+mi_decl_externc size_t malloc_good_size(size_t size);
+#endif
+
+// helper definition for C override of C++ new
+typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
+
+// ------------------------------------------------------
+// Override system malloc
+// ------------------------------------------------------
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) && !defined(MI_VALGRIND)
+ // gcc, clang: use aliasing to alias the exported function to one of our `mi_` functions
+ #if (defined(__GNUC__) && __GNUC__ >= 9)
+ #pragma GCC diagnostic ignored "-Wattributes" // or we get warnings that nodiscard is ignored on a forward
+ #define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default"), copy(fun)));
+ #else
+ #define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default")));
+ #endif
+ #define MI_FORWARD1(fun,x) MI_FORWARD(fun)
+ #define MI_FORWARD2(fun,x,y) MI_FORWARD(fun)
+ #define MI_FORWARD3(fun,x,y,z) MI_FORWARD(fun)
+ #define MI_FORWARD0(fun,x) MI_FORWARD(fun)
+ #define MI_FORWARD02(fun,x,y) MI_FORWARD(fun)
+#else
+ // otherwise use forwarding by calling our `mi_` function
+ #define MI_FORWARD1(fun,x) { return fun(x); }
+ #define MI_FORWARD2(fun,x,y) { return fun(x,y); }
+ #define MI_FORWARD3(fun,x,y,z) { return fun(x,y,z); }
+ #define MI_FORWARD0(fun,x) { fun(x); }
+ #define MI_FORWARD02(fun,x,y) { fun(x,y); }
+#endif
+
+#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE)
+ // define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for
+ // functions that are interposed (or the interposing does not work)
+ #define MI_OSX_IS_INTERPOSED
+
+ // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
+ // See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
+ struct mi_interpose_s {
+ const void* replacement;
+ const void* target;
+ };
+ #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
+ #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun)
+
+ __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) =
+ {
+ MI_INTERPOSE_MI(malloc),
+ MI_INTERPOSE_MI(calloc),
+ MI_INTERPOSE_MI(realloc),
+ MI_INTERPOSE_MI(strdup),
+ MI_INTERPOSE_MI(strndup),
+ MI_INTERPOSE_MI(realpath),
+ MI_INTERPOSE_MI(posix_memalign),
+ MI_INTERPOSE_MI(reallocf),
+ MI_INTERPOSE_MI(valloc),
+ MI_INTERPOSE_MI(malloc_size),
+ MI_INTERPOSE_MI(malloc_good_size),
+ #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
+ MI_INTERPOSE_MI(aligned_alloc),
+ #endif
+ #ifdef MI_OSX_ZONE
+ // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely
+ MI_INTERPOSE_MI(free),
+ MI_INTERPOSE_FUN(vfree,mi_free),
+ #else
+ // sometimes code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
+ MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us
+ MI_INTERPOSE_FUN(vfree,mi_cfree),
+ #endif
+ };
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+ void _ZdlPv(void* p); // delete
+ void _ZdaPv(void* p); // delete[]
+ void _ZdlPvm(void* p, size_t n); // delete
+ void _ZdaPvm(void* p, size_t n); // delete[]
+ void* _Znwm(size_t n); // new
+ void* _Znam(size_t n); // new[]
+ void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new nothrow
+ void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new[] nothrow
+ #ifdef __cplusplus
+ }
+ #endif
+ __attribute__((used)) static struct mi_interpose_s _mi_cxx_interposes[] __attribute__((section("__DATA, __interpose"))) =
+ {
+ MI_INTERPOSE_FUN(_ZdlPv,mi_free),
+ MI_INTERPOSE_FUN(_ZdaPv,mi_free),
+ MI_INTERPOSE_FUN(_ZdlPvm,mi_free_size),
+ MI_INTERPOSE_FUN(_ZdaPvm,mi_free_size),
+ MI_INTERPOSE_FUN(_Znwm,mi_new),
+ MI_INTERPOSE_FUN(_Znam,mi_new),
+ MI_INTERPOSE_FUN(_ZnwmRKSt9nothrow_t,mi_new_nothrow),
+ MI_INTERPOSE_FUN(_ZnamRKSt9nothrow_t,mi_new_nothrow),
+ };
+
+#elif defined(_MSC_VER)
+ // cannot override malloc unless using a dll.
+ // we just override new/delete which does work in a static library.
+#else
+ // On all other systems forward to our API
+ void* malloc(size_t size) MI_FORWARD1(mi_malloc, size)
+ void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n)
+ void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize)
+ void free(void* p) MI_FORWARD0(mi_free, p)
+#endif
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__)
+#pragma GCC visibility push(default)
+#endif
+
+// ------------------------------------------------------
+// Override new/delete
+// This is not really necessary as they usually call
+// malloc/free anyway, but it improves performance.
+// ------------------------------------------------------
+#ifdef __cplusplus
+ // ------------------------------------------------------
+ // With a C++ compiler we override the new/delete operators.
+ // see <https://en.cppreference.com/w/cpp/memory/new/operator_new>
+ // ------------------------------------------------------
+ #include <new>
+
+ #ifndef MI_OSX_IS_INTERPOSED
+ void operator delete(void* p) noexcept MI_FORWARD0(mi_free,p)
+ void operator delete[](void* p) noexcept MI_FORWARD0(mi_free,p)
+
+ void* operator new(std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n)
+ void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n)
+
+ void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); }
+ void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); }
+
+ #if (__cplusplus >= 201402L || _MSC_VER >= 1916)
+ void operator delete (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
+ void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
+ #endif
+ #endif
+
+ #if (__cplusplus > 201402L && defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5))
+ void operator delete (void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+ void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+ void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
+ void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
+ void operator delete (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+ void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+
+ void* operator new( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
+ void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
+ void* operator new (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
+ void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
+ #endif
+
+#elif (defined(__GNUC__) || defined(__clang__))
+ // ------------------------------------------------------
+ // Override by defining the mangled C++ names of the operators (as
+ // used by GCC and CLang).
+ // See <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling>
+ // ------------------------------------------------------
+
+ void _ZdlPv(void* p) MI_FORWARD0(mi_free,p) // delete
+ void _ZdaPv(void* p) MI_FORWARD0(mi_free,p) // delete[]
+ void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)
+ void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)
+ void _ZdlPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); }
+ void _ZdaPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); }
+ void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
+ void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
+
+ #if (MI_INTPTR_SIZE==8)
+ void* _Znwm(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit
+ void* _Znam(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit
+ void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
+ void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
+ void* _ZnwmSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al)
+ void* _ZnamSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al)
+ void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
+ void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
+ #elif (MI_INTPTR_SIZE==4)
+ void* _Znwj(size_t n) MI_FORWARD1(mi_new,n) // new 32-bit
+ void* _Znaj(size_t n) MI_FORWARD1(mi_new,n) // new[] 32-bit
+ void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
+ void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
+ void* _ZnwjSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al)
+ void* _ZnajSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al)
+ void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
+ void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
+ #else
+ #error "define overloads for new/delete for this platform (just for performance, can be skipped)"
+ #endif
+#endif // __cplusplus
+
+// ------------------------------------------------------
+// Further Posix & Unix functions definitions
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MI_OSX_IS_INTERPOSED
+ // Forward Posix/Unix calls as well
+ void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize)
+ size_t malloc_size(const void* p) MI_FORWARD1(mi_usable_size,p)
+ #if !defined(__ANDROID__) && !defined(__FreeBSD__)
+ size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p)
+ #else
+ size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p)
+ #endif
+
+ // No forwarding here due to aliasing/name mangling issues
+ void* valloc(size_t size) { return mi_valloc(size); }
+ void vfree(void* p) { mi_free(p); }
+ size_t malloc_good_size(size_t size) { return mi_malloc_good_size(size); }
+ int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); }
+
+ // `aligned_alloc` is only available when __USE_ISOC11 is defined.
+ // Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot
+ // override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9.
+ // Fortunately, in the case where `aligned_alloc` is declared as `static inline` it
+ // uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves.
+ #if __USE_ISOC11
+ void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
+ #endif
+#endif
+
+// no forwarding here due to aliasing/name mangling issues
+void cfree(void* p) { mi_free(p); }
+void* pvalloc(size_t size) { return mi_pvalloc(size); }
+void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); }
+int reallocarr(void* p, size_t count, size_t size) { return mi_reallocarr(p, count, size); }
+void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); }
+void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
+
+#if defined(__GLIBC__) && defined(__linux__)
+ // forward __libc interface (needed for glibc-based Linux distributions)
+ void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size)
+ void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size)
+ void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size)
+ void __libc_free(void* p) MI_FORWARD0(mi_free,p)
+ void __libc_cfree(void* p) MI_FORWARD0(mi_free,p)
+
+ void* __libc_valloc(size_t size) { return mi_valloc(size); }
+ void* __libc_pvalloc(size_t size) { return mi_pvalloc(size); }
+ void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment,size); }
+ int __posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p,alignment,size); }
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__)
+#pragma GCC visibility pop
+#endif
+
+#endif // MI_MALLOC_OVERRIDE && !_WIN32
diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc-posix.c b/source/luametatex/source/libraries/mimalloc/src/alloc-posix.c
new file mode 100644
index 000000000..176e7ec30
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/alloc-posix.c
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// ------------------------------------------------------------------------
+// mi prefixed public definitions of various Posix, Unix, and C++ functions
+// for convenience and used when overriding these functions.
+// ------------------------------------------------------------------------
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+// ------------------------------------------------------
+// Posix & Unix functions definitions
+// ------------------------------------------------------
+
+#include <errno.h>
+#include <string.h> // memset
+#include <stdlib.h> // getenv
+
+#ifdef _MSC_VER
+#pragma warning(disable:4996) // getenv _wgetenv
+#endif
+
+#ifndef EINVAL
+#define EINVAL 22
+#endif
+#ifndef ENOMEM
+#define ENOMEM 12
+#endif
+
+
+mi_decl_nodiscard size_t mi_malloc_size(const void* p) mi_attr_noexcept {
+ //if (!mi_is_in_heap_region(p)) return 0;
+ return mi_usable_size(p);
+}
+
+mi_decl_nodiscard size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept {
+ //if (!mi_is_in_heap_region(p)) return 0;
+ return mi_usable_size(p);
+}
+
+mi_decl_nodiscard size_t mi_malloc_good_size(size_t size) mi_attr_noexcept {
+ return mi_good_size(size);
+}
+
+void mi_cfree(void* p) mi_attr_noexcept {
+ if (mi_is_in_heap_region(p)) {
+ mi_free(p);
+ }
+}
+
+int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept {
+ // Note: The spec dictates we should not modify `*p` on an error. (issue#27)
+ // <http://man7.org/linux/man-pages/man3/posix_memalign.3.html>
+ if (p == NULL) return EINVAL;
+ if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment
+ if (alignment==0 || !_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2
+ void* q = mi_malloc_aligned(size, alignment);
+ if (q==NULL && size != 0) return ENOMEM;
+ mi_assert_internal(((uintptr_t)q % alignment) == 0);
+ *p = q;
+ return 0;
+}
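+
+// Example (illustrative):
+//
+//   void* p = NULL;
+//   if (mi_posix_memalign(&p, 64, 256) == 0) { mi_free(p); } // p was 64-byte aligned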
+
+mi_decl_nodiscard mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept {
+ void* p = mi_malloc_aligned(size, alignment);
+ mi_assert_internal(((uintptr_t)p % alignment) == 0);
+ return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept {
+ return mi_memalign( _mi_os_page_size(), size );
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept {
+ size_t psize = _mi_os_page_size();
+ if (size >= SIZE_MAX - psize) return NULL; // overflow
+ size_t asize = _mi_align_up(size, psize);
+ return mi_malloc_aligned(asize, psize);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept {
+ if (mi_unlikely((size&(alignment-1)) != 0)) { // C11 requires alignment>0 && integral multiple, see <https://en.cppreference.com/w/c/memory/aligned_alloc>
+ #if MI_DEBUG > 0
+ _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment);
+ #endif
+ return NULL;
+ }
+ // C11 also requires alignment to be a power-of-two which is checked in mi_malloc_aligned
+ void* p = mi_malloc_aligned(size, alignment);
+ mi_assert_internal(((uintptr_t)p % alignment) == 0);
+ return p;
+}
+
+mi_decl_nodiscard void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD
+ void* newp = mi_reallocn(p,count,size);
+ if (newp==NULL) { errno = ENOMEM; }
+ return newp;
+}
+
+mi_decl_nodiscard int mi_reallocarr( void* p, size_t count, size_t size ) mi_attr_noexcept { // NetBSD
+ mi_assert(p != NULL);
+ if (p == NULL) {
+ errno = EINVAL;
+ return EINVAL;
+ }
+ void** op = (void**)p;
+ void* newp = mi_reallocarray(*op, count, size);
+ if (mi_unlikely(newp == NULL)) return errno;
+ *op = newp;
+ return 0;
+}
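+
+// Example (illustrative): unlike realloc, reallocarr updates the given pointer
+// in place and returns an error code instead of a new pointer:
+//
+//   void* buf = NULL;
+//   if (mi_reallocarr(&buf, 16, sizeof(int)) == 0) { /* buf holds 16 ints */ }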
+
+void* mi__expand(void* p, size_t newsize) mi_attr_noexcept { // Microsoft
+ void* res = mi_expand(p, newsize);
+ if (res == NULL) { errno = ENOMEM; }
+ return res;
+}
+
+mi_decl_nodiscard mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept {
+ if (s==NULL) return NULL;
+ size_t len;
+ for(len = 0; s[len] != 0; len++) { }
+ size_t size = (len+1)*sizeof(unsigned short);
+ unsigned short* p = (unsigned short*)mi_malloc(size);
+ if (p != NULL) {
+ _mi_memcpy(p,s,size);
+ }
+ return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept {
+ return (unsigned char*)mi_strdup((const char*)s);
+}
+
+int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept {
+ if (buf==NULL || name==NULL) return EINVAL;
+ if (size != NULL) *size = 0;
+ char* p = getenv(name); // msvc warning 4996
+ if (p==NULL) {
+ *buf = NULL;
+ }
+ else {
+ *buf = mi_strdup(p);
+ if (*buf==NULL) return ENOMEM;
+ if (size != NULL) *size = strlen(p);
+ }
+ return 0;
+}
+
+int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept {
+ if (buf==NULL || name==NULL) return EINVAL;
+ if (size != NULL) *size = 0;
+#if !defined(_WIN32) || (defined(WINAPI_FAMILY) && (WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP))
+ // not supported
+ *buf = NULL;
+ return EINVAL;
+#else
+ unsigned short* p = (unsigned short*)_wgetenv((const wchar_t*)name); // msvc warning 4996
+ if (p==NULL) {
+ *buf = NULL;
+ }
+ else {
+ *buf = mi_wcsdup(p);
+ if (*buf==NULL) return ENOMEM;
+ if (size != NULL) *size = wcslen((const wchar_t*)p);
+ }
+ return 0;
+#endif
+}
+
+mi_decl_nodiscard void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { // Microsoft
+ return mi_recalloc_aligned_at(p, newcount, size, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { // Microsoft
+ return mi_recalloc_aligned(p, newcount, size, alignment);
+}
diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc.c b/source/luametatex/source/libraries/mimalloc/src/alloc.c
new file mode 100644
index 000000000..1a36b5da8
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/alloc.c
@@ -0,0 +1,934 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE // for realpath() on Linux
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h> // memset, strlen
+#include <stdlib.h> // malloc, exit
+
+#define MI_IN_ALLOC_C
+#include "alloc-override.c"
+#undef MI_IN_ALLOC_C
+
+// ------------------------------------------------------
+// Allocation
+// ------------------------------------------------------
+
+// Fast allocation in a page: just pop from the free list.
+// Fall back to generic allocation only if the list is empty.
+extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept {
+ mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
+ mi_block_t* const block = page->free;
+ if (mi_unlikely(block == NULL)) {
+ return _mi_malloc_generic(heap, size);
+ }
+ mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
+ // pop from the free list
+ page->used++;
+ page->free = mi_block_next(page, block);
+ mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
+
+#if (MI_DEBUG>0)
+ if (!page->is_zero) { memset(block, MI_DEBUG_UNINIT, size); }
+#elif (MI_SECURE!=0)
+ block->next = 0; // don't leak internal data
+#endif
+
+#if (MI_STAT>0)
+ const size_t bsize = mi_page_usable_block_size(page);
+ if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
+ mi_heap_stat_increase(heap, normal, bsize);
+ mi_heap_stat_counter_increase(heap, normal_count, 1);
+#if (MI_STAT>1)
+ const size_t bin = _mi_bin(bsize);
+ mi_heap_stat_increase(heap, normal_bins[bin], 1);
+#endif
+ }
+#endif
+
+#if (MI_PADDING > 0) && defined(MI_ENCODE_FREELIST)
+ mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page));
+ ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
+ mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta));
+ padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys));
+ padding->delta = (uint32_t)(delta);
+ uint8_t* fill = (uint8_t*)padding - delta;
+ const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
+ for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
+#endif
+
+ return block;
+}
+
+// allocate a small block
+extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+ mi_assert(heap!=NULL);
+ mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
+ mi_assert(size <= MI_SMALL_SIZE_MAX);
+ #if (MI_PADDING)
+ if (size == 0) {
+ size = sizeof(void*);
+ }
+ #endif
+ mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE);
+ void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE);
+ mi_assert_internal(p==NULL || mi_usable_size(p) >= size);
+ #if MI_STAT>1
+ if (p != NULL) {
+ if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
+ mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+ }
+ #endif
+ return p;
+}
+
+extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept {
+ return mi_heap_malloc_small(mi_get_default_heap(), size);
+}
+
+// The main allocation function
+extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+ if (mi_likely(size <= MI_SMALL_SIZE_MAX)) {
+ return mi_heap_malloc_small(heap, size);
+ }
+ else {
+ mi_assert(heap!=NULL);
+ mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
+ void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE); // note: size can overflow but it is detected in malloc_generic
+ mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
+ #if MI_STAT>1
+ if (p != NULL) {
+ if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
+ mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+ }
+ #endif
+ return p;
+ }
+}
+
+extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
+ return mi_heap_malloc(mi_get_default_heap(), size);
+}
+
+
+void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) {
+ // note: we need to initialize the whole usable block size to zero, not just the requested size,
+ // or the recalloc/rezalloc functions cannot safely expand in place (see issue #63)
+ MI_UNUSED(size);
+ mi_assert_internal(p != NULL);
+ mi_assert_internal(mi_usable_size(p) >= size); // size can be zero
+ mi_assert_internal(_mi_ptr_page(p)==page);
+ if (page->is_zero && size > sizeof(mi_block_t)) {
+ // already zero initialized memory
+ ((mi_block_t*)p)->next = 0; // clear the free list pointer
+ mi_assert_expensive(mi_mem_is_zero(p, mi_usable_size(p)));
+ }
+ else {
+ // otherwise memset
+ memset(p, 0, mi_usable_size(p));
+ }
+}
+
+// zero initialized small block
+mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept {
+ void* p = mi_malloc_small(size);
+ if (p != NULL) {
+ _mi_block_zero_init(_mi_ptr_page(p), p, size); // todo: can we avoid getting the page again?
+ }
+ return p;
+}
+
+void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
+ void* p = mi_heap_malloc(heap,size);
+ if (zero && p != NULL) {
+ _mi_block_zero_init(_mi_ptr_page(p),p,size); // todo: can we avoid getting the page again?
+ }
+ return p;
+}
+
+extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+ return _mi_heap_malloc_zero(heap, size, true);
+}
+
+mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept {
+ return mi_heap_zalloc(mi_get_default_heap(),size);
+}
+
+
+// ------------------------------------------------------
+// Check for double free in secure and debug mode
+// This is somewhat expensive, so it is only enabled for secure mode 4
+// ------------------------------------------------------
+
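+// Illustrative sketch (editorial, not part of the upstream sources): with
+// MI_SECURE>=4 (or in debug mode) the erroneous sequence
+//
+//   void* p = mi_malloc(32);
+//   mi_free(p);
+//   mi_free(p);  // caught: "double free detected of block ..."
+//
+// is detected by walking the page free lists in `mi_check_is_double_free`.
+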
+#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0))
+// linear check if the free list contains a specific element
+static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) {
+ while (list != NULL) {
+ if (elem==list) return true;
+ list = mi_block_next(page, list);
+ }
+ return false;
+}
+
+static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) {
+ // The decoded value is in the same page (or NULL).
+ // Walk the free lists to check whether it has in fact already been freed.
+ if (mi_list_contains(page, page->free, block) ||
+ mi_list_contains(page, page->local_free, block) ||
+ mi_list_contains(page, mi_page_thread_free(page), block))
+ {
+ _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page));
+ return true;
+ }
+ return false;
+}
+
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+ mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field
+ if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer?
+ (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL?
+ {
+ // Suspicious: the decoded value in the block is in the same page (or NULL) -- maybe a double free?
+ // (continue in separate function to improve code generation)
+ return mi_check_is_double_freex(page, block);
+ }
+ return false;
+}
+#else
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+ MI_UNUSED(page);
+ MI_UNUSED(block);
+ return false;
+}
+#endif
+
+// ---------------------------------------------------------------------------
+// Check for heap block overflow by setting up padding at the end of the block
+// ---------------------------------------------------------------------------
+
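+// Layout sketch (editorial, for exposition): a block of usable size `bsize`
+// carries a trailing `mi_padding_t`:
+//
+//   [ requested bytes ... | fill (MI_DEBUG_PADDING) | canary | delta ]
+//   ^ block                ^ block + bsize - delta   ^ block + bsize
+//
+// `delta` is the distance from the end of the requested size to the padding
+// struct; a write past the requested size corrupts the fill bytes or the
+// canary, which `mi_verify_padding` reports below.
+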
+#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST)
+static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) {
+ *bsize = mi_page_usable_block_size(page);
+ const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize);
+ *delta = padding->delta;
+ return ((uint32_t)mi_ptr_encode(page,block,page->keys) == padding->canary && *delta <= *bsize);
+}
+
+// Return the exact usable size of a block.
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+ size_t bsize;
+ size_t delta;
+ bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+ mi_assert_internal(ok); mi_assert_internal(delta <= bsize);
+ return (ok ? bsize - delta : 0);
+}
+
+static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) {
+ size_t bsize;
+ size_t delta;
+ bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+ *size = *wrong = bsize;
+ if (!ok) return false;
+ mi_assert_internal(bsize >= delta);
+ *size = bsize - delta;
+ uint8_t* fill = (uint8_t*)block + bsize - delta;
+ const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes
+ for (size_t i = 0; i < maxpad; i++) {
+ if (fill[i] != MI_DEBUG_PADDING) {
+ *wrong = bsize - delta + i;
+ return false;
+ }
+ }
+ return true;
+}
+
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+ size_t size;
+ size_t wrong;
+ if (!mi_verify_padding(page,block,&size,&wrong)) {
+ _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong );
+ }
+}
+
+// When a non-thread-local block is freed, it becomes part of the thread delayed free
+// list that is freed later by the owning heap. If the exact usable size is too small to
+// contain the pointer for the delayed list, then shrink the padding (by decreasing delta)
+// so that it does not later trigger an overflow error in `mi_free_block`.
+static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+ size_t bsize;
+ size_t delta;
+ bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+ mi_assert_internal(ok);
+ if (!ok || (bsize - delta) >= min_size) return; // usually already enough space
+ mi_assert_internal(bsize >= min_size);
+ if (bsize < min_size) return; // should never happen
+ size_t new_delta = (bsize - min_size);
+ mi_assert_internal(new_delta < bsize);
+ mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
+ padding->delta = (uint32_t)new_delta;
+}
+#else
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+ MI_UNUSED(page);
+ MI_UNUSED(block);
+}
+
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+ MI_UNUSED(block);
+ return mi_page_usable_block_size(page);
+}
+
+static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+ MI_UNUSED(page);
+ MI_UNUSED(block);
+ MI_UNUSED(min_size);
+}
+#endif
+
+// only maintain stats for smaller objects if requested
+#if (MI_STAT>0)
+static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
+ #if (MI_STAT < 2)
+ MI_UNUSED(block);
+ #endif
+ mi_heap_t* const heap = mi_heap_get_default();
+ const size_t bsize = mi_page_usable_block_size(page);
+ #if (MI_STAT>1)
+ const size_t usize = mi_page_usable_size_of(page, block);
+ mi_heap_stat_decrease(heap, malloc, usize);
+ #endif
+ if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
+ mi_heap_stat_decrease(heap, normal, bsize);
+ #if (MI_STAT > 1)
+ mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
+ #endif
+ }
+ else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+ mi_heap_stat_decrease(heap, large, bsize);
+ }
+ else {
+ mi_heap_stat_decrease(heap, huge, bsize);
+ }
+}
+#else
+static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
+ MI_UNUSED(page); MI_UNUSED(block);
+}
+#endif
+
+#if (MI_STAT>0)
+// maintain stats for huge objects
+static void mi_stat_huge_free(const mi_page_t* page) {
+ mi_heap_t* const heap = mi_heap_get_default();
+ const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc`
+ if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+ mi_heap_stat_decrease(heap, large, bsize);
+ }
+ else {
+ mi_heap_stat_decrease(heap, huge, bsize);
+ }
+}
+#else
+static void mi_stat_huge_free(const mi_page_t* page) {
+ MI_UNUSED(page);
+}
+#endif
+
+// ------------------------------------------------------
+// Free
+// ------------------------------------------------------
+
+// multi-threaded free
+static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block)
+{
+ // The padding check may access the non-thread-owned page for the key values.
+ // That is safe as these are constant and the page won't be freed (as the block is not freed yet).
+ mi_check_padding(page, block);
+ mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
+ #if (MI_DEBUG!=0)
+ memset(block, MI_DEBUG_FREED, mi_usable_size(block));
+ #endif
+
+ // huge page segments are always abandoned and can be freed immediately
+ mi_segment_t* segment = _mi_page_segment(page);
+ if (segment->kind==MI_SEGMENT_HUGE) {
+ mi_stat_huge_free(page);
+ _mi_segment_huge_page_free(segment, page, block);
+ return;
+ }
+
+ // Try to put the block on either the page-local thread free list, or the heap delayed free list.
+ mi_thread_free_t tfreex;
+ bool use_delayed;
+ mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
+ do {
+ use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
+ if (mi_unlikely(use_delayed)) {
+ // unlikely: this only happens on the first concurrent free in a page that is in the full list
+ tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING);
+ }
+ else {
+ // usual: directly add to page thread_free list
+ mi_block_set_next(page, block, mi_tf_block(tfree));
+ tfreex = mi_tf_set_block(tfree,block);
+ }
+ } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+
+ if (mi_unlikely(use_delayed)) {
+ // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
+ mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page);
+ mi_assert_internal(heap != NULL);
+ if (heap != NULL) {
+ // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
+ mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+ do {
+ mi_block_set_nextx(heap,block,dfree, heap->keys);
+ } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
+ }
+
+ // and reset the MI_DELAYED_FREEING flag
+ tfree = mi_atomic_load_relaxed(&page->xthread_free);
+ do {
+ tfreex = tfree;
+ mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
+ tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
+ } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+ }
+}
+
+// regular free
+static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
+{
+ // and push it on the free list
+ if (mi_likely(local)) {
+ // owning thread can free a block directly
+ if (mi_unlikely(mi_check_is_double_free(page, block))) return;
+ mi_check_padding(page, block);
+ #if (MI_DEBUG!=0)
+ memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
+ #endif
+ mi_block_set_next(page, block, page->local_free);
+ page->local_free = block;
+ page->used--;
+ if (mi_unlikely(mi_page_all_free(page))) {
+ _mi_page_retire(page);
+ }
+ else if (mi_unlikely(mi_page_is_in_full(page))) {
+ _mi_page_unfull(page);
+ }
+ }
+ else {
+ _mi_free_block_mt(page,block);
+ }
+}
+
+
+// Adjust a block that was allocated aligned, to the actual start of the block in the page.
+mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) {
+ mi_assert_internal(page!=NULL && p!=NULL);
+ const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL);
+ const size_t adjust = (diff % mi_page_block_size(page));
+ return (mi_block_t*)((uintptr_t)p - adjust);
+}
+
+
+static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) mi_attr_noexcept {
+ mi_page_t* const page = _mi_segment_page_of(segment, p);
+ mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
+ mi_stat_free(page, block);
+ _mi_free_block(page, local, block);
+}
+
+// Get the segment data belonging to a pointer
+// This is just a single `and` in assembly but does further checks in debug mode
+// (and secure mode) if this was a valid pointer.
+static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg)
+{
+ MI_UNUSED(msg);
+#if (MI_DEBUG>0)
+ if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) {
+ _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
+ return NULL;
+ }
+#endif
+
+ mi_segment_t* const segment = _mi_ptr_segment(p);
+ if (mi_unlikely(segment == NULL)) return NULL; // checks also for (p==NULL)
+
+#if (MI_DEBUG>0)
+ if (mi_unlikely(!mi_is_in_heap_region(p))) {
+ _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
+ "(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
+ if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) {
+ _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
+ }
+ }
+#endif
+#if (MI_DEBUG>0 || MI_SECURE>=4)
+ if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) {
+ _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
+ return NULL;
+ }
+#endif
+ return segment;
+}
+
+// Free a block
+void mi_free(void* p) mi_attr_noexcept
+{
+ mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
+ if (mi_unlikely(segment == NULL)) return;
+
+ mi_threadid_t tid = _mi_thread_id();
+ mi_page_t* const page = _mi_segment_page_of(segment, p);
+
+ if (mi_likely(tid == mi_atomic_load_relaxed(&segment->thread_id) && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks
+ // local, and not full or aligned
+ mi_block_t* block = (mi_block_t*)(p);
+ if (mi_unlikely(mi_check_is_double_free(page,block))) return;
+ mi_check_padding(page, block);
+ mi_stat_free(page, block);
+ #if (MI_DEBUG!=0)
+ memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
+ #endif
+ mi_block_set_next(page, block, page->local_free);
+ page->local_free = block;
+ if (mi_unlikely(--page->used == 0)) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page))
+ _mi_page_retire(page);
+ }
+ }
+ else {
+ // non-local, aligned blocks, or a full page; use the more generic path
+ // note: recalc page in generic to improve code generation
+ mi_free_generic(segment, tid == segment->thread_id, p);
+ }
+}
+
+bool _mi_free_delayed_block(mi_block_t* block) {
+ // get segment and page
+ const mi_segment_t* const segment = _mi_ptr_segment(block);
+ mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
+ mi_assert_internal(_mi_thread_id() == segment->thread_id);
+ mi_page_t* const page = _mi_segment_page_of(segment, block);
+
+ // Clear the no-delayed flag so delayed freeing is used again for this page.
+ // This must be done before collecting the free lists on this page -- otherwise
+ // some blocks may end up in the page `thread_free` list with no blocks in the
+ // heap `thread_delayed_free` list, which may cause the page never to be freed!
+ // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
+ _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */);
+
+ // collect all other non-local frees to ensure up-to-date `used` count
+ _mi_page_free_collect(page, false);
+
+ // and free the block (possibly freeing the page as well since used is updated)
+ _mi_free_block(page, true, block);
+ return true;
+}
+
+// Bytes available in a block
+mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept {
+ const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p);
+ const size_t size = mi_page_usable_size_of(page, block);
+ const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block;
+ mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
+ return (size - adjust);
+}
+
+static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
+ const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg);
+ if (segment==NULL) return 0; // also returns 0 if `p == NULL`
+ const mi_page_t* const page = _mi_segment_page_of(segment, p);
+ if (mi_likely(!mi_page_has_aligned(page))) {
+ const mi_block_t* block = (const mi_block_t*)p;
+ return mi_page_usable_size_of(page, block);
+ }
+ else {
+ // split out to separate routine for improved code generation
+ return mi_page_usable_aligned_size_of(segment, page, p);
+ }
+}
+
+size_t mi_usable_size(const void* p) mi_attr_noexcept {
+ return _mi_usable_size(p, "mi_usable_size");
+}
+
+
+// ------------------------------------------------------
+// ensure explicit external inline definitions are emitted!
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+void* _mi_externs[] = {
+ (void*)&_mi_page_malloc,
+ (void*)&mi_malloc,
+ (void*)&mi_malloc_small,
+ (void*)&mi_zalloc_small,
+ (void*)&mi_heap_malloc,
+ (void*)&mi_heap_zalloc,
+ (void*)&mi_heap_malloc_small
+};
+#endif
+
+
+// ------------------------------------------------------
+// Allocation extensions
+// ------------------------------------------------------
+
+void mi_free_size(void* p, size_t size) mi_attr_noexcept {
+ MI_UNUSED_RELEASE(size);
+ mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size"));
+ mi_free(p);
+}
+
+void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept {
+ MI_UNUSED_RELEASE(alignment);
+ mi_assert(((uintptr_t)p % alignment) == 0);
+ mi_free_size(p,size);
+}
+
+void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
+ MI_UNUSED_RELEASE(alignment);
+ mi_assert(((uintptr_t)p % alignment) == 0);
+ mi_free(p);
+}
+
+extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+ size_t total;
+ if (mi_count_size_overflow(count,size,&total)) return NULL;
+ return mi_heap_zalloc(heap,total);
+}
+
+mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
+ return mi_heap_calloc(mi_get_default_heap(),count,size);
+}
+
+// Uninitialized `calloc`
+extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+ size_t total;
+ if (mi_count_size_overflow(count, size, &total)) return NULL;
+ return mi_heap_malloc(heap, total);
+}
+
+mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
+ return mi_heap_mallocn(mi_get_default_heap(),count,size);
+}
+
+// Expand (or shrink) in place (or fail)
+void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
+ #if MI_PADDING
+ // we do not shrink/expand with padding enabled
+ MI_UNUSED(p); MI_UNUSED(newsize);
+ return NULL;
+ #else
+ if (p == NULL) return NULL;
+ const size_t size = _mi_usable_size(p,"mi_expand");
+ if (newsize > size) return NULL;
+ return p; // it fits
+ #endif
+}
+
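+// Usage sketch (editorial, not part of the upstream sources): `mi_expand`
+// never moves or reallocates; it returns `p` only when `newsize` still fits
+// the block's usable size, and NULL otherwise (and always NULL when compiled
+// with MI_PADDING):
+//
+//   char* p = (char*)mi_malloc(100);
+//   void* ok = mi_expand(p, 80);     // == p when padding is disabled
+//   void* no = mi_expand(p, 100000); // NULL; p stays valid and must still be freed
+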
+void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept {
+ const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL
+ if (mi_unlikely(newsize <= size && newsize >= (size / 2))) {
+ // todo: adjust potential padding to reflect the new size?
+ return p; // reallocation still fits and not more than 50% waste
+ }
+ void* newp = mi_heap_malloc(heap,newsize);
+ if (mi_likely(newp != NULL)) {
+ if (zero && newsize > size) {
+ // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
+ const size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
+ memset((uint8_t*)newp + start, 0, newsize - start);
+ }
+ if (mi_likely(p != NULL)) {
+ _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
+ mi_free(p); // only free the original pointer if successful
+ }
+ }
+ return newp;
+}
+
+void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+ return _mi_heap_realloc_zero(heap, p, newsize, false);
+}
+
+void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+ size_t total;
+ if (mi_count_size_overflow(count, size, &total)) return NULL;
+ return mi_heap_realloc(heap, p, total);
+}
+
+
+// Reallocate but free `p` on errors
+void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+ void* newp = mi_heap_realloc(heap, p, newsize);
+ if (newp==NULL && p!=NULL) mi_free(p);
+ return newp;
+}
+
+void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+ return _mi_heap_realloc_zero(heap, p, newsize, true);
+}
+
+void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+ size_t total;
+ if (mi_count_size_overflow(count, size, &total)) return NULL;
+ return mi_heap_rezalloc(heap, p, total);
+}
+
+
+void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
+ return mi_heap_realloc(mi_get_default_heap(),p,newsize);
+}
+
+void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
+ return mi_heap_reallocn(mi_get_default_heap(),p,count,size);
+}
+
+// Reallocate but free `p` on errors
+void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
+ return mi_heap_reallocf(mi_get_default_heap(),p,newsize);
+}
+
+void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
+ return mi_heap_rezalloc(mi_get_default_heap(), p, newsize);
+}
+
+void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
+ return mi_heap_recalloc(mi_get_default_heap(), p, count, size);
+}
+
+
+
+// ------------------------------------------------------
+// strdup, strndup, and realpath
+// ------------------------------------------------------
+
+// `strdup` using mi_malloc
+mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
+ if (s == NULL) return NULL;
+ size_t n = strlen(s);
+ char* t = (char*)mi_heap_malloc(heap,n+1);
+ if (t != NULL) _mi_memcpy(t, s, n + 1);
+ return t;
+}
+
+mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept {
+ return mi_heap_strdup(mi_get_default_heap(), s);
+}
+
+// `strndup` using mi_malloc
+mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
+ if (s == NULL) return NULL;
+ const char* end = (const char*)memchr(s, 0, n); // find end of string in the first `n` characters (returns NULL if not found)
+ const size_t m = (end != NULL ? (size_t)(end - s) : n); // `m` is the minimum of `n` or the end-of-string
+ mi_assert_internal(m <= n);
+ char* t = (char*)mi_heap_malloc(heap, m+1);
+ if (t == NULL) return NULL;
+ _mi_memcpy(t, s, m);
+ t[m] = 0;
+ return t;
+}
+
+mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
+ return mi_heap_strndup(mi_get_default_heap(),s,n);
+}
+
+#ifndef __wasi__
+// `realpath` using mi_malloc
+#ifdef _WIN32
+#ifndef PATH_MAX
+#define PATH_MAX MAX_PATH
+#endif
+#include <windows.h>
+mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+ // todo: use GetFullPathNameW to allow longer file names
+ char buf[PATH_MAX];
+ DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? buf : resolved_name), NULL);
+ if (res == 0) {
+ errno = GetLastError(); return NULL;
+ }
+ else if (res > PATH_MAX) {
+ errno = EINVAL; return NULL;
+ }
+ else if (resolved_name != NULL) {
+ return resolved_name;
+ }
+ else {
+ return mi_heap_strndup(heap, buf, PATH_MAX);
+ }
+}
+#else
+#include <unistd.h> // pathconf
+static size_t mi_path_max(void) {
+ static size_t path_max = 0;
+ if (path_max <= 0) {
+ long m = pathconf("/",_PC_PATH_MAX);
+ if (m <= 0) path_max = 4096; // guess
+ else if (m < 256) path_max = 256; // at least 256
+ else path_max = m;
+ }
+ return path_max;
+}
+
+char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+ if (resolved_name != NULL) {
+ return realpath(fname,resolved_name);
+ }
+ else {
+ size_t n = mi_path_max();
+ char* buf = (char*)mi_malloc(n+1);
+ if (buf==NULL) return NULL;
+ char* rname = realpath(fname,buf);
+ char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL`
+ mi_free(buf);
+ return result;
+ }
+}
+#endif
+
+mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
+ return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name);
+}
+#endif
+
+/*-------------------------------------------------------
+C++ new and new_aligned
+The standard requires calling into `get_new_handler` and
+throwing the bad_alloc exception on failure. If we compile
+with a C++ compiler we can implement this precisely. If we
+use a C compiler we cannot throw a `bad_alloc` exception
+but call `abort` instead (i.e. it does not return).
+-------------------------------------------------------*/
+
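+// Usage sketch (editorial, not part of the upstream sources):
+//
+//   void* p = mi_new(16);          // on exhaustion: invokes the new handler, then throws std::bad_alloc (C++)
+//   void* q = mi_new_nothrow(16);  // returns NULL on failure instead of throwing
+//   mi_free(p); mi_free(q);
+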
+#ifdef __cplusplus
+#include <new>
+static bool mi_try_new_handler(bool nothrow) {
+ #if defined(_MSC_VER) || (__cplusplus >= 201103L)
+ std::new_handler h = std::get_new_handler();
+ #else
+ std::new_handler h = std::set_new_handler(NULL); // pre-C++11: no get_new_handler, so swap the handler out...
+ std::set_new_handler(h); // ...and immediately restore it
+ #endif
+ if (h==NULL) {
+ _mi_error_message(ENOMEM, "out of memory in 'new'");
+ if (!nothrow) {
+ throw std::bad_alloc();
+ }
+ return false;
+ }
+ else {
+ h();
+ return true;
+ }
+}
+#else
+typedef void (*std_new_handler_t)(void);
+
+#if (defined(__GNUC__) || defined(__clang__))
+std_new_handler_t __attribute((weak)) _ZSt15get_new_handlerv(void) {
+ return NULL;
+}
+static std_new_handler_t mi_get_new_handler(void) {
+ return _ZSt15get_new_handlerv();
+}
+#else
+// note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`.
+static std_new_handler_t mi_get_new_handler() {
+ return NULL;
+}
+#endif
+
+static bool mi_try_new_handler(bool nothrow) {
+ std_new_handler_t h = mi_get_new_handler();
+ if (h==NULL) {
+ _mi_error_message(ENOMEM, "out of memory in 'new'");
+ if (!nothrow) {
+ abort(); // cannot throw in plain C, use abort
+ }
+ return false;
+ }
+ else {
+ h();
+ return true;
+ }
+}
+#endif
+
+static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow ) {
+ void* p = NULL;
+ while(p == NULL && mi_try_new_handler(nothrow)) {
+ p = mi_malloc(size);
+ }
+ return p;
+}
+
+mi_decl_restrict void* mi_new(size_t size) {
+ void* p = mi_malloc(size);
+ if (mi_unlikely(p == NULL)) return mi_try_new(size,false);
+ return p;
+}
+
+mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept {
+ void* p = mi_malloc(size);
+ if (mi_unlikely(p == NULL)) return mi_try_new(size, true);
+ return p;
+}
+
+mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) {
+ void* p;
+ do {
+ p = mi_malloc_aligned(size, alignment);
+ }
+ while(p == NULL && mi_try_new_handler(false));
+ return p;
+}
+
+mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept {
+ void* p;
+ do {
+ p = mi_malloc_aligned(size, alignment);
+ }
+ while(p == NULL && mi_try_new_handler(true));
+ return p;
+}
+
+mi_decl_restrict void* mi_new_n(size_t count, size_t size) {
+ size_t total;
+ if (mi_unlikely(mi_count_size_overflow(count, size, &total))) {
+ mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
+ return NULL;
+ }
+ else {
+ return mi_new(total);
+ }
+}
+
+void* mi_new_realloc(void* p, size_t newsize) {
+ void* q;
+ do {
+ q = mi_realloc(p, newsize);
+ } while (q == NULL && mi_try_new_handler(false));
+ return q;
+}
+
+void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
+ size_t total;
+ if (mi_unlikely(mi_count_size_overflow(newcount, size, &total))) {
+ mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
+ return NULL;
+ }
+ else {
+ return mi_new_realloc(p, total);
+ }
+}
diff --git a/source/luametatex/source/libraries/mimalloc/src/arena.c b/source/luametatex/source/libraries/mimalloc/src/arena.c
new file mode 100644
index 000000000..6b1e951f3
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/arena.c
@@ -0,0 +1,446 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+"Arenas" are fixed areas of OS memory from which we can allocate
+large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB).
+In contrast to the rest of mimalloc, the arenas are shared between
+threads and need to be accessed using atomic operations.
+
+Currently arenas are only used for huge OS page (1GiB) reservations,
+or direct OS memory reservations -- otherwise allocation delegates directly to the OS.
+In the future, we can expose an API to manually add more kinds of arenas,
+which is sometimes needed for embedded devices or shared memory, for example.
+(We can also employ this with WASI or `sbrk` systems to reserve large arenas
+ on demand and be able to reuse them efficiently).
+
+The arena allocation needs to be thread safe, and we use an atomic bitmap to allocate.
+-----------------------------------------------------------------------------*/
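+
+// Usage sketch (editorial, not part of the upstream sources): arenas are
+// filled through the reservation APIs defined below; e.g. a program could
+// reserve memory up front with
+//
+//   mi_reserve_os_memory((size_t)1024*1024*1024, true /* commit */, true /* allow large pages */);
+//   mi_reserve_huge_os_pages_interleave(4, 0, 4*500); // 4 x 1GiB huge pages, ~500ms timeout per page
+//
+// after which subsequent large allocations are served from the arena bitmap.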
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h> // memset
+#include <errno.h> // ENOMEM
+
+#include "bitmap.h" // atomic bitmap
+
+
+// os.c
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* stats);
+void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
+
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize);
+void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats);
+
+bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+
+
+/* -----------------------------------------------------------
+ Arena allocation
+----------------------------------------------------------- */
+
+
+// Block info: bit 0 contains the `in_use` bit, the upper bits the
+// size in count of arena blocks.
+typedef uintptr_t mi_block_info_t;
+#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 8MiB (must be at least MI_SEGMENT_ALIGN)
+#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 4MiB
+#define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid)
+
+// A memory arena descriptor
+typedef struct mi_arena_s {
+ _Atomic(uint8_t*) start; // the start of the memory area
+ size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
+ size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
+ int numa_node; // associated NUMA node
+ bool is_zero_init; // is the arena zero initialized?
+ bool allow_decommit; // is decommit allowed? if true, is_large should be false and blocks_committed != NULL
+ bool is_large; // large- or huge OS pages (always committed)
+ _Atomic(size_t) search_idx; // optimization to start the search for free blocks
+ mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
+ mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted)
+ mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`)
+} mi_arena_t;
+
+
+// The available arenas
+static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
+static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0
+
+
+/* -----------------------------------------------------------
+ Arena allocations get a memory id where the lower 8 bits are
+ the arena index +1, and the upper bits the block index.
+----------------------------------------------------------- */
+
+// Use `0` as a special id for direct OS allocated memory.
+#define MI_MEMID_OS 0
+
+static size_t mi_arena_id_create(size_t arena_index, mi_bitmap_index_t bitmap_index) {
+ mi_assert_internal(arena_index < 0xFE);
+ mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
+ return ((bitmap_index << 8) | ((arena_index+1) & 0xFF));
+}
+
+static void mi_arena_id_indices(size_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) {
+ mi_assert_internal(memid != MI_MEMID_OS);
+ *arena_index = (memid & 0xFF) - 1;
+ *bitmap_index = (memid >> 8);
+}
+
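+// Round-trip sketch (editorial, for exposition): id `0` is reserved for OS
+// memory, so arena indices are stored off by one:
+//
+//   size_t memid = mi_arena_id_create(2, bitmap_idx);     // low byte == 3
+//   mi_arena_id_indices(memid, &arena_idx, &bitmap_idx);  // arena_idx == 2 again
+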
+static size_t mi_block_count_of_size(size_t size) {
+ return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
+}
+
+/* -----------------------------------------------------------
+ Thread safe allocation in an arena
+----------------------------------------------------------- */
+static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
+{
+ size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter
+ if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) {
+ mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around
+ return true;
+ }
+ return false;
+}
+
+
+/* -----------------------------------------------------------
+ Arena Allocation
+----------------------------------------------------------- */
+
+static mi_decl_noinline void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
+ bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+ mi_bitmap_index_t bitmap_index;
+ if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL;
+
+ // claimed it! set the dirty bits (todo: no need for an atomic op here?)
+ void* p = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
+ *memid = mi_arena_id_create(arena_index, bitmap_index);
+ *is_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
+ *large = arena->is_large;
+ *is_pinned = (arena->is_large || !arena->allow_decommit);
+ if (arena->blocks_committed == NULL) {
+ // always committed
+ *commit = true;
+ }
+ else if (*commit) {
+ // arena not committed as a whole, but commit requested: ensure commit now
+ bool any_uncommitted;
+ _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
+ if (any_uncommitted) {
+ bool commit_zero;
+ _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats);
+ if (commit_zero) *is_zero = true;
+ }
+ }
+ else {
+ // no need to commit, but check if already fully committed
+ *commit = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
+ }
+ return p;
+}
+
+static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+ MI_UNUSED_RELEASE(alignment);
+ mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
+ const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+ const size_t bcount = mi_block_count_of_size(size);
+ if (mi_likely(max_arena == 0)) return NULL;
+ mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
+
+ // try numa affine allocation
+ for (size_t i = 0; i < max_arena; i++) {
+ mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+ if (arena==NULL) break; // end reached
+ if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local?
+ (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
+ {
+ void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, memid, tld);
+ mi_assert_internal((uintptr_t)p % alignment == 0);
+ if (p != NULL) {
+ return p;
+ }
+ }
+ }
+
+ // try from another numa node instead..
+ for (size_t i = 0; i < max_arena; i++) {
+ mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+ if (arena==NULL) break; // end reached
+ if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local!
+ (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
+ {
+ void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, memid, tld);
+ mi_assert_internal((uintptr_t)p % alignment == 0);
+ if (p != NULL) {
+ return p;
+ }
+ }
+ }
+ return NULL;
+}
+
+
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero,
+ size_t* memid, mi_os_tld_t* tld)
+{
+ mi_assert_internal(commit != NULL && is_pinned != NULL && is_zero != NULL && memid != NULL && tld != NULL);
+ mi_assert_internal(size > 0);
+ *memid = MI_MEMID_OS;
+ *is_zero = false;
+ *is_pinned = false;
+
+ bool default_large = false;
+ if (large==NULL) large = &default_large; // ensure `large != NULL`
+ const int numa_node = _mi_os_numa_node(tld); // current numa node
+
+ // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN) {
+ void* p = mi_arena_allocate(numa_node, size, alignment, commit, large, is_pinned, is_zero, memid, tld);
+ if (p != NULL) return p;
+ }
+
+ // finally, fall back to the OS
+ if (mi_option_is_enabled(mi_option_limit_os_alloc)) {
+ errno = ENOMEM;
+ return NULL;
+ }
+ *is_zero = true;
+ *memid = MI_MEMID_OS;
+ void* p = _mi_os_alloc_aligned(size, alignment, *commit, large, tld->stats);
+ if (p != NULL) *is_pinned = *large;
+ return p;
+}
+
+void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+ return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, commit, large, is_pinned, is_zero, memid, tld);
+}
+
+/* -----------------------------------------------------------
+ Arena free
+----------------------------------------------------------- */
+
+void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_os_tld_t* tld) {
+ mi_assert_internal(size > 0 && tld->stats != NULL);
+ if (p==NULL) return;
+ if (size==0) return;
+
+ if (memid == MI_MEMID_OS) {
+ // was a direct OS allocation, pass through
+ _mi_os_free_ex(p, size, all_committed, tld->stats);
+ }
+ else {
+ // allocated in an arena
+ size_t arena_idx;
+ size_t bitmap_idx;
+ mi_arena_id_indices(memid, &arena_idx, &bitmap_idx);
+ mi_assert_internal(arena_idx < MI_MAX_ARENAS);
+ mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]);
+ mi_assert_internal(arena != NULL);
+ const size_t blocks = mi_block_count_of_size(size);
+ // checks
+ if (arena == NULL) {
+ _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+ return;
+ }
+ mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx));
+ if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) {
+ _mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+ return;
+ }
+ // potentially decommit
+ if (!arena->allow_decommit || arena->blocks_committed == NULL) {
+ mi_assert_internal(all_committed); // note: this may not be true as we may "pretend" to not be committed (in segment.c)
+ }
+ else {
+ mi_assert_internal(arena->blocks_committed != NULL);
+ _mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, tld->stats); // ok if this fails
+ _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
+ }
+ // and make it available to others again
+ bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
+ if (!all_inuse) {
+ _mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size);
+ return;
+ }
+ }
+}
+
+/* -----------------------------------------------------------
+ Add an arena.
+----------------------------------------------------------- */
+
+static bool mi_arena_add(mi_arena_t* arena) {
+ mi_assert_internal(arena != NULL);
+ mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
+ mi_assert_internal(arena->block_count > 0);
+
+ size_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
+ if (i >= MI_MAX_ARENAS) {
+ mi_atomic_decrement_acq_rel(&mi_arena_count);
+ return false;
+ }
+ mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
+ return true;
+}
+
+bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept
+{
+ if (size < MI_ARENA_BLOCK_SIZE) return false;
+
+ if (is_large) {
+ mi_assert_internal(is_committed);
+ is_committed = true;
+ }
+
+ const size_t bcount = size / MI_ARENA_BLOCK_SIZE;
+ const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
+ const size_t bitmaps = (is_committed ? 2 : 3);
+ const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t));
+ mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
+ if (arena == NULL) return false;
+
+ arena->block_count = bcount;
+ arena->field_count = fields;
+ arena->start = (uint8_t*)start;
+ arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
+ arena->is_large = is_large;
+ arena->is_zero_init = is_zero;
+ arena->allow_decommit = !is_large && !is_committed; // only allow decommit for initially uncommitted memory
+ arena->search_idx = 0;
+ arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
+ arena->blocks_committed = (!arena->allow_decommit ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
+ // the bitmaps are already zero initialized due to os_alloc
+ // initialize committed bitmap?
+ if (arena->blocks_committed != NULL && is_committed) {
+ memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
+ }
+ // and claim leftover blocks if needed (so we never allocate there)
+ ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
+ mi_assert_internal(post >= 0);
+ if (post > 0) {
+ // don't use leftover bits at the end
+ mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
+ _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
+ }
+
+ mi_arena_add(arena);
+ return true;
+}
+
+// Reserve a range of regular OS memory
+int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept
+{
+ size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block
+ bool large = allow_large;
+ void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, &large, &_mi_stats_main);
+ if (start==NULL) return ENOMEM;
+ if (!mi_manage_os_memory(start, size, (large || commit), large, true, -1)) {
+ _mi_os_free_ex(start, size, commit, &_mi_stats_main);
+ _mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size,1024));
+ return ENOMEM;
+ }
+ _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : "");
+ return 0;
+}
+
+static size_t mi_debug_show_bitmap(const char* prefix, mi_bitmap_field_t* fields, size_t field_count ) {
+ size_t inuse_count = 0;
+ for (size_t i = 0; i < field_count; i++) {
+ char buf[MI_BITMAP_FIELD_BITS + 1];
+ uintptr_t field = mi_atomic_load_relaxed(&fields[i]);
+ for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++) {
+ bool inuse = ((((uintptr_t)1 << bit) & field) != 0);
+ if (inuse) inuse_count++;
+ buf[MI_BITMAP_FIELD_BITS - 1 - bit] = (inuse ? 'x' : '.');
+ }
+ buf[MI_BITMAP_FIELD_BITS] = 0;
+ _mi_verbose_message("%s%s\n", prefix, buf);
+ }
+ return inuse_count;
+}
+
+void mi_debug_show_arenas(void) mi_attr_noexcept {
+ size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
+ for (size_t i = 0; i < max_arenas; i++) {
+ mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+ if (arena == NULL) break;
+ size_t inuse_count = 0;
+ _mi_verbose_message("arena %zu: %zu blocks with %zu fields\n", i, arena->block_count, arena->field_count);
+ inuse_count += mi_debug_show_bitmap(" ", arena->blocks_inuse, arena->field_count);
+ _mi_verbose_message(" blocks in use ('x'): %zu\n", inuse_count);
+ }
+}
+
+/* -----------------------------------------------------------
+ Reserve a huge page arena.
+----------------------------------------------------------- */
+// reserve at a specific numa node
+int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
+ if (pages==0) return 0;
+ if (numa_node < -1) numa_node = -1;
+ if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
+ size_t hsize = 0;
+ size_t pages_reserved = 0;
+ void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize);
+ if (p==NULL || pages_reserved==0) {
+ _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
+ return ENOMEM;
+ }
+ _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
+
+ if (!mi_manage_os_memory(p, hsize, true, true, true, numa_node)) {
+ _mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
+ return ENOMEM;
+ }
+ return 0;
+}
+
+
+// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected)
+int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
+ if (pages == 0) return 0;
+
+ // pages per numa node
+ size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
+ if (numa_count <= 0) numa_count = 1;
+ const size_t pages_per = pages / numa_count;
+ const size_t pages_mod = pages % numa_count;
+ const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
+
+ // reserve evenly among numa nodes
+ for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
+ size_t node_pages = pages_per; // can be 0
+ if (numa_node < pages_mod) node_pages++;
+ int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
+ if (err) return err;
+ if (pages < node_pages) {
+ pages = 0;
+ }
+ else {
+ pages -= node_pages;
+ }
+ }
+
+ return 0;
+}
+
+int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
+ MI_UNUSED(max_secs);
+ _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
+ if (pages_reserved != NULL) *pages_reserved = 0;
+ int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
+ if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
+ return err;
+}
diff --git a/source/luametatex/source/libraries/mimalloc/src/bitmap.c b/source/luametatex/source/libraries/mimalloc/src/bitmap.c
new file mode 100644
index 000000000..af6de0a12
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/bitmap.c
@@ -0,0 +1,395 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2021 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically,
+represented as an array of fields where each field is a machine word (`size_t`).
+
+There are two APIs; the standard one cannot have sequences that cross
+between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS);
+this is used in region allocation.
+
+The `_across` postfixed functions do allow sequences that cross over
+between the fields. (This is used in arena allocation.)
+---------------------------------------------------------------------------- */
+
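+// Usage sketch (editorial, not part of the upstream sources): claim four
+// contiguous bits within a single field and release them again:
+//
+//   mi_bitmap_index_t idx;
+//   if (_mi_bitmap_try_find_from_claim(bitmap, field_count, 0, 4, &idx)) {
+//     // ... use the claimed range ...
+//     _mi_bitmap_unclaim(bitmap, field_count, 4, idx);
+//   }
+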
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "bitmap.h"
+
+/* -----------------------------------------------------------
+ Bitmap definition
+----------------------------------------------------------- */
+
+// The bit mask for a given number of blocks at a specified bit index.
+static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
+ mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
+ mi_assert_internal(count > 0);
+ if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
+ if (count == 0) return 0;
+ return ((((size_t)1 << count) - 1) << bitidx);
+}
+
+
+/* -----------------------------------------------------------
+ Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
+inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
+{
+ mi_assert_internal(bitmap_idx != NULL);
+ mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
+ mi_assert_internal(count > 0);
+ mi_bitmap_field_t* field = &bitmap[idx];
+ size_t map = mi_atomic_load_relaxed(field);
+ if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
+
+ // search for 0-bit sequence of length count
+ const size_t mask = mi_bitmap_mask_(count, 0);
+ const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
+
+#ifdef MI_HAVE_FAST_BITSCAN
+ size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible
+#else
+ size_t bitidx = 0; // otherwise start at 0
+#endif
+ size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx
+
+ // scan linearly for a free range of zero bits
+ while (bitidx <= bitidx_max) {
+ const size_t mapm = map & m;
+ if (mapm == 0) { // are the mask bits free at bitidx?
+ mi_assert_internal((m >> bitidx) == mask); // no overflow?
+ const size_t newmap = map | m;
+ mi_assert_internal((newmap^map) >> bitidx == mask);
+ if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) { // TODO: use strong cas here?
+ // no success, another thread claimed concurrently.. keep going (with updated `map`)
+ continue;
+ }
+ else {
+ // success, we claimed the bits!
+ *bitmap_idx = mi_bitmap_index_create(idx, bitidx);
+ return true;
+ }
+ }
+ else {
+ // on to the next bit range
+#ifdef MI_HAVE_FAST_BITSCAN
+ const size_t shift = (count == 1 ? 1 : mi_bsr(mapm) - bitidx + 1);
+ mi_assert_internal(shift > 0 && shift <= count);
+#else
+ const size_t shift = 1;
+#endif
+ bitidx += shift;
+ m <<= shift;
+ }
+ }
+ // no bits found
+ return false;
+}
+
+// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+// `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
+bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+ size_t idx = start_field_idx;
+ for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+ if (idx >= bitmap_fields) idx = 0; // wrap
+ if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+ return true;
+ }
+ }
+ return false;
+}
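+
+// An illustrative sketch (not used by the allocator itself; assumes 64-bit
+// fields, and the name `mi_bitmap_usage_sketch` is just for illustration) of
+// the intended call pattern: claim a small sequence, use the resources it
+// stands for, and release the same sequence again.
+#if 0
+static void mi_bitmap_usage_sketch(void) {
+  mi_bitmap_field_t fields[2] = { MI_ATOMIC_VAR_INIT(0), MI_ATOMIC_VAR_INIT(0) };
+  mi_bitmap_index_t idx;
+  if (_mi_bitmap_try_find_from_claim(fields, 2, 0, 4, &idx)) {
+    // the 4 bits at `idx` are now exclusively owned by this thread ...
+    _mi_bitmap_unclaim(fields, 2, 4, idx);  // ... and given back here
+  }
+}
+#endif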
+
+/*
+// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
+bool _mi_bitmap_try_find_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+ return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, 0, count, bitmap_idx);
+}
+*/
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ const size_t idx = mi_bitmap_index_field(bitmap_idx);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ const size_t mask = mi_bitmap_mask_(count, bitidx);
+ mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+ // mi_assert_internal((bitmap[idx] & mask) == mask);
+ size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
+ return ((prev & mask) == mask);
+}
+
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
+ const size_t idx = mi_bitmap_index_field(bitmap_idx);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ const size_t mask = mi_bitmap_mask_(count, bitidx);
+ mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+ //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
+ size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
+ if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
+ return ((prev & mask) == 0);
+}
+
+// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
+static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
+ const size_t idx = mi_bitmap_index_field(bitmap_idx);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ const size_t mask = mi_bitmap_mask_(count, bitidx);
+ mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+ size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
+ if (any_ones != NULL) *any_ones = ((field & mask) != 0);
+ return ((field & mask) == mask);
+}
+
+bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+}
+
+bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ bool any_ones;
+ mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+ return any_ones;
+}
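+
+// For example, directly after a successful `_mi_bitmap_claim` of `count` bits
+// at `bitmap_idx`, `_mi_bitmap_is_claimed` over that same range is `true`,
+// while `_mi_bitmap_is_any_claimed` is already `true` when only one of the
+// `count` bits is set.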
+
+
+//--------------------------------------------------------------------------
+// the `_across` functions work on bitmaps where sequences can cross over
+// between the fields. This is used in arena allocation
+//--------------------------------------------------------------------------
+
+// Try to atomically claim a sequence of `count` bits starting from the field
+// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
+static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx)
+{
+ mi_assert_internal(bitmap_idx != NULL);
+
+  // a run that crosses into the next field must start in the free bits at the
+  // top of this field; count those first
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t map = mi_atomic_load_relaxed(field);
+  const size_t initial = mi_clz(map); // count of zero bits at the top (highest bit indices) of the field
+ mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
+ if (initial == 0) return false;
+ if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields
+ if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
+
+ // scan ahead
+ size_t found = initial;
+ size_t mask = 0; // mask bits for the final field
+ while(found < count) {
+ field++;
+ map = mi_atomic_load_relaxed(field);
+ const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
+ mask = mi_bitmap_mask_(mask_bits, 0);
+ if ((map & mask) != 0) return false;
+ found += mask_bits;
+ }
+ mi_assert_internal(field < &bitmap[bitmap_fields]);
+
+ // found range of zeros up to the final field; mask contains mask in the final field
+ // now claim it atomically
+ mi_bitmap_field_t* const final_field = field;
+ const size_t final_mask = mask;
+ mi_bitmap_field_t* const initial_field = &bitmap[idx];
+ const size_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial);
+
+ // initial field
+ size_t newmap;
+ field = initial_field;
+ map = mi_atomic_load_relaxed(field);
+ do {
+ newmap = map | initial_mask;
+ if ((map & initial_mask) != 0) { goto rollback; };
+ } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+
+ // intermediate fields
+ while (++field < final_field) {
+ newmap = MI_BITMAP_FIELD_FULL;
+ map = 0;
+ if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
+ }
+
+ // final field
+ mi_assert_internal(field == final_field);
+ map = mi_atomic_load_relaxed(field);
+ do {
+ newmap = map | final_mask;
+ if ((map & final_mask) != 0) { goto rollback; }
+ } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+
+ // claimed!
+ *bitmap_idx = mi_bitmap_index_create(idx, MI_BITMAP_FIELD_BITS - initial);
+ return true;
+
+rollback:
+ // roll back intermediate fields
+ while (--field > initial_field) {
+ newmap = 0;
+ map = MI_BITMAP_FIELD_FULL;
+ mi_assert_internal(mi_atomic_load_relaxed(field) == map);
+ mi_atomic_store_release(field, newmap);
+ }
+ if (field == initial_field) {
+ map = mi_atomic_load_relaxed(field);
+ do {
+ mi_assert_internal((map & initial_mask) == initial_mask);
+ newmap = map & ~initial_mask;
+ } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+ }
+ // retry? (we make a recursive call instead of goto to be able to use const declarations)
+ if (retries < 4) {
+ return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx);
+ }
+ else {
+ return false;
+ }
+}
+
+
+// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
+// Starts at the field `start_field_idx` and wraps around to search all `bitmap_fields` fields.
+bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+ mi_assert_internal(count > 0);
+ if (count==1) return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
+ size_t idx = start_field_idx;
+ for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+ if (idx >= bitmap_fields) idx = 0; // wrap
+ // try to claim inside the field
+ if (count <= MI_BITMAP_FIELD_BITS) {
+ if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+ return true;
+ }
+ }
+ // try to claim across fields
+ if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+// Helper for masks across fields; returns the number of full middle fields, `post_mask` may be 0
+static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
+ MI_UNUSED_RELEASE(bitmap_fields);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ if (mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS)) {
+ *pre_mask = mi_bitmap_mask_(count, bitidx);
+ *mid_mask = 0;
+ *post_mask = 0;
+ mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields);
+ return 0;
+ }
+ else {
+ const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx;
+ mi_assert_internal(pre_bits < count);
+ *pre_mask = mi_bitmap_mask_(pre_bits, bitidx);
+ count -= pre_bits;
+ const size_t mid_count = (count / MI_BITMAP_FIELD_BITS);
+ *mid_mask = MI_BITMAP_FIELD_FULL;
+ count %= MI_BITMAP_FIELD_BITS;
+ *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0));
+ mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields);
+ return mid_count;
+ }
+}
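+
+// Worked example (assuming 64-bit fields): for `count == 200` bits starting at
+// `bitidx == 60`, `pre_mask` covers the top 4 bits of the first field,
+// `mid_count == 3` full middle fields are covered by `mid_mask == MI_BITMAP_FIELD_FULL`,
+// and `post_mask` covers the low 4 bits of the final field (4 + 3*64 + 4 == 200).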
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ size_t idx = mi_bitmap_index_field(bitmap_idx);
+ size_t pre_mask;
+ size_t mid_mask;
+ size_t post_mask;
+ size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+ bool all_one = true;
+ mi_bitmap_field_t* field = &bitmap[idx];
+ size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);
+ if ((prev & pre_mask) != pre_mask) all_one = false;
+ while(mid_count-- > 0) {
+ prev = mi_atomic_and_acq_rel(field++, ~mid_mask);
+ if ((prev & mid_mask) != mid_mask) all_one = false;
+ }
+ if (post_mask!=0) {
+ prev = mi_atomic_and_acq_rel(field, ~post_mask);
+ if ((prev & post_mask) != post_mask) all_one = false;
+ }
+ return all_one;
+}
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
+ size_t idx = mi_bitmap_index_field(bitmap_idx);
+ size_t pre_mask;
+ size_t mid_mask;
+ size_t post_mask;
+ size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+ bool all_zero = true;
+ bool any_zero = false;
+ _Atomic(size_t)*field = &bitmap[idx];
+ size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
+ if ((prev & pre_mask) != 0) all_zero = false;
+ if ((prev & pre_mask) != pre_mask) any_zero = true;
+ while (mid_count-- > 0) {
+ prev = mi_atomic_or_acq_rel(field++, mid_mask);
+ if ((prev & mid_mask) != 0) all_zero = false;
+ if ((prev & mid_mask) != mid_mask) any_zero = true;
+ }
+ if (post_mask!=0) {
+ prev = mi_atomic_or_acq_rel(field, post_mask);
+ if ((prev & post_mask) != 0) all_zero = false;
+ if ((prev & post_mask) != post_mask) any_zero = true;
+ }
+ if (pany_zero != NULL) *pany_zero = any_zero;
+ return all_zero;
+}
+
+
+// Returns `true` if all `count` bits were 1.
+// `any_ones` is `true` if there was at least one bit set to one.
+static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
+ size_t idx = mi_bitmap_index_field(bitmap_idx);
+ size_t pre_mask;
+ size_t mid_mask;
+ size_t post_mask;
+ size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+ bool all_ones = true;
+ bool any_ones = false;
+ mi_bitmap_field_t* field = &bitmap[idx];
+ size_t prev = mi_atomic_load_relaxed(field++);
+ if ((prev & pre_mask) != pre_mask) all_ones = false;
+ if ((prev & pre_mask) != 0) any_ones = true;
+ while (mid_count-- > 0) {
+ prev = mi_atomic_load_relaxed(field++);
+ if ((prev & mid_mask) != mid_mask) all_ones = false;
+ if ((prev & mid_mask) != 0) any_ones = true;
+ }
+ if (post_mask!=0) {
+ prev = mi_atomic_load_relaxed(field);
+ if ((prev & post_mask) != post_mask) all_ones = false;
+ if ((prev & post_mask) != 0) any_ones = true;
+ }
+ if (pany_ones != NULL) *pany_ones = any_ones;
+ return all_ones;
+}
+
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+}
+
+bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ bool any_ones;
+ mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+ return any_ones;
+}
diff --git a/source/luametatex/source/libraries/mimalloc/src/bitmap.h b/source/luametatex/source/libraries/mimalloc/src/bitmap.h
new file mode 100644
index 000000000..7bd3106c9
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/bitmap.h
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2020 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically,
+represented as an array of fields where each field is a machine word (`size_t`).
+
+There are two APIs: the standard one does not allow sequences to cross
+bitmap field boundaries (a sequence must be <= MI_BITMAP_FIELD_BITS);
+this is used in region allocation.
+
+The `_across` suffixed functions do allow sequences that cross field
+boundaries; these are used in arena allocation.
+---------------------------------------------------------------------------- */
+#pragma once
+#ifndef MI_BITMAP_H
+#define MI_BITMAP_H
+
+/* -----------------------------------------------------------
+ Bitmap definition
+----------------------------------------------------------- */
+
+#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE)
+#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set
+
+// An atomic bitmap of `size_t` fields
+typedef _Atomic(size_t) mi_bitmap_field_t;
+typedef mi_bitmap_field_t* mi_bitmap_t;
+
+// A bitmap index is the index of the bit in a bitmap.
+typedef size_t mi_bitmap_index_t;
+
+// Create a bit index.
+static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
+ mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
+ return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
+}
+
+// Create a bit index.
+static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) {
+ return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS);
+}
+
+// Get the field index from a bit index.
+static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
+ return (bitmap_idx / MI_BITMAP_FIELD_BITS);
+}
+
+// Get the bit index in a bitmap field
+static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
+ return (bitmap_idx % MI_BITMAP_FIELD_BITS);
+}
+
+// Get the full bit index
+static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
+ return bitmap_idx;
+}
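+
+// For example (assuming 64-bit fields): mi_bitmap_index_create(2, 5) == 133,
+// and the accessors invert it again: mi_bitmap_index_field(133) == 2 and
+// mi_bitmap_index_bit_in_field(133) == 5.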
+
+/* -----------------------------------------------------------
+ Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
+bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Starts at the field `start_field_idx` and wraps around to search all `bitmap_fields` fields.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and the sequence will never cross fields.
+bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
+
+bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+
+//--------------------------------------------------------------------------
+// the `_across` functions work on bitmaps where sequences can cross over
+// between the fields. This is used in arena allocation
+//--------------------------------------------------------------------------
+
+// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
+// Starts at the field `start_field_idx` and wraps around to search all `bitmap_fields` fields.
+bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
+
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+#endif
diff --git a/source/luametatex/source/libraries/mimalloc/src/heap.c b/source/luametatex/source/libraries/mimalloc/src/heap.c
new file mode 100644
index 000000000..816d961ae
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/heap.c
@@ -0,0 +1,580 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h> // memset, memcpy
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920)
+#pragma warning(disable:4204) // non-constant aggregate initializer
+#endif
+
+/* -----------------------------------------------------------
+ Helpers
+----------------------------------------------------------- */
+
+// return `true` if ok, `false` to break
+typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2);
+
+// Visit all pages in a heap; returns `false` if break was called.
+static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2)
+{
+  if (heap==NULL || heap->page_count==0) return false;
+
+ // visit all pages
+ #if MI_DEBUG>1
+ size_t total = heap->page_count;
+ #endif
+ size_t count = 0;
+ for (size_t i = 0; i <= MI_BIN_FULL; i++) {
+ mi_page_queue_t* pq = &heap->pages[i];
+ mi_page_t* page = pq->first;
+ while(page != NULL) {
+ mi_page_t* next = page->next; // save next in case the page gets removed from the queue
+ mi_assert_internal(mi_page_heap(page) == heap);
+ count++;
+ if (!fn(heap, pq, page, arg1, arg2)) return false;
+ page = next; // and continue
+ }
+ }
+ mi_assert_internal(count == total);
+ return true;
+}
+
+
+#if MI_DEBUG>=2
+static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+ MI_UNUSED(arg1);
+ MI_UNUSED(arg2);
+ MI_UNUSED(pq);
+ mi_assert_internal(mi_page_heap(page) == heap);
+ mi_segment_t* segment = _mi_page_segment(page);
+ mi_assert_internal(segment->thread_id == heap->thread_id);
+ mi_assert_expensive(_mi_page_is_valid(page));
+ return true;
+}
+#endif
+#if MI_DEBUG>=3
+static bool mi_heap_is_valid(mi_heap_t* heap) {
+ mi_assert_internal(heap!=NULL);
+ mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL);
+ return true;
+}
+#endif
+
+
+
+
+/* -----------------------------------------------------------
+ "Collect" pages by migrating `local_free` and `thread_free`
+ lists and freeing empty pages. This is done when a thread
+ stops (and in that case abandons pages if there are still
+ blocks alive)
+----------------------------------------------------------- */
+
+typedef enum mi_collect_e {
+ MI_NORMAL,
+ MI_FORCE,
+ MI_ABANDON
+} mi_collect_t;
+
+
+static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) {
+ MI_UNUSED(arg2);
+ MI_UNUSED(heap);
+ mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL));
+ mi_collect_t collect = *((mi_collect_t*)arg_collect);
+ _mi_page_free_collect(page, collect >= MI_FORCE);
+ if (mi_page_all_free(page)) {
+ // no more used blocks, free the page.
+ // note: this will free retired pages as well.
+ _mi_page_free(page, pq, collect >= MI_FORCE);
+ }
+ else if (collect == MI_ABANDON) {
+ // still used blocks but the thread is done; abandon the page
+ _mi_page_abandon(page, pq);
+ }
+ return true; // don't break
+}
+
+static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+ MI_UNUSED(arg1);
+ MI_UNUSED(arg2);
+ MI_UNUSED(heap);
+ MI_UNUSED(pq);
+ _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
+ return true; // don't break
+}
+
+static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
+{
+ if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+
+ const bool force = collect >= MI_FORCE;
+ _mi_deferred_free(heap, force);
+
+ // note: never reclaim on collect but leave it to threads that need storage to reclaim
+ const bool force_main =
+ #ifdef NDEBUG
+ collect == MI_FORCE
+ #else
+ collect >= MI_FORCE
+ #endif
+ && _mi_is_main_thread() && mi_heap_is_backing(heap) && !heap->no_reclaim;
+
+ if (force_main) {
+ // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
+ // if all memory is freed by now, all segments should be freed.
+ _mi_abandoned_reclaim_all(heap, &heap->tld->segments);
+ }
+
+ // if abandoning, mark all pages to no longer add to delayed_free
+ if (collect == MI_ABANDON) {
+ mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
+ }
+
+ // free thread delayed blocks.
+ // (if abandoning, after this there are no more thread-delayed references into the pages.)
+ _mi_heap_delayed_free(heap);
+
+ // collect retired pages
+ _mi_heap_collect_retired(heap, force);
+
+ // collect all pages owned by this thread
+ mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
+ mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
+
+ // collect abandoned segments (in particular, decommit expired parts of segments in the abandoned segment list)
+ // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
+ _mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments);
+
+ // collect segment local caches
+ if (force) {
+ _mi_segment_thread_collect(&heap->tld->segments);
+ }
+
+ // decommit in global segment caches
+ // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
+ _mi_segment_cache_collect( collect == MI_FORCE, &heap->tld->os);
+
+ // collect regions on program-exit (or shared library unload)
+ if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
+ //_mi_mem_collect(&heap->tld->os);
+ }
+}
+
+void _mi_heap_collect_abandon(mi_heap_t* heap) {
+ mi_heap_collect_ex(heap, MI_ABANDON);
+}
+
+void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept {
+ mi_heap_collect_ex(heap, (force ? MI_FORCE : MI_NORMAL));
+}
+
+void mi_collect(bool force) mi_attr_noexcept {
+ mi_heap_collect(mi_get_default_heap(), force);
+}
+
+
+/* -----------------------------------------------------------
+ Heap new
+----------------------------------------------------------- */
+
+mi_heap_t* mi_heap_get_default(void) {
+ mi_thread_init();
+ return mi_get_default_heap();
+}
+
+mi_heap_t* mi_heap_get_backing(void) {
+ mi_heap_t* heap = mi_heap_get_default();
+ mi_assert_internal(heap!=NULL);
+ mi_heap_t* bheap = heap->tld->heap_backing;
+ mi_assert_internal(bheap!=NULL);
+ mi_assert_internal(bheap->thread_id == _mi_thread_id());
+ return bheap;
+}
+
+mi_heap_t* mi_heap_new(void) {
+ mi_heap_t* bheap = mi_heap_get_backing();
+ mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode?
+ if (heap==NULL) return NULL;
+ _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
+ heap->tld = bheap->tld;
+ heap->thread_id = _mi_thread_id();
+ _mi_random_split(&bheap->random, &heap->random);
+ heap->cookie = _mi_heap_random_next(heap) | 1;
+ heap->keys[0] = _mi_heap_random_next(heap);
+ heap->keys[1] = _mi_heap_random_next(heap);
+  heap->no_reclaim = true;  // don't reclaim abandoned pages, otherwise `mi_heap_destroy` would be unsafe
+ // push on the thread local heaps list
+ heap->next = heap->tld->heaps;
+ heap->tld->heaps = heap;
+ return heap;
+}
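+
+// An illustrative sketch (using only the public API; `heap_lifecycle_sketch`
+// is a hypothetical name and the function is not called here) of the intended
+// life cycle of such a first-class heap:
+#if 0
+static void heap_lifecycle_sketch(void) {
+  mi_heap_t* h = mi_heap_new();
+  void* p = mi_heap_malloc(h, 32);  // allocate in this heap
+  mi_free(p);                       // blocks can be freed individually ...
+  mi_heap_delete(h);                // ... remaining blocks migrate to the backing heap
+}
+#endif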
+
+uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
+ return _mi_random_next(&heap->random);
+}
+
+// zero out the page queues
+static void mi_heap_reset_pages(mi_heap_t* heap) {
+ mi_assert_internal(heap != NULL);
+ mi_assert_internal(mi_heap_is_initialized(heap));
+ // TODO: copy full empty heap instead?
+ memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
+#ifdef MI_MEDIUM_DIRECT
+ memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium));
+#endif
+ _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
+ heap->thread_delayed_free = NULL;
+ heap->page_count = 0;
+}
+
+// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
+static void mi_heap_free(mi_heap_t* heap) {
+ mi_assert(heap != NULL);
+ mi_assert_internal(mi_heap_is_initialized(heap));
+ if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+  if (mi_heap_is_backing(heap)) return; // don't free the backing heap
+
+ // reset default
+ if (mi_heap_is_default(heap)) {
+ _mi_heap_set_default_direct(heap->tld->heap_backing);
+ }
+
+ // remove ourselves from the thread local heaps list
+ // linear search but we expect the number of heaps to be relatively small
+ mi_heap_t* prev = NULL;
+ mi_heap_t* curr = heap->tld->heaps;
+ while (curr != heap && curr != NULL) {
+ prev = curr;
+ curr = curr->next;
+ }
+ mi_assert_internal(curr == heap);
+ if (curr == heap) {
+ if (prev != NULL) { prev->next = heap->next; }
+ else { heap->tld->heaps = heap->next; }
+ }
+ mi_assert_internal(heap->tld->heaps != NULL);
+
+ // and free the used memory
+ mi_free(heap);
+}
+
+
+/* -----------------------------------------------------------
+ Heap destroy
+----------------------------------------------------------- */
+
+static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+ MI_UNUSED(arg1);
+ MI_UNUSED(arg2);
+ MI_UNUSED(heap);
+ MI_UNUSED(pq);
+
+ // ensure no more thread_delayed_free will be added
+ _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
+
+ // stats
+ const size_t bsize = mi_page_block_size(page);
+ if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) {
+ if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+ mi_heap_stat_decrease(heap, large, bsize);
+ }
+ else {
+ mi_heap_stat_decrease(heap, huge, bsize);
+ }
+ }
+#if (MI_STAT)
+ _mi_page_free_collect(page, false); // update used count
+ const size_t inuse = page->used;
+ if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+ mi_heap_stat_decrease(heap, normal, bsize * inuse);
+#if (MI_STAT>1)
+ mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse);
+#endif
+ }
+ mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks...
+#endif
+
+  // pretend it is all free now
+ mi_assert_internal(mi_page_thread_free(page) == NULL);
+ page->used = 0;
+
+ // and free the page
+ // mi_page_free(page,false);
+ page->next = NULL;
+ page->prev = NULL;
+ _mi_segment_page_free(page,false /* no force? */, &heap->tld->segments);
+
+ return true; // keep going
+}
+
+void _mi_heap_destroy_pages(mi_heap_t* heap) {
+ mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL);
+ mi_heap_reset_pages(heap);
+}
+
+void mi_heap_destroy(mi_heap_t* heap) {
+ mi_assert(heap != NULL);
+ mi_assert(mi_heap_is_initialized(heap));
+ mi_assert(heap->no_reclaim);
+ mi_assert_expensive(mi_heap_is_valid(heap));
+ if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+ if (!heap->no_reclaim) {
+ // don't free in case it may contain reclaimed pages
+ mi_heap_delete(heap);
+ }
+ else {
+ // free all pages
+ _mi_heap_destroy_pages(heap);
+ mi_heap_free(heap);
+ }
+}
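+
+// Note the contrast with `mi_heap_delete` below: `mi_heap_destroy` frees all
+// pages of the heap outright, so it is only safe when no blocks allocated in
+// the heap are referenced anymore (e.g. an arena-style heap that served a
+// single pass of work).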
+
+
+
+/* -----------------------------------------------------------
+ Safe Heap delete
+----------------------------------------------------------- */
+
+// Transfer the pages from one heap to the other
+static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
+ mi_assert_internal(heap!=NULL);
+ if (from==NULL || from->page_count == 0) return;
+
+ // reduce the size of the delayed frees
+ _mi_heap_delayed_free(from);
+
+ // transfer all pages by appending the queues; this will set a new heap field
+ // so threads may do delayed frees in either heap for a while.
+ // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state
+ // so after this only the new heap will get delayed frees
+ for (size_t i = 0; i <= MI_BIN_FULL; i++) {
+ mi_page_queue_t* pq = &heap->pages[i];
+ mi_page_queue_t* append = &from->pages[i];
+ size_t pcount = _mi_page_queue_append(heap, pq, append);
+ heap->page_count += pcount;
+ from->page_count -= pcount;
+ }
+ mi_assert_internal(from->page_count == 0);
+
+ // and do outstanding delayed frees in the `from` heap
+  // note: be careful here as the `heap` field in all those pages no longer points to `from`;
+  // this turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls
+  // the regular `_mi_free_delayed_block`, which is safe.
+ _mi_heap_delayed_free(from);
+ #if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL);
+ #endif
+
+ // and reset the `from` heap
+ mi_heap_reset_pages(from);
+}
+
+// Safely delete a heap without freeing any still-allocated blocks in that heap.
+void mi_heap_delete(mi_heap_t* heap)
+{
+ mi_assert(heap != NULL);
+ mi_assert(mi_heap_is_initialized(heap));
+ mi_assert_expensive(mi_heap_is_valid(heap));
+ if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+
+ if (!mi_heap_is_backing(heap)) {
+    // transfer still-used pages to the backing heap
+ mi_heap_absorb(heap->tld->heap_backing, heap);
+ }
+ else {
+ // the backing heap abandons its pages
+ _mi_heap_collect_abandon(heap);
+ }
+ mi_assert_internal(heap->page_count==0);
+ mi_heap_free(heap);
+}
+
+mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
+ mi_assert(heap != NULL);
+ mi_assert(mi_heap_is_initialized(heap));
+ if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL;
+ mi_assert_expensive(mi_heap_is_valid(heap));
+ mi_heap_t* old = mi_get_default_heap();
+ _mi_heap_set_default_direct(heap);
+ return old;
+}
+
+
+
+
+/* -----------------------------------------------------------
+ Analysis
+----------------------------------------------------------- */
+
+// static since it is not thread safe to access heaps from other threads.
+static mi_heap_t* mi_heap_of_block(const void* p) {
+ if (p == NULL) return NULL;
+ mi_segment_t* segment = _mi_ptr_segment(p);
+ bool valid = (_mi_ptr_cookie(segment) == segment->cookie);
+ mi_assert_internal(valid);
+ if (mi_unlikely(!valid)) return NULL;
+ return mi_page_heap(_mi_segment_page_of(segment,p));
+}
+
+bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
+ mi_assert(heap != NULL);
+ if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
+ return (heap == mi_heap_of_block(p));
+}
+
+
+static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) {
+ MI_UNUSED(heap);
+ MI_UNUSED(pq);
+ bool* found = (bool*)vfound;
+ mi_segment_t* segment = _mi_page_segment(page);
+ void* start = _mi_page_start(segment, page, NULL);
+ void* end = (uint8_t*)start + (page->capacity * mi_page_block_size(page));
+ *found = (p >= start && p < end);
+ return (!*found); // continue if not found
+}
+
+bool mi_heap_check_owned(mi_heap_t* heap, const void* p) {
+ mi_assert(heap != NULL);
+ if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
+ if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false; // only aligned pointers
+ bool found = false;
+ mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found);
+ return found;
+}
+
+bool mi_check_owned(const void* p) {
+ return mi_heap_check_owned(mi_get_default_heap(), p);
+}
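+
+// For example, `mi_check_owned(p)` is `true` only for an aligned pointer that
+// lies inside a page of the current default heap; unaligned pointers, or
+// blocks owned by another heap, yield `false`.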
+
+/* -----------------------------------------------------------
+ Visit all heap blocks and areas
+ Todo: enable visiting abandoned pages, and
+ enable visiting all blocks of all heaps across threads
+----------------------------------------------------------- */
+
+// Separate struct to keep `mi_page_t` out of the public interface
+typedef struct mi_heap_area_ex_s {
+ mi_heap_area_t area;
+ mi_page_t* page;
+} mi_heap_area_ex_t;
+
+static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) {
+ mi_assert(xarea != NULL);
+ if (xarea==NULL) return true;
+ const mi_heap_area_t* area = &xarea->area;
+ mi_page_t* page = xarea->page;
+ mi_assert(page != NULL);
+ if (page == NULL) return true;
+
+ _mi_page_free_collect(page,true);
+ mi_assert_internal(page->local_free == NULL);
+ if (page->used == 0) return true;
+
+ const size_t bsize = mi_page_block_size(page);
+ const size_t ubsize = mi_page_usable_block_size(page); // without padding
+ size_t psize;
+ uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize);
+
+ if (page->capacity == 1) {
+ // optimize page with one block
+ mi_assert_internal(page->used == 1 && page->free == NULL);
+ return visitor(mi_page_heap(page), area, pstart, ubsize, arg);
+ }
+
+ // create a bitmap of free blocks.
+ #define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*))
+ uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)];
+ memset(free_map, 0, sizeof(free_map));
+
+ size_t free_count = 0;
+ for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
+ free_count++;
+ mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
+ size_t offset = (uint8_t*)block - pstart;
+ mi_assert_internal(offset % bsize == 0);
+ size_t blockidx = offset / bsize; // Todo: avoid division?
+ mi_assert_internal( blockidx < MI_MAX_BLOCKS);
+ size_t bitidx = (blockidx / sizeof(uintptr_t));
+ size_t bit = blockidx - (bitidx * sizeof(uintptr_t));
+ free_map[bitidx] |= ((uintptr_t)1 << bit);
+ }
+ mi_assert_internal(page->capacity == (free_count + page->used));
+
+ // walk through all blocks skipping the free ones
+ size_t used_count = 0;
+ for (size_t i = 0; i < page->capacity; i++) {
+ size_t bitidx = (i / sizeof(uintptr_t));
+ size_t bit = i - (bitidx * sizeof(uintptr_t));
+ uintptr_t m = free_map[bitidx];
+ if (bit == 0 && m == UINTPTR_MAX) {
+ i += (sizeof(uintptr_t) - 1); // skip a run of free blocks
+ }
+ else if ((m & ((uintptr_t)1 << bit)) == 0) {
+ used_count++;
+ uint8_t* block = pstart + (i * bsize);
+ if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false;
+ }
+ }
+ mi_assert_internal(page->used == used_count);
+ return true;
+}
+
+typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg);
+
+
+static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) {
+ MI_UNUSED(heap);
+ MI_UNUSED(pq);
+ mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun;
+ mi_heap_area_ex_t xarea;
+ const size_t bsize = mi_page_block_size(page);
+ const size_t ubsize = mi_page_usable_block_size(page);
+ xarea.page = page;
+ xarea.area.reserved = page->reserved * bsize;
+ xarea.area.committed = page->capacity * bsize;
+ xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL);
+ xarea.area.used = page->used * bsize;
+ xarea.area.block_size = ubsize;
+ xarea.area.full_block_size = bsize;
+ return fun(heap, &xarea, arg);
+}
+
+// Visit all heap pages as areas
+static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) {
+ if (visitor == NULL) return false;
+ return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{
+}
+
+// Just to pass arguments
+typedef struct mi_visit_blocks_args_s {
+ bool visit_blocks;
+ mi_block_visit_fun* visitor;
+ void* arg;
+} mi_visit_blocks_args_t;
+
+static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* xarea, void* arg) {
+ mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg;
+ if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false;
+ if (args->visit_blocks) {
+ return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg);
+ }
+ else {
+ return true;
+ }
+}
+
+// Visit all blocks in a heap
+bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
+ mi_visit_blocks_args_t args = { visit_blocks, visitor, arg };
+ return mi_heap_visit_areas(heap, &mi_heap_area_visitor, &args);
+}
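+
+// An illustrative visitor (a sketch using only the public API; the names
+// `count_block` and `count_live_blocks` are just for illustration): count the
+// live blocks in a heap. The visitor is called once per area with
+// `block == NULL`, and then, when `visit_blocks` is true, once per live block.
+#if 0
+static bool count_block(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) {
+  MI_UNUSED(heap); MI_UNUSED(area); MI_UNUSED(block_size);
+  if (block != NULL) { (*(size_t*)arg)++; }  // skip the per-area calls
+  return true;  // keep visiting
+}
+
+static size_t count_live_blocks(mi_heap_t* heap) {
+  size_t count = 0;
+  mi_heap_visit_blocks(heap, true /* visit blocks too */, &count_block, &count);
+  return count;
+}
+#endif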
diff --git a/source/luametatex/source/libraries/mimalloc/src/init.c b/source/luametatex/source/libraries/mimalloc/src/init.c
new file mode 100644
index 000000000..19124afef
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/init.c
@@ -0,0 +1,693 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h> // memcpy, memset
+#include <stdlib.h> // atexit
+
+// Empty page used to initialize the small free pages array
+const mi_page_t _mi_page_empty = {
+ 0, false, false, false, false,
+ 0, // capacity
+ 0, // reserved capacity
+ { 0 }, // flags
+ false, // is_zero
+ 0, // retire_expire
+ NULL, // free
+ #if MI_ENCODE_FREELIST
+ { 0, 0 },
+ #endif
+ 0, // used
+ 0, // xblock_size
+ NULL, // local_free
+ MI_ATOMIC_VAR_INIT(0), // xthread_free
+ MI_ATOMIC_VAR_INIT(0), // xheap
+ NULL, NULL
+ #if MI_INTPTR_SIZE==8
+ , { 0 } // padding
+ #endif
+};
+
+#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
+
+#if (MI_PADDING>0) && (MI_INTPTR_SIZE >= 8)
+#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
+#elif (MI_PADDING>0)
+#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
+#else
+#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() }
+#endif
+
+
+// Empty page queues for every bin
+#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) }
+#define MI_PAGE_QUEUES_EMPTY \
+ { QNULL(1), \
+ QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \
+ QNULL( 10), QNULL( 12), QNULL( 14), QNULL( 16), QNULL( 20), QNULL( 24), QNULL( 28), QNULL( 32), /* 16 */ \
+ QNULL( 40), QNULL( 48), QNULL( 56), QNULL( 64), QNULL( 80), QNULL( 96), QNULL( 112), QNULL( 128), /* 24 */ \
+ QNULL( 160), QNULL( 192), QNULL( 224), QNULL( 256), QNULL( 320), QNULL( 384), QNULL( 448), QNULL( 512), /* 32 */ \
+ QNULL( 640), QNULL( 768), QNULL( 896), QNULL( 1024), QNULL( 1280), QNULL( 1536), QNULL( 1792), QNULL( 2048), /* 40 */ \
+ QNULL( 2560), QNULL( 3072), QNULL( 3584), QNULL( 4096), QNULL( 5120), QNULL( 6144), QNULL( 7168), QNULL( 8192), /* 48 */ \
+ QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
+ QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
+ QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
+ QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \
+ QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 2) /* Full queue */ }
+
+#define MI_STAT_COUNT_NULL() {0,0,0,0}
+
+// Empty statistics
+#if MI_STAT>1
+#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) }
+#else
+#define MI_STAT_COUNT_END_NULL()
+#endif
+
+#define MI_STATS_NULL \
+ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
+ MI_STAT_COUNT_END_NULL()
+
+
+// Empty slice span queues for every bin
+#define SQNULL(sz) { NULL, NULL, sz }
+#define MI_SEGMENT_SPAN_QUEUES_EMPTY \
+ { SQNULL(1), \
+ SQNULL( 1), SQNULL( 2), SQNULL( 3), SQNULL( 4), SQNULL( 5), SQNULL( 6), SQNULL( 7), SQNULL( 10), /* 8 */ \
+ SQNULL( 12), SQNULL( 14), SQNULL( 16), SQNULL( 20), SQNULL( 24), SQNULL( 28), SQNULL( 32), SQNULL( 40), /* 16 */ \
+ SQNULL( 48), SQNULL( 56), SQNULL( 64), SQNULL( 80), SQNULL( 96), SQNULL( 112), SQNULL( 128), SQNULL( 160), /* 24 */ \
+ SQNULL( 192), SQNULL( 224), SQNULL( 256), SQNULL( 320), SQNULL( 384), SQNULL( 448), SQNULL( 512), SQNULL( 640), /* 32 */ \
+ SQNULL( 768), SQNULL( 896), SQNULL( 1024) /* 35 */ }
+
+
+// --------------------------------------------------------
+// Statically allocate an empty heap as the initial
+// thread local value for the default heap,
+// and statically allocate the backing heap for the main
+// thread so it can function without doing any allocation
+// itself (as accessing a thread local for the first time
+// may lead to allocation itself on some platforms)
+// --------------------------------------------------------
+
+mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
+ NULL,
+ MI_SMALL_PAGES_EMPTY,
+ MI_PAGE_QUEUES_EMPTY,
+ MI_ATOMIC_VAR_INIT(NULL),
+ 0, // tid
+ 0, // cookie
+ { 0, 0 }, // keys
+ { {0}, {0}, 0 },
+ 0, // page count
+ MI_BIN_FULL, 0, // page retired min/max
+ NULL, // next
+ false
+};
+
+#define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
+#define tld_empty_os ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os)))
+
+mi_decl_cache_align static const mi_tld_t tld_empty = {
+ 0,
+ false,
+ NULL, NULL,
+ { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, tld_empty_stats, tld_empty_os }, // segments
+ { 0, tld_empty_stats }, // os
+ { MI_STATS_NULL } // stats
+};
+
+// the thread-local default heap for allocation
+mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
+
+extern mi_heap_t _mi_heap_main;
+
+static mi_tld_t tld_main = {
+ 0, false,
+ &_mi_heap_main, & _mi_heap_main,
+ { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments
+ { 0, &tld_main.stats }, // os
+ { MI_STATS_NULL } // stats
+};
+
+mi_heap_t _mi_heap_main = {
+ &tld_main,
+ MI_SMALL_PAGES_EMPTY,
+ MI_PAGE_QUEUES_EMPTY,
+ MI_ATOMIC_VAR_INIT(NULL),
+ 0, // thread id
+ 0, // initial cookie
+ { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
+ { {0x846ca68b}, {0}, 0 }, // random
+ 0, // page count
+ MI_BIN_FULL, 0, // page retired min/max
+ NULL, // next heap
+ false // can reclaim
+};
+
+bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`.
+
+mi_stats_t _mi_stats_main = { MI_STATS_NULL };
+
+
+static void mi_heap_main_init(void) {
+ if (_mi_heap_main.cookie == 0) {
+ _mi_heap_main.thread_id = _mi_thread_id();
+ _mi_heap_main.cookie = _mi_os_random_weak((uintptr_t)&mi_heap_main_init);
+ _mi_random_init(&_mi_heap_main.random);
+ _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
+ _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
+ }
+}
+
+mi_heap_t* _mi_heap_main_get(void) {
+ mi_heap_main_init();
+ return &_mi_heap_main;
+}
+
+
+/* -----------------------------------------------------------
+ Initialization and freeing of the thread local heaps
+----------------------------------------------------------- */
+
+// note: in an x64 release build `sizeof(mi_thread_data_t)` is under 4KiB (= the OS page size).
+typedef struct mi_thread_data_s {
+ mi_heap_t heap; // must come first due to cast in `_mi_heap_done`
+ mi_tld_t tld;
+} mi_thread_data_t;
+
+
+// Thread meta-data is allocated directly from the OS. For
+// some programs that do not use thread pools and allocate and
+// destroy many OS threads, this may cause too much overhead
+// per thread, so we maintain a small cache of recently freed metadata.
+
+#define TD_CACHE_SIZE (8)
+static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
+
+static mi_thread_data_t* mi_thread_data_alloc(void) {
+ // try to find thread metadata in the cache
+ mi_thread_data_t* td;
+ for (int i = 0; i < TD_CACHE_SIZE; i++) {
+ td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+ if (td != NULL) {
+ td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
+ if (td != NULL) {
+ return td;
+ }
+ }
+ }
+ // if that fails, allocate directly from the OS
+ td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
+ if (td == NULL) {
+ // if this fails, try once more. (issue #257)
+ td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
+ if (td == NULL) {
+ // really out of memory
+ _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
+ }
+ }
+ return td;
+}
+
+static void mi_thread_data_free( mi_thread_data_t* tdfree ) {
+ // try to add the thread metadata to the cache
+ for (int i = 0; i < TD_CACHE_SIZE; i++) {
+ mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+ if (td == NULL) {
+ mi_thread_data_t* expected = NULL;
+ if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) {
+ return;
+ }
+ }
+ }
+ // if that fails, just free it directly
+ _mi_os_free(tdfree, sizeof(mi_thread_data_t), &_mi_stats_main);
+}
+
+static void mi_thread_data_collect(void) {
+ // free all thread metadata from the cache
+ for (int i = 0; i < TD_CACHE_SIZE; i++) {
+ mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+ if (td != NULL) {
+ td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
+ if (td != NULL) {
+ _mi_os_free( td, sizeof(mi_thread_data_t), &_mi_stats_main );
+ }
+ }
+ }
+}
+
+// Initialize the thread local default heap, called from `mi_thread_init`
+static bool _mi_heap_init(void) {
+ if (mi_heap_is_initialized(mi_get_default_heap())) return true;
+ if (_mi_is_main_thread()) {
+ // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization
+ // the main heap is statically allocated
+ mi_heap_main_init();
+ _mi_heap_set_default_direct(&_mi_heap_main);
+ //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
+ }
+ else {
+ // use `_mi_os_alloc` to allocate directly from the OS
+ mi_thread_data_t* td = mi_thread_data_alloc();
+ if (td == NULL) return false;
+
+ // OS allocated so already zero initialized
+ mi_tld_t* tld = &td->tld;
+ mi_heap_t* heap = &td->heap;
+ _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
+ _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
+ heap->thread_id = _mi_thread_id();
+ _mi_random_init(&heap->random);
+ heap->cookie = _mi_heap_random_next(heap) | 1;
+ heap->keys[0] = _mi_heap_random_next(heap);
+ heap->keys[1] = _mi_heap_random_next(heap);
+ heap->tld = tld;
+ tld->heap_backing = heap;
+ tld->heaps = heap;
+ tld->segments.stats = &tld->stats;
+ tld->segments.os = &tld->os;
+ tld->os.stats = &tld->stats;
+ _mi_heap_set_default_direct(heap);
+ }
+ return false;
+}
+
+// Free the thread local default heap (called from `mi_thread_done`)
+static bool _mi_heap_done(mi_heap_t* heap) {
+ if (!mi_heap_is_initialized(heap)) return true;
+
+ // reset default heap
+ _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
+
+ // switch to backing heap
+ heap = heap->tld->heap_backing;
+ if (!mi_heap_is_initialized(heap)) return false;
+
+ // delete all non-backing heaps in this thread
+ mi_heap_t* curr = heap->tld->heaps;
+ while (curr != NULL) {
+ mi_heap_t* next = curr->next; // save `next` as `curr` will be freed
+ if (curr != heap) {
+ mi_assert_internal(!mi_heap_is_backing(curr));
+ mi_heap_delete(curr);
+ }
+ curr = next;
+ }
+ mi_assert_internal(heap->tld->heaps == heap && heap->next == NULL);
+ mi_assert_internal(mi_heap_is_backing(heap));
+
+ // collect if not the main thread
+ if (heap != &_mi_heap_main) {
+ _mi_heap_collect_abandon(heap);
+ }
+
+ // merge stats
+ _mi_stats_done(&heap->tld->stats);
+
+ // free if not the main thread
+ if (heap != &_mi_heap_main) {
+    // the following assertion does not always hold for huge segments as those are always treated
+    // as abandoned: one may allocate in one thread and deallocate in another, in which case
+    // the count can be too large or negative. todo: perhaps not count huge segments? see issue #363
+ // mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
+ mi_thread_data_free((mi_thread_data_t*)heap);
+ }
+ else {
+ mi_thread_data_collect(); // free cached thread metadata
+ #if 0
+ // never free the main thread even in debug mode; if a dll is linked statically with mimalloc,
+ // there may still be delete/free calls after the mi_fls_done is called. Issue #207
+ _mi_heap_destroy_pages(heap);
+ mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);
+ #endif
+ }
+ return false;
+}
+
+
+
+// --------------------------------------------------------
+// Try to run `mi_thread_done()` automatically so any memory
+// owned by the thread but not yet released can be abandoned
+// and re-owned by another thread.
+//
+// 1. windows dynamic library:
+// call from DllMain on DLL_THREAD_DETACH
+// 2. windows static library:
+// use `FlsAlloc` to call a destructor when the thread is done
+// 3. unix, pthreads:
+// use a pthread key to call a destructor when a pthread is done
+//
+// In the last two cases we also need to call `mi_process_init`
+// to set up the thread local keys.
+// --------------------------------------------------------
+
+static void _mi_thread_done(mi_heap_t* default_heap);
+
+#if defined(_WIN32) && defined(MI_SHARED_LIB)
+ // nothing to do as it is done in DllMain
+#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+ // use thread local storage keys to detect thread ending
+ #include <windows.h>
+ #include <fibersapi.h>
+ #if (_WIN32_WINNT < 0x600) // before Windows Vista
+ WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
+ WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
+ WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
+ WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex);
+ #endif
+ static DWORD mi_fls_key = (DWORD)(-1);
+ static void NTAPI mi_fls_done(PVOID value) {
+ if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
+ }
+#elif defined(MI_USE_PTHREADS)
+ // use pthread local storage keys to detect thread ending
+ // (and used with MI_TLS_PTHREADS for the default heap)
+ pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
+ static void mi_pthread_done(void* value) {
+ if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
+ }
+#elif defined(__wasi__)
+// no pthreads in the WebAssembly Standard Interface
+#else
+ #pragma message("define a way to call mi_thread_done when a thread is done")
+#endif
+
+// Set up handlers so `mi_thread_done` is called automatically
+static void mi_process_setup_auto_thread_done(void) {
+ static bool tls_initialized = false; // fine if it races
+ if (tls_initialized) return;
+ tls_initialized = true;
+ #if defined(_WIN32) && defined(MI_SHARED_LIB)
+ // nothing to do as it is done in DllMain
+ #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+ mi_fls_key = FlsAlloc(&mi_fls_done);
+ #elif defined(MI_USE_PTHREADS)
+ mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
+ pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
+ #endif
+ _mi_heap_set_default_direct(&_mi_heap_main);
+}
+
+
+bool _mi_is_main_thread(void) {
+ return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id());
+}
+
+static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
+
+size_t _mi_current_thread_count(void) {
+ return mi_atomic_load_relaxed(&thread_count);
+}
+
+// This is called from `mi_malloc_generic`
+void mi_thread_init(void) mi_attr_noexcept
+{
+ // ensure our process has started already
+ mi_process_init();
+
+ // initialize the thread local default heap
+ // (this will call `_mi_heap_set_default_direct` and thus set the
+ // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
+ if (_mi_heap_init()) return; // returns true if already initialized
+
+ _mi_stat_increase(&_mi_stats_main.threads, 1);
+ mi_atomic_increment_relaxed(&thread_count);
+ //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
+}
+
+void mi_thread_done(void) mi_attr_noexcept {
+ _mi_thread_done(mi_get_default_heap());
+}
+
+static void _mi_thread_done(mi_heap_t* heap) {
+ mi_atomic_decrement_relaxed(&thread_count);
+ _mi_stat_decrease(&_mi_stats_main.threads, 1);
+
+ // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
+ if (heap->thread_id != _mi_thread_id()) return;
+
+ // abandon the thread local heap
+ if (_mi_heap_done(heap)) return; // returns true if already ran
+}
+
+void _mi_heap_set_default_direct(mi_heap_t* heap) {
+ mi_assert_internal(heap != NULL);
+ #if defined(MI_TLS_SLOT)
+ mi_tls_slot_set(MI_TLS_SLOT,heap);
+ #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+ *mi_tls_pthread_heap_slot() = heap;
+ #elif defined(MI_TLS_PTHREAD)
+ // we use _mi_heap_default_key
+ #else
+ _mi_heap_default = heap;
+ #endif
+
+ // ensure the default heap is passed to `_mi_thread_done`
+ // setting to a non-NULL value also ensures `mi_thread_done` is called.
+ #if defined(_WIN32) && defined(MI_SHARED_LIB)
+ // nothing to do as it is done in DllMain
+ #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+ mi_assert_internal(mi_fls_key != 0);
+ FlsSetValue(mi_fls_key, heap);
+ #elif defined(MI_USE_PTHREADS)
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on FreeBSD
+ pthread_setspecific(_mi_heap_default_key, heap);
+ }
+ #endif
+}
+
+
+// --------------------------------------------------------
+// Run functions on process init/done, and thread init/done
+// --------------------------------------------------------
+static void mi_process_done(void);
+
+static bool os_preloading = true; // true until this module is initialized
+static bool mi_redirected = false; // true if malloc redirects to mi_malloc
+
+// Returns true if this module has not been initialized yet; don't use C runtime routines until it returns false.
+bool _mi_preloading(void) {
+ return os_preloading;
+}
+
+mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
+ return mi_redirected;
+}
+
+// Communicate with the redirection module on Windows
+#if defined(_WIN32) && defined(MI_SHARED_LIB)
+#ifdef __cplusplus
+extern "C" {
+#endif
+mi_decl_export void _mi_redirect_entry(DWORD reason) {
+ // called on redirection; careful as this may be called before DllMain
+ if (reason == DLL_PROCESS_ATTACH) {
+ mi_redirected = true;
+ }
+ else if (reason == DLL_PROCESS_DETACH) {
+ mi_redirected = false;
+ }
+ else if (reason == DLL_THREAD_DETACH) {
+ mi_thread_done();
+ }
+}
+__declspec(dllimport) bool mi_allocator_init(const char** message);
+__declspec(dllimport) void mi_allocator_done(void);
+#ifdef __cplusplus
+}
+#endif
+#else
+static bool mi_allocator_init(const char** message) {
+ if (message != NULL) *message = NULL;
+ return true;
+}
+static void mi_allocator_done(void) {
+ // nothing to do
+}
+#endif
+
+// Called once by the process loader
+static void mi_process_load(void) {
+ mi_heap_main_init();
+ #if defined(MI_TLS_RECURSE_GUARD)
+ volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
+ MI_UNUSED(dummy);
+ #endif
+ os_preloading = false;
+ #if !(defined(_WIN32) && defined(MI_SHARED_LIB)) // use Dll process detach (see below) instead of atexit (issue #521)
+ atexit(&mi_process_done);
+ #endif
+ _mi_options_init();
+ mi_process_init();
+ //mi_stats_reset();
+ if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");
+
+ // show message from the redirector (if present)
+ const char* msg = NULL;
+ mi_allocator_init(&msg);
+ if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
+ _mi_fputs(NULL,NULL,NULL,msg);
+ }
+}
+
+#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
+
+static void mi_detect_cpu_features(void) {
+ // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
+ int32_t cpu_info[4];
+ __cpuid(cpu_info, 7);
+ _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https ://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+}
+#else
+static void mi_detect_cpu_features(void) {
+ // nothing
+}
+#endif
+
+// Initialize the process; called by thread_init or the process loader
+void mi_process_init(void) mi_attr_noexcept {
+ // ensure we are called once
+ if (_mi_process_is_initialized) return;
+ _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
+ _mi_process_is_initialized = true;
+ mi_process_setup_auto_thread_done();
+
+
+ mi_detect_cpu_features();
+ _mi_os_init();
+ mi_heap_main_init();
+ #if (MI_DEBUG)
+ _mi_verbose_message("debug level : %d\n", MI_DEBUG);
+ #endif
+ _mi_verbose_message("secure level: %d\n", MI_SECURE);
+ mi_thread_init();
+
+ #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+ // When building as a static lib the FLS cleanup happens too early for the main thread.
+ // To avoid this, set the FLS value for the main thread to NULL so the FLS cleanup
+ // will not call _mi_thread_done on the (still executing) main thread. See issue #508.
+ FlsSetValue(mi_fls_key, NULL);
+ #endif
+
+ mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL)
+
+ if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+ size_t pages = mi_option_get_clamp(mi_option_reserve_huge_os_pages, 0, 128*1024);
+ long reserve_at = mi_option_get(mi_option_reserve_huge_os_pages_at);
+ if (reserve_at != -1) {
+ mi_reserve_huge_os_pages_at(pages, reserve_at, pages*500);
+ } else {
+ mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
+ }
+ }
+ if (mi_option_is_enabled(mi_option_reserve_os_memory)) {
+ long ksize = mi_option_get(mi_option_reserve_os_memory);
+ if (ksize > 0) {
+ mi_reserve_os_memory((size_t)ksize*MI_KiB, true /* commit? */, true /* allow large pages? */);
+ }
+ }
+}
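+
+// Sketch (illustrative, not part of this file): the same reservations can be
+// requested programmatically through the public API in `mimalloc.h`, which is
+// effectively what the option handling above does at startup.
+//
+//   // reserve 4 x 1GiB huge OS pages, interleaved over the available NUMA
+//   // nodes, waiting at most 500ms per page (returns 0 on success):
+//   mi_reserve_huge_os_pages_interleave(4, 0, 4*500);
+//   // or reserve 1GiB of plain OS memory, committed, allowing large pages:
+//   mi_reserve_os_memory((size_t)1024*1024*1024, true, true);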
+
+// Called when the process is done (through `atexit`)
+static void mi_process_done(void) {
+ // only shutdown if we were initialized
+ if (!_mi_process_is_initialized) return;
+ // ensure we are called once
+ static bool process_done = false;
+ if (process_done) return;
+ process_done = true;
+
+ #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+ FlsFree(mi_fls_key); // call thread-done on all threads (except the main thread) to prevent dangling callback pointer if statically linked with a DLL; Issue #208
+ #endif
+
+ #ifndef MI_SKIP_COLLECT_ON_EXIT
+ #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB)
+ // free all memory if possible on process exit. This is not needed for a stand-alone process
+ // but should be done if mimalloc is statically linked into another shared library which
+ // is repeatedly loaded/unloaded, see issue #281.
+ mi_collect(true /* force */ );
+ #endif
+ #endif
+
+ if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
+ mi_stats_print(NULL);
+ }
+ mi_allocator_done();
+ _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
+ os_preloading = true; // don't call the C runtime anymore
+}
+
+
+
+#if defined(_WIN32) && defined(MI_SHARED_LIB)
+ // Windows DLL: easy to hook into process_init and thread_done
+ __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
+ MI_UNUSED(reserved);
+ MI_UNUSED(inst);
+ if (reason==DLL_PROCESS_ATTACH) {
+ mi_process_load();
+ }
+ else if (reason==DLL_PROCESS_DETACH) {
+ mi_process_done();
+ }
+ else if (reason==DLL_THREAD_DETACH) {
+ if (!mi_is_redirected()) {
+ mi_thread_done();
+ }
+ }
+ return TRUE;
+ }
+
+#elif defined(_MSC_VER)
+ // MSVC: use data section magic for static libraries
+ // See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
+ static int _mi_process_init(void) {
+ mi_process_load();
+ return 0;
+ }
+ typedef int(*_mi_crt_callback_t)(void);
+ #if defined(_M_X64) || defined(_M_ARM64)
+ __pragma(comment(linker, "/include:" "_mi_msvc_initu"))
+ #pragma section(".CRT$XIU", long, read)
+ #else
+ __pragma(comment(linker, "/include:" "__mi_msvc_initu"))
+ #endif
+ #pragma data_seg(".CRT$XIU")
+ mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init };
+ #pragma data_seg()
+
+#elif defined(__cplusplus)
+ // C++: use static initialization to detect process start
+ static bool _mi_process_init(void) {
+ mi_process_load();
+ return (_mi_heap_main.thread_id != 0);
+ }
+ static bool mi_initialized = _mi_process_init();
+
+#elif defined(__GNUC__) || defined(__clang__)
+ // GCC, Clang: use the constructor attribute
+ static void __attribute__((constructor)) _mi_process_init(void) {
+ mi_process_load();
+ }
+
+#else
+#pragma message("define a way to call mi_process_load on your platform")
+#endif
diff --git a/source/luametatex/source/libraries/mimalloc/src/options.c b/source/luametatex/source/libraries/mimalloc/src/options.c
new file mode 100644
index 000000000..6b2379322
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/options.c
@@ -0,0 +1,627 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <stdio.h>
+#include <stdlib.h> // strtol
+#include <string.h> // strncpy, strncat, strlen, strstr
+#include <ctype.h> // toupper
+#include <stdarg.h>
+
+#ifdef _MSC_VER
+#pragma warning(disable:4996) // strncpy, strncat
+#endif
+
+
+static long mi_max_error_count = 16; // stop outputting errors after this (use < 0 for no limit)
+static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit)
+
+static void mi_add_stderr_output(void);
+
+int mi_version(void) mi_attr_noexcept {
+ return MI_MALLOC_VERSION;
+}
+
+#ifdef _WIN32
+#include <conio.h>
+#endif
+
+// --------------------------------------------------------
+// Options
+// These can be accessed by multiple threads and may be
+// concurrently initialized, but an initializing data race
+// is ok since they resolve to the same value.
+// --------------------------------------------------------
+typedef enum mi_init_e {
+ UNINIT, // not yet initialized
+ DEFAULTED, // not found in the environment, use default value
+ INITIALIZED // found in environment or set explicitly
+} mi_init_t;
+
+typedef struct mi_option_desc_s {
+ long value; // the value
+ mi_init_t init; // is it initialized yet? (from the environment)
+ mi_option_t option; // for debugging: the option index should match the option
+ const char* name; // option name without `mimalloc_` prefix
+ const char* legacy_name; // potential legacy v1.x option name
+} mi_option_desc_t;
+
+#define MI_OPTION(opt) mi_option_##opt, #opt, NULL
+#define MI_OPTION_LEGACY(opt,legacy) mi_option_##opt, #opt, #legacy
+
+static mi_option_desc_t options[_mi_option_last] =
+{
+ // stable options
+ #if MI_DEBUG || defined(MI_SHOW_ERRORS)
+ { 1, UNINIT, MI_OPTION(show_errors) },
+ #else
+ { 0, UNINIT, MI_OPTION(show_errors) },
+ #endif
+ { 0, UNINIT, MI_OPTION(show_stats) },
+ { 0, UNINIT, MI_OPTION(verbose) },
+
+ // Some of the following options are experimental and not all combinations are valid. Use with care.
+ { 1, UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (8MiB) (but see also `eager_commit_delay`)
+ { 0, UNINIT, MI_OPTION(deprecated_eager_region_commit) },
+ { 0, UNINIT, MI_OPTION(deprecated_reset_decommits) },
+ { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
+ { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages
+ { -1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N
+ { 0, UNINIT, MI_OPTION(reserve_os_memory) },
+ { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread
+ { 0, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free
+ { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_decommit, abandoned_page_reset) },// decommit free page memory when a thread terminates
+ { 0, UNINIT, MI_OPTION(deprecated_segment_reset) },
+ #if defined(__NetBSD__)
+ { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
+ #elif defined(_WIN32)
+ { 4, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+ #else
+ { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+ #endif
+ { 25, UNINIT, MI_OPTION_LEGACY(decommit_delay, reset_delay) }, // page decommit delay in milli-seconds
+ { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes.
+ { 0, UNINIT, MI_OPTION(limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas)
+ { 100, UNINIT, MI_OPTION(os_tag) }, // currently only Apple specific (used for VM_MAKE_TAG) but might serve a similar purpose elsewhere
+ { 16, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output
+ { 16, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output
+ { 8, UNINIT, MI_OPTION(max_segment_reclaim)},// max. number of segment reclaims from the abandoned segments per try.
+ { 1, UNINIT, MI_OPTION(allow_decommit) }, // decommit slices when no longer used (after decommit_delay milli-seconds)
+ { 500, UNINIT, MI_OPTION(segment_decommit_delay) }, // decommit delay in milli-seconds for freed segments
+ { 2, UNINIT, MI_OPTION(decommit_extend_delay) }
+};
+
+static void mi_option_init(mi_option_desc_t* desc);
+
+void _mi_options_init(void) {
+ // called on process load; should not be called before the CRT is initialized!
+ // (e.g. do not call this from process_init as that may run before CRT initialization)
+ mi_add_stderr_output(); // now it is safe to use stderr for output
+ for(int i = 0; i < _mi_option_last; i++ ) {
+ mi_option_t option = (mi_option_t)i;
+ long l = mi_option_get(option); MI_UNUSED(l); // initialize
+ if (option != mi_option_verbose) {
+ mi_option_desc_t* desc = &options[option];
+ _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
+ }
+ }
+ mi_max_error_count = mi_option_get(mi_option_max_errors);
+ mi_max_warning_count = mi_option_get(mi_option_max_warnings);
+}
+
+mi_decl_nodiscard long mi_option_get(mi_option_t option) {
+ mi_assert(option >= 0 && option < _mi_option_last);
+ if (option < 0 || option >= _mi_option_last) return 0;
+ mi_option_desc_t* desc = &options[option];
+ mi_assert(desc->option == option); // index should match the option
+ if (mi_unlikely(desc->init == UNINIT)) {
+ mi_option_init(desc);
+ }
+ return desc->value;
+}
+
+mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long max) {
+ long x = mi_option_get(option);
+ return (x < min ? min : (x > max ? max : x));
+}
+
+void mi_option_set(mi_option_t option, long value) {
+ mi_assert(option >= 0 && option < _mi_option_last);
+ if (option < 0 || option >= _mi_option_last) return;
+ mi_option_desc_t* desc = &options[option];
+ mi_assert(desc->option == option); // index should match the option
+ desc->value = value;
+ desc->init = INITIALIZED;
+}
+
+void mi_option_set_default(mi_option_t option, long value) {
+ mi_assert(option >= 0 && option < _mi_option_last);
+ if (option < 0 || option >= _mi_option_last) return;
+ mi_option_desc_t* desc = &options[option];
+ if (desc->init != INITIALIZED) {
+ desc->value = value;
+ }
+}
+
+mi_decl_nodiscard bool mi_option_is_enabled(mi_option_t option) {
+ return (mi_option_get(option) != 0);
+}
+
+void mi_option_set_enabled(mi_option_t option, bool enable) {
+ mi_option_set(option, (enable ? 1 : 0));
+}
+
+void mi_option_set_enabled_default(mi_option_t option, bool enable) {
+ mi_option_set_default(option, (enable ? 1 : 0));
+}
+
+void mi_option_enable(mi_option_t option) {
+ mi_option_set_enabled(option,true);
+}
+
+void mi_option_disable(mi_option_t option) {
+ mi_option_set_enabled(option,false);
+}
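+
+// Usage sketch (illustrative, not part of this file): options can also be set
+// programmatically through the public accessors above; a value set this way
+// is marked INITIALIZED and the environment is never consulted for it.
+//
+//   mi_option_enable(mi_option_show_stats);          // like MIMALLOC_SHOW_STATS=1
+//   mi_option_set(mi_option_eager_commit_delay, 2);
+//   long delay = mi_option_get(mi_option_eager_commit_delay);  // == 2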
+
+
+static void mi_out_stderr(const char* msg, void* arg) {
+ MI_UNUSED(arg);
+ if (msg == NULL) return;
+ #ifdef _WIN32
+ // on Windows with redirection, the C runtime cannot handle locale-dependent output
+ // after the main thread closes so we use direct console output.
+ if (!_mi_preloading()) {
+ // _cputs(msg); // _cputs cannot be used as it aborts if it fails to lock the console
+ static HANDLE hcon = INVALID_HANDLE_VALUE;
+ if (hcon == INVALID_HANDLE_VALUE) {
+ hcon = GetStdHandle(STD_ERROR_HANDLE);
+ }
+ const size_t len = strlen(msg);
+ if (hcon != INVALID_HANDLE_VALUE && len > 0 && len < UINT32_MAX) {
+ DWORD written = 0;
+ WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
+ }
+ }
+ #else
+ fputs(msg, stderr);
+ #endif
+}
+
+// Since an output function can be registered in the `main`
+// function at the earliest, we also buffer any output that happens before
+// that. When an output function is registered it is called immediately with
+// the output produced up to that point.
+#ifndef MI_MAX_DELAY_OUTPUT
+#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024))
+#endif
+static char out_buf[MI_MAX_DELAY_OUTPUT+1];
+static _Atomic(size_t) out_len;
+
+static void mi_out_buf(const char* msg, void* arg) {
+ MI_UNUSED(arg);
+ if (msg==NULL) return;
+ if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
+ size_t n = strlen(msg);
+ if (n==0) return;
+ // claim space
+ size_t start = mi_atomic_add_acq_rel(&out_len, n);
+ if (start >= MI_MAX_DELAY_OUTPUT) return;
+ // check bound
+ if (start+n >= MI_MAX_DELAY_OUTPUT) {
+ n = MI_MAX_DELAY_OUTPUT-start-1;
+ }
+ _mi_memcpy(&out_buf[start], msg, n);
+}
+
+static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
+ if (out==NULL) return;
+ // claim (if `no_more_buf == true`, no more output will be added after this point)
+ size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
+ // and output the current contents
+ if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
+ out_buf[count] = 0;
+ out(out_buf,arg);
+ if (!no_more_buf) {
+ out_buf[count] = '\n'; // if we continue with the buffer, insert a newline
+ }
+}
+
+
+// Once this module is loaded, switch to this routine
+// which outputs to stderr and the delayed output buffer.
+static void mi_out_buf_stderr(const char* msg, void* arg) {
+ mi_out_stderr(msg,arg);
+ mi_out_buf(msg,arg);
+}
+
+
+
+// --------------------------------------------------------
+// Default output handler
+// --------------------------------------------------------
+
+// This should be atomic, but that gives errors on many platforms as we generally cannot cast a function pointer to a uintptr_t.
+// For now, do not register output handlers from multiple threads.
+static mi_output_fun* volatile mi_out_default; // = NULL
+static _Atomic(void*) mi_out_arg; // = NULL
+
+static mi_output_fun* mi_out_get_default(void** parg) {
+ if (parg != NULL) { *parg = mi_atomic_load_ptr_acquire(void,&mi_out_arg); }
+ mi_output_fun* out = mi_out_default;
+ return (out == NULL ? &mi_out_buf : out);
+}
+
+void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
+ mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer
+ mi_atomic_store_ptr_release(void,&mi_out_arg, arg);
+ if (out!=NULL) mi_out_buf_flush(out,true,arg); // output all the delayed output now
+}
+
+// add stderr to the delayed output after the module is loaded
+static void mi_add_stderr_output(void) {
+ mi_assert_internal(mi_out_default == NULL);
+ mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr
+ mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output
+}
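+
+// Usage sketch (illustrative, not part of this file): a handler registered
+// through the public `mi_register_output` immediately receives all output
+// buffered so far, and every message after that.
+//
+//   #include <stdio.h>
+//   static void my_output(const char* msg, void* arg) {
+//     fputs(msg, (FILE*)arg);   // matches the `mi_output_fun` signature
+//   }
+//   static void install_output(void) {
+//     mi_register_output(&my_output, stderr);
+//   }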
+
+// --------------------------------------------------------
+// Messages, all end up calling `_mi_fputs`.
+// --------------------------------------------------------
+static _Atomic(size_t) error_count; // = 0; // when >= max_error_count stop emitting errors
+static _Atomic(size_t) warning_count; // = 0; // when >= max_warning_count stop emitting warnings
+
+// When overriding malloc, we may recurse into mi_vfprintf if an allocation
+// inside the C runtime causes another message.
+// In some cases (like on macOS) the loader already allocates memory, which
+// calls into mimalloc; if we then access thread locals (like `recurse`)
+// this may crash as the access may call _tlv_bootstrap, which tries to
+// (recursively) invoke malloc again to allocate space for the thread local
+// variables on demand. This is why we use a _mi_preloading test on such
+// platforms. However, the C code generator may move the initial thread-local
+// address load before the `if`, so we split it out into a separate function.
+static mi_decl_thread bool recurse = false;
+
+static mi_decl_noinline bool mi_recurse_enter_prim(void) {
+ if (recurse) return false;
+ recurse = true;
+ return true;
+}
+
+static mi_decl_noinline void mi_recurse_exit_prim(void) {
+ recurse = false;
+}
+
+static bool mi_recurse_enter(void) {
+ #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
+ if (_mi_preloading()) return true;
+ #endif
+ return mi_recurse_enter_prim();
+}
+
+static void mi_recurse_exit(void) {
+ #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
+ if (_mi_preloading()) return;
+ #endif
+ mi_recurse_exit_prim();
+}
+
+void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
+ if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
+ if (!mi_recurse_enter()) return;
+ out = mi_out_get_default(&arg);
+ if (prefix != NULL) out(prefix, arg);
+ out(message, arg);
+ mi_recurse_exit();
+ }
+ else {
+ if (prefix != NULL) out(prefix, arg);
+ out(message, arg);
+ }
+}
+
+// Define our own limited `fprintf` that avoids memory allocation.
+// We do this using `snprintf` with a limited buffer.
+static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
+ char buf[512];
+ if (fmt==NULL) return;
+ if (!mi_recurse_enter()) return;
+ vsnprintf(buf,sizeof(buf)-1,fmt,args);
+ mi_recurse_exit();
+ _mi_fputs(out,arg,prefix,buf);
+}
+
+void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
+ va_list args;
+ va_start(args,fmt);
+ mi_vfprintf(out,arg,NULL,fmt,args);
+ va_end(args);
+}
+
+static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) {
+ if (prefix != NULL && strlen(prefix) <= 32 && !_mi_is_main_thread()) {
+ char tprefix[64];
+ snprintf(tprefix, sizeof(tprefix), "%sthread 0x%x: ", prefix, (unsigned) _mi_thread_id()); /* HH: %z is unknown */
+ mi_vfprintf(out, arg, tprefix, fmt, args);
+ }
+ else {
+ mi_vfprintf(out, arg, prefix, fmt, args);
+ }
+}
+
+void _mi_trace_message(const char* fmt, ...) {
+ if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher
+ va_list args;
+ va_start(args, fmt);
+ mi_vfprintf_thread(NULL, NULL, "mimalloc: ", fmt, args);
+ va_end(args);
+}
+
+void _mi_verbose_message(const char* fmt, ...) {
+ if (!mi_option_is_enabled(mi_option_verbose)) return;
+ va_list args;
+ va_start(args,fmt);
+ mi_vfprintf(NULL, NULL, "mimalloc: ", fmt, args);
+ va_end(args);
+}
+
+static void mi_show_error_message(const char* fmt, va_list args) {
+ if (!mi_option_is_enabled(mi_option_verbose)) {
+ if (!mi_option_is_enabled(mi_option_show_errors)) return;
+ if (mi_max_error_count >= 0 && (long)mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return;
+ }
+ mi_vfprintf_thread(NULL, NULL, "mimalloc: error: ", fmt, args);
+}
+
+void _mi_warning_message(const char* fmt, ...) {
+ if (!mi_option_is_enabled(mi_option_verbose)) {
+ if (!mi_option_is_enabled(mi_option_show_errors)) return;
+ if (mi_max_warning_count >= 0 && (long)mi_atomic_increment_acq_rel(&warning_count) > mi_max_warning_count) return;
+ }
+ va_list args;
+ va_start(args,fmt);
+ mi_vfprintf_thread(NULL, NULL, "mimalloc: warning: ", fmt, args);
+ va_end(args);
+}
+
+
+#if MI_DEBUG
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) {
+ _mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
+ abort();
+}
+#endif
+
+// --------------------------------------------------------
+// Errors
+// --------------------------------------------------------
+
+static mi_error_fun* volatile mi_error_handler; // = NULL
+static _Atomic(void*) mi_error_arg; // = NULL
+
+static void mi_error_default(int err) {
+ MI_UNUSED(err);
+#if (MI_DEBUG>0)
+ if (err==EFAULT) {
+ #ifdef _MSC_VER
+ __debugbreak();
+ #endif
+ abort();
+ }
+#endif
+#if (MI_SECURE>0)
+ if (err==EFAULT) { // abort on serious errors in secure mode (corrupted meta-data)
+ abort();
+ }
+#endif
+#if defined(MI_XMALLOC)
+ if (err==ENOMEM || err==EOVERFLOW) { // abort on memory allocation fails in xmalloc mode
+ abort();
+ }
+#endif
+}
+
+void mi_register_error(mi_error_fun* fun, void* arg) {
+ mi_error_handler = fun; // can be NULL
+ mi_atomic_store_ptr_release(void,&mi_error_arg, arg);
+}
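+
+// Usage sketch (illustrative, not part of this file): an error handler
+// registered with the public `mi_register_error` is invoked with errno-style
+// codes (EFAULT for corrupted meta-data, ENOMEM/EOVERFLOW for failing
+// allocations) and may abort or simply return.
+//
+//   #include <errno.h>
+//   static void my_error_handler(int err, void* arg) {
+//     (void)arg;
+//     if (err == EFAULT) { /* heap corruption detected: log and abort */ }
+//   }
+//   static void install_error_handler(void) {
+//     mi_register_error(&my_error_handler, NULL);
+//   }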
+
+void _mi_error_message(int err, const char* fmt, ...) {
+ // show detailed error message
+ va_list args;
+ va_start(args, fmt);
+ mi_show_error_message(fmt, args);
+ va_end(args);
+ // and call the error handler which may abort (or return normally)
+ if (mi_error_handler != NULL) {
+ mi_error_handler(err, mi_atomic_load_ptr_acquire(void,&mi_error_arg));
+ }
+ else {
+ mi_error_default(err);
+ }
+}
+
+// --------------------------------------------------------
+// Initialize options by checking the environment
+// --------------------------------------------------------
+
+static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
+ if (dest==NULL || src==NULL || dest_size == 0) return;
+ // copy until end of src, or when dest is (almost) full
+ while (*src != 0 && dest_size > 1) {
+ *dest++ = *src++;
+ dest_size--;
+ }
+ // always zero terminate
+ *dest = 0;
+}
+
+static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
+ if (dest==NULL || src==NULL || dest_size == 0) return;
+ // find end of string in the dest buffer
+ while (*dest != 0 && dest_size > 1) {
+ dest++;
+ dest_size--;
+ }
+ // and concatenate
+ mi_strlcpy(dest, src, dest_size);
+}
+
+#ifdef MI_NO_GETENV
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+ MI_UNUSED(name);
+ MI_UNUSED(result);
+ MI_UNUSED(result_size);
+ return false;
+}
+#else
+static inline int mi_strnicmp(const char* s, const char* t, size_t n) {
+ if (n==0) return 0;
+ for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) {
+ if (toupper(*s) != toupper(*t)) break;
+ }
+ return (n==0 ? 0 : *s - *t);
+}
+#if defined _WIN32
+// On Windows we use GetEnvironmentVariable instead of getenv to work
+// reliably even when this is invoked before the C runtime is initialized,
+// i.e. when `_mi_preloading() == true`.
+// Note: on Windows, environment names are not case sensitive.
+#include <windows.h>
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+ result[0] = 0;
+ size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size);
+ return (len > 0 && len < result_size);
+}
+#elif !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0)
+// On POSIX systems we use `environ` to access environment variables
+// even before the C runtime is initialized.
+#if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>)
+#include <crt_externs.h>
+static char** mi_get_environ(void) {
+ return (*_NSGetEnviron());
+}
+#else
+extern char** environ;
+static char** mi_get_environ(void) {
+ return environ;
+}
+#endif
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+ if (name==NULL) return false;
+ const size_t len = strlen(name);
+ if (len == 0) return false;
+ char** env = mi_get_environ();
+ if (env == NULL) return false;
+ // compare up to 256 entries
+ for (int i = 0; i < 256 && env[i] != NULL; i++) {
+ const char* s = env[i];
+ if (mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive
+ // found it
+ mi_strlcpy(result, s + len + 1, result_size);
+ return true;
+ }
+ }
+ return false;
+}
+#else
+// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+ // cannot call getenv() when still initializing the C runtime.
+ if (_mi_preloading()) return false;
+ const char* s = getenv(name);
+ if (s == NULL) {
+ // we check the upper case name too.
+ char buf[64+1];
+ size_t len = strlen(name);
+ if (len >= sizeof(buf)) len = sizeof(buf) - 1;
+ for (size_t i = 0; i < len; i++) {
+ buf[i] = toupper(name[i]);
+ }
+ buf[len] = 0;
+ s = getenv(buf);
+ }
+ if (s != NULL && strlen(s) < result_size) {
+ mi_strlcpy(result, s, result_size);
+ return true;
+ }
+ else {
+ return false;
+ }
+}
+#endif // !MI_USE_ENVIRON
+#endif // !MI_NO_GETENV
+
+static void mi_option_init(mi_option_desc_t* desc) {
+ // Read option value from the environment
+ char s[64+1];
+ char buf[64+1];
+ mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+ mi_strlcat(buf, desc->name, sizeof(buf));
+ bool found = mi_getenv(buf,s,sizeof(s));
+ if (!found && desc->legacy_name != NULL) {
+ mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+ mi_strlcat(buf, desc->legacy_name, sizeof(buf));
+ found = mi_getenv(buf,s,sizeof(s));
+ if (found) {
+ _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name );
+ }
+ }
+
+ if (found) {
+ size_t len = strlen(s);
+ if (len >= sizeof(buf)) len = sizeof(buf) - 1;
+ for (size_t i = 0; i < len; i++) {
+ buf[i] = (char)toupper(s[i]);
+ }
+ buf[len] = 0;
+ if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
+ desc->value = 1;
+ desc->init = INITIALIZED;
+ }
+ else if (strstr("0;FALSE;NO;OFF", buf) != NULL) {
+ desc->value = 0;
+ desc->init = INITIALIZED;
+ }
+ else {
+ char* end = buf;
+ long value = strtol(buf, &end, 10);
+ if (desc->option == mi_option_reserve_os_memory) {
+ // this option is interpreted in KiB to prevent overflow of `long`
+ if (*end == 'K') { end++; }
+ else if (*end == 'M') { value *= MI_KiB; end++; }
+ else if (*end == 'G') { value *= MI_MiB; end++; }
+ else { value = (value + MI_KiB - 1) / MI_KiB; }
+ if (end[0] == 'I' && end[1] == 'B') { end += 2; }
+ else if (*end == 'B') { end++; }
+ }
+ if (*end == 0) {
+ desc->value = value;
+ desc->init = INITIALIZED;
+ }
+ else {
+ // set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose.
+ desc->init = DEFAULTED;
+ if (desc->option == mi_option_verbose && desc->value == 0) {
+ // if the 'mimalloc_verbose' env var has a bogus value we'd never know
+ // (since the value defaults to 'off') so in that case briefly enable verbose
+ desc->value = 1;
+ _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+ desc->value = 0;
+ }
+ else {
+ _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+ }
+ }
+ }
+ mi_assert_internal(desc->init != UNINIT);
+ }
+ else if (!_mi_preloading()) {
+ desc->init = DEFAULTED;
+ }
+}
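+
+// Examples (illustrative, not part of this file) of environment values the
+// parser above accepts; names are matched case-insensitively with a
+// `mimalloc_` prefix:
+//
+//   MIMALLOC_SHOW_STATS=on            -> value 1 (also: 1, true, yes)
+//   MIMALLOC_VERBOSE=false            -> value 0 (also: 0, no, off)
+//   MIMALLOC_EAGER_COMMIT_DELAY=4     -> value 4
+//   MIMALLOC_RESERVE_OS_MEMORY=1GiB   -> value 1048576 (stored in KiB)
+//   MIMALLOC_RESERVE_OS_MEMORY=2048   -> bytes, rounded up to 2 (KiB)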
diff --git a/source/luametatex/source/libraries/mimalloc/src/os.c b/source/luametatex/source/libraries/mimalloc/src/os.c
new file mode 100644
index 000000000..72959d818
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/os.c
@@ -0,0 +1,1443 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE // ensure mmap flags are defined
+#endif
+
+#if defined(__sun)
+// illumos provides the new mman.h api when any of these are defined;
+// otherwise the old api, based on caddr_t, which predates the void-pointer one.
+// stock Solaris provides only the former; we chose to simply discard those
+// flags only here rather than project-wide, though.
+#undef _XOPEN_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h> // strerror
+
+#ifdef _MSC_VER
+#pragma warning(disable:4996) // strerror
+#endif
+
+#if defined(__wasi__)
+#define MI_USE_SBRK
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(__wasi__)
+#include <unistd.h> // sbrk
+#else
+#include <sys/mman.h> // mmap
+#include <unistd.h> // sysconf
+#if defined(__linux__)
+#include <features.h>
+#include <fcntl.h>
+#if defined(__GLIBC__)
+#include <linux/mman.h> // linux mmap flags
+#else
+#include <sys/mman.h>
+#endif
+#endif
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
+#include <mach/vm_statistics.h>
+#endif
+#endif
+#if defined(__FreeBSD__) || defined(__DragonFly__)
+#include <sys/param.h>
+#if __FreeBSD_version >= 1200000
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
+#endif
+#include <sys/sysctl.h>
+#endif
+#endif
+
+/* -----------------------------------------------------------
+ Initialization.
+ On windows initializes support for aligned allocation and
+ large OS pages (if MIMALLOC_LARGE_OS_PAGES is true).
+----------------------------------------------------------- */
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats);
+
+static void* mi_align_up_ptr(void* p, size_t alignment) {
+ return (void*)_mi_align_up((uintptr_t)p, alignment);
+}
+
+static void* mi_align_down_ptr(void* p, size_t alignment) {
+ return (void*)_mi_align_down((uintptr_t)p, alignment);
+}
+
+
+// page size (initialized properly in `_mi_os_init`)
+static size_t os_page_size = 4096;
+
+// minimal allocation granularity
+static size_t os_alloc_granularity = 4096;
+
+// if non-zero, use large page allocation
+static size_t large_os_page_size = 0;
+
+// is memory overcommit allowed?
+// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE)
+static bool os_overcommit = true;
+
+bool _mi_os_has_overcommit(void) {
+ return os_overcommit;
+}
+
+// OS (small) page size
+size_t _mi_os_page_size(void) {
+ return os_page_size;
+}
+
+// if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB)
+size_t _mi_os_large_page_size(void) {
+ return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size());
+}
+
+#if !defined(MI_USE_SBRK) && !defined(__wasi__)
+static bool use_large_os_page(size_t size, size_t alignment) {
+ // if we have access, check the size and alignment requirements
+ if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false;
+ return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0);
+}
+#endif
+
+// round to a good OS allocation size (bounded by max 12.5% waste)
+size_t _mi_os_good_alloc_size(size_t size) {
+ size_t align_size;
+ if (size < 512*MI_KiB) align_size = _mi_os_page_size();
+ else if (size < 2*MI_MiB) align_size = 64*MI_KiB;
+ else if (size < 8*MI_MiB) align_size = 256*MI_KiB;
+ else if (size < 32*MI_MiB) align_size = 1*MI_MiB;
+ else align_size = 4*MI_MiB;
+ if (mi_unlikely(size >= (SIZE_MAX - align_size))) return size; // possible overflow?
+ return _mi_align_up(size, align_size);
+}
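+
+// Worked examples (illustrative): with the granularities above,
+//   _mi_os_good_alloc_size(700*MI_KiB) == 704*MI_KiB   // 64KiB steps below 2MiB
+//   _mi_os_good_alloc_size(33*MI_MiB)  == 36*MI_MiB    // 4MiB steps from 32MiB up
+// so the padding stays below the 12.5% waste bound mentioned above.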
+
+#if defined(_WIN32)
+// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016 or later.
+// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
+// NtAllocateVirtualMemoryEx is used for huge OS page allocation (1GiB)
+// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDKs.
+typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E {
+ MiMemExtendedParameterInvalidType = 0,
+ MiMemExtendedParameterAddressRequirements,
+ MiMemExtendedParameterNumaNode,
+ MiMemExtendedParameterPartitionHandle,
+ MiMemExtendedParameterUserPhysicalHandle,
+ MiMemExtendedParameterAttributeFlags,
+ MiMemExtendedParameterMax
+} MI_MEM_EXTENDED_PARAMETER_TYPE;
+
+typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S {
+ struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type;
+ union { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg;
+} MI_MEM_EXTENDED_PARAMETER;
+
+typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
+ PVOID LowestStartingAddress;
+ PVOID HighestEndingAddress;
+ SIZE_T Alignment;
+} MI_MEM_ADDRESS_REQUIREMENTS;
+
+#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010
+
+#include <winternl.h>
+typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+static PVirtualAlloc2 pVirtualAlloc2 = NULL;
+static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
+
+// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7
+typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
+
+typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
+typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
+typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
+static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
+static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL;
+static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL;
+
+static bool mi_win_enable_large_os_pages(void)
+{
+ if (large_os_page_size > 0) return true;
+
+ // Try to see if large OS pages are supported
+ // To use large pages on Windows, we first need access permission
+ // Set "Lock pages in memory" permission in the group policy editor
+ // <https://devblogs.microsoft.com/oldnewthing/20110128-00/?p=11643>
+ unsigned long err = 0;
+ HANDLE token = NULL;
+ BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
+ if (ok) {
+ TOKEN_PRIVILEGES tp;
+ ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid);
+ if (ok) {
+ tp.PrivilegeCount = 1;
+ tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+ ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
+ if (ok) {
+ err = GetLastError();
+ ok = (err == ERROR_SUCCESS);
+ if (ok) {
+ large_os_page_size = GetLargePageMinimum();
+ }
+ }
+ }
+ CloseHandle(token);
+ }
+ if (!ok) {
+ if (err == 0) err = GetLastError();
+ _mi_warning_message("cannot enable large OS page support, error %lu\n", err);
+ }
+ return (ok!=0);
+}
+
+void _mi_os_init(void)
+{
+ os_overcommit = false;
+ // get the page size
+ SYSTEM_INFO si;
+ GetSystemInfo(&si);
+ if (si.dwPageSize > 0) os_page_size = si.dwPageSize;
+ if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity;
+ // get the VirtualAlloc2 function
+ HINSTANCE hDll;
+ hDll = LoadLibrary(TEXT("kernelbase.dll"));
+ if (hDll != NULL) {
+ // use VirtualAlloc2FromApp if possible as it is available to Windows store apps
+ pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp");
+ if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
+ FreeLibrary(hDll);
+ }
+ // NtAllocateVirtualMemoryEx is used for huge page allocation
+ hDll = LoadLibrary(TEXT("ntdll.dll"));
+ if (hDll != NULL) {
+ pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
+ FreeLibrary(hDll);
+ }
+ // Try to use Win7+ numa API
+ hDll = LoadLibrary(TEXT("kernel32.dll"));
+ if (hDll != NULL) {
+ pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
+ pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
+ pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
+ FreeLibrary(hDll);
+ }
+ if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+ mi_win_enable_large_os_pages();
+ }
+}
+#elif defined(__wasi__)
+void _mi_os_init(void) {
+ os_overcommit = false;
+ os_page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
+ os_alloc_granularity = 16;
+}
+
+#else // generic unix
+
+static void os_detect_overcommit(void) {
+#if defined(__linux__)
+ int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
+ if (fd < 0) return;
+ char buf[32];
+ ssize_t nread = read(fd, &buf, sizeof(buf));
+ close(fd);
+ // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
+ // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE)
+ if (nread >= 1) {
+ os_overcommit = (buf[0] == '0' || buf[0] == '1');
+ }
+#elif defined(__FreeBSD__)
+ int val = 0;
+ size_t olen = sizeof(val);
+ if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) {
+ os_overcommit = (val != 0);
+ }
+#else
+ // default: overcommit is true
+#endif
+}
+
+void _mi_os_init(void) {
+ // get the page size
+ long result = sysconf(_SC_PAGESIZE);
+ if (result > 0) {
+ os_page_size = (size_t)result;
+ os_alloc_granularity = os_page_size;
+ }
+ large_os_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
+ os_detect_overcommit();
+}
+#endif
+
+
+#if defined(MADV_NORMAL)
+static int mi_madvise(void* addr, size_t length, int advice) {
+ #if defined(__sun)
+ return madvise((caddr_t)addr, length, advice); // Solaris needs cast (issue #520)
+ #else
+ return madvise(addr, length, advice);
+ #endif
+}
+#endif
+
+
+/* -----------------------------------------------------------
+ aligned hinting
+-------------------------------------------------------------- */
+
+// On 64-bit systems, we can do efficient aligned allocation by using
+// the 2TiB to 30TiB address area for those allocations.
+#if (MI_INTPTR_SIZE >= 8)
+static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
+
+// Return a MI_SEGMENT_SIZE aligned address that is probably available.
+// If this returns NULL, the OS will determine the address but on some OS's that may not be
+// properly aligned which can be more costly as it needs to be adjusted afterwards.
+// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
+// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
+// in the middle of the 2TiB - 6TiB address range (see issue #372))
+
+#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start
+#define MI_HINT_AREA ((uintptr_t)4 << 40) // up to 6TiB (since before Win8 there is "only" 8TiB available to processes)
+#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
+
+static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
+{
+ if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
+ size = _mi_align_up(size, MI_SEGMENT_SIZE);
+ if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
+ #if (MI_SECURE>0)
+ size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VMA's but increases guarded areas.
+ #endif
+
+ uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
+ if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize
+ uintptr_t init = MI_HINT_BASE;
+ #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
+ uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+ init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB
+ #endif
+ uintptr_t expected = hint + size;
+ mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
+ hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
+ }
+ if (hint%try_alignment != 0) return NULL;
+ return (void*)hint;
+}
+#else
+static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
+ MI_UNUSED(try_alignment); MI_UNUSED(size);
+ return NULL;
+}
+#endif
+
+/* -----------------------------------------------------------
+ Free memory
+-------------------------------------------------------------- */
+
+static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
+{
+ if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr)
+ bool err = false;
+#if defined(_WIN32)
+ DWORD errcode = 0;
+ err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
+ if (err) { errcode = GetLastError(); }
+ if (errcode == ERROR_INVALID_ADDRESS) {
+ // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside
+ // the memory region returned by VirtualAlloc; in that case we need to free using
+ // the start of the region.
+ MEMORY_BASIC_INFORMATION info = { 0, 0 };
+ VirtualQuery(addr, &info, sizeof(info));
+ if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < MI_SEGMENT_SIZE) {
+ errcode = 0;
+ err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
+ if (err) { errcode = GetLastError(); }
+ }
+ }
+ if (errcode != 0) {
+ _mi_warning_message("unable to release OS memory: error code 0x%x, addr: %p, size: %zu\n", errcode, addr, size);
+ }
+#elif defined(MI_USE_SBRK) || defined(__wasi__)
+ err = false; // sbrk heap cannot be shrunk
+#else
+ err = (munmap(addr, size) == -1);
+ if (err) {
+ _mi_warning_message("unable to release OS memory: %s, addr: %p, size: %zu\n", strerror(errno), addr, size);
+ }
+#endif
+ if (was_committed) { _mi_stat_decrease(&stats->committed, size); }
+ _mi_stat_decrease(&stats->reserved, size);
+ return !err;
+}
+
+
+/* -----------------------------------------------------------
+ Raw allocation on Windows (VirtualAlloc)
+-------------------------------------------------------------- */
+
+#ifdef _WIN32
+
+#define MEM_COMMIT_RESERVE (MEM_COMMIT|MEM_RESERVE)
+
+static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
+#if (MI_INTPTR_SIZE >= 8)
+ // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
+ if (addr == NULL) {
+ void* hint = mi_os_get_aligned_hint(try_alignment,size);
+ if (hint != NULL) {
+ void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
+ if (p != NULL) return p;
+ _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags);
+ // fall through on error
+ }
+ }
+#endif
+ // on modern Windows try to use VirtualAlloc2 for aligned allocation
+ if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
+ MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
+ reqs.Alignment = try_alignment;
+ MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
+ param.Type.Type = MiMemExtendedParameterAddressRequirements;
+ param.Arg.Pointer = &reqs;
+ void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
+ if (p != NULL) return p;
+ _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
+ // fall through on error
+ }
+ // last resort
+ return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}
+
+static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
+ mi_assert_internal(!(large_only && !allow_large));
+ static _Atomic(size_t) large_page_try_ok; // = 0;
+ void* p = NULL;
+ // Try to allocate large OS pages (2MiB) if allowed or required.
+ if ((large_only || use_large_os_page(size, try_alignment))
+ && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
+ size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+ if (!large_only && try_ok > 0) {
+ // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
+ // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
+ mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+ }
+ else {
+ // large OS pages must always reserve and commit.
+ *is_large = true;
+ p = mi_win_virtual_allocx(addr, size, try_alignment, flags | MEM_LARGE_PAGES);
+ if (large_only) return p;
+ // fall back to non-large page allocation on error (`p == NULL`).
+ if (p == NULL) {
+ mi_atomic_store_release(&large_page_try_ok,10UL); // on error, don't try again for the next N allocations
+ }
+ }
+ }
+ // Fall back to regular page allocation
+ if (p == NULL) {
+ *is_large = ((flags&MEM_LARGE_PAGES) != 0);
+ p = mi_win_virtual_allocx(addr, size, try_alignment, flags);
+ }
+ if (p == NULL) {
+ _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large);
+ }
+ return p;
+}
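+
+// The `large_page_try_ok` counter above implements a simple failure back-off:
+// after one failed large-page attempt the expensive path is skipped for the
+// next N allocations. The same pattern, distilled with C11 atomics
+// (illustrative sketch; `expensive_alloc` is a hypothetical operation):
+//
+//   #include <stdatomic.h>
+//   #include <stdbool.h>
+//   static _Atomic size_t skip_count;
+//   static bool try_expensive(void) {
+//     size_t n = atomic_load(&skip_count);
+//     if (n > 0) {  // still backing off: decrement and take the cheap path
+//       atomic_compare_exchange_strong(&skip_count, &n, n - 1);
+//       return false;
+//     }
+//     if (!expensive_alloc()) {
+//       atomic_store(&skip_count, 10);  // skip the next 10 attempts
+//       return false;
+//     }
+//     return true;
+//   }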
+
+/* -----------------------------------------------------------
+ Raw allocation using `sbrk` or `wasm_memory_grow`
+-------------------------------------------------------------- */
+
+#elif defined(MI_USE_SBRK) || defined(__wasi__)
+#if defined(MI_USE_SBRK)
+ static void* mi_memory_grow( size_t size ) {
+ void* p = sbrk(size);
+ if (p == (void*)(-1)) return NULL;
+ #if !defined(__wasi__) // on wasi this is always zero initialized already (?)
+ memset(p,0,size);
+ #endif
+ return p;
+ }
+#elif defined(__wasi__)
+ static void* mi_memory_grow( size_t size ) {
+ size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
+ : __builtin_wasm_memory_size(0));
+ if (base == SIZE_MAX) return NULL;
+ return (void*)(base * _mi_os_page_size());
+ }
+#endif
+
+#if defined(MI_USE_PTHREADS)
+static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+static void* mi_heap_grow(size_t size, size_t try_alignment) {
+ void* p = NULL;
+ if (try_alignment <= 1) {
+ // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
+ #if defined(MI_USE_PTHREADS)
+ pthread_mutex_lock(&mi_heap_grow_mutex);
+ #endif
+ p = mi_memory_grow(size);
+ #if defined(MI_USE_PTHREADS)
+ pthread_mutex_unlock(&mi_heap_grow_mutex);
+ #endif
+ }
+ else {
+ void* base = NULL;
+ size_t alloc_size = 0;
+ // to allocate aligned use a lock to try to avoid thread interaction
+ // between getting the current size and actual allocation
+ // (also, `sbrk` is not thread safe in general)
+ #if defined(MI_USE_PTHREADS)
+ pthread_mutex_lock(&mi_heap_grow_mutex);
+ #endif
+ {
+ void* current = mi_memory_grow(0); // get current size
+ if (current != NULL) {
+ void* aligned_current = mi_align_up_ptr(current, try_alignment); // and align from there to minimize wasted space
+ alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
+ base = mi_memory_grow(alloc_size);
+ }
+ }
+ #if defined(MI_USE_PTHREADS)
+ pthread_mutex_unlock(&mi_heap_grow_mutex);
+ #endif
+ if (base != NULL) {
+ p = mi_align_up_ptr(base, try_alignment);
+ if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
+ // another thread used wasm_memory_grow/sbrk in-between and we do not have enough
+ // space after alignment. Give up (and waste the space as we cannot shrink :-( )
+ // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
+ p = NULL;
+ }
+ }
+ }
+ if (p == NULL) {
+ _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);
+ errno = ENOMEM;
+ return NULL;
+ }
+ mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
+ return p;
+}
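+
+// Worked example (illustrative): if the current break is 0x13000 and a caller
+// asks for size 0x8000 at alignment 0x10000, then aligned_current becomes
+// 0x20000 and alloc_size = (0x20000 - 0x13000) + 0x8000 = 0x15000 (already
+// page aligned); after growing, p == 0x20000 and the 0xD000 bytes of padding
+// in front are wasted (sbrk memory cannot be returned).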
+
+/* -----------------------------------------------------------
+ Raw allocation on Unix systems (mmap)
+-------------------------------------------------------------- */
+#else
+#define MI_OS_USE_MMAP
+static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
+ MI_UNUSED(try_alignment);
+ #if defined(MAP_ALIGNED) // BSD
+ if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+ size_t n = mi_bsr(try_alignment);
+ if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB
+ flags |= MAP_ALIGNED(n);
+ void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
+ if (p!=MAP_FAILED) return p;
+ // fall back to regular mmap
+ }
+ }
+ #elif defined(MAP_ALIGN) // Solaris
+ if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+ void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment
+ if (p!=MAP_FAILED) return p;
+ // fall back to regular mmap
+ }
+ #endif
+ #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
+ // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
+ if (addr == NULL) {
+ void* hint = mi_os_get_aligned_hint(try_alignment, size);
+ if (hint != NULL) {
+ void* p = mmap(hint, size, protect_flags, flags, fd, 0);
+ if (p!=MAP_FAILED) return p;
+ // fall back to regular mmap
+ }
+ }
+ #endif
+ // regular mmap
+ void* p = mmap(addr, size, protect_flags, flags, fd, 0);
+ if (p!=MAP_FAILED) return p;
+ // failed to allocate
+ return NULL;
+}
+
+static int mi_unix_mmap_fd(void) {
+#if defined(VM_MAKE_TAG)
+ // macOS: tracking anonymous pages with a specific ID. (All IDs up to 98 are taken officially but the LLVM sanitizers took 99.)
+ int os_tag = (int)mi_option_get(mi_option_os_tag);
+ if (os_tag < 100 || os_tag > 255) os_tag = 100;
+ return VM_MAKE_TAG(os_tag);
+#else
+ return -1;
+#endif
+}
+
+static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
+ void* p = NULL;
+ #if !defined(MAP_ANONYMOUS)
+ #define MAP_ANONYMOUS MAP_ANON
+ #endif
+ #if !defined(MAP_NORESERVE)
+ #define MAP_NORESERVE 0
+ #endif
+ const int fd = mi_unix_mmap_fd();
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+ if (_mi_os_has_overcommit()) {
+ flags |= MAP_NORESERVE;
+ }
+ #if defined(PROT_MAX)
+ protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
+ #endif
+ // huge page allocation
+ if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) {
+ static _Atomic(size_t) large_page_try_ok; // = 0;
+ size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+ if (!large_only && try_ok > 0) {
+ // If the OS is not configured for large OS pages, or the user does not have
+ // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
+ // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
+ // to avoid too many failing calls to mmap.
+ mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+ }
+ else {
+ int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux
+ int lfd = fd;
+ #ifdef MAP_ALIGNED_SUPER
+ lflags |= MAP_ALIGNED_SUPER;
+ #endif
+ #ifdef MAP_HUGETLB
+ lflags |= MAP_HUGETLB;
+ #endif
+ #ifdef MAP_HUGE_1GB
+ static bool mi_huge_pages_available = true;
+ if ((size % MI_GiB) == 0 && mi_huge_pages_available) {
+ lflags |= MAP_HUGE_1GB;
+ }
+ else
+ #endif
+ {
+ #ifdef MAP_HUGE_2MB
+ lflags |= MAP_HUGE_2MB;
+ #endif
+ }
+ #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
+ lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+ #endif
+ if (large_only || lflags != flags) {
+ // try large OS page allocation
+ *is_large = true;
+ p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
+ #ifdef MAP_HUGE_1GB
+ if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
+ mi_huge_pages_available = false; // don't try huge 1GiB pages again
+ _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno);
+ lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
+ p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
+ }
+ #endif
+ if (large_only) return p;
+ if (p == NULL) {
+ mi_atomic_store_release(&large_page_try_ok, (size_t)8); // on error, don't try again for the next N allocations
+ }
+ }
+ }
+ }
+ // regular allocation
+ if (p == NULL) {
+ *is_large = false;
+ p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
+ if (p != NULL) {
+ #if defined(MADV_HUGEPAGE)
+ // Many Linux systems don't allow MAP_HUGETLB but instead support
+ // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGEPAGE
+ // though, since properly aligned allocations will already use large pages if available
+ // in that case -- in particular for our large regions (in `memory.c`).
+ // However, some systems only allow THP if called with explicit `madvise`, so
+ // when large OS pages are enabled for mimalloc, we call `madvise` anyway.
+ if (allow_large && use_large_os_page(size, try_alignment)) {
+ if (mi_madvise(p, size, MADV_HUGEPAGE) == 0) {
+ *is_large = true; // possibly
+ };
+ }
+ #elif defined(__sun)
+ if (allow_large && use_large_os_page(size, try_alignment)) {
+ struct memcntl_mha cmd = {0};
+ cmd.mha_pagesize = large_os_page_size;
+ cmd.mha_cmd = MHA_MAPSIZE_VA;
+ if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
+ *is_large = true;
+ }
+ }
+ #endif
+ }
+ }
+ if (p == NULL) {
+ _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large);
+ }
+ return p;
+}
+#endif
+
+
+/* -----------------------------------------------------------
+ Primitive allocation from the OS.
+-------------------------------------------------------------- */
+
+// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
+static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+ mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+ if (size == 0) return NULL;
+ if (!commit) allow_large = false;
+ if (try_alignment == 0) try_alignment = 1; // avoid 0 to ensure there will be no divide by zero when aligning
+
+ void* p = NULL;
+ /*
+ if (commit && allow_large) {
+ p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment);
+ if (p != NULL) {
+ *is_large = true;
+ return p;
+ }
+ }
+ */
+
+ #if defined(_WIN32)
+ int flags = MEM_RESERVE;
+ if (commit) { flags |= MEM_COMMIT; }
+ p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
+ #elif defined(MI_USE_SBRK) || defined(__wasi__)
+ MI_UNUSED(allow_large);
+ *is_large = false;
+ p = mi_heap_grow(size, try_alignment);
+ #else
+ int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
+ p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
+ #endif
+ mi_stat_counter_increase(stats->mmap_calls, 1);
+ if (p != NULL) {
+ _mi_stat_increase(&stats->reserved, size);
+ if (commit) { _mi_stat_increase(&stats->committed, size); }
+ }
+ return p;
+}
+
+
+// Primitive aligned allocation from the OS.
+// This function guarantees the allocated memory is aligned.
+static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+ mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0));
+ mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+ mi_assert_internal(is_large != NULL);
+ if (!commit) allow_large = false;
+ if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
+ size = _mi_align_up(size, _mi_os_page_size());
+
+ // try first with a hint (this will be aligned directly on Win 10+ or BSD)
+ void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats);
+ if (p == NULL) return NULL;
+
+ // if not aligned, free it, overallocate, and unmap around it
+ if (((uintptr_t)p % alignment != 0)) {
+ mi_os_mem_free(p, size, commit, stats);
+ _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (%zu bytes, address: %p, alignment: %zu, commit: %d)\n", size, p, alignment, commit);
+ if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
+ const size_t over_size = size + alignment;
+
+#if _WIN32
+ // over-allocate uncommitted (virtual) memory
+ p = mi_os_mem_alloc(over_size, 0 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, stats);
+ if (p == NULL) return NULL;
+
+ // set p to the aligned part in the full region
+ // note: this is dangerous on Windows as VirtualFree needs the actual region pointer
+ // but in mi_os_mem_free we handle this (hopefully exceptional) situation.
+ p = mi_align_up_ptr(p, alignment);
+
+ // explicitly commit only the aligned part
+ if (commit) {
+ _mi_os_commit(p, size, NULL, stats);
+ }
+#else
+ // overallocate...
+ p = mi_os_mem_alloc(over_size, 1, commit, false, is_large, stats);
+ if (p == NULL) return NULL;
+ // and selectively unmap parts around the over-allocated area. (noop on sbrk)
+ void* aligned_p = mi_align_up_ptr(p, alignment);
+ size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
+ size_t mid_size = _mi_align_up(size, _mi_os_page_size());
+ size_t post_size = over_size - pre_size - mid_size;
+ mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size);
+ if (pre_size > 0) mi_os_mem_free(p, pre_size, commit, stats);
+ if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats);
+ // we can return the aligned pointer on `mmap` (and sbrk) systems
+ p = aligned_p;
+#endif
+ }
+
+ mi_assert_internal(p == NULL || (p != NULL && ((uintptr_t)p % alignment) == 0));
+ return p;
+}
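+
+/* Worked example of the fallback path above (hypothetical addresses, assuming a
+   4KiB OS page size): for size = 0x10000 and alignment = 0x100000, suppose the
+   over-allocation of over_size = 0x110000 returns p = 0x7f0000345000. Then:
+     aligned_p = 0x7f0000400000
+     pre_size  = 0x7f0000400000 - 0x7f0000345000 = 0xbb000  (freed)
+     mid_size  = 0x10000                                    (returned)
+     post_size = 0x110000 - 0xbb000 - 0x10000 = 0x45000     (freed)
+   so exactly `size` bytes remain mapped at the requested alignment. */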
+
+
+/* -----------------------------------------------------------
+ OS API: alloc, free, alloc_aligned
+----------------------------------------------------------- */
+
+void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
+ MI_UNUSED(tld_stats);
+ mi_stats_t* stats = &_mi_stats_main;
+ if (size == 0) return NULL;
+ size = _mi_os_good_alloc_size(size);
+ bool is_large = false;
+ return mi_os_mem_alloc(size, 0, true, false, &is_large, stats);
+}
+
+void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) {
+ MI_UNUSED(tld_stats);
+ mi_stats_t* stats = &_mi_stats_main;
+ if (size == 0 || p == NULL) return;
+ size = _mi_os_good_alloc_size(size);
+ mi_os_mem_free(p, size, was_committed, stats);
+}
+
+void _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
+ _mi_os_free_ex(p, size, true, stats);
+}
+
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
+{
+ MI_UNUSED(&mi_os_get_aligned_hint); // suppress unused warnings
+ MI_UNUSED(tld_stats);
+ if (size == 0) return NULL;
+ size = _mi_os_good_alloc_size(size);
+ alignment = _mi_align_up(alignment, _mi_os_page_size());
+ bool allow_large = false;
+ if (large != NULL) {
+ allow_large = *large;
+ *large = false;
+ }
+ return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), &_mi_stats_main /*tld->stats*/ );
+}
+
+
+
+/* -----------------------------------------------------------
+ OS memory API: reset, commit, decommit, protect, unprotect.
+----------------------------------------------------------- */
+
+
+// OS page align within a given area, either conservative (pages inside the area only),
+// or not (straddling pages outside the area is possible)
+static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) {
+ mi_assert(addr != NULL && size > 0);
+ if (newsize != NULL) *newsize = 0;
+ if (size == 0 || addr == NULL) return NULL;
+
+ // page align conservatively within the range
+ void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size())
+ : mi_align_down_ptr(addr, _mi_os_page_size()));
+ void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
+ : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
+ ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
+ if (diff <= 0) return NULL;
+
+ mi_assert_internal((conservative && (size_t)diff <= size) || (!conservative && (size_t)diff >= size));
+ if (newsize != NULL) *newsize = (size_t)diff;
+ return start;
+}
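+
+/* Worked example (hypothetical values, assuming a 4KiB OS page size): for
+   addr = 0x10100 and size = 0x3000, the conservative variant yields
+   start = 0x11000 and end = 0x13000 (newsize = 0x2000 <= size), while the
+   liberal variant yields start = 0x10000 and end = 0x14000
+   (newsize = 0x4000 >= size), matching the assertion above. */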
+
+static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* newsize) {
+ return mi_os_page_align_areax(true, addr, size, newsize);
+}
+
+static void mi_mprotect_hint(int err) {
+#if defined(MI_OS_USE_MMAP) && (MI_SECURE>=2) // guard page around every mimalloc page
+ if (err == ENOMEM) {
+ _mi_warning_message("the previous warning may have been caused by a low memory map limit.\n"
+ " On Linux this is controlled by the vm.max_map_count. For example:\n"
+ " > sudo sysctl -w vm.max_map_count=262144\n");
+ }
+#else
+ MI_UNUSED(err);
+#endif
+}
+
+// Commit/Decommit memory.
+// Usually commit is aligned liberally, while decommit is aligned conservatively
+// (but not for the reset version, where we want commit to be conservative as well).
+static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservative, bool* is_zero, mi_stats_t* stats) {
+ // page align in the range, commit liberally, decommit conservative
+ if (is_zero != NULL) { *is_zero = false; }
+ size_t csize;
+ void* start = mi_os_page_align_areax(conservative, addr, size, &csize);
+ if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr))
+ int err = 0;
+ if (commit) {
+ _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit
+ _mi_stat_counter_increase(&stats->commit_calls, 1);
+ }
+ else {
+ _mi_stat_decrease(&stats->committed, size);
+ }
+
+ #if defined(_WIN32)
+ if (commit) {
+ // *is_zero = true; // note: if the memory was already committed, the call succeeds but the memory is not zero'd
+ void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE);
+ err = (p == start ? 0 : GetLastError());
+ }
+ else {
+ BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT);
+ err = (ok ? 0 : GetLastError());
+ }
+ #elif defined(__wasi__)
+ // WebAssembly guests can't control memory protection
+ #elif 0 && defined(MAP_FIXED) && !defined(__APPLE__)
+ // Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?)
+ if (commit) {
+ // commit: just change the protection
+ err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
+ if (err != 0) { err = errno; }
+ }
+ else {
+ // decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss)
+ const int fd = mi_unix_mmap_fd();
+ void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
+ if (p != start) { err = errno; }
+ }
+ #else
+ // Linux, macOSX and others.
+ if (commit) {
+ // commit: ensure we can access the area
+ err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
+ if (err != 0) { err = errno; }
+ }
+ else {
+ #if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0
+ // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
+ // (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( )
+ err = madvise(start, csize, MADV_DONTNEED);
+ #else
+ // decommit: just disable access (also used in debug and secure mode to trap on illegal access)
+ err = mprotect(start, csize, PROT_NONE);
+ if (err != 0) { err = errno; }
+ #endif
+ //#if defined(MADV_FREE_REUSE)
+ // while ((err = mi_madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
+ //#endif
+ }
+ #endif
+ if (err != 0) {
+ _mi_warning_message("%s error: start: %p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
+ mi_mprotect_hint(err);
+ }
+ mi_assert_internal(err == 0);
+ return (err == 0);
+}
+
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
+ MI_UNUSED(tld_stats);
+ mi_stats_t* stats = &_mi_stats_main;
+ return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
+}
+
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
+ MI_UNUSED(tld_stats);
+ mi_stats_t* stats = &_mi_stats_main;
+ bool is_zero;
+ return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
+}
+
+/*
+static bool mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
+ return mi_os_commitx(addr, size, true, true // conservative
+ , is_zero, stats);
+}
+*/
+
+// Signal to the OS that the address range is no longer in use
+// but may be used later again. This will release physical memory
+// pages and reduce swapping while keeping the memory committed.
+// We page align to a conservative area inside the range to reset.
+static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) {
+ // page align conservatively within the range
+ size_t csize;
+ void* start = mi_os_page_align_area_conservative(addr, size, &csize);
+ if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr)
+ if (reset) _mi_stat_increase(&stats->reset, csize);
+ else _mi_stat_decrease(&stats->reset, csize);
+ if (!reset) return true; // nothing to do on unreset!
+
+ #if (MI_DEBUG>1)
+ if (MI_SECURE==0) {
+ memset(start, 0, csize); // pretend it is eagerly reset
+ }
+ #endif
+
+#if defined(_WIN32)
+ // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory
+ void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
+ mi_assert_internal(p == start);
+ #if 1
+ if (p == start && start != NULL) {
+ VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set
+ }
+ #endif
+ if (p != start) return false;
+#else
+#if defined(MADV_FREE)
+ static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
+ int oadvice = (int)mi_atomic_load_relaxed(&advice);
+ int err;
+ while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0; };
+ if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
+ // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
+ mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
+ err = mi_madvise(start, csize, MADV_DONTNEED);
+ }
+#elif defined(__wasi__)
+ int err = 0;
+#else
+ int err = mi_madvise(start, csize, MADV_DONTNEED);
+#endif
+ if (err != 0) {
+ _mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, errno);
+ }
+ //mi_assert(err == 0);
+ if (err != 0) return false;
+#endif
+ return true;
+}
+
+// Signal to the OS that the address range is no longer in use
+// but may be used later again. This will release physical memory
+// pages and reduce swapping while keeping the memory committed.
+// We page align to a conservative area inside the range to reset.
+bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
+ MI_UNUSED(tld_stats);
+ mi_stats_t* stats = &_mi_stats_main;
+ return mi_os_resetx(addr, size, true, stats);
+}
+
+/*
+bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
+ MI_UNUSED(tld_stats);
+ mi_stats_t* stats = &_mi_stats_main;
+ if (mi_option_is_enabled(mi_option_reset_decommits)) {
+ return mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!)
+ }
+ else {
+ *is_zero = false;
+ return mi_os_resetx(addr, size, false, stats);
+ }
+}
+*/
+
+// Protect a region in memory to be not accessible.
+static bool mi_os_protectx(void* addr, size_t size, bool protect) {
+ // page align conservatively within the range
+ size_t csize = 0;
+ void* start = mi_os_page_align_area_conservative(addr, size, &csize);
+ if (csize == 0) return false;
+ /*
+ if (_mi_os_is_huge_reserved(addr)) {
+ _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
+ }
+ */
+ int err = 0;
+#ifdef _WIN32
+ DWORD oldprotect = 0;
+ BOOL ok = VirtualProtect(start, csize, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect);
+ err = (ok ? 0 : GetLastError());
+#elif defined(__wasi__)
+ err = 0;
+#else
+ err = mprotect(start, csize, protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
+ if (err != 0) { err = errno; }
+#endif
+ if (err != 0) {
+ _mi_warning_message("mprotect error: start: %p, csize: 0x%zx, err: %i\n", start, csize, err);
+ mi_mprotect_hint(err);
+ }
+ return (err == 0);
+}
+
+bool _mi_os_protect(void* addr, size_t size) {
+ return mi_os_protectx(addr, size, true);
+}
+
+bool _mi_os_unprotect(void* addr, size_t size) {
+ return mi_os_protectx(addr, size, false);
+}
+
+
+
+bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
+ // page align conservatively within the range
+ mi_assert_internal(oldsize > newsize && p != NULL);
+ if (oldsize < newsize || p == NULL) return false;
+ if (oldsize == newsize) return true;
+
+ // oldsize and newsize should be page aligned or we cannot shrink precisely
+ void* addr = (uint8_t*)p + newsize;
+ size_t size = 0;
+ void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size);
+ if (size == 0 || start != addr) return false;
+
+#ifdef _WIN32
+ // we cannot shrink on windows, but we can decommit
+ return _mi_os_decommit(start, size, stats);
+#else
+ return mi_os_mem_free(start, size, true, stats);
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------
+Support for allocating huge OS pages (1GiB) that are reserved up-front
+and possibly associated with a specific NUMA node. (use `numa_node>=0`)
+-----------------------------------------------------------------------------*/
+#define MI_HUGE_OS_PAGE_SIZE (MI_GiB)
+
+#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8)
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
+{
+ mi_assert_internal(size%MI_GiB == 0);
+ mi_assert_internal(addr != NULL);
+ const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
+
+ mi_win_enable_large_os_pages();
+
+ MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} };
+  // on modern Windows, try to use NtAllocateVirtualMemoryEx for 1GiB huge pages
+ static bool mi_huge_pages_available = true;
+ if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
+ params[0].Type.Type = MiMemExtendedParameterAttributeFlags;
+ params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
+ ULONG param_count = 1;
+ if (numa_node >= 0) {
+ param_count++;
+ params[1].Type.Type = MiMemExtendedParameterNumaNode;
+ params[1].Arg.ULong = (unsigned)numa_node;
+ }
+ SIZE_T psize = size;
+ void* base = addr;
+ NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
+ if (err == 0 && base != NULL) {
+ return base;
+ }
+ else {
+ // fall back to regular large pages
+ mi_huge_pages_available = false; // don't try further huge pages
+ _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
+ }
+ }
+  // on modern Windows, try to use VirtualAlloc2 for NUMA-aware large OS page allocation
+ if (pVirtualAlloc2 != NULL && numa_node >= 0) {
+ params[0].Type.Type = MiMemExtendedParameterNumaNode;
+ params[0].Arg.ULong = (unsigned)numa_node;
+ return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
+ }
+
+  // otherwise use a regular VirtualAlloc on older Windows
+ return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}
+
+#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__)
+#include <sys/syscall.h>
+#ifndef MPOL_PREFERRED
+#define MPOL_PREFERRED 1
+#endif
+#if defined(SYS_mbind)
+static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+ return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
+}
+#else
+static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+ MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
+ return 0;
+}
+#endif
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
+ mi_assert_internal(size%MI_GiB == 0);
+ bool is_large = true;
+ void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
+ if (p == NULL) return NULL;
+ if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
+ unsigned long numa_mask = (1UL << numa_node);
+ // TODO: does `mbind` work correctly for huge OS pages? should we
+ // use `set_mempolicy` before calling mmap instead?
+ // see: <https://lkml.org/lkml/2017/2/9/875>
+ long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
+ if (err != 0) {
+ _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno));
+ }
+ }
+ return p;
+}
+#else
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
+ MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+ return NULL;
+}
+#endif
+
+#if (MI_INTPTR_SIZE >= 8)
+// To ensure proper alignment, use our own area for huge OS pages
+static mi_decl_cache_align _Atomic(uintptr_t) mi_huge_start; // = 0
+
+// Claim an aligned address range for huge pages
+static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
+ if (total_size != NULL) *total_size = 0;
+ const size_t size = pages * MI_HUGE_OS_PAGE_SIZE;
+
+ uintptr_t start = 0;
+ uintptr_t end = 0;
+ uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start);
+ do {
+ start = huge_start;
+ if (start == 0) {
+ // Initialize the start address after the 32TiB area
+ start = ((uintptr_t)32 << 40); // 32TiB virtual start address
+#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode
+ uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+ start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB
+#endif
+ }
+ end = start + size;
+ mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
+ } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
+
+ if (total_size != NULL) *total_size = size;
+ return (uint8_t*)start;
+}
+#else
+static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
+ MI_UNUSED(pages);
+ if (total_size != NULL) *total_size = 0;
+ return NULL;
+}
+#endif
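+
+// Illustrative claim sequence for the 64-bit path above (hypothetical values):
+// with a randomized start of 32TiB + 0x123 * 1GiB, a first claim of 4 pages
+// returns that address and advances `mi_huge_start` by 4GiB; a concurrent
+// claimer loses the CAS, reloads the new start, and receives the next
+// disjoint 1GiB-aligned range.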
+
+// Allocate MI_SEGMENT_SIZE aligned huge pages
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) {
+ if (psize != NULL) *psize = 0;
+ if (pages_reserved != NULL) *pages_reserved = 0;
+ size_t size = 0;
+ uint8_t* start = mi_os_claim_huge_pages(pages, &size);
+  if (start == NULL) return NULL; // claiming failed (always the case on 32-bit systems)
+
+  // Allocate one page at a time but try to place them contiguously.
+  // We allocate one page at a time to be able to abort if it takes too long,
+  // or to at least allocate as many pages as are available on the system.
+ mi_msecs_t start_t = _mi_clock_start();
+ size_t page;
+ for (page = 0; page < pages; page++) {
+ // allocate a page
+ void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE);
+ void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node);
+
+ // Did we succeed at a contiguous address?
+ if (p != addr) {
+ // no success, issue a warning and break
+ if (p != NULL) {
+ _mi_warning_message("could not allocate contiguous huge page %zu at %p\n", page, addr);
+ _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
+ }
+ break;
+ }
+
+ // success, record it
+ _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
+ _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
+
+ // check for timeout
+ if (max_msecs > 0) {
+ mi_msecs_t elapsed = _mi_clock_end(start_t);
+ if (page >= 1) {
+ mi_msecs_t estimate = ((elapsed / (page+1)) * pages);
+ if (estimate > 2*max_msecs) { // seems like we are going to timeout, break
+ elapsed = max_msecs + 1;
+ }
+ }
+ if (elapsed > max_msecs) {
+ _mi_warning_message("huge page allocation timed out\n");
+ break;
+ }
+ }
+ }
+ mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
+ if (pages_reserved != NULL) { *pages_reserved = page; }
+ if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
+ return (page == 0 ? NULL : start);
+}
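+
+// Worked example of the timeout estimate above (hypothetical timings): with
+// pages = 16 and max_msecs = 1500, if the elapsed time is 1000ms after the
+// 4th page (page == 3), then estimate = (1000 / 4) * 16 = 4000ms; since
+// 4000 > 2 * 1500, elapsed is forced to 1501 and the loop breaks with a
+// warning, keeping the pages allocated so far.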
+
+// free every huge page in a range individually (as we allocated per page)
+// note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems.
+void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
+ if (p==NULL || size==0) return;
+ uint8_t* base = (uint8_t*)p;
+ while (size >= MI_HUGE_OS_PAGE_SIZE) {
+ _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats);
+ size -= MI_HUGE_OS_PAGE_SIZE;
+ base += MI_HUGE_OS_PAGE_SIZE;
+ }
+}
+
+/* ----------------------------------------------------------------------------
+Support NUMA aware allocation
+-----------------------------------------------------------------------------*/
+#ifdef _WIN32
+static size_t mi_os_numa_nodex(void) {
+ USHORT numa_node = 0;
+ if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
+ // Extended API is supported
+ MI_PROCESSOR_NUMBER pnum;
+ (*pGetCurrentProcessorNumberEx)(&pnum);
+ USHORT nnode = 0;
+ BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
+ if (ok) numa_node = nnode;
+ }
+ else {
+ // Vista or earlier, use older API that is limited to 64 processors. Issue #277
+ DWORD pnum = GetCurrentProcessorNumber();
+ UCHAR nnode = 0;
+ BOOL ok = GetNumaProcessorNode((UCHAR)pnum, &nnode);
+ if (ok) numa_node = nnode;
+ }
+ return numa_node;
+}
+
+static size_t mi_os_numa_node_countx(void) {
+ ULONG numa_max = 0;
+ GetNumaHighestNodeNumber(&numa_max);
+ // find the highest node number that has actual processors assigned to it. Issue #282
+ while(numa_max > 0) {
+ if (pGetNumaNodeProcessorMaskEx != NULL) {
+ // Extended API is supported
+ GROUP_AFFINITY affinity;
+ if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
+ if (affinity.Mask != 0) break; // found the maximum non-empty node
+ }
+ }
+ else {
+ // Vista or earlier, use older API that is limited to 64 processors.
+ ULONGLONG mask;
+ if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
+ if (mask != 0) break; // found the maximum non-empty node
+ };
+ }
+ // max node was invalid or had no processor assigned, try again
+ numa_max--;
+ }
+ return ((size_t)numa_max + 1);
+}
+#elif defined(__linux__)
+#include <sys/syscall.h> // getcpu
+#include <stdio.h> // access
+
+static size_t mi_os_numa_nodex(void) {
+#ifdef SYS_getcpu
+ unsigned long node = 0;
+ unsigned long ncpu = 0;
+ long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
+ if (err != 0) return 0;
+ return node;
+#else
+ return 0;
+#endif
+}
+static size_t mi_os_numa_node_countx(void) {
+ char buf[128];
+ unsigned node = 0;
+ for(node = 0; node < 256; node++) {
+    // enumerate node entries -- todo: is there a more efficient way to do this? (but ensure there is no allocation)
+ snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1);
+ if (access(buf,R_OK) != 0) break;
+ }
+ return (node+1);
+}
+#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
+static size_t mi_os_numa_nodex(void) {
+ domainset_t dom;
+ size_t node;
+ int policy;
+ if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul;
+ for (node = 0; node < MAXMEMDOM; node++) {
+ if (DOMAINSET_ISSET(node, &dom)) return node;
+ }
+ return 0ul;
+}
+static size_t mi_os_numa_node_countx(void) {
+ size_t ndomains = 0;
+ size_t len = sizeof(ndomains);
+ if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul;
+ return ndomains;
+}
+#elif defined(__DragonFly__)
+static size_t mi_os_numa_nodex(void) {
+ // TODO: DragonFly does not seem to provide any userland means to get this information.
+ return 0ul;
+}
+static size_t mi_os_numa_node_countx(void) {
+ size_t ncpus = 0, nvirtcoresperphys = 0;
+ size_t len = sizeof(size_t);
+ if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul;
+ if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul;
+ return nvirtcoresperphys * ncpus;
+}
+#else
+static size_t mi_os_numa_nodex(void) {
+ return 0;
+}
+static size_t mi_os_numa_node_countx(void) {
+ return 1;
+}
+#endif
+
+_Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count
+
+size_t _mi_os_numa_node_count_get(void) {
+ size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
+ if (count <= 0) {
+ long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
+ if (ncount > 0) {
+ count = (size_t)ncount;
+ }
+ else {
+ count = mi_os_numa_node_countx(); // or detect dynamically
+ if (count == 0) count = 1;
+ }
+ mi_atomic_store_release(&_mi_numa_node_count, count); // save it
+ _mi_verbose_message("using %zd numa regions\n", count);
+ }
+ return count;
+}
+
+int _mi_os_numa_node_get(mi_os_tld_t* tld) {
+ MI_UNUSED(tld);
+ size_t numa_count = _mi_os_numa_node_count();
+ if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
+ // never more than the node count and >= 0
+ size_t numa_node = mi_os_numa_nodex();
+ if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
+ return (int)numa_node;
+}
diff --git a/source/luametatex/source/libraries/mimalloc/src/page-queue.c b/source/luametatex/source/libraries/mimalloc/src/page-queue.c
new file mode 100644
index 000000000..92f933c2a
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/page-queue.c
@@ -0,0 +1,331 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* -----------------------------------------------------------
+ Definition of page queues for each block size
+----------------------------------------------------------- */
+
+#ifndef MI_IN_PAGE_C
+#error "this file should be included from 'page.c'"
+#endif
+
+/* -----------------------------------------------------------
+ Minimal alignment in machine words (i.e. `sizeof(void*)`)
+----------------------------------------------------------- */
+
+#if (MI_MAX_ALIGN_SIZE > 4*MI_INTPTR_SIZE)
+ #error "define alignment for more than 4x word size for this platform"
+#elif (MI_MAX_ALIGN_SIZE > 2*MI_INTPTR_SIZE)
+ #define MI_ALIGN4W // 4 machine words minimal alignment
+#elif (MI_MAX_ALIGN_SIZE > MI_INTPTR_SIZE)
+ #define MI_ALIGN2W // 2 machine words minimal alignment
+#else
+ // ok, default alignment is 1 word
+#endif
+
+
+/* -----------------------------------------------------------
+ Queue query
+----------------------------------------------------------- */
+
+
+static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
+ return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+sizeof(uintptr_t)));
+}
+
+static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
+ return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+(2*sizeof(uintptr_t))));
+}
+
+static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
+ return (pq->block_size > MI_MEDIUM_OBJ_SIZE_MAX);
+}
+
+/* -----------------------------------------------------------
+ Bins
+----------------------------------------------------------- */
+
+// Return the bin for a given allocation size.
+// Returns MI_BIN_HUGE if the size is too large.
+// We use `wsize` for the size in "machine word sizes",
+// i.e. byte size == `wsize*sizeof(void*)`.
+static inline uint8_t mi_bin(size_t size) {
+ size_t wsize = _mi_wsize_from_size(size);
+ uint8_t bin;
+ if (wsize <= 1) {
+ bin = 1;
+ }
+ #if defined(MI_ALIGN4W)
+ else if (wsize <= 4) {
+ bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+ }
+ #elif defined(MI_ALIGN2W)
+ else if (wsize <= 8) {
+ bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+ }
+ #else
+ else if (wsize <= 8) {
+ bin = (uint8_t)wsize;
+ }
+ #endif
+ else if (wsize > MI_MEDIUM_OBJ_WSIZE_MAX) {
+ bin = MI_BIN_HUGE;
+ }
+ else {
+ #if defined(MI_ALIGN4W)
+ if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
+ #endif
+ wsize--;
+ // find the highest bit
+ uint8_t b = (uint8_t)mi_bsr(wsize); // note: wsize != 0
+ // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
+    // - adjust with 3 because we do not round the first 8 sizes
+ // which each get an exact bin
+ bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
+ mi_assert_internal(bin < MI_BIN_HUGE);
+ }
+ mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE);
+ return bin;
+}
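+
+/* Worked example (assuming MI_ALIGN2W on a 64-bit platform, where
+   `_mi_wsize_from_size` rounds the byte size up to whole 8-byte words):
+   mi_bin(48): wsize = 6, rounded to the double word size, so bin = 6.
+   mi_bin(196): wsize = 25; after wsize-- = 24 we get b = mi_bsr(24) = 4 and
+   bin = (4 << 2) + ((24 >> (4 - 2)) & 0x03) - 3 = 16 + 2 - 3 = 15. */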
+
+
+
+/* -----------------------------------------------------------
+ Queue of pages with free blocks
+----------------------------------------------------------- */
+
+uint8_t _mi_bin(size_t size) {
+ return mi_bin(size);
+}
+
+size_t _mi_bin_size(uint8_t bin) {
+ return _mi_heap_empty.pages[bin].block_size;
+}
+
+// Good size for allocation
+size_t mi_good_size(size_t size) mi_attr_noexcept {
+ if (size <= MI_MEDIUM_OBJ_SIZE_MAX) {
+ return _mi_bin_size(mi_bin(size));
+ }
+ else {
+ return _mi_align_up(size,_mi_os_page_size());
+ }
+}
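+
+/*
+// Usage sketch (illustrative, not part of this file): rounding a length up
+// front so the allocation wastes no space within its size class:
+//   size_t n = mi_good_size(len);
+//   void* p = mi_malloc(n);  // lands in the same bin as mi_malloc(len)
+*/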
+
+#if (MI_DEBUG>1)
+static bool mi_page_queue_contains(mi_page_queue_t* queue, const mi_page_t* page) {
+ mi_assert_internal(page != NULL);
+ mi_page_t* list = queue->first;
+ while (list != NULL) {
+ mi_assert_internal(list->next == NULL || list->next->prev == list);
+ mi_assert_internal(list->prev == NULL || list->prev->next == list);
+ if (list == page) break;
+ list = list->next;
+ }
+ return (list == page);
+}
+
+#endif
+
+#if (MI_DEBUG>1)
+static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* pq) {
+ return (pq >= &heap->pages[0] && pq <= &heap->pages[MI_BIN_FULL]);
+}
+#endif
+
+static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
+ uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
+ mi_heap_t* heap = mi_page_heap(page);
+ mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL);
+ mi_page_queue_t* pq = &heap->pages[bin];
+ mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size);
+ mi_assert_expensive(mi_page_queue_contains(pq, page));
+ return pq;
+}
+
+static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
+ uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
+ mi_assert_internal(bin <= MI_BIN_FULL);
+ mi_page_queue_t* pq = &heap->pages[bin];
+ mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size);
+ return pq;
+}
+
+// The direct small page array is there for efficiency: for each small
+// size (up to MI_SMALL_SIZE_MAX) it points directly to the page for that
+// size without having to compute the bin. This means that when the
+// current free page queue is updated for a small bin, we need to update
+// a range of entries in `pages_free_direct`.
+static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_queue_t* pq) {
+ mi_assert_internal(mi_heap_contains_queue(heap,pq));
+ size_t size = pq->block_size;
+ if (size > MI_SMALL_SIZE_MAX) return;
+
+ mi_page_t* page = pq->first;
+ if (pq->first == NULL) page = (mi_page_t*)&_mi_page_empty;
+
+ // find index in the right direct page array
+ size_t start;
+ size_t idx = _mi_wsize_from_size(size);
+ mi_page_t** pages_free = heap->pages_free_direct;
+
+ if (pages_free[idx] == page) return; // already set
+
+ // find start slot
+ if (idx<=1) {
+ start = 0;
+ }
+ else {
+    // find the previous size; due to minimal alignment up to 3 previous bins may need to be skipped
+ uint8_t bin = mi_bin(size);
+ const mi_page_queue_t* prev = pq - 1;
+ while( bin == mi_bin(prev->block_size) && prev > &heap->pages[0]) {
+ prev--;
+ }
+ start = 1 + _mi_wsize_from_size(prev->block_size);
+ if (start > idx) start = idx;
+ }
+
+ // set size range to the right page
+ mi_assert(start <= idx);
+ for (size_t sz = start; sz <= idx; sz++) {
+ pages_free[sz] = page;
+ }
+}
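+
+/* Illustrative update (assuming MI_ALIGN2W on 64-bit, with queues of 48, 56,
+   and 64 bytes): when the 64-byte queue (idx = 8) gets a new first page, the
+   loop above skips the 56-byte queue (same bin) and stops at the 48-byte
+   queue, so start = 1 + 6 = 7 and pages_free_direct[7..8] are both
+   redirected to the new page. */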
+
+/*
+static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
+ return (queue->first == NULL);
+}
+*/
+
+static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
+ mi_assert_internal(page != NULL);
+ mi_assert_expensive(mi_page_queue_contains(queue, page));
+ mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
+ mi_heap_t* heap = mi_page_heap(page);
+
+ if (page->prev != NULL) page->prev->next = page->next;
+ if (page->next != NULL) page->next->prev = page->prev;
+ if (page == queue->last) queue->last = page->prev;
+ if (page == queue->first) {
+ queue->first = page->next;
+ // update first
+ mi_assert_internal(mi_heap_contains_queue(heap, queue));
+ mi_heap_queue_first_update(heap,queue);
+ }
+ heap->page_count--;
+ page->next = NULL;
+ page->prev = NULL;
+ // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
+ mi_page_set_in_full(page,false);
+}
+
+
+static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
+ mi_assert_internal(mi_page_heap(page) == heap);
+ mi_assert_internal(!mi_page_queue_contains(queue, page));
+
+ mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+ mi_assert_internal(page->xblock_size == queue->block_size ||
+ (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX) ||
+ (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
+
+ mi_page_set_in_full(page, mi_page_queue_is_full(queue));
+ // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
+ page->next = queue->first;
+ page->prev = NULL;
+ if (queue->first != NULL) {
+ mi_assert_internal(queue->first->prev == NULL);
+ queue->first->prev = page;
+ queue->first = page;
+ }
+ else {
+ queue->first = queue->last = page;
+ }
+
+ // update direct
+ mi_heap_queue_first_update(heap, queue);
+ heap->page_count++;
+}
+
+
+static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) {
+ mi_assert_internal(page != NULL);
+ mi_assert_expensive(mi_page_queue_contains(from, page));
+ mi_assert_expensive(!mi_page_queue_contains(to, page));
+
+ mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) ||
+ (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) ||
+ (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) ||
+ (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) ||
+ (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to)));
+
+ mi_heap_t* heap = mi_page_heap(page);
+ if (page->prev != NULL) page->prev->next = page->next;
+ if (page->next != NULL) page->next->prev = page->prev;
+ if (page == from->last) from->last = page->prev;
+ if (page == from->first) {
+ from->first = page->next;
+ // update first
+ mi_assert_internal(mi_heap_contains_queue(heap, from));
+ mi_heap_queue_first_update(heap, from);
+ }
+
+ page->prev = to->last;
+ page->next = NULL;
+ if (to->last != NULL) {
+ mi_assert_internal(heap == mi_page_heap(to->last));
+ to->last->next = page;
+ to->last = page;
+ }
+ else {
+ to->first = page;
+ to->last = page;
+ mi_heap_queue_first_update(heap, to);
+ }
+
+ mi_page_set_in_full(page, mi_page_queue_is_full(to));
+}
+
+// Only called from `mi_heap_absorb`.
+size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) {
+ mi_assert_internal(mi_heap_contains_queue(heap,pq));
+ mi_assert_internal(pq->block_size == append->block_size);
+
+ if (append->first==NULL) return 0;
+
+ // set append pages to new heap and count
+ size_t count = 0;
+ for (mi_page_t* page = append->first; page != NULL; page = page->next) {
+    // inline `mi_page_set_heap` to avoid a wrong assertion during absorption;
+    // in this case it is ok to have delayed freeing since both the "to" and "from" heaps are still alive.
+ mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
+ // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
+ // side effect that it spins until any DELAYED_FREEING is finished. This ensures
+ // that after appending only the new heap will be used for delayed free operations.
+ _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
+ count++;
+ }
+
+ if (pq->last==NULL) {
+ // take over afresh
+ mi_assert_internal(pq->first==NULL);
+ pq->first = append->first;
+ pq->last = append->last;
+ mi_heap_queue_first_update(heap, pq);
+ }
+ else {
+ // append to end
+ mi_assert_internal(pq->last!=NULL);
+ mi_assert_internal(append->first!=NULL);
+ pq->last->next = append->first;
+ append->first->prev = pq->last;
+ pq->last = append->last;
+ }
+ return count;
+}
diff --git a/source/luametatex/source/libraries/mimalloc/src/page.c b/source/luametatex/source/libraries/mimalloc/src/page.c
new file mode 100644
index 000000000..fd6c5397d
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/page.c
@@ -0,0 +1,869 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* -----------------------------------------------------------
+ The core of the allocator. Every segment contains
+ pages of a certain block size. The main function
+ exported is `mi_malloc_generic`.
+----------------------------------------------------------- */
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+/* -----------------------------------------------------------
+ Definition of page queues for each block size
+----------------------------------------------------------- */
+
+#define MI_IN_PAGE_C
+#include "page-queue.c"
+#undef MI_IN_PAGE_C
+
+
+/* -----------------------------------------------------------
+ Page helpers
+----------------------------------------------------------- */
+
+// Index a block in a page
+static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t block_size, size_t i) {
+ MI_UNUSED(page);
+ mi_assert_internal(page != NULL);
+ mi_assert_internal(i <= page->reserved);
+ return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
+}
+
+static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
+static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
+
+#if (MI_DEBUG>=3)
+static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
+ size_t count = 0;
+ while (head != NULL) {
+ mi_assert_internal(page == _mi_ptr_page(head));
+ count++;
+ head = mi_block_next(page, head);
+ }
+ return count;
+}
+
+/*
+// Start of the page available memory
+static inline uint8_t* mi_page_area(const mi_page_t* page) {
+ return _mi_page_start(_mi_page_segment(page), page, NULL);
+}
+*/
+
+static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
+ size_t psize;
+ uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize);
+ mi_block_t* start = (mi_block_t*)page_area;
+ mi_block_t* end = (mi_block_t*)(page_area + psize);
+ while(p != NULL) {
+ if (p < start || p >= end) return false;
+ p = mi_block_next(page, p);
+ }
+ return true;
+}
+
+static bool mi_page_is_valid_init(mi_page_t* page) {
+ mi_assert_internal(page->xblock_size > 0);
+ mi_assert_internal(page->used <= page->capacity);
+ mi_assert_internal(page->capacity <= page->reserved);
+
+ mi_segment_t* segment = _mi_page_segment(page);
+ uint8_t* start = _mi_page_start(segment,page,NULL);
+ mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
+ //const size_t bsize = mi_page_block_size(page);
+ //mi_assert_internal(start + page->capacity*page->block_size == page->top);
+
+ mi_assert_internal(mi_page_list_is_valid(page,page->free));
+ mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
+
+ #if MI_DEBUG>3 // generally too expensive to check this
+ if (page->is_zero) {
+ const size_t ubsize = mi_page_usable_block_size(page);
+ for(mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
+ mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
+ }
+ }
+ #endif
+
+ mi_block_t* tfree = mi_page_thread_free(page);
+ mi_assert_internal(mi_page_list_is_valid(page, tfree));
+ //size_t tfree_count = mi_page_list_count(page, tfree);
+ //mi_assert_internal(tfree_count <= page->thread_freed + 1);
+
+ size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free);
+ mi_assert_internal(page->used + free_count == page->capacity);
+
+ return true;
+}
+
+bool _mi_page_is_valid(mi_page_t* page) {
+ mi_assert_internal(mi_page_is_valid_init(page));
+ #if MI_SECURE
+ mi_assert_internal(page->keys[0] != 0);
+ #endif
+ if (mi_page_heap(page)!=NULL) {
+ mi_segment_t* segment = _mi_page_segment(page);
+
+ mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id);
+ if (segment->kind != MI_SEGMENT_HUGE) {
+ mi_page_queue_t* pq = mi_page_queue_of(page);
+ mi_assert_internal(mi_page_queue_contains(pq, page));
+ mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
+ mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
+ }
+ }
+ return true;
+}
+#endif
+
+void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
+ mi_thread_free_t tfreex;
+ mi_delayed_t old_delay;
+ mi_thread_free_t tfree;
+ do {
+ tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
+ tfreex = mi_tf_set_delayed(tfree, delay);
+ old_delay = mi_tf_delayed(tfree);
+ if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) {
+ mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
+ // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
+ }
+ else if (delay == old_delay) {
+ break; // avoid atomic operation if already equal
+ }
+ else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
+ break; // leave never-delayed flag set
+ }
+ } while ((old_delay == MI_DELAYED_FREEING) ||
+ !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+}
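+
+// Illustrative summary of the loop above: MI_NO_DELAYED_FREE and
+// MI_USE_DELAYED_FREE are simply overwritten with `delay`; MI_DELAYED_FREEING
+// makes the loop yield and retry until the outstanding free completes; and
+// MI_NEVER_DELAYED_FREE is left in place unless `override_never` is true.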
+
+/* -----------------------------------------------------------
+ Page collect the `local_free` and `thread_free` lists
+----------------------------------------------------------- */
+
+// Collect the local `thread_free` list using an atomic exchange.
+// Note: The exchange must be done atomically as this is used right after
+// moving to the full list in `mi_page_collect_ex` and we need to
+// ensure that there was no race where the page became unfull just before the move.
+static void _mi_page_thread_free_collect(mi_page_t* page)
+{
+ mi_block_t* head;
+ mi_thread_free_t tfreex;
+ mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
+ do {
+ head = mi_tf_block(tfree);
+ tfreex = mi_tf_set_block(tfree,NULL);
+ } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));
+
+ // return if the list is empty
+ if (head == NULL) return;
+
+ // find the tail -- also to get a proper count (without data races)
+ uint32_t max_count = page->capacity; // cannot collect more than capacity
+ uint32_t count = 1;
+ mi_block_t* tail = head;
+ mi_block_t* next;
+ while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
+ count++;
+ tail = next;
+ }
+ // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
+ if (count > max_count) {
+ _mi_error_message(EFAULT, "corrupted thread-free list\n");
+ return; // the thread-free items cannot be freed
+ }
+
+ // and append the current local free list
+ mi_block_set_next(page,tail, page->local_free);
+ page->local_free = head;
+
+ // update counts now
+ page->used -= count;
+}
+
+void _mi_page_free_collect(mi_page_t* page, bool force) {
+ mi_assert_internal(page!=NULL);
+
+ // collect the thread free list
+ if (force || mi_page_thread_free(page) != NULL) { // quick test to avoid an atomic operation
+ _mi_page_thread_free_collect(page);
+ }
+
+ // and the local free list
+ if (page->local_free != NULL) {
+ if (mi_likely(page->free == NULL)) {
+ // usual case
+ page->free = page->local_free;
+ page->local_free = NULL;
+ page->is_zero = false;
+ }
+ else if (force) {
+ // append -- only on shutdown (force) as this is a linear operation
+ mi_block_t* tail = page->local_free;
+ mi_block_t* next;
+ while ((next = mi_block_next(page, tail)) != NULL) {
+ tail = next;
+ }
+ mi_block_set_next(page, tail, page->free);
+ page->free = page->local_free;
+ page->local_free = NULL;
+ page->is_zero = false;
+ }
+ }
+
+ mi_assert_internal(!force || page->local_free == NULL);
+}
+
+
+
+/* -----------------------------------------------------------
+ Page fresh and retire
+----------------------------------------------------------- */
+
+// called from segments when reclaiming abandoned pages
+void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
+ mi_assert_expensive(mi_page_is_valid_init(page));
+
+ mi_assert_internal(mi_page_heap(page) == heap);
+ mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
+ mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+ mi_assert_internal(!page->is_reset);
+ // TODO: push on full queue immediately if it is full?
+ mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
+ mi_page_queue_push(heap, pq, page);
+ mi_assert_expensive(_mi_page_is_valid(page));
+}
+
+// allocate a fresh page from a segment
+static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size) {
+ mi_assert_internal(pq==NULL||mi_heap_contains_queue(heap, pq));
+ mi_page_t* page = _mi_segment_page_alloc(heap, block_size, &heap->tld->segments, &heap->tld->os);
+ if (page == NULL) {
+ // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue)
+ return NULL;
+ }
+ mi_assert_internal(pq==NULL || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+ mi_page_init(heap, page, block_size, heap->tld);
+ mi_heap_stat_increase(heap, pages, 1);
+ if (pq!=NULL) mi_page_queue_push(heap, pq, page); // huge pages use pq==NULL
+ mi_assert_expensive(_mi_page_is_valid(page));
+ return page;
+}
+
+// Get a fresh page to use
+static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
+ mi_assert_internal(mi_heap_contains_queue(heap, pq));
+ mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size);
+ if (page==NULL) return NULL;
+ mi_assert_internal(pq->block_size==mi_page_block_size(page));
+ mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
+ return page;
+}
+
+/* -----------------------------------------------------------
+ Do any delayed frees
+ (put there by other threads if they deallocated in a full page)
+----------------------------------------------------------- */
+void _mi_heap_delayed_free(mi_heap_t* heap) {
+ // take over the list (note: no atomic exchange since it is often NULL)
+ mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+ while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
+
+ // and free them all
+ while(block != NULL) {
+ mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
+ // use internal free instead of regular one to keep stats etc correct
+ if (!_mi_free_delayed_block(block)) {
+ // we might already start delayed freeing while another thread has not yet
+ // reset the delayed_freeing flag; in that case delay it further by reinserting.
+ mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+ do {
+ mi_block_set_nextx(heap, block, dfree, heap->keys);
+ } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
+ }
+ block = next;
+ }
+}
+
+/* -----------------------------------------------------------
+ Unfull, abandon, free and retire
+----------------------------------------------------------- */
+
+// Move a page from the full list back to a regular list
+void _mi_page_unfull(mi_page_t* page) {
+ mi_assert_internal(page != NULL);
+ mi_assert_expensive(_mi_page_is_valid(page));
+ mi_assert_internal(mi_page_is_in_full(page));
+ if (!mi_page_is_in_full(page)) return;
+
+ mi_heap_t* heap = mi_page_heap(page);
+ mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
+ mi_page_set_in_full(page, false); // to get the right queue
+ mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
+ mi_page_set_in_full(page, true);
+ mi_page_queue_enqueue_from(pq, pqfull, page);
+}
+
+static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
+ mi_assert_internal(pq == mi_page_queue_of(page));
+ mi_assert_internal(!mi_page_immediate_available(page));
+ mi_assert_internal(!mi_page_is_in_full(page));
+
+ if (mi_page_is_in_full(page)) return;
+ mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
+ _mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
+}
+
+
+// Abandon a page with used blocks at the end of a thread.
+// Note: only call if it is ensured that no references exist from
+// the `page->heap->thread_delayed_free` into this page.
+// Currently only called through `mi_heap_collect_ex` which ensures this.
+void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
+ mi_assert_internal(page != NULL);
+ mi_assert_expensive(_mi_page_is_valid(page));
+ mi_assert_internal(pq == mi_page_queue_of(page));
+ mi_assert_internal(mi_page_heap(page) != NULL);
+
+ mi_heap_t* pheap = mi_page_heap(page);
+
+ // remove from our page list
+ mi_segments_tld_t* segments_tld = &pheap->tld->segments;
+ mi_page_queue_remove(pq, page);
+
+ // page is no longer associated with our heap
+ mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+ mi_page_set_heap(page, NULL);
+
+#if MI_DEBUG>1
+ // check there are no references left..
+ for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
+ mi_assert_internal(_mi_ptr_page(block) != page);
+ }
+#endif
+
+ // and abandon it
+ mi_assert_internal(mi_page_heap(page) == NULL);
+ _mi_segment_page_abandon(page,segments_tld);
+}
+
+
+// Free a page with no more free blocks
+void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
+ mi_assert_internal(page != NULL);
+ mi_assert_expensive(_mi_page_is_valid(page));
+ mi_assert_internal(pq == mi_page_queue_of(page));
+ mi_assert_internal(mi_page_all_free(page));
+ mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
+
+ // no more aligned blocks in here
+ mi_page_set_has_aligned(page, false);
+
+ mi_heap_t* heap = mi_page_heap(page);
+
+ // remove from the page list
+ // (no need to do _mi_heap_delayed_free first as all blocks are already free)
+ mi_segments_tld_t* segments_tld = &heap->tld->segments;
+ mi_page_queue_remove(pq, page);
+
+ // and free it
+ mi_page_set_heap(page,NULL);
+ _mi_segment_page_free(page, force, segments_tld);
+}
+
+// Retire parameters
+#define MI_MAX_RETIRE_SIZE MI_MEDIUM_OBJ_SIZE_MAX
+#define MI_RETIRE_CYCLES (8)
+
+// Retire a page with no more used blocks
+// It is important not to retire too quickly though, as new
+// allocations might be coming.
+// Note: called from `mi_free`; benchmarks often trigger this
+// due to freeing everything and then allocating again, so be
+// careful when changing this.
+void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
+ mi_assert_internal(page != NULL);
+ mi_assert_expensive(_mi_page_is_valid(page));
+ mi_assert_internal(mi_page_all_free(page));
+
+ mi_page_set_has_aligned(page, false);
+
+ // don't retire too often..
+ // (or we end up retiring and re-allocating most of the time)
+ // NOTE: refine this more: we should not retire if this
+ // is the only page left with free blocks. It is not clear
+ // how to check this efficiently though...
+ // for now, we don't retire if it is the only page left of this size class.
+ mi_page_queue_t* pq = mi_page_queue_of(page);
+ if (mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_is_in_full(page))) {
+ if (pq->last==page && pq->first==page) { // the only page in the queue?
+ mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
+ page->retire_expire = 1 + (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
+ mi_heap_t* heap = mi_page_heap(page);
+ mi_assert_internal(pq >= heap->pages);
+ const size_t index = pq - heap->pages;
+ mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
+ if (index < heap->page_retired_min) heap->page_retired_min = index;
+ if (index > heap->page_retired_max) heap->page_retired_max = index;
+ mi_assert_internal(mi_page_all_free(page));
+      return; // don't free after all
+ }
+ }
+ _mi_page_free(page, pq, false);
+}
+
+// free retired pages: we don't need to look at the entire queues
+// since we only retire pages that are at the head position in a queue.
+void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
+ size_t min = MI_BIN_FULL;
+ size_t max = 0;
+ for(size_t bin = heap->page_retired_min; bin <= heap->page_retired_max; bin++) {
+ mi_page_queue_t* pq = &heap->pages[bin];
+ mi_page_t* page = pq->first;
+ if (page != NULL && page->retire_expire != 0) {
+ if (mi_page_all_free(page)) {
+ page->retire_expire--;
+ if (force || page->retire_expire == 0) {
+ _mi_page_free(pq->first, pq, force);
+ }
+ else {
+ // keep retired, update min/max
+ if (bin < min) min = bin;
+ if (bin > max) max = bin;
+ }
+ }
+ else {
+ page->retire_expire = 0;
+ }
+ }
+ }
+ heap->page_retired_min = min;
+ heap->page_retired_max = max;
+}
+
+
+/* -----------------------------------------------------------
+ Initialize the initial free list in a page.
+ In secure mode we initialize a randomized list by
+ alternating between slices.
+----------------------------------------------------------- */
+
+#define MI_MAX_SLICE_SHIFT (6) // at most 64 slices
+#define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT)
+#define MI_MIN_SLICES (2)
+
+static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
+ MI_UNUSED(stats);
+ #if (MI_SECURE<=2)
+ mi_assert_internal(page->free == NULL);
+ mi_assert_internal(page->local_free == NULL);
+ #endif
+ mi_assert_internal(page->capacity + extend <= page->reserved);
+ mi_assert_internal(bsize == mi_page_block_size(page));
+ void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
+
+ // initialize a randomized free list
+ // set up `slice_count` slices to alternate between
+ size_t shift = MI_MAX_SLICE_SHIFT;
+ while ((extend >> shift) == 0) {
+ shift--;
+ }
+ const size_t slice_count = (size_t)1U << shift;
+ const size_t slice_extend = extend / slice_count;
+ mi_assert_internal(slice_extend >= 1);
+ mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice
+ size_t counts[MI_MAX_SLICES]; // available objects in the slice
+ for (size_t i = 0; i < slice_count; i++) {
+ blocks[i] = mi_page_block_at(page, page_area, bsize, page->capacity + i*slice_extend);
+ counts[i] = slice_extend;
+ }
+ counts[slice_count-1] += (extend % slice_count); // final slice holds the modulus too (todo: distribute evenly?)
+
+ // and initialize the free list by randomly threading through them
+ // set up first element
+ const uintptr_t r = _mi_heap_random_next(heap);
+ size_t current = r % slice_count;
+ counts[current]--;
+ mi_block_t* const free_start = blocks[current];
+ // and iterate through the rest; use `random_shuffle` for performance
+ uintptr_t rnd = _mi_random_shuffle(r|1); // ensure not 0
+ for (size_t i = 1; i < extend; i++) {
+ // call random_shuffle only every INTPTR_SIZE rounds
+ const size_t round = i%MI_INTPTR_SIZE;
+ if (round == 0) rnd = _mi_random_shuffle(rnd);
+ // select a random next slice index
+ size_t next = ((rnd >> 8*round) & (slice_count-1));
+ while (counts[next]==0) { // ensure it still has space
+ next++;
+ if (next==slice_count) next = 0;
+ }
+ // and link the current block to it
+ counts[next]--;
+ mi_block_t* const block = blocks[current];
+ blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block
+ mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next`
+ current = next;
+ }
+ // prepend to the free list (usually NULL)
+ mi_block_set_next(page, blocks[current], page->free); // end of the list
+ page->free = free_start;
+}
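+// Worked example for the slicing above (illustrative): extending by 20 blocks
+// gives shift = 4, so slice_count = 16 slices of slice_extend = 1 block each,
+// and the final slice absorbs the remainder (counts[15] = 1 + 20 % 16 = 5);
+// the loop then threads all 20 blocks through the slices in random order.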
+
+static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
+{
+ MI_UNUSED(stats);
+ #if (MI_SECURE <= 2)
+ mi_assert_internal(page->free == NULL);
+ mi_assert_internal(page->local_free == NULL);
+ #endif
+ mi_assert_internal(page->capacity + extend <= page->reserved);
+ mi_assert_internal(bsize == mi_page_block_size(page));
+ void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
+
+ mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity);
+
+ // initialize a sequential free list
+ mi_block_t* const last = mi_page_block_at(page, page_area, bsize, page->capacity + extend - 1);
+ mi_block_t* block = start;
+ while(block <= last) {
+ mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
+ mi_block_set_next(page,block,next);
+ block = next;
+ }
+ // prepend to free list (usually `NULL`)
+ mi_block_set_next(page, last, page->free);
+ page->free = start;
+}
+
+/* -----------------------------------------------------------
+ Page initialize and extend the capacity
+----------------------------------------------------------- */
+
+#define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well.
+#if (MI_SECURE>0)
+#define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many
+#else
+#define MI_MIN_EXTEND (1)
+#endif
+
+// Extend the capacity (up to reserved) by initializing a free list.
+// We extend by at most `MI_MAX_EXTEND_SIZE` bytes at a time to avoid touching too much memory.
+// Note: we also experimented with "bump" allocation on the first
+// allocations, but this did not speed up any benchmark (due to an
+// extra test in malloc? or cache effects?)
+static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
+ MI_UNUSED(tld);
+ mi_assert_expensive(mi_page_is_valid_init(page));
+ #if (MI_SECURE<=2)
+ mi_assert(page->free == NULL);
+ mi_assert(page->local_free == NULL);
+ if (page->free != NULL) return;
+ #endif
+ if (page->capacity >= page->reserved) return;
+
+ size_t page_size;
+ _mi_page_start(_mi_page_segment(page), page, &page_size);
+ mi_stat_counter_increase(tld->stats.pages_extended, 1);
+
+ // calculate the extend count
+ const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size);
+ size_t extend = page->reserved - page->capacity;
+ mi_assert_internal(extend > 0);
+
+ size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize);
+ if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; }
+ mi_assert_internal(max_extend > 0);
+
+ if (extend > max_extend) {
+ // ensure we don't touch memory beyond the page to reduce page commit.
+ // the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%.
+ extend = max_extend;
+ }
+
+ mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
+ mi_assert_internal(extend < (1UL<<16));
+
+ // and append the extended blocks to the free list
+ if (extend < MI_MIN_SLICES || MI_SECURE==0) { // sequential extension unless in secure mode
+ mi_page_free_list_extend(page, bsize, extend, &tld->stats );
+ }
+ else {
+ mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats);
+ }
+ // enable the new free list
+ page->capacity += (uint16_t)extend;
+ mi_stat_increase(tld->stats.page_committed, extend * bsize);
+
+ // extension into zero initialized memory preserves the zero'd free list
+ if (!page->is_zero_init) {
+ page->is_zero = false;
+ }
+ mi_assert_expensive(mi_page_is_valid_init(page));
+}
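+// Worked example (illustrative): for 64-byte blocks, max_extend =
+// MI_MAX_EXTEND_SIZE/64 = 64 blocks, so one extension initializes at most
+// 64*64 = 4KiB of free-list memory (about one OS page), keeping the
+// committed working set small for pages that are never fully used.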
+
+// Initialize a fresh page
+static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) {
+ mi_assert(page != NULL);
+ mi_segment_t* segment = _mi_page_segment(page);
+ mi_assert(segment != NULL);
+ mi_assert_internal(block_size > 0);
+ // set fields
+ mi_page_set_heap(page, heap);
+ page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? (uint32_t)block_size : MI_HUGE_BLOCK_SIZE); // initialize before _mi_segment_page_start
+ size_t page_size;
+ _mi_segment_page_start(segment, page, &page_size);
+ mi_assert_internal(mi_page_block_size(page) <= page_size);
+ mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE);
+ mi_assert_internal(page_size / block_size < (1L<<16));
+ page->reserved = (uint16_t)(page_size / block_size);
+ #ifdef MI_ENCODE_FREELIST
+ page->keys[0] = _mi_heap_random_next(heap);
+ page->keys[1] = _mi_heap_random_next(heap);
+ #endif
+ #if MI_DEBUG > 0
+ page->is_zero = false; // ensure in debug mode we initialize with MI_DEBUG_UNINIT, see issue #501
+ #else
+ page->is_zero = page->is_zero_init;
+ #endif
+
+ mi_assert_internal(page->is_committed);
+ mi_assert_internal(!page->is_reset);
+ mi_assert_internal(page->capacity == 0);
+ mi_assert_internal(page->free == NULL);
+ mi_assert_internal(page->used == 0);
+ mi_assert_internal(page->xthread_free == 0);
+ mi_assert_internal(page->next == NULL);
+ mi_assert_internal(page->prev == NULL);
+ mi_assert_internal(page->retire_expire == 0);
+ mi_assert_internal(!mi_page_has_aligned(page));
+ #if (MI_ENCODE_FREELIST)
+ mi_assert_internal(page->keys[0] != 0);
+ mi_assert_internal(page->keys[1] != 0);
+ #endif
+ mi_assert_expensive(mi_page_is_valid_init(page));
+
+ // initialize an initial free list
+ mi_page_extend_free(heap,page,tld);
+ mi_assert(mi_page_immediate_available(page));
+}
+
+
+/* -----------------------------------------------------------
+ Find pages with free blocks
+-------------------------------------------------------------*/
+
+// Find a page with free blocks of `page->block_size`.
+static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
+{
+ // search through the pages in "next fit" order
+ size_t count = 0;
+ mi_page_t* page = pq->first;
+ while (page != NULL)
+ {
+ mi_page_t* next = page->next; // remember next
+ count++;
+
+ // 0. collect freed blocks by us and other threads
+ _mi_page_free_collect(page, false);
+
+ // 1. if the page contains free blocks, we are done
+ if (mi_page_immediate_available(page)) {
+ break; // pick this one
+ }
+
+ // 2. Try to extend
+ if (page->capacity < page->reserved) {
+ mi_page_extend_free(heap, page, heap->tld);
+ mi_assert_internal(mi_page_immediate_available(page));
+ break;
+ }
+
+ // 3. If the page is completely full, move it to the `mi_pages_full`
+ // queue so we don't visit long-lived pages too often.
+ mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
+ mi_page_to_full(page, pq);
+
+ page = next;
+ } // for each page
+
+ mi_heap_stat_counter_increase(heap, searches, count);
+
+ if (page == NULL) {
+ _mi_heap_collect_retired(heap, false); // perhaps make a page available?
+ page = mi_page_fresh(heap, pq);
+ if (page == NULL && first_try) {
+ // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
+ page = mi_page_queue_find_free_ex(heap, pq, false);
+ }
+ }
+ else {
+ mi_assert(pq->first == page);
+ page->retire_expire = 0;
+ }
+ mi_assert_internal(page == NULL || mi_page_immediate_available(page));
+ return page;
+}
+
+
+
+// Find a page with free blocks of `size`.
+static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
+ mi_page_queue_t* pq = mi_page_queue(heap,size);
+ mi_page_t* page = pq->first;
+ if (page != NULL) {
+ #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
+ if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
+ mi_page_extend_free(heap, page, heap->tld);
+ mi_assert_internal(mi_page_immediate_available(page));
+ }
+ else
+ #endif
+ {
+ _mi_page_free_collect(page,false);
+ }
+
+ if (mi_page_immediate_available(page)) {
+ page->retire_expire = 0;
+ return page; // fast path
+ }
+ }
+ return mi_page_queue_find_free_ex(heap, pq, true);
+}
+
+
+/* -----------------------------------------------------------
+ Users can register a deferred free function that is called
+ when the `free` list is empty. Since the `local_free` list
+ is separate, this is called deterministically after a
+ certain number of allocations.
+----------------------------------------------------------- */
+
+static mi_deferred_free_fun* volatile deferred_free = NULL;
+static _Atomic(void*) deferred_arg; // = NULL
+
+void _mi_deferred_free(mi_heap_t* heap, bool force) {
+ heap->tld->heartbeat++;
+ if (deferred_free != NULL && !heap->tld->recurse) {
+ heap->tld->recurse = true;
+ deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg));
+ heap->tld->recurse = false;
+ }
+}
+
+void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept {
+ deferred_free = fn;
+ mi_atomic_store_ptr_release(void,&deferred_arg, arg);
+}
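+// Illustrative usage sketch (not part of this file; assumes the public
+// `mi_deferred_free_fun` signature `(bool force, unsigned long long heartbeat, void* arg)`
+// as used in the call above): a host could release its own caches every 64 heartbeats:
+//
+//   static void host_deferred_free(bool force, unsigned long long heartbeat, void* arg) {
+//     (void)arg; // unused in this sketch
+//     if (force || (heartbeat % 64) == 0) { /* release host-side caches here */ }
+//   }
+//   ...
+//   mi_register_deferred_free(&host_deferred_free, NULL);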
+
+
+/* -----------------------------------------------------------
+ General allocation
+----------------------------------------------------------- */
+
+// Large and huge page allocation.
+// Huge pages are allocated directly without being in a queue.
+// Because huge pages contain just one block, and the segment contains
+// just that page, we always treat them as abandoned and any thread
+// that frees the block can free the whole page and segment directly.
+static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) {
+ size_t block_size = _mi_os_good_alloc_size(size);
+ mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE);
+ bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX);
+ mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size));
+ mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size);
+ if (page != NULL) {
+ mi_assert_internal(mi_page_immediate_available(page));
+
+ if (pq == NULL) {
+ // huge pages are directly abandoned
+ mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
+ mi_assert_internal(_mi_page_segment(page)->used==1);
+ mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
+ mi_page_set_heap(page, NULL);
+ }
+ else {
+ mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+ }
+
+ const size_t bsize = mi_page_usable_block_size(page); // note: not `mi_page_block_size` to account for padding
+ if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+ mi_heap_stat_increase(heap, large, bsize);
+ mi_heap_stat_counter_increase(heap, large_count, 1);
+ }
+ else {
+ mi_heap_stat_increase(heap, huge, bsize);
+ mi_heap_stat_counter_increase(heap, huge_count, 1);
+ }
+ }
+ return page;
+}
+
+
+// Allocate a page
+// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
+static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+ // huge allocation?
+ const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
+ if (mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) )) {
+ if (mi_unlikely(req_size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
+ _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
+ return NULL;
+ }
+ else {
+ return mi_large_huge_page_alloc(heap,size);
+ }
+ }
+ else {
+ // otherwise find a page with free blocks in our size segregated queues
+ mi_assert_internal(size >= MI_PADDING_SIZE);
+ return mi_find_free_page(heap, size);
+ }
+}
+
+// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
+// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
+void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
+{
+ mi_assert_internal(heap != NULL);
+
+ // initialize if necessary
+ if (mi_unlikely(!mi_heap_is_initialized(heap))) {
+ mi_thread_init(); // calls `_mi_heap_init` in turn
+ heap = mi_get_default_heap();
+ if (mi_unlikely(!mi_heap_is_initialized(heap))) { return NULL; }
+ }
+ mi_assert_internal(mi_heap_is_initialized(heap));
+
+ // call potential deferred free routines
+ _mi_deferred_free(heap, false);
+
+ // free delayed frees from other threads
+ _mi_heap_delayed_free(heap);
+
+ // find (or allocate) a page of the right size
+ mi_page_t* page = mi_find_page(heap, size);
+ if (mi_unlikely(page == NULL)) { // first time out of memory, try to collect and retry the allocation once more
+ mi_heap_collect(heap, true /* force */);
+ page = mi_find_page(heap, size);
+ }
+
+ if (mi_unlikely(page == NULL)) { // out of memory
+ const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
+ _mi_error_message(ENOMEM, "unable to allocate memory (%zu bytes)\n", req_size);
+ return NULL;
+ }
+
+ mi_assert_internal(mi_page_immediate_available(page));
+ mi_assert_internal(mi_page_block_size(page) >= size);
+
+ // and try again, this time succeeding! (i.e. this should never recurse)
+ return _mi_page_malloc(heap, page, size);
+}
diff --git a/source/luametatex/source/libraries/mimalloc/src/random.c b/source/luametatex/source/libraries/mimalloc/src/random.c
new file mode 100644
index 000000000..d474a53a0
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/random.c
@@ -0,0 +1,367 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE // for syscall() on Linux
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h> // memset
+
+/* ----------------------------------------------------------------------------
+We use our own PRNG to keep predictable performance of random number generation
+and to avoid implementations that use a lock. We only use the OS provided
+random source to initialize the initial seeds. Since we do not need ultimate
+performance but we do rely on the security (for secret cookies in secure mode)
+we use a cryptographically secure generator (chacha20).
+-----------------------------------------------------------------------------*/
+
+#define MI_CHACHA_ROUNDS (20) // perhaps use 12 for better performance?
+
+
+/* ----------------------------------------------------------------------------
+ChaCha20 implementation, following the original algorithm, with a 64-bit nonce
+and counter: https://en.wikipedia.org/wiki/Salsa20
+The input matrix has sixteen 32-bit values:
+Position 0 to 3: the constants ("expand 32-byte k")
+Position 4 to 11: the key
+Position 12 to 13: the counter
+Position 14 to 15: the nonce
+
+The implementation uses regular C code which compiles very well on modern compilers.
+(gcc x64 has no register spills, and clang 6+ uses SSE instructions)
+-----------------------------------------------------------------------------*/
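+// For reference (matching the RFC 8439 test vectors at the end of this file):
+// the four constant words are the little-endian reading of "expand 32-byte k",
+// i.e. 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574.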
+
+static inline uint32_t rotl(uint32_t x, uint32_t shift) {
+ return (x << shift) | (x >> (32 - shift));
+}
+
+static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
+ x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
+ x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
+ x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
+ x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+}
+
+static void chacha_block(mi_random_ctx_t* ctx)
+{
+ // scramble into `x`
+ uint32_t x[16];
+ for (size_t i = 0; i < 16; i++) {
+ x[i] = ctx->input[i];
+ }
+ for (size_t i = 0; i < MI_CHACHA_ROUNDS; i += 2) {
+ qround(x, 0, 4, 8, 12);
+ qround(x, 1, 5, 9, 13);
+ qround(x, 2, 6, 10, 14);
+ qround(x, 3, 7, 11, 15);
+ qround(x, 0, 5, 10, 15);
+ qround(x, 1, 6, 11, 12);
+ qround(x, 2, 7, 8, 13);
+ qround(x, 3, 4, 9, 14);
+ }
+
+ // add scrambled data to the initial state
+ for (size_t i = 0; i < 16; i++) {
+ ctx->output[i] = x[i] + ctx->input[i];
+ }
+ ctx->output_available = 16;
+
+ // increment the counter for the next round
+ ctx->input[12] += 1;
+ if (ctx->input[12] == 0) {
+ ctx->input[13] += 1;
+ if (ctx->input[13] == 0) { // and keep increasing into the nonce
+ ctx->input[14] += 1;
+ }
+ }
+}
+
+static uint32_t chacha_next32(mi_random_ctx_t* ctx) {
+ if (ctx->output_available <= 0) {
+ chacha_block(ctx);
+ ctx->output_available = 16; // (assign again to suppress static analysis warning)
+ }
+ const uint32_t x = ctx->output[16 - ctx->output_available];
+ ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out
+ ctx->output_available--;
+ return x;
+}
+
+static inline uint32_t read32(const uint8_t* p, size_t idx32) {
+ const size_t i = 4*idx32;
+ return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24);
+}
+
+static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce)
+{
+ // since we only use chacha for randomness (and not encryption) we
+ // do not _need_ to read 32-bit values as little endian, but we do anyway
+ // just to be compatible :-)
+ memset(ctx, 0, sizeof(*ctx));
+ for (size_t i = 0; i < 4; i++) {
+ const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
+ ctx->input[i] = read32(sigma,i);
+ }
+ for (size_t i = 0; i < 8; i++) {
+ ctx->input[i + 4] = read32(key,i);
+ }
+ ctx->input[12] = 0;
+ ctx->input[13] = 0;
+ ctx->input[14] = (uint32_t)nonce;
+ ctx->input[15] = (uint32_t)(nonce >> 32);
+}
+
+static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
+ memset(ctx_new, 0, sizeof(*ctx_new));
+ _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
+ ctx_new->input[12] = 0;
+ ctx_new->input[13] = 0;
+ ctx_new->input[14] = (uint32_t)nonce;
+ ctx_new->input[15] = (uint32_t)(nonce >> 32);
+ mi_assert_internal(ctx->input[14] != ctx_new->input[14] || ctx->input[15] != ctx_new->input[15]); // do not reuse nonces!
+ chacha_block(ctx_new);
+}
+
+
+/* ----------------------------------------------------------------------------
+Random interface
+-----------------------------------------------------------------------------*/
+
+#if MI_DEBUG>1
+static bool mi_random_is_initialized(mi_random_ctx_t* ctx) {
+ return (ctx != NULL && ctx->input[0] != 0);
+}
+#endif
+
+void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* ctx_new) {
+ mi_assert_internal(mi_random_is_initialized(ctx));
+ mi_assert_internal(ctx != ctx_new);
+ chacha_split(ctx, (uintptr_t)ctx_new /*nonce*/, ctx_new);
+}
+
+uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
+ mi_assert_internal(mi_random_is_initialized(ctx));
+ #if MI_INTPTR_SIZE <= 4
+ return chacha_next32(ctx);
+ #elif MI_INTPTR_SIZE == 8
+ return (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
+ #else
+ # error "define mi_random_next for this platform"
+ #endif
+}
+
+
+/* ----------------------------------------------------------------------------
+To initialize a fresh random context we rely on the OS:
+- Windows : BCryptGenRandom (or RtlGenRandom)
+- macOS : CCRandomGenerateBytes, arc4random_buf
+- bsd,wasi : arc4random_buf
+- Linux : getrandom,/dev/urandom
+If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR.
+-----------------------------------------------------------------------------*/
+
+#if defined(_WIN32)
+
+#if defined(MI_USE_RTLGENRANDOM) || defined(__cplusplus)
+// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using
+// dynamic overriding, we observed it can raise an exception when compiled with C++, and
+// sometimes deadlocks when also running under the VS debugger.
+#pragma comment (lib,"advapi32.lib")
+#define RtlGenRandom SystemFunction036
+#ifdef __cplusplus
+extern "C" {
+#endif
+BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
+#ifdef __cplusplus
+}
+#endif
+static bool os_random_buf(void* buf, size_t buf_len) {
+ return (RtlGenRandom(buf, (ULONG)buf_len) != 0);
+}
+#else
+#pragma comment (lib,"bcrypt.lib")
+#include <bcrypt.h>
+static bool os_random_buf(void* buf, size_t buf_len) {
+ return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);
+}
+#endif
+
+#elif defined(__APPLE__)
+#include <AvailabilityMacros.h>
+#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
+#include <CommonCrypto/CommonCryptoError.h>
+#include <CommonCrypto/CommonRandom.h>
+#endif
+static bool os_random_buf(void* buf, size_t buf_len) {
+ #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
+ // We prefer CCRandomGenerateBytes as it returns an error code, while arc4random_buf
+ // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
+ return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
+ #else
+ // fall back on older macOS
+ arc4random_buf(buf, buf_len);
+ return true;
+ #endif
+}
+
+#elif defined(__ANDROID__) || defined(__DragonFly__) || \
+ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+ defined(__sun) // todo: what to use with __wasi__?
+#include <stdlib.h>
+static bool os_random_buf(void* buf, size_t buf_len) {
+ arc4random_buf(buf, buf_len);
+ return true;
+}
+#elif defined(__linux__) || defined(__HAIKU__)
+#if defined(__linux__)
+#include <sys/syscall.h>
+#endif
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+static bool os_random_buf(void* buf, size_t buf_len) {
+ // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h`
+ // and for the latter the actual `getrandom` call is not always defined.
+ // (see <https://stackoverflow.com/questions/45237324/why-doesnt-getrandom-compile>)
+ // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed.
+#ifdef SYS_getrandom
+ #ifndef GRND_NONBLOCK
+ #define GRND_NONBLOCK (1)
+ #endif
+ static _Atomic(uintptr_t) no_getrandom; // = 0
+ if (mi_atomic_load_acquire(&no_getrandom)==0) {
+ ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
+ if (ret >= 0) return (buf_len == (size_t)ret);
+ if (errno != ENOSYS) return false;
+ mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom
+ }
+#endif
+ int flags = O_RDONLY;
+ #if defined(O_CLOEXEC)
+ flags |= O_CLOEXEC;
+ #endif
+ int fd = open("/dev/urandom", flags, 0);
+ if (fd < 0) return false;
+ size_t count = 0;
+ while(count < buf_len) {
+ ssize_t ret = read(fd, (char*)buf + count, buf_len - count);
+ if (ret<=0) {
+ if (errno!=EAGAIN && errno!=EINTR) break;
+ }
+ else {
+ count += ret;
+ }
+ }
+ close(fd);
+ return (count==buf_len);
+}
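+// Behavior sketch (follows the code above): on a kernel where the getrandom
+// syscall fails with ENOSYS, `no_getrandom` is set once and every subsequent
+// call goes straight to /dev/urandom, so the failing syscall is not retried.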
+#else
+static bool os_random_buf(void* buf, size_t buf_len) {
+ return false;
+}
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
+ uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
+
+ #if defined(_WIN32)
+ LARGE_INTEGER pcount;
+ QueryPerformanceCounter(&pcount);
+ x ^= (uintptr_t)(pcount.QuadPart);
+ #elif defined(__APPLE__)
+ x ^= (uintptr_t)mach_absolute_time();
+ #else
+ struct timespec time;
+ clock_gettime(CLOCK_MONOTONIC, &time);
+ x ^= (uintptr_t)time.tv_sec;
+ x ^= (uintptr_t)time.tv_nsec;
+ #endif
+ // and do a few randomization steps
+ uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
+ for (uintptr_t i = 0; i < max; i++) {
+ x = _mi_random_shuffle(x);
+ }
+ mi_assert_internal(x != 0);
+ return x;
+}
+
+void _mi_random_init(mi_random_ctx_t* ctx) {
+ uint8_t key[32];
+ if (!os_random_buf(key, sizeof(key))) {
+ // if we fail to get random data from the OS, we fall back to a
+ // weak random source based on the current time
+ #if !defined(__wasi__)
+ _mi_warning_message("unable to use secure randomness\n");
+ #endif
+ uintptr_t x = _mi_os_random_weak(0);
+ for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words.
+ x = _mi_random_shuffle(x);
+ ((uint32_t*)key)[i] = (uint32_t)x;
+ }
+ }
+ chacha_init(ctx, key, (uintptr_t)ctx /*nonce*/ );
+}
+
+/* --------------------------------------------------------
+test vectors from <https://tools.ietf.org/html/rfc8439>
+----------------------------------------------------------- */
+/*
+static bool array_equals(uint32_t* x, uint32_t* y, size_t n) {
+ for (size_t i = 0; i < n; i++) {
+ if (x[i] != y[i]) return false;
+ }
+ return true;
+}
+static void chacha_test(void)
+{
+ uint32_t x[4] = { 0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567 };
+ uint32_t x_out[4] = { 0xea2a92f4, 0xcb1cf8ce, 0x4581472e, 0x5881c4bb };
+ qround(x, 0, 1, 2, 3);
+ mi_assert_internal(array_equals(x, x_out, 4));
+
+ uint32_t y[16] = {
+ 0x879531e0, 0xc5ecf37d, 0x516461b1, 0xc9a62f8a,
+ 0x44c20ef3, 0x3390af7f, 0xd9fc690b, 0x2a5f714c,
+ 0x53372767, 0xb00a5631, 0x974c541a, 0x359e9963,
+ 0x5c971061, 0x3d631689, 0x2098d9d6, 0x91dbd320 };
+ uint32_t y_out[16] = {
+ 0x879531e0, 0xc5ecf37d, 0xbdb886dc, 0xc9a62f8a,
+ 0x44c20ef3, 0x3390af7f, 0xd9fc690b, 0xcfacafd2,
+ 0xe46bea80, 0xb00a5631, 0x974c541a, 0x359e9963,
+ 0x5c971061, 0xccc07c79, 0x2098d9d6, 0x91dbd320 };
+ qround(y, 2, 7, 8, 13);
+ mi_assert_internal(array_equals(y, y_out, 16));
+
+ mi_random_ctx_t r = {
+ { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
+ 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
+ 0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c,
+ 0x00000001, 0x09000000, 0x4a000000, 0x00000000 },
+ {0},
+ 0
+ };
+ uint32_t r_out[16] = {
+ 0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3,
+ 0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3,
+ 0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9,
+ 0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2 };
+ chacha_block(&r);
+ mi_assert_internal(array_equals(r.output, r_out, 16));
+}
+*/
diff --git a/source/luametatex/source/libraries/mimalloc/src/region.c b/source/luametatex/source/libraries/mimalloc/src/region.c
new file mode 100644
index 000000000..72ce84947
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/region.c
@@ -0,0 +1,505 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
+and the segment and huge object allocation by mimalloc. There may be multiple
+implementations of this (one could be the identity going directly to the OS,
+another could be a simple cache etc), but the current one uses large "regions".
+In contrast to the rest of mimalloc, the "regions" are shared between threads and
+need to be accessed using atomic operations.
+We need this memory layer between the raw OS calls because of:
+1. On `sbrk`-like systems (such as WebAssembly) we need our own memory maps in order
+ to reuse memory effectively.
+2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
+ an OS allocation/free is still (much) too expensive relative to the accesses
+ in that object :-( (`malloc-large` tests this). This means we need a cheaper
+ way to reuse memory.
+3. This layer allows for NUMA-aware allocation.
+
+Possible issues:
+- (2) can potentially be addressed with a small per-thread cache too, which is much
+ simpler. Generally though that requires shrinking of huge pages, may overuse
+ memory per thread, and is not compatible with `sbrk`.
+- Since the current regions are per-process, we need atomic operations to
+ claim blocks, which may be contended.
+- In the worst case, we need to search the whole region map (16KiB for 256GiB)
+ linearly. At what point will direct OS calls be faster? Is there a way to
+ do this better without adding too much complexity?
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h> // memset
+
+#include "bitmap.h"
+
+// Internal raw OS interface
+size_t _mi_os_large_page_size(void);
+bool _mi_os_protect(void* addr, size_t size);
+bool _mi_os_unprotect(void* addr, size_t size);
+bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
+bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
+bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+
+// arena.c
+void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_stats_t* stats);
+void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
+
+
+
+// Constants
+#if (MI_INTPTR_SIZE==8)
+#define MI_HEAP_REGION_MAX_SIZE (256 * MI_GiB) // 64KiB for the region map
+#elif (MI_INTPTR_SIZE==4)
+#define MI_HEAP_REGION_MAX_SIZE (3 * MI_GiB) // ~ KiB for the region map
+#else
+#error "define the maximum heap space allowed for regions on this platform"
+#endif
+
+#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
+
+#define MI_REGION_MAX_BLOCKS MI_BITMAP_FIELD_BITS
+#define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits)
+#define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits)
+#define MI_REGION_MAX_OBJ_BLOCKS (MI_REGION_MAX_BLOCKS/4) // 64MiB
+#define MI_REGION_MAX_OBJ_SIZE (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE)
+
+// Region info
+typedef union mi_region_info_u {
+ size_t value;
+ struct {
+ bool valid; // initialized?
+ bool is_large:1; // allocated in fixed large/huge OS pages
+ bool is_pinned:1; // pinned memory cannot be decommitted
+ short numa_node; // the associated NUMA node (where -1 means no associated node)
+ } x;
+} mi_region_info_t;
+
+
+// A region owns a chunk of MI_REGION_SIZE (256MiB) (virtual) memory, with
+// a bitmap holding one bit per MI_SEGMENT_SIZE (4MiB) block.
+typedef struct mem_region_s {
+ _Atomic(size_t) info; // mi_region_info_t.value
+ _Atomic(void*) start; // start of the memory area
+ mi_bitmap_field_t in_use; // bit per in-use block
+ mi_bitmap_field_t dirty; // track if non-zero per block
+ mi_bitmap_field_t commit; // track if committed per block
+ mi_bitmap_field_t reset; // track if reset per block
+ _Atomic(size_t) arena_memid; // if allocated from a (huge page) arena
+ _Atomic(size_t) padding; // round to 8 fields (needs to be atomic for msvc, see issue #508)
+} mem_region_t;
+
+// The region map
+static mem_region_t regions[MI_REGION_MAX];
+
+// Allocated regions
+static _Atomic(size_t) regions_count; // = 0;
+
+
+/* ----------------------------------------------------------------------------
+Utility functions
+-----------------------------------------------------------------------------*/
+
+// Blocks (of 4MiB) needed for the given size.
+static size_t mi_region_block_count(size_t size) {
+ return _mi_divide_up(size, MI_SEGMENT_SIZE);
+}
+
+/*
+// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
+static size_t mi_good_commit_size(size_t size) {
+ if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
+ return _mi_align_up(size, _mi_os_large_page_size());
+}
+*/
+
+// Return if a pointer points into a region reserved by us.
+mi_decl_nodiscard bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
+ if (p==NULL) return false;
+ size_t count = mi_atomic_load_relaxed(&regions_count);
+ for (size_t i = 0; i < count; i++) {
+ uint8_t* start = (uint8_t*)mi_atomic_load_ptr_relaxed(uint8_t, &regions[i].start);
+ if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
+ }
+ return false;
+}
+
+
+static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
+ uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start);
+ mi_assert_internal(start != NULL);
+ return (start + (bit_idx * MI_SEGMENT_SIZE));
+}
+
+static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) {
+ mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS);
+ size_t idx = region - regions;
+ mi_assert_internal(&regions[idx] == region);
+ return (idx*MI_BITMAP_FIELD_BITS + bit_idx)<<1;
+}
+
+static size_t mi_memid_create_from_arena(size_t arena_memid) {
+ return (arena_memid << 1) | 1;
+}
+
+
+static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) {
+ if ((id&1)==1) {
+ if (arena_memid != NULL) *arena_memid = (id>>1);
+ return true;
+ }
+ else {
+ size_t idx = (id >> 1) / MI_BITMAP_FIELD_BITS;
+ *bit_idx = (mi_bitmap_index_t)(id>>1) % MI_BITMAP_FIELD_BITS;
+ *region = &regions[idx];
+ return false;
+ }
+}
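+// Encoding example (illustrative, assuming 64-bit bitmap fields): region
+// index 3 with bit index 5 yields id = (3*64 + 5) << 1 = 394 (low bit 0 =
+// region-backed), whereas an arena memid of 7 is stored as (7 << 1) | 1 = 15
+// (low bit 1 = arena-backed).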
+
+
+/* ----------------------------------------------------------------------------
+ Allocate a fresh region from the OS (or an arena)
+-----------------------------------------------------------------------------*/
+
+static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
+{
+ // not out of regions yet?
+ if (mi_atomic_load_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
+
+ // try to allocate a fresh region from the OS
+ bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit));
+ bool region_large = (commit && allow_large);
+ bool is_zero = false;
+ bool is_pinned = false;
+ size_t arena_memid = 0;
+ void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_pinned, &is_zero, &arena_memid, tld);
+ if (start == NULL) return false;
+ mi_assert_internal(!(region_large && !allow_large));
+ mi_assert_internal(!region_large || region_commit);
+
+ // claim a fresh slot
+ const size_t idx = mi_atomic_increment_acq_rel(&regions_count);
+ if (idx >= MI_REGION_MAX) {
+ mi_atomic_decrement_acq_rel(&regions_count);
+ _mi_arena_free(start, MI_REGION_SIZE, arena_memid, region_commit, tld->stats);
+ _mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, MI_GiB));
+ return false;
+ }
+
+ // allocated, initialize and claim the initial blocks
+ mem_region_t* r = &regions[idx];
+ r->arena_memid = arena_memid;
+ mi_atomic_store_release(&r->in_use, (size_t)0);
+ mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
+ mi_atomic_store_release(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
+ mi_atomic_store_release(&r->reset, (size_t)0);
+ *bit_idx = 0;
+ _mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
+ mi_atomic_store_ptr_release(void,&r->start, start);
+
+ // and share it
+ mi_region_info_t info;
+ info.value = 0; // initialize the full union to zero
+ info.x.valid = true;
+ info.x.is_large = region_large;
+ info.x.is_pinned = is_pinned;
+ info.x.numa_node = (short)_mi_os_numa_node(tld);
+ mi_atomic_store_release(&r->info, info.value); // now make it available to others
+ *region = r;
+ return true;
+}
+
+/* ----------------------------------------------------------------------------
+ Try to claim blocks in suitable regions
+-----------------------------------------------------------------------------*/
+
+static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) {
+ // initialized at all?
+ mi_region_info_t info;
+ info.value = mi_atomic_load_relaxed(&((mem_region_t*)region)->info);
+ if (info.value==0) return false;
+
+ // numa correct
+ if (numa_node >= 0) { // use negative numa node to always succeed
+ int rnode = info.x.numa_node;
+ if (rnode >= 0 && rnode != numa_node) return false;
+ }
+
+ // check allow-large
+ if (!allow_large && info.x.is_large) return false;
+
+ return true;
+}
+
+
+static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
+{
+ // try all regions for a free slot
+ const size_t count = mi_atomic_load_relaxed(&regions_count); // monotonic, so ok to be relaxed
+ size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? Starting at 0 seems to increase latency though
+ for (size_t visited = 0; visited < count; visited++, idx++) {
+ if (idx >= count) idx = 0; // wrap around
+ mem_region_t* r = &regions[idx];
+ // if this region suits our demand (numa node matches, large OS page matches)
+ if (mi_region_is_suitable(r, numa_node, allow_large)) {
+ // then try to atomically claim a segment(s) in this region
+ if (_mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) {
+ tld->region_idx = idx; // remember the last found position
+ *region = r;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+ mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS);
+ mem_region_t* region;
+ mi_bitmap_index_t bit_idx;
+ const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld));
+ // try to claim in existing regions
+ if (!mi_region_try_claim(numa_node, blocks, *large, &region, &bit_idx, tld)) {
+ // otherwise try to allocate a fresh region and claim in there
+ if (!mi_region_try_alloc_os(blocks, *commit, *large, &region, &bit_idx, tld)) {
+ // out of regions or memory
+ return NULL;
+ }
+ }
+
+ // ------------------------------------------------
+ // found a region and claimed `blocks` at `bit_idx`, initialize them now
+ mi_assert_internal(region != NULL);
+ mi_assert_internal(_mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));
+
+ mi_region_info_t info;
+ info.value = mi_atomic_load_acquire(&region->info);
+ uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&region->start);
+ mi_assert_internal(!(info.x.is_large && !*large));
+ mi_assert_internal(start != NULL);
+
+ *is_zero = _mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL);
+ *large = info.x.is_large;
+ *is_pinned = info.x.is_pinned;
+ *memid = mi_memid_create(region, bit_idx);
+ void* p = start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
+
+ // commit
+ if (*commit) {
+ // ensure commit
+ bool any_uncommitted;
+ _mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
+ if (any_uncommitted) {
+ mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
+ bool commit_zero = false;
+ if (!_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld)) {
+ // failed to commit! unclaim and return
+ mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
+ return NULL;
+ }
+ if (commit_zero) *is_zero = true;
+ }
+ }
+ else {
+ // no need to commit, but check if already fully committed
+ *commit = _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx);
+ }
+ mi_assert_internal(!*commit || _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx));
+
+ // unreset reset blocks
+ if (_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
+ // some blocks are still reset
+ mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
+ mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0);
+ mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx);
+ if (*commit || !mi_option_is_enabled(mi_option_reset_decommits)) { // only if needed
+ bool reset_zero = false;
+ _mi_mem_unreset(p, blocks * MI_SEGMENT_SIZE, &reset_zero, tld);
+ if (reset_zero) *is_zero = true;
+ }
+ }
+ mi_assert_internal(!_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx));
+
+ #if (MI_DEBUG>=2)
+ if (*commit) { ((uint8_t*)p)[0] = 0; }
+ #endif
+
+ // and return the allocation
+ mi_assert_internal(p != NULL);
+ return p;
+}
+
+
+/* ----------------------------------------------------------------------------
+ Allocation
+-----------------------------------------------------------------------------*/
+
+// Allocate `size` memory aligned at `alignment`. Return non-NULL on success, with a given memory `id`.
+// (`id` is abstract; region ids encode `(idx*MI_BITMAP_FIELD_BITS + bitidx) << 1`, see `mi_memid_create`)
+void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+ mi_assert_internal(memid != NULL && tld != NULL);
+ mi_assert_internal(size > 0);
+ *memid = 0;
+ *is_zero = false;
+ *is_pinned = false;
+ bool default_large = false;
+ if (large==NULL) large = &default_large; // ensure `large != NULL`
+ if (size == 0) return NULL;
+ size = _mi_align_up(size, _mi_os_page_size());
+
+ // allocate from regions if possible
+ void* p = NULL;
+ size_t arena_memid;
+ const size_t blocks = mi_region_block_count(size);
+ if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) {
+ p = mi_region_try_alloc(blocks, commit, large, is_pinned, is_zero, memid, tld);
+ if (p == NULL) {
+ _mi_warning_message("unable to allocate from region: size %zu\n", size);
+ }
+ }
+ if (p == NULL) {
+ // and otherwise fall back to the OS
+ p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_pinned, is_zero, &arena_memid, tld);
+ *memid = mi_memid_create_from_arena(arena_memid);
+ }
+
+ if (p != NULL) {
+ mi_assert_internal((uintptr_t)p % alignment == 0);
+#if (MI_DEBUG>=2)
+ if (*commit) { ((uint8_t*)p)[0] = 0; } // ensure the memory is committed
+#endif
+ }
+ return p;
+}
+
+
+
+/* ----------------------------------------------------------------------------
+Free
+-----------------------------------------------------------------------------*/
+
+// Free previously allocated memory with a given id.
+void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) {
+ mi_assert_internal(size > 0 && tld != NULL);
+ if (p==NULL) return;
+ if (size==0) return;
+ size = _mi_align_up(size, _mi_os_page_size());
+
+ size_t arena_memid = 0;
+ mi_bitmap_index_t bit_idx;
+ mem_region_t* region;
+ if (mi_memid_is_arena(id,&region,&bit_idx,&arena_memid)) {
+ // was a direct arena allocation, pass through
+ _mi_arena_free(p, size, arena_memid, full_commit, tld->stats);
+ }
+ else {
+ // allocated in a region
+ mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return;
+ const size_t blocks = mi_region_block_count(size);
+ mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS);
+ mi_region_info_t info;
+ info.value = mi_atomic_load_acquire(&region->info);
+ mi_assert_internal(info.value != 0);
+ void* blocks_start = mi_region_blocks_start(region, bit_idx);
+ mi_assert_internal(blocks_start == p); // not a pointer in our area?
+ mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS);
+ if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`?
+
+ // committed?
+ if (full_commit && (size % MI_SEGMENT_SIZE) == 0) {
+ _mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, NULL);
+ }
+
+ if (any_reset) {
+ // set the is_reset bits if any pages were reset
+ _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, NULL);
+ }
+
+ // reset the blocks to reduce the working set.
+ if (!info.x.is_large && !info.x.is_pinned && mi_option_is_enabled(mi_option_segment_reset)
+ && (mi_option_is_enabled(mi_option_eager_commit) ||
+ mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead
+ {
+ bool any_unreset;
+ _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset);
+ if (any_unreset) {
+ _mi_abandoned_await_readers(); // ensure no more pending write (in case reset = decommit)
+ _mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld);
+ }
+ }
+
+ // and unclaim
+ bool all_unclaimed = mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
+ mi_assert_internal(all_unclaimed); MI_UNUSED(all_unclaimed);
+ }
+}
+
+
+/* ----------------------------------------------------------------------------
+ collection
+-----------------------------------------------------------------------------*/
+void _mi_mem_collect(mi_os_tld_t* tld) {
+ // free every region that has no segments in use.
+ size_t rcount = mi_atomic_load_relaxed(&regions_count);
+ for (size_t i = 0; i < rcount; i++) {
+ mem_region_t* region = &regions[i];
+ if (mi_atomic_load_relaxed(&region->info) != 0) {
+ // if no segments used, try to claim the whole region
+ size_t m = mi_atomic_load_relaxed(&region->in_use);
+ while (m == 0 && !mi_atomic_cas_weak_release(&region->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ };
+ if (m == 0) {
+ // on success, free the whole region
+ uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&regions[i].start);
+ size_t arena_memid = mi_atomic_load_relaxed(&regions[i].arena_memid);
+ size_t commit = mi_atomic_load_relaxed(&regions[i].commit);
+ memset((void*)&regions[i], 0, sizeof(mem_region_t)); // cast to void* to avoid atomic warning
+ // and release the whole region
+ mi_atomic_store_release(&region->info, (size_t)0);
+ if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {
+ _mi_abandoned_await_readers(); // ensure no pending reads
+ _mi_arena_free(start, MI_REGION_SIZE, arena_memid, (~commit == 0), tld->stats);
+ }
+ }
+ }
+ }
+}
+
+
+/* ----------------------------------------------------------------------------
+ Other
+-----------------------------------------------------------------------------*/
+
+bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) {
+ return _mi_os_reset(p, size, tld->stats);
+}
+
+bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
+ return _mi_os_unreset(p, size, is_zero, tld->stats);
+}
+
+bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
+ return _mi_os_commit(p, size, is_zero, tld->stats);
+}
+
+bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) {
+ return _mi_os_decommit(p, size, tld->stats);
+}
+
+bool _mi_mem_protect(void* p, size_t size) {
+ return _mi_os_protect(p, size);
+}
+
+bool _mi_mem_unprotect(void* p, size_t size) {
+ return _mi_os_unprotect(p, size);
+}
diff --git a/source/luametatex/source/libraries/mimalloc/src/segment-cache.c b/source/luametatex/source/libraries/mimalloc/src/segment-cache.c
new file mode 100644
index 000000000..aacdbc11d
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/segment-cache.c
@@ -0,0 +1,360 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+ Implements a cache of segments to avoid expensive OS calls and to reuse
+ the commit_mask to optimize the commit/decommit calls.
+ The full memory map of all segments is also implemented here.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include "bitmap.h" // atomic bitmap
+
+//#define MI_CACHE_DISABLE 1 // define to completely disable the segment cache
+
+#define MI_CACHE_FIELDS (16)
+#define MI_CACHE_MAX (MI_BITMAP_FIELD_BITS*MI_CACHE_FIELDS) // 1024 on 64-bit
+
+#define BITS_SET() MI_ATOMIC_VAR_INIT(UINTPTR_MAX)
+#define MI_CACHE_BITS_SET MI_INIT16(BITS_SET) // note: update if MI_CACHE_FIELDS changes
+
+typedef struct mi_cache_slot_s {
+ void* p;
+ size_t memid;
+ bool is_pinned;
+ mi_commit_mask_t commit_mask;
+ mi_commit_mask_t decommit_mask;
+ _Atomic(mi_msecs_t) expire;
+} mi_cache_slot_t;
+
+static mi_decl_cache_align mi_cache_slot_t cache[MI_CACHE_MAX]; // = 0
+
+static mi_decl_cache_align mi_bitmap_field_t cache_available[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET }; // zero bit = available!
+static mi_decl_cache_align mi_bitmap_field_t cache_available_large[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };
+static mi_decl_cache_align mi_bitmap_field_t cache_inuse[MI_CACHE_FIELDS]; // zero bit = free
+
+
+mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+#ifdef MI_CACHE_DISABLE
+ return NULL;
+#else
+
+ // only segment blocks
+ if (size != MI_SEGMENT_SIZE) return NULL;
+
+ // numa node determines start field
+ const int numa_node = _mi_os_numa_node(tld);
+ size_t start_field = 0;
+ if (numa_node > 0) {
+ start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
+ if (start_field >= MI_CACHE_FIELDS) start_field = 0;
+ }
+
+ // find an available slot
+ mi_bitmap_index_t bitidx = 0;
+ bool claimed = false;
+ if (*large) { // large allowed?
+ claimed = _mi_bitmap_try_find_from_claim(cache_available_large, MI_CACHE_FIELDS, start_field, 1, &bitidx);
+ if (claimed) *large = true;
+ }
+ if (!claimed) {
+ claimed = _mi_bitmap_try_find_from_claim(cache_available, MI_CACHE_FIELDS, start_field, 1, &bitidx);
+ if (claimed) *large = false;
+ }
+
+ if (!claimed) return NULL;
+
+ // found a slot
+ mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
+ void* p = slot->p;
+ *memid = slot->memid;
+ *is_pinned = slot->is_pinned;
+ *is_zero = false;
+ *commit_mask = slot->commit_mask;
+ *decommit_mask = slot->decommit_mask;
+ slot->p = NULL;
+ mi_atomic_storei64_release(&slot->expire,(mi_msecs_t)0);
+
+ // mark the slot as free again
+ mi_assert_internal(_mi_bitmap_is_claimed(cache_inuse, MI_CACHE_FIELDS, 1, bitidx));
+ _mi_bitmap_unclaim(cache_inuse, MI_CACHE_FIELDS, 1, bitidx);
+ return p;
+#endif
+}
+
+static mi_decl_noinline void mi_commit_mask_decommit(mi_commit_mask_t* cmask, void* p, size_t total, mi_stats_t* stats)
+{
+ if (mi_commit_mask_is_empty(cmask)) {
+ // nothing
+ }
+ else if (mi_commit_mask_is_full(cmask)) {
+ _mi_os_decommit(p, total, stats);
+ }
+ else {
+ // todo: one call to decommit the whole range at once?
+ mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
+ size_t part = total/MI_COMMIT_MASK_BITS;
+ size_t idx;
+ size_t count;
+ mi_commit_mask_foreach(cmask, idx, count) {
+ void* start = (uint8_t*)p + (idx*part);
+ size_t size = count*part;
+ _mi_os_decommit(start, size, stats);
+ }
+ mi_commit_mask_foreach_end()
+ }
+ mi_commit_mask_create_empty(cmask);
+}
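+// Illustrative (with assumed sizes): if `total` were 64MiB with a 512-bit
+// commit mask, `part` would be 128KiB, and a run of 8 consecutive set bits
+// starting at idx 16 would decommit 1MiB at offset 2MiB; one OS call is
+// made per run of set bits.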
+
+#define MI_MAX_PURGE_PER_PUSH (4)
+
+static mi_decl_noinline void mi_segment_cache_purge(bool force, mi_os_tld_t* tld)
+{
+ MI_UNUSED(tld);
+ if (!mi_option_is_enabled(mi_option_allow_decommit)) return;
+ mi_msecs_t now = _mi_clock_now();
+ size_t purged = 0;
+ const size_t max_visits = (force ? MI_CACHE_MAX /* visit all */ : MI_CACHE_FIELDS /* probe at most N (=16) slots */);
+ size_t idx = (force ? 0 : _mi_random_shuffle((uintptr_t)now) % MI_CACHE_MAX /* random start */ );
+ for (size_t visited = 0; visited < max_visits; visited++,idx++) { // visit N slots
+ if (idx >= MI_CACHE_MAX) idx = 0; // wrap
+ mi_cache_slot_t* slot = &cache[idx];
+ mi_msecs_t expire = mi_atomic_loadi64_relaxed(&slot->expire);
+ if (expire != 0 && (force || now >= expire)) { // racy read
+ // seems expired, first claim it from available
+ purged++;
+ mi_bitmap_index_t bitidx = mi_bitmap_index_create_from_bit(idx);
+ if (_mi_bitmap_claim(cache_available, MI_CACHE_FIELDS, 1, bitidx, NULL)) {
+ // was available, we claimed it
+ expire = mi_atomic_loadi64_acquire(&slot->expire);
+ if (expire != 0 && (force || now >= expire)) { // safe read
+ // still expired, decommit it
+ mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
+ mi_assert_internal(!mi_commit_mask_is_empty(&slot->commit_mask) && _mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
+ _mi_abandoned_await_readers(); // wait until safe to decommit
+ // decommit committed parts
+ // TODO: instead of decommit, we could also free to the OS?
+ mi_commit_mask_decommit(&slot->commit_mask, slot->p, MI_SEGMENT_SIZE, tld->stats);
+ mi_commit_mask_create_empty(&slot->decommit_mask);
+ }
+ _mi_bitmap_unclaim(cache_available, MI_CACHE_FIELDS, 1, bitidx); // make it available again for a pop
+ }
+ if (!force && purged > MI_MAX_PURGE_PER_PUSH) break; // bound to no more than N purge tries per push
+ }
+ }
+}
+
+void _mi_segment_cache_collect(bool force, mi_os_tld_t* tld) {
+ mi_segment_cache_purge(force, tld );
+}
+
+mi_decl_noinline bool _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld)
+{
+#ifdef MI_CACHE_DISABLE
+ return false;
+#else
+
+ // only for normal segment blocks
+ if (size != MI_SEGMENT_SIZE || ((uintptr_t)start % MI_SEGMENT_ALIGN) != 0) return false;
+
+ // numa node determines start field
+ int numa_node = _mi_os_numa_node(NULL);
+ size_t start_field = 0;
+ if (numa_node > 0) {
+ start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
+ if (start_field >= MI_CACHE_FIELDS) start_field = 0;
+ }
+
+ // purge expired entries
+ mi_segment_cache_purge(false /* force? */, tld);
+
+ // find an available slot
+ mi_bitmap_index_t bitidx;
+ bool claimed = _mi_bitmap_try_find_from_claim(cache_inuse, MI_CACHE_FIELDS, start_field, 1, &bitidx);
+ if (!claimed) return false;
+
+ mi_assert_internal(_mi_bitmap_is_claimed(cache_available, MI_CACHE_FIELDS, 1, bitidx));
+ mi_assert_internal(_mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
+#if MI_DEBUG>1
+ if (is_pinned || is_large) {
+ mi_assert_internal(mi_commit_mask_is_full(commit_mask));
+ }
+#endif
+
+ // set the slot
+ mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
+ slot->p = start;
+ slot->memid = memid;
+ slot->is_pinned = is_pinned;
+ mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
+ slot->commit_mask = *commit_mask;
+ slot->decommit_mask = *decommit_mask;
+ if (!mi_commit_mask_is_empty(commit_mask) && !is_large && !is_pinned && mi_option_is_enabled(mi_option_allow_decommit)) {
+ long delay = mi_option_get(mi_option_segment_decommit_delay);
+ if (delay == 0) {
+ _mi_abandoned_await_readers(); // wait until safe to decommit
+ mi_commit_mask_decommit(&slot->commit_mask, start, MI_SEGMENT_SIZE, tld->stats);
+ mi_commit_mask_create_empty(&slot->decommit_mask);
+ }
+ else {
+ mi_atomic_storei64_release(&slot->expire, _mi_clock_now() + delay);
+ }
+ }
+
+ // make it available
+ _mi_bitmap_unclaim((is_large ? cache_available_large : cache_available), MI_CACHE_FIELDS, 1, bitidx);
+ return true;
+#endif
+}
+
+
+/* -----------------------------------------------------------
+ The following functions are to reliably find the segment or
+ block that encompasses any pointer p (or NULL if it is not
+ in any of our segments).
+ We maintain a bitmap over the whole address space, with one bit per
+ MI_SEGMENT_SIZE (64MiB) of memory; a bit is set to 1 if that range
+ contains the segment meta data.
+----------------------------------------------------------- */
+
+
+#if (MI_INTPTR_SIZE==8)
+#define MI_MAX_ADDRESS ((size_t)20 << 40) // 20TB
+#else
+#define MI_MAX_ADDRESS ((size_t)2 << 30) // 2GiB
+#endif
+
+#define MI_SEGMENT_MAP_BITS (MI_MAX_ADDRESS / MI_SEGMENT_SIZE)
+#define MI_SEGMENT_MAP_SIZE (MI_SEGMENT_MAP_BITS / 8)
+#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE)
+
+static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments
+
+static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
+ mi_assert_internal(_mi_ptr_segment(segment) == segment); // is it aligned on MI_SEGMENT_SIZE?
+ if ((uintptr_t)segment >= MI_MAX_ADDRESS) {
+ *bitidx = 0;
+ return MI_SEGMENT_MAP_WSIZE;
+ }
+ else {
+ const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE;
+ *bitidx = segindex % MI_INTPTR_BITS;
+ const size_t mapindex = segindex / MI_INTPTR_BITS;
+ mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE);
+ return mapindex;
+ }
+}
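+
+/* A worked example, assuming the default 64MiB (2^26) segments and 64-bit
+   words: a segment at address 0x100000000 (4GiB) gives
+     segindex = 2^32 / 2^26 = 64,  bitidx = 64 % 64 = 0,  mapindex = 64 / 64 = 1
+   i.e. bit 0 of mi_segment_map[1]. */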
+
+void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
+ size_t bitidx;
+ size_t index = mi_segment_map_index_of(segment, &bitidx);
+ mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
+ if (index==MI_SEGMENT_MAP_WSIZE) return;
+ uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+ uintptr_t newmask;
+ do {
+ newmask = (mask | ((uintptr_t)1 << bitidx));
+ } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
+}
+
+void _mi_segment_map_freed_at(const mi_segment_t* segment) {
+ size_t bitidx;
+ size_t index = mi_segment_map_index_of(segment, &bitidx);
+ mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
+ if (index == MI_SEGMENT_MAP_WSIZE) return;
+ uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+ uintptr_t newmask;
+ do {
+ newmask = (mask & ~((uintptr_t)1 << bitidx));
+ } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
+}
+
+// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
+static mi_segment_t* _mi_segment_of(const void* p) {
+ mi_segment_t* segment = _mi_ptr_segment(p);
+ if (segment == NULL) return NULL;
+ size_t bitidx;
+ size_t index = mi_segment_map_index_of(segment, &bitidx);
+ // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge
+ const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+ if (mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0)) {
+ return segment; // yes, allocated by us
+ }
+ if (index==MI_SEGMENT_MAP_WSIZE) return NULL;
+
+  // TODO: maintain a max/min allocated range for more efficient rejection of invalid pointers?
+
+  // search downwards for the first segment in case it is an interior pointer
+  // this can be slow, but it searches through valid huge objects in
+  // MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps
+ // note: we could maintain a lowest index to speed up the path for invalid pointers?
+ size_t lobitidx;
+ size_t loindex;
+ uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1);
+ if (lobits != 0) {
+ loindex = index;
+ lobitidx = mi_bsr(lobits); // lobits != 0
+ }
+ else if (index == 0) {
+ return NULL;
+ }
+ else {
+ mi_assert_internal(index > 0);
+ uintptr_t lomask = mask;
+ loindex = index;
+ do {
+ loindex--;
+ lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]);
+    } while (lomask == 0 && loindex > 0); // keep searching down while the map word is zero
+ if (lomask == 0) return NULL;
+ lobitidx = mi_bsr(lomask); // lomask != 0
+ }
+ mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE);
+ // take difference as the addresses could be larger than the MAX_ADDRESS space.
+ size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE;
+ segment = (mi_segment_t*)((uint8_t*)segment - diff);
+
+ if (segment == NULL) return NULL;
+ mi_assert_internal((void*)segment < p);
+ bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
+ mi_assert_internal(cookie_ok);
+ if (mi_unlikely(!cookie_ok)) return NULL;
+ if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range
+ mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment));
+ return segment;
+}
+
+// Is this a valid pointer in our heap?
+static bool mi_is_valid_pointer(const void* p) {
+ return (_mi_segment_of(p) != NULL);
+}
+
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
+ return mi_is_valid_pointer(p);
+}
+
+/*
+// Return the full segment range belonging to a pointer
+static void* mi_segment_range_of(const void* p, size_t* size) {
+ mi_segment_t* segment = _mi_segment_of(p);
+ if (segment == NULL) {
+ if (size != NULL) *size = 0;
+ return NULL;
+ }
+ else {
+ if (size != NULL) *size = segment->segment_size;
+ return segment;
+ }
+}
+*/
diff --git a/source/luametatex/source/libraries/mimalloc/src/segment.c b/source/luametatex/source/libraries/mimalloc/src/segment.c
new file mode 100644
index 000000000..800d4fc31
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/segment.c
@@ -0,0 +1,1544 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h> // memset
+#include <stdio.h>
+
+#define MI_PAGE_HUGE_ALIGN (256*1024)
+
+static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats);
+
+
+// -------------------------------------------------------------------
+// commit mask
+// -------------------------------------------------------------------
+
+static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ if ((commit->mask[i] & cm->mask[i]) != cm->mask[i]) return false;
+ }
+ return true;
+}
+
+static bool mi_commit_mask_any_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ if ((commit->mask[i] & cm->mask[i]) != 0) return true;
+ }
+ return false;
+}
+
+static void mi_commit_mask_create_intersect(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm, mi_commit_mask_t* res) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ res->mask[i] = (commit->mask[i] & cm->mask[i]);
+ }
+}
+
+static void mi_commit_mask_clear(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ res->mask[i] &= ~(cm->mask[i]);
+ }
+}
+
+static void mi_commit_mask_set(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ res->mask[i] |= cm->mask[i];
+ }
+}
+
+static void mi_commit_mask_create(size_t bitidx, size_t bitcount, mi_commit_mask_t* cm) {
+ mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
+ mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
+ if (bitcount == MI_COMMIT_MASK_BITS) {
+ mi_assert_internal(bitidx==0);
+ mi_commit_mask_create_full(cm);
+ }
+ else if (bitcount == 0) {
+ mi_commit_mask_create_empty(cm);
+ }
+ else {
+ mi_commit_mask_create_empty(cm);
+ size_t i = bitidx / MI_COMMIT_MASK_FIELD_BITS;
+ size_t ofs = bitidx % MI_COMMIT_MASK_FIELD_BITS;
+ while (bitcount > 0) {
+ mi_assert_internal(i < MI_COMMIT_MASK_FIELD_COUNT);
+ size_t avail = MI_COMMIT_MASK_FIELD_BITS - ofs;
+ size_t count = (bitcount > avail ? avail : bitcount);
+ size_t mask = (count >= MI_COMMIT_MASK_FIELD_BITS ? ~((size_t)0) : (((size_t)1 << count) - 1) << ofs);
+ cm->mask[i] = mask;
+ bitcount -= count;
+ ofs = 0;
+ i++;
+ }
+ }
+}
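+
+/* A worked example, assuming 64-bit mask fields: mi_commit_mask_create(2, 5, &cm)
+   yields cm.mask[0] = (((size_t)1 << 5) - 1) << 2 = 0x7C (bits 2..6) and leaves
+   the other fields zero; a run that crosses a field boundary is split over
+   consecutive fields by the loop above. */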
+
+size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total) {
+ mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
+ size_t count = 0;
+ for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+ size_t mask = cm->mask[i];
+ if (~mask == 0) {
+ count += MI_COMMIT_MASK_FIELD_BITS;
+ }
+ else {
+ for (; mask != 0; mask >>= 1) { // todo: use popcount
+ if ((mask&1)!=0) count++;
+ }
+ }
+ }
+ // we use total since for huge segments each commit bit may represent a larger size
+ return ((total / MI_COMMIT_MASK_BITS) * count);
+}
+
+
+size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) {
+ size_t i = (*idx) / MI_COMMIT_MASK_FIELD_BITS;
+ size_t ofs = (*idx) % MI_COMMIT_MASK_FIELD_BITS;
+ size_t mask = 0;
+ // find first ones
+ while (i < MI_COMMIT_MASK_FIELD_COUNT) {
+ mask = cm->mask[i];
+ mask >>= ofs;
+ if (mask != 0) {
+ while ((mask&1) == 0) {
+ mask >>= 1;
+ ofs++;
+ }
+ break;
+ }
+ i++;
+ ofs = 0;
+ }
+ if (i >= MI_COMMIT_MASK_FIELD_COUNT) {
+ // not found
+ *idx = MI_COMMIT_MASK_BITS;
+ return 0;
+ }
+ else {
+ // found, count ones
+ size_t count = 0;
+ *idx = (i*MI_COMMIT_MASK_FIELD_BITS) + ofs;
+ do {
+ mi_assert_internal(ofs < MI_COMMIT_MASK_FIELD_BITS && (mask&1) == 1);
+ do {
+ count++;
+ mask >>= 1;
+ } while ((mask&1) == 1);
+ if ((((*idx + count) % MI_COMMIT_MASK_FIELD_BITS) == 0)) {
+ i++;
+ if (i >= MI_COMMIT_MASK_FIELD_COUNT) break;
+ mask = cm->mask[i];
+ ofs = 0;
+ }
+ } while ((mask&1) == 1);
+ mi_assert_internal(count > 0);
+ return count;
+ }
+}
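+
+/* A minimal usage sketch; this is roughly the loop that the
+   mi_commit_mask_foreach macro used below expands to:
+
+     size_t idx = 0;
+     size_t count;
+     while ((count = _mi_commit_mask_next_run(&cm, &idx)) > 0) {
+       // ... process the run of set bits [idx, idx+count) ...
+       idx += count;  // continue after this run
+     }
+*/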
+
+
+/* --------------------------------------------------------------------------------
+ Segment allocation
+
+  When a thread ends, it "abandons" segments whose pages still contain
+  used blocks; these go onto an abandoned segment list from which
+  still-running threads can reclaim them, much like work stealing.
+-------------------------------------------------------------------------------- */
+
+
+/* -----------------------------------------------------------
+ Slices
+----------------------------------------------------------- */
+
+
+static const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment) {
+ return &segment->slices[segment->slice_entries];
+}
+
+static uint8_t* mi_slice_start(const mi_slice_t* slice) {
+ mi_segment_t* segment = _mi_ptr_segment(slice);
+ mi_assert_internal(slice >= segment->slices && slice < mi_segment_slices_end(segment));
+ return ((uint8_t*)segment + ((slice - segment->slices)*MI_SEGMENT_SLICE_SIZE));
+}
+
+
+/* -----------------------------------------------------------
+ Bins
+----------------------------------------------------------- */
+// Use a bit scan (mi_bsr) to quickly map a slice count onto its size bin
+
+static inline size_t mi_slice_bin8(size_t slice_count) {
+ if (slice_count<=1) return slice_count;
+ mi_assert_internal(slice_count <= MI_SLICES_PER_SEGMENT);
+ slice_count--;
+ size_t s = mi_bsr(slice_count); // slice_count > 1
+ if (s <= 2) return slice_count + 1;
+ size_t bin = ((s << 2) | ((slice_count >> (s - 2))&0x03)) - 4;
+ return bin;
+}
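+
+/* Sample values: counts 0..8 map to themselves (exact bins), after which bins
+   grow pseudo-logarithmically; e.g. for slice_count == 10:
+     slice_count-1 = 9,  s = mi_bsr(9) = 3,
+     bin = ((3 << 2) | ((9 >> 1) & 0x03)) - 4 = (12 | 0) - 4 = 8
+   so 9 and 10 slices share bin 8. */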
+
+static inline size_t mi_slice_bin(size_t slice_count) {
+ mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_SEGMENT_SIZE);
+ mi_assert_internal(mi_slice_bin8(MI_SLICES_PER_SEGMENT) <= MI_SEGMENT_BIN_MAX);
+ size_t bin = mi_slice_bin8(slice_count);
+ mi_assert_internal(bin <= MI_SEGMENT_BIN_MAX);
+ return bin;
+}
+
+static inline size_t mi_slice_index(const mi_slice_t* slice) {
+ mi_segment_t* segment = _mi_ptr_segment(slice);
+ ptrdiff_t index = slice - segment->slices;
+ mi_assert_internal(index >= 0 && index < (ptrdiff_t)segment->slice_entries);
+ return index;
+}
+
+
+/* -----------------------------------------------------------
+ Slice span queues
+----------------------------------------------------------- */
+
+static void mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) {
+ // todo: or push to the end?
+ mi_assert_internal(slice->prev == NULL && slice->next==NULL);
+ slice->prev = NULL; // paranoia
+ slice->next = sq->first;
+ sq->first = slice;
+ if (slice->next != NULL) slice->next->prev = slice;
+ else sq->last = slice;
+ slice->xblock_size = 0; // free
+}
+
+static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) {
+ size_t bin = mi_slice_bin(slice_count);
+ mi_span_queue_t* sq = &tld->spans[bin];
+ mi_assert_internal(sq->slice_count >= slice_count);
+ return sq;
+}
+
+static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) {
+ mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
+ // should work too if the queue does not contain slice (which can happen during reclaim)
+ if (slice->prev != NULL) slice->prev->next = slice->next;
+ if (slice == sq->first) sq->first = slice->next;
+ if (slice->next != NULL) slice->next->prev = slice->prev;
+ if (slice == sq->last) sq->last = slice->prev;
+ slice->prev = NULL;
+ slice->next = NULL;
+ slice->xblock_size = 1; // no more free
+}
+
+
+/* -----------------------------------------------------------
+ Invariant checking
+----------------------------------------------------------- */
+
+static bool mi_slice_is_used(const mi_slice_t* slice) {
+ return (slice->xblock_size > 0);
+}
+
+
+#if (MI_DEBUG>=3)
+static bool mi_span_queue_contains(mi_span_queue_t* sq, mi_slice_t* slice) {
+ for (mi_slice_t* s = sq->first; s != NULL; s = s->next) {
+ if (s==slice) return true;
+ }
+ return false;
+}
+
+static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) {
+ mi_assert_internal(segment != NULL);
+ mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
+ mi_assert_internal(segment->abandoned <= segment->used);
+ mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id());
+ mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask)); // can only decommit committed blocks
+ //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0);
+ mi_slice_t* slice = &segment->slices[0];
+ const mi_slice_t* end = mi_segment_slices_end(segment);
+ size_t used_count = 0;
+ mi_span_queue_t* sq;
+ while(slice < end) {
+ mi_assert_internal(slice->slice_count > 0);
+ mi_assert_internal(slice->slice_offset == 0);
+ size_t index = mi_slice_index(slice);
+ size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? segment->slice_entries : index + slice->slice_count) - 1;
+ if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET valid back offsets
+ used_count++;
+ for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET && index + i <= maxindex; i++) {
+ mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t));
+ mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0);
+ mi_assert_internal(i==0 || segment->slices[index + i].xblock_size == 1);
+ }
+ // and the last entry as well (for coalescing)
+ const mi_slice_t* last = slice + slice->slice_count - 1;
+ if (last > slice && last < mi_segment_slices_end(segment)) {
+ mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t));
+ mi_assert_internal(last->slice_count == 0);
+ mi_assert_internal(last->xblock_size == 1);
+ }
+ }
+ else { // free range of slices; only last slice needs a valid back offset
+ mi_slice_t* last = &segment->slices[maxindex];
+ if (segment->kind != MI_SEGMENT_HUGE || slice->slice_count <= (segment->slice_entries - segment->segment_info_slices)) {
+ mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset);
+ }
+ mi_assert_internal(slice == last || last->slice_count == 0 );
+ mi_assert_internal(last->xblock_size == 0 || (segment->kind==MI_SEGMENT_HUGE && last->xblock_size==1));
+ if (segment->kind != MI_SEGMENT_HUGE && segment->thread_id != 0) { // segment is not huge or abandoned
+ sq = mi_span_queue_for(slice->slice_count,tld);
+ mi_assert_internal(mi_span_queue_contains(sq,slice));
+ }
+ }
+ slice = &segment->slices[maxindex+1];
+ }
+ mi_assert_internal(slice == end);
+ mi_assert_internal(used_count == segment->used + 1);
+ return true;
+}
+#endif
+
+/* -----------------------------------------------------------
+ Segment size calculations
+----------------------------------------------------------- */
+
+static size_t mi_segment_info_size(mi_segment_t* segment) {
+ return segment->segment_info_slices * MI_SEGMENT_SLICE_SIZE;
+}
+
+static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t xblock_size, size_t* page_size)
+{
+ ptrdiff_t idx = slice - segment->slices;
+ size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE;
+ // make the start not OS page aligned for smaller blocks to avoid page/cache effects
+ size_t start_offset = (xblock_size >= MI_INTPTR_SIZE && xblock_size <= 1024 ? MI_MAX_ALIGN_GUARANTEE : 0);
+ if (page_size != NULL) { *page_size = psize - start_offset; }
+ return (uint8_t*)segment + ((idx*MI_SEGMENT_SLICE_SIZE) + start_offset);
+}
+
+// Start of the page available memory; can be used on uninitialized pages
+uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
+{
+ const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page);
+ uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, page->xblock_size, page_size);
+ mi_assert_internal(page->xblock_size > 0 || _mi_ptr_page(p) == page);
+ mi_assert_internal(_mi_ptr_segment(p) == segment);
+ return p;
+}
+
+
+static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, size_t* info_slices) {
+ size_t page_size = _mi_os_page_size();
+ size_t isize = _mi_align_up(sizeof(mi_segment_t), page_size);
+ size_t guardsize = 0;
+
+ if (MI_SECURE>0) {
+ // in secure mode, we set up a protected page in between the segment info
+ // and the page data (and one at the end of the segment)
+ guardsize = page_size;
+ required = _mi_align_up(required, page_size);
+ }
+
+ if (pre_size != NULL) *pre_size = isize;
+ isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE);
+ if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE;
+ size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) );
+ mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0);
+ return (segment_size / MI_SEGMENT_SLICE_SIZE);
+}
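+
+/* A worked example, assuming the default 64KiB slices and 64MiB segments: a
+   normal segment (required == 0) yields 64MiB / 64KiB = 1024 slices, of which
+   the first `info_slices` (enough for the mi_segment_t header, plus a guard
+   page in secure mode) are reserved for the segment meta data. */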
+
+
+/* ----------------------------------------------------------------------------
+Segment caches
+We keep a small segment cache per thread to increase local
+reuse and avoid setting/clearing guard pages in secure mode.
+------------------------------------------------------------------------------- */
+
+static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) {
+ if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1);
+ else _mi_stat_decrease(&tld->stats->segments,1);
+ tld->count += (segment_size >= 0 ? 1 : -1);
+ if (tld->count > tld->peak_count) tld->peak_count = tld->count;
+ tld->current_size += segment_size;
+ if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size;
+}
+
+static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
+ segment->thread_id = 0;
+ _mi_segment_map_freed_at(segment);
+ mi_segments_track_size(-((long)mi_segment_size(segment)),tld);
+ if (MI_SECURE>0) {
+ // _mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set
+ // unprotect the guard pages; we cannot just unprotect the whole segment size as part may be decommitted
+ size_t os_pagesize = _mi_os_page_size();
+ _mi_os_unprotect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
+ uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
+ _mi_os_unprotect(end, os_pagesize);
+ }
+
+ // purge delayed decommits now? (no, leave it to the cache)
+ // mi_segment_delayed_decommit(segment,true,tld->stats);
+
+ // _mi_os_free(segment, mi_segment_size(segment), /*segment->memid,*/ tld->stats);
+ const size_t size = mi_segment_size(segment);
+ if (size != MI_SEGMENT_SIZE || !_mi_segment_cache_push(segment, size, segment->memid, &segment->commit_mask, &segment->decommit_mask, segment->mem_is_large, segment->mem_is_pinned, tld->os)) {
+ const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);
+ if (csize > 0 && !segment->mem_is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize);
+ _mi_abandoned_await_readers(); // wait until safe to free
+ _mi_arena_free(segment, mi_segment_size(segment), segment->memid, segment->mem_is_pinned /* pretend not committed to not double count decommits */, tld->os);
+ }
+}
+
+// called by threads that are terminating
+void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
+ MI_UNUSED(tld);
+ // nothing to do
+}
+
+
+/* -----------------------------------------------------------
+ Span management
+----------------------------------------------------------- */
+
+static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uint8_t* p, size_t size, uint8_t** start_p, size_t* full_size, mi_commit_mask_t* cm) {
+ mi_assert_internal(_mi_ptr_segment(p) == segment);
+ mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
+ mi_commit_mask_create_empty(cm);
+ if (size == 0 || size > MI_SEGMENT_SIZE || segment->kind == MI_SEGMENT_HUGE) return;
+ const size_t segstart = mi_segment_info_size(segment);
+ const size_t segsize = mi_segment_size(segment);
+ if (p >= (uint8_t*)segment + segsize) return;
+
+ size_t pstart = (p - (uint8_t*)segment);
+ mi_assert_internal(pstart + size <= segsize);
+
+ size_t start;
+ size_t end;
+ if (conservative) {
+ // decommit conservative
+ start = _mi_align_up(pstart, MI_COMMIT_SIZE);
+ end = _mi_align_down(pstart + size, MI_COMMIT_SIZE);
+ mi_assert_internal(start >= segstart);
+ mi_assert_internal(end <= segsize);
+ }
+ else {
+ // commit liberal
+ start = _mi_align_down(pstart, MI_MINIMAL_COMMIT_SIZE);
+ end = _mi_align_up(pstart + size, MI_MINIMAL_COMMIT_SIZE);
+ }
+ if (pstart >= segstart && start < segstart) { // note: the mask is also calculated for an initial commit of the info area
+ start = segstart;
+ }
+ if (end > segsize) {
+ end = segsize;
+ }
+
+ mi_assert_internal(start <= pstart && (pstart + size) <= end);
+ mi_assert_internal(start % MI_COMMIT_SIZE==0 && end % MI_COMMIT_SIZE == 0);
+ *start_p = (uint8_t*)segment + start;
+ *full_size = (end > start ? end - start : 0);
+ if (*full_size == 0) return;
+
+ size_t bitidx = start / MI_COMMIT_SIZE;
+ mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
+
+ size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0
+ if (bitidx + bitcount > MI_COMMIT_MASK_BITS) {
+ _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size);
+ }
+ mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
+ mi_commit_mask_create(bitidx, bitcount, cm);
+}
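+
+/* A worked example with illustrative sizes (say MI_COMMIT_SIZE is 64KiB): for
+   p at offset 100KiB with size 200KiB, i.e. the range [100KiB,300KiB):
+   - decommit (conservative): align inward to [128KiB,256KiB) so that only
+     fully covered commit blocks are decommitted;
+   - commit (liberal): align outward to MI_MINIMAL_COMMIT_SIZE boundaries so
+     that the whole requested range is certainly committed. */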
+
+
+static bool mi_segment_commitx(mi_segment_t* segment, bool commit, uint8_t* p, size_t size, mi_stats_t* stats) {
+ mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+
+  // try to commit in chunks of at least MI_MINIMAL_COMMIT_SIZE.
+ /*
+ if (commit && size > 0) {
+ const size_t csize = _mi_align_up(size, MI_MINIMAL_COMMIT_SIZE);
+ if (p + csize <= mi_segment_end(segment)) {
+ size = csize;
+ }
+ }
+ */
+ // commit liberal, but decommit conservative
+ uint8_t* start = NULL;
+ size_t full_size = 0;
+ mi_commit_mask_t mask;
+ mi_segment_commit_mask(segment, !commit/*conservative*/, p, size, &start, &full_size, &mask);
+ if (mi_commit_mask_is_empty(&mask) || full_size==0) return true;
+
+ if (commit && !mi_commit_mask_all_set(&segment->commit_mask, &mask)) {
+ bool is_zero = false;
+ mi_commit_mask_t cmask;
+ mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
+ _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
+ if (!_mi_os_commit(start,full_size,&is_zero,stats)) return false;
+ mi_commit_mask_set(&segment->commit_mask, &mask);
+ }
+ else if (!commit && mi_commit_mask_any_set(&segment->commit_mask, &mask)) {
+ mi_assert_internal((void*)start != (void*)segment);
+ //mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &mask));
+
+ mi_commit_mask_t cmask;
+ mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
+ _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
+ if (segment->allow_decommit) {
+ _mi_os_decommit(start, full_size, stats); // ok if this fails
+ }
+ mi_commit_mask_clear(&segment->commit_mask, &mask);
+ }
+  // if part of a delayed decommit is being reused, push out its expiration
+ if (commit && mi_commit_mask_any_set(&segment->decommit_mask, &mask)) {
+ segment->decommit_expire = _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
+ }
+ // always undo delayed decommits
+ mi_commit_mask_clear(&segment->decommit_mask, &mask);
+ return true;
+}
+
+static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
+ mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+ // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow
+ if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->decommit_mask)) return true; // fully committed
+ return mi_segment_commitx(segment,true,p,size,stats);
+}
+
+static void mi_segment_perhaps_decommit(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
+ if (!segment->allow_decommit) return;
+ if (mi_option_get(mi_option_decommit_delay) == 0) {
+ mi_segment_commitx(segment, false, p, size, stats);
+ }
+ else {
+ // register for future decommit in the decommit mask
+ uint8_t* start = NULL;
+ size_t full_size = 0;
+ mi_commit_mask_t mask;
+ mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask);
+ if (mi_commit_mask_is_empty(&mask) || full_size==0) return;
+
+    // update the delayed decommit mask
+ mi_assert_internal(segment->decommit_expire > 0 || mi_commit_mask_is_empty(&segment->decommit_mask));
+ mi_commit_mask_t cmask;
+ mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); // only decommit what is committed; span_free may try to decommit more
+ mi_commit_mask_set(&segment->decommit_mask, &cmask);
+ mi_msecs_t now = _mi_clock_now();
+ if (segment->decommit_expire == 0) {
+ // no previous decommits, initialize now
+ segment->decommit_expire = now + mi_option_get(mi_option_decommit_delay);
+ }
+ else if (segment->decommit_expire <= now) {
+ // previous decommit mask already expired
+ // mi_segment_delayed_decommit(segment, true, stats);
+      segment->decommit_expire = now + mi_option_get(mi_option_decommit_extend_delay); // wait a tiny bit longer in case there is a series of frees
+ }
+ else {
+ // previous decommit mask is not yet expired, increase the expiration by a bit.
+ segment->decommit_expire += mi_option_get(mi_option_decommit_extend_delay);
+ }
+ }
+}
+
+static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats) {
+ if (!segment->allow_decommit || mi_commit_mask_is_empty(&segment->decommit_mask)) return;
+ mi_msecs_t now = _mi_clock_now();
+ if (!force && now < segment->decommit_expire) return;
+
+ mi_commit_mask_t mask = segment->decommit_mask;
+ segment->decommit_expire = 0;
+ mi_commit_mask_create_empty(&segment->decommit_mask);
+
+ size_t idx;
+ size_t count;
+ mi_commit_mask_foreach(&mask, idx, count) {
+ // if found, decommit that sequence
+ if (count > 0) {
+ uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE);
+ size_t size = count * MI_COMMIT_SIZE;
+ mi_segment_commitx(segment, false, p, size, stats);
+ }
+ }
+ mi_commit_mask_foreach_end()
+ mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+}
+
+
+static bool mi_segment_is_abandoned(mi_segment_t* segment) {
+ return (segment->thread_id == 0);
+}
+
+// note: can be called on abandoned segments
+static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) {
+ mi_assert_internal(slice_index < segment->slice_entries);
+ mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment)
+ ? NULL : mi_span_queue_for(slice_count,tld));
+ if (slice_count==0) slice_count = 1;
+ mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries);
+
+ // set first and last slice (the intermediates can be undetermined)
+ mi_slice_t* slice = &segment->slices[slice_index];
+ slice->slice_count = (uint32_t)slice_count;
+ mi_assert_internal(slice->slice_count == slice_count); // no overflow?
+ slice->slice_offset = 0;
+ if (slice_count > 1) {
+ mi_slice_t* last = &segment->slices[slice_index + slice_count - 1];
+ last->slice_count = 0;
+ last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1));
+ last->xblock_size = 0;
+ }
+
+ // perhaps decommit
+ mi_segment_perhaps_decommit(segment,mi_slice_start(slice),slice_count*MI_SEGMENT_SLICE_SIZE,tld->stats);
+
+ // and push it on the free page queue (if it was not a huge page)
+ if (sq != NULL) mi_span_queue_push( sq, slice );
+ else slice->xblock_size = 0; // mark huge page as free anyways
+}
+
+/*
+// called from reclaim to add existing free spans
+static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) {
+ mi_segment_t* segment = _mi_ptr_segment(slice);
+ mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
+ size_t slice_index = mi_slice_index(slice);
+ mi_segment_span_free(segment,slice_index,slice->slice_count,tld);
+}
+*/
+
+static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) {
+ mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->xblock_size==0);
+ mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE);
+ mi_span_queue_t* sq = mi_span_queue_for(slice->slice_count, tld);
+ mi_span_queue_delete(sq, slice);
+}
+
+// note: can be called on abandoned segments
+static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) {
+ mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0);
+ mi_segment_t* segment = _mi_ptr_segment(slice);
+ bool is_abandoned = mi_segment_is_abandoned(segment);
+
+ // for huge pages, just mark as free but don't add to the queues
+ if (segment->kind == MI_SEGMENT_HUGE) {
+ mi_assert_internal(segment->used == 1); // decreased right after this call in `mi_segment_page_clear`
+ slice->xblock_size = 0; // mark as free anyways
+ // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to
+ // avoid a possible cache miss (and the segment is about to be freed)
+ return slice;
+ }
+
+ // otherwise coalesce the span and add to the free span queues
+ size_t slice_count = slice->slice_count;
+ mi_slice_t* next = slice + slice->slice_count;
+ mi_assert_internal(next <= mi_segment_slices_end(segment));
+ if (next < mi_segment_slices_end(segment) && next->xblock_size==0) {
+ // free next block -- remove it from free and merge
+ mi_assert_internal(next->slice_count > 0 && next->slice_offset==0);
+ slice_count += next->slice_count; // extend
+ if (!is_abandoned) { mi_segment_span_remove_from_queue(next, tld); }
+ }
+ if (slice > segment->slices) {
+ mi_slice_t* prev = mi_slice_first(slice - 1);
+ mi_assert_internal(prev >= segment->slices);
+ if (prev->xblock_size==0) {
+ // free previous slice -- remove it from free and merge
+ mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0);
+ slice_count += prev->slice_count;
+ if (!is_abandoned) { mi_segment_span_remove_from_queue(prev, tld); }
+ slice = prev;
+ }
+ }
+
+ // and add the new free page
+ mi_segment_span_free(segment, mi_slice_index(slice), slice_count, tld);
+ return slice;
+}
+
+
+static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) {
+ mi_assert_internal(_mi_ptr_segment(slice)==segment);
+ mi_assert_internal(slice->slice_count >= slice_count);
+ mi_assert_internal(slice->xblock_size > 0); // no more in free queue
+ if (slice->slice_count <= slice_count) return;
+ mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
+ size_t next_index = mi_slice_index(slice) + slice_count;
+ size_t next_count = slice->slice_count - slice_count;
+ mi_segment_span_free(segment, next_index, next_count, tld);
+ slice->slice_count = (uint32_t)slice_count;
+}
+
+// Note: may still return NULL if committing the memory failed
+static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) {
+ mi_assert_internal(slice_index < segment->slice_entries);
+ mi_slice_t* slice = &segment->slices[slice_index];
+ mi_assert_internal(slice->xblock_size==0 || slice->xblock_size==1);
+
+ // commit before changing the slice data
+ if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats)) {
+ return NULL; // commit failed!
+ }
+
+ // convert the slices to a page
+ slice->slice_offset = 0;
+ slice->slice_count = (uint32_t)slice_count;
+ mi_assert_internal(slice->slice_count == slice_count);
+ const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE;
+ slice->xblock_size = (uint32_t)(bsize >= MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : bsize);
+ mi_page_t* page = mi_slice_to_page(slice);
+ mi_assert_internal(mi_page_block_size(page) == bsize);
+
+ // set slice back pointers for the first MI_MAX_SLICE_OFFSET entries
+ size_t extra = slice_count-1;
+ if (extra > MI_MAX_SLICE_OFFSET) extra = MI_MAX_SLICE_OFFSET;
+  if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1; // huge objects may have more slices than available entries in the segment->slices
+ slice++;
+ for (size_t i = 1; i <= extra; i++, slice++) {
+ slice->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i);
+ slice->slice_count = 0;
+ slice->xblock_size = 1;
+ }
+
+ // and also for the last one (if not set already) (the last one is needed for coalescing)
+ // note: the cast is needed for ubsan since the index can be larger than MI_SLICES_PER_SEGMENT for huge allocations (see #543)
+ mi_slice_t* last = &((mi_slice_t*)segment->slices)[slice_index + slice_count - 1];
+ if (last < mi_segment_slices_end(segment) && last >= slice) {
+ last->slice_offset = (uint32_t)(sizeof(mi_slice_t)*(slice_count-1));
+ last->slice_count = 0;
+ last->xblock_size = 1;
+ }
+
+ // and initialize the page
+ page->is_reset = false;
+ page->is_committed = true;
+ segment->used++;
+ return page;
+}
+
+static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_segments_tld_t* tld) {
+ mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_LARGE_OBJ_SIZE_MAX);
+ // search from best fit up
+ mi_span_queue_t* sq = mi_span_queue_for(slice_count, tld);
+ if (slice_count == 0) slice_count = 1;
+ while (sq <= &tld->spans[MI_SEGMENT_BIN_MAX]) {
+ for (mi_slice_t* slice = sq->first; slice != NULL; slice = slice->next) {
+ if (slice->slice_count >= slice_count) {
+ // found one
+ mi_span_queue_delete(sq, slice);
+ mi_segment_t* segment = _mi_ptr_segment(slice);
+ if (slice->slice_count > slice_count) {
+ mi_segment_slice_split(segment, slice, slice_count, tld);
+ }
+ mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->xblock_size > 0);
+ mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld);
+ if (page == NULL) {
+ // commit failed; return NULL but first restore the slice
+ mi_segment_span_free_coalesce(slice, tld);
+ return NULL;
+ }
+ return page;
+ }
+ }
+ sq++;
+ }
+ // could not find a page..
+ return NULL;
+}
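+
+/* For example, a request for 10 slices starts at bin mi_slice_bin(10) == 8 and
+   scans upward: the first span with slice_count >= 10 is taken off its queue,
+   split so that any remainder goes back onto a (smaller) span queue, and the
+   requested part is turned into a page. */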
+
+
+/* -----------------------------------------------------------
+ Segment allocation
+----------------------------------------------------------- */
+
+// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` .
+static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page)
+{
+ mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL));
+ mi_assert_internal((segment==NULL) || (segment!=NULL && required==0));
+ // calculate needed sizes first
+ size_t info_slices;
+ size_t pre_size;
+ const size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices);
+ const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices);
+ const size_t segment_size = segment_slices * MI_SEGMENT_SLICE_SIZE;
+
+  // Commit eagerly only beyond a thread's first N segments (to reduce the impact of many threads that each allocate just a little)
+ const bool eager_delay = (// !_mi_os_has_overcommit() && // never delay on overcommit systems
+ _mi_current_thread_count() > 1 && // do not delay for the first N threads
+ tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
+ const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit);
+ bool commit = eager || (required > 0);
+
+ // Try to get from our cache first
+ bool is_zero = false;
+ const bool commit_info_still_good = (segment != NULL);
+ mi_commit_mask_t commit_mask;
+ mi_commit_mask_t decommit_mask;
+ if (segment != NULL) {
+ commit_mask = segment->commit_mask;
+ decommit_mask = segment->decommit_mask;
+ }
+ else {
+ mi_commit_mask_create_empty(&commit_mask);
+ mi_commit_mask_create_empty(&decommit_mask);
+ }
+ if (segment==NULL) {
+ // Allocate the segment from the OS
+ bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy
+ bool is_pinned = false;
+ size_t memid = 0;
+ segment = (mi_segment_t*)_mi_segment_cache_pop(segment_size, &commit_mask, &decommit_mask, &mem_large, &is_pinned, &is_zero, &memid, os_tld);
+ if (segment==NULL) {
+ segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_pinned, &is_zero, &memid, os_tld);
+ if (segment == NULL) return NULL; // failed to allocate
+ if (commit) {
+ mi_commit_mask_create_full(&commit_mask);
+ }
+ else {
+ mi_commit_mask_create_empty(&commit_mask);
+ }
+ }
+ mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
+
+ const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
+ mi_assert_internal(commit_needed>0);
+ mi_commit_mask_t commit_needed_mask;
+ mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
+ if (!mi_commit_mask_all_set(&commit_mask, &commit_needed_mask)) {
+ // at least commit the info slices
+ mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= info_slices*MI_SEGMENT_SLICE_SIZE);
+ bool ok = _mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, &is_zero, tld->stats);
+ if (!ok) return NULL; // failed to commit
+ mi_commit_mask_set(&commit_mask, &commit_needed_mask);
+ }
+ segment->memid = memid;
+ segment->mem_is_pinned = is_pinned;
+ segment->mem_is_large = mem_large;
+ segment->mem_is_committed = mi_commit_mask_is_full(&commit_mask);
+ mi_segments_track_size((long)(segment_size), tld);
+ _mi_segment_map_allocated_at(segment);
+ }
+
+ // zero the segment info? -- not always needed as it is zero initialized from the OS
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); // tsan
+ if (!is_zero) {
+ ptrdiff_t ofs = offsetof(mi_segment_t, next);
+ size_t prefix = offsetof(mi_segment_t, slices) - ofs;
+ memset((uint8_t*)segment+ofs, 0, prefix + sizeof(mi_slice_t)*segment_slices);
+ }
+
+ if (!commit_info_still_good) {
+ segment->commit_mask = commit_mask; // on lazy commit, the initial part is always committed
+ segment->allow_decommit = (mi_option_is_enabled(mi_option_allow_decommit) && !segment->mem_is_pinned && !segment->mem_is_large);
+ if (segment->allow_decommit) {
+ segment->decommit_expire = _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
+ segment->decommit_mask = decommit_mask;
+ mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+ #if MI_DEBUG>2
+ const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
+ mi_commit_mask_t commit_needed_mask;
+ mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
+ mi_assert_internal(!mi_commit_mask_any_set(&segment->decommit_mask, &commit_needed_mask));
+ #endif
+ }
+ else {
+ mi_assert_internal(mi_commit_mask_is_empty(&decommit_mask));
+ segment->decommit_expire = 0;
+ mi_commit_mask_create_empty( &segment->decommit_mask );
+ mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+ }
+ }
+
+
+ // initialize segment info
+ segment->segment_slices = segment_slices;
+ segment->segment_info_slices = info_slices;
+ segment->thread_id = _mi_thread_id();
+ segment->cookie = _mi_ptr_cookie(segment);
+ segment->slice_entries = slice_entries;
+ segment->kind = (required == 0 ? MI_SEGMENT_NORMAL : MI_SEGMENT_HUGE);
+
+ // memset(segment->slices, 0, sizeof(mi_slice_t)*(info_slices+1));
+ _mi_stat_increase(&tld->stats->page_committed, mi_segment_info_size(segment));
+
+ // set up guard pages
+ size_t guard_slices = 0;
+ if (MI_SECURE>0) {
+ // in secure mode, we set up a protected page in between the segment info
+ // and the page data, and at the end of the segment.
+ size_t os_pagesize = _mi_os_page_size();
+ mi_assert_internal(mi_segment_info_size(segment) - os_pagesize >= pre_size);
+ _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
+ uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
+ mi_segment_ensure_committed(segment, end, os_pagesize, tld->stats);
+ _mi_os_protect(end, os_pagesize);
+ if (slice_entries == segment_slices) segment->slice_entries--; // don't use the last slice :-(
+ guard_slices = 1;
+ }
+
+ // reserve first slices for segment info
+ mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices, tld);
+ mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance
+ mi_assert_internal(segment->used == 1);
+ segment->used = 0; // don't count our internal slices towards usage
+
+ // initialize initial free pages
+ if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page
+ mi_assert_internal(huge_page==NULL);
+ mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, tld);
+ }
+ else {
+ mi_assert_internal(huge_page!=NULL);
+ mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+ mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask));
+ *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld);
+ mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance
+ }
+
+ mi_assert_expensive(mi_segment_is_valid(segment,tld));
+ return segment;
+}
+
+
+// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` .
+static mi_segment_t* mi_segment_alloc(size_t required, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page) {
+ return mi_segment_init(NULL, required, tld, os_tld, huge_page);
+}
+
+
+static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
+ MI_UNUSED(force);
+ mi_assert_internal(segment != NULL);
+ mi_assert_internal(segment->next == NULL);
+ mi_assert_internal(segment->used == 0);
+
+ // Remove the free pages
+ mi_slice_t* slice = &segment->slices[0];
+ const mi_slice_t* end = mi_segment_slices_end(segment);
+ size_t page_count = 0;
+ while (slice < end) {
+ mi_assert_internal(slice->slice_count > 0);
+ mi_assert_internal(slice->slice_offset == 0);
+ mi_assert_internal(mi_slice_index(slice)==0 || slice->xblock_size == 0); // no more used pages ..
+ if (slice->xblock_size == 0 && segment->kind != MI_SEGMENT_HUGE) {
+ mi_segment_span_remove_from_queue(slice, tld);
+ }
+ page_count++;
+ slice = slice + slice->slice_count;
+ }
+ mi_assert_internal(page_count == 2); // first page is allocated by the segment itself
+
+ // stats
+ _mi_stat_decrease(&tld->stats->page_committed, mi_segment_info_size(segment));
+
+ // return it to the OS
+ mi_segment_os_free(segment, tld);
+}
+
+
+/* -----------------------------------------------------------
+ Page Free
+----------------------------------------------------------- */
+
+static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);
+
+// note: can be called on abandoned pages
+static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) {
+ mi_assert_internal(page->xblock_size > 0);
+ mi_assert_internal(mi_page_all_free(page));
+ mi_segment_t* segment = _mi_ptr_segment(page);
+ mi_assert_internal(segment->used > 0);
+
+ size_t inuse = page->capacity * mi_page_block_size(page);
+ _mi_stat_decrease(&tld->stats->page_committed, inuse);
+ _mi_stat_decrease(&tld->stats->pages, 1);
+
+ // reset the page memory to reduce memory pressure?
+ if (!segment->mem_is_pinned && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
+ size_t psize;
+ uint8_t* start = _mi_page_start(segment, page, &psize);
+ page->is_reset = true;
+ _mi_os_reset(start, psize, tld->stats);
+ }
+
+ // zero the page data, but not the segment fields
+ page->is_zero_init = false;
+ ptrdiff_t ofs = offsetof(mi_page_t, capacity);
+ memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
+ page->xblock_size = 1;
+
+ // and free it
+ mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld);
+ segment->used--;
+ // cannot assert segment valid as it is called during reclaim
+ // mi_assert_expensive(mi_segment_is_valid(segment, tld));
+ return slice;
+}
+
+void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld)
+{
+ mi_assert(page != NULL);
+
+ mi_segment_t* segment = _mi_page_segment(page);
+ mi_assert_expensive(mi_segment_is_valid(segment,tld));
+
+ // mark it as free now
+ mi_segment_page_clear(page, tld);
+ mi_assert_expensive(mi_segment_is_valid(segment, tld));
+
+ if (segment->used == 0) {
+ // no more used pages; remove from the free list and free the segment
+ mi_segment_free(segment, force, tld);
+ }
+ else if (segment->used == segment->abandoned) {
+ // only abandoned pages; remove from free list and abandon
+ mi_segment_abandon(segment,tld);
+ }
+}
+
+
+/* -----------------------------------------------------------
+Abandonment
+
+When threads terminate, they can leave segments with
+live blocks (reachable through other threads). Such segments
+are "abandoned" and will be reclaimed by other threads to
+reuse their pages and/or free them eventually.
+
+We maintain a global list of abandoned segments that are
+reclaimed on demand. Since this is shared among threads
+the implementation needs to avoid the A-B-A problem on
+popping abandoned segments: <https://en.wikipedia.org/wiki/ABA_problem>
+We use tagged pointers to avoid accidentally identifying
+reused segments, much like stamped references in Java.
+Secondly, we maintain a reader counter to avoid resetting
+or decommitting segments that have a pending read operation.
+
+Note: the current implementation is one possible design;
+another way might be to keep track of abandoned segments
+in the arenas / segment cache. This would have the advantage of keeping
+all concurrent code in one place and not needing to deal
+with ABA issues. The drawback is that it is unclear how to
+scan abandoned segments efficiently in that case as they
+would be spread among all other segments in the arenas.
+----------------------------------------------------------- */
+
+// Use the low bits of the MI_SEGMENT_SIZE aligned segment pointers
+// (26 bits for the default 64MiB segments) for a tag that increments
+// on every update, to avoid the A-B-A problem.
+#define MI_TAGGED_MASK MI_SEGMENT_MASK
+typedef uintptr_t mi_tagged_segment_t;
+
+static mi_segment_t* mi_tagged_segment_ptr(mi_tagged_segment_t ts) {
+ return (mi_segment_t*)(ts & ~MI_TAGGED_MASK);
+}
+
+static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_segment_t ts) {
+ mi_assert_internal(((uintptr_t)segment & MI_TAGGED_MASK) == 0);
+ uintptr_t tag = ((ts & MI_TAGGED_MASK) + 1) & MI_TAGGED_MASK;
+ return ((uintptr_t)segment | tag);
+}
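+
+/* A worked example: segments are MI_SEGMENT_SIZE aligned so their low bits are
+   zero and can carry the tag. If the list head is (P | 5) and a concurrent
+   pop/push cycle reinstalls the same pointer P, the head becomes (P | 6); a
+   compare-and-swap that still expects (P | 5) now fails, which is exactly
+   what prevents the A-B-A problem. */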
+
+// This is a list of visited abandoned segments that were full at the time;
+// it migrates to `abandoned` when that list becomes NULL. Using this second
+// list reduces contention and the rate at which segments are visited.
+static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = NULL
+
+// The abandoned segment list (tagged as it supports pop)
+static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL
+
+// Maintain these for debug purposes (these counts may be a bit off)
+static mi_decl_cache_align _Atomic(size_t) abandoned_count;
+static mi_decl_cache_align _Atomic(size_t) abandoned_visited_count;
+
+// We also maintain a count of current readers of the abandoned list
+// in order to prevent resetting/decommitting segment memory if it might
+// still be read.
+static mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0
+
+// Push on the visited list
+static void mi_abandoned_visited_push(mi_segment_t* segment) {
+ mi_assert_internal(segment->thread_id == 0);
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL);
+ mi_assert_internal(segment->next == NULL);
+ mi_assert_internal(segment->used > 0);
+ mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited);
+ do {
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext);
+ } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment));
+ mi_atomic_increment_relaxed(&abandoned_visited_count);
+}
+
+// Move the visited list to the abandoned list.
+static bool mi_abandoned_visited_revisit(void)
+{
+ // quick check if the visited list is empty
+ if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false;
+
+ // grab the whole visited list
+ mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL);
+ if (first == NULL) return false;
+
+ // first try to swap directly if the abandoned list happens to be NULL
+ mi_tagged_segment_t afirst;
+ mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+ if (mi_tagged_segment_ptr(ts)==NULL) {
+ size_t count = mi_atomic_load_relaxed(&abandoned_visited_count);
+ afirst = mi_tagged_segment(first, ts);
+ if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) {
+ mi_atomic_add_relaxed(&abandoned_count, count);
+ mi_atomic_sub_relaxed(&abandoned_visited_count, count);
+ return true;
+ }
+ }
+
+ // find the last element of the visited list: O(n)
+ mi_segment_t* last = first;
+ mi_segment_t* next;
+ while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) {
+ last = next;
+ }
+
+ // and atomically prepend to the abandoned list
+ // (no need to increase the readers as we don't access the abandoned segments)
+ mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned);
+ size_t count;
+ do {
+ count = mi_atomic_load_relaxed(&abandoned_visited_count);
+ mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext));
+ afirst = mi_tagged_segment(first, anext);
+ } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst));
+ mi_atomic_add_relaxed(&abandoned_count, count);
+ mi_atomic_sub_relaxed(&abandoned_visited_count, count);
+ return true;
+}
+
+// Push on the abandoned list.
+static void mi_abandoned_push(mi_segment_t* segment) {
+ mi_assert_internal(segment->thread_id == 0);
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+ mi_assert_internal(segment->next == NULL);
+ mi_assert_internal(segment->used > 0);
+ mi_tagged_segment_t next;
+ mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+ do {
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts));
+ next = mi_tagged_segment(segment, ts);
+ } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next));
+ mi_atomic_increment_relaxed(&abandoned_count);
+}
+
+// Wait until there are no more pending reads on segments that used to be in the abandoned list
+// called for example from `arena.c` before decommitting
+void _mi_abandoned_await_readers(void) {
+ size_t n;
+ do {
+ n = mi_atomic_load_acquire(&abandoned_readers);
+ if (n != 0) mi_atomic_yield();
+ } while (n != 0);
+}
+
+// Pop from the abandoned list
+static mi_segment_t* mi_abandoned_pop(void) {
+ mi_segment_t* segment;
+ // Check efficiently if it is empty (or if the visited list needs to be moved)
+ mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+ segment = mi_tagged_segment_ptr(ts);
+ if (mi_likely(segment == NULL)) {
+ if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL
+ return NULL;
+ }
+ }
+
+ // Do a pop. We use a reader count to prevent
+  // a segment from being decommitted while a read is still pending,
+ // and a tagged pointer to prevent A-B-A link corruption.
+ // (this is called from `region.c:_mi_mem_free` for example)
+ mi_atomic_increment_relaxed(&abandoned_readers); // ensure no segment gets decommitted
+ mi_tagged_segment_t next = 0;
+ ts = mi_atomic_load_acquire(&abandoned);
+ do {
+ segment = mi_tagged_segment_ptr(ts);
+ if (segment != NULL) {
+ mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next);
+ next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted
+ }
+ } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next));
+ mi_atomic_decrement_relaxed(&abandoned_readers); // release reader lock
+ if (segment != NULL) {
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
+ mi_atomic_decrement_relaxed(&abandoned_count);
+ }
+ return segment;
+}
+
+/* -----------------------------------------------------------
+ Abandon segment/page
+----------------------------------------------------------- */
+
+static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
+ mi_assert_internal(segment->used == segment->abandoned);
+ mi_assert_internal(segment->used > 0);
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+ mi_assert_internal(segment->abandoned_visits == 0);
+ mi_assert_expensive(mi_segment_is_valid(segment,tld));
+
+ // remove the free pages from the free page queues
+ mi_slice_t* slice = &segment->slices[0];
+ const mi_slice_t* end = mi_segment_slices_end(segment);
+ while (slice < end) {
+ mi_assert_internal(slice->slice_count > 0);
+ mi_assert_internal(slice->slice_offset == 0);
+ if (slice->xblock_size == 0) { // a free page
+ mi_segment_span_remove_from_queue(slice,tld);
+ slice->xblock_size = 0; // but keep it free
+ }
+ slice = slice + slice->slice_count;
+ }
+
+ // perform delayed decommits
+ mi_segment_delayed_decommit(segment, mi_option_is_enabled(mi_option_abandoned_page_decommit) /* force? */, tld->stats);
+
+ // all pages in the segment are abandoned; add it to the abandoned list
+ _mi_stat_increase(&tld->stats->segments_abandoned, 1);
+ mi_segments_track_size(-((long)mi_segment_size(segment)), tld);
+ segment->thread_id = 0;
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
+ segment->abandoned_visits = 1; // from 0 to 1 to signify it is abandoned
+ mi_abandoned_push(segment);
+}
+
+void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
+ mi_assert(page != NULL);
+ mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+ mi_assert_internal(mi_page_heap(page) == NULL);
+ mi_segment_t* segment = _mi_page_segment(page);
+
+ mi_assert_expensive(mi_segment_is_valid(segment,tld));
+ segment->abandoned++;
+
+ _mi_stat_increase(&tld->stats->pages_abandoned, 1);
+ mi_assert_internal(segment->abandoned <= segment->used);
+ if (segment->used == segment->abandoned) {
+ // all pages are abandoned, abandon the entire segment
+ mi_segment_abandon(segment, tld);
+ }
+}
+
+/* -----------------------------------------------------------
+ Reclaim abandoned pages
+----------------------------------------------------------- */
+
+static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice_t** end) {
+ mi_slice_t* slice = &segment->slices[0];
+ *end = mi_segment_slices_end(segment);
+ mi_assert_internal(slice->slice_count>0 && slice->xblock_size>0); // segment allocated page
+ slice = slice + slice->slice_count; // skip the first segment allocated page
+ return slice;
+}
+
+// Possibly free pages and check if free space is available
+static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld)
+{
+ mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
+ mi_assert_internal(mi_segment_is_abandoned(segment));
+ bool has_page = false;
+
+ // for all slices
+ const mi_slice_t* end;
+ mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
+ while (slice < end) {
+ mi_assert_internal(slice->slice_count > 0);
+ mi_assert_internal(slice->slice_offset == 0);
+ if (mi_slice_is_used(slice)) { // used page
+ // ensure used count is up to date and collect potential concurrent frees
+ mi_page_t* const page = mi_slice_to_page(slice);
+ _mi_page_free_collect(page, false);
+ if (mi_page_all_free(page)) {
+ // if this page is all free now, free it without adding to any queues (yet)
+ mi_assert_internal(page->next == NULL && page->prev==NULL);
+ _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
+ segment->abandoned--;
+ slice = mi_segment_page_clear(page, tld); // re-assign slice due to coalescing!
+ mi_assert_internal(!mi_slice_is_used(slice));
+ if (slice->slice_count >= slices_needed) {
+ has_page = true;
+ }
+ }
+ else {
+ if (page->xblock_size == block_size && mi_page_has_any_available(page)) {
+ // a page has available free blocks of the right size
+ has_page = true;
+ }
+ }
+ }
+ else {
+ // empty span
+ if (slice->slice_count >= slices_needed) {
+ has_page = true;
+ }
+ }
+ slice = slice + slice->slice_count;
+ }
+ return has_page;
+}
+
+// Reclaim an abandoned segment; returns NULL if the segment was freed
+// set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full.
+static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) {
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+ mi_assert_expensive(mi_segment_is_valid(segment, tld));
+ if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; }
+
+ segment->thread_id = _mi_thread_id();
+ segment->abandoned_visits = 0;
+ mi_segments_track_size((long)mi_segment_size(segment), tld);
+ mi_assert_internal(segment->next == NULL);
+ _mi_stat_decrease(&tld->stats->segments_abandoned, 1);
+
+ // for all slices
+ const mi_slice_t* end;
+ mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
+ while (slice < end) {
+ mi_assert_internal(slice->slice_count > 0);
+ mi_assert_internal(slice->slice_offset == 0);
+ if (mi_slice_is_used(slice)) {
+ // in use: reclaim the page in our heap
+ mi_page_t* page = mi_slice_to_page(slice);
+ mi_assert_internal(!page->is_reset);
+ mi_assert_internal(page->is_committed);
+ mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+ mi_assert_internal(mi_page_heap(page) == NULL);
+ mi_assert_internal(page->next == NULL && page->prev==NULL);
+ _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
+ segment->abandoned--;
+ // set the heap again and allow delayed free again
+ mi_page_set_heap(page, heap);
+ _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
+ _mi_page_free_collect(page, false); // ensure used count is up to date
+ if (mi_page_all_free(page)) {
+ // if everything free by now, free the page
+ slice = mi_segment_page_clear(page, tld); // set slice again due to coalescing
+ }
+ else {
+ // otherwise reclaim it into the heap
+ _mi_page_reclaim(heap, page);
+ if (requested_block_size == page->xblock_size && mi_page_has_any_available(page)) {
+ if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; }
+ }
+ }
+ }
+ else {
+ // the span is free, add it to our page queues
+ slice = mi_segment_span_free_coalesce(slice, tld); // set slice again due to coalescing
+ }
+ mi_assert_internal(slice->slice_count>0 && slice->slice_offset==0);
+ slice = slice + slice->slice_count;
+ }
+
+ mi_assert(segment->abandoned == 0);
+ if (segment->used == 0) { // due to page_clear
+ mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed));
+ mi_segment_free(segment, false, tld);
+ return NULL;
+ }
+ else {
+ return segment;
+ }
+}
+
+
+void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) {
+ mi_segment_t* segment;
+ while ((segment = mi_abandoned_pop()) != NULL) {
+ mi_segment_reclaim(segment, heap, 0, NULL, tld);
+ }
+}
+
+static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld)
+{
+ *reclaimed = false;
+ mi_segment_t* segment;
+ long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024); // limit the work to bound allocation times
+ while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
+ segment->abandoned_visits++;
+ bool has_page = mi_segment_check_free(segment,needed_slices,block_size,tld); // try to free up pages (due to concurrent frees)
+ if (segment->used == 0) {
+ // free the segment (by forced reclaim) to make it available to other threads.
+ // note1: we prefer to free a segment as that might lead to reclaiming another
+ // segment that is still partially used.
+ // note2: we could in principle optimize this by skipping reclaim and directly
+ // freeing, but that would temporarily violate some invariants.
+ mi_segment_reclaim(segment, heap, 0, NULL, tld);
+ }
+ else if (has_page) {
+ // found a large enough free span, or a page of the right block_size with free space
+ // we return the result of reclaim (which is usually `segment`) as it might free
+ // the segment due to concurrent frees (in which case `NULL` is returned).
+ return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
+ }
+ else if (segment->abandoned_visits > 3) {
+ // always reclaim on 3rd visit to limit the abandoned queue length.
+ mi_segment_reclaim(segment, heap, 0, NULL, tld);
+ }
+ else {
+ // otherwise, push it on the visited list so it is not looked at again too soon
+ mi_segment_delayed_decommit(segment, true /* force? */, tld->stats); // forced decommit if needed as we may not visit soon again
+ mi_abandoned_visited_push(segment);
+ }
+ }
+ return NULL;
+}
+
+
+void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
+{
+ mi_segment_t* segment;
+ int max_tries = (force ? 16*1024 : 1024); // limit latency
+ if (force) {
+ mi_abandoned_visited_revisit();
+ }
+ while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
+ mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees)
+ if (segment->used == 0) {
+ // free the segment (by forced reclaim) to make it available to other threads.
+ // note: we could in principle optimize this by skipping reclaim and directly
+ // freeing, but that would temporarily violate some invariants.
+ mi_segment_reclaim(segment, heap, 0, NULL, tld);
+ }
+ else {
+ // otherwise, decommit if needed and push on the visited list
+ // note: forced decommit can be expensive if many threads are destroyed/created as in mstress.
+ mi_segment_delayed_decommit(segment, force, tld->stats);
+ mi_abandoned_visited_push(segment);
+ }
+ }
+}
+
+/* -----------------------------------------------------------
+ Reclaim or allocate
+----------------------------------------------------------- */
+
+static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+ mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
+ mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX);
+
+ // 1. try to reclaim an abandoned segment
+ bool reclaimed;
+ mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld);
+ if (reclaimed) {
+ // reclaimed the right page right into the heap
+ mi_assert_internal(segment != NULL);
+ return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks
+ }
+ else if (segment != NULL) {
+ // reclaimed a segment with a large enough empty span in it
+ return segment;
+ }
+ // 2. otherwise allocate a fresh segment
+ return mi_segment_alloc(0, tld, os_tld, NULL);
+}
+
+
+/* -----------------------------------------------------------
+ Page allocation
+----------------------------------------------------------- */
+
+static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+ mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE);
+
+ // find a free page
+ size_t page_size = _mi_align_up(required, (required > MI_MEDIUM_PAGE_SIZE ? MI_MEDIUM_PAGE_SIZE : MI_SEGMENT_SLICE_SIZE));
+ size_t slices_needed = page_size / MI_SEGMENT_SLICE_SIZE;
+ mi_assert_internal(slices_needed * MI_SEGMENT_SLICE_SIZE == page_size);
+ mi_page_t* page = mi_segments_page_find_and_allocate(slices_needed, tld); //(required <= MI_SMALL_SIZE_MAX ? 0 : slices_needed), tld);
+ if (page==NULL) {
+ // no free page, allocate a new segment and try again
+ if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld, os_tld) == NULL) {
+ // OOM or reclaimed a good page in the heap
+ return NULL;
+ }
+ else {
+ // otherwise try again
+ return mi_segments_page_alloc(heap, page_kind, required, block_size, tld, os_tld);
+ }
+ }
+ mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size);
+ mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id());
+ mi_segment_delayed_decommit(_mi_ptr_segment(page), false, tld->stats);
+ return page;
+}
+
+
+
+/* -----------------------------------------------------------
+ Huge page allocation
+----------------------------------------------------------- */
+
+static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+ mi_page_t* page = NULL;
+ mi_segment_t* segment = mi_segment_alloc(size,tld,os_tld,&page);
+ if (segment == NULL || page==NULL) return NULL;
+ mi_assert_internal(segment->used==1);
+ mi_assert_internal(mi_page_block_size(page) >= size);
+ segment->thread_id = 0; // huge segments are immediately abandoned
+ return page;
+}
+
+// free huge block from another thread
+void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
+ // huge page segments are always abandoned and can be freed immediately by any thread
+ mi_assert_internal(segment->kind==MI_SEGMENT_HUGE);
+ mi_assert_internal(segment == _mi_page_segment(page));
+ mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0);
+
+ // claim it and free
+ mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized.
+ // paranoia: if this is the last reference, the CAS should always succeed
+ size_t expected_tid = 0;
+ if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) {
+ mi_block_set_next(page, block, page->free);
+ page->free = block;
+ page->used--;
+ page->is_zero = false;
+ mi_assert(page->used == 0);
+ mi_tld_t* tld = heap->tld;
+ _mi_segment_page_free(page, true, &tld->segments);
+ }
+#if (MI_DEBUG!=0)
+ else {
+ mi_assert_internal(false);
+ }
+#endif
+}
+
+/* -----------------------------------------------------------
+ Page allocation and free
+----------------------------------------------------------- */
+mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
+ mi_page_t* page;
+ if (block_size <= MI_SMALL_OBJ_SIZE_MAX) {
+ page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld,os_tld);
+ }
+ else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) {
+ page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld, os_tld);
+ }
+ else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) {
+ page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld);
+ }
+ else {
+ page = mi_segment_huge_page_alloc(block_size,tld,os_tld);
+ }
+ mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
+ return page;
+}
+
+
diff --git a/source/luametatex/source/libraries/mimalloc/src/static.c b/source/luametatex/source/libraries/mimalloc/src/static.c
new file mode 100644
index 000000000..5b34ddbb6
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/static.c
@@ -0,0 +1,39 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE
+#endif
+#if defined(__sun)
+// same remarks as os.c for the static's context.
+#undef _XOPEN_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+// For a static override we create a single object file
+// containing the whole library. If it is linked first
+// it will override all the standard library allocation
+// functions (on Unix-like systems).
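+//
+// Usage sketch (build commands are illustrative, not from this repository):
+//   cc -O2 -c static.c -o mimalloc.o     # compile the whole library as one object
+//   cc -O2 main.c mimalloc.o -o myprog   # malloc/free in myprog now resolve to mimalloc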
+#include "stats.c"
+#include "random.c"
+#include "os.c"
+#include "bitmap.c"
+#include "arena.c"
+#include "segment-cache.c"
+#include "segment.c"
+#include "page.c"
+#include "heap.c"
+#include "alloc.c"
+#include "alloc-aligned.c"
+#include "alloc-posix.c"
+#if MI_OSX_ZONE
+#include "alloc-override-osx.c"
+#endif
+#include "init.c"
+#include "options.c"
diff --git a/source/luametatex/source/libraries/mimalloc/src/stats.c b/source/luametatex/source/libraries/mimalloc/src/stats.c
new file mode 100644
index 000000000..134a7bcb6
--- /dev/null
+++ b/source/luametatex/source/libraries/mimalloc/src/stats.c
@@ -0,0 +1,584 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <stdio.h> // fputs, stderr
+#include <string.h> // memset
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920)
+#pragma warning(disable:4204) // non-constant aggregate initializer
+#endif
+
+/* -----------------------------------------------------------
+ Statistics operations
+----------------------------------------------------------- */
+
+static bool mi_is_in_main(void* stat) {
+ return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
+ && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));
+}
+
+static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
+ if (amount == 0) return;
+ if (mi_is_in_main(stat))
+ {
+ // add atomically (for abandoned pages)
+ int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
+ mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
+ if (amount > 0) {
+ mi_atomic_addi64_relaxed(&stat->allocated,amount);
+ }
+ else {
+ mi_atomic_addi64_relaxed(&stat->freed, -amount);
+ }
+ }
+ else {
+ // add thread local
+ stat->current += amount;
+ if (stat->current > stat->peak) stat->peak = stat->current;
+ if (amount > 0) {
+ stat->allocated += amount;
+ }
+ else {
+ stat->freed += -amount;
+ }
+ }
+}
+
+void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
+ if (mi_is_in_main(stat)) {
+ mi_atomic_addi64_relaxed( &stat->count, 1 );
+ mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
+ }
+ else {
+ stat->count++;
+ stat->total += amount;
+ }
+}
+
+void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
+ mi_stat_update(stat, (int64_t)amount);
+}
+
+void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
+ mi_stat_update(stat, -((int64_t)amount));
+}
+
+// must be thread safe as it is called from stats_merge
+static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
+ if (stat==src) return;
+ if (src->allocated==0 && src->freed==0) return;
+ mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit);
+ mi_atomic_addi64_relaxed( &stat->current, src->current * unit);
+ mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit);
+ // note: peak values do not combine meaningfully across threads
+ mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit);
+}
+
+static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
+ if (stat==src) return;
+ mi_atomic_addi64_relaxed( &stat->total, src->total * unit);
+ mi_atomic_addi64_relaxed( &stat->count, src->count * unit);
+}
+
+// must be thread safe as it is called from stats_merge
+static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
+ if (stats==src) return;
+ mi_stat_add(&stats->segments, &src->segments,1);
+ mi_stat_add(&stats->pages, &src->pages,1);
+ mi_stat_add(&stats->reserved, &src->reserved, 1);
+ mi_stat_add(&stats->committed, &src->committed, 1);
+ mi_stat_add(&stats->reset, &src->reset, 1);
+ mi_stat_add(&stats->page_committed, &src->page_committed, 1);
+
+ mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
+ mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
+ mi_stat_add(&stats->threads, &src->threads, 1);
+
+ mi_stat_add(&stats->malloc, &src->malloc, 1);
+ mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
+ mi_stat_add(&stats->normal, &src->normal, 1);
+ mi_stat_add(&stats->huge, &src->huge, 1);
+ mi_stat_add(&stats->large, &src->large, 1);
+
+ mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1);
+ mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1);
+ mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1);
+
+ mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
+ mi_stat_counter_add(&stats->searches, &src->searches, 1);
+ mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
+ mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
+ mi_stat_counter_add(&stats->large_count, &src->large_count, 1);
+#if MI_STAT>1
+ for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
+ if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) {
+ mi_stat_add(&stats->normal_bins[i], &src->normal_bins[i], 1);
+ }
+ }
+#endif
+}
+
+/* -----------------------------------------------------------
+ Display statistics
+----------------------------------------------------------- */
+
+// unit > 0 : size in binary bytes
+// unit == 0: count as decimal
+// unit < 0 : count in binary
+static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
+ char buf[32]; buf[0] = 0;
+ int len = 32;
+ const char* suffix = (unit <= 0 ? " " : "B");
+ const int64_t base = (unit == 0 ? 1000 : 1024);
+ if (unit>0) n *= unit;
+
+ const int64_t pos = (n < 0 ? -n : n);
+ if (pos < base) {
+ if (n!=1 || suffix[0] != 'B') { // skip printing 1 B for the unit column
+ snprintf(buf, len, "%d %-3s", (int)n, (n==0 ? "" : suffix));
+ }
+ }
+ else {
+ int64_t divider = base;
+ const char* magnitude = "K";
+ if (pos >= divider*base) { divider *= base; magnitude = "M"; }
+ if (pos >= divider*base) { divider *= base; magnitude = "G"; }
+ const int64_t tens = (n / (divider/10));
+ const long whole = (long)(tens/10);
+ const long frac1 = (long)(tens%10);
+ char unitdesc[8];
+ snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix);
+ snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc);
+ }
+ _mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf);
+}
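+
+// Worked examples (illustrative, following the arithmetic above):
+//   n=1523, unit=1  -> "1.4 KiB"  (binary size: base 1024, "B" suffix)
+//   n=1523, unit=0  -> "1.5 K"    (decimal count: base 1000, blank suffix)
+//   n=800,  unit=0  -> "800"      (below the base: printed directly)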
+
+
+static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
+ mi_printf_amount(n,unit,out,arg,NULL);
+}
+
+static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
+ if (unit==1) _mi_fprintf(out, arg, "%11s"," ");
+ else mi_print_amount(n,0,out,arg);
+}
+
+static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg ) {
+ _mi_fprintf(out, arg,"%10s:", msg);
+ if (unit>0) {
+ mi_print_amount(stat->peak, unit, out, arg);
+ mi_print_amount(stat->allocated, unit, out, arg);
+ mi_print_amount(stat->freed, unit, out, arg);
+ mi_print_amount(stat->current, unit, out, arg);
+ mi_print_amount(unit, 1, out, arg);
+ mi_print_count(stat->allocated, unit, out, arg);
+ if (stat->allocated > stat->freed)
+ _mi_fprintf(out, arg, " not all freed!\n");
+ else
+ _mi_fprintf(out, arg, " ok\n");
+ }
+ else if (unit<0) {
+ mi_print_amount(stat->peak, -1, out, arg);
+ mi_print_amount(stat->allocated, -1, out, arg);
+ mi_print_amount(stat->freed, -1, out, arg);
+ mi_print_amount(stat->current, -1, out, arg);
+ if (unit==-1) {
+ _mi_fprintf(out, arg, "%22s", "");
+ }
+ else {
+ mi_print_amount(-unit, 1, out, arg);
+ mi_print_count((stat->allocated / -unit), 0, out, arg);
+ }
+ if (stat->allocated > stat->freed)
+ _mi_fprintf(out, arg, " not all freed!\n");
+ else
+ _mi_fprintf(out, arg, " ok\n");
+ }
+ else {
+ mi_print_amount(stat->peak, 1, out, arg);
+ mi_print_amount(stat->allocated, 1, out, arg);
+ _mi_fprintf(out, arg, "%11s", " "); // no freed
+ mi_print_amount(stat->current, 1, out, arg);
+ _mi_fprintf(out, arg, "\n");
+ }
+}
+
+static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) {
+ _mi_fprintf(out, arg, "%10s:", msg);
+ mi_print_amount(stat->total, -1, out, arg);
+ _mi_fprintf(out, arg, "\n");
+}
+
+static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) {
+ const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count));
+ const long avg_whole = (long)(avg_tens/10);
+ const long avg_frac1 = (long)(avg_tens%10);
+ _mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
+}
+
+
+static void mi_print_header(mi_output_fun* out, void* arg ) {
+ _mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "current ", "unit ", "count ");
+}
+
+#if MI_STAT>1
+static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) {
+ bool found = false;
+ char buf[64];
+ for (size_t i = 0; i <= max; i++) {
+ if (bins[i].allocated > 0) {
+ found = true;
+ int64_t unit = _mi_bin_size((uint8_t)i);
+ snprintf(buf, 64, "%s %3lu", fmt, (long)i);
+ mi_stat_print(&bins[i], buf, unit, out, arg);
+ }
+ }
+ if (found) {
+ _mi_fprintf(out, arg, "\n");
+ mi_print_header(out, arg);
+ }
+}
+#endif
+
+
+
+//------------------------------------------------------------
+// Use an output wrapper for line-buffered output
+// (which is nice when using loggers etc.)
+//------------------------------------------------------------
+typedef struct buffered_s {
+ mi_output_fun* out; // original output function
+ void* arg; // and state
+ char* buf; // local buffer of at least size `count+1`
+ size_t used; // currently used chars `used <= count`
+ size_t count; // total chars available for output
+} buffered_t;
+
+static void mi_buffered_flush(buffered_t* buf) {
+ buf->buf[buf->used] = 0;
+ _mi_fputs(buf->out, buf->arg, NULL, buf->buf);
+ buf->used = 0;
+}
+
+static void mi_buffered_out(const char* msg, void* arg) {
+ buffered_t* buf = (buffered_t*)arg;
+ if (msg==NULL || buf==NULL) return;
+ for (const char* src = msg; *src != 0; src++) {
+ char c = *src;
+ if (buf->used >= buf->count) mi_buffered_flush(buf);
+ mi_assert_internal(buf->used < buf->count);
+ buf->buf[buf->used++] = c;
+ if (c == '\n') mi_buffered_flush(buf);
+ }
+}
+
+//------------------------------------------------------------
+// Print statistics
+//------------------------------------------------------------
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);
+
+static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
+ // wrap the output function to be line buffered
+ char buf[256];
+ buffered_t buffer = { out0, arg0, NULL, 0, 255 };
+ buffer.buf = buf;
+ mi_output_fun* out = &mi_buffered_out;
+ void* arg = &buffer;
+
+ // and print using that
+ mi_print_header(out,arg);
+ #if MI_STAT>1
+ mi_stats_print_bins(stats->normal_bins, MI_BIN_HUGE, "normal",out,arg);
+ #endif
+ #if MI_STAT
+ mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg);
+ mi_stat_print(&stats->large, "large", (stats->large_count.count == 0 ? 1 : -(stats->large.allocated / stats->large_count.count)), out, arg);
+ mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
+ mi_stat_count_t total = { 0,0,0,0 };
+ mi_stat_add(&total, &stats->normal, 1);
+ mi_stat_add(&total, &stats->large, 1);
+ mi_stat_add(&total, &stats->huge, 1);
+ mi_stat_print(&total, "total", 1, out, arg);
+ #endif
+ #if MI_STAT>1
+ mi_stat_print(&stats->malloc, "malloc req", 1, out, arg);
+ _mi_fprintf(out, arg, "\n");
+ #endif
+ mi_stat_print(&stats->reserved, "reserved", 1, out, arg);
+ mi_stat_print(&stats->committed, "committed", 1, out, arg);
+ mi_stat_print(&stats->reset, "reset", 1, out, arg);
+ mi_stat_print(&stats->page_committed, "touched", 1, out, arg);
+ mi_stat_print(&stats->segments, "segments", -1, out, arg);
+ mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
+ mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
+ mi_stat_print(&stats->pages, "pages", -1, out, arg);
+ mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg);
+ mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg);
+ mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg);
+ mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
+ mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
+ mi_stat_print(&stats->threads, "threads", -1, out, arg);
+ mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
+ _mi_fprintf(out, arg, "%10s: %7zu\n", "numa nodes", _mi_os_numa_node_count());
+
+ mi_msecs_t elapsed;
+ mi_msecs_t user_time;
+ mi_msecs_t sys_time;
+ size_t current_rss;
+ size_t peak_rss;
+ size_t current_commit;
+ size_t peak_commit;
+ size_t page_faults;
+ mi_stat_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
+ _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
+ _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process",
+ user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
+ mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
+ if (peak_commit > 0) {
+ _mi_fprintf(out, arg, ", commit: ");
+ mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s");
+ }
+ _mi_fprintf(out, arg, "\n");
+}
+
+static mi_msecs_t mi_process_start; // = 0
+
+static mi_stats_t* mi_stats_get_default(void) {
+ mi_heap_t* heap = mi_heap_get_default();
+ return &heap->tld->stats;
+}
+
+static void mi_stats_merge_from(mi_stats_t* stats) {
+ if (stats != &_mi_stats_main) {
+ mi_stats_add(&_mi_stats_main, stats);
+ memset(stats, 0, sizeof(mi_stats_t));
+ }
+}
+
+void mi_stats_reset(void) mi_attr_noexcept {
+ mi_stats_t* stats = mi_stats_get_default();
+ if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
+ memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
+ if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }
+}
+
+void mi_stats_merge(void) mi_attr_noexcept {
+ mi_stats_merge_from( mi_stats_get_default() );
+}
+
+void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done`
+ mi_stats_merge_from(stats);
+}
+
+void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
+ mi_stats_merge_from(mi_stats_get_default());
+ _mi_stats_print(&_mi_stats_main, out, arg);
+}
+
+void mi_stats_print(void* out) mi_attr_noexcept {
+ // for compatibility there is an `out` parameter (which can be `stdout` or `stderr`)
+ mi_stats_print_out((mi_output_fun*)out, NULL);
+}
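+
+// Usage sketch (illustrative): print the merged statistics at program exit.
+//   mi_stats_print(stderr);                    // legacy entry point, FILE* as `out`
+//   mi_stats_print_out(my_output_fn, my_arg);  // or route to a custom logger
+//                                              // (`my_output_fn`/`my_arg` are hypothetical)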
+
+void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
+ _mi_stats_print(mi_stats_get_default(), out, arg);
+}
+
+
+// ----------------------------------------------------------------
+// Basic timer for convenience; use milliseconds to avoid doubles
+// ----------------------------------------------------------------
+#ifdef _WIN32
+#include <windows.h>
+static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
+ static LARGE_INTEGER mfreq; // = 0
+ if (mfreq.QuadPart == 0LL) {
+ LARGE_INTEGER f;
+ QueryPerformanceFrequency(&f);
+ mfreq.QuadPart = f.QuadPart/1000LL;
+ if (mfreq.QuadPart == 0) mfreq.QuadPart = 1;
+ }
+ return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart);
+}
+
+mi_msecs_t _mi_clock_now(void) {
+ LARGE_INTEGER t;
+ QueryPerformanceCounter(&t);
+ return mi_to_msecs(t);
+}
+#else
+#include <time.h>
+#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
+mi_msecs_t _mi_clock_now(void) {
+ struct timespec t;
+ #ifdef CLOCK_MONOTONIC
+ clock_gettime(CLOCK_MONOTONIC, &t);
+ #else
+ clock_gettime(CLOCK_REALTIME, &t);
+ #endif
+ return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
+}
+#else
+// low resolution timer
+mi_msecs_t _mi_clock_now(void) {
+ return ((mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000));
+}
+#endif
+#endif
+
+
+static mi_msecs_t mi_clock_diff;
+
+mi_msecs_t _mi_clock_start(void) {
+ if (mi_clock_diff == 0) { // mi_msecs_t is integral
+ mi_msecs_t t0 = _mi_clock_now();
+ mi_clock_diff = _mi_clock_now() - t0;
+ }
+ return _mi_clock_now();
+}
+
+mi_msecs_t _mi_clock_end(mi_msecs_t start) {
+ mi_msecs_t end = _mi_clock_now();
+ return (end - start - mi_clock_diff);
+}
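+
+// Usage sketch (illustrative): time a region in milliseconds.
+//   mi_msecs_t t0 = _mi_clock_start();
+//   do_work();                          // hypothetical workload
+//   mi_msecs_t ms = _mi_clock_end(t0);  // elapsed time, corrected by mi_clock_diff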
+
+
+// --------------------------------------------------------
+// Basic process statistics
+// --------------------------------------------------------
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <psapi.h>
+#pragma comment(lib,"psapi.lib")
+
+static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
+ ULARGE_INTEGER i;
+ i.LowPart = ftime->dwLowDateTime;
+ i.HighPart = ftime->dwHighDateTime;
+ mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100-nanosecond units
+ return msecs;
+}
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+ *elapsed = _mi_clock_end(mi_process_start);
+ FILETIME ct;
+ FILETIME ut;
+ FILETIME st;
+ FILETIME et;
+ GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
+ *utime = filetime_msecs(&ut);
+ *stime = filetime_msecs(&st);
+ PROCESS_MEMORY_COUNTERS info;
+ GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
+ *current_rss = (size_t)info.WorkingSetSize;
+ *peak_rss = (size_t)info.PeakWorkingSetSize;
+ *current_commit = (size_t)info.PagefileUsage;
+ *peak_commit = (size_t)info.PeakPagefileUsage;
+ *page_faults = (size_t)info.PageFaultCount;
+}
+
+#elif !defined(__wasi__) && (defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__))
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/resource.h>
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <kernel/OS.h>
+#endif
+
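+// note: despite the name, this returns milliseconds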
+static mi_msecs_t timeval_secs(const struct timeval* tv) {
+ return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);
+}
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+ *elapsed = _mi_clock_end(mi_process_start);
+ struct rusage rusage;
+ getrusage(RUSAGE_SELF, &rusage);
+ *utime = timeval_secs(&rusage.ru_utime);
+ *stime = timeval_secs(&rusage.ru_stime);
+#if !defined(__HAIKU__)
+ *page_faults = rusage.ru_majflt;
+#endif
+ // estimate commit using our stats
+ *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+ *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
+ *current_rss = *current_commit; // estimate
+#if defined(__HAIKU__)
+ // Haiku does not have (yet?) a way to
+ // get these stats per process
+ thread_info tid;
+ area_info mem;
+ ssize_t c;
+ get_thread_info(find_thread(0), &tid);
+ *peak_rss = 0; // initialize before accumulating below
+ while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
+ *peak_rss += mem.ram_size;
+ }
+ *page_faults = 0;
+#elif defined(__APPLE__)
+ *peak_rss = rusage.ru_maxrss; // BSD reports in bytes
+ struct mach_task_basic_info info;
+ mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+ if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
+ *current_rss = (size_t)info.resident_size;
+ }
+#else
+ *peak_rss = rusage.ru_maxrss * 1024; // Linux reports in KiB
+#endif
+}
+
+#else
+#ifndef __wasi__
+// WebAssembly instances are not processes
+#pragma message("define a way to get process info")
+#endif
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+ *elapsed = _mi_clock_end(mi_process_start);
+ *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+ *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
+ *peak_rss = *peak_commit;
+ *current_rss = *current_commit;
+ *page_faults = 0;
+ *utime = 0;
+ *stime = 0;
+}
+#endif
+
+
+mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
+{
+ mi_msecs_t elapsed = 0;
+ mi_msecs_t utime = 0;
+ mi_msecs_t stime = 0;
+ size_t current_rss0 = 0;
+ size_t peak_rss0 = 0;
+ size_t current_commit0 = 0;
+ size_t peak_commit0 = 0;
+ size_t page_faults0 = 0;
+ mi_stat_process_info(&elapsed,&utime, &stime, &current_rss0, &peak_rss0, &current_commit0, &peak_commit0, &page_faults0);
+ if (elapsed_msecs!=NULL) *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX));
+ if (user_msecs!=NULL) *user_msecs = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX));
+ if (system_msecs!=NULL) *system_msecs = (stime < 0 ? 0 : (stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)stime : PTRDIFF_MAX));
+ if (current_rss!=NULL) *current_rss = current_rss0;
+ if (peak_rss!=NULL) *peak_rss = peak_rss0;
+ if (current_commit!=NULL) *current_commit = current_commit0;
+ if (peak_commit!=NULL) *peak_commit = peak_commit0;
+ if (page_faults!=NULL) *page_faults = page_faults0;
+}
+
diff --git a/source/luametatex/source/libraries/miniz/ChangeLog.md b/source/luametatex/source/libraries/miniz/ChangeLog.md
new file mode 100644
index 000000000..4ae15a8cd
--- /dev/null
+++ b/source/luametatex/source/libraries/miniz/ChangeLog.md
@@ -0,0 +1,196 @@
+## Changelog
+
+### 2.2.0
+
+ - Fix examples with amalgamation
+ - Modified cmake script to support shared library mode and find_package
+ - Fix for misleading doc comment on `mz_zip_reader_init_cfile` function
+ - Add include location tolerance and stop forcing `_GNU_SOURCE`
+ - Fix: mz_zip_reader_locate_file_v2 returns an mz_bool
+ - Fix large file system checks
+ - Add #elif to enable an external mz_crc32() to be linked in
+ - Write with dynamic size (size of file/data to be added not known before adding)
+ - Added uncompress2 for zlib compatibility
+ - Add support for building as a Meson subproject
+ - Added OSSFuzz support; Integrate with CIFuzz
+ - Add pkg-config file
+ - Fixed use-of-uninitialized value msan error when copying dist bytes with no output bytes written.
+ - mz_zip_validate_file(): fix memory leak on errors
+ - Fixed MSAN use-of-uninitialized in tinfl_decompress when an invalid dist is decoded. In this instance dist was 31, which s_dist_base translates to 0
+ - Add flag to set (compressed) size in local file header
+ - avoid use of uninitialized value in tdefl_record_literal
+
+### 2.1.0
+
+ - More instances of memcpy instead of cast and use memcpy per default
+ - Remove inline for c90 support
+ - New function to read files via callback functions when adding them
+ - Fix out of bounds read while reading Zip64 extended information
+ - guard memcpy when n == 0 because buffer may be NULL
+ - Implement inflateReset() function
+ - Move comp/decomp alloc/free prototypes under guarding #ifndef MZ_NO_MALLOC
+ - Fix large file support under Windows
+ - Don't warn if _LARGEFILE64_SOURCE is not defined to 1
+ - Fixes for MSVC warnings
+ - Remove check that path of file added to archive contains ':' or '\'
+ - Add !defined check on MINIZ_USE_ALIGNED_LOADS_AND_STORES
+
+### 2.0.8
+
+ - Remove unimplemented functions (mz_zip_locate_file and mz_zip_locate_file_v2)
+ - Add license, changelog, readme and example files to release zip
+ - Fix heap overflow to user buffer in tinfl_status tinfl_decompress
+ - Fix corrupt archive if uncompressed file smaller than 4 byte and the file is added by mz_zip_writer_add_mem*
+
+### 2.0.7
+
+ - Removed need in C++ compiler in cmake build
+ - Fixed a lot of uninitialized value errors found with Valgrind by memsetting m_dict to 0 in tdefl_init
+ - Fix resource leak in mz_zip_reader_init_file_v2
+ - Fix assert with mz_zip_writer_add_mem* w/MZ_DEFAULT_COMPRESSION
+ - cmake build: install library and headers
+ - Remove _LARGEFILE64_SOURCE requirement from apple defines for large files
+
+### 2.0.6
+
+ - Improve MZ_ZIP_FLAG_WRITE_ZIP64 documentation
+ - Remove check for cur_archive_file_ofs > UINT_MAX because cur_archive_file_ofs is not used after this point
+ - Add cmake debug configuration
+ - Fix PNG height when creating png files
+ - Add "iterative" file extraction method based on mz_zip_reader_extract_to_callback.
+ - Option to use memcpy for unaligned data access
+ - Define processor/arch macros as zero if not set to one
+
+### 2.0.4/2.0.5
+
+ - Fix compilation with the various omission compile definitions
+
+### 2.0.3
+
+- Fix GCC/clang compile warnings
+- Added callback for periodic flushes (for ZIP file streaming)
+- Use UTF-8 for file names in ZIP files per default
+
+### 2.0.2
+
+- Fix source backwards compatibility with 1.x
+- Fix a ZIP bit not being set correctly
+
+### 2.0.1
+
+- Added some tests
+- Added CI
+- Make source code ANSI C compatible
+
+### 2.0.0 beta
+
+- Matthew Sitton merged miniz 1.x to Rich Geldreich's vogl ZIP64 changes. Miniz is now licensed as MIT since the vogl code base is MIT licensed
+- Miniz is now split into several files
+- Miniz now does not seek backwards when creating ZIP files, so ZIP files can be streamed
+- Miniz automatically switches to the ZIP64 format when a created ZIP file goes over the ZIP file limits
+- Similar to [SQLite](https://www.sqlite.org/amalgamation.html) the Miniz source code is amalgamated into one miniz.c/miniz.h pair in a build step (amalgamate.sh). Please use miniz.c/miniz.h in your projects
+- Miniz 2 is only source back-compatible with miniz 1.x. It breaks binary compatibility because structures changed
+
+### v1.16 BETA - Oct 19, 2013
+
+Still testing, this release is downloadable from [here](http://www.tenacioussoftware.com/miniz_v116_beta_r1.7z). Two key inflator-only robustness and streaming related changes. Also merged in tdefl_compressor_alloc(), tdefl_compressor_free() helpers to make script bindings easier for rustyzip.
+
+- The inflator coroutine func. is subtle and complex so I'm being cautious about this release. I would greatly appreciate any help with testing or any feedback.
+ I feel good about these changes, and they've been through several hours of automated testing, but they will probably not fix anything for the majority of prev. users so I'm
+ going to mark this release as beta for a few weeks and continue testing it at work/home on various things.
+- The inflator in raw (non-zlib) mode is now usable on gzip or similar data streams that have a bunch of bytes following the raw deflate data (problem discovered by rustyzip author williamw520).
+ This version should *never* read beyond the last byte of the raw deflate data independent of how many bytes you pass into the input buffer. This issue was caused by the various Huffman bitbuffer lookahead optimizations, and
+ would not be an issue if the caller knew and enforced the precise size of the raw compressed data *or* if the compressed data was in zlib format (i.e. always followed by the byte aligned zlib adler32).
+ So in other words, you can now call the inflator on deflate streams that are followed by arbitrary amounts of data and it's guaranteed that decompression will stop exactly on the last byte.
+- The inflator now has a new failure status: TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS (-4). Previously, if the inflator was starved of bytes and could not make progress (because the input buffer was empty and the
+ caller did not set the TINFL_FLAG_HAS_MORE_INPUT flag - say on truncated or corrupted compressed data stream) it would append all 0's to the input and try to soldier on.
+ This is scary, because in the worst case, I believe it was possible for the prev. inflator to start outputting large amounts of literal data. If the caller didn't know when to stop accepting output
+ (because it didn't know how much uncompressed data was expected, or didn't enforce a sane maximum) it could continue forever. v1.16 cannot fall into this failure mode, instead it'll return
+ TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS immediately if it needs 1 or more bytes to make progress, the input buf is empty, and the caller has indicated that no more input is available. This is a "soft"
+ failure, so you can call the inflator again with more input and it will try to continue, or you can give up and fail. This could be very useful in network streaming scenarios.
+- Added documentation to all the tinfl return status codes, fixed miniz_tester so it accepts double minus params for Linux, tweaked example1.c, added a simple "follower bytes" test to miniz_tester.cpp.
+
+### v1.15 r4 STABLE - Oct 13, 2013
+
+Merged over a few very minor bug fixes that I fixed in the zip64 branch. This is downloadable from [here](http://code.google.com/p/miniz/downloads/list) and also in SVN head (as of 10/19/13).
+
+
+### v1.15 - Oct. 13, 2013
+
+Interim bugfix release while I work on the next major release with zip64 and streaming compression/decompression support. Fixed the MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY bug (thanks kahmyong.moon@hp.com), which could cause the locate files func to not find files when this flag was specified. Also fixed a bug in mz_zip_reader_extract_to_mem_no_alloc() with user provided read buffers (thanks kymoon). I also merged lots of compiler fixes from various github repo branches and Google Code issue reports. I finally added cmake support (only tested under Linux so far), compiled and tested with clang v3.3 and gcc 4.6 (under Linux), added tdefl_write_image_to_png_file_in_memory_ex() (supports Y flipping for OpenGL use, real-time compression), added a new PNG example (example6.c - Mandelbrot), and I added 64-bit file I/O support (stat64(), etc.) for glibc.
+
+- Critical fix for the MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY bug (thanks kahmyong.moon@hp.com) which could cause locate files to not find files. This bug
+ would only have occurred in earlier versions if you explicitly used this flag, OR if you used mz_zip_extract_archive_file_to_heap() or mz_zip_add_mem_to_archive_file_in_place()
+ (which used this flag). If you can't switch to v1.15 but want to fix this bug, just remove the uses of this flag from both helper funcs (and of course don't use the flag).
+- Bugfix in mz_zip_reader_extract_to_mem_no_alloc() from kymoon when pUser_read_buf is not NULL and compressed size is > uncompressed size
+- Fixing mz_zip_reader_extract_*() funcs so they don't try to extract compressed data from directory entries, to account for weird zipfiles which contain zero-size compressed data on dir entries.
+ Hopefully this fix won't cause any issues on weird zip archives, because it assumes the low 16-bits of zip external attributes are DOS attributes (which I believe they always are in practice).
+- Fixing mz_zip_reader_is_file_a_directory() so it doesn't check the internal attributes, just the filename and external attributes
+- mz_zip_reader_init_file() - missing MZ_FCLOSE() call if the seek failed
+- Added cmake support for Linux builds which builds all the examples, tested with clang v3.3 and gcc v4.6.
+- Clang fix for tdefl_write_image_to_png_file_in_memory() from toffaletti
+- Merged MZ_FORCEINLINE fix from hdeanclark
+- Fix <time.h> include before config #ifdef, thanks emil.brink
+- Added tdefl_write_image_to_png_file_in_memory_ex(): supports Y flipping (super useful for OpenGL apps), and explicit control over the compression level (so you can
+ set it to 1 for real-time compression).
+- Merged in some compiler fixes from paulharris's github repo.
+- Retested this build under Windows (VS 2010, including static analysis), tcc 0.9.26, gcc v4.6 and clang v3.3.
+- Added example6.c, which dumps an image of the mandelbrot set to a PNG file.
+- Modified example2 to help test the MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY flag more.
+- In r3: Bugfix to mz_zip_writer_add_file() found during merge: Fix possible src file fclose() leak if alignment bytes+local header file write failed
+- In r4: Minor bugfix to mz_zip_writer_add_from_zip_reader(): Was pushing the wrong central dir header offset, appears harmless in this release, but it became a problem in the zip64 branch
+
+### v1.14 - May 20, 2012
+
+(SVN Only) Minor tweaks to get miniz.c compiling with the Tiny C Compiler, added #ifndef MINIZ_NO_TIME guards around utime.h includes. Adding mz_free() function, so the caller can free heap blocks returned by miniz using whatever heap functions it has been configured to use, MSVC specific fixes to use "safe" variants of several functions (localtime_s, fopen_s, freopen_s).
+
+MinGW32/64 GCC 4.6.1 compiler fixes: added MZ_FORCEINLINE, #include <time.h> (thanks fermtect).
+
+Compiler specific fixes, some from fermtect. I upgraded to TDM GCC 4.6.1 and now static __forceinline is giving it fits, so I'm changing all usage of __forceinline to MZ_FORCEINLINE and forcing gcc to use __attribute__((__always_inline__)) (and MSVC to use __forceinline). Also various fixes from fermtect for MinGW32: added #include <time.h>, 64-bit ftell/fseek fixes.
+
+### v1.13 - May 19, 2012
+
+From jason@cornsyrup.org and kelwert@mtu.edu - Most importantly, fixed mz_crc32() so it doesn't compute the wrong CRC-32's when mz_ulong is 64-bits. Temporarily/locally slammed in "typedef unsigned long mz_ulong" and re-ran a randomized regression test on ~500k files. Other stuff:
+
+- Fix fail logic handling in mz_zip_add_mem_to_archive_file_in_place() so it always calls mz_zip_writer_finalize_archive() and mz_zip_writer_end(), even if the file add fails.
+- Eliminated a bunch of warnings when compiling with GCC 32-bit/64.
+- Ran all examples, miniz.c, and tinfl.c through MSVC 2008's /analyze (static analysis) option and fixed all warnings (except for the silly
+"Use of the comma-operator in a tested expression.." analysis warning, which I purposely use to work around a MSVC compiler warning).
+- Created 32-bit and 64-bit Codeblocks projects/workspace. Built and tested Linux executables. The codeblocks workspace is compatible with Linux+Win32/x64.
+- Added miniz_tester solution/project, which is a useful little app derived from LZHAM's tester app that I use as part of the regression test.
+- Ran miniz.c and tinfl.c through another series of regression testing on ~500,000 files and archives.
+- Modified example5.c so it purposely disables a bunch of high-level functionality (MINIZ_NO_STDIO, etc.). (Thanks to corysama for the MINIZ_NO_STDIO bug report.)
+- Fix ftell() usage in examples so they exit with an error on files which are too large (a limitation of the examples, not miniz itself).
+
+### v1.12 - 4/12/12
+
+More comments, added low-level example5.c, fixed a couple minor level_and_flags issues in the archive API's.
+level_and_flags can now be set to MZ_DEFAULT_COMPRESSION. Thanks to Bruce Dawson <bruced@valvesoftware.com> for the feedback/bug report.
+
+### v1.11 - 5/28/11
+
+Added statement from unlicense.org
+
+### v1.10 - 5/27/11
+
+- Substantial compressor optimizations:
+- Level 1 is now ~4x faster than before. The L1 compressor's throughput now varies between 70-110MB/sec. on a Core i7 (actual throughput varies depending on the type of data, and x64 vs. x86).
+- Improved baseline L2-L9 compression perf. Also, greatly improved compression perf. issues on some file types.
+- Refactored the compression code for better readability and maintainability.
+- Added level 10 compression level (L10 has slightly better ratio than level 9, but could have a potentially large drop in throughput on some files).
+
+### v1.09 - 5/15/11
+
+Initial stable release.
+
+
diff --git a/source/luametatex/source/libraries/miniz/LICENSE b/source/luametatex/source/libraries/miniz/LICENSE
new file mode 100644
index 000000000..b6ff45a30
--- /dev/null
+++ b/source/luametatex/source/libraries/miniz/LICENSE
@@ -0,0 +1,22 @@
+Copyright 2013-2014 RAD Game Tools and Valve Software
+Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/source/luametatex/source/libraries/miniz/miniz.c b/source/luametatex/source/libraries/miniz/miniz.c
new file mode 100644
index 000000000..87bdedb18
--- /dev/null
+++ b/source/luametatex/source/libraries/miniz/miniz.c
@@ -0,0 +1,7733 @@
+#include "miniz.h"
+/**************************************************************************
+ *
+ * Copyright 2013-2014 RAD Game Tools and Valve Software
+ * Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+
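+/* Editorial note: compile-time size checks. If one of these types does not
+ * have the expected width, the array dimension evaluates to -1 and the
+ * translation unit fails to compile. */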
+typedef unsigned char mz_validate_uint16[sizeof(mz_uint16) == 2 ? 1 : -1];
+typedef unsigned char mz_validate_uint32[sizeof(mz_uint32) == 4 ? 1 : -1];
+typedef unsigned char mz_validate_uint64[sizeof(mz_uint64) == 8 ? 1 : -1];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------- zlib-style APIs */
+
+mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len)
+{
+ mz_uint32 i, s1 = (mz_uint32)(adler & 0xffff), s2 = (mz_uint32)(adler >> 16);
+ size_t block_len = buf_len % 5552;
+ if (!ptr)
+ return MZ_ADLER32_INIT;
+ while (buf_len)
+ {
+ for (i = 0; i + 7 < block_len; i += 8, ptr += 8)
+ {
+ s1 += ptr[0], s2 += s1;
+ s1 += ptr[1], s2 += s1;
+ s1 += ptr[2], s2 += s1;
+ s1 += ptr[3], s2 += s1;
+ s1 += ptr[4], s2 += s1;
+ s1 += ptr[5], s2 += s1;
+ s1 += ptr[6], s2 += s1;
+ s1 += ptr[7], s2 += s1;
+ }
+ for (; i < block_len; ++i)
+ s1 += *ptr++, s2 += s1;
+ s1 %= 65521U, s2 %= 65521U;
+ buf_len -= block_len;
+ block_len = 5552;
+ }
+ return (s2 << 16) + s1;
+}
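+
+/* Illustrative usage sketch (editorial note, not part of miniz): the checksum
+ * can be computed incrementally by feeding each return value back in, and a
+ * NULL pointer returns the seed value.
+ *
+ *     mz_ulong a = mz_adler32(0, NULL, 0);   // == MZ_ADLER32_INIT (1)
+ *     a = mz_adler32(a, chunk1, chunk1_len); // chunks may be any size
+ *     a = mz_adler32(a, chunk2, chunk2_len);
+ */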
+
+/* Karl Malbrain's compact CRC-32. See "A compact CCITT crc16 and crc32 C implementation that balances processor cache usage against speed": http://www.geocities.com/malbrain/ */
+#if 0
+ mz_ulong mz_crc32(mz_ulong crc, const mz_uint8 *ptr, size_t buf_len)
+ {
+ static const mz_uint32 s_crc32[16] = { 0, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
+ 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c };
+ mz_uint32 crcu32 = (mz_uint32)crc;
+ if (!ptr)
+ return MZ_CRC32_INIT;
+ crcu32 = ~crcu32;
+ while (buf_len--)
+ {
+ mz_uint8 b = *ptr++;
+ crcu32 = (crcu32 >> 4) ^ s_crc32[(crcu32 & 0xF) ^ (b & 0xF)];
+ crcu32 = (crcu32 >> 4) ^ s_crc32[(crcu32 & 0xF) ^ (b >> 4)];
+ }
+ return ~crcu32;
+ }
+#elif defined(USE_EXTERNAL_MZCRC)
+/* If USE_EXTERNAL_MZCRC is defined, an external module will export the
+ * mz_crc32() symbol for us to use, e.g. an SSE-accelerated version.
+ * Depending on the implementation, it may be necessary to ~ the input/output crc values.
+ */
+mz_ulong mz_crc32(mz_ulong crc, const mz_uint8 *ptr, size_t buf_len);
+#else
+/* Faster, but with a larger CPU cache footprint. */
+mz_ulong mz_crc32(mz_ulong crc, const mz_uint8 *ptr, size_t buf_len)
+{
+ static const mz_uint32 s_crc_table[256] =
+ {
+ 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535,
+ 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD,
+ 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D,
+ 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,
+ 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4,
+ 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+ 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, 0x26D930AC,
+ 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+ 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB,
+ 0xB6662D3D, 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F,
+ 0x9FBFE4A5, 0xE8B8D433, 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB,
+ 0x086D3D2D, 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+ 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA,
+ 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, 0x4DB26158, 0x3AB551CE,
+ 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A,
+ 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+ 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409,
+ 0xCE61E49F, 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+ 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739,
+ 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
+ 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, 0xF00F9344, 0x8708A3D2, 0x1E01F268,
+ 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0,
+ 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8,
+ 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+ 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF,
+ 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703,
+ 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7,
+ 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A,
+ 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE,
+ 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+ 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, 0x88085AE6,
+ 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+ 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D,
+ 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5,
+ 0x47B2CF7F, 0x30B5FFE9, 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605,
+ 0xCDD70693, 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+ 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+ };
+
+ mz_uint32 crc32 = (mz_uint32)crc ^ 0xFFFFFFFF;
+ const mz_uint8 *pByte_buf = (const mz_uint8 *)ptr;
+
+ while (buf_len >= 4)
+ {
+ crc32 = (crc32 >> 8) ^ s_crc_table[(crc32 ^ pByte_buf[0]) & 0xFF];
+ crc32 = (crc32 >> 8) ^ s_crc_table[(crc32 ^ pByte_buf[1]) & 0xFF];
+ crc32 = (crc32 >> 8) ^ s_crc_table[(crc32 ^ pByte_buf[2]) & 0xFF];
+ crc32 = (crc32 >> 8) ^ s_crc_table[(crc32 ^ pByte_buf[3]) & 0xFF];
+ pByte_buf += 4;
+ buf_len -= 4;
+ }
+
+ while (buf_len)
+ {
+ crc32 = (crc32 >> 8) ^ s_crc_table[(crc32 ^ pByte_buf[0]) & 0xFF];
+ ++pByte_buf;
+ --buf_len;
+ }
+
+ return ~crc32;
+}
+#endif
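+
+/* Illustrative usage sketch (editorial note, not part of miniz): whichever
+ * implementation is compiled in, mz_crc32() follows the same incremental
+ * pattern as mz_adler32(), with a NULL pointer returning MZ_CRC32_INIT (0).
+ *
+ *     mz_ulong c = mz_crc32(0, NULL, 0); // == MZ_CRC32_INIT
+ *     c = mz_crc32(c, buf, buf_len);
+ */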
+
+void mz_free(void *p)
+{
+ MZ_FREE(p);
+}
+
+MINIZ_EXPORT void *miniz_def_alloc_func(void *opaque, size_t items, size_t size)
+{
+ (void)opaque, (void)items, (void)size;
+ return MZ_MALLOC(items * size);
+}
+MINIZ_EXPORT void miniz_def_free_func(void *opaque, void *address)
+{
+ (void)opaque, (void)address;
+ MZ_FREE(address);
+}
+MINIZ_EXPORT void *miniz_def_realloc_func(void *opaque, void *address, size_t items, size_t size)
+{
+ (void)opaque, (void)address, (void)items, (void)size;
+ return MZ_REALLOC(address, items * size);
+}
+
+const char *mz_version(void)
+{
+ return MZ_VERSION;
+}
+
+#ifndef MINIZ_NO_ZLIB_APIS
+
+int mz_deflateInit(mz_streamp pStream, int level)
+{
+ return mz_deflateInit2(pStream, level, MZ_DEFLATED, MZ_DEFAULT_WINDOW_BITS, 9, MZ_DEFAULT_STRATEGY);
+}
+
+int mz_deflateInit2(mz_streamp pStream, int level, int method, int window_bits, int mem_level, int strategy)
+{
+ tdefl_compressor *pComp;
+ mz_uint comp_flags = TDEFL_COMPUTE_ADLER32 | tdefl_create_comp_flags_from_zip_params(level, window_bits, strategy);
+
+ if (!pStream)
+ return MZ_STREAM_ERROR;
+ if ((method != MZ_DEFLATED) || ((mem_level < 1) || (mem_level > 9)) || ((window_bits != MZ_DEFAULT_WINDOW_BITS) && (-window_bits != MZ_DEFAULT_WINDOW_BITS)))
+ return MZ_PARAM_ERROR;
+
+ pStream->data_type = 0;
+ pStream->adler = MZ_ADLER32_INIT;
+ pStream->msg = NULL;
+ pStream->reserved = 0;
+ pStream->total_in = 0;
+ pStream->total_out = 0;
+ if (!pStream->zalloc)
+ pStream->zalloc = miniz_def_alloc_func;
+ if (!pStream->zfree)
+ pStream->zfree = miniz_def_free_func;
+
+ pComp = (tdefl_compressor *)pStream->zalloc(pStream->opaque, 1, sizeof(tdefl_compressor));
+ if (!pComp)
+ return MZ_MEM_ERROR;
+
+ pStream->state = (struct mz_internal_state *)pComp;
+
+ if (tdefl_init(pComp, NULL, NULL, comp_flags) != TDEFL_STATUS_OKAY)
+ {
+ mz_deflateEnd(pStream);
+ return MZ_PARAM_ERROR;
+ }
+
+ return MZ_OK;
+}
+
+int mz_deflateReset(mz_streamp pStream)
+{
+ if ((!pStream) || (!pStream->state) || (!pStream->zalloc) || (!pStream->zfree))
+ return MZ_STREAM_ERROR;
+ pStream->total_in = pStream->total_out = 0;
+ tdefl_init((tdefl_compressor *)pStream->state, NULL, NULL, ((tdefl_compressor *)pStream->state)->m_flags);
+ return MZ_OK;
+}
+
+int mz_deflate(mz_streamp pStream, int flush)
+{
+ size_t in_bytes, out_bytes;
+ mz_ulong orig_total_in, orig_total_out;
+ int mz_status = MZ_OK;
+
+ if ((!pStream) || (!pStream->state) || (flush < 0) || (flush > MZ_FINISH) || (!pStream->next_out))
+ return MZ_STREAM_ERROR;
+ if (!pStream->avail_out)
+ return MZ_BUF_ERROR;
+
+ if (flush == MZ_PARTIAL_FLUSH)
+ flush = MZ_SYNC_FLUSH;
+
+ if (((tdefl_compressor *)pStream->state)->m_prev_return_status == TDEFL_STATUS_DONE)
+ return (flush == MZ_FINISH) ? MZ_STREAM_END : MZ_BUF_ERROR;
+
+ orig_total_in = pStream->total_in;
+ orig_total_out = pStream->total_out;
+ for (;;)
+ {
+ tdefl_status defl_status;
+ in_bytes = pStream->avail_in;
+ out_bytes = pStream->avail_out;
+
+ defl_status = tdefl_compress((tdefl_compressor *)pStream->state, pStream->next_in, &in_bytes, pStream->next_out, &out_bytes, (tdefl_flush)flush);
+ pStream->next_in += (mz_uint)in_bytes;
+ pStream->avail_in -= (mz_uint)in_bytes;
+ pStream->total_in += (mz_uint)in_bytes;
+ pStream->adler = tdefl_get_adler32((tdefl_compressor *)pStream->state);
+
+ pStream->next_out += (mz_uint)out_bytes;
+ pStream->avail_out -= (mz_uint)out_bytes;
+ pStream->total_out += (mz_uint)out_bytes;
+
+ if (defl_status < 0)
+ {
+ mz_status = MZ_STREAM_ERROR;
+ break;
+ }
+ else if (defl_status == TDEFL_STATUS_DONE)
+ {
+ mz_status = MZ_STREAM_END;
+ break;
+ }
+ else if (!pStream->avail_out)
+ break;
+ else if ((!pStream->avail_in) && (flush != MZ_FINISH))
+ {
+ if ((flush) || (pStream->total_in != orig_total_in) || (pStream->total_out != orig_total_out))
+ break;
+            return MZ_BUF_ERROR; /* Can't make forward progress without some input. */
+ }
+ }
+ return mz_status;
+}
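+
+/* Illustrative streaming sketch (editorial note, not part of miniz;
+ * read_input()/write_output() are hypothetical helpers): mz_deflate() is
+ * driven zlib-style, with MZ_NO_FLUSH while input remains and MZ_FINISH once
+ * the end of the input is reached. On success the loop ends with
+ * status == MZ_STREAM_END.
+ *
+ *     mz_stream strm;
+ *     unsigned char in[16384], out[16384];
+ *     int eof = 0, status;
+ *     memset(&strm, 0, sizeof(strm));
+ *     status = mz_deflateInit(&strm, MZ_DEFAULT_COMPRESSION);
+ *     while (status == MZ_OK)
+ *     {
+ *         if ((!strm.avail_in) && (!eof))
+ *         {
+ *             strm.next_in = in;
+ *             strm.avail_in = (mz_uint32)read_input(in, sizeof(in), &eof);
+ *         }
+ *         strm.next_out = out;
+ *         strm.avail_out = sizeof(out);
+ *         status = mz_deflate(&strm, eof ? MZ_FINISH : MZ_NO_FLUSH);
+ *         write_output(out, sizeof(out) - strm.avail_out);
+ *     }
+ *     mz_deflateEnd(&strm);
+ */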
+
+int mz_deflateEnd(mz_streamp pStream)
+{
+ if (!pStream)
+ return MZ_STREAM_ERROR;
+ if (pStream->state)
+ {
+ pStream->zfree(pStream->opaque, pStream->state);
+ pStream->state = NULL;
+ }
+ return MZ_OK;
+}
+
+mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len)
+{
+ (void)pStream;
+    /* This is really over-conservative. (And lame, but it's actually pretty tricky to compute a true upper bound given the way tdefl's blocking works.) */
+ return MZ_MAX(128 + (source_len * 110) / 100, 128 + source_len + ((source_len / (31 * 1024)) + 1) * 5);
+}
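+
+/* Worked example (editorial note): for source_len = 1,000,000 the two
+ * candidates are 128 + 1,100,000 = 1,100,128 and
+ * 128 + 1,000,000 + ((1,000,000 / 31744) + 1) * 5 = 1,000,288, so the bound
+ * returned is 1,100,128 - roughly 10% overhead on incompressible input. */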
+
+int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len, int level)
+{
+ int status;
+ mz_stream stream;
+ memset(&stream, 0, sizeof(stream));
+
+    /* In case mz_ulong is 64 bits (argh, I hate longs). */
+ if ((source_len | *pDest_len) > 0xFFFFFFFFU)
+ return MZ_PARAM_ERROR;
+
+ stream.next_in = pSource;
+ stream.avail_in = (mz_uint32)source_len;
+ stream.next_out = pDest;
+ stream.avail_out = (mz_uint32)*pDest_len;
+
+ status = mz_deflateInit(&stream, level);
+ if (status != MZ_OK)
+ return status;
+
+ status = mz_deflate(&stream, MZ_FINISH);
+ if (status != MZ_STREAM_END)
+ {
+ mz_deflateEnd(&stream);
+ return (status == MZ_OK) ? MZ_BUF_ERROR : status;
+ }
+
+ *pDest_len = stream.total_out;
+ return mz_deflateEnd(&stream);
+}
+
+int mz_compress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len)
+{
+ return mz_compress2(pDest, pDest_len, pSource, source_len, MZ_DEFAULT_COMPRESSION);
+}
+
+mz_ulong mz_compressBound(mz_ulong source_len)
+{
+ return mz_deflateBound(NULL, source_len);
+}
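+
+/* Illustrative one-shot sketch (editorial note, not part of miniz): size the
+ * destination with mz_compressBound(); mz_compress() then shrinks *pDest_len
+ * to the actual compressed size.
+ *
+ *     mz_ulong comp_len = mz_compressBound(src_len);
+ *     unsigned char *comp = (unsigned char *)malloc(comp_len);
+ *     if ((comp) && (mz_compress(comp, &comp_len, src, src_len) == MZ_OK))
+ *     {
+ *         // comp_len now holds the compressed size
+ *     }
+ */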
+
+typedef struct
+{
+ tinfl_decompressor m_decomp;
+ mz_uint m_dict_ofs, m_dict_avail, m_first_call, m_has_flushed;
+ int m_window_bits;
+ mz_uint8 m_dict[TINFL_LZ_DICT_SIZE];
+ tinfl_status m_last_status;
+} inflate_state;
+
+int mz_inflateInit2(mz_streamp pStream, int window_bits)
+{
+ inflate_state *pDecomp;
+ if (!pStream)
+ return MZ_STREAM_ERROR;
+ if ((window_bits != MZ_DEFAULT_WINDOW_BITS) && (-window_bits != MZ_DEFAULT_WINDOW_BITS))
+ return MZ_PARAM_ERROR;
+
+ pStream->data_type = 0;
+ pStream->adler = 0;
+ pStream->msg = NULL;
+ pStream->total_in = 0;
+ pStream->total_out = 0;
+ pStream->reserved = 0;
+ if (!pStream->zalloc)
+ pStream->zalloc = miniz_def_alloc_func;
+ if (!pStream->zfree)
+ pStream->zfree = miniz_def_free_func;
+
+ pDecomp = (inflate_state *)pStream->zalloc(pStream->opaque, 1, sizeof(inflate_state));
+ if (!pDecomp)
+ return MZ_MEM_ERROR;
+
+ pStream->state = (struct mz_internal_state *)pDecomp;
+
+ tinfl_init(&pDecomp->m_decomp);
+ pDecomp->m_dict_ofs = 0;
+ pDecomp->m_dict_avail = 0;
+ pDecomp->m_last_status = TINFL_STATUS_NEEDS_MORE_INPUT;
+ pDecomp->m_first_call = 1;
+ pDecomp->m_has_flushed = 0;
+ pDecomp->m_window_bits = window_bits;
+
+ return MZ_OK;
+}
+
+int mz_inflateInit(mz_streamp pStream)
+{
+ return mz_inflateInit2(pStream, MZ_DEFAULT_WINDOW_BITS);
+}
+
+int mz_inflateReset(mz_streamp pStream)
+{
+ inflate_state *pDecomp;
+ if (!pStream)
+ return MZ_STREAM_ERROR;
+
+ pStream->data_type = 0;
+ pStream->adler = 0;
+ pStream->msg = NULL;
+ pStream->total_in = 0;
+ pStream->total_out = 0;
+ pStream->reserved = 0;
+
+ pDecomp = (inflate_state *)pStream->state;
+
+ tinfl_init(&pDecomp->m_decomp);
+ pDecomp->m_dict_ofs = 0;
+ pDecomp->m_dict_avail = 0;
+ pDecomp->m_last_status = TINFL_STATUS_NEEDS_MORE_INPUT;
+ pDecomp->m_first_call = 1;
+ pDecomp->m_has_flushed = 0;
+    /* Note: pDecomp->m_window_bits is deliberately left unchanged, so a reset keeps the window size chosen at mz_inflateInit2() time. */
+
+ return MZ_OK;
+}
+
+int mz_inflate(mz_streamp pStream, int flush)
+{
+ inflate_state *pState;
+ mz_uint n, first_call, decomp_flags = TINFL_FLAG_COMPUTE_ADLER32;
+ size_t in_bytes, out_bytes, orig_avail_in;
+ tinfl_status status;
+
+ if ((!pStream) || (!pStream->state))
+ return MZ_STREAM_ERROR;
+ if (flush == MZ_PARTIAL_FLUSH)
+ flush = MZ_SYNC_FLUSH;
+ if ((flush) && (flush != MZ_SYNC_FLUSH) && (flush != MZ_FINISH))
+ return MZ_STREAM_ERROR;
+
+ pState = (inflate_state *)pStream->state;
+ if (pState->m_window_bits > 0)
+ decomp_flags |= TINFL_FLAG_PARSE_ZLIB_HEADER;
+ orig_avail_in = pStream->avail_in;
+
+ first_call = pState->m_first_call;
+ pState->m_first_call = 0;
+ if (pState->m_last_status < 0)
+ return MZ_DATA_ERROR;
+
+ if (pState->m_has_flushed && (flush != MZ_FINISH))
+ return MZ_STREAM_ERROR;
+ pState->m_has_flushed |= (flush == MZ_FINISH);
+
+ if ((flush == MZ_FINISH) && (first_call))
+ {
+ /* MZ_FINISH on the first call implies that the input and output buffers are large enough to hold the entire compressed/decompressed file. */
+ decomp_flags |= TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF;
+ in_bytes = pStream->avail_in;
+ out_bytes = pStream->avail_out;
+ status = tinfl_decompress(&pState->m_decomp, pStream->next_in, &in_bytes, pStream->next_out, pStream->next_out, &out_bytes, decomp_flags);
+ pState->m_last_status = status;
+ pStream->next_in += (mz_uint)in_bytes;
+ pStream->avail_in -= (mz_uint)in_bytes;
+ pStream->total_in += (mz_uint)in_bytes;
+ pStream->adler = tinfl_get_adler32(&pState->m_decomp);
+ pStream->next_out += (mz_uint)out_bytes;
+ pStream->avail_out -= (mz_uint)out_bytes;
+ pStream->total_out += (mz_uint)out_bytes;
+
+ if (status < 0)
+ return MZ_DATA_ERROR;
+ else if (status != TINFL_STATUS_DONE)
+ {
+ pState->m_last_status = TINFL_STATUS_FAILED;
+ return MZ_BUF_ERROR;
+ }
+ return MZ_STREAM_END;
+ }
+    /* If flush != MZ_FINISH then we must assume there's more input. */
+ if (flush != MZ_FINISH)
+ decomp_flags |= TINFL_FLAG_HAS_MORE_INPUT;
+
+ if (pState->m_dict_avail)
+ {
+ n = MZ_MIN(pState->m_dict_avail, pStream->avail_out);
+ memcpy(pStream->next_out, pState->m_dict + pState->m_dict_ofs, n);
+ pStream->next_out += n;
+ pStream->avail_out -= n;
+ pStream->total_out += n;
+ pState->m_dict_avail -= n;
+ pState->m_dict_ofs = (pState->m_dict_ofs + n) & (TINFL_LZ_DICT_SIZE - 1);
+ return ((pState->m_last_status == TINFL_STATUS_DONE) && (!pState->m_dict_avail)) ? MZ_STREAM_END : MZ_OK;
+ }
+
+ for (;;)
+ {
+ in_bytes = pStream->avail_in;
+ out_bytes = TINFL_LZ_DICT_SIZE - pState->m_dict_ofs;
+
+ status = tinfl_decompress(&pState->m_decomp, pStream->next_in, &in_bytes, pState->m_dict, pState->m_dict + pState->m_dict_ofs, &out_bytes, decomp_flags);
+ pState->m_last_status = status;
+
+ pStream->next_in += (mz_uint)in_bytes;
+ pStream->avail_in -= (mz_uint)in_bytes;
+ pStream->total_in += (mz_uint)in_bytes;
+ pStream->adler = tinfl_get_adler32(&pState->m_decomp);
+
+ pState->m_dict_avail = (mz_uint)out_bytes;
+
+ n = MZ_MIN(pState->m_dict_avail, pStream->avail_out);
+ memcpy(pStream->next_out, pState->m_dict + pState->m_dict_ofs, n);
+ pStream->next_out += n;
+ pStream->avail_out -= n;
+ pStream->total_out += n;
+ pState->m_dict_avail -= n;
+ pState->m_dict_ofs = (pState->m_dict_ofs + n) & (TINFL_LZ_DICT_SIZE - 1);
+
+ if (status < 0)
+ return MZ_DATA_ERROR; /* Stream is corrupted (there could be some uncompressed data left in the output dictionary - oh well). */
+ else if ((status == TINFL_STATUS_NEEDS_MORE_INPUT) && (!orig_avail_in))
+ return MZ_BUF_ERROR; /* Signal caller that we can't make forward progress without supplying more input or by setting flush to MZ_FINISH. */
+ else if (flush == MZ_FINISH)
+ {
+            /* The output buffer MUST be large enough to hold the remaining uncompressed data when flush == MZ_FINISH. */
+ if (status == TINFL_STATUS_DONE)
+ return pState->m_dict_avail ? MZ_BUF_ERROR : MZ_STREAM_END;
+ /* status here must be TINFL_STATUS_HAS_MORE_OUTPUT, which means there's at least 1 more byte on the way. If there's no more room left in the output buffer then something is wrong. */
+ else if (!pStream->avail_out)
+ return MZ_BUF_ERROR;
+ }
+ else if ((status == TINFL_STATUS_DONE) || (!pStream->avail_in) || (!pStream->avail_out) || (pState->m_dict_avail))
+ break;
+ }
+
+ return ((status == TINFL_STATUS_DONE) && (!pState->m_dict_avail)) ? MZ_STREAM_END : MZ_OK;
+}
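+
+/* Illustrative streaming sketch (editorial note, not part of miniz;
+ * read_input()/write_output() are hypothetical helpers): mz_inflate() mirrors
+ * zlib's inflate() loop. MZ_STREAM_END marks a complete stream; any other
+ * status that ends the loop is an error (e.g. MZ_BUF_ERROR on truncated input).
+ *
+ *     mz_stream strm;
+ *     unsigned char in[16384], out[16384];
+ *     int status;
+ *     memset(&strm, 0, sizeof(strm));
+ *     status = mz_inflateInit(&strm);
+ *     while (status == MZ_OK)
+ *     {
+ *         if (!strm.avail_in)
+ *         {
+ *             strm.next_in = in;
+ *             strm.avail_in = (mz_uint32)read_input(in, sizeof(in));
+ *         }
+ *         strm.next_out = out;
+ *         strm.avail_out = sizeof(out);
+ *         status = mz_inflate(&strm, MZ_NO_FLUSH);
+ *         write_output(out, sizeof(out) - strm.avail_out);
+ *     }
+ *     mz_inflateEnd(&strm);
+ */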
+
+int mz_inflateEnd(mz_streamp pStream)
+{
+ if (!pStream)
+ return MZ_STREAM_ERROR;
+ if (pStream->state)
+ {
+ pStream->zfree(pStream->opaque, pStream->state);
+ pStream->state = NULL;
+ }
+ return MZ_OK;
+}
+int mz_uncompress2(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong *pSource_len)
+{
+ mz_stream stream;
+ int status;
+ memset(&stream, 0, sizeof(stream));
+
+    /* In case mz_ulong is 64 bits (argh, I hate longs). */
+ if ((*pSource_len | *pDest_len) > 0xFFFFFFFFU)
+ return MZ_PARAM_ERROR;
+
+ stream.next_in = pSource;
+ stream.avail_in = (mz_uint32)*pSource_len;
+ stream.next_out = pDest;
+ stream.avail_out = (mz_uint32)*pDest_len;
+
+ status = mz_inflateInit(&stream);
+ if (status != MZ_OK)
+ return status;
+
+ status = mz_inflate(&stream, MZ_FINISH);
+ *pSource_len = *pSource_len - stream.avail_in;
+ if (status != MZ_STREAM_END)
+ {
+ mz_inflateEnd(&stream);
+ return ((status == MZ_BUF_ERROR) && (!stream.avail_in)) ? MZ_DATA_ERROR : status;
+ }
+ *pDest_len = stream.total_out;
+
+ return mz_inflateEnd(&stream);
+}
+
+int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len)
+{
+ return mz_uncompress2(pDest, pDest_len, pSource, &source_len);
+}
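+
+/* Illustrative round trip (editorial note, not part of miniz): the caller must
+ * know (or over-estimate) the original size up front.
+ *
+ *     mz_ulong out_len = orig_len; // recorded original size
+ *     int status = mz_uncompress(out, &out_len, comp, comp_len);
+ *     // on success, status == MZ_OK and out_len == orig_len
+ */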
+
+const char *mz_error(int err)
+{
+ static struct
+ {
+ int m_err;
+ const char *m_pDesc;
+ } s_error_descs[] =
+ {
+            { MZ_OK, "" },
+            { MZ_STREAM_END, "stream end" },
+            { MZ_NEED_DICT, "need dictionary" },
+            { MZ_ERRNO, "file error" },
+            { MZ_STREAM_ERROR, "stream error" },
+            { MZ_DATA_ERROR, "data error" },
+            { MZ_MEM_ERROR, "out of memory" },
+            { MZ_BUF_ERROR, "buf error" },
+            { MZ_VERSION_ERROR, "version error" },
+            { MZ_PARAM_ERROR, "parameter error" }
+ };
+ mz_uint i;
+ for (i = 0; i < sizeof(s_error_descs) / sizeof(s_error_descs[0]); ++i)
+ if (s_error_descs[i].m_err == err)
+ return s_error_descs[i].m_pDesc;
+ return NULL;
+}
+
+#endif /*MINIZ_NO_ZLIB_APIS */
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ This is free and unencumbered software released into the public domain.
+
+ Anyone is free to copy, modify, publish, use, compile, sell, or
+ distribute this software, either in source code form or as a compiled
+ binary, for any purpose, commercial or non-commercial, and by any
+ means.
+
+ In jurisdictions that recognize copyright laws, the author or authors
+ of this software dedicate any and all copyright interest in the
+ software to the public domain. We make this dedication for the benefit
+ of the public at large and to the detriment of our heirs and
+ successors. We intend this dedication to be an overt act of
+ relinquishment in perpetuity of all present and future rights to this
+ software under copyright law.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+
+ For more information, please refer to <http://unlicense.org/>
+*/
+/**************************************************************************
+ *
+ * Copyright 2013-2014 RAD Game Tools and Valve Software
+ * Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------- Low-level Compression (independent of all decompression APIs) */
+
+/* Purposely making these tables static for faster init and thread safety. */
+static const mz_uint16 s_tdefl_len_sym[256] =
+ {
+ 257, 258, 259, 260, 261, 262, 263, 264, 265, 265, 266, 266, 267, 267, 268, 268, 269, 269, 269, 269, 270, 270, 270, 270, 271, 271, 271, 271, 272, 272, 272, 272,
+ 273, 273, 273, 273, 273, 273, 273, 273, 274, 274, 274, 274, 274, 274, 274, 274, 275, 275, 275, 275, 275, 275, 275, 275, 276, 276, 276, 276, 276, 276, 276, 276,
+ 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278,
+ 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280,
+ 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281,
+ 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282,
+ 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283,
+ 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 285
+ };
+
+static const mz_uint8 s_tdefl_len_extra[256] =
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0
+ };
+
+static const mz_uint8 s_tdefl_small_dist_sym[512] =
+ {
+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17
+ };
+
+static const mz_uint8 s_tdefl_small_dist_extra[512] =
+ {
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7
+ };
+
+static const mz_uint8 s_tdefl_large_dist_sym[128] =
+ {
+ 0, 0, 18, 19, 20, 20, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
+ };
+
+static const mz_uint8 s_tdefl_large_dist_extra[128] =
+ {
+ 0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
+ };
+
+/* Radix sorts tdefl_sym_freq[] array by 16-bit key m_key. Returns ptr to sorted values. */
+typedef struct
+{
+ mz_uint16 m_key, m_sym_index;
+} tdefl_sym_freq;
+static tdefl_sym_freq *tdefl_radix_sort_syms(mz_uint num_syms, tdefl_sym_freq *pSyms0, tdefl_sym_freq *pSyms1)
+{
+ mz_uint32 total_passes = 2, pass_shift, pass, i, hist[256 * 2];
+ tdefl_sym_freq *pCur_syms = pSyms0, *pNew_syms = pSyms1;
+ MZ_CLEAR_OBJ(hist);
+ for (i = 0; i < num_syms; i++)
+ {
+ mz_uint freq = pSyms0[i].m_key;
+ hist[freq & 0xFF]++;
+ hist[256 + ((freq >> 8) & 0xFF)]++;
+ }
+ while ((total_passes > 1) && (num_syms == hist[(total_passes - 1) * 256]))
+ total_passes--;
+ for (pass_shift = 0, pass = 0; pass < total_passes; pass++, pass_shift += 8)
+ {
+ const mz_uint32 *pHist = &hist[pass << 8];
+ mz_uint offsets[256], cur_ofs = 0;
+ for (i = 0; i < 256; i++)
+ {
+ offsets[i] = cur_ofs;
+ cur_ofs += pHist[i];
+ }
+ for (i = 0; i < num_syms; i++)
+ pNew_syms[offsets[(pCur_syms[i].m_key >> pass_shift) & 0xFF]++] = pCur_syms[i];
+ {
+ tdefl_sym_freq *t = pCur_syms;
+ pCur_syms = pNew_syms;
+ pNew_syms = t;
+ }
+ }
+ return pCur_syms;
+}
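+
+/* Note (editorial): this is a two-pass LSB-first counting sort over the 16-bit
+ * keys; the hist[] test above drops the high-byte pass entirely when every key
+ * fits in 8 bits, which is common for small blocks. */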
+
+/* tdefl_calculate_minimum_redundancy() originally written by: Alistair Moffat, alistair@cs.mu.oz.au, Jyrki Katajainen, jyrki@diku.dk, November 1996. */
+static void tdefl_calculate_minimum_redundancy(tdefl_sym_freq *A, int n)
+{
+ int root, leaf, next, avbl, used, dpth;
+ if (n == 0)
+ return;
+ else if (n == 1)
+ {
+ A[0].m_key = 1;
+ return;
+ }
+ A[0].m_key += A[1].m_key;
+ root = 0;
+ leaf = 2;
+ for (next = 1; next < n - 1; next++)
+ {
+ if (leaf >= n || A[root].m_key < A[leaf].m_key)
+ {
+ A[next].m_key = A[root].m_key;
+ A[root++].m_key = (mz_uint16)next;
+ }
+ else
+ A[next].m_key = A[leaf++].m_key;
+ if (leaf >= n || (root < next && A[root].m_key < A[leaf].m_key))
+ {
+ A[next].m_key = (mz_uint16)(A[next].m_key + A[root].m_key);
+ A[root++].m_key = (mz_uint16)next;
+ }
+ else
+ A[next].m_key = (mz_uint16)(A[next].m_key + A[leaf++].m_key);
+ }
+ A[n - 2].m_key = 0;
+ for (next = n - 3; next >= 0; next--)
+ A[next].m_key = A[A[next].m_key].m_key + 1;
+ avbl = 1;
+ used = dpth = 0;
+ root = n - 2;
+ next = n - 1;
+ while (avbl > 0)
+ {
+ while (root >= 0 && (int)A[root].m_key == dpth)
+ {
+ used++;
+ root--;
+ }
+ while (avbl > used)
+ {
+ A[next--].m_key = (mz_uint16)(dpth);
+ avbl--;
+ }
+ avbl = 2 * used;
+ dpth++;
+ used = 0;
+ }
+}
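+
+/* Note (editorial): this is the in-place Moffat/Katajainen algorithm. Given
+ * A[] sorted by frequency, it overwrites the keys with each symbol's depth in
+ * an optimal Huffman tree (its minimum-redundancy code length) without ever
+ * building an explicit tree. */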
+
+/* Limits canonical Huffman code table's max code size. */
+enum
+{
+ TDEFL_MAX_SUPPORTED_HUFF_CODESIZE = 32
+};
+static void tdefl_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size)
+{
+ int i;
+ mz_uint32 total = 0;
+ if (code_list_len <= 1)
+ return;
+ for (i = max_code_size + 1; i <= TDEFL_MAX_SUPPORTED_HUFF_CODESIZE; i++)
+ pNum_codes[max_code_size] += pNum_codes[i];
+ for (i = max_code_size; i > 0; i--)
+ total += (((mz_uint32)pNum_codes[i]) << (max_code_size - i));
+ while (total != (1UL << max_code_size))
+ {
+ pNum_codes[max_code_size]--;
+ for (i = max_code_size - 1; i > 0; i--)
+ if (pNum_codes[i])
+ {
+ pNum_codes[i]--;
+ pNum_codes[i + 1] += 2;
+ break;
+ }
+ total--;
+ }
+}
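+
+/* Note (editorial): the while loop above restores the Kraft equality
+ * sum(pNum_codes[i] << (max_code_size - i)) == 1 << max_code_size; each
+ * iteration drops one max-length code and splits a shorter code into two codes
+ * one bit longer, reducing the sum by exactly one. */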
+
+static void tdefl_optimize_huffman_table(tdefl_compressor *d, int table_num, int table_len, int code_size_limit, int static_table)
+{
+ int i, j, l, num_codes[1 + TDEFL_MAX_SUPPORTED_HUFF_CODESIZE];
+ mz_uint next_code[TDEFL_MAX_SUPPORTED_HUFF_CODESIZE + 1];
+ MZ_CLEAR_OBJ(num_codes);
+ if (static_table)
+ {
+ for (i = 0; i < table_len; i++)
+ num_codes[d->m_huff_code_sizes[table_num][i]]++;
+ }
+ else
+ {
+ tdefl_sym_freq syms0[TDEFL_MAX_HUFF_SYMBOLS], syms1[TDEFL_MAX_HUFF_SYMBOLS], *pSyms;
+ int num_used_syms = 0;
+ const mz_uint16 *pSym_count = &d->m_huff_count[table_num][0];
+ for (i = 0; i < table_len; i++)
+ if (pSym_count[i])
+ {
+ syms0[num_used_syms].m_key = (mz_uint16)pSym_count[i];
+ syms0[num_used_syms++].m_sym_index = (mz_uint16)i;
+ }
+
+ pSyms = tdefl_radix_sort_syms(num_used_syms, syms0, syms1);
+ tdefl_calculate_minimum_redundancy(pSyms, num_used_syms);
+
+ for (i = 0; i < num_used_syms; i++)
+ num_codes[pSyms[i].m_key]++;
+
+ tdefl_huffman_enforce_max_code_size(num_codes, num_used_syms, code_size_limit);
+
+ MZ_CLEAR_OBJ(d->m_huff_code_sizes[table_num]);
+ MZ_CLEAR_OBJ(d->m_huff_codes[table_num]);
+ for (i = 1, j = num_used_syms; i <= code_size_limit; i++)
+ for (l = num_codes[i]; l > 0; l--)
+ d->m_huff_code_sizes[table_num][pSyms[--j].m_sym_index] = (mz_uint8)(i);
+ }
+
+ next_code[1] = 0;
+ for (j = 0, i = 2; i <= code_size_limit; i++)
+ next_code[i] = j = ((j + num_codes[i - 1]) << 1);
+
+ for (i = 0; i < table_len; i++)
+ {
+ mz_uint rev_code = 0, code, code_size;
+ if ((code_size = d->m_huff_code_sizes[table_num][i]) == 0)
+ continue;
+ code = next_code[code_size]++;
+ for (l = code_size; l > 0; l--, code >>= 1)
+ rev_code = (rev_code << 1) | (code & 1);
+ d->m_huff_codes[table_num][i] = (mz_uint16)rev_code;
+ }
+}
+
+#define TDEFL_PUT_BITS(b, l) \
+ do \
+ { \
+ mz_uint bits = b; \
+ mz_uint len = l; \
+ MZ_ASSERT(bits <= ((1U << len) - 1U)); \
+ d->m_bit_buffer |= (bits << d->m_bits_in); \
+ d->m_bits_in += len; \
+ while (d->m_bits_in >= 8) \
+ { \
+ if (d->m_pOutput_buf < d->m_pOutput_buf_end) \
+ *d->m_pOutput_buf++ = (mz_uint8)(d->m_bit_buffer); \
+ d->m_bit_buffer >>= 8; \
+ d->m_bits_in -= 8; \
+ } \
+ } \
+ MZ_MACRO_END
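+
+/* Note (editorial): DEFLATE packs bits LSB-first, so TDEFL_PUT_BITS
+ * accumulates new bits above those already buffered and flushes whole bytes
+ * from the bottom of the bit buffer. */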
+
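+/* Note (editorial): the two helpers below emit DEFLATE's code-length RLE
+ * symbols (RFC 1951): code 16 repeats the previous length 3-6 times, code 17
+ * encodes runs of 3-10 zeros, and code 18 encodes runs of 11-138 zeros;
+ * shorter runs are emitted literally. */
+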
+#define TDEFL_RLE_PREV_CODE_SIZE() \
+ { \
+ if (rle_repeat_count) \
+ { \
+ if (rle_repeat_count < 3) \
+ { \
+ d->m_huff_count[2][prev_code_size] = (mz_uint16)(d->m_huff_count[2][prev_code_size] + rle_repeat_count); \
+ while (rle_repeat_count--) \
+ packed_code_sizes[num_packed_code_sizes++] = prev_code_size; \
+ } \
+ else \
+ { \
+ d->m_huff_count[2][16] = (mz_uint16)(d->m_huff_count[2][16] + 1); \
+ packed_code_sizes[num_packed_code_sizes++] = 16; \
+ packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_repeat_count - 3); \
+ } \
+ rle_repeat_count = 0; \
+ } \
+ }
+
+#define TDEFL_RLE_ZERO_CODE_SIZE() \
+ { \
+ if (rle_z_count) \
+ { \
+ if (rle_z_count < 3) \
+ { \
+ d->m_huff_count[2][0] = (mz_uint16)(d->m_huff_count[2][0] + rle_z_count); \
+ while (rle_z_count--) \
+ packed_code_sizes[num_packed_code_sizes++] = 0; \
+ } \
+ else if (rle_z_count <= 10) \
+ { \
+ d->m_huff_count[2][17] = (mz_uint16)(d->m_huff_count[2][17] + 1); \
+ packed_code_sizes[num_packed_code_sizes++] = 17; \
+ packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_z_count - 3); \
+ } \
+ else \
+ { \
+ d->m_huff_count[2][18] = (mz_uint16)(d->m_huff_count[2][18] + 1); \
+ packed_code_sizes[num_packed_code_sizes++] = 18; \
+ packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_z_count - 11); \
+ } \
+ rle_z_count = 0; \
+ } \
+ }
+
+static mz_uint8 s_tdefl_packed_code_size_syms_swizzle[] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
+
+static void tdefl_start_dynamic_block(tdefl_compressor *d)
+{
+ int num_lit_codes, num_dist_codes, num_bit_lengths;
+ mz_uint i, total_code_sizes_to_pack, num_packed_code_sizes, rle_z_count, rle_repeat_count, packed_code_sizes_index;
+ mz_uint8 code_sizes_to_pack[TDEFL_MAX_HUFF_SYMBOLS_0 + TDEFL_MAX_HUFF_SYMBOLS_1], packed_code_sizes[TDEFL_MAX_HUFF_SYMBOLS_0 + TDEFL_MAX_HUFF_SYMBOLS_1], prev_code_size = 0xFF;
+
+ d->m_huff_count[0][256] = 1;
+
+ tdefl_optimize_huffman_table(d, 0, TDEFL_MAX_HUFF_SYMBOLS_0, 15, MZ_FALSE);
+ tdefl_optimize_huffman_table(d, 1, TDEFL_MAX_HUFF_SYMBOLS_1, 15, MZ_FALSE);
+
+ for (num_lit_codes = 286; num_lit_codes > 257; num_lit_codes--)
+ if (d->m_huff_code_sizes[0][num_lit_codes - 1])
+ break;
+ for (num_dist_codes = 30; num_dist_codes > 1; num_dist_codes--)
+ if (d->m_huff_code_sizes[1][num_dist_codes - 1])
+ break;
+
+ memcpy(code_sizes_to_pack, &d->m_huff_code_sizes[0][0], num_lit_codes);
+ memcpy(code_sizes_to_pack + num_lit_codes, &d->m_huff_code_sizes[1][0], num_dist_codes);
+ total_code_sizes_to_pack = num_lit_codes + num_dist_codes;
+ num_packed_code_sizes = 0;
+ rle_z_count = 0;
+ rle_repeat_count = 0;
+
+ memset(&d->m_huff_count[2][0], 0, sizeof(d->m_huff_count[2][0]) * TDEFL_MAX_HUFF_SYMBOLS_2);
+ for (i = 0; i < total_code_sizes_to_pack; i++)
+ {
+ mz_uint8 code_size = code_sizes_to_pack[i];
+ if (!code_size)
+ {
+ TDEFL_RLE_PREV_CODE_SIZE();
+ if (++rle_z_count == 138)
+ {
+ TDEFL_RLE_ZERO_CODE_SIZE();
+ }
+ }
+ else
+ {
+ TDEFL_RLE_ZERO_CODE_SIZE();
+ if (code_size != prev_code_size)
+ {
+ TDEFL_RLE_PREV_CODE_SIZE();
+ d->m_huff_count[2][code_size] = (mz_uint16)(d->m_huff_count[2][code_size] + 1);
+ packed_code_sizes[num_packed_code_sizes++] = code_size;
+ }
+ else if (++rle_repeat_count == 6)
+ {
+ TDEFL_RLE_PREV_CODE_SIZE();
+ }
+ }
+ prev_code_size = code_size;
+ }
+ if (rle_repeat_count)
+ {
+ TDEFL_RLE_PREV_CODE_SIZE();
+ }
+ else
+ {
+ TDEFL_RLE_ZERO_CODE_SIZE();
+ }
+
+ tdefl_optimize_huffman_table(d, 2, TDEFL_MAX_HUFF_SYMBOLS_2, 7, MZ_FALSE);
+
+ TDEFL_PUT_BITS(2, 2);
+
+ TDEFL_PUT_BITS(num_lit_codes - 257, 5);
+ TDEFL_PUT_BITS(num_dist_codes - 1, 5);
+
+ for (num_bit_lengths = 18; num_bit_lengths >= 0; num_bit_lengths--)
+ if (d->m_huff_code_sizes[2][s_tdefl_packed_code_size_syms_swizzle[num_bit_lengths]])
+ break;
+ num_bit_lengths = MZ_MAX(4, (num_bit_lengths + 1));
+ TDEFL_PUT_BITS(num_bit_lengths - 4, 4);
+ for (i = 0; (int)i < num_bit_lengths; i++)
+ TDEFL_PUT_BITS(d->m_huff_code_sizes[2][s_tdefl_packed_code_size_syms_swizzle[i]], 3);
+
+ for (packed_code_sizes_index = 0; packed_code_sizes_index < num_packed_code_sizes;)
+ {
+ mz_uint code = packed_code_sizes[packed_code_sizes_index++];
+ MZ_ASSERT(code < TDEFL_MAX_HUFF_SYMBOLS_2);
+ TDEFL_PUT_BITS(d->m_huff_codes[2][code], d->m_huff_code_sizes[2][code]);
+ if (code >= 16)
+ TDEFL_PUT_BITS(packed_code_sizes[packed_code_sizes_index++], "\02\03\07"[code - 16]);
+ }
+}
+
+static void tdefl_start_static_block(tdefl_compressor *d)
+{
+ mz_uint i;
+ mz_uint8 *p = &d->m_huff_code_sizes[0][0];
+
+ for (i = 0; i <= 143; ++i)
+ *p++ = 8;
+ for (; i <= 255; ++i)
+ *p++ = 9;
+ for (; i <= 279; ++i)
+ *p++ = 7;
+ for (; i <= 287; ++i)
+ *p++ = 8;
+
+ memset(d->m_huff_code_sizes[1], 5, 32);
+
+ tdefl_optimize_huffman_table(d, 0, 288, 15, MZ_TRUE);
+ tdefl_optimize_huffman_table(d, 1, 32, 15, MZ_TRUE);
+
+ TDEFL_PUT_BITS(1, 2);
+}
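+
+/* Note (editorial): these are the fixed Huffman code lengths of RFC 1951:
+ * literals 0-143 use 8 bits, 144-255 use 9, length codes 256-279 use 7,
+ * 280-287 use 8, and all 32 distance codes use 5 bits. */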
+
+static const mz_uint mz_bitmasks[17] = { 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF };
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN && MINIZ_HAS_64BIT_REGISTERS
+static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d)
+{
+ mz_uint flags;
+ mz_uint8 *pLZ_codes;
+ mz_uint8 *pOutput_buf = d->m_pOutput_buf;
+ mz_uint8 *pLZ_code_buf_end = d->m_pLZ_code_buf;
+ mz_uint64 bit_buffer = d->m_bit_buffer;
+ mz_uint bits_in = d->m_bits_in;
+
+#define TDEFL_PUT_BITS_FAST(b, l) \
+ { \
+ bit_buffer |= (((mz_uint64)(b)) << bits_in); \
+ bits_in += (l); \
+ }
+
+ flags = 1;
+ for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < pLZ_code_buf_end; flags >>= 1)
+ {
+ if (flags == 1)
+ flags = *pLZ_codes++ | 0x100;
+
+ if (flags & 1)
+ {
+ mz_uint s0, s1, n0, n1, sym, num_extra_bits;
+ mz_uint match_len = pLZ_codes[0], match_dist = *(const mz_uint16 *)(pLZ_codes + 1);
+ pLZ_codes += 3;
+
+ MZ_ASSERT(d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+ TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][s_tdefl_len_sym[match_len]], d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+ TDEFL_PUT_BITS_FAST(match_len & mz_bitmasks[s_tdefl_len_extra[match_len]], s_tdefl_len_extra[match_len]);
+
+ /* This sequence coaxes MSVC into using cmov's vs. jmp's. */
+ s0 = s_tdefl_small_dist_sym[match_dist & 511];
+ n0 = s_tdefl_small_dist_extra[match_dist & 511];
+ s1 = s_tdefl_large_dist_sym[match_dist >> 8];
+ n1 = s_tdefl_large_dist_extra[match_dist >> 8];
+ sym = (match_dist < 512) ? s0 : s1;
+ num_extra_bits = (match_dist < 512) ? n0 : n1;
+
+ MZ_ASSERT(d->m_huff_code_sizes[1][sym]);
+ TDEFL_PUT_BITS_FAST(d->m_huff_codes[1][sym], d->m_huff_code_sizes[1][sym]);
+ TDEFL_PUT_BITS_FAST(match_dist & mz_bitmasks[num_extra_bits], num_extra_bits);
+ }
+ else
+ {
+ mz_uint lit = *pLZ_codes++;
+ MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+ TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+
+ if (((flags & 2) == 0) && (pLZ_codes < pLZ_code_buf_end))
+ {
+ flags >>= 1;
+ lit = *pLZ_codes++;
+ MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+ TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+
+ if (((flags & 2) == 0) && (pLZ_codes < pLZ_code_buf_end))
+ {
+ flags >>= 1;
+ lit = *pLZ_codes++;
+ MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+ TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+ }
+ }
+ }
+
+ if (pOutput_buf >= d->m_pOutput_buf_end)
+ return MZ_FALSE;
+
+ *(mz_uint64 *)pOutput_buf = bit_buffer;
+ pOutput_buf += (bits_in >> 3);
+ bit_buffer >>= (bits_in & ~7);
+ bits_in &= 7;
+ }
+
+#undef TDEFL_PUT_BITS_FAST
+
+ d->m_pOutput_buf = pOutput_buf;
+ d->m_bits_in = 0;
+ d->m_bit_buffer = 0;
+
+ while (bits_in)
+ {
+ mz_uint32 n = MZ_MIN(bits_in, 16);
+ TDEFL_PUT_BITS((mz_uint)bit_buffer & mz_bitmasks[n], n);
+ bit_buffer >>= n;
+ bits_in -= n;
+ }
+
+ TDEFL_PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]);
+
+ return (d->m_pOutput_buf < d->m_pOutput_buf_end);
+}
+#else
+static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d)
+{
+ mz_uint flags;
+ mz_uint8 *pLZ_codes;
+
+ flags = 1;
+ for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < d->m_pLZ_code_buf; flags >>= 1)
+ {
+ if (flags == 1)
+ flags = *pLZ_codes++ | 0x100;
+ if (flags & 1)
+ {
+ mz_uint sym, num_extra_bits;
+ mz_uint match_len = pLZ_codes[0], match_dist = (pLZ_codes[1] | (pLZ_codes[2] << 8));
+ pLZ_codes += 3;
+
+ MZ_ASSERT(d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+ TDEFL_PUT_BITS(d->m_huff_codes[0][s_tdefl_len_sym[match_len]], d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+ TDEFL_PUT_BITS(match_len & mz_bitmasks[s_tdefl_len_extra[match_len]], s_tdefl_len_extra[match_len]);
+
+ if (match_dist < 512)
+ {
+ sym = s_tdefl_small_dist_sym[match_dist];
+ num_extra_bits = s_tdefl_small_dist_extra[match_dist];
+ }
+ else
+ {
+ sym = s_tdefl_large_dist_sym[match_dist >> 8];
+ num_extra_bits = s_tdefl_large_dist_extra[match_dist >> 8];
+ }
+ MZ_ASSERT(d->m_huff_code_sizes[1][sym]);
+ TDEFL_PUT_BITS(d->m_huff_codes[1][sym], d->m_huff_code_sizes[1][sym]);
+ TDEFL_PUT_BITS(match_dist & mz_bitmasks[num_extra_bits], num_extra_bits);
+ }
+ else
+ {
+ mz_uint lit = *pLZ_codes++;
+ MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+ TDEFL_PUT_BITS(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+ }
+ }
+
+ TDEFL_PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]);
+
+ return (d->m_pOutput_buf < d->m_pOutput_buf_end);
+}
+#endif /* MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN && MINIZ_HAS_64BIT_REGISTERS */
+
+static mz_bool tdefl_compress_block(tdefl_compressor *d, mz_bool static_block)
+{
+ if (static_block)
+ tdefl_start_static_block(d);
+ else
+ tdefl_start_dynamic_block(d);
+ return tdefl_compress_lz_codes(d);
+}
+
+static int tdefl_flush_block(tdefl_compressor *d, int flush)
+{
+ mz_uint saved_bit_buf, saved_bits_in;
+ mz_uint8 *pSaved_output_buf;
+ mz_bool comp_block_succeeded = MZ_FALSE;
+ int n, use_raw_block = ((d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS) != 0) && (d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size;
+ mz_uint8 *pOutput_buf_start = ((d->m_pPut_buf_func == NULL) && ((*d->m_pOut_buf_size - d->m_out_buf_ofs) >= TDEFL_OUT_BUF_SIZE)) ? ((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs) : d->m_output_buf;
+
+ d->m_pOutput_buf = pOutput_buf_start;
+ d->m_pOutput_buf_end = d->m_pOutput_buf + TDEFL_OUT_BUF_SIZE - 16;
+
+ MZ_ASSERT(!d->m_output_flush_remaining);
+ d->m_output_flush_ofs = 0;
+ d->m_output_flush_remaining = 0;
+
+ *d->m_pLZ_flags = (mz_uint8)(*d->m_pLZ_flags >> d->m_num_flags_left);
+ d->m_pLZ_code_buf -= (d->m_num_flags_left == 8);
+
+ if ((d->m_flags & TDEFL_WRITE_ZLIB_HEADER) && (!d->m_block_index))
+ {
+ TDEFL_PUT_BITS(0x78, 8);
+ TDEFL_PUT_BITS(0x01, 8);
+ }
+
+ TDEFL_PUT_BITS(flush == TDEFL_FINISH, 1);
+
+ pSaved_output_buf = d->m_pOutput_buf;
+ saved_bit_buf = d->m_bit_buffer;
+ saved_bits_in = d->m_bits_in;
+
+ if (!use_raw_block)
+ comp_block_succeeded = tdefl_compress_block(d, (d->m_flags & TDEFL_FORCE_ALL_STATIC_BLOCKS) || (d->m_total_lz_bytes < 48));
+
+ /* If the block gets expanded, forget the current contents of the output buffer and send a raw block instead. */
+ if (((use_raw_block) || ((d->m_total_lz_bytes) && ((d->m_pOutput_buf - pSaved_output_buf + 1U) >= d->m_total_lz_bytes))) &&
+ ((d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size))
+ {
+ mz_uint i;
+ d->m_pOutput_buf = pSaved_output_buf;
+ d->m_bit_buffer = saved_bit_buf, d->m_bits_in = saved_bits_in;
+ TDEFL_PUT_BITS(0, 2);
+ if (d->m_bits_in)
+ {
+ TDEFL_PUT_BITS(0, 8 - d->m_bits_in);
+ }
+ for (i = 2; i; --i, d->m_total_lz_bytes ^= 0xFFFF)
+ {
+ TDEFL_PUT_BITS(d->m_total_lz_bytes & 0xFFFF, 16);
+ }
+ for (i = 0; i < d->m_total_lz_bytes; ++i)
+ {
+ TDEFL_PUT_BITS(d->m_dict[(d->m_lz_code_buf_dict_pos + i) & TDEFL_LZ_DICT_SIZE_MASK], 8);
+ }
+ }
+ /* Check for the extremely unlikely (if not impossible) case of the compressed block not fitting into the output buffer when using dynamic codes. */
+ else if (!comp_block_succeeded)
+ {
+ d->m_pOutput_buf = pSaved_output_buf;
+ d->m_bit_buffer = saved_bit_buf, d->m_bits_in = saved_bits_in;
+ tdefl_compress_block(d, MZ_TRUE);
+ }
+
+ if (flush)
+ {
+ if (flush == TDEFL_FINISH)
+ {
+ if (d->m_bits_in)
+ {
+ TDEFL_PUT_BITS(0, 8 - d->m_bits_in);
+ }
+ if (d->m_flags & TDEFL_WRITE_ZLIB_HEADER)
+ {
+ mz_uint i, a = d->m_adler32;
+ for (i = 0; i < 4; i++)
+ {
+ TDEFL_PUT_BITS((a >> 24) & 0xFF, 8);
+ a <<= 8;
+ }
+ }
+ }
+ else
+ {
+ mz_uint i, z = 0;
+ TDEFL_PUT_BITS(0, 3);
+ if (d->m_bits_in)
+ {
+ TDEFL_PUT_BITS(0, 8 - d->m_bits_in);
+ }
+ for (i = 2; i; --i, z ^= 0xFFFF)
+ {
+ TDEFL_PUT_BITS(z & 0xFFFF, 16);
+ }
+ }
+ }
+
+ MZ_ASSERT(d->m_pOutput_buf < d->m_pOutput_buf_end);
+
+ memset(&d->m_huff_count[0][0], 0, sizeof(d->m_huff_count[0][0]) * TDEFL_MAX_HUFF_SYMBOLS_0);
+ memset(&d->m_huff_count[1][0], 0, sizeof(d->m_huff_count[1][0]) * TDEFL_MAX_HUFF_SYMBOLS_1);
+
+ d->m_pLZ_code_buf = d->m_lz_code_buf + 1;
+ d->m_pLZ_flags = d->m_lz_code_buf;
+ d->m_num_flags_left = 8;
+ d->m_lz_code_buf_dict_pos += d->m_total_lz_bytes;
+ d->m_total_lz_bytes = 0;
+ d->m_block_index++;
+
+ if ((n = (int)(d->m_pOutput_buf - pOutput_buf_start)) != 0)
+ {
+ if (d->m_pPut_buf_func)
+ {
+ *d->m_pIn_buf_size = d->m_pSrc - (const mz_uint8 *)d->m_pIn_buf;
+ if (!(*d->m_pPut_buf_func)(d->m_output_buf, n, d->m_pPut_buf_user))
+ return (d->m_prev_return_status = TDEFL_STATUS_PUT_BUF_FAILED);
+ }
+ else if (pOutput_buf_start == d->m_output_buf)
+ {
+ int bytes_to_copy = (int)MZ_MIN((size_t)n, (size_t)(*d->m_pOut_buf_size - d->m_out_buf_ofs));
+ memcpy((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs, d->m_output_buf, bytes_to_copy);
+ d->m_out_buf_ofs += bytes_to_copy;
+ if ((n -= bytes_to_copy) != 0)
+ {
+ d->m_output_flush_ofs = bytes_to_copy;
+ d->m_output_flush_remaining = n;
+ }
+ }
+ else
+ {
+ d->m_out_buf_ofs += n;
+ }
+ }
+
+ return d->m_output_flush_remaining;
+}
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES
+#ifdef MINIZ_UNALIGNED_USE_MEMCPY
+static mz_uint16 TDEFL_READ_UNALIGNED_WORD(const mz_uint8* p)
+{
+ mz_uint16 ret;
+ memcpy(&ret, p, sizeof(mz_uint16));
+ return ret;
+}
+static mz_uint16 TDEFL_READ_UNALIGNED_WORD2(const mz_uint16* p)
+{
+ mz_uint16 ret;
+ memcpy(&ret, p, sizeof(mz_uint16));
+ return ret;
+}
+#else
+#define TDEFL_READ_UNALIGNED_WORD(p) *(const mz_uint16 *)(p)
+#define TDEFL_READ_UNALIGNED_WORD2(p) *(const mz_uint16 *)(p)
+#endif
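+/* Note (editorial): hash-chain match finder. Both variants of tdefl_find_match
+ * walk the d->m_next[] probe chains backwards through the dictionary,
+ * rejecting candidates cheaply on the byte(s) at the current best match length
+ * before running the full comparison loop. */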
+static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint lookahead_pos, mz_uint max_dist, mz_uint max_match_len, mz_uint *pMatch_dist, mz_uint *pMatch_len)
+{
+ mz_uint dist, pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK, match_len = *pMatch_len, probe_pos = pos, next_probe_pos, probe_len;
+ mz_uint num_probes_left = d->m_max_probes[match_len >= 32];
+ const mz_uint16 *s = (const mz_uint16 *)(d->m_dict + pos), *p, *q;
+ mz_uint16 c01 = TDEFL_READ_UNALIGNED_WORD(&d->m_dict[pos + match_len - 1]), s01 = TDEFL_READ_UNALIGNED_WORD2(s);
+ MZ_ASSERT(max_match_len <= TDEFL_MAX_MATCH_LEN);
+ if (max_match_len <= match_len)
+ return;
+ for (;;)
+ {
+ for (;;)
+ {
+ if (--num_probes_left == 0)
+ return;
+#define TDEFL_PROBE \
+ next_probe_pos = d->m_next[probe_pos]; \
+ if ((!next_probe_pos) || ((dist = (mz_uint16)(lookahead_pos - next_probe_pos)) > max_dist)) \
+ return; \
+ probe_pos = next_probe_pos & TDEFL_LZ_DICT_SIZE_MASK; \
+ if (TDEFL_READ_UNALIGNED_WORD(&d->m_dict[probe_pos + match_len - 1]) == c01) \
+ break;
+ TDEFL_PROBE;
+ TDEFL_PROBE;
+ TDEFL_PROBE;
+ }
+ if (!dist)
+ break;
+ q = (const mz_uint16 *)(d->m_dict + probe_pos);
+ if (TDEFL_READ_UNALIGNED_WORD2(q) != s01)
+ continue;
+ p = s;
+ probe_len = 32;
+ do
+ {
+ } while ((TDEFL_READ_UNALIGNED_WORD2(++p) == TDEFL_READ_UNALIGNED_WORD2(++q)) && (TDEFL_READ_UNALIGNED_WORD2(++p) == TDEFL_READ_UNALIGNED_WORD2(++q)) &&
+ (TDEFL_READ_UNALIGNED_WORD2(++p) == TDEFL_READ_UNALIGNED_WORD2(++q)) && (TDEFL_READ_UNALIGNED_WORD2(++p) == TDEFL_READ_UNALIGNED_WORD2(++q)) && (--probe_len > 0));
+ if (!probe_len)
+ {
+ *pMatch_dist = dist;
+ *pMatch_len = MZ_MIN(max_match_len, (mz_uint)TDEFL_MAX_MATCH_LEN);
+ break;
+ }
+ else if ((probe_len = ((mz_uint)(p - s) * 2) + (mz_uint)(*(const mz_uint8 *)p == *(const mz_uint8 *)q)) > match_len)
+ {
+ *pMatch_dist = dist;
+ if ((*pMatch_len = match_len = MZ_MIN(max_match_len, probe_len)) == max_match_len)
+ break;
+ c01 = TDEFL_READ_UNALIGNED_WORD(&d->m_dict[pos + match_len - 1]);
+ }
+ }
+}
+#else
+static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint lookahead_pos, mz_uint max_dist, mz_uint max_match_len, mz_uint *pMatch_dist, mz_uint *pMatch_len)
+{
+ mz_uint dist, pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK, match_len = *pMatch_len, probe_pos = pos, next_probe_pos, probe_len;
+ mz_uint num_probes_left = d->m_max_probes[match_len >= 32];
+ const mz_uint8 *s = d->m_dict + pos, *p, *q;
+ mz_uint8 c0 = d->m_dict[pos + match_len], c1 = d->m_dict[pos + match_len - 1];
+ MZ_ASSERT(max_match_len <= TDEFL_MAX_MATCH_LEN);
+ if (max_match_len <= match_len)
+ return;
+ for (;;)
+ {
+ for (;;)
+ {
+ if (--num_probes_left == 0)
+ return;
+#define TDEFL_PROBE \
+ next_probe_pos = d->m_next[probe_pos]; \
+ if ((!next_probe_pos) || ((dist = (mz_uint16)(lookahead_pos - next_probe_pos)) > max_dist)) \
+ return; \
+ probe_pos = next_probe_pos & TDEFL_LZ_DICT_SIZE_MASK; \
+ if ((d->m_dict[probe_pos + match_len] == c0) && (d->m_dict[probe_pos + match_len - 1] == c1)) \
+ break;
+ TDEFL_PROBE;
+ TDEFL_PROBE;
+ TDEFL_PROBE;
+ }
+ if (!dist)
+ break;
+ p = s;
+ q = d->m_dict + probe_pos;
+ for (probe_len = 0; probe_len < max_match_len; probe_len++)
+ if (*p++ != *q++)
+ break;
+ if (probe_len > match_len)
+ {
+ *pMatch_dist = dist;
+ if ((*pMatch_len = match_len = probe_len) == max_match_len)
+ return;
+ c0 = d->m_dict[pos + match_len];
+ c1 = d->m_dict[pos + match_len - 1];
+ }
+ }
+}
+#endif /* #if MINIZ_USE_UNALIGNED_LOADS_AND_STORES */
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+#ifdef MINIZ_UNALIGNED_USE_MEMCPY
+static mz_uint32 TDEFL_READ_UNALIGNED_WORD32(const mz_uint8* p)
+{
+ mz_uint32 ret;
+ memcpy(&ret, p, sizeof(mz_uint32));
+ return ret;
+}
+#else
+#define TDEFL_READ_UNALIGNED_WORD32(p) *(const mz_uint32 *)(p)
+#endif
+static mz_bool tdefl_compress_fast(tdefl_compressor *d)
+{
+ /* Faster, minimally featured LZRW1-style match+parse loop with better register utilization. Intended for applications where raw throughput is valued more highly than ratio. */
+ mz_uint lookahead_pos = d->m_lookahead_pos, lookahead_size = d->m_lookahead_size, dict_size = d->m_dict_size, total_lz_bytes = d->m_total_lz_bytes, num_flags_left = d->m_num_flags_left;
+ mz_uint8 *pLZ_code_buf = d->m_pLZ_code_buf, *pLZ_flags = d->m_pLZ_flags;
+ mz_uint cur_pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK;
+
+ while ((d->m_src_buf_left) || ((d->m_flush) && (lookahead_size)))
+ {
+ const mz_uint TDEFL_COMP_FAST_LOOKAHEAD_SIZE = 4096;
+ mz_uint dst_pos = (lookahead_pos + lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK;
+ mz_uint num_bytes_to_process = (mz_uint)MZ_MIN(d->m_src_buf_left, TDEFL_COMP_FAST_LOOKAHEAD_SIZE - lookahead_size);
+ d->m_src_buf_left -= num_bytes_to_process;
+ lookahead_size += num_bytes_to_process;
+
+ while (num_bytes_to_process)
+ {
+ mz_uint32 n = MZ_MIN(TDEFL_LZ_DICT_SIZE - dst_pos, num_bytes_to_process);
+ memcpy(d->m_dict + dst_pos, d->m_pSrc, n);
+ if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1))
+ memcpy(d->m_dict + TDEFL_LZ_DICT_SIZE + dst_pos, d->m_pSrc, MZ_MIN(n, (TDEFL_MAX_MATCH_LEN - 1) - dst_pos));
+ d->m_pSrc += n;
+ dst_pos = (dst_pos + n) & TDEFL_LZ_DICT_SIZE_MASK;
+ num_bytes_to_process -= n;
+ }
+
+ dict_size = MZ_MIN(TDEFL_LZ_DICT_SIZE - lookahead_size, dict_size);
+ if ((!d->m_flush) && (lookahead_size < TDEFL_COMP_FAST_LOOKAHEAD_SIZE))
+ break;
+
+ while (lookahead_size >= 4)
+ {
+ mz_uint cur_match_dist, cur_match_len = 1;
+ mz_uint8 *pCur_dict = d->m_dict + cur_pos;
+ mz_uint first_trigram = TDEFL_READ_UNALIGNED_WORD32(pCur_dict) & 0xFFFFFF;
+ mz_uint hash = (first_trigram ^ (first_trigram >> (24 - (TDEFL_LZ_HASH_BITS - 8)))) & TDEFL_LEVEL1_HASH_SIZE_MASK;
+ mz_uint probe_pos = d->m_hash[hash];
+ d->m_hash[hash] = (mz_uint16)lookahead_pos;
+
+ if (((cur_match_dist = (mz_uint16)(lookahead_pos - probe_pos)) <= dict_size) && ((TDEFL_READ_UNALIGNED_WORD32(d->m_dict + (probe_pos &= TDEFL_LZ_DICT_SIZE_MASK)) & 0xFFFFFF) == first_trigram))
+ {
+ const mz_uint16 *p = (const mz_uint16 *)pCur_dict;
+ const mz_uint16 *q = (const mz_uint16 *)(d->m_dict + probe_pos);
+ mz_uint32 probe_len = 32;
+ do
+ {
+ } while ((TDEFL_READ_UNALIGNED_WORD2(++p) == TDEFL_READ_UNALIGNED_WORD2(++q)) && (TDEFL_READ_UNALIGNED_WORD2(++p) == TDEFL_READ_UNALIGNED_WORD2(++q)) &&
+ (TDEFL_READ_UNALIGNED_WORD2(++p) == TDEFL_READ_UNALIGNED_WORD2(++q)) && (TDEFL_READ_UNALIGNED_WORD2(++p) == TDEFL_READ_UNALIGNED_WORD2(++q)) && (--probe_len > 0));
+ cur_match_len = ((mz_uint)(p - (const mz_uint16 *)pCur_dict) * 2) + (mz_uint)(*(const mz_uint8 *)p == *(const mz_uint8 *)q);
+ if (!probe_len)
+ cur_match_len = cur_match_dist ? TDEFL_MAX_MATCH_LEN : 0;
+
+ if ((cur_match_len < TDEFL_MIN_MATCH_LEN) || ((cur_match_len == TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 8U * 1024U)))
+ {
+ cur_match_len = 1;
+ *pLZ_code_buf++ = (mz_uint8)first_trigram;
+ *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+ d->m_huff_count[0][(mz_uint8)first_trigram]++;
+ }
+ else
+ {
+ mz_uint32 s0, s1;
+ cur_match_len = MZ_MIN(cur_match_len, lookahead_size);
+
+ MZ_ASSERT((cur_match_len >= TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 1) && (cur_match_dist <= TDEFL_LZ_DICT_SIZE));
+
+ cur_match_dist--;
+
+ pLZ_code_buf[0] = (mz_uint8)(cur_match_len - TDEFL_MIN_MATCH_LEN);
+#ifdef MINIZ_UNALIGNED_USE_MEMCPY
+ memcpy(&pLZ_code_buf[1], &cur_match_dist, sizeof(cur_match_dist));
+#else
+ *(mz_uint16 *)(&pLZ_code_buf[1]) = (mz_uint16)cur_match_dist;
+#endif
+ pLZ_code_buf += 3;
+ *pLZ_flags = (mz_uint8)((*pLZ_flags >> 1) | 0x80);
+
+ s0 = s_tdefl_small_dist_sym[cur_match_dist & 511];
+ s1 = s_tdefl_large_dist_sym[cur_match_dist >> 8];
+ d->m_huff_count[1][(cur_match_dist < 512) ? s0 : s1]++;
+
+ d->m_huff_count[0][s_tdefl_len_sym[cur_match_len - TDEFL_MIN_MATCH_LEN]]++;
+ }
+ }
+ else
+ {
+ *pLZ_code_buf++ = (mz_uint8)first_trigram;
+ *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+ d->m_huff_count[0][(mz_uint8)first_trigram]++;
+ }
+
+ if (--num_flags_left == 0)
+ {
+ num_flags_left = 8;
+ pLZ_flags = pLZ_code_buf++;
+ }
+
+ total_lz_bytes += cur_match_len;
+ lookahead_pos += cur_match_len;
+ dict_size = MZ_MIN(dict_size + cur_match_len, (mz_uint)TDEFL_LZ_DICT_SIZE);
+ cur_pos = (cur_pos + cur_match_len) & TDEFL_LZ_DICT_SIZE_MASK;
+ MZ_ASSERT(lookahead_size >= cur_match_len);
+ lookahead_size -= cur_match_len;
+
+ if (pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8])
+ {
+ int n;
+ d->m_lookahead_pos = lookahead_pos;
+ d->m_lookahead_size = lookahead_size;
+ d->m_dict_size = dict_size;
+ d->m_total_lz_bytes = total_lz_bytes;
+ d->m_pLZ_code_buf = pLZ_code_buf;
+ d->m_pLZ_flags = pLZ_flags;
+ d->m_num_flags_left = num_flags_left;
+ if ((n = tdefl_flush_block(d, 0)) != 0)
+ return (n < 0) ? MZ_FALSE : MZ_TRUE;
+ total_lz_bytes = d->m_total_lz_bytes;
+ pLZ_code_buf = d->m_pLZ_code_buf;
+ pLZ_flags = d->m_pLZ_flags;
+ num_flags_left = d->m_num_flags_left;
+ }
+ }
+
+ while (lookahead_size)
+ {
+ mz_uint8 lit = d->m_dict[cur_pos];
+
+ total_lz_bytes++;
+ *pLZ_code_buf++ = lit;
+ *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+ if (--num_flags_left == 0)
+ {
+ num_flags_left = 8;
+ pLZ_flags = pLZ_code_buf++;
+ }
+
+ d->m_huff_count[0][lit]++;
+
+ lookahead_pos++;
+ dict_size = MZ_MIN(dict_size + 1, (mz_uint)TDEFL_LZ_DICT_SIZE);
+ cur_pos = (cur_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK;
+ lookahead_size--;
+
+ if (pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8])
+ {
+ int n;
+ d->m_lookahead_pos = lookahead_pos;
+ d->m_lookahead_size = lookahead_size;
+ d->m_dict_size = dict_size;
+ d->m_total_lz_bytes = total_lz_bytes;
+ d->m_pLZ_code_buf = pLZ_code_buf;
+ d->m_pLZ_flags = pLZ_flags;
+ d->m_num_flags_left = num_flags_left;
+ if ((n = tdefl_flush_block(d, 0)) != 0)
+ return (n < 0) ? MZ_FALSE : MZ_TRUE;
+ total_lz_bytes = d->m_total_lz_bytes;
+ pLZ_code_buf = d->m_pLZ_code_buf;
+ pLZ_flags = d->m_pLZ_flags;
+ num_flags_left = d->m_num_flags_left;
+ }
+ }
+ }
+
+ d->m_lookahead_pos = lookahead_pos;
+ d->m_lookahead_size = lookahead_size;
+ d->m_dict_size = dict_size;
+ d->m_total_lz_bytes = total_lz_bytes;
+ d->m_pLZ_code_buf = pLZ_code_buf;
+ d->m_pLZ_flags = pLZ_flags;
+ d->m_num_flags_left = num_flags_left;
+ return MZ_TRUE;
+}
+#endif /* MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN */
+
+static MZ_FORCEINLINE void tdefl_record_literal(tdefl_compressor *d, mz_uint8 lit)
+{
+ d->m_total_lz_bytes++;
+ *d->m_pLZ_code_buf++ = lit;
+ *d->m_pLZ_flags = (mz_uint8)(*d->m_pLZ_flags >> 1);
+ if (--d->m_num_flags_left == 0)
+ {
+ d->m_num_flags_left = 8;
+ d->m_pLZ_flags = d->m_pLZ_code_buf++;
+ }
+ d->m_huff_count[0][lit]++;
+}
+
+static MZ_FORCEINLINE void tdefl_record_match(tdefl_compressor *d, mz_uint match_len, mz_uint match_dist)
+{
+ mz_uint32 s0, s1;
+
+ MZ_ASSERT((match_len >= TDEFL_MIN_MATCH_LEN) && (match_dist >= 1) && (match_dist <= TDEFL_LZ_DICT_SIZE));
+
+ d->m_total_lz_bytes += match_len;
+
+ d->m_pLZ_code_buf[0] = (mz_uint8)(match_len - TDEFL_MIN_MATCH_LEN);
+
+ match_dist -= 1;
+ d->m_pLZ_code_buf[1] = (mz_uint8)(match_dist & 0xFF);
+ d->m_pLZ_code_buf[2] = (mz_uint8)(match_dist >> 8);
+ d->m_pLZ_code_buf += 3;
+
+ *d->m_pLZ_flags = (mz_uint8)((*d->m_pLZ_flags >> 1) | 0x80);
+ if (--d->m_num_flags_left == 0)
+ {
+ d->m_num_flags_left = 8;
+ d->m_pLZ_flags = d->m_pLZ_code_buf++;
+ }
+
+ s0 = s_tdefl_small_dist_sym[match_dist & 511];
+ s1 = s_tdefl_large_dist_sym[(match_dist >> 8) & 127];
+ d->m_huff_count[1][(match_dist < 512) ? s0 : s1]++;
+ d->m_huff_count[0][s_tdefl_len_sym[match_len - TDEFL_MIN_MATCH_LEN]]++;
+}
+
+static mz_bool tdefl_compress_normal(tdefl_compressor *d)
+{
+ const mz_uint8 *pSrc = d->m_pSrc;
+ size_t src_buf_left = d->m_src_buf_left;
+ tdefl_flush flush = d->m_flush;
+
+ while ((src_buf_left) || ((flush) && (d->m_lookahead_size)))
+ {
+ mz_uint len_to_move, cur_match_dist, cur_match_len, cur_pos;
+ /* Update dictionary and hash chains. Keeps the lookahead size equal to TDEFL_MAX_MATCH_LEN. */
+ if ((d->m_lookahead_size + d->m_dict_size) >= (TDEFL_MIN_MATCH_LEN - 1))
+ {
+ mz_uint dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK, ins_pos = d->m_lookahead_pos + d->m_lookahead_size - 2;
+ mz_uint hash = (d->m_dict[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] << TDEFL_LZ_HASH_SHIFT) ^ d->m_dict[(ins_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK];
+ mz_uint num_bytes_to_process = (mz_uint)MZ_MIN(src_buf_left, TDEFL_MAX_MATCH_LEN - d->m_lookahead_size);
+ const mz_uint8 *pSrc_end = pSrc + num_bytes_to_process;
+ src_buf_left -= num_bytes_to_process;
+ d->m_lookahead_size += num_bytes_to_process;
+ while (pSrc != pSrc_end)
+ {
+ mz_uint8 c = *pSrc++;
+ d->m_dict[dst_pos] = c;
+ if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1))
+ d->m_dict[TDEFL_LZ_DICT_SIZE + dst_pos] = c;
+ hash = ((hash << TDEFL_LZ_HASH_SHIFT) ^ c) & (TDEFL_LZ_HASH_SIZE - 1);
+ d->m_next[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] = d->m_hash[hash];
+ d->m_hash[hash] = (mz_uint16)(ins_pos);
+ dst_pos = (dst_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK;
+ ins_pos++;
+ }
+ }
+ else
+ {
+ while ((src_buf_left) && (d->m_lookahead_size < TDEFL_MAX_MATCH_LEN))
+ {
+ mz_uint8 c = *pSrc++;
+ mz_uint dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK;
+ src_buf_left--;
+ d->m_dict[dst_pos] = c;
+ if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1))
+ d->m_dict[TDEFL_LZ_DICT_SIZE + dst_pos] = c;
+ if ((++d->m_lookahead_size + d->m_dict_size) >= TDEFL_MIN_MATCH_LEN)
+ {
+ mz_uint ins_pos = d->m_lookahead_pos + (d->m_lookahead_size - 1) - 2;
+ mz_uint hash = ((d->m_dict[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] << (TDEFL_LZ_HASH_SHIFT * 2)) ^ (d->m_dict[(ins_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK] << TDEFL_LZ_HASH_SHIFT) ^ c) & (TDEFL_LZ_HASH_SIZE - 1);
+ d->m_next[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] = d->m_hash[hash];
+ d->m_hash[hash] = (mz_uint16)(ins_pos);
+ }
+ }
+ }
+ d->m_dict_size = MZ_MIN(TDEFL_LZ_DICT_SIZE - d->m_lookahead_size, d->m_dict_size);
+ if ((!flush) && (d->m_lookahead_size < TDEFL_MAX_MATCH_LEN))
+ break;
+
+ /* Simple lazy/greedy parsing state machine. */
+ len_to_move = 1;
+ cur_match_dist = 0;
+ cur_match_len = d->m_saved_match_len ? d->m_saved_match_len : (TDEFL_MIN_MATCH_LEN - 1);
+ cur_pos = d->m_lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK;
+ if (d->m_flags & (TDEFL_RLE_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS))
+ {
+ if ((d->m_dict_size) && (!(d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS)))
+ {
+ mz_uint8 c = d->m_dict[(cur_pos - 1) & TDEFL_LZ_DICT_SIZE_MASK];
+ cur_match_len = 0;
+ while (cur_match_len < d->m_lookahead_size)
+ {
+ if (d->m_dict[cur_pos + cur_match_len] != c)
+ break;
+ cur_match_len++;
+ }
+ if (cur_match_len < TDEFL_MIN_MATCH_LEN)
+ cur_match_len = 0;
+ else
+ cur_match_dist = 1;
+ }
+ }
+ else
+ {
+ tdefl_find_match(d, d->m_lookahead_pos, d->m_dict_size, d->m_lookahead_size, &cur_match_dist, &cur_match_len);
+ }
+ if (((cur_match_len == TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 8U * 1024U)) || (cur_pos == cur_match_dist) || ((d->m_flags & TDEFL_FILTER_MATCHES) && (cur_match_len <= 5)))
+ {
+ cur_match_dist = cur_match_len = 0;
+ }
+ if (d->m_saved_match_len)
+ {
+ if (cur_match_len > d->m_saved_match_len)
+ {
+ tdefl_record_literal(d, (mz_uint8)d->m_saved_lit);
+ if (cur_match_len >= 128)
+ {
+ tdefl_record_match(d, cur_match_len, cur_match_dist);
+ d->m_saved_match_len = 0;
+ len_to_move = cur_match_len;
+ }
+ else
+ {
+ d->m_saved_lit = d->m_dict[cur_pos];
+ d->m_saved_match_dist = cur_match_dist;
+ d->m_saved_match_len = cur_match_len;
+ }
+ }
+ else
+ {
+ tdefl_record_match(d, d->m_saved_match_len, d->m_saved_match_dist);
+ len_to_move = d->m_saved_match_len - 1;
+ d->m_saved_match_len = 0;
+ }
+ }
+ else if (!cur_match_dist)
+ tdefl_record_literal(d, d->m_dict[MZ_MIN(cur_pos, sizeof(d->m_dict) - 1)]);
+ else if ((d->m_greedy_parsing) || (d->m_flags & TDEFL_RLE_MATCHES) || (cur_match_len >= 128))
+ {
+ tdefl_record_match(d, cur_match_len, cur_match_dist);
+ len_to_move = cur_match_len;
+ }
+ else
+ {
+ d->m_saved_lit = d->m_dict[MZ_MIN(cur_pos, sizeof(d->m_dict) - 1)];
+ d->m_saved_match_dist = cur_match_dist;
+ d->m_saved_match_len = cur_match_len;
+ }
+ /* Move the lookahead forward by len_to_move bytes. */
+ d->m_lookahead_pos += len_to_move;
+ MZ_ASSERT(d->m_lookahead_size >= len_to_move);
+ d->m_lookahead_size -= len_to_move;
+ d->m_dict_size = MZ_MIN(d->m_dict_size + len_to_move, (mz_uint)TDEFL_LZ_DICT_SIZE);
+ /* Check if it's time to flush the current LZ codes to the internal output buffer. */
+ if ((d->m_pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) ||
+ ((d->m_total_lz_bytes > 31 * 1024) && (((((mz_uint)(d->m_pLZ_code_buf - d->m_lz_code_buf) * 115) >> 7) >= d->m_total_lz_bytes) || (d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS))))
+ {
+ int n;
+ d->m_pSrc = pSrc;
+ d->m_src_buf_left = src_buf_left;
+ if ((n = tdefl_flush_block(d, 0)) != 0)
+ return (n < 0) ? MZ_FALSE : MZ_TRUE;
+ }
+ }
+
+ d->m_pSrc = pSrc;
+ d->m_src_buf_left = src_buf_left;
+ return MZ_TRUE;
+}
+
+static tdefl_status tdefl_flush_output_buffer(tdefl_compressor *d)
+{
+ if (d->m_pIn_buf_size)
+ {
+ *d->m_pIn_buf_size = d->m_pSrc - (const mz_uint8 *)d->m_pIn_buf;
+ }
+
+ if (d->m_pOut_buf_size)
+ {
+ size_t n = MZ_MIN(*d->m_pOut_buf_size - d->m_out_buf_ofs, d->m_output_flush_remaining);
+ memcpy((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs, d->m_output_buf + d->m_output_flush_ofs, n);
+ d->m_output_flush_ofs += (mz_uint)n;
+ d->m_output_flush_remaining -= (mz_uint)n;
+ d->m_out_buf_ofs += n;
+
+ *d->m_pOut_buf_size = d->m_out_buf_ofs;
+ }
+
+ return (d->m_finished && !d->m_output_flush_remaining) ? TDEFL_STATUS_DONE : TDEFL_STATUS_OKAY;
+}
+
+tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf, size_t *pIn_buf_size, void *pOut_buf, size_t *pOut_buf_size, tdefl_flush flush)
+{
+ if (!d)
+ {
+ if (pIn_buf_size)
+ *pIn_buf_size = 0;
+ if (pOut_buf_size)
+ *pOut_buf_size = 0;
+ return TDEFL_STATUS_BAD_PARAM;
+ }
+
+ d->m_pIn_buf = pIn_buf;
+ d->m_pIn_buf_size = pIn_buf_size;
+ d->m_pOut_buf = pOut_buf;
+ d->m_pOut_buf_size = pOut_buf_size;
+ d->m_pSrc = (const mz_uint8 *)(pIn_buf);
+ d->m_src_buf_left = pIn_buf_size ? *pIn_buf_size : 0;
+ d->m_out_buf_ofs = 0;
+ d->m_flush = flush;
+
+ if (((d->m_pPut_buf_func != NULL) == ((pOut_buf != NULL) || (pOut_buf_size != NULL))) || (d->m_prev_return_status != TDEFL_STATUS_OKAY) ||
+ (d->m_wants_to_finish && (flush != TDEFL_FINISH)) || (pIn_buf_size && *pIn_buf_size && !pIn_buf) || (pOut_buf_size && *pOut_buf_size && !pOut_buf))
+ {
+ if (pIn_buf_size)
+ *pIn_buf_size = 0;
+ if (pOut_buf_size)
+ *pOut_buf_size = 0;
+ return (d->m_prev_return_status = TDEFL_STATUS_BAD_PARAM);
+ }
+ d->m_wants_to_finish |= (flush == TDEFL_FINISH);
+
+ if ((d->m_output_flush_remaining) || (d->m_finished))
+ return (d->m_prev_return_status = tdefl_flush_output_buffer(d));
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+ if (((d->m_flags & TDEFL_MAX_PROBES_MASK) == 1) &&
+ ((d->m_flags & TDEFL_GREEDY_PARSING_FLAG) != 0) &&
+ ((d->m_flags & (TDEFL_FILTER_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS | TDEFL_RLE_MATCHES)) == 0))
+ {
+ if (!tdefl_compress_fast(d))
+ return d->m_prev_return_status;
+ }
+ else
+#endif /* #if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN */
+ {
+ if (!tdefl_compress_normal(d))
+ return d->m_prev_return_status;
+ }
+
+ if ((d->m_flags & (TDEFL_WRITE_ZLIB_HEADER | TDEFL_COMPUTE_ADLER32)) && (pIn_buf))
+ d->m_adler32 = (mz_uint32)mz_adler32(d->m_adler32, (const mz_uint8 *)pIn_buf, d->m_pSrc - (const mz_uint8 *)pIn_buf);
+
+ if ((flush) && (!d->m_lookahead_size) && (!d->m_src_buf_left) && (!d->m_output_flush_remaining))
+ {
+ if (tdefl_flush_block(d, flush) < 0)
+ return d->m_prev_return_status;
+ d->m_finished = (flush == TDEFL_FINISH);
+ if (flush == TDEFL_FULL_FLUSH)
+ {
+ MZ_CLEAR_OBJ(d->m_hash);
+ MZ_CLEAR_OBJ(d->m_next);
+ d->m_dict_size = 0;
+ }
+ }
+
+ return (d->m_prev_return_status = tdefl_flush_output_buffer(d));
+}
+
+tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf, size_t in_buf_size, tdefl_flush flush)
+{
+ MZ_ASSERT(d->m_pPut_buf_func);
+ return tdefl_compress(d, pIn_buf, &in_buf_size, NULL, NULL, flush);
+}
+
+tdefl_status tdefl_init(tdefl_compressor *d, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags)
+{
+ d->m_pPut_buf_func = pPut_buf_func;
+ d->m_pPut_buf_user = pPut_buf_user;
+ d->m_flags = (mz_uint)(flags);
+ d->m_max_probes[0] = 1 + ((flags & 0xFFF) + 2) / 3;
+ d->m_greedy_parsing = (flags & TDEFL_GREEDY_PARSING_FLAG) != 0;
+ d->m_max_probes[1] = 1 + (((flags & 0xFFF) >> 2) + 2) / 3;
+ if (!(flags & TDEFL_NONDETERMINISTIC_PARSING_FLAG))
+ MZ_CLEAR_OBJ(d->m_hash);
+ d->m_lookahead_pos = d->m_lookahead_size = d->m_dict_size = d->m_total_lz_bytes = d->m_lz_code_buf_dict_pos = d->m_bits_in = 0;
+ d->m_output_flush_ofs = d->m_output_flush_remaining = d->m_finished = d->m_block_index = d->m_bit_buffer = d->m_wants_to_finish = 0;
+ d->m_pLZ_code_buf = d->m_lz_code_buf + 1;
+ d->m_pLZ_flags = d->m_lz_code_buf;
+ *d->m_pLZ_flags = 0;
+ d->m_num_flags_left = 8;
+ d->m_pOutput_buf = d->m_output_buf;
+ d->m_pOutput_buf_end = d->m_output_buf;
+ d->m_prev_return_status = TDEFL_STATUS_OKAY;
+ d->m_saved_match_dist = d->m_saved_match_len = d->m_saved_lit = 0;
+ d->m_adler32 = 1;
+ d->m_pIn_buf = NULL;
+ d->m_pOut_buf = NULL;
+ d->m_pIn_buf_size = NULL;
+ d->m_pOut_buf_size = NULL;
+ d->m_flush = TDEFL_NO_FLUSH;
+ d->m_pSrc = NULL;
+ d->m_src_buf_left = 0;
+ d->m_out_buf_ofs = 0;
+ if (!(flags & TDEFL_NONDETERMINISTIC_PARSING_FLAG))
+ MZ_CLEAR_OBJ(d->m_dict);
+ memset(&d->m_huff_count[0][0], 0, sizeof(d->m_huff_count[0][0]) * TDEFL_MAX_HUFF_SYMBOLS_0);
+ memset(&d->m_huff_count[1][0], 0, sizeof(d->m_huff_count[1][0]) * TDEFL_MAX_HUFF_SYMBOLS_1);
+ return TDEFL_STATUS_OKAY;
+}
+
+tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d)
+{
+ return d->m_prev_return_status;
+}
+
+mz_uint32 tdefl_get_adler32(tdefl_compressor *d)
+{
+ return d->m_adler32;
+}
+
+mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags)
+{
+ tdefl_compressor *pComp;
+ mz_bool succeeded;
+ if (((buf_len) && (!pBuf)) || (!pPut_buf_func))
+ return MZ_FALSE;
+ pComp = (tdefl_compressor *)MZ_MALLOC(sizeof(tdefl_compressor));
+ if (!pComp)
+ return MZ_FALSE;
+ succeeded = (tdefl_init(pComp, pPut_buf_func, pPut_buf_user, flags) == TDEFL_STATUS_OKAY);
+ succeeded = succeeded && (tdefl_compress_buffer(pComp, pBuf, buf_len, TDEFL_FINISH) == TDEFL_STATUS_DONE);
+ MZ_FREE(pComp);
+ return succeeded;
+}
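+
+/* Usage sketch (annotation, not part of miniz): tdefl_compress_mem_to_output() streams the
+   compressed data through the put-buf callback, so the full output never has to fit in memory.
+   The callback and helper names below are hypothetical; TDEFL_DEFAULT_MAX_PROBES roughly
+   matches compression level 6. Kept inside '#if 0' so it never compiles into the library. */
+#if 0
+#include <stdio.h>
+
+/* Hypothetical callback: return MZ_FALSE (abort compression) on a short write. */
+static mz_bool example_stdio_putter(const void *pBuf, int len, void *pUser)
+{
+    return fwrite(pBuf, 1, (size_t)len, (FILE *)pUser) == (size_t)len;
+}
+
+static mz_bool example_compress_to_file(const void *pData, size_t len, FILE *pOut)
+{
+    return tdefl_compress_mem_to_output(pData, len, example_stdio_putter, pOut,
+        TDEFL_DEFAULT_MAX_PROBES | TDEFL_WRITE_ZLIB_HEADER);
+}
+#endif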
+
+typedef struct
+{
+ size_t m_size, m_capacity;
+ mz_uint8 *m_pBuf;
+ mz_bool m_expandable;
+} tdefl_output_buffer;
+
+static mz_bool tdefl_output_buffer_putter(const void *pBuf, int len, void *pUser)
+{
+ tdefl_output_buffer *p = (tdefl_output_buffer *)pUser;
+ size_t new_size = p->m_size + len;
+ if (new_size > p->m_capacity)
+ {
+ size_t new_capacity = p->m_capacity;
+ mz_uint8 *pNew_buf;
+ if (!p->m_expandable)
+ return MZ_FALSE;
+ do
+ {
+ new_capacity = MZ_MAX(128U, new_capacity << 1U);
+ } while (new_size > new_capacity);
+ pNew_buf = (mz_uint8 *)MZ_REALLOC(p->m_pBuf, new_capacity);
+ if (!pNew_buf)
+ return MZ_FALSE;
+ p->m_pBuf = pNew_buf;
+ p->m_capacity = new_capacity;
+ }
+ memcpy((mz_uint8 *)p->m_pBuf + p->m_size, pBuf, len);
+ p->m_size = new_size;
+ return MZ_TRUE;
+}
+
+void *tdefl_compress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags)
+{
+ tdefl_output_buffer out_buf;
+ MZ_CLEAR_OBJ(out_buf);
+ if (!pOut_len)
+ return MZ_FALSE;
+ else
+ *pOut_len = 0;
+ out_buf.m_expandable = MZ_TRUE;
+ if (!tdefl_compress_mem_to_output(pSrc_buf, src_buf_len, tdefl_output_buffer_putter, &out_buf, flags))
+ return NULL;
+ *pOut_len = out_buf.m_size;
+ return out_buf.m_pBuf;
+}
+
+size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags)
+{
+ tdefl_output_buffer out_buf;
+ MZ_CLEAR_OBJ(out_buf);
+ if (!pOut_buf)
+ return 0;
+ out_buf.m_pBuf = (mz_uint8 *)pOut_buf;
+ out_buf.m_capacity = out_buf_len;
+ if (!tdefl_compress_mem_to_output(pSrc_buf, src_buf_len, tdefl_output_buffer_putter, &out_buf, flags))
+ return 0;
+ return out_buf.m_size;
+}
+
+static const mz_uint s_tdefl_num_probes[11] = { 0, 1, 6, 32, 16, 32, 128, 256, 512, 768, 1500 };
+
+/* level may actually range from [0,10] (10 is a "hidden" max level, where we want a bit more compression and it's fine if throughput falls off a cliff on some files). */
+mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits, int strategy)
+{
+ mz_uint comp_flags = s_tdefl_num_probes[(level >= 0) ? MZ_MIN(10, level) : MZ_DEFAULT_LEVEL] | ((level <= 3) ? TDEFL_GREEDY_PARSING_FLAG : 0);
+ if (window_bits > 0)
+ comp_flags |= TDEFL_WRITE_ZLIB_HEADER;
+
+ if (!level)
+ comp_flags |= TDEFL_FORCE_ALL_RAW_BLOCKS;
+ else if (strategy == MZ_FILTERED)
+ comp_flags |= TDEFL_FILTER_MATCHES;
+ else if (strategy == MZ_HUFFMAN_ONLY)
+ comp_flags &= ~TDEFL_MAX_PROBES_MASK;
+ else if (strategy == MZ_FIXED)
+ comp_flags |= TDEFL_FORCE_ALL_STATIC_BLOCKS;
+ else if (strategy == MZ_RLE)
+ comp_flags |= TDEFL_RLE_MATCHES;
+
+ return comp_flags;
+}
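+
+/* Sketch (annotation, not part of miniz): the typical pairing of the flag builder above with
+   the one-shot heap compressor. 'example_compress_level' is a hypothetical name;
+   MZ_DEFAULT_WINDOW_BITS and MZ_DEFAULT_STRATEGY come from the zlib-style API and may be
+   unavailable if those APIs are #defined out. */
+#if 0
+static void *example_compress_level(const void *pSrc, size_t src_len, size_t *pOut_len, int level)
+{
+    mz_uint comp_flags = tdefl_create_comp_flags_from_zip_params(level, MZ_DEFAULT_WINDOW_BITS, MZ_DEFAULT_STRATEGY);
+    return tdefl_compress_mem_to_heap(pSrc, src_len, pOut_len, (int)comp_flags); /* caller frees with MZ_FREE() */
+}
+#endif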
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4204) /* nonstandard extension used : non-constant aggregate initializer (also supported by GNU C and C99, so no big deal) */
+#endif
+
+/* Simple PNG writer function by Alex Evans, 2011. Released into the public domain: https://gist.github.com/908299, more context at
+ http://altdevblogaday.org/2011/04/06/a-smaller-jpg-encoder/.
+ This is actually a modification of Alex's original code so PNG files generated by this function pass pngcheck. */
+void *tdefl_write_image_to_png_file_in_memory_ex(const void *pImage, int w, int h, int num_chans, size_t *pLen_out, mz_uint level, mz_bool flip)
+{
+ /* Using a local copy of this array here in case MINIZ_NO_ZLIB_APIS was defined. */
+ static const mz_uint s_tdefl_png_num_probes[11] = { 0, 1, 6, 32, 16, 32, 128, 256, 512, 768, 1500 };
+ tdefl_compressor *pComp = (tdefl_compressor *)MZ_MALLOC(sizeof(tdefl_compressor));
+ tdefl_output_buffer out_buf;
+ int i, bpl = w * num_chans, y, z;
+ mz_uint32 c;
+ *pLen_out = 0;
+ if (!pComp)
+ return NULL;
+ MZ_CLEAR_OBJ(out_buf);
+ out_buf.m_expandable = MZ_TRUE;
+ out_buf.m_capacity = 57 + MZ_MAX(64, (1 + bpl) * h);
+ if (NULL == (out_buf.m_pBuf = (mz_uint8 *)MZ_MALLOC(out_buf.m_capacity)))
+ {
+ MZ_FREE(pComp);
+ return NULL;
+ }
+ /* write dummy header */
+ for (z = 41; z; --z)
+ tdefl_output_buffer_putter(&z, 1, &out_buf);
+ /* compress image data */
+ tdefl_init(pComp, tdefl_output_buffer_putter, &out_buf, s_tdefl_png_num_probes[MZ_MIN(10, level)] | TDEFL_WRITE_ZLIB_HEADER);
+ for (y = 0; y < h; ++y)
+ {
+ tdefl_compress_buffer(pComp, &z, 1, TDEFL_NO_FLUSH);
+ tdefl_compress_buffer(pComp, (mz_uint8 *)pImage + (flip ? (h - 1 - y) : y) * bpl, bpl, TDEFL_NO_FLUSH);
+ }
+ if (tdefl_compress_buffer(pComp, NULL, 0, TDEFL_FINISH) != TDEFL_STATUS_DONE)
+ {
+ MZ_FREE(pComp);
+ MZ_FREE(out_buf.m_pBuf);
+ return NULL;
+ }
+ /* write real header */
+ *pLen_out = out_buf.m_size - 41;
+ {
+ static const mz_uint8 chans[] = { 0x00, 0x00, 0x04, 0x02, 0x06 };
+ mz_uint8 pnghdr[41] = { 0x89, 0x50, 0x4e, 0x47, 0x0d,
+ 0x0a, 0x1a, 0x0a, 0x00, 0x00,
+ 0x00, 0x0d, 0x49, 0x48, 0x44,
+ 0x52, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x08,
+ 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x49, 0x44, 0x41,
+ 0x54 };
+ pnghdr[18] = (mz_uint8)(w >> 8);
+ pnghdr[19] = (mz_uint8)w;
+ pnghdr[22] = (mz_uint8)(h >> 8);
+ pnghdr[23] = (mz_uint8)h;
+ pnghdr[25] = chans[num_chans];
+ pnghdr[33] = (mz_uint8)(*pLen_out >> 24);
+ pnghdr[34] = (mz_uint8)(*pLen_out >> 16);
+ pnghdr[35] = (mz_uint8)(*pLen_out >> 8);
+ pnghdr[36] = (mz_uint8)*pLen_out;
+ c = (mz_uint32)mz_crc32(MZ_CRC32_INIT, pnghdr + 12, 17);
+ for (i = 0; i < 4; ++i, c <<= 8)
+ ((mz_uint8 *)(pnghdr + 29))[i] = (mz_uint8)(c >> 24);
+ memcpy(out_buf.m_pBuf, pnghdr, 41);
+ }
+ /* write footer (IDAT CRC-32, followed by IEND chunk) */
+ if (!tdefl_output_buffer_putter("\0\0\0\0\0\0\0\0\x49\x45\x4e\x44\xae\x42\x60\x82", 16, &out_buf))
+ {
+ *pLen_out = 0;
+ MZ_FREE(pComp);
+ MZ_FREE(out_buf.m_pBuf);
+ return NULL;
+ }
+ c = (mz_uint32)mz_crc32(MZ_CRC32_INIT, out_buf.m_pBuf + 41 - 4, *pLen_out + 4);
+ for (i = 0; i < 4; ++i, c <<= 8)
+ (out_buf.m_pBuf + out_buf.m_size - 16)[i] = (mz_uint8)(c >> 24);
+ /* compute final size of file, grab compressed data buffer and return */
+ *pLen_out += 57;
+ MZ_FREE(pComp);
+ return out_buf.m_pBuf;
+}
+void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h, int num_chans, size_t *pLen_out)
+{
+ /* Level 6 corresponds to TDEFL_DEFAULT_MAX_PROBES or MZ_DEFAULT_LEVEL (but we can't depend on MZ_DEFAULT_LEVEL being available in case the zlib APIs were #defined out) */
+ return tdefl_write_image_to_png_file_in_memory_ex(pImage, w, h, num_chans, pLen_out, 6, MZ_FALSE);
+}
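+
+/* Sketch (annotation, not part of miniz): saving a tightly packed 8-bit RGB buffer
+   (num_chans == 3) to disk with the helper above. Names and error handling are illustrative. */
+#if 0
+#include <stdio.h>
+static int example_save_png(const char *pFilename, const void *pRGB, int w, int h)
+{
+    size_t png_len = 0;
+    void *pPNG = tdefl_write_image_to_png_file_in_memory(pRGB, w, h, 3, &png_len);
+    FILE *pFile = pPNG ? fopen(pFilename, "wb") : NULL;
+    int ok = (pFile != NULL) && (fwrite(pPNG, 1, png_len, pFile) == png_len);
+    if (pFile)
+        fclose(pFile);
+    MZ_FREE(pPNG);
+    return ok;
+}
+#endif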
+
+#ifndef MINIZ_NO_MALLOC
+/* Allocate the tdefl_compressor and tinfl_decompressor structures in C so that */
+/* non-C language bindings to the tdefl_ and tinfl_ APIs don't need to worry about */
+/* structure size and allocation mechanism. */
+tdefl_compressor *tdefl_compressor_alloc()
+{
+ return (tdefl_compressor *)MZ_MALLOC(sizeof(tdefl_compressor));
+}
+
+void tdefl_compressor_free(tdefl_compressor *pComp)
+{
+ MZ_FREE(pComp);
+}
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+ /**************************************************************************
+ *
+ * Copyright 2013-2014 RAD Game Tools and Valve Software
+ * Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------- Low-level Decompression (completely independent from all compression API's) */
+
+#define TINFL_MEMCPY(d, s, l) memcpy(d, s, l)
+#define TINFL_MEMSET(p, c, l) memset(p, c, l)
+
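+/* Annotation: the TINFL_CR_* macros below build a coroutine out of a plain switch statement.
+   TINFL_CR_RETURN() records a state index in r->m_state and jumps to common_exit; on the next
+   call, TINFL_CR_BEGIN's switch on r->m_state lands on the matching 'case' label and execution
+   resumes mid-function. That is how tinfl_decompress() suspends and resumes across calls
+   without threads, setjmp, or extra buffering. */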
+#define TINFL_CR_BEGIN \
+ switch (r->m_state) \
+ { \
+ case 0:
+#define TINFL_CR_RETURN(state_index, result) \
+ do \
+ { \
+ status = result; \
+ r->m_state = state_index; \
+ goto common_exit; \
+ case state_index:; \
+ } \
+ MZ_MACRO_END
+#define TINFL_CR_RETURN_FOREVER(state_index, result) \
+ do \
+ { \
+ for (;;) \
+ { \
+ TINFL_CR_RETURN(state_index, result); \
+ } \
+ } \
+ MZ_MACRO_END
+#define TINFL_CR_FINISH }
+
+#define TINFL_GET_BYTE(state_index, c) \
+ do \
+ { \
+ while (pIn_buf_cur >= pIn_buf_end) \
+ { \
+ TINFL_CR_RETURN(state_index, (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT) ? TINFL_STATUS_NEEDS_MORE_INPUT : TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS); \
+ } \
+ c = *pIn_buf_cur++; \
+ } \
+ MZ_MACRO_END
+
+#define TINFL_NEED_BITS(state_index, n) \
+ do \
+ { \
+ mz_uint c; \
+ TINFL_GET_BYTE(state_index, c); \
+ bit_buf |= (((tinfl_bit_buf_t)c) << num_bits); \
+ num_bits += 8; \
+ } while (num_bits < (mz_uint)(n))
+#define TINFL_SKIP_BITS(state_index, n) \
+ do \
+ { \
+ if (num_bits < (mz_uint)(n)) \
+ { \
+ TINFL_NEED_BITS(state_index, n); \
+ } \
+ bit_buf >>= (n); \
+ num_bits -= (n); \
+ } \
+ MZ_MACRO_END
+#define TINFL_GET_BITS(state_index, b, n) \
+ do \
+ { \
+ if (num_bits < (mz_uint)(n)) \
+ { \
+ TINFL_NEED_BITS(state_index, n); \
+ } \
+ b = bit_buf & ((1 << (n)) - 1); \
+ bit_buf >>= (n); \
+ num_bits -= (n); \
+ } \
+ MZ_MACRO_END
+
+/* TINFL_HUFF_BITBUF_FILL() is only used rarely, when the number of bytes remaining in the input buffer falls below 2. */
+/* It reads just enough bytes from the input stream that are needed to decode the next Huffman code (and absolutely no more). It works by trying to fully decode a */
+/* Huffman code by using whatever bits are currently present in the bit buffer. If this fails, it reads another byte, and tries again until it succeeds or until the */
+/* bit buffer contains >=15 bits (deflate's max. Huffman code size). */
+#define TINFL_HUFF_BITBUF_FILL(state_index, pHuff) \
+ do \
+ { \
+ temp = (pHuff)->m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]; \
+ if (temp >= 0) \
+ { \
+ code_len = temp >> 9; \
+ if ((code_len) && (num_bits >= code_len)) \
+ break; \
+ } \
+ else if (num_bits > TINFL_FAST_LOOKUP_BITS) \
+ { \
+ code_len = TINFL_FAST_LOOKUP_BITS; \
+ do \
+ { \
+ temp = (pHuff)->m_tree[~temp + ((bit_buf >> code_len++) & 1)]; \
+ } while ((temp < 0) && (num_bits >= (code_len + 1))); \
+ if (temp >= 0) \
+ break; \
+ } \
+ TINFL_GET_BYTE(state_index, c); \
+ bit_buf |= (((tinfl_bit_buf_t)c) << num_bits); \
+ num_bits += 8; \
+ } while (num_bits < 15);
+
+/* TINFL_HUFF_DECODE() decodes the next Huffman coded symbol. It's more complex than you would initially expect because the zlib API expects the decompressor to never read */
+/* beyond the final byte of the deflate stream. (In other words, when this macro wants to read another byte from the input, it REALLY needs another byte in order to fully */
+/* decode the next Huffman code.) Handling this properly is particularly important on raw deflate (non-zlib) streams, which aren't followed by a byte aligned adler-32. */
+/* The slow path is only executed at the very end of the input buffer. */
+/* v1.16: The original macro handled the case at the very end of the passed-in input buffer, but we also need to handle the case where the user passes in 1+zillion bytes */
+/* following the deflate data, where our non-conservative read-ahead path won't kick in. This is much trickier. */
+#define TINFL_HUFF_DECODE(state_index, sym, pHuff) \
+ do \
+ { \
+ int temp; \
+ mz_uint code_len, c; \
+ if (num_bits < 15) \
+ { \
+ if ((pIn_buf_end - pIn_buf_cur) < 2) \
+ { \
+ TINFL_HUFF_BITBUF_FILL(state_index, pHuff); \
+ } \
+ else \
+ { \
+ bit_buf |= (((tinfl_bit_buf_t)pIn_buf_cur[0]) << num_bits) | (((tinfl_bit_buf_t)pIn_buf_cur[1]) << (num_bits + 8)); \
+ pIn_buf_cur += 2; \
+ num_bits += 16; \
+ } \
+ } \
+ if ((temp = (pHuff)->m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0) \
+ code_len = temp >> 9, temp &= 511; \
+ else \
+ { \
+ code_len = TINFL_FAST_LOOKUP_BITS; \
+ do \
+ { \
+ temp = (pHuff)->m_tree[~temp + ((bit_buf >> code_len++) & 1)]; \
+ } while (temp < 0); \
+ } \
+ sym = temp; \
+ bit_buf >>= code_len; \
+ num_bits -= code_len; \
+ } \
+ MZ_MACRO_END
+
+tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn_buf_next, size_t *pIn_buf_size, mz_uint8 *pOut_buf_start, mz_uint8 *pOut_buf_next, size_t *pOut_buf_size, const mz_uint32 decomp_flags)
+{
+ static const int s_length_base[31] = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0 };
+ static const int s_length_extra[31] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0 };
+ static const int s_dist_base[32] = { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0 };
+ static const int s_dist_extra[32] = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13 };
+ static const mz_uint8 s_length_dezigzag[19] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
+ static const int s_min_table_sizes[3] = { 257, 1, 4 };
+
+ tinfl_status status = TINFL_STATUS_FAILED;
+ mz_uint32 num_bits, dist, counter, num_extra;
+ tinfl_bit_buf_t bit_buf;
+ const mz_uint8 *pIn_buf_cur = pIn_buf_next, *const pIn_buf_end = pIn_buf_next + *pIn_buf_size;
+ mz_uint8 *pOut_buf_cur = pOut_buf_next, *const pOut_buf_end = pOut_buf_next + *pOut_buf_size;
+ size_t out_buf_size_mask = (decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF) ? (size_t)-1 : ((pOut_buf_next - pOut_buf_start) + *pOut_buf_size) - 1, dist_from_out_buf_start;
+
+ /* Ensure the output buffer's size is a power of 2, unless the output buffer is large enough to hold the entire output file (in which case it doesn't matter). */
+ if (((out_buf_size_mask + 1) & out_buf_size_mask) || (pOut_buf_next < pOut_buf_start))
+ {
+ *pIn_buf_size = *pOut_buf_size = 0;
+ return TINFL_STATUS_BAD_PARAM;
+ }
+
+ num_bits = r->m_num_bits;
+ bit_buf = r->m_bit_buf;
+ dist = r->m_dist;
+ counter = r->m_counter;
+ num_extra = r->m_num_extra;
+ dist_from_out_buf_start = r->m_dist_from_out_buf_start;
+ TINFL_CR_BEGIN
+
+ bit_buf = num_bits = dist = counter = num_extra = r->m_zhdr0 = r->m_zhdr1 = 0;
+ r->m_z_adler32 = r->m_check_adler32 = 1;
+ if (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER)
+ {
+ TINFL_GET_BYTE(1, r->m_zhdr0);
+ TINFL_GET_BYTE(2, r->m_zhdr1);
+ counter = (((r->m_zhdr0 * 256 + r->m_zhdr1) % 31 != 0) || (r->m_zhdr1 & 32) || ((r->m_zhdr0 & 15) != 8));
+ if (!(decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF))
+ counter |= (((1U << (8U + (r->m_zhdr0 >> 4))) > 32768U) || ((out_buf_size_mask + 1) < (size_t)(1U << (8U + (r->m_zhdr0 >> 4)))));
+ if (counter)
+ {
+ TINFL_CR_RETURN_FOREVER(36, TINFL_STATUS_FAILED);
+ }
+ }
+
+ do
+ {
+ TINFL_GET_BITS(3, r->m_final, 3);
+ r->m_type = r->m_final >> 1;
+ if (r->m_type == 0)
+ {
+ TINFL_SKIP_BITS(5, num_bits & 7);
+ for (counter = 0; counter < 4; ++counter)
+ {
+ if (num_bits)
+ TINFL_GET_BITS(6, r->m_raw_header[counter], 8);
+ else
+ TINFL_GET_BYTE(7, r->m_raw_header[counter]);
+ }
+ if ((counter = (r->m_raw_header[0] | (r->m_raw_header[1] << 8))) != (mz_uint)(0xFFFF ^ (r->m_raw_header[2] | (r->m_raw_header[3] << 8))))
+ {
+ TINFL_CR_RETURN_FOREVER(39, TINFL_STATUS_FAILED);
+ }
+ while ((counter) && (num_bits))
+ {
+ TINFL_GET_BITS(51, dist, 8);
+ while (pOut_buf_cur >= pOut_buf_end)
+ {
+ TINFL_CR_RETURN(52, TINFL_STATUS_HAS_MORE_OUTPUT);
+ }
+ *pOut_buf_cur++ = (mz_uint8)dist;
+ counter--;
+ }
+ while (counter)
+ {
+ size_t n;
+ while (pOut_buf_cur >= pOut_buf_end)
+ {
+ TINFL_CR_RETURN(9, TINFL_STATUS_HAS_MORE_OUTPUT);
+ }
+ while (pIn_buf_cur >= pIn_buf_end)
+ {
+ TINFL_CR_RETURN(38, (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT) ? TINFL_STATUS_NEEDS_MORE_INPUT : TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS);
+ }
+ n = MZ_MIN(MZ_MIN((size_t)(pOut_buf_end - pOut_buf_cur), (size_t)(pIn_buf_end - pIn_buf_cur)), counter);
+ TINFL_MEMCPY(pOut_buf_cur, pIn_buf_cur, n);
+ pIn_buf_cur += n;
+ pOut_buf_cur += n;
+ counter -= (mz_uint)n;
+ }
+ }
+ else if (r->m_type == 3)
+ {
+ TINFL_CR_RETURN_FOREVER(10, TINFL_STATUS_FAILED);
+ }
+ else
+ {
+ if (r->m_type == 1)
+ {
+ mz_uint8 *p = r->m_tables[0].m_code_size;
+ mz_uint i;
+ r->m_table_sizes[0] = 288;
+ r->m_table_sizes[1] = 32;
+ TINFL_MEMSET(r->m_tables[1].m_code_size, 5, 32);
+ for (i = 0; i <= 143; ++i)
+ *p++ = 8;
+ for (; i <= 255; ++i)
+ *p++ = 9;
+ for (; i <= 279; ++i)
+ *p++ = 7;
+ for (; i <= 287; ++i)
+ *p++ = 8;
+ }
+ else
+ {
+ for (counter = 0; counter < 3; counter++)
+ {
+ TINFL_GET_BITS(11, r->m_table_sizes[counter], "\05\05\04"[counter]);
+ r->m_table_sizes[counter] += s_min_table_sizes[counter];
+ }
+ MZ_CLEAR_OBJ(r->m_tables[2].m_code_size);
+ for (counter = 0; counter < r->m_table_sizes[2]; counter++)
+ {
+ mz_uint s;
+ TINFL_GET_BITS(14, s, 3);
+ r->m_tables[2].m_code_size[s_length_dezigzag[counter]] = (mz_uint8)s;
+ }
+ r->m_table_sizes[2] = 19;
+ }
+ for (; (int)r->m_type >= 0; r->m_type--)
+ {
+ int tree_next, tree_cur;
+ tinfl_huff_table *pTable;
+ mz_uint i, j, used_syms, total, sym_index, next_code[17], total_syms[16];
+ pTable = &r->m_tables[r->m_type];
+ MZ_CLEAR_OBJ(total_syms);
+ MZ_CLEAR_OBJ(pTable->m_look_up);
+ MZ_CLEAR_OBJ(pTable->m_tree);
+ for (i = 0; i < r->m_table_sizes[r->m_type]; ++i)
+ total_syms[pTable->m_code_size[i]]++;
+ used_syms = 0, total = 0;
+ next_code[0] = next_code[1] = 0;
+ for (i = 1; i <= 15; ++i)
+ {
+ used_syms += total_syms[i];
+ next_code[i + 1] = (total = ((total + total_syms[i]) << 1));
+ }
+ if ((65536 != total) && (used_syms > 1))
+ {
+ TINFL_CR_RETURN_FOREVER(35, TINFL_STATUS_FAILED);
+ }
+ for (tree_next = -1, sym_index = 0; sym_index < r->m_table_sizes[r->m_type]; ++sym_index)
+ {
+ mz_uint rev_code = 0, l, cur_code, code_size = pTable->m_code_size[sym_index];
+ if (!code_size)
+ continue;
+ cur_code = next_code[code_size]++;
+ for (l = code_size; l > 0; l--, cur_code >>= 1)
+ rev_code = (rev_code << 1) | (cur_code & 1);
+ if (code_size <= TINFL_FAST_LOOKUP_BITS)
+ {
+ mz_int16 k = (mz_int16)((code_size << 9) | sym_index);
+ while (rev_code < TINFL_FAST_LOOKUP_SIZE)
+ {
+ pTable->m_look_up[rev_code] = k;
+ rev_code += (1 << code_size);
+ }
+ continue;
+ }
+ if (0 == (tree_cur = pTable->m_look_up[rev_code & (TINFL_FAST_LOOKUP_SIZE - 1)]))
+ {
+ pTable->m_look_up[rev_code & (TINFL_FAST_LOOKUP_SIZE - 1)] = (mz_int16)tree_next;
+ tree_cur = tree_next;
+ tree_next -= 2;
+ }
+ rev_code >>= (TINFL_FAST_LOOKUP_BITS - 1);
+ for (j = code_size; j > (TINFL_FAST_LOOKUP_BITS + 1); j--)
+ {
+ tree_cur -= ((rev_code >>= 1) & 1);
+ if (!pTable->m_tree[-tree_cur - 1])
+ {
+ pTable->m_tree[-tree_cur - 1] = (mz_int16)tree_next;
+ tree_cur = tree_next;
+ tree_next -= 2;
+ }
+ else
+ tree_cur = pTable->m_tree[-tree_cur - 1];
+ }
+ tree_cur -= ((rev_code >>= 1) & 1);
+ pTable->m_tree[-tree_cur - 1] = (mz_int16)sym_index;
+ }
+ if (r->m_type == 2)
+ {
+ for (counter = 0; counter < (r->m_table_sizes[0] + r->m_table_sizes[1]);)
+ {
+ mz_uint s;
+ TINFL_HUFF_DECODE(16, dist, &r->m_tables[2]);
+ if (dist < 16)
+ {
+ r->m_len_codes[counter++] = (mz_uint8)dist;
+ continue;
+ }
+ if ((dist == 16) && (!counter))
+ {
+ TINFL_CR_RETURN_FOREVER(17, TINFL_STATUS_FAILED);
+ }
+ num_extra = "\02\03\07"[dist - 16];
+ TINFL_GET_BITS(18, s, num_extra);
+ s += "\03\03\013"[dist - 16];
+ TINFL_MEMSET(r->m_len_codes + counter, (dist == 16) ? r->m_len_codes[counter - 1] : 0, s);
+ counter += s;
+ }
+ if ((r->m_table_sizes[0] + r->m_table_sizes[1]) != counter)
+ {
+ TINFL_CR_RETURN_FOREVER(21, TINFL_STATUS_FAILED);
+ }
+ TINFL_MEMCPY(r->m_tables[0].m_code_size, r->m_len_codes, r->m_table_sizes[0]);
+ TINFL_MEMCPY(r->m_tables[1].m_code_size, r->m_len_codes + r->m_table_sizes[0], r->m_table_sizes[1]);
+ }
+ }
+ for (;;)
+ {
+ mz_uint8 *pSrc;
+ for (;;)
+ {
+ if (((pIn_buf_end - pIn_buf_cur) < 4) || ((pOut_buf_end - pOut_buf_cur) < 2))
+ {
+ TINFL_HUFF_DECODE(23, counter, &r->m_tables[0]);
+ if (counter >= 256)
+ break;
+ while (pOut_buf_cur >= pOut_buf_end)
+ {
+ TINFL_CR_RETURN(24, TINFL_STATUS_HAS_MORE_OUTPUT);
+ }
+ *pOut_buf_cur++ = (mz_uint8)counter;
+ }
+ else
+ {
+ int sym2;
+ mz_uint code_len;
+#if TINFL_USE_64BIT_BITBUF
+ if (num_bits < 30)
+ {
+ bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE32(pIn_buf_cur)) << num_bits);
+ pIn_buf_cur += 4;
+ num_bits += 32;
+ }
+#else
+ if (num_bits < 15)
+ {
+ bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE16(pIn_buf_cur)) << num_bits);
+ pIn_buf_cur += 2;
+ num_bits += 16;
+ }
+#endif
+ if ((sym2 = r->m_tables[0].m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0)
+ code_len = sym2 >> 9;
+ else
+ {
+ code_len = TINFL_FAST_LOOKUP_BITS;
+ do
+ {
+ sym2 = r->m_tables[0].m_tree[~sym2 + ((bit_buf >> code_len++) & 1)];
+ } while (sym2 < 0);
+ }
+ counter = sym2;
+ bit_buf >>= code_len;
+ num_bits -= code_len;
+ if (counter & 256)
+ break;
+
+#if !TINFL_USE_64BIT_BITBUF
+ if (num_bits < 15)
+ {
+ bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE16(pIn_buf_cur)) << num_bits);
+ pIn_buf_cur += 2;
+ num_bits += 16;
+ }
+#endif
+ if ((sym2 = r->m_tables[0].m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0)
+ code_len = sym2 >> 9;
+ else
+ {
+ code_len = TINFL_FAST_LOOKUP_BITS;
+ do
+ {
+ sym2 = r->m_tables[0].m_tree[~sym2 + ((bit_buf >> code_len++) & 1)];
+ } while (sym2 < 0);
+ }
+ bit_buf >>= code_len;
+ num_bits -= code_len;
+
+ pOut_buf_cur[0] = (mz_uint8)counter;
+ if (sym2 & 256)
+ {
+ pOut_buf_cur++;
+ counter = sym2;
+ break;
+ }
+ pOut_buf_cur[1] = (mz_uint8)sym2;
+ pOut_buf_cur += 2;
+ }
+ }
+ if ((counter &= 511) == 256)
+ break;
+
+ num_extra = s_length_extra[counter - 257];
+ counter = s_length_base[counter - 257];
+ if (num_extra)
+ {
+ mz_uint extra_bits;
+ TINFL_GET_BITS(25, extra_bits, num_extra);
+ counter += extra_bits;
+ }
+
+ TINFL_HUFF_DECODE(26, dist, &r->m_tables[1]);
+ num_extra = s_dist_extra[dist];
+ dist = s_dist_base[dist];
+ if (num_extra)
+ {
+ mz_uint extra_bits;
+ TINFL_GET_BITS(27, extra_bits, num_extra);
+ dist += extra_bits;
+ }
+
+ dist_from_out_buf_start = pOut_buf_cur - pOut_buf_start;
+ if ((dist == 0 || dist > dist_from_out_buf_start || dist_from_out_buf_start == 0) && (decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF))
+ {
+ TINFL_CR_RETURN_FOREVER(37, TINFL_STATUS_FAILED);
+ }
+
+ pSrc = pOut_buf_start + ((dist_from_out_buf_start - dist) & out_buf_size_mask);
+
+ if ((MZ_MAX(pOut_buf_cur, pSrc) + counter) > pOut_buf_end)
+ {
+ while (counter--)
+ {
+ while (pOut_buf_cur >= pOut_buf_end)
+ {
+ TINFL_CR_RETURN(53, TINFL_STATUS_HAS_MORE_OUTPUT);
+ }
+ *pOut_buf_cur++ = pOut_buf_start[(dist_from_out_buf_start++ - dist) & out_buf_size_mask];
+ }
+ continue;
+ }
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES
+ else if ((counter >= 9) && (counter <= dist))
+ {
+ const mz_uint8 *pSrc_end = pSrc + (counter & ~7);
+ do
+ {
+#ifdef MINIZ_UNALIGNED_USE_MEMCPY
+ memcpy(pOut_buf_cur, pSrc, sizeof(mz_uint32)*2);
+#else
+ ((mz_uint32 *)pOut_buf_cur)[0] = ((const mz_uint32 *)pSrc)[0];
+ ((mz_uint32 *)pOut_buf_cur)[1] = ((const mz_uint32 *)pSrc)[1];
+#endif
+ pOut_buf_cur += 8;
+ } while ((pSrc += 8) < pSrc_end);
+ if ((counter &= 7) < 3)
+ {
+ if (counter)
+ {
+ pOut_buf_cur[0] = pSrc[0];
+ if (counter > 1)
+ pOut_buf_cur[1] = pSrc[1];
+ pOut_buf_cur += counter;
+ }
+ continue;
+ }
+ }
+#endif
+ while (counter > 2)
+ {
+ pOut_buf_cur[0] = pSrc[0];
+ pOut_buf_cur[1] = pSrc[1];
+ pOut_buf_cur[2] = pSrc[2];
+ pOut_buf_cur += 3;
+ pSrc += 3;
+ counter -= 3;
+ }
+ if (counter > 0)
+ {
+ pOut_buf_cur[0] = pSrc[0];
+ if (counter > 1)
+ pOut_buf_cur[1] = pSrc[1];
+ pOut_buf_cur += counter;
+ }
+ }
+ }
+ } while (!(r->m_final & 1));
+
+ /* Ensure byte alignment and put back any bytes from the bitbuf if we've looked ahead too far on gzip, or other Deflate streams followed by arbitrary data. */
+ /* I'm being super conservative here. A number of simplifications can be made to the byte alignment part, and the Adler32 check shouldn't ever need to worry about reading from the bitbuf now. */
+ TINFL_SKIP_BITS(32, num_bits & 7);
+ while ((pIn_buf_cur > pIn_buf_next) && (num_bits >= 8))
+ {
+ --pIn_buf_cur;
+ num_bits -= 8;
+ }
+ bit_buf &= (tinfl_bit_buf_t)((((mz_uint64)1) << num_bits) - (mz_uint64)1);
+ MZ_ASSERT(!num_bits); /* if this assert fires then we've read beyond the end of non-deflate/zlib streams with following data (such as gzip streams). */
+
+ if (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER)
+ {
+ for (counter = 0; counter < 4; ++counter)
+ {
+ mz_uint s;
+ if (num_bits)
+ TINFL_GET_BITS(41, s, 8);
+ else
+ TINFL_GET_BYTE(42, s);
+ r->m_z_adler32 = (r->m_z_adler32 << 8) | s;
+ }
+ }
+ TINFL_CR_RETURN_FOREVER(34, TINFL_STATUS_DONE);
+
+ TINFL_CR_FINISH
+
+common_exit:
+ /* As long as we aren't telling the caller that we NEED more input to make forward progress: */
+ /* Put back any bytes from the bitbuf in case we've looked ahead too far on gzip, or other Deflate streams followed by arbitrary data. */
+ /* We need to be very careful here to NOT push back any bytes we definitely know we need to make forward progress, though, or we'll lock the caller into an infinite loop. */
+ if ((status != TINFL_STATUS_NEEDS_MORE_INPUT) && (status != TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS))
+ {
+ while ((pIn_buf_cur > pIn_buf_next) && (num_bits >= 8))
+ {
+ --pIn_buf_cur;
+ num_bits -= 8;
+ }
+ }
+ r->m_num_bits = num_bits;
+ r->m_bit_buf = bit_buf & (tinfl_bit_buf_t)((((mz_uint64)1) << num_bits) - (mz_uint64)1);
+ r->m_dist = dist;
+ r->m_counter = counter;
+ r->m_num_extra = num_extra;
+ r->m_dist_from_out_buf_start = dist_from_out_buf_start;
+ *pIn_buf_size = pIn_buf_cur - pIn_buf_next;
+ *pOut_buf_size = pOut_buf_cur - pOut_buf_next;
+ if ((decomp_flags & (TINFL_FLAG_PARSE_ZLIB_HEADER | TINFL_FLAG_COMPUTE_ADLER32)) && (status >= 0))
+ {
+ const mz_uint8 *ptr = pOut_buf_next;
+ size_t buf_len = *pOut_buf_size;
+ mz_uint32 i, s1 = r->m_check_adler32 & 0xffff, s2 = r->m_check_adler32 >> 16;
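+ /* Annotation: 5552 is the largest n for which 255*n*(n+1)/2 + (n+1)*65520 stays below 2^32,
+ i.e. the most bytes that can be summed before s1/s2 must be reduced mod 65521 (zlib's NMAX). */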
+ size_t block_len = buf_len % 5552;
+ while (buf_len)
+ {
+ for (i = 0; i + 7 < block_len; i += 8, ptr += 8)
+ {
+ s1 += ptr[0], s2 += s1;
+ s1 += ptr[1], s2 += s1;
+ s1 += ptr[2], s2 += s1;
+ s1 += ptr[3], s2 += s1;
+ s1 += ptr[4], s2 += s1;
+ s1 += ptr[5], s2 += s1;
+ s1 += ptr[6], s2 += s1;
+ s1 += ptr[7], s2 += s1;
+ }
+ for (; i < block_len; ++i)
+ s1 += *ptr++, s2 += s1;
+ s1 %= 65521U, s2 %= 65521U;
+ buf_len -= block_len;
+ block_len = 5552;
+ }
+ r->m_check_adler32 = (s2 << 16) + s1;
+ if ((status == TINFL_STATUS_DONE) && (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER) && (r->m_check_adler32 != r->m_z_adler32))
+ status = TINFL_STATUS_ADLER32_MISMATCH;
+ }
+ return status;
+}
+
+/* Higher level helper functions. */
+void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags)
+{
+ tinfl_decompressor decomp;
+ void *pBuf = NULL, *pNew_buf;
+ size_t src_buf_ofs = 0, out_buf_capacity = 0;
+ *pOut_len = 0;
+ tinfl_init(&decomp);
+ for (;;)
+ {
+ size_t src_buf_size = src_buf_len - src_buf_ofs, dst_buf_size = out_buf_capacity - *pOut_len, new_out_buf_capacity;
+ tinfl_status status = tinfl_decompress(&decomp, (const mz_uint8 *)pSrc_buf + src_buf_ofs, &src_buf_size, (mz_uint8 *)pBuf, pBuf ? (mz_uint8 *)pBuf + *pOut_len : NULL, &dst_buf_size,
+ (flags & ~TINFL_FLAG_HAS_MORE_INPUT) | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
+ if ((status < 0) || (status == TINFL_STATUS_NEEDS_MORE_INPUT))
+ {
+ MZ_FREE(pBuf);
+ *pOut_len = 0;
+ return NULL;
+ }
+ src_buf_ofs += src_buf_size;
+ *pOut_len += dst_buf_size;
+ if (status == TINFL_STATUS_DONE)
+ break;
+ new_out_buf_capacity = out_buf_capacity * 2;
+ if (new_out_buf_capacity < 128)
+ new_out_buf_capacity = 128;
+ pNew_buf = MZ_REALLOC(pBuf, new_out_buf_capacity);
+ if (!pNew_buf)
+ {
+ MZ_FREE(pBuf);
+ *pOut_len = 0;
+ return NULL;
+ }
+ pBuf = pNew_buf;
+ out_buf_capacity = new_out_buf_capacity;
+ }
+ return pBuf;
+}
+
+size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags)
+{
+ tinfl_decompressor decomp;
+ tinfl_status status;
+ tinfl_init(&decomp);
+ status = tinfl_decompress(&decomp, (const mz_uint8 *)pSrc_buf, &src_buf_len, (mz_uint8 *)pOut_buf, (mz_uint8 *)pOut_buf, &out_buf_len, (flags & ~TINFL_FLAG_HAS_MORE_INPUT) | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
+ return (status != TINFL_STATUS_DONE) ? TINFL_DECOMPRESS_MEM_TO_MEM_FAILED : out_buf_len;
+}
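+
+/* Round-trip sketch (annotation, not part of miniz): deflate with a zlib header, then inflate
+   and verify. 'example_roundtrip' is a hypothetical name; TINFL_FLAG_PARSE_ZLIB_HEADER also
+   makes tinfl check the trailing adler-32 on completion. */
+#if 0
+static mz_bool example_roundtrip(const void *pData, size_t len)
+{
+    size_t comp_len = 0, decomp_len = 0;
+    void *pComp = tdefl_compress_mem_to_heap(pData, len, &comp_len, TDEFL_DEFAULT_MAX_PROBES | TDEFL_WRITE_ZLIB_HEADER);
+    void *pDecomp = pComp ? tinfl_decompress_mem_to_heap(pComp, comp_len, &decomp_len, TINFL_FLAG_PARSE_ZLIB_HEADER) : NULL;
+    mz_bool ok = (pDecomp != NULL) && (decomp_len == len) && (memcmp(pDecomp, pData, len) == 0);
+    MZ_FREE(pDecomp);
+    MZ_FREE(pComp);
+    return ok;
+}
+#endif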
+
+int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size, tinfl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags)
+{
+ int result = 0;
+ tinfl_decompressor decomp;
+ mz_uint8 *pDict = (mz_uint8 *)MZ_MALLOC(TINFL_LZ_DICT_SIZE);
+ size_t in_buf_ofs = 0, dict_ofs = 0;
+ if (!pDict)
+ return TINFL_STATUS_FAILED;
+ tinfl_init(&decomp);
+ for (;;)
+ {
+ size_t in_buf_size = *pIn_buf_size - in_buf_ofs, dst_buf_size = TINFL_LZ_DICT_SIZE - dict_ofs;
+ tinfl_status status = tinfl_decompress(&decomp, (const mz_uint8 *)pIn_buf + in_buf_ofs, &in_buf_size, pDict, pDict + dict_ofs, &dst_buf_size,
+ (flags & ~(TINFL_FLAG_HAS_MORE_INPUT | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)));
+ in_buf_ofs += in_buf_size;
+ if ((dst_buf_size) && (!(*pPut_buf_func)(pDict + dict_ofs, (int)dst_buf_size, pPut_buf_user)))
+ break;
+ if (status != TINFL_STATUS_HAS_MORE_OUTPUT)
+ {
+ result = (status == TINFL_STATUS_DONE);
+ break;
+ }
+ dict_ofs = (dict_ofs + dst_buf_size) & (TINFL_LZ_DICT_SIZE - 1);
+ }
+ MZ_FREE(pDict);
+ *pIn_buf_size = in_buf_ofs;
+ return result;
+}
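+
+/* Sketch (annotation, not part of miniz): inflating a raw deflate stream straight to a FILE*
+   via the callback API above, so only the fixed TINFL_LZ_DICT_SIZE dictionary is ever held in
+   memory. The callback and wrapper names are hypothetical. */
+#if 0
+#include <stdio.h>
+static int example_stdio_writer(const void *pBuf, int len, void *pUser)
+{
+    return fwrite(pBuf, 1, (size_t)len, (FILE *)pUser) == (size_t)len;
+}
+
+static int example_inflate_to_file(const void *pComp, size_t comp_len, FILE *pOut)
+{
+    /* Returns 1 on success; comp_len is updated to the number of input bytes consumed. */
+    return tinfl_decompress_mem_to_callback(pComp, &comp_len, example_stdio_writer, pOut, 0);
+}
+#endif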
+
+#ifndef MINIZ_NO_MALLOC
+tinfl_decompressor *tinfl_decompressor_alloc()
+{
+ tinfl_decompressor *pDecomp = (tinfl_decompressor *)MZ_MALLOC(sizeof(tinfl_decompressor));
+ if (pDecomp)
+ tinfl_init(pDecomp);
+ return pDecomp;
+}
+
+void tinfl_decompressor_free(tinfl_decompressor *pDecomp)
+{
+ MZ_FREE(pDecomp);
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+ /**************************************************************************
+ *
+ * Copyright 2013-2014 RAD Game Tools and Valve Software
+ * Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
+ * Copyright 2016 Martin Raiber
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef MINIZ_NO_ARCHIVE_APIS
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------- .ZIP archive reading */
+
+#ifdef MINIZ_NO_STDIO
+#define MZ_FILE void *
+#else
+#include <sys/stat.h>
+
+#if defined(_MSC_VER) || defined(__MINGW64__)
+static FILE *mz_fopen(const char *pFilename, const char *pMode)
+{
+ FILE *pFile = NULL;
+ fopen_s(&pFile, pFilename, pMode);
+ return pFile;
+}
+static FILE *mz_freopen(const char *pPath, const char *pMode, FILE *pStream)
+{
+ FILE *pFile = NULL;
+ if (freopen_s(&pFile, pPath, pMode, pStream))
+ return NULL;
+ return pFile;
+}
+#ifndef MINIZ_NO_TIME
+#include <sys/utime.h>
+#endif
+#define MZ_FOPEN mz_fopen
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 _ftelli64
+#define MZ_FSEEK64 _fseeki64
+#define MZ_FILE_STAT_STRUCT _stat64
+#define MZ_FILE_STAT _stat64
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN mz_freopen
+#define MZ_DELETE_FILE remove
+#elif defined(__MINGW32__)
+#ifndef MINIZ_NO_TIME
+#include <sys/utime.h>
+#endif
+#define MZ_FOPEN(f, m) fopen(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 ftello64
+#define MZ_FSEEK64 fseeko64
+#define MZ_FILE_STAT_STRUCT _stat
+#define MZ_FILE_STAT _stat
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(f, m, s) freopen(f, m, s)
+#define MZ_DELETE_FILE remove
+#elif defined(__TINYC__)
+#ifndef MINIZ_NO_TIME
+#include <sys/utime.h>
+#endif
+#define MZ_FOPEN(f, m) fopen(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 ftell
+#define MZ_FSEEK64 fseek
+#define MZ_FILE_STAT_STRUCT stat
+#define MZ_FILE_STAT stat
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(f, m, s) freopen(f, m, s)
+#define MZ_DELETE_FILE remove
+#elif defined(__USE_LARGEFILE64) /* gcc, clang */
+#ifndef MINIZ_NO_TIME
+#include <utime.h>
+#endif
+#define MZ_FOPEN(f, m) fopen64(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 ftello64
+#define MZ_FSEEK64 fseeko64
+#define MZ_FILE_STAT_STRUCT stat64
+#define MZ_FILE_STAT stat64
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(p, m, s) freopen64(p, m, s)
+#define MZ_DELETE_FILE remove
+#elif defined(__APPLE__)
+#ifndef MINIZ_NO_TIME
+#include <utime.h>
+#endif
+#define MZ_FOPEN(f, m) fopen(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 ftello
+#define MZ_FSEEK64 fseeko
+#define MZ_FILE_STAT_STRUCT stat
+#define MZ_FILE_STAT stat
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(p, m, s) freopen(p, m, s)
+#define MZ_DELETE_FILE remove
+
+#else
+#pragma message("Using fopen, ftello, fseeko, stat() etc. path for file I/O - this path may not support large files.")
+#ifndef MINIZ_NO_TIME
+#include <utime.h>
+#endif
+#define MZ_FOPEN(f, m) fopen(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#ifdef __STRICT_ANSI__
+#define MZ_FTELL64 ftell
+#define MZ_FSEEK64 fseek
+#else
+#define MZ_FTELL64 ftello
+#define MZ_FSEEK64 fseeko
+#endif
+#define MZ_FILE_STAT_STRUCT stat
+#define MZ_FILE_STAT stat
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(f, m, s) freopen(f, m, s)
+#define MZ_DELETE_FILE remove
+#endif /* #ifdef _MSC_VER */
+#endif /* #ifdef MINIZ_NO_STDIO */
+
+#define MZ_TOLOWER(c) ((((c) >= 'A') && ((c) <= 'Z')) ? ((c) - 'A' + 'a') : (c))
+
+/* Various ZIP archive enums. To completely avoid cross-platform compiler alignment and endian issues, miniz.c doesn't use structs for any of this stuff. */
+enum
+{
+ /* ZIP archive identifiers and record sizes */
+ MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIG = 0x06054b50,
+ MZ_ZIP_CENTRAL_DIR_HEADER_SIG = 0x02014b50,
+ MZ_ZIP_LOCAL_DIR_HEADER_SIG = 0x04034b50,
+ MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30,
+ MZ_ZIP_CENTRAL_DIR_HEADER_SIZE = 46,
+ MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE = 22,
+
+ /* ZIP64 archive identifier and record sizes */
+ MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIG = 0x06064b50,
+ MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIG = 0x07064b50,
+ MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE = 56,
+ MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE = 20,
+ MZ_ZIP64_EXTENDED_INFORMATION_FIELD_HEADER_ID = 0x0001,
+ MZ_ZIP_DATA_DESCRIPTOR_ID = 0x08074b50,
+ MZ_ZIP_DATA_DESCRIPTER_SIZE64 = 24,
+ MZ_ZIP_DATA_DESCRIPTER_SIZE32 = 16,
+
+ /* Central directory header record offsets */
+ MZ_ZIP_CDH_SIG_OFS = 0,
+ MZ_ZIP_CDH_VERSION_MADE_BY_OFS = 4,
+ MZ_ZIP_CDH_VERSION_NEEDED_OFS = 6,
+ MZ_ZIP_CDH_BIT_FLAG_OFS = 8,
+ MZ_ZIP_CDH_METHOD_OFS = 10,
+ MZ_ZIP_CDH_FILE_TIME_OFS = 12,
+ MZ_ZIP_CDH_FILE_DATE_OFS = 14,
+ MZ_ZIP_CDH_CRC32_OFS = 16,
+ MZ_ZIP_CDH_COMPRESSED_SIZE_OFS = 20,
+ MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS = 24,
+ MZ_ZIP_CDH_FILENAME_LEN_OFS = 28,
+ MZ_ZIP_CDH_EXTRA_LEN_OFS = 30,
+ MZ_ZIP_CDH_COMMENT_LEN_OFS = 32,
+ MZ_ZIP_CDH_DISK_START_OFS = 34,
+ MZ_ZIP_CDH_INTERNAL_ATTR_OFS = 36,
+ MZ_ZIP_CDH_EXTERNAL_ATTR_OFS = 38,
+ MZ_ZIP_CDH_LOCAL_HEADER_OFS = 42,
+
+ /* Local directory header offsets */
+ MZ_ZIP_LDH_SIG_OFS = 0,
+ MZ_ZIP_LDH_VERSION_NEEDED_OFS = 4,
+ MZ_ZIP_LDH_BIT_FLAG_OFS = 6,
+ MZ_ZIP_LDH_METHOD_OFS = 8,
+ MZ_ZIP_LDH_FILE_TIME_OFS = 10,
+ MZ_ZIP_LDH_FILE_DATE_OFS = 12,
+ MZ_ZIP_LDH_CRC32_OFS = 14,
+ MZ_ZIP_LDH_COMPRESSED_SIZE_OFS = 18,
+ MZ_ZIP_LDH_DECOMPRESSED_SIZE_OFS = 22,
+ MZ_ZIP_LDH_FILENAME_LEN_OFS = 26,
+ MZ_ZIP_LDH_EXTRA_LEN_OFS = 28,
+ MZ_ZIP_LDH_BIT_FLAG_HAS_LOCATOR = 1 << 3,
+
+ /* End of central directory offsets */
+ MZ_ZIP_ECDH_SIG_OFS = 0,
+ MZ_ZIP_ECDH_NUM_THIS_DISK_OFS = 4,
+ MZ_ZIP_ECDH_NUM_DISK_CDIR_OFS = 6,
+ MZ_ZIP_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS = 8,
+ MZ_ZIP_ECDH_CDIR_TOTAL_ENTRIES_OFS = 10,
+ MZ_ZIP_ECDH_CDIR_SIZE_OFS = 12,
+ MZ_ZIP_ECDH_CDIR_OFS_OFS = 16,
+ MZ_ZIP_ECDH_COMMENT_SIZE_OFS = 20,
+
+ /* ZIP64 End of central directory locator offsets */
+ MZ_ZIP64_ECDL_SIG_OFS = 0, /* 4 bytes */
+ MZ_ZIP64_ECDL_NUM_DISK_CDIR_OFS = 4, /* 4 bytes */
+ MZ_ZIP64_ECDL_REL_OFS_TO_ZIP64_ECDR_OFS = 8, /* 8 bytes */
+ MZ_ZIP64_ECDL_TOTAL_NUMBER_OF_DISKS_OFS = 16, /* 4 bytes */
+
+ /* ZIP64 End of central directory header offsets */
+ MZ_ZIP64_ECDH_SIG_OFS = 0, /* 4 bytes */
+ MZ_ZIP64_ECDH_SIZE_OF_RECORD_OFS = 4, /* 8 bytes */
+ MZ_ZIP64_ECDH_VERSION_MADE_BY_OFS = 12, /* 2 bytes */
+ MZ_ZIP64_ECDH_VERSION_NEEDED_OFS = 14, /* 2 bytes */
+ MZ_ZIP64_ECDH_NUM_THIS_DISK_OFS = 16, /* 4 bytes */
+ MZ_ZIP64_ECDH_NUM_DISK_CDIR_OFS = 20, /* 4 bytes */
+ MZ_ZIP64_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS = 24, /* 8 bytes */
+ MZ_ZIP64_ECDH_CDIR_TOTAL_ENTRIES_OFS = 32, /* 8 bytes */
+ MZ_ZIP64_ECDH_CDIR_SIZE_OFS = 40, /* 8 bytes */
+ MZ_ZIP64_ECDH_CDIR_OFS_OFS = 48, /* 8 bytes */
+ MZ_ZIP_VERSION_MADE_BY_DOS_FILESYSTEM_ID = 0,
+ MZ_ZIP_DOS_DIR_ATTRIBUTE_BITFLAG = 0x10,
+ MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_IS_ENCRYPTED = 1,
+ MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_COMPRESSED_PATCH_FLAG = 32,
+ MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_USES_STRONG_ENCRYPTION = 64,
+ MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_LOCAL_DIR_IS_MASKED = 8192,
+ MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_UTF8 = 1 << 11
+};
+
+typedef struct
+{
+ void *m_p;
+ size_t m_size, m_capacity;
+ mz_uint m_element_size;
+} mz_zip_array;
+
+struct mz_zip_internal_state_tag
+{
+ mz_zip_array m_central_dir;
+ mz_zip_array m_central_dir_offsets;
+ mz_zip_array m_sorted_central_dir_offsets;
+
+ /* The flags passed in when the archive is initially opened. */
+ uint32_t m_init_flags;
+
+ /* MZ_TRUE if the archive has a zip64 end of central directory headers, etc. */
+ mz_bool m_zip64;
+
+ /* MZ_TRUE if we found zip64 extended info in the central directory (m_zip64 will also be slammed to true, even if we didn't find a zip64 end of central dir header, etc.) */
+ mz_bool m_zip64_has_extended_info_fields;
+
+ /* These fields are used by the file, FILE, memory, and memory/heap read/write helpers. */
+ MZ_FILE *m_pFile;
+ mz_uint64 m_file_archive_start_ofs;
+
+ void *m_pMem;
+ size_t m_mem_size;
+ size_t m_mem_capacity;
+};
+
+#define MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(array_ptr, element_size) (array_ptr)->m_element_size = element_size
+
+#if defined(DEBUG) || defined(_DEBUG)
+static MZ_FORCEINLINE mz_uint mz_zip_array_range_check(const mz_zip_array *pArray, mz_uint index)
+{
+ MZ_ASSERT(index < pArray->m_size);
+ return index;
+}
+#define MZ_ZIP_ARRAY_ELEMENT(array_ptr, element_type, index) ((element_type *)((array_ptr)->m_p))[mz_zip_array_range_check(array_ptr, index)]
+#else
+#define MZ_ZIP_ARRAY_ELEMENT(array_ptr, element_type, index) ((element_type *)((array_ptr)->m_p))[index]
+#endif
+
+static MZ_FORCEINLINE void mz_zip_array_init(mz_zip_array *pArray, mz_uint32 element_size)
+{
+ memset(pArray, 0, sizeof(mz_zip_array));
+ pArray->m_element_size = element_size;
+}
+
+static MZ_FORCEINLINE void mz_zip_array_clear(mz_zip_archive *pZip, mz_zip_array *pArray)
+{
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pArray->m_p);
+ memset(pArray, 0, sizeof(mz_zip_array));
+}
+
+static mz_bool mz_zip_array_ensure_capacity(mz_zip_archive *pZip, mz_zip_array *pArray, size_t min_new_capacity, mz_uint growing)
+{
+ void *pNew_p;
+ size_t new_capacity = min_new_capacity;
+ MZ_ASSERT(pArray->m_element_size);
+ if (pArray->m_capacity >= min_new_capacity)
+ return MZ_TRUE;
+ if (growing)
+ {
+ new_capacity = MZ_MAX(1, pArray->m_capacity);
+ while (new_capacity < min_new_capacity)
+ new_capacity *= 2;
+ }
+ if (NULL == (pNew_p = pZip->m_pRealloc(pZip->m_pAlloc_opaque, pArray->m_p, pArray->m_element_size, new_capacity)))
+ return MZ_FALSE;
+ pArray->m_p = pNew_p;
+ pArray->m_capacity = new_capacity;
+ return MZ_TRUE;
+}
+
+static MZ_FORCEINLINE mz_bool mz_zip_array_reserve(mz_zip_archive *pZip, mz_zip_array *pArray, size_t new_capacity, mz_uint growing)
+{
+ if (new_capacity > pArray->m_capacity)
+ {
+ if (!mz_zip_array_ensure_capacity(pZip, pArray, new_capacity, growing))
+ return MZ_FALSE;
+ }
+ return MZ_TRUE;
+}
+
+static MZ_FORCEINLINE mz_bool mz_zip_array_resize(mz_zip_archive *pZip, mz_zip_array *pArray, size_t new_size, mz_uint growing)
+{
+ if (new_size > pArray->m_capacity)
+ {
+ if (!mz_zip_array_ensure_capacity(pZip, pArray, new_size, growing))
+ return MZ_FALSE;
+ }
+ pArray->m_size = new_size;
+ return MZ_TRUE;
+}
+
+static MZ_FORCEINLINE mz_bool mz_zip_array_ensure_room(mz_zip_archive *pZip, mz_zip_array *pArray, size_t n)
+{
+ return mz_zip_array_reserve(pZip, pArray, pArray->m_size + n, MZ_TRUE);
+}
+
+static MZ_FORCEINLINE mz_bool mz_zip_array_push_back(mz_zip_archive *pZip, mz_zip_array *pArray, const void *pElements, size_t n)
+{
+ size_t orig_size = pArray->m_size;
+ if (!mz_zip_array_resize(pZip, pArray, orig_size + n, MZ_TRUE))
+ return MZ_FALSE;
+ if (n > 0)
+ memcpy((mz_uint8 *)pArray->m_p + orig_size * pArray->m_element_size, pElements, n * pArray->m_element_size);
+ return MZ_TRUE;
+}
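+
+/* Minimal usage sketch of the growable-array helpers above (an illustrative comment
+ only, not part of miniz; `zip` stands for an already-initialized mz_zip_archive so
+ the allocator callbacks are set):
+
+ mz_zip_array arr;
+ mz_uint32 value = 42;
+ mz_zip_array_init(&arr, sizeof(mz_uint32)); // element size is fixed at init
+ if (mz_zip_array_push_back(&zip, &arr, &value, 1)) // capacity grows by doubling
+ value = MZ_ZIP_ARRAY_ELEMENT(&arr, mz_uint32, 0); // range-checked in debug builds
+ mz_zip_array_clear(&zip, &arr); // frees via the archive's m_pFree
+*/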
+
+#ifndef MINIZ_NO_TIME
+static MZ_TIME_T mz_zip_dos_to_time_t(int dos_time, int dos_date)
+{
+ struct tm tm;
+ memset(&tm, 0, sizeof(tm));
+ tm.tm_isdst = -1;
+ tm.tm_year = ((dos_date >> 9) & 127) + 1980 - 1900;
+ tm.tm_mon = ((dos_date >> 5) & 15) - 1;
+ tm.tm_mday = dos_date & 31;
+ tm.tm_hour = (dos_time >> 11) & 31;
+ tm.tm_min = (dos_time >> 5) & 63;
+ tm.tm_sec = (dos_time << 1) & 62;
+ return mktime(&tm);
+}
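+
+/* Worked example of the DOS date/time layout decoded above (illustrative comment only):
+ dos_date bits: YYYYYYYM MMMDDDDD (year since 1980, month 1-12, day 1-31)
+ dos_time bits: HHHHHMMM MMMSSSSS (hour 0-23, minute 0-59, seconds stored as sec/2)
+ e.g. 2021-06-15 10:30:44 -> dos_date = ((2021-1980)<<9)|(6<<5)|15 = 0x52CF,
+ dos_time = (10<<11)|(30<<5)|(44>>1) = 0x53D6 */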
+
+#ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+static void mz_zip_time_t_to_dos_time(MZ_TIME_T time, mz_uint16 *pDOS_time, mz_uint16 *pDOS_date)
+{
+#ifdef _MSC_VER
+ struct tm tm_struct;
+ struct tm *tm = &tm_struct;
+ errno_t err = localtime_s(tm, &time);
+ if (err)
+ {
+ *pDOS_date = 0;
+ *pDOS_time = 0;
+ return;
+ }
+#else
+ struct tm *tm = localtime(&time);
+#endif /* #ifdef _MSC_VER */
+
+ *pDOS_time = (mz_uint16)(((tm->tm_hour) << 11) + ((tm->tm_min) << 5) + ((tm->tm_sec) >> 1));
+ *pDOS_date = (mz_uint16)(((tm->tm_year + 1900 - 1980) << 9) + ((tm->tm_mon + 1) << 5) + tm->tm_mday);
+}
+#endif /* MINIZ_NO_ARCHIVE_WRITING_APIS */
+
+#ifndef MINIZ_NO_STDIO
+#ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+static mz_bool mz_zip_get_file_modified_time(const char *pFilename, MZ_TIME_T *pTime)
+{
+ struct MZ_FILE_STAT_STRUCT file_stat;
+
+ /* On Linux with x86 glibc, this call will fail on large files (I think >= 0x80000000 bytes) unless you compiled with _LARGEFILE64_SOURCE. Argh. */
+ if (MZ_FILE_STAT(pFilename, &file_stat) != 0)
+ return MZ_FALSE;
+
+ *pTime = file_stat.st_mtime;
+
+ return MZ_TRUE;
+}
+#endif /* #ifndef MINIZ_NO_ARCHIVE_WRITING_APIS*/
+
+static mz_bool mz_zip_set_file_times(const char *pFilename, MZ_TIME_T access_time, MZ_TIME_T modified_time)
+{
+ struct utimbuf t;
+
+ memset(&t, 0, sizeof(t));
+ t.actime = access_time;
+ t.modtime = modified_time;
+
+ return !utime(pFilename, &t);
+}
+#endif /* #ifndef MINIZ_NO_STDIO */
+#endif /* #ifndef MINIZ_NO_TIME */
+
+static MZ_FORCEINLINE mz_bool mz_zip_set_error(mz_zip_archive *pZip, mz_zip_error err_num)
+{
+ if (pZip)
+ pZip->m_last_error = err_num;
+ return MZ_FALSE;
+}
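+
+/* Note: mz_zip_set_error() always returns MZ_FALSE, which lets callers record an
+ error and fail in a single statement: `return mz_zip_set_error(pZip, MZ_ZIP_...);` */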
+
+static mz_bool mz_zip_reader_init_internal(mz_zip_archive *pZip, mz_uint flags)
+{
+ (void)flags;
+ if ((!pZip) || (pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_INVALID))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (!pZip->m_pAlloc)
+ pZip->m_pAlloc = miniz_def_alloc_func;
+ if (!pZip->m_pFree)
+ pZip->m_pFree = miniz_def_free_func;
+ if (!pZip->m_pRealloc)
+ pZip->m_pRealloc = miniz_def_realloc_func;
+
+ pZip->m_archive_size = 0;
+ pZip->m_central_directory_file_ofs = 0;
+ pZip->m_total_files = 0;
+ pZip->m_last_error = MZ_ZIP_NO_ERROR;
+
+ if (NULL == (pZip->m_pState = (mz_zip_internal_state *)pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, sizeof(mz_zip_internal_state))))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ memset(pZip->m_pState, 0, sizeof(mz_zip_internal_state));
+ MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_central_dir, sizeof(mz_uint8));
+ MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_central_dir_offsets, sizeof(mz_uint32));
+ MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_sorted_central_dir_offsets, sizeof(mz_uint32));
+ pZip->m_pState->m_init_flags = flags;
+ pZip->m_pState->m_zip64 = MZ_FALSE;
+ pZip->m_pState->m_zip64_has_extended_info_fields = MZ_FALSE;
+
+ pZip->m_zip_mode = MZ_ZIP_MODE_READING;
+
+ return MZ_TRUE;
+}
+
+static MZ_FORCEINLINE mz_bool mz_zip_reader_filename_less(const mz_zip_array *pCentral_dir_array, const mz_zip_array *pCentral_dir_offsets, mz_uint l_index, mz_uint r_index)
+{
+ const mz_uint8 *pL = &MZ_ZIP_ARRAY_ELEMENT(pCentral_dir_array, mz_uint8, MZ_ZIP_ARRAY_ELEMENT(pCentral_dir_offsets, mz_uint32, l_index)), *pE;
+ const mz_uint8 *pR = &MZ_ZIP_ARRAY_ELEMENT(pCentral_dir_array, mz_uint8, MZ_ZIP_ARRAY_ELEMENT(pCentral_dir_offsets, mz_uint32, r_index));
+ mz_uint l_len = MZ_READ_LE16(pL + MZ_ZIP_CDH_FILENAME_LEN_OFS), r_len = MZ_READ_LE16(pR + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+ mz_uint8 l = 0, r = 0;
+ pL += MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+ pR += MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+ pE = pL + MZ_MIN(l_len, r_len);
+ while (pL < pE)
+ {
+ if ((l = MZ_TOLOWER(*pL)) != (r = MZ_TOLOWER(*pR)))
+ break;
+ pL++;
+ pR++;
+ }
+ return (pL == pE) ? (l_len < r_len) : (l < r);
+}
+
+#define MZ_SWAP_UINT32(a, b) \
+ do \
+ { \
+ mz_uint32 t = a; \
+ a = b; \
+ b = t; \
+ } \
+ MZ_MACRO_END
+
+/* Heap sort of lowercased filenames, used to help accelerate plain central directory searches by mz_zip_reader_locate_file(). (Could also use qsort(), but it could allocate memory.) */
+static void mz_zip_reader_sort_central_dir_offsets_by_filename(mz_zip_archive *pZip)
+{
+ mz_zip_internal_state *pState = pZip->m_pState;
+ const mz_zip_array *pCentral_dir_offsets = &pState->m_central_dir_offsets;
+ const mz_zip_array *pCentral_dir = &pState->m_central_dir;
+ mz_uint32 *pIndices;
+ mz_uint32 start, end;
+ const mz_uint32 size = pZip->m_total_files;
+
+ if (size <= 1U)
+ return;
+
+ pIndices = &MZ_ZIP_ARRAY_ELEMENT(&pState->m_sorted_central_dir_offsets, mz_uint32, 0);
+
+ start = (size - 2U) >> 1U;
+ for (;;)
+ {
+ mz_uint64 child, root = start;
+ for (;;)
+ {
+ if ((child = (root << 1U) + 1U) >= size)
+ break;
+ child += (((child + 1U) < size) && (mz_zip_reader_filename_less(pCentral_dir, pCentral_dir_offsets, pIndices[child], pIndices[child + 1U])));
+ if (!mz_zip_reader_filename_less(pCentral_dir, pCentral_dir_offsets, pIndices[root], pIndices[child]))
+ break;
+ MZ_SWAP_UINT32(pIndices[root], pIndices[child]);
+ root = child;
+ }
+ if (!start)
+ break;
+ start--;
+ }
+
+ end = size - 1;
+ while (end > 0)
+ {
+ mz_uint64 child, root = 0;
+ MZ_SWAP_UINT32(pIndices[end], pIndices[0]);
+ for (;;)
+ {
+ if ((child = (root << 1U) + 1U) >= end)
+ break;
+ child += (((child + 1U) < end) && mz_zip_reader_filename_less(pCentral_dir, pCentral_dir_offsets, pIndices[child], pIndices[child + 1U]));
+ if (!mz_zip_reader_filename_less(pCentral_dir, pCentral_dir_offsets, pIndices[root], pIndices[child]))
+ break;
+ MZ_SWAP_UINT32(pIndices[root], pIndices[child]);
+ root = child;
+ }
+ end--;
+ }
+}
+
+static mz_bool mz_zip_reader_locate_header_sig(mz_zip_archive *pZip, mz_uint32 record_sig, mz_uint32 record_size, mz_int64 *pOfs)
+{
+ mz_int64 cur_file_ofs;
+ mz_uint32 buf_u32[4096 / sizeof(mz_uint32)];
+ mz_uint8 *pBuf = (mz_uint8 *)buf_u32;
+
+ /* Basic sanity checks - reject files which are too small */
+ if (pZip->m_archive_size < record_size)
+ return MZ_FALSE;
+
+ /* Find the record by scanning the file from the end towards the beginning. */
+ cur_file_ofs = MZ_MAX((mz_int64)pZip->m_archive_size - (mz_int64)sizeof(buf_u32), 0);
+ for (;;)
+ {
+ int i, n = (int)MZ_MIN(sizeof(buf_u32), pZip->m_archive_size - cur_file_ofs);
+
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pBuf, n) != (mz_uint)n)
+ return MZ_FALSE;
+
+ for (i = n - 4; i >= 0; --i)
+ {
+ mz_uint s = MZ_READ_LE32(pBuf + i);
+ if (s == record_sig)
+ {
+ if ((pZip->m_archive_size - (cur_file_ofs + i)) >= record_size)
+ break;
+ }
+ }
+
+ if (i >= 0)
+ {
+ cur_file_ofs += i;
+ break;
+ }
+
+ /* Give up if we've searched the entire file, or we've gone back "too far" (~64kb) */
+ if ((!cur_file_ofs) || ((pZip->m_archive_size - cur_file_ofs) >= (MZ_UINT16_MAX + record_size)))
+ return MZ_FALSE;
+
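+ /* Step back by just under a full buffer; the 3-byte overlap guarantees that a 4-byte
+ signature straddling two reads is still seen by the scan above. */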
+ cur_file_ofs = MZ_MAX(cur_file_ofs - (sizeof(buf_u32) - 3), 0);
+ }
+
+ *pOfs = cur_file_ofs;
+ return MZ_TRUE;
+}
+
+static mz_bool mz_zip_reader_read_central_dir(mz_zip_archive *pZip, mz_uint flags)
+{
+ mz_uint cdir_size = 0, cdir_entries_on_this_disk = 0, num_this_disk = 0, cdir_disk_index = 0;
+ mz_uint64 cdir_ofs = 0;
+ mz_int64 cur_file_ofs = 0;
+ const mz_uint8 *p;
+
+ mz_uint32 buf_u32[4096 / sizeof(mz_uint32)];
+ mz_uint8 *pBuf = (mz_uint8 *)buf_u32;
+ mz_bool sort_central_dir = ((flags & MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY) == 0);
+ mz_uint32 zip64_end_of_central_dir_locator_u32[(MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE + sizeof(mz_uint32) - 1) / sizeof(mz_uint32)];
+ mz_uint8 *pZip64_locator = (mz_uint8 *)zip64_end_of_central_dir_locator_u32;
+
+ mz_uint32 zip64_end_of_central_dir_header_u32[(MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) / sizeof(mz_uint32)];
+ mz_uint8 *pZip64_end_of_central_dir = (mz_uint8 *)zip64_end_of_central_dir_header_u32;
+
+ mz_uint64 zip64_end_of_central_dir_ofs = 0;
+
+ /* Basic sanity checks - reject files which are too small, and check the first 4 bytes of the file to make sure a local header is there. */
+ if (pZip->m_archive_size < MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_NOT_AN_ARCHIVE);
+
+ if (!mz_zip_reader_locate_header_sig(pZip, MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIG, MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE, &cur_file_ofs))
+ return mz_zip_set_error(pZip, MZ_ZIP_FAILED_FINDING_CENTRAL_DIR);
+
+ /* Read and verify the end of central directory record. */
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pBuf, MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE) != MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+
+ if (MZ_READ_LE32(pBuf + MZ_ZIP_ECDH_SIG_OFS) != MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIG)
+ return mz_zip_set_error(pZip, MZ_ZIP_NOT_AN_ARCHIVE);
+
+ if (cur_file_ofs >= (MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE + MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE))
+ {
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs - MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE, pZip64_locator, MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE) == MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE)
+ {
+ if (MZ_READ_LE32(pZip64_locator + MZ_ZIP64_ECDL_SIG_OFS) == MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIG)
+ {
+ zip64_end_of_central_dir_ofs = MZ_READ_LE64(pZip64_locator + MZ_ZIP64_ECDL_REL_OFS_TO_ZIP64_ECDR_OFS);
+ if (zip64_end_of_central_dir_ofs > (pZip->m_archive_size - MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE))
+ return mz_zip_set_error(pZip, MZ_ZIP_NOT_AN_ARCHIVE);
+
+ if (pZip->m_pRead(pZip->m_pIO_opaque, zip64_end_of_central_dir_ofs, pZip64_end_of_central_dir, MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE) == MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE)
+ {
+ if (MZ_READ_LE32(pZip64_end_of_central_dir + MZ_ZIP64_ECDH_SIG_OFS) == MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIG)
+ {
+ pZip->m_pState->m_zip64 = MZ_TRUE;
+ }
+ }
+ }
+ }
+ }
+
+ pZip->m_total_files = MZ_READ_LE16(pBuf + MZ_ZIP_ECDH_CDIR_TOTAL_ENTRIES_OFS);
+ cdir_entries_on_this_disk = MZ_READ_LE16(pBuf + MZ_ZIP_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS);
+ num_this_disk = MZ_READ_LE16(pBuf + MZ_ZIP_ECDH_NUM_THIS_DISK_OFS);
+ cdir_disk_index = MZ_READ_LE16(pBuf + MZ_ZIP_ECDH_NUM_DISK_CDIR_OFS);
+ cdir_size = MZ_READ_LE32(pBuf + MZ_ZIP_ECDH_CDIR_SIZE_OFS);
+ cdir_ofs = MZ_READ_LE32(pBuf + MZ_ZIP_ECDH_CDIR_OFS_OFS);
+
+ if (pZip->m_pState->m_zip64)
+ {
+ mz_uint32 zip64_total_num_of_disks = MZ_READ_LE32(pZip64_locator + MZ_ZIP64_ECDL_TOTAL_NUMBER_OF_DISKS_OFS);
+ mz_uint64 zip64_cdir_total_entries = MZ_READ_LE64(pZip64_end_of_central_dir + MZ_ZIP64_ECDH_CDIR_TOTAL_ENTRIES_OFS);
+ mz_uint64 zip64_cdir_total_entries_on_this_disk = MZ_READ_LE64(pZip64_end_of_central_dir + MZ_ZIP64_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS);
+ mz_uint64 zip64_size_of_end_of_central_dir_record = MZ_READ_LE64(pZip64_end_of_central_dir + MZ_ZIP64_ECDH_SIZE_OF_RECORD_OFS);
+ mz_uint64 zip64_size_of_central_directory = MZ_READ_LE64(pZip64_end_of_central_dir + MZ_ZIP64_ECDH_CDIR_SIZE_OFS);
+
+ if (zip64_size_of_end_of_central_dir_record < (MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE - 12))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ if (zip64_total_num_of_disks != 1U)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_MULTIDISK);
+
+ /* Check for miniz's practical limits */
+ if (zip64_cdir_total_entries > MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+
+ pZip->m_total_files = (mz_uint32)zip64_cdir_total_entries;
+
+ if (zip64_cdir_total_entries_on_this_disk > MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+
+ cdir_entries_on_this_disk = (mz_uint32)zip64_cdir_total_entries_on_this_disk;
+
+ /* Check for miniz's current practical limits (sorry, this should be enough for millions of files) */
+ if (zip64_size_of_central_directory > MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_CDIR_SIZE);
+
+ cdir_size = (mz_uint32)zip64_size_of_central_directory;
+
+ num_this_disk = MZ_READ_LE32(pZip64_end_of_central_dir + MZ_ZIP64_ECDH_NUM_THIS_DISK_OFS);
+
+ cdir_disk_index = MZ_READ_LE32(pZip64_end_of_central_dir + MZ_ZIP64_ECDH_NUM_DISK_CDIR_OFS);
+
+ cdir_ofs = MZ_READ_LE64(pZip64_end_of_central_dir + MZ_ZIP64_ECDH_CDIR_OFS_OFS);
+ }
+
+ if (pZip->m_total_files != cdir_entries_on_this_disk)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_MULTIDISK);
+
+ if (((num_this_disk | cdir_disk_index) != 0) && ((num_this_disk != 1) || (cdir_disk_index != 1)))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_MULTIDISK);
+
+ if (cdir_size < pZip->m_total_files * MZ_ZIP_CENTRAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ if ((cdir_ofs + (mz_uint64)cdir_size) > pZip->m_archive_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ pZip->m_central_directory_file_ofs = cdir_ofs;
+
+ if (pZip->m_total_files)
+ {
+ mz_uint i, n;
+ /* Read the entire central directory into a heap block, and allocate another heap block to hold the unsorted central dir file record offsets, and possibly another to hold the sorted indices. */
+ if ((!mz_zip_array_resize(pZip, &pZip->m_pState->m_central_dir, cdir_size, MZ_FALSE)) ||
+ (!mz_zip_array_resize(pZip, &pZip->m_pState->m_central_dir_offsets, pZip->m_total_files, MZ_FALSE)))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ if (sort_central_dir)
+ {
+ if (!mz_zip_array_resize(pZip, &pZip->m_pState->m_sorted_central_dir_offsets, pZip->m_total_files, MZ_FALSE))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cdir_ofs, pZip->m_pState->m_central_dir.m_p, cdir_size) != cdir_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+
+ /* Now create an index into the central directory file records, do some basic sanity checking on each record */
+ p = (const mz_uint8 *)pZip->m_pState->m_central_dir.m_p;
+ for (n = cdir_size, i = 0; i < pZip->m_total_files; ++i)
+ {
+ mz_uint total_header_size, disk_index, bit_flags, filename_size, ext_data_size;
+ mz_uint64 comp_size, decomp_size, local_header_ofs;
+
+ if ((n < MZ_ZIP_CENTRAL_DIR_HEADER_SIZE) || (MZ_READ_LE32(p) != MZ_ZIP_CENTRAL_DIR_HEADER_SIG))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir_offsets, mz_uint32, i) = (mz_uint32)(p - (const mz_uint8 *)pZip->m_pState->m_central_dir.m_p);
+
+ if (sort_central_dir)
+ MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_sorted_central_dir_offsets, mz_uint32, i) = i;
+
+ comp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS);
+ decomp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS);
+ local_header_ofs = MZ_READ_LE32(p + MZ_ZIP_CDH_LOCAL_HEADER_OFS);
+ filename_size = MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+ ext_data_size = MZ_READ_LE16(p + MZ_ZIP_CDH_EXTRA_LEN_OFS);
+
+ if ((!pZip->m_pState->m_zip64_has_extended_info_fields) &&
+ (ext_data_size) &&
+ (MZ_MAX(MZ_MAX(comp_size, decomp_size), local_header_ofs) == MZ_UINT32_MAX))
+ {
+ /* Attempt to find zip64 extended information field in the entry's extra data */
+ mz_uint32 extra_size_remaining = ext_data_size;
+
+ if (extra_size_remaining)
+ {
+ const mz_uint8 *pExtra_data;
+ void* buf = NULL;
+
+ if (MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + filename_size + ext_data_size > n)
+ {
+ buf = MZ_MALLOC(ext_data_size);
+ if (buf == NULL)
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cdir_ofs + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + filename_size, buf, ext_data_size) != ext_data_size)
+ {
+ MZ_FREE(buf);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ }
+
+ pExtra_data = (mz_uint8*)buf;
+ }
+ else
+ {
+ pExtra_data = p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + filename_size;
+ }
+
+ do
+ {
+ mz_uint32 field_id;
+ mz_uint32 field_data_size;
+
+ if (extra_size_remaining < (sizeof(mz_uint16) * 2))
+ {
+ MZ_FREE(buf);
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ }
+
+ field_id = MZ_READ_LE16(pExtra_data);
+ field_data_size = MZ_READ_LE16(pExtra_data + sizeof(mz_uint16));
+
+ if ((field_data_size + sizeof(mz_uint16) * 2) > extra_size_remaining)
+ {
+ MZ_FREE(buf);
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ }
+
+ if (field_id == MZ_ZIP64_EXTENDED_INFORMATION_FIELD_HEADER_ID)
+ {
+ /* Ok, the archive didn't have any zip64 headers but it uses a zip64 extended information field so mark it as zip64 anyway (this can occur with infozip's zip util when it reads compressed files from stdin). */
+ pZip->m_pState->m_zip64 = MZ_TRUE;
+ pZip->m_pState->m_zip64_has_extended_info_fields = MZ_TRUE;
+ break;
+ }
+
+ pExtra_data += sizeof(mz_uint16) * 2 + field_data_size;
+ extra_size_remaining = extra_size_remaining - sizeof(mz_uint16) * 2 - field_data_size;
+ } while (extra_size_remaining);
+
+ MZ_FREE(buf);
+ }
+ }
+
+ /* I've seen archives that aren't marked as zip64 but that use zip64 ext data, argh */
+ if ((comp_size != MZ_UINT32_MAX) && (decomp_size != MZ_UINT32_MAX))
+ {
+ if (((!MZ_READ_LE32(p + MZ_ZIP_CDH_METHOD_OFS)) && (decomp_size != comp_size)) || (decomp_size && !comp_size))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ }
+
+ disk_index = MZ_READ_LE16(p + MZ_ZIP_CDH_DISK_START_OFS);
+ if ((disk_index == MZ_UINT16_MAX) || ((disk_index != num_this_disk) && (disk_index != 1)))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_MULTIDISK);
+
+ if (comp_size != MZ_UINT32_MAX)
+ {
+ if (((mz_uint64)MZ_READ_LE32(p + MZ_ZIP_CDH_LOCAL_HEADER_OFS) + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + comp_size) > pZip->m_archive_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ }
+
+ bit_flags = MZ_READ_LE16(p + MZ_ZIP_CDH_BIT_FLAG_OFS);
+ if (bit_flags & MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_LOCAL_DIR_IS_MASKED)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_ENCRYPTION);
+
+ if ((total_header_size = MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS) + MZ_READ_LE16(p + MZ_ZIP_CDH_EXTRA_LEN_OFS) + MZ_READ_LE16(p + MZ_ZIP_CDH_COMMENT_LEN_OFS)) > n)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ n -= total_header_size;
+ p += total_header_size;
+ }
+ }
+
+ if (sort_central_dir)
+ mz_zip_reader_sort_central_dir_offsets_by_filename(pZip);
+
+ return MZ_TRUE;
+}
+
+void mz_zip_zero_struct(mz_zip_archive *pZip)
+{
+ if (pZip)
+ MZ_CLEAR_OBJ(*pZip);
+}
+
+static mz_bool mz_zip_reader_end_internal(mz_zip_archive *pZip, mz_bool set_last_error)
+{
+ mz_bool status = MZ_TRUE;
+
+ if (!pZip)
+ return MZ_FALSE;
+
+ if ((!pZip->m_pState) || (!pZip->m_pAlloc) || (!pZip->m_pFree) || (pZip->m_zip_mode != MZ_ZIP_MODE_READING))
+ {
+ if (set_last_error)
+ pZip->m_last_error = MZ_ZIP_INVALID_PARAMETER;
+
+ return MZ_FALSE;
+ }
+
+ if (pZip->m_pState)
+ {
+ mz_zip_internal_state *pState = pZip->m_pState;
+ pZip->m_pState = NULL;
+
+ mz_zip_array_clear(pZip, &pState->m_central_dir);
+ mz_zip_array_clear(pZip, &pState->m_central_dir_offsets);
+ mz_zip_array_clear(pZip, &pState->m_sorted_central_dir_offsets);
+
+#ifndef MINIZ_NO_STDIO
+ if (pState->m_pFile)
+ {
+ if (pZip->m_zip_type == MZ_ZIP_TYPE_FILE)
+ {
+ if (MZ_FCLOSE(pState->m_pFile) == EOF)
+ {
+ if (set_last_error)
+ pZip->m_last_error = MZ_ZIP_FILE_CLOSE_FAILED;
+ status = MZ_FALSE;
+ }
+ }
+ pState->m_pFile = NULL;
+ }
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ }
+ pZip->m_zip_mode = MZ_ZIP_MODE_INVALID;
+
+ return status;
+}
+
+mz_bool mz_zip_reader_end(mz_zip_archive *pZip)
+{
+ return mz_zip_reader_end_internal(pZip, MZ_TRUE);
+}
+
+mz_bool mz_zip_reader_init(mz_zip_archive *pZip, mz_uint64 size, mz_uint flags)
+{
+ if ((!pZip) || (!pZip->m_pRead))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (!mz_zip_reader_init_internal(pZip, flags))
+ return MZ_FALSE;
+
+ pZip->m_zip_type = MZ_ZIP_TYPE_USER;
+ pZip->m_archive_size = size;
+
+ if (!mz_zip_reader_read_central_dir(pZip, flags))
+ {
+ mz_zip_reader_end_internal(pZip, MZ_FALSE);
+ return MZ_FALSE;
+ }
+
+ return MZ_TRUE;
+}
+
+static size_t mz_zip_mem_read_func(void *pOpaque, mz_uint64 file_ofs, void *pBuf, size_t n)
+{
+ mz_zip_archive *pZip = (mz_zip_archive *)pOpaque;
+ size_t s = (file_ofs >= pZip->m_archive_size) ? 0 : (size_t)MZ_MIN(pZip->m_archive_size - file_ofs, n);
+ memcpy(pBuf, (const mz_uint8 *)pZip->m_pState->m_pMem + file_ofs, s);
+ return s;
+}
+
+mz_bool mz_zip_reader_init_mem(mz_zip_archive *pZip, const void *pMem, size_t size, mz_uint flags)
+{
+ if (!pMem)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (size < MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_NOT_AN_ARCHIVE);
+
+ if (!mz_zip_reader_init_internal(pZip, flags))
+ return MZ_FALSE;
+
+ pZip->m_zip_type = MZ_ZIP_TYPE_MEMORY;
+ pZip->m_archive_size = size;
+ pZip->m_pRead = mz_zip_mem_read_func;
+ pZip->m_pIO_opaque = pZip;
+ pZip->m_pNeeds_keepalive = NULL;
+
+#ifdef __cplusplus
+ pZip->m_pState->m_pMem = const_cast<void *>(pMem);
+#else
+ pZip->m_pState->m_pMem = (void *)pMem;
+#endif
+
+ pZip->m_pState->m_mem_size = size;
+
+ if (!mz_zip_reader_read_central_dir(pZip, flags))
+ {
+ mz_zip_reader_end_internal(pZip, MZ_FALSE);
+ return MZ_FALSE;
+ }
+
+ return MZ_TRUE;
+}
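+
+/* Typical read-from-memory usage of the API above (an illustrative sketch, not part
+ of miniz; `data`/`data_size` are hypothetical and error handling is abbreviated):
+
+ mz_zip_archive zip;
+ mz_zip_zero_struct(&zip);
+ if (mz_zip_reader_init_mem(&zip, data, data_size, 0))
+ {
+ size_t size;
+ void *p = mz_zip_reader_extract_file_to_heap(&zip, "file.txt", &size, 0);
+ if (p) { ...use size bytes at p...; zip.m_pFree(zip.m_pAlloc_opaque, p); }
+ mz_zip_reader_end(&zip);
+ }
+*/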
+
+#ifndef MINIZ_NO_STDIO
+static size_t mz_zip_file_read_func(void *pOpaque, mz_uint64 file_ofs, void *pBuf, size_t n)
+{
+ mz_zip_archive *pZip = (mz_zip_archive *)pOpaque;
+ mz_int64 cur_ofs = MZ_FTELL64(pZip->m_pState->m_pFile);
+
+ file_ofs += pZip->m_pState->m_file_archive_start_ofs;
+
+ if (((mz_int64)file_ofs < 0) || (((cur_ofs != (mz_int64)file_ofs)) && (MZ_FSEEK64(pZip->m_pState->m_pFile, (mz_int64)file_ofs, SEEK_SET))))
+ return 0;
+
+ return MZ_FREAD(pBuf, 1, n, pZip->m_pState->m_pFile);
+}
+
+mz_bool mz_zip_reader_init_file(mz_zip_archive *pZip, const char *pFilename, mz_uint32 flags)
+{
+ return mz_zip_reader_init_file_v2(pZip, pFilename, flags, 0, 0);
+}
+
+mz_bool mz_zip_reader_init_file_v2(mz_zip_archive *pZip, const char *pFilename, mz_uint flags, mz_uint64 file_start_ofs, mz_uint64 archive_size)
+{
+ mz_uint64 file_size;
+ MZ_FILE *pFile;
+
+ if ((!pZip) || (!pFilename) || ((archive_size) && (archive_size < MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ pFile = MZ_FOPEN(pFilename, "rb");
+ if (!pFile)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_OPEN_FAILED);
+
+ file_size = archive_size;
+ if (!file_size)
+ {
+ if (MZ_FSEEK64(pFile, 0, SEEK_END))
+ {
+ MZ_FCLOSE(pFile);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_SEEK_FAILED);
+ }
+
+ file_size = MZ_FTELL64(pFile);
+ }
+
+ /* TODO: Better sanity check archive_size and the # of actual remaining bytes */
+
+ if (file_size < MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)
+ {
+ MZ_FCLOSE(pFile);
+ return mz_zip_set_error(pZip, MZ_ZIP_NOT_AN_ARCHIVE);
+ }
+
+ if (!mz_zip_reader_init_internal(pZip, flags))
+ {
+ MZ_FCLOSE(pFile);
+ return MZ_FALSE;
+ }
+
+ pZip->m_zip_type = MZ_ZIP_TYPE_FILE;
+ pZip->m_pRead = mz_zip_file_read_func;
+ pZip->m_pIO_opaque = pZip;
+ pZip->m_pState->m_pFile = pFile;
+ pZip->m_archive_size = file_size;
+ pZip->m_pState->m_file_archive_start_ofs = file_start_ofs;
+
+ if (!mz_zip_reader_read_central_dir(pZip, flags))
+ {
+ mz_zip_reader_end_internal(pZip, MZ_FALSE);
+ return MZ_FALSE;
+ }
+
+ return MZ_TRUE;
+}
+
+mz_bool mz_zip_reader_init_cfile(mz_zip_archive *pZip, MZ_FILE *pFile, mz_uint64 archive_size, mz_uint flags)
+{
+ mz_uint64 cur_file_ofs;
+
+ if ((!pZip) || (!pFile))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_OPEN_FAILED);
+
+ cur_file_ofs = MZ_FTELL64(pFile);
+
+ if (!archive_size)
+ {
+ if (MZ_FSEEK64(pFile, 0, SEEK_END))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_SEEK_FAILED);
+
+ archive_size = MZ_FTELL64(pFile) - cur_file_ofs;
+
+ if (archive_size < MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_NOT_AN_ARCHIVE);
+ }
+
+ if (!mz_zip_reader_init_internal(pZip, flags))
+ return MZ_FALSE;
+
+ pZip->m_zip_type = MZ_ZIP_TYPE_CFILE;
+ pZip->m_pRead = mz_zip_file_read_func;
+
+ pZip->m_pIO_opaque = pZip;
+ pZip->m_pState->m_pFile = pFile;
+ pZip->m_archive_size = archive_size;
+ pZip->m_pState->m_file_archive_start_ofs = cur_file_ofs;
+
+ if (!mz_zip_reader_read_central_dir(pZip, flags))
+ {
+ mz_zip_reader_end_internal(pZip, MZ_FALSE);
+ return MZ_FALSE;
+ }
+
+ return MZ_TRUE;
+}
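+
+/* Usage note for the stdio readers above (illustrative): mz_zip_reader_init_cfile()
+ treats the FILE's current position as the start of the archive, so a zip embedded at
+ an offset inside a larger file can be read by seeking to that offset first (or by
+ passing file_start_ofs to mz_zip_reader_init_file_v2()). */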
+
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+static MZ_FORCEINLINE const mz_uint8 *mz_zip_get_cdh(mz_zip_archive *pZip, mz_uint file_index)
+{
+ if ((!pZip) || (!pZip->m_pState) || (file_index >= pZip->m_total_files))
+ return NULL;
+ return &MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir, mz_uint8, MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir_offsets, mz_uint32, file_index));
+}
+
+mz_bool mz_zip_reader_is_file_encrypted(mz_zip_archive *pZip, mz_uint file_index)
+{
+ mz_uint m_bit_flag;
+ const mz_uint8 *p = mz_zip_get_cdh(pZip, file_index);
+ if (!p)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ return MZ_FALSE;
+ }
+
+ m_bit_flag = MZ_READ_LE16(p + MZ_ZIP_CDH_BIT_FLAG_OFS);
+ return (m_bit_flag & (MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_IS_ENCRYPTED | MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_USES_STRONG_ENCRYPTION)) != 0;
+}
+
+mz_bool mz_zip_reader_is_file_supported(mz_zip_archive *pZip, mz_uint file_index)
+{
+ mz_uint bit_flag;
+ mz_uint method;
+
+ const mz_uint8 *p = mz_zip_get_cdh(pZip, file_index);
+ if (!p)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ return MZ_FALSE;
+ }
+
+ method = MZ_READ_LE16(p + MZ_ZIP_CDH_METHOD_OFS);
+ bit_flag = MZ_READ_LE16(p + MZ_ZIP_CDH_BIT_FLAG_OFS);
+
+ if ((method != 0) && (method != MZ_DEFLATED))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_METHOD);
+ return MZ_FALSE;
+ }
+
+ if (bit_flag & (MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_IS_ENCRYPTED | MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_USES_STRONG_ENCRYPTION))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_ENCRYPTION);
+ return MZ_FALSE;
+ }
+
+ if (bit_flag & MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_COMPRESSED_PATCH_FLAG)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_FEATURE);
+ return MZ_FALSE;
+ }
+
+ return MZ_TRUE;
+}
+
+mz_bool mz_zip_reader_is_file_a_directory(mz_zip_archive *pZip, mz_uint file_index)
+{
+ mz_uint filename_len, attribute_mapping_id, external_attr;
+ const mz_uint8 *p = mz_zip_get_cdh(pZip, file_index);
+ if (!p)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ return MZ_FALSE;
+ }
+
+ filename_len = MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+ if (filename_len)
+ {
+ if (*(p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + filename_len - 1) == '/')
+ return MZ_TRUE;
+ }
+
+ /* Bugfix: This code was also checking if the internal attribute was non-zero, which wasn't correct. */
+ /* Most/all zip writers (hopefully) set DOS file/directory attributes in the low 16-bits, so check for the DOS directory flag and ignore the source OS ID in the created by field. */
+ /* FIXME: Remove this check? Is it necessary - we already check the filename. */
+ attribute_mapping_id = MZ_READ_LE16(p + MZ_ZIP_CDH_VERSION_MADE_BY_OFS) >> 8;
+ (void)attribute_mapping_id;
+
+ external_attr = MZ_READ_LE32(p + MZ_ZIP_CDH_EXTERNAL_ATTR_OFS);
+ if ((external_attr & MZ_ZIP_DOS_DIR_ATTRIBUTE_BITFLAG) != 0)
+ {
+ return MZ_TRUE;
+ }
+
+ return MZ_FALSE;
+}
+
+static mz_bool mz_zip_file_stat_internal(mz_zip_archive *pZip, mz_uint file_index, const mz_uint8 *pCentral_dir_header, mz_zip_archive_file_stat *pStat, mz_bool *pFound_zip64_extra_data)
+{
+ mz_uint n;
+ const mz_uint8 *p = pCentral_dir_header;
+
+ if (pFound_zip64_extra_data)
+ *pFound_zip64_extra_data = MZ_FALSE;
+
+ if ((!p) || (!pStat))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ /* Extract fields from the central directory record. */
+ pStat->m_file_index = file_index;
+ pStat->m_central_dir_ofs = MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir_offsets, mz_uint32, file_index);
+ pStat->m_version_made_by = MZ_READ_LE16(p + MZ_ZIP_CDH_VERSION_MADE_BY_OFS);
+ pStat->m_version_needed = MZ_READ_LE16(p + MZ_ZIP_CDH_VERSION_NEEDED_OFS);
+ pStat->m_bit_flag = MZ_READ_LE16(p + MZ_ZIP_CDH_BIT_FLAG_OFS);
+ pStat->m_method = MZ_READ_LE16(p + MZ_ZIP_CDH_METHOD_OFS);
+#ifndef MINIZ_NO_TIME
+ pStat->m_time = mz_zip_dos_to_time_t(MZ_READ_LE16(p + MZ_ZIP_CDH_FILE_TIME_OFS), MZ_READ_LE16(p + MZ_ZIP_CDH_FILE_DATE_OFS));
+#endif
+ pStat->m_crc32 = MZ_READ_LE32(p + MZ_ZIP_CDH_CRC32_OFS);
+ pStat->m_comp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS);
+ pStat->m_uncomp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS);
+ pStat->m_internal_attr = MZ_READ_LE16(p + MZ_ZIP_CDH_INTERNAL_ATTR_OFS);
+ pStat->m_external_attr = MZ_READ_LE32(p + MZ_ZIP_CDH_EXTERNAL_ATTR_OFS);
+ pStat->m_local_header_ofs = MZ_READ_LE32(p + MZ_ZIP_CDH_LOCAL_HEADER_OFS);
+
+ /* Copy as much of the filename and comment as possible. */
+ n = MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+ n = MZ_MIN(n, MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE - 1);
+ memcpy(pStat->m_filename, p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE, n);
+ pStat->m_filename[n] = '\0';
+
+ n = MZ_READ_LE16(p + MZ_ZIP_CDH_COMMENT_LEN_OFS);
+ n = MZ_MIN(n, MZ_ZIP_MAX_ARCHIVE_FILE_COMMENT_SIZE - 1);
+ pStat->m_comment_size = n;
+ memcpy(pStat->m_comment, p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS) + MZ_READ_LE16(p + MZ_ZIP_CDH_EXTRA_LEN_OFS), n);
+ pStat->m_comment[n] = '\0';
+
+ /* Set some flags for convenience */
+ pStat->m_is_directory = mz_zip_reader_is_file_a_directory(pZip, file_index);
+ pStat->m_is_encrypted = mz_zip_reader_is_file_encrypted(pZip, file_index);
+ pStat->m_is_supported = mz_zip_reader_is_file_supported(pZip, file_index);
+
+ /* See if we need to read any zip64 extended information fields. */
+ /* Confusingly, these zip64 fields can be present even on non-zip64 archives (Debian zip on huge files from stdin piped to stdout creates them). */
+ if (MZ_MAX(MZ_MAX(pStat->m_comp_size, pStat->m_uncomp_size), pStat->m_local_header_ofs) == MZ_UINT32_MAX)
+ {
+ /* Attempt to find zip64 extended information field in the entry's extra data */
+ mz_uint32 extra_size_remaining = MZ_READ_LE16(p + MZ_ZIP_CDH_EXTRA_LEN_OFS);
+
+ if (extra_size_remaining)
+ {
+ const mz_uint8 *pExtra_data = p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+
+ do
+ {
+ mz_uint32 field_id;
+ mz_uint32 field_data_size;
+
+ if (extra_size_remaining < (sizeof(mz_uint16) * 2))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ field_id = MZ_READ_LE16(pExtra_data);
+ field_data_size = MZ_READ_LE16(pExtra_data + sizeof(mz_uint16));
+
+ if ((field_data_size + sizeof(mz_uint16) * 2) > extra_size_remaining)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ if (field_id == MZ_ZIP64_EXTENDED_INFORMATION_FIELD_HEADER_ID)
+ {
+ const mz_uint8 *pField_data = pExtra_data + sizeof(mz_uint16) * 2;
+ mz_uint32 field_data_remaining = field_data_size;
+
+ if (pFound_zip64_extra_data)
+ *pFound_zip64_extra_data = MZ_TRUE;
+
+ if (pStat->m_uncomp_size == MZ_UINT32_MAX)
+ {
+ if (field_data_remaining < sizeof(mz_uint64))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ pStat->m_uncomp_size = MZ_READ_LE64(pField_data);
+ pField_data += sizeof(mz_uint64);
+ field_data_remaining -= sizeof(mz_uint64);
+ }
+
+ if (pStat->m_comp_size == MZ_UINT32_MAX)
+ {
+ if (field_data_remaining < sizeof(mz_uint64))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ pStat->m_comp_size = MZ_READ_LE64(pField_data);
+ pField_data += sizeof(mz_uint64);
+ field_data_remaining -= sizeof(mz_uint64);
+ }
+
+ if (pStat->m_local_header_ofs == MZ_UINT32_MAX)
+ {
+ if (field_data_remaining < sizeof(mz_uint64))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ pStat->m_local_header_ofs = MZ_READ_LE64(pField_data);
+ pField_data += sizeof(mz_uint64);
+ field_data_remaining -= sizeof(mz_uint64);
+ }
+
+ break;
+ }
+
+ pExtra_data += sizeof(mz_uint16) * 2 + field_data_size;
+ extra_size_remaining = extra_size_remaining - sizeof(mz_uint16) * 2 - field_data_size;
+ } while (extra_size_remaining);
+ }
+ }
+
+ return MZ_TRUE;
+}
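+
+/* Layout of each record walked by the extra-data loop above (per the zip appnote):
+ 2 bytes field id (0x0001 = zip64 extended information)
+ 2 bytes field data size (number of data bytes that follow)
+ n bytes field data (for zip64: uncomp size, comp size, local header ofs, each
+ 8 bytes and present only for values stored as 0xFFFFFFFF) */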
+
+static MZ_FORCEINLINE mz_bool mz_zip_string_equal(const char *pA, const char *pB, mz_uint len, mz_uint flags)
+{
+ mz_uint i;
+ if (flags & MZ_ZIP_FLAG_CASE_SENSITIVE)
+ return 0 == memcmp(pA, pB, len);
+ for (i = 0; i < len; ++i)
+ if (MZ_TOLOWER(pA[i]) != MZ_TOLOWER(pB[i]))
+ return MZ_FALSE;
+ return MZ_TRUE;
+}
+
+static MZ_FORCEINLINE int mz_zip_filename_compare(const mz_zip_array *pCentral_dir_array, const mz_zip_array *pCentral_dir_offsets, mz_uint l_index, const char *pR, mz_uint r_len)
+{
+ const mz_uint8 *pL = &MZ_ZIP_ARRAY_ELEMENT(pCentral_dir_array, mz_uint8, MZ_ZIP_ARRAY_ELEMENT(pCentral_dir_offsets, mz_uint32, l_index)), *pE;
+ mz_uint l_len = MZ_READ_LE16(pL + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+ mz_uint8 l = 0, r = 0;
+ pL += MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+ pE = pL + MZ_MIN(l_len, r_len);
+ while (pL < pE)
+ {
+ if ((l = MZ_TOLOWER(*pL)) != (r = MZ_TOLOWER(*pR)))
+ break;
+ pL++;
+ pR++;
+ }
+ return (pL == pE) ? (int)(l_len - r_len) : (l - r);
+}
+
+static mz_bool mz_zip_locate_file_binary_search(mz_zip_archive *pZip, const char *pFilename, mz_uint32 *pIndex)
+{
+ mz_zip_internal_state *pState = pZip->m_pState;
+ const mz_zip_array *pCentral_dir_offsets = &pState->m_central_dir_offsets;
+ const mz_zip_array *pCentral_dir = &pState->m_central_dir;
+ mz_uint32 *pIndices = &MZ_ZIP_ARRAY_ELEMENT(&pState->m_sorted_central_dir_offsets, mz_uint32, 0);
+ const uint32_t size = pZip->m_total_files;
+ const mz_uint filename_len = (mz_uint)strlen(pFilename);
+
+ if (pIndex)
+ *pIndex = 0;
+
+ if (size)
+ {
+ /* yes I could use uint32_t's, but then we would have to add some special case checks in the loop, argh, and */
+ /* honestly the major expense here on 32-bit CPUs will still be the filename compare */
+ mz_int64 l = 0, h = (mz_int64)size - 1;
+
+ while (l <= h)
+ {
+ mz_int64 m = l + ((h - l) >> 1);
+ uint32_t file_index = pIndices[(uint32_t)m];
+
+ int comp = mz_zip_filename_compare(pCentral_dir, pCentral_dir_offsets, file_index, pFilename, filename_len);
+ if (!comp)
+ {
+ if (pIndex)
+ *pIndex = file_index;
+ return MZ_TRUE;
+ }
+ else if (comp < 0)
+ l = m + 1;
+ else
+ h = m - 1;
+ }
+ }
+
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_NOT_FOUND);
+}
+
+int mz_zip_reader_locate_file(mz_zip_archive *pZip, const char *pName, const char *pComment, mz_uint flags)
+{
+ mz_uint32 index;
+ if (!mz_zip_reader_locate_file_v2(pZip, pName, pComment, flags, &index))
+ return -1;
+ else
+ return (int)index;
+}
+
+mz_bool mz_zip_reader_locate_file_v2(mz_zip_archive *pZip, const char *pName, const char *pComment, mz_uint flags, mz_uint32 *pIndex)
+{
+ mz_uint file_index;
+ size_t name_len, comment_len;
+
+ if (pIndex)
+ *pIndex = 0;
+
+ if ((!pZip) || (!pZip->m_pState) || (!pName))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ /* See if we can use a binary search */
+ if (((pZip->m_pState->m_init_flags & MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY) == 0) &&
+ (pZip->m_zip_mode == MZ_ZIP_MODE_READING) &&
+ ((flags & (MZ_ZIP_FLAG_IGNORE_PATH | MZ_ZIP_FLAG_CASE_SENSITIVE)) == 0) && (!pComment) && (pZip->m_pState->m_sorted_central_dir_offsets.m_size))
+ {
+ return mz_zip_locate_file_binary_search(pZip, pName, pIndex);
+ }
+
+ /* Locate the entry by scanning the entire central directory */
+ name_len = strlen(pName);
+ if (name_len > MZ_UINT16_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ comment_len = pComment ? strlen(pComment) : 0;
+ if (comment_len > MZ_UINT16_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ for (file_index = 0; file_index < pZip->m_total_files; file_index++)
+ {
+ const mz_uint8 *pHeader = &MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir, mz_uint8, MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir_offsets, mz_uint32, file_index));
+ mz_uint filename_len = MZ_READ_LE16(pHeader + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+ const char *pFilename = (const char *)pHeader + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+ if (filename_len < name_len)
+ continue;
+ if (comment_len)
+ {
+ mz_uint file_extra_len = MZ_READ_LE16(pHeader + MZ_ZIP_CDH_EXTRA_LEN_OFS), file_comment_len = MZ_READ_LE16(pHeader + MZ_ZIP_CDH_COMMENT_LEN_OFS);
+ const char *pFile_comment = pFilename + filename_len + file_extra_len;
+ if ((file_comment_len != comment_len) || (!mz_zip_string_equal(pComment, pFile_comment, file_comment_len, flags)))
+ continue;
+ }
+ if ((flags & MZ_ZIP_FLAG_IGNORE_PATH) && (filename_len))
+ {
+ int ofs = filename_len - 1;
+ do
+ {
+ if ((pFilename[ofs] == '/') || (pFilename[ofs] == '\\') || (pFilename[ofs] == ':'))
+ break;
+ } while (--ofs >= 0);
+ ofs++;
+ pFilename += ofs;
+ filename_len -= ofs;
+ }
+ if ((filename_len == name_len) && (mz_zip_string_equal(pName, pFilename, filename_len, flags)))
+ {
+ if (pIndex)
+ *pIndex = file_index;
+ return MZ_TRUE;
+ }
+ }
+
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_NOT_FOUND);
+}
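+
+/* Lookup sketch (illustrative): with default init flags the sorted-offset index is
+ built, so a comment-less, case-insensitive, full-path lookup takes the binary search
+ above; any other combination falls back to the linear central directory scan:
+
+ mz_uint32 idx;
+ if (mz_zip_reader_locate_file_v2(&zip, "dir/file.txt", NULL, 0, &idx))
+ ...idx is a valid file index...
+*/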
+
+mz_bool mz_zip_reader_extract_to_mem_no_alloc(mz_zip_archive *pZip, mz_uint file_index, void *pBuf, size_t buf_size, mz_uint flags, void *pUser_read_buf, size_t user_read_buf_size)
+{
+ int status = TINFL_STATUS_DONE;
+ mz_uint64 needed_size, cur_file_ofs, comp_remaining, out_buf_ofs = 0, read_buf_size, read_buf_ofs = 0, read_buf_avail;
+ mz_zip_archive_file_stat file_stat;
+ void *pRead_buf;
+ mz_uint32 local_header_u32[(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) / sizeof(mz_uint32)];
+ mz_uint8 *pLocal_header = (mz_uint8 *)local_header_u32;
+ tinfl_decompressor inflator;
+
+ if ((!pZip) || (!pZip->m_pState) || ((buf_size) && (!pBuf)) || ((user_read_buf_size) && (!pUser_read_buf)) || (!pZip->m_pRead))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (!mz_zip_reader_file_stat(pZip, file_index, &file_stat))
+ return MZ_FALSE;
+
+ /* A directory or zero length file */
+ if ((file_stat.m_is_directory) || (!file_stat.m_comp_size))
+ return MZ_TRUE;
+
+ /* Encryption and patch files are not supported. */
+ if (file_stat.m_bit_flag & (MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_IS_ENCRYPTED | MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_USES_STRONG_ENCRYPTION | MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_COMPRESSED_PATCH_FLAG))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_ENCRYPTION);
+
+ /* This function only supports decompressing stored and deflate. */
+ if ((!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) && (file_stat.m_method != 0) && (file_stat.m_method != MZ_DEFLATED))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_METHOD);
+
+ /* Ensure supplied output buffer is large enough. */
+ needed_size = (flags & MZ_ZIP_FLAG_COMPRESSED_DATA) ? file_stat.m_comp_size : file_stat.m_uncomp_size;
+ if (buf_size < needed_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_BUF_TOO_SMALL);
+
+ /* Read and parse the local directory entry. */
+ cur_file_ofs = file_stat.m_local_header_ofs;
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pLocal_header, MZ_ZIP_LOCAL_DIR_HEADER_SIZE) != MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+
+ if (MZ_READ_LE32(pLocal_header) != MZ_ZIP_LOCAL_DIR_HEADER_SIG)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ cur_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE + MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_FILENAME_LEN_OFS) + MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_EXTRA_LEN_OFS);
+ if ((cur_file_ofs + file_stat.m_comp_size) > pZip->m_archive_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ if ((flags & MZ_ZIP_FLAG_COMPRESSED_DATA) || (!file_stat.m_method))
+ {
+ /* The file is stored or the caller has requested the compressed data. */
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pBuf, (size_t)needed_size) != needed_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ if ((flags & MZ_ZIP_FLAG_COMPRESSED_DATA) == 0)
+ {
+ if (mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, (size_t)file_stat.m_uncomp_size) != file_stat.m_crc32)
+ return mz_zip_set_error(pZip, MZ_ZIP_CRC_CHECK_FAILED);
+ }
+#endif
+
+ return MZ_TRUE;
+ }
+
+ /* Decompress the file either directly from memory or from a file input buffer. */
+ tinfl_init(&inflator);
+
+ if (pZip->m_pState->m_pMem)
+ {
+ /* Read directly from the archive in memory. */
+ pRead_buf = (mz_uint8 *)pZip->m_pState->m_pMem + cur_file_ofs;
+ read_buf_size = read_buf_avail = file_stat.m_comp_size;
+ comp_remaining = 0;
+ }
+ else if (pUser_read_buf)
+ {
+ /* Use a user provided read buffer. */
+ if (!user_read_buf_size)
+ return MZ_FALSE;
+ pRead_buf = (mz_uint8 *)pUser_read_buf;
+ read_buf_size = user_read_buf_size;
+ read_buf_avail = 0;
+ comp_remaining = file_stat.m_comp_size;
+ }
+ else
+ {
+ /* Temporarily allocate a read buffer. */
+ read_buf_size = MZ_MIN(file_stat.m_comp_size, (mz_uint64)MZ_ZIP_MAX_IO_BUF_SIZE);
+ if (((sizeof(size_t) == sizeof(mz_uint32))) && (read_buf_size > 0x7FFFFFFF))
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+
+ if (NULL == (pRead_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, (size_t)read_buf_size)))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ read_buf_avail = 0;
+ comp_remaining = file_stat.m_comp_size;
+ }
+
+ do
+ {
+ /* The size_t cast here should be OK because we've verified that the output buffer is >= file_stat.m_uncomp_size above */
+ size_t in_buf_size, out_buf_size = (size_t)(file_stat.m_uncomp_size - out_buf_ofs);
+ if ((!read_buf_avail) && (!pZip->m_pState->m_pMem))
+ {
+ read_buf_avail = MZ_MIN(read_buf_size, comp_remaining);
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pRead_buf, (size_t)read_buf_avail) != read_buf_avail)
+ {
+ status = TINFL_STATUS_FAILED;
+ mz_zip_set_error(pZip, MZ_ZIP_DECOMPRESSION_FAILED);
+ break;
+ }
+ cur_file_ofs += read_buf_avail;
+ comp_remaining -= read_buf_avail;
+ read_buf_ofs = 0;
+ }
+ in_buf_size = (size_t)read_buf_avail;
+ status = tinfl_decompress(&inflator, (mz_uint8 *)pRead_buf + read_buf_ofs, &in_buf_size, (mz_uint8 *)pBuf, (mz_uint8 *)pBuf + out_buf_ofs, &out_buf_size, TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | (comp_remaining ? TINFL_FLAG_HAS_MORE_INPUT : 0));
+ read_buf_avail -= in_buf_size;
+ read_buf_ofs += in_buf_size;
+ out_buf_ofs += out_buf_size;
+ } while (status == TINFL_STATUS_NEEDS_MORE_INPUT);
+
+ if (status == TINFL_STATUS_DONE)
+ {
+ /* Make sure the entire file was decompressed, and check its CRC. */
+ if (out_buf_ofs != file_stat.m_uncomp_size)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_UNEXPECTED_DECOMPRESSED_SIZE);
+ status = TINFL_STATUS_FAILED;
+ }
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ else if (mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, (size_t)file_stat.m_uncomp_size) != file_stat.m_crc32)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_CRC_CHECK_FAILED);
+ status = TINFL_STATUS_FAILED;
+ }
+#endif
+ }
+
+ if ((!pZip->m_pState->m_pMem) && (!pUser_read_buf))
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+
+ return status == TINFL_STATUS_DONE;
+}
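+
+/* Note on the _no_alloc variant above: passing a pUser_read_buf (e.g. a stack or
+ static buffer) avoids any heap allocation during extraction; when the archive itself
+ lives in memory the data is inflated straight out of it and no read buffer is used. */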
+
+mz_bool mz_zip_reader_extract_file_to_mem_no_alloc(mz_zip_archive *pZip, const char *pFilename, void *pBuf, size_t buf_size, mz_uint flags, void *pUser_read_buf, size_t user_read_buf_size)
+{
+ mz_uint32 file_index;
+ if (!mz_zip_reader_locate_file_v2(pZip, pFilename, NULL, flags, &file_index))
+ return MZ_FALSE;
+ return mz_zip_reader_extract_to_mem_no_alloc(pZip, file_index, pBuf, buf_size, flags, pUser_read_buf, user_read_buf_size);
+}
+
+mz_bool mz_zip_reader_extract_to_mem(mz_zip_archive *pZip, mz_uint file_index, void *pBuf, size_t buf_size, mz_uint flags)
+{
+ return mz_zip_reader_extract_to_mem_no_alloc(pZip, file_index, pBuf, buf_size, flags, NULL, 0);
+}
+
+mz_bool mz_zip_reader_extract_file_to_mem(mz_zip_archive *pZip, const char *pFilename, void *pBuf, size_t buf_size, mz_uint flags)
+{
+ return mz_zip_reader_extract_file_to_mem_no_alloc(pZip, pFilename, pBuf, buf_size, flags, NULL, 0);
+}
+
+void *mz_zip_reader_extract_to_heap(mz_zip_archive *pZip, mz_uint file_index, size_t *pSize, mz_uint flags)
+{
+ mz_uint64 comp_size, uncomp_size, alloc_size;
+ const mz_uint8 *p = mz_zip_get_cdh(pZip, file_index);
+ void *pBuf;
+
+ if (pSize)
+ *pSize = 0;
+
+ if (!p)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ return NULL;
+ }
+
+ comp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS);
+ uncomp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS);
+
+ alloc_size = (flags & MZ_ZIP_FLAG_COMPRESSED_DATA) ? comp_size : uncomp_size;
+ if (((sizeof(size_t) == sizeof(mz_uint32))) && (alloc_size > 0x7FFFFFFF))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+ return NULL;
+ }
+
+ if (NULL == (pBuf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, (size_t)alloc_size)))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ return NULL;
+ }
+
+ if (!mz_zip_reader_extract_to_mem(pZip, file_index, pBuf, (size_t)alloc_size, flags))
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+ return NULL;
+ }
+
+ if (pSize)
+ *pSize = (size_t)alloc_size;
+ return pBuf;
+}
+
+void *mz_zip_reader_extract_file_to_heap(mz_zip_archive *pZip, const char *pFilename, size_t *pSize, mz_uint flags)
+{
+ mz_uint32 file_index;
+ if (!mz_zip_reader_locate_file_v2(pZip, pFilename, NULL, flags, &file_index))
+ {
+ if (pSize)
+ *pSize = 0;
+ return NULL;
+ }
+ return mz_zip_reader_extract_to_heap(pZip, file_index, pSize, flags);
+}
+
+mz_bool mz_zip_reader_extract_to_callback(mz_zip_archive *pZip, mz_uint file_index, mz_file_write_func pCallback, void *pOpaque, mz_uint flags)
+{
+ int status = TINFL_STATUS_DONE;
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ mz_uint file_crc32 = MZ_CRC32_INIT;
+#endif
+ mz_uint64 read_buf_size, read_buf_ofs = 0, read_buf_avail, comp_remaining, out_buf_ofs = 0, cur_file_ofs;
+ mz_zip_archive_file_stat file_stat;
+ void *pRead_buf = NULL;
+ void *pWrite_buf = NULL;
+ mz_uint32 local_header_u32[(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) / sizeof(mz_uint32)];
+ mz_uint8 *pLocal_header = (mz_uint8 *)local_header_u32;
+
+ if ((!pZip) || (!pZip->m_pState) || (!pCallback) || (!pZip->m_pRead))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (!mz_zip_reader_file_stat(pZip, file_index, &file_stat))
+ return MZ_FALSE;
+
+ /* A directory or zero length file */
+ if ((file_stat.m_is_directory) || (!file_stat.m_comp_size))
+ return MZ_TRUE;
+
+ /* Encryption and patch files are not supported. */
+ if (file_stat.m_bit_flag & (MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_IS_ENCRYPTED | MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_USES_STRONG_ENCRYPTION | MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_COMPRESSED_PATCH_FLAG))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_ENCRYPTION);
+
+ /* This function only supports decompressing stored and deflate. */
+ if ((!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) && (file_stat.m_method != 0) && (file_stat.m_method != MZ_DEFLATED))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_METHOD);
+
+ /* Read and do some minimal validation of the local directory entry (this doesn't crack the zip64 stuff, which we already have from the central dir) */
+ cur_file_ofs = file_stat.m_local_header_ofs;
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pLocal_header, MZ_ZIP_LOCAL_DIR_HEADER_SIZE) != MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+
+ if (MZ_READ_LE32(pLocal_header) != MZ_ZIP_LOCAL_DIR_HEADER_SIG)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ cur_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE + MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_FILENAME_LEN_OFS) + MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_EXTRA_LEN_OFS);
+ if ((cur_file_ofs + file_stat.m_comp_size) > pZip->m_archive_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ /* Decompress the file either directly from memory or from a file input buffer. */
+ if (pZip->m_pState->m_pMem)
+ {
+ pRead_buf = (mz_uint8 *)pZip->m_pState->m_pMem + cur_file_ofs;
+ read_buf_size = read_buf_avail = file_stat.m_comp_size;
+ comp_remaining = 0;
+ }
+ else
+ {
+ read_buf_size = MZ_MIN(file_stat.m_comp_size, (mz_uint64)MZ_ZIP_MAX_IO_BUF_SIZE);
+ if (NULL == (pRead_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, (size_t)read_buf_size)))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ read_buf_avail = 0;
+ comp_remaining = file_stat.m_comp_size;
+ }
+
+ if ((flags & MZ_ZIP_FLAG_COMPRESSED_DATA) || (!file_stat.m_method))
+ {
+ /* The file is stored or the caller has requested the compressed data. */
+ if (pZip->m_pState->m_pMem)
+ {
+ if (((sizeof(size_t) == sizeof(mz_uint32))) && (file_stat.m_comp_size > MZ_UINT32_MAX))
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+
+ if (pCallback(pOpaque, out_buf_ofs, pRead_buf, (size_t)file_stat.m_comp_size) != file_stat.m_comp_size)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_WRITE_CALLBACK_FAILED);
+ status = TINFL_STATUS_FAILED;
+ }
+ else if (!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA))
+ {
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ file_crc32 = (mz_uint32)mz_crc32(file_crc32, (const mz_uint8 *)pRead_buf, (size_t)file_stat.m_comp_size);
+#endif
+ }
+
+ cur_file_ofs += file_stat.m_comp_size;
+ out_buf_ofs += file_stat.m_comp_size;
+ comp_remaining = 0;
+ }
+ else
+ {
+ while (comp_remaining)
+ {
+ read_buf_avail = MZ_MIN(read_buf_size, comp_remaining);
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pRead_buf, (size_t)read_buf_avail) != read_buf_avail)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ status = TINFL_STATUS_FAILED;
+ break;
+ }
+
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ if (!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA))
+ {
+ file_crc32 = (mz_uint32)mz_crc32(file_crc32, (const mz_uint8 *)pRead_buf, (size_t)read_buf_avail);
+ }
+#endif
+
+ if (pCallback(pOpaque, out_buf_ofs, pRead_buf, (size_t)read_buf_avail) != read_buf_avail)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_WRITE_CALLBACK_FAILED);
+ status = TINFL_STATUS_FAILED;
+ break;
+ }
+
+ cur_file_ofs += read_buf_avail;
+ out_buf_ofs += read_buf_avail;
+ comp_remaining -= read_buf_avail;
+ }
+ }
+ }
+ else
+ {
+ tinfl_decompressor inflator;
+ tinfl_init(&inflator);
+
+ if (NULL == (pWrite_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, TINFL_LZ_DICT_SIZE)))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ status = TINFL_STATUS_FAILED;
+ }
+ else
+ {
+ do
+ {
+ mz_uint8 *pWrite_buf_cur = (mz_uint8 *)pWrite_buf + (out_buf_ofs & (TINFL_LZ_DICT_SIZE - 1));
+ size_t in_buf_size, out_buf_size = TINFL_LZ_DICT_SIZE - (out_buf_ofs & (TINFL_LZ_DICT_SIZE - 1));
+ if ((!read_buf_avail) && (!pZip->m_pState->m_pMem))
+ {
+ read_buf_avail = MZ_MIN(read_buf_size, comp_remaining);
+ if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pRead_buf, (size_t)read_buf_avail) != read_buf_avail)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ status = TINFL_STATUS_FAILED;
+ break;
+ }
+ cur_file_ofs += read_buf_avail;
+ comp_remaining -= read_buf_avail;
+ read_buf_ofs = 0;
+ }
+
+ in_buf_size = (size_t)read_buf_avail;
+ status = tinfl_decompress(&inflator, (const mz_uint8 *)pRead_buf + read_buf_ofs, &in_buf_size, (mz_uint8 *)pWrite_buf, pWrite_buf_cur, &out_buf_size, comp_remaining ? TINFL_FLAG_HAS_MORE_INPUT : 0);
+ read_buf_avail -= in_buf_size;
+ read_buf_ofs += in_buf_size;
+
+ if (out_buf_size)
+ {
+ if (pCallback(pOpaque, out_buf_ofs, pWrite_buf_cur, out_buf_size) != out_buf_size)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_WRITE_CALLBACK_FAILED);
+ status = TINFL_STATUS_FAILED;
+ break;
+ }
+
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ file_crc32 = (mz_uint32)mz_crc32(file_crc32, pWrite_buf_cur, out_buf_size);
+#endif
+ if ((out_buf_ofs += out_buf_size) > file_stat.m_uncomp_size)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_DECOMPRESSION_FAILED);
+ status = TINFL_STATUS_FAILED;
+ break;
+ }
+ }
+ } while ((status == TINFL_STATUS_NEEDS_MORE_INPUT) || (status == TINFL_STATUS_HAS_MORE_OUTPUT));
+ }
+ }
+
+ if ((status == TINFL_STATUS_DONE) && (!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA)))
+ {
+ /* Make sure the entire file was decompressed, and check its CRC. */
+ if (out_buf_ofs != file_stat.m_uncomp_size)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_UNEXPECTED_DECOMPRESSED_SIZE);
+ status = TINFL_STATUS_FAILED;
+ }
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ else if (file_crc32 != file_stat.m_crc32)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_DECOMPRESSION_FAILED);
+ status = TINFL_STATUS_FAILED;
+ }
+#endif
+ }
+
+ if (!pZip->m_pState->m_pMem)
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+
+ if (pWrite_buf)
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pWrite_buf);
+
+ return status == TINFL_STATUS_DONE;
+}
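+
+/* Callback-extraction sketch (illustrative; write_cb is a hypothetical helper, not
+ miniz API). Output is produced strictly in order, so file_ofs can be ignored when
+ streaming to a FILE:
+
+ static size_t write_cb(void *pOpaque, mz_uint64 file_ofs, const void *pBuf, size_t n)
+ {
+ (void)file_ofs;
+ return fwrite(pBuf, 1, n, (FILE *)pOpaque);
+ }
+ ...
+ mz_zip_reader_extract_to_callback(&zip, idx, write_cb, fp, 0);
+*/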
+
+mz_bool mz_zip_reader_extract_file_to_callback(mz_zip_archive *pZip, const char *pFilename, mz_file_write_func pCallback, void *pOpaque, mz_uint flags)
+{
+ mz_uint32 file_index;
+ if (!mz_zip_reader_locate_file_v2(pZip, pFilename, NULL, flags, &file_index))
+ return MZ_FALSE;
+
+ return mz_zip_reader_extract_to_callback(pZip, file_index, pCallback, pOpaque, flags);
+}
+
+mz_zip_reader_extract_iter_state* mz_zip_reader_extract_iter_new(mz_zip_archive *pZip, mz_uint file_index, mz_uint flags)
+{
+ mz_zip_reader_extract_iter_state *pState;
+ mz_uint32 local_header_u32[(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) / sizeof(mz_uint32)];
+ mz_uint8 *pLocal_header = (mz_uint8 *)local_header_u32;
+
+ /* Argument sanity check */
+ if ((!pZip) || (!pZip->m_pState))
+ return NULL;
+
+ /* Allocate an iterator status structure */
+ pState = (mz_zip_reader_extract_iter_state*)pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, sizeof(mz_zip_reader_extract_iter_state));
+ if (!pState)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ return NULL;
+ }
+
+ /* Fetch file details */
+ if (!mz_zip_reader_file_stat(pZip, file_index, &pState->file_stat))
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ return NULL;
+ }
+
+ /* Encryption and patch files are not supported. */
+ if (pState->file_stat.m_bit_flag & (MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_IS_ENCRYPTED | MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_USES_STRONG_ENCRYPTION | MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_COMPRESSED_PATCH_FLAG))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_ENCRYPTION);
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ return NULL;
+ }
+
+ /* This function only supports decompressing stored and deflate. */
+ if ((!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) && (pState->file_stat.m_method != 0) && (pState->file_stat.m_method != MZ_DEFLATED))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_METHOD);
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ return NULL;
+ }
+
+ /* Init state - save args */
+ pState->pZip = pZip;
+ pState->flags = flags;
+
+ /* Init state - reset variables to defaults */
+ pState->status = TINFL_STATUS_DONE;
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ pState->file_crc32 = MZ_CRC32_INIT;
+#endif
+ pState->read_buf_ofs = 0;
+ pState->out_buf_ofs = 0;
+ pState->pRead_buf = NULL;
+ pState->pWrite_buf = NULL;
+ pState->out_blk_remain = 0;
+
+ /* Read and parse the local directory entry. */
+ pState->cur_file_ofs = pState->file_stat.m_local_header_ofs;
+ if (pZip->m_pRead(pZip->m_pIO_opaque, pState->cur_file_ofs, pLocal_header, MZ_ZIP_LOCAL_DIR_HEADER_SIZE) != MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ return NULL;
+ }
+
+ if (MZ_READ_LE32(pLocal_header) != MZ_ZIP_LOCAL_DIR_HEADER_SIG)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ return NULL;
+ }
+
+ pState->cur_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE + MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_FILENAME_LEN_OFS) + MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_EXTRA_LEN_OFS);
+ if ((pState->cur_file_ofs + pState->file_stat.m_comp_size) > pZip->m_archive_size)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ return NULL;
+ }
+
+ /* Decompress the file either directly from memory or from a file input buffer. */
+ if (pZip->m_pState->m_pMem)
+ {
+ pState->pRead_buf = (mz_uint8 *)pZip->m_pState->m_pMem + pState->cur_file_ofs;
+ pState->read_buf_size = pState->read_buf_avail = pState->file_stat.m_comp_size;
+ pState->comp_remaining = pState->file_stat.m_comp_size;
+ }
+ else
+ {
+ if (!((flags & MZ_ZIP_FLAG_COMPRESSED_DATA) || (!pState->file_stat.m_method)))
+ {
+ /* Decompression required, therefore intermediate read buffer required */
+ pState->read_buf_size = MZ_MIN(pState->file_stat.m_comp_size, (mz_uint64)MZ_ZIP_MAX_IO_BUF_SIZE);
+ if (NULL == (pState->pRead_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, (size_t)pState->read_buf_size)))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ return NULL;
+ }
+ }
+ else
+ {
+ /* Decompression not required - we will be reading directly into user buffer, no temp buf required */
+ pState->read_buf_size = 0;
+ }
+ pState->read_buf_avail = 0;
+ pState->comp_remaining = pState->file_stat.m_comp_size;
+ }
+
+ if (!((flags & MZ_ZIP_FLAG_COMPRESSED_DATA) || (!pState->file_stat.m_method)))
+ {
+ /* Decompression required, init decompressor */
+ tinfl_init( &pState->inflator );
+
+ /* Allocate write buffer */
+ if (NULL == (pState->pWrite_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, TINFL_LZ_DICT_SIZE)))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ if (pState->pRead_buf)
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState->pRead_buf);
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ return NULL;
+ }
+ }
+
+ return pState;
+}
+
+mz_zip_reader_extract_iter_state* mz_zip_reader_extract_file_iter_new(mz_zip_archive *pZip, const char *pFilename, mz_uint flags)
+{
+ mz_uint32 file_index;
+
+ /* Locate file index by name */
+ if (!mz_zip_reader_locate_file_v2(pZip, pFilename, NULL, flags, &file_index))
+ return NULL;
+
+ /* Construct iterator */
+ return mz_zip_reader_extract_iter_new(pZip, file_index, flags);
+}
+
+size_t mz_zip_reader_extract_iter_read(mz_zip_reader_extract_iter_state* pState, void* pvBuf, size_t buf_size)
+{
+ size_t copied_to_caller = 0;
+
+ /* Argument sanity check */
+ if ((!pState) || (!pState->pZip) || (!pState->pZip->m_pState) || (!pvBuf))
+ return 0;
+
+ if ((pState->flags & MZ_ZIP_FLAG_COMPRESSED_DATA) || (!pState->file_stat.m_method))
+ {
+ /* The file is stored, or the caller has requested the compressed data; calc the amount to return. */
+ copied_to_caller = (size_t)MZ_MIN( buf_size, pState->comp_remaining );
+
+ /* Is the zip in memory, or does it require reading from a file? */
+ if (pState->pZip->m_pState->m_pMem)
+ {
+ /* Copy data to caller's buffer */
+ memcpy( pvBuf, pState->pRead_buf, copied_to_caller );
+ pState->pRead_buf = ((mz_uint8*)pState->pRead_buf) + copied_to_caller;
+ }
+ else
+ {
+ /* Read directly into caller's buffer */
+ if (pState->pZip->m_pRead(pState->pZip->m_pIO_opaque, pState->cur_file_ofs, pvBuf, copied_to_caller) != copied_to_caller)
+ {
+ /* Failed to read everything that was asked for; flag failure and alert the user */
+ mz_zip_set_error(pState->pZip, MZ_ZIP_FILE_READ_FAILED);
+ pState->status = TINFL_STATUS_FAILED;
+ copied_to_caller = 0;
+ }
+ }
+
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ /* Compute CRC if not returning compressed data only */
+ if (!(pState->flags & MZ_ZIP_FLAG_COMPRESSED_DATA))
+ pState->file_crc32 = (mz_uint32)mz_crc32(pState->file_crc32, (const mz_uint8 *)pvBuf, copied_to_caller);
+#endif
+
+ /* Advance offsets, dec counters */
+ pState->cur_file_ofs += copied_to_caller;
+ pState->out_buf_ofs += copied_to_caller;
+ pState->comp_remaining -= copied_to_caller;
+ }
+ else
+ {
+ do
+ {
+ /* Calc ptr into the circular write buffer, given the current output pos and dictionary size */
+ mz_uint8 *pWrite_buf_cur = (mz_uint8 *)pState->pWrite_buf + (pState->out_buf_ofs & (TINFL_LZ_DICT_SIZE - 1));
+
+ /* Calc max output size - the contiguous space remaining in the circular buffer at the current output pos */
+ size_t in_buf_size, out_buf_size = TINFL_LZ_DICT_SIZE - (pState->out_buf_ofs & (TINFL_LZ_DICT_SIZE - 1));
+
+ if (!pState->out_blk_remain)
+ {
+ /* Read more data from the file if none is available (and we're reading from a file, not memory) */
+ if ((!pState->read_buf_avail) && (!pState->pZip->m_pState->m_pMem))
+ {
+ /* Calc read size */
+ pState->read_buf_avail = MZ_MIN(pState->read_buf_size, pState->comp_remaining);
+ if (pState->pZip->m_pRead(pState->pZip->m_pIO_opaque, pState->cur_file_ofs, pState->pRead_buf, (size_t)pState->read_buf_avail) != pState->read_buf_avail)
+ {
+ mz_zip_set_error(pState->pZip, MZ_ZIP_FILE_READ_FAILED);
+ pState->status = TINFL_STATUS_FAILED;
+ break;
+ }
+
+ /* Advance offsets, dec counters */
+ pState->cur_file_ofs += pState->read_buf_avail;
+ pState->comp_remaining -= pState->read_buf_avail;
+ pState->read_buf_ofs = 0;
+ }
+
+ /* Perform decompression */
+ in_buf_size = (size_t)pState->read_buf_avail;
+ pState->status = tinfl_decompress(&pState->inflator, (const mz_uint8 *)pState->pRead_buf + pState->read_buf_ofs, &in_buf_size, (mz_uint8 *)pState->pWrite_buf, pWrite_buf_cur, &out_buf_size, pState->comp_remaining ? TINFL_FLAG_HAS_MORE_INPUT : 0);
+ pState->read_buf_avail -= in_buf_size;
+ pState->read_buf_ofs += in_buf_size;
+
+ /* Update current output block size remaining */
+ pState->out_blk_remain = out_buf_size;
+ }
+
+ if (pState->out_blk_remain)
+ {
+ /* Calc amount to return. */
+ size_t to_copy = MZ_MIN( (buf_size - copied_to_caller), pState->out_blk_remain );
+
+ /* Copy data to caller's buffer */
+ memcpy( (mz_uint8 *)pvBuf + copied_to_caller, pWrite_buf_cur, to_copy );
+
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ /* Update the running CRC-32 of the decompressed data */
+ pState->file_crc32 = (mz_uint32)mz_crc32(pState->file_crc32, pWrite_buf_cur, to_copy);
+#endif
+
+ /* Decrement data consumed from block */
+ pState->out_blk_remain -= to_copy;
+
+ /* Inc output offset, while performing sanity check */
+ if ((pState->out_buf_ofs += to_copy) > pState->file_stat.m_uncomp_size)
+ {
+ mz_zip_set_error(pState->pZip, MZ_ZIP_DECOMPRESSION_FAILED);
+ pState->status = TINFL_STATUS_FAILED;
+ break;
+ }
+
+ /* Increment counter of data copied to caller */
+ copied_to_caller += to_copy;
+ }
+ } while ( (copied_to_caller < buf_size) && ((pState->status == TINFL_STATUS_NEEDS_MORE_INPUT) || (pState->status == TINFL_STATUS_HAS_MORE_OUTPUT)) );
+ }
+
+ /* Return how many bytes were copied into user buffer */
+ return copied_to_caller;
+}
+
+mz_bool mz_zip_reader_extract_iter_free(mz_zip_reader_extract_iter_state* pState)
+{
+ int status;
+
+ /* Argument sanity check */
+ if ((!pState) || (!pState->pZip) || (!pState->pZip->m_pState))
+ return MZ_FALSE;
+
+ /* Was decompression requested, and did it complete? */
+ if ((pState->status == TINFL_STATUS_DONE) && (!(pState->flags & MZ_ZIP_FLAG_COMPRESSED_DATA)))
+ {
+ /* Make sure the entire file was decompressed, and check its CRC. */
+ if (pState->out_buf_ofs != pState->file_stat.m_uncomp_size)
+ {
+ mz_zip_set_error(pState->pZip, MZ_ZIP_UNEXPECTED_DECOMPRESSED_SIZE);
+ pState->status = TINFL_STATUS_FAILED;
+ }
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ else if (pState->file_crc32 != pState->file_stat.m_crc32)
+ {
+ mz_zip_set_error(pState->pZip, MZ_ZIP_DECOMPRESSION_FAILED);
+ pState->status = TINFL_STATUS_FAILED;
+ }
+#endif
+ }
+
+ /* Free buffers */
+ if (!pState->pZip->m_pState->m_pMem)
+ pState->pZip->m_pFree(pState->pZip->m_pAlloc_opaque, pState->pRead_buf);
+ if (pState->pWrite_buf)
+ pState->pZip->m_pFree(pState->pZip->m_pAlloc_opaque, pState->pWrite_buf);
+
+ /* Save status */
+ status = pState->status;
+
+ /* Free context */
+ pState->pZip->m_pFree(pState->pZip->m_pAlloc_opaque, pState);
+
+ return status == TINFL_STATUS_DONE;
+}
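+
+/* Illustrative usage sketch (not part of miniz): streaming a member out of an
+   archive with the iterator API above - create, read fixed-size chunks, then
+   free, which also verifies the total size and CRC-32. */
+#if 0
+static mz_bool example_stream_member(mz_zip_archive *pZip, mz_uint file_index)
+{
+    char chunk[4096];
+    size_t n;
+    mz_zip_reader_extract_iter_state *pIter = mz_zip_reader_extract_iter_new(pZip, file_index, 0);
+    if (!pIter)
+        return MZ_FALSE;
+    while ((n = mz_zip_reader_extract_iter_read(pIter, chunk, sizeof(chunk))) > 0)
+    {
+        /* consume n bytes of decompressed data here */
+    }
+    /* Returns MZ_TRUE only if the whole file decompressed and its CRC matched. */
+    return mz_zip_reader_extract_iter_free(pIter);
+}
+#endif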
+
+#ifndef MINIZ_NO_STDIO
+static size_t mz_zip_file_write_callback(void *pOpaque, mz_uint64 ofs, const void *pBuf, size_t n)
+{
+ (void)ofs;
+
+ return MZ_FWRITE(pBuf, 1, n, (MZ_FILE *)pOpaque);
+}
+
+mz_bool mz_zip_reader_extract_to_file(mz_zip_archive *pZip, mz_uint file_index, const char *pDst_filename, mz_uint flags)
+{
+ mz_bool status;
+ mz_zip_archive_file_stat file_stat;
+ MZ_FILE *pFile;
+
+ if (!mz_zip_reader_file_stat(pZip, file_index, &file_stat))
+ return MZ_FALSE;
+
+ if ((file_stat.m_is_directory) || (!file_stat.m_is_supported))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_FEATURE);
+
+ pFile = MZ_FOPEN(pDst_filename, "wb");
+ if (!pFile)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_OPEN_FAILED);
+
+ status = mz_zip_reader_extract_to_callback(pZip, file_index, mz_zip_file_write_callback, pFile, flags);
+
+ if (MZ_FCLOSE(pFile) == EOF)
+ {
+ if (status)
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_CLOSE_FAILED);
+
+ status = MZ_FALSE;
+ }
+
+#if !defined(MINIZ_NO_TIME) && !defined(MINIZ_NO_STDIO)
+ if (status)
+ mz_zip_set_file_times(pDst_filename, file_stat.m_time, file_stat.m_time);
+#endif
+
+ return status;
+}
+
+mz_bool mz_zip_reader_extract_file_to_file(mz_zip_archive *pZip, const char *pArchive_filename, const char *pDst_filename, mz_uint flags)
+{
+ mz_uint32 file_index;
+ if (!mz_zip_reader_locate_file_v2(pZip, pArchive_filename, NULL, flags, &file_index))
+ return MZ_FALSE;
+
+ return mz_zip_reader_extract_to_file(pZip, file_index, pDst_filename, flags);
+}
+
+mz_bool mz_zip_reader_extract_to_cfile(mz_zip_archive *pZip, mz_uint file_index, MZ_FILE *pFile, mz_uint flags)
+{
+ mz_zip_archive_file_stat file_stat;
+
+ if (!mz_zip_reader_file_stat(pZip, file_index, &file_stat))
+ return MZ_FALSE;
+
+ if ((file_stat.m_is_directory) || (!file_stat.m_is_supported))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_FEATURE);
+
+ return mz_zip_reader_extract_to_callback(pZip, file_index, mz_zip_file_write_callback, pFile, flags);
+}
+
+mz_bool mz_zip_reader_extract_file_to_cfile(mz_zip_archive *pZip, const char *pArchive_filename, MZ_FILE *pFile, mz_uint flags)
+{
+ mz_uint32 file_index;
+ if (!mz_zip_reader_locate_file_v2(pZip, pArchive_filename, NULL, flags, &file_index))
+ return MZ_FALSE;
+
+ return mz_zip_reader_extract_to_cfile(pZip, file_index, pFile, flags);
+}
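+
+/* Illustrative usage sketch (not part of miniz): one-call extraction of a
+   named member to a disk file via the stdio helpers above. The paths are
+   hypothetical; this only compiles when MINIZ_NO_STDIO is undefined. */
+#if 0
+static mz_bool example_extract_one(const char *pZip_path)
+{
+    mz_zip_archive zip;
+    mz_bool ok;
+    mz_zip_zero_struct(&zip);
+    if (!mz_zip_reader_init_file(&zip, pZip_path, 0))
+        return MZ_FALSE;
+    ok = mz_zip_reader_extract_file_to_file(&zip, "docs/readme.txt", "readme.txt", 0);
+    mz_zip_reader_end(&zip);
+    return ok;
+}
+#endif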
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+static size_t mz_zip_compute_crc32_callback(void *pOpaque, mz_uint64 file_ofs, const void *pBuf, size_t n)
+{
+ mz_uint32 *p = (mz_uint32 *)pOpaque;
+ (void)file_ofs;
+ *p = (mz_uint32)mz_crc32(*p, (const mz_uint8 *)pBuf, n);
+ return n;
+}
+
+mz_bool mz_zip_validate_file(mz_zip_archive *pZip, mz_uint file_index, mz_uint flags)
+{
+ mz_zip_archive_file_stat file_stat;
+ mz_zip_internal_state *pState;
+ const mz_uint8 *pCentral_dir_header;
+ mz_bool found_zip64_ext_data_in_cdir = MZ_FALSE;
+ mz_bool found_zip64_ext_data_in_ldir = MZ_FALSE;
+ mz_uint32 local_header_u32[(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) / sizeof(mz_uint32)];
+ mz_uint8 *pLocal_header = (mz_uint8 *)local_header_u32;
+ mz_uint64 local_header_ofs = 0;
+ mz_uint32 local_header_filename_len, local_header_extra_len, local_header_crc32;
+ mz_uint64 local_header_comp_size, local_header_uncomp_size;
+ mz_uint32 uncomp_crc32 = MZ_CRC32_INIT;
+ mz_bool has_data_descriptor;
+ mz_uint32 local_header_bit_flags;
+
+ mz_zip_array file_data_array;
+ mz_zip_array_init(&file_data_array, 1);
+
+ if ((!pZip) || (!pZip->m_pState) || (!pZip->m_pAlloc) || (!pZip->m_pFree) || (!pZip->m_pRead))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (file_index >= pZip->m_total_files)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ pState = pZip->m_pState;
+
+ pCentral_dir_header = mz_zip_get_cdh(pZip, file_index);
+
+ if (!mz_zip_file_stat_internal(pZip, file_index, pCentral_dir_header, &file_stat, &found_zip64_ext_data_in_cdir))
+ return MZ_FALSE;
+
+ /* A directory or zero length file */
+ if ((file_stat.m_is_directory) || (!file_stat.m_uncomp_size))
+ return MZ_TRUE;
+
+ /* Encryption and patch files are not supported. */
+ if (file_stat.m_is_encrypted)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_ENCRYPTION);
+
+ /* This function only supports stored and deflate. */
+ if ((file_stat.m_method != 0) && (file_stat.m_method != MZ_DEFLATED))
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_METHOD);
+
+ if (!file_stat.m_is_supported)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_FEATURE);
+
+ /* Read and parse the local directory entry. */
+ local_header_ofs = file_stat.m_local_header_ofs;
+ if (pZip->m_pRead(pZip->m_pIO_opaque, local_header_ofs, pLocal_header, MZ_ZIP_LOCAL_DIR_HEADER_SIZE) != MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+
+ if (MZ_READ_LE32(pLocal_header) != MZ_ZIP_LOCAL_DIR_HEADER_SIG)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ local_header_filename_len = MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_FILENAME_LEN_OFS);
+ local_header_extra_len = MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_EXTRA_LEN_OFS);
+ local_header_comp_size = MZ_READ_LE32(pLocal_header + MZ_ZIP_LDH_COMPRESSED_SIZE_OFS);
+ local_header_uncomp_size = MZ_READ_LE32(pLocal_header + MZ_ZIP_LDH_DECOMPRESSED_SIZE_OFS);
+ local_header_crc32 = MZ_READ_LE32(pLocal_header + MZ_ZIP_LDH_CRC32_OFS);
+ local_header_bit_flags = MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_BIT_FLAG_OFS);
+ has_data_descriptor = (local_header_bit_flags & 8) != 0;
+
+ if (local_header_filename_len != strlen(file_stat.m_filename))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ if ((local_header_ofs + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + local_header_filename_len + local_header_extra_len + file_stat.m_comp_size) > pZip->m_archive_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ if (!mz_zip_array_resize(pZip, &file_data_array, MZ_MAX(local_header_filename_len, local_header_extra_len), MZ_FALSE))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ goto handle_failure;
+ }
+
+ if (local_header_filename_len)
+ {
+ if (pZip->m_pRead(pZip->m_pIO_opaque, local_header_ofs + MZ_ZIP_LOCAL_DIR_HEADER_SIZE, file_data_array.m_p, local_header_filename_len) != local_header_filename_len)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ goto handle_failure;
+ }
+
+ /* I've seen one archive that had the same pathname but used backslashes in the local dir and forward slashes in the central dir. Do we care about this? For now, this case will fail validation. */
+ if (memcmp(file_stat.m_filename, file_data_array.m_p, local_header_filename_len) != 0)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_VALIDATION_FAILED);
+ goto handle_failure;
+ }
+ }
+
+ if ((local_header_extra_len) && ((local_header_comp_size == MZ_UINT32_MAX) || (local_header_uncomp_size == MZ_UINT32_MAX)))
+ {
+ mz_uint32 extra_size_remaining = local_header_extra_len;
+ const mz_uint8 *pExtra_data = (const mz_uint8 *)file_data_array.m_p;
+
+ if (pZip->m_pRead(pZip->m_pIO_opaque, local_header_ofs + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + local_header_filename_len, file_data_array.m_p, local_header_extra_len) != local_header_extra_len)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ goto handle_failure;
+ }
+
+ do
+ {
+ mz_uint32 field_id, field_data_size, field_total_size;
+
+ if (extra_size_remaining < (sizeof(mz_uint16) * 2))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ goto handle_failure;
+ }
+
+ field_id = MZ_READ_LE16(pExtra_data);
+ field_data_size = MZ_READ_LE16(pExtra_data + sizeof(mz_uint16));
+ field_total_size = field_data_size + sizeof(mz_uint16) * 2;
+
+ if (field_total_size > extra_size_remaining)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ goto handle_failure;
+ }
+
+ if (field_id == MZ_ZIP64_EXTENDED_INFORMATION_FIELD_HEADER_ID)
+ {
+ const mz_uint8 *pSrc_field_data = pExtra_data + sizeof(mz_uint32);
+
+ if (field_data_size < sizeof(mz_uint64) * 2)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ goto handle_failure;
+ }
+
+ local_header_uncomp_size = MZ_READ_LE64(pSrc_field_data);
+ local_header_comp_size = MZ_READ_LE64(pSrc_field_data + sizeof(mz_uint64));
+
+ found_zip64_ext_data_in_ldir = MZ_TRUE;
+ break;
+ }
+
+ pExtra_data += field_total_size;
+ extra_size_remaining -= field_total_size;
+ } while (extra_size_remaining);
+ }
+
+ /* TODO: parse local header extra data when local_header_comp_size is 0xFFFFFFFF! (big_descriptor.zip) */
+ /* I've seen zips in the wild with the data descriptor bit set but with proper local header values and bogus data descriptors */
+ if ((has_data_descriptor) && (!local_header_comp_size) && (!local_header_crc32))
+ {
+ mz_uint8 descriptor_buf[32];
+ mz_bool has_id;
+ const mz_uint8 *pSrc;
+ mz_uint32 file_crc32;
+ mz_uint64 comp_size = 0, uncomp_size = 0;
+
+ mz_uint32 num_descriptor_uint32s = ((pState->m_zip64) || (found_zip64_ext_data_in_ldir)) ? 6 : 4;
+
+ if (pZip->m_pRead(pZip->m_pIO_opaque, local_header_ofs + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + local_header_filename_len + local_header_extra_len + file_stat.m_comp_size, descriptor_buf, sizeof(mz_uint32) * num_descriptor_uint32s) != (sizeof(mz_uint32) * num_descriptor_uint32s))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ goto handle_failure;
+ }
+
+ has_id = (MZ_READ_LE32(descriptor_buf) == MZ_ZIP_DATA_DESCRIPTOR_ID);
+ pSrc = has_id ? (descriptor_buf + sizeof(mz_uint32)) : descriptor_buf;
+
+ file_crc32 = MZ_READ_LE32(pSrc);
+
+ if ((pState->m_zip64) || (found_zip64_ext_data_in_ldir))
+ {
+ comp_size = MZ_READ_LE64(pSrc + sizeof(mz_uint32));
+ uncomp_size = MZ_READ_LE64(pSrc + sizeof(mz_uint32) + sizeof(mz_uint64));
+ }
+ else
+ {
+ comp_size = MZ_READ_LE32(pSrc + sizeof(mz_uint32));
+ uncomp_size = MZ_READ_LE32(pSrc + sizeof(mz_uint32) + sizeof(mz_uint32));
+ }
+
+ if ((file_crc32 != file_stat.m_crc32) || (comp_size != file_stat.m_comp_size) || (uncomp_size != file_stat.m_uncomp_size))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_VALIDATION_FAILED);
+ goto handle_failure;
+ }
+ }
+ else
+ {
+ if ((local_header_crc32 != file_stat.m_crc32) || (local_header_comp_size != file_stat.m_comp_size) || (local_header_uncomp_size != file_stat.m_uncomp_size))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_VALIDATION_FAILED);
+ goto handle_failure;
+ }
+ }
+
+ mz_zip_array_clear(pZip, &file_data_array);
+
+ if ((flags & MZ_ZIP_FLAG_VALIDATE_HEADERS_ONLY) == 0)
+ {
+ if (!mz_zip_reader_extract_to_callback(pZip, file_index, mz_zip_compute_crc32_callback, &uncomp_crc32, 0))
+ return MZ_FALSE;
+
+ /* One more check to be sure, although the extraction path checks this too. */
+ if (uncomp_crc32 != file_stat.m_crc32)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_VALIDATION_FAILED);
+ return MZ_FALSE;
+ }
+ }
+
+ return MZ_TRUE;
+
+handle_failure:
+ mz_zip_array_clear(pZip, &file_data_array);
+ return MZ_FALSE;
+}
+
+mz_bool mz_zip_validate_archive(mz_zip_archive *pZip, mz_uint flags)
+{
+ mz_zip_internal_state *pState;
+ uint32_t i;
+
+ if ((!pZip) || (!pZip->m_pState) || (!pZip->m_pAlloc) || (!pZip->m_pFree) || (!pZip->m_pRead))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ pState = pZip->m_pState;
+
+ /* Basic sanity checks */
+ if (!pState->m_zip64)
+ {
+ if (pZip->m_total_files > MZ_UINT16_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+
+ if (pZip->m_archive_size > MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+ }
+ else
+ {
+ if (pZip->m_total_files >= MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+
+ if (pState->m_central_dir.m_size >= MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+ }
+
+ for (i = 0; i < pZip->m_total_files; i++)
+ {
+ if (MZ_ZIP_FLAG_VALIDATE_LOCATE_FILE_FLAG & flags)
+ {
+ mz_uint32 found_index;
+ mz_zip_archive_file_stat stat;
+
+ if (!mz_zip_reader_file_stat(pZip, i, &stat))
+ return MZ_FALSE;
+
+ if (!mz_zip_reader_locate_file_v2(pZip, stat.m_filename, NULL, 0, &found_index))
+ return MZ_FALSE;
+
+ /* This check can fail if there are duplicate filenames in the archive (which we don't check for when writing - that's up to the user) */
+ if (found_index != i)
+ return mz_zip_set_error(pZip, MZ_ZIP_VALIDATION_FAILED);
+ }
+
+ if (!mz_zip_validate_file(pZip, i, flags))
+ return MZ_FALSE;
+ }
+
+ return MZ_TRUE;
+}
+
+mz_bool mz_zip_validate_mem_archive(const void *pMem, size_t size, mz_uint flags, mz_zip_error *pErr)
+{
+ mz_bool success = MZ_TRUE;
+ mz_zip_archive zip;
+ mz_zip_error actual_err = MZ_ZIP_NO_ERROR;
+
+ if ((!pMem) || (!size))
+ {
+ if (pErr)
+ *pErr = MZ_ZIP_INVALID_PARAMETER;
+ return MZ_FALSE;
+ }
+
+ mz_zip_zero_struct(&zip);
+
+ if (!mz_zip_reader_init_mem(&zip, pMem, size, flags))
+ {
+ if (pErr)
+ *pErr = zip.m_last_error;
+ return MZ_FALSE;
+ }
+
+ if (!mz_zip_validate_archive(&zip, flags))
+ {
+ actual_err = zip.m_last_error;
+ success = MZ_FALSE;
+ }
+
+ if (!mz_zip_reader_end_internal(&zip, success))
+ {
+ if (!actual_err)
+ actual_err = zip.m_last_error;
+ success = MZ_FALSE;
+ }
+
+ if (pErr)
+ *pErr = actual_err;
+
+ return success;
+}
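+
+/* Illustrative usage sketch (not part of miniz): validating an archive held
+   in a memory buffer and reporting the specific failure. Assumes
+   mz_zip_get_error_string() from miniz's public API and stdio for printf. */
+#if 0
+static mz_bool example_check_blob(const void *pData, size_t size)
+{
+    mz_zip_error err = MZ_ZIP_NO_ERROR;
+    if (!mz_zip_validate_mem_archive(pData, size, 0, &err))
+    {
+        printf("archive invalid: %s\n", mz_zip_get_error_string(err));
+        return MZ_FALSE;
+    }
+    return MZ_TRUE;
+}
+#endif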
+
+#ifndef MINIZ_NO_STDIO
+mz_bool mz_zip_validate_file_archive(const char *pFilename, mz_uint flags, mz_zip_error *pErr)
+{
+ mz_bool success = MZ_TRUE;
+ mz_zip_archive zip;
+ mz_zip_error actual_err = MZ_ZIP_NO_ERROR;
+
+ if (!pFilename)
+ {
+ if (pErr)
+ *pErr = MZ_ZIP_INVALID_PARAMETER;
+ return MZ_FALSE;
+ }
+
+ mz_zip_zero_struct(&zip);
+
+ if (!mz_zip_reader_init_file_v2(&zip, pFilename, flags, 0, 0))
+ {
+ if (pErr)
+ *pErr = zip.m_last_error;
+ return MZ_FALSE;
+ }
+
+ if (!mz_zip_validate_archive(&zip, flags))
+ {
+ actual_err = zip.m_last_error;
+ success = MZ_FALSE;
+ }
+
+ if (!mz_zip_reader_end_internal(&zip, success))
+ {
+ if (!actual_err)
+ actual_err = zip.m_last_error;
+ success = MZ_FALSE;
+ }
+
+ if (pErr)
+ *pErr = actual_err;
+
+ return success;
+}
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+/* ------------------- .ZIP archive writing */
+
+#ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+
+static MZ_FORCEINLINE void mz_write_le16(mz_uint8 *p, mz_uint16 v)
+{
+ p[0] = (mz_uint8)v;
+ p[1] = (mz_uint8)(v >> 8);
+}
+static MZ_FORCEINLINE void mz_write_le32(mz_uint8 *p, mz_uint32 v)
+{
+ p[0] = (mz_uint8)v;
+ p[1] = (mz_uint8)(v >> 8);
+ p[2] = (mz_uint8)(v >> 16);
+ p[3] = (mz_uint8)(v >> 24);
+}
+static MZ_FORCEINLINE void mz_write_le64(mz_uint8 *p, mz_uint64 v)
+{
+ mz_write_le32(p, (mz_uint32)v);
+ mz_write_le32(p + sizeof(mz_uint32), (mz_uint32)(v >> 32));
+}
+
+#define MZ_WRITE_LE16(p, v) mz_write_le16((mz_uint8 *)(p), (mz_uint16)(v))
+#define MZ_WRITE_LE32(p, v) mz_write_le32((mz_uint8 *)(p), (mz_uint32)(v))
+#define MZ_WRITE_LE64(p, v) mz_write_le64((mz_uint8 *)(p), (mz_uint64)(v))
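+
+/* For example, MZ_WRITE_LE32(p, MZ_ZIP_LOCAL_DIR_HEADER_SIG) with the
+   signature 0x04034b50 stores the bytes 0x50 0x4b 0x03 0x04 ("PK\3\4") at
+   p[0..3], least significant byte first, regardless of host endianness. */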
+
+static size_t mz_zip_heap_write_func(void *pOpaque, mz_uint64 file_ofs, const void *pBuf, size_t n)
+{
+ mz_zip_archive *pZip = (mz_zip_archive *)pOpaque;
+ mz_zip_internal_state *pState = pZip->m_pState;
+ mz_uint64 new_size = MZ_MAX(file_ofs + n, pState->m_mem_size);
+
+ if (!n)
+ return 0;
+
+ /* An allocation this big is likely to just fail on 32-bit systems, so don't even go there. */
+ if ((sizeof(size_t) == sizeof(mz_uint32)) && (new_size > 0x7FFFFFFF))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_TOO_LARGE);
+ return 0;
+ }
+
+ if (new_size > pState->m_mem_capacity)
+ {
+ void *pNew_block;
+ size_t new_capacity = MZ_MAX(64, pState->m_mem_capacity);
+
+ while (new_capacity < new_size)
+ new_capacity *= 2;
+
+ if (NULL == (pNew_block = pZip->m_pRealloc(pZip->m_pAlloc_opaque, pState->m_pMem, 1, new_capacity)))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ return 0;
+ }
+
+ pState->m_pMem = pNew_block;
+ pState->m_mem_capacity = new_capacity;
+ }
+ memcpy((mz_uint8 *)pState->m_pMem + file_ofs, pBuf, n);
+ pState->m_mem_size = (size_t)new_size;
+ return n;
+}
+
+static mz_bool mz_zip_writer_end_internal(mz_zip_archive *pZip, mz_bool set_last_error)
+{
+ mz_zip_internal_state *pState;
+ mz_bool status = MZ_TRUE;
+
+ if ((!pZip) || (!pZip->m_pState) || (!pZip->m_pAlloc) || (!pZip->m_pFree) || ((pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) && (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING_HAS_BEEN_FINALIZED)))
+ {
+ if (set_last_error)
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ return MZ_FALSE;
+ }
+
+ pState = pZip->m_pState;
+ pZip->m_pState = NULL;
+ mz_zip_array_clear(pZip, &pState->m_central_dir);
+ mz_zip_array_clear(pZip, &pState->m_central_dir_offsets);
+ mz_zip_array_clear(pZip, &pState->m_sorted_central_dir_offsets);
+
+#ifndef MINIZ_NO_STDIO
+ if (pState->m_pFile)
+ {
+ if (pZip->m_zip_type == MZ_ZIP_TYPE_FILE)
+ {
+ if (MZ_FCLOSE(pState->m_pFile) == EOF)
+ {
+ if (set_last_error)
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_CLOSE_FAILED);
+ status = MZ_FALSE;
+ }
+ }
+
+ pState->m_pFile = NULL;
+ }
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+ if ((pZip->m_pWrite == mz_zip_heap_write_func) && (pState->m_pMem))
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState->m_pMem);
+ pState->m_pMem = NULL;
+ }
+
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+ pZip->m_zip_mode = MZ_ZIP_MODE_INVALID;
+ return status;
+}
+
+mz_bool mz_zip_writer_init_v2(mz_zip_archive *pZip, mz_uint64 existing_size, mz_uint flags)
+{
+ mz_bool zip64 = (flags & MZ_ZIP_FLAG_WRITE_ZIP64) != 0;
+
+ if ((!pZip) || (pZip->m_pState) || (!pZip->m_pWrite) || (pZip->m_zip_mode != MZ_ZIP_MODE_INVALID))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (flags & MZ_ZIP_FLAG_WRITE_ALLOW_READING)
+ {
+ if (!pZip->m_pRead)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ }
+
+ if (pZip->m_file_offset_alignment)
+ {
+ /* Ensure the user-specified file offset alignment is a power of 2. */
+ if (pZip->m_file_offset_alignment & (pZip->m_file_offset_alignment - 1))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ }
+
+ if (!pZip->m_pAlloc)
+ pZip->m_pAlloc = miniz_def_alloc_func;
+ if (!pZip->m_pFree)
+ pZip->m_pFree = miniz_def_free_func;
+ if (!pZip->m_pRealloc)
+ pZip->m_pRealloc = miniz_def_realloc_func;
+
+ pZip->m_archive_size = existing_size;
+ pZip->m_central_directory_file_ofs = 0;
+ pZip->m_total_files = 0;
+
+ if (NULL == (pZip->m_pState = (mz_zip_internal_state *)pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, sizeof(mz_zip_internal_state))))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ memset(pZip->m_pState, 0, sizeof(mz_zip_internal_state));
+
+ MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_central_dir, sizeof(mz_uint8));
+ MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_central_dir_offsets, sizeof(mz_uint32));
+ MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_sorted_central_dir_offsets, sizeof(mz_uint32));
+
+ pZip->m_pState->m_zip64 = zip64;
+ pZip->m_pState->m_zip64_has_extended_info_fields = zip64;
+
+ pZip->m_zip_type = MZ_ZIP_TYPE_USER;
+ pZip->m_zip_mode = MZ_ZIP_MODE_WRITING;
+
+ return MZ_TRUE;
+}
+
+mz_bool mz_zip_writer_init(mz_zip_archive *pZip, mz_uint64 existing_size)
+{
+ return mz_zip_writer_init_v2(pZip, existing_size, 0);
+}
+
+mz_bool mz_zip_writer_init_heap_v2(mz_zip_archive *pZip, size_t size_to_reserve_at_beginning, size_t initial_allocation_size, mz_uint flags)
+{
+ pZip->m_pWrite = mz_zip_heap_write_func;
+ pZip->m_pNeeds_keepalive = NULL;
+
+ if (flags & MZ_ZIP_FLAG_WRITE_ALLOW_READING)
+ pZip->m_pRead = mz_zip_mem_read_func;
+
+ pZip->m_pIO_opaque = pZip;
+
+ if (!mz_zip_writer_init_v2(pZip, size_to_reserve_at_beginning, flags))
+ return MZ_FALSE;
+
+ pZip->m_zip_type = MZ_ZIP_TYPE_HEAP;
+
+ if (0 != (initial_allocation_size = MZ_MAX(initial_allocation_size, size_to_reserve_at_beginning)))
+ {
+ if (NULL == (pZip->m_pState->m_pMem = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, initial_allocation_size)))
+ {
+ mz_zip_writer_end_internal(pZip, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+ pZip->m_pState->m_mem_capacity = initial_allocation_size;
+ }
+
+ return MZ_TRUE;
+}
+
+mz_bool mz_zip_writer_init_heap(mz_zip_archive *pZip, size_t size_to_reserve_at_beginning, size_t initial_allocation_size)
+{
+ return mz_zip_writer_init_heap_v2(pZip, size_to_reserve_at_beginning, initial_allocation_size, 0);
+}
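+
+/* Illustrative usage sketch (not part of miniz): building a zip entirely in
+   memory with the heap writer above, then taking ownership of the finished
+   buffer. mz_zip_writer_add_mem(), mz_zip_writer_finalize_heap_archive() and
+   mz_zip_writer_end() appear later in this file; the entry name and payload
+   are hypothetical, and the caller is expected to release *ppZip_data with
+   the allocator's free function (mz_free by default). */
+#if 0
+static mz_bool example_build_in_memory(void **ppZip_data, size_t *pZip_size)
+{
+    mz_zip_archive zip;
+    const char *pPayload = "hello";
+    mz_zip_zero_struct(&zip);
+    if (!mz_zip_writer_init_heap(&zip, 0, 0))
+        return MZ_FALSE;
+    if (!mz_zip_writer_add_mem(&zip, "greeting.txt", pPayload, strlen(pPayload), MZ_DEFAULT_LEVEL) ||
+        !mz_zip_writer_finalize_heap_archive(&zip, ppZip_data, pZip_size))
+    {
+        mz_zip_writer_end(&zip);
+        return MZ_FALSE;
+    }
+    return mz_zip_writer_end(&zip);
+}
+#endif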
+
+#ifndef MINIZ_NO_STDIO
+static size_t mz_zip_file_write_func(void *pOpaque, mz_uint64 file_ofs, const void *pBuf, size_t n)
+{
+ mz_zip_archive *pZip = (mz_zip_archive *)pOpaque;
+ mz_int64 cur_ofs = MZ_FTELL64(pZip->m_pState->m_pFile);
+
+ file_ofs += pZip->m_pState->m_file_archive_start_ofs;
+
+ if (((mz_int64)file_ofs < 0) || (((cur_ofs != (mz_int64)file_ofs)) && (MZ_FSEEK64(pZip->m_pState->m_pFile, (mz_int64)file_ofs, SEEK_SET))))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_SEEK_FAILED);
+ return 0;
+ }
+
+ return MZ_FWRITE(pBuf, 1, n, pZip->m_pState->m_pFile);
+}
+
+mz_bool mz_zip_writer_init_file(mz_zip_archive *pZip, const char *pFilename, mz_uint64 size_to_reserve_at_beginning)
+{
+ return mz_zip_writer_init_file_v2(pZip, pFilename, size_to_reserve_at_beginning, 0);
+}
+
+mz_bool mz_zip_writer_init_file_v2(mz_zip_archive *pZip, const char *pFilename, mz_uint64 size_to_reserve_at_beginning, mz_uint flags)
+{
+ MZ_FILE *pFile;
+
+ pZip->m_pWrite = mz_zip_file_write_func;
+ pZip->m_pNeeds_keepalive = NULL;
+
+ if (flags & MZ_ZIP_FLAG_WRITE_ALLOW_READING)
+ pZip->m_pRead = mz_zip_file_read_func;
+
+ pZip->m_pIO_opaque = pZip;
+
+ if (!mz_zip_writer_init_v2(pZip, size_to_reserve_at_beginning, flags))
+ return MZ_FALSE;
+
+ if (NULL == (pFile = MZ_FOPEN(pFilename, (flags & MZ_ZIP_FLAG_WRITE_ALLOW_READING) ? "w+b" : "wb")))
+ {
+ mz_zip_writer_end(pZip);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_OPEN_FAILED);
+ }
+
+ pZip->m_pState->m_pFile = pFile;
+ pZip->m_zip_type = MZ_ZIP_TYPE_FILE;
+
+ if (size_to_reserve_at_beginning)
+ {
+ mz_uint64 cur_ofs = 0;
+ char buf[4096];
+
+ MZ_CLEAR_OBJ(buf);
+
+ do
+ {
+ size_t n = (size_t)MZ_MIN(sizeof(buf), size_to_reserve_at_beginning);
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_ofs, buf, n) != n)
+ {
+ mz_zip_writer_end(pZip);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+ cur_ofs += n;
+ size_to_reserve_at_beginning -= n;
+ } while (size_to_reserve_at_beginning);
+ }
+
+ return MZ_TRUE;
+}
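+
+/* Illustrative usage sketch (not part of miniz): creating an archive on disk
+   with the stdio writer above. mz_zip_writer_finalize_archive() appears later
+   in this file; the member name and data are hypothetical. */
+#if 0
+static mz_bool example_write_zip(const char *pZip_path)
+{
+    mz_zip_archive zip;
+    mz_bool ok;
+    mz_zip_zero_struct(&zip);
+    if (!mz_zip_writer_init_file(&zip, pZip_path, 0))
+        return MZ_FALSE;
+    ok = mz_zip_writer_add_mem(&zip, "a.txt", "abc", 3, MZ_DEFAULT_LEVEL) &&
+         mz_zip_writer_finalize_archive(&zip);
+    /* end() must be called whether or not finalization succeeded */
+    return mz_zip_writer_end(&zip) && ok;
+}
+#endif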
+
+mz_bool mz_zip_writer_init_cfile(mz_zip_archive *pZip, MZ_FILE *pFile, mz_uint flags)
+{
+ pZip->m_pWrite = mz_zip_file_write_func;
+ pZip->m_pNeeds_keepalive = NULL;
+
+ if (flags & MZ_ZIP_FLAG_WRITE_ALLOW_READING)
+ pZip->m_pRead = mz_zip_file_read_func;
+
+ pZip->m_pIO_opaque = pZip;
+
+ if (!mz_zip_writer_init_v2(pZip, 0, flags))
+ return MZ_FALSE;
+
+ pZip->m_pState->m_pFile = pFile;
+ pZip->m_pState->m_file_archive_start_ofs = MZ_FTELL64(pZip->m_pState->m_pFile);
+ pZip->m_zip_type = MZ_ZIP_TYPE_CFILE;
+
+ return MZ_TRUE;
+}
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+mz_bool mz_zip_writer_init_from_reader_v2(mz_zip_archive *pZip, const char *pFilename, mz_uint flags)
+{
+ mz_zip_internal_state *pState;
+
+ if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_READING))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (flags & MZ_ZIP_FLAG_WRITE_ZIP64)
+ {
+ /* We don't support converting a non-zip64 file to zip64 - this seems like more trouble than it's worth. (What about the existing 32-bit data descriptors that could follow the compressed data?) */
+ if (!pZip->m_pState->m_zip64)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ }
+
+ /* No sense in trying to write to an archive that's already at the supported maximum size */
+ if (pZip->m_pState->m_zip64)
+ {
+ if (pZip->m_total_files == MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+ }
+ else
+ {
+ if (pZip->m_total_files == MZ_UINT16_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+
+ if ((pZip->m_archive_size + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + MZ_ZIP_LOCAL_DIR_HEADER_SIZE) > MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_TOO_LARGE);
+ }
+
+ pState = pZip->m_pState;
+
+ if (pState->m_pFile)
+ {
+#ifdef MINIZ_NO_STDIO
+ (void)pFilename;
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+#else
+ if (pZip->m_pIO_opaque != pZip)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (pZip->m_zip_type == MZ_ZIP_TYPE_FILE)
+ {
+ if (!pFilename)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ /* Archive is being read from stdio and was originally opened only for reading. Try to reopen as writable. */
+ if (NULL == (pState->m_pFile = MZ_FREOPEN(pFilename, "r+b", pState->m_pFile)))
+ {
+ /* The mz_zip_archive is now in a bogus state because pState->m_pFile is NULL, so just close it. */
+ mz_zip_reader_end_internal(pZip, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_OPEN_FAILED);
+ }
+ }
+
+ pZip->m_pWrite = mz_zip_file_write_func;
+ pZip->m_pNeeds_keepalive = NULL;
+#endif /* #ifdef MINIZ_NO_STDIO */
+ }
+ else if (pState->m_pMem)
+ {
+ /* Archive lives in a memory block. Assume it was allocated from the heap and can be resized via the realloc callback. */
+ if (pZip->m_pIO_opaque != pZip)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ pState->m_mem_capacity = pState->m_mem_size;
+ pZip->m_pWrite = mz_zip_heap_write_func;
+ pZip->m_pNeeds_keepalive = NULL;
+ }
+ /* Archive is being read via a user provided read function - make sure the user has specified a write function too. */
+ else if (!pZip->m_pWrite)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ /* Start writing new files at the archive's current central directory location. */
+ /* TODO: We could add a flag that lets the user start writing immediately AFTER the existing central dir - this would be safer. */
+ pZip->m_archive_size = pZip->m_central_directory_file_ofs;
+ pZip->m_central_directory_file_ofs = 0;
+
+ /* Clear the sorted central dir offsets, they aren't useful or maintained now. */
+ /* Even though we're now in write mode, files can still be extracted and verified, but file lookups will be slow. */
+ /* TODO: We could easily maintain the sorted central directory offsets. */
+ mz_zip_array_clear(pZip, &pZip->m_pState->m_sorted_central_dir_offsets);
+
+ pZip->m_zip_mode = MZ_ZIP_MODE_WRITING;
+
+ return MZ_TRUE;
+}
+
+mz_bool mz_zip_writer_init_from_reader(mz_zip_archive *pZip, const char *pFilename)
+{
+ return mz_zip_writer_init_from_reader_v2(pZip, pFilename, 0);
+}
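+
+/* Illustrative usage sketch (not part of miniz): appending a member to an
+   existing archive by converting a reader into a writer in place, per the
+   function above. The filename is passed again so the file can be reopened
+   writable; names and data are hypothetical. */
+#if 0
+static mz_bool example_append_member(const char *pZip_path)
+{
+    mz_zip_archive zip;
+    mz_bool ok;
+    mz_zip_zero_struct(&zip);
+    if (!mz_zip_reader_init_file(&zip, pZip_path, 0))
+        return MZ_FALSE;
+    if (!mz_zip_writer_init_from_reader(&zip, pZip_path))
+    {
+        mz_zip_reader_end(&zip);
+        return MZ_FALSE;
+    }
+    ok = mz_zip_writer_add_mem(&zip, "extra.txt", "new data", 8, MZ_DEFAULT_LEVEL) &&
+         mz_zip_writer_finalize_archive(&zip);
+    return mz_zip_writer_end(&zip) && ok;
+}
+#endif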
+
+/* TODO: pArchive_name is a terrible name here! */
+mz_bool mz_zip_writer_add_mem(mz_zip_archive *pZip, const char *pArchive_name, const void *pBuf, size_t buf_size, mz_uint level_and_flags)
+{
+ return mz_zip_writer_add_mem_ex(pZip, pArchive_name, pBuf, buf_size, NULL, 0, level_and_flags, 0, 0);
+}
+
+typedef struct
+{
+ mz_zip_archive *m_pZip;
+ mz_uint64 m_cur_archive_file_ofs;
+ mz_uint64 m_comp_size;
+} mz_zip_writer_add_state;
+
+static mz_bool mz_zip_writer_add_put_buf_callback(const void *pBuf, int len, void *pUser)
+{
+ mz_zip_writer_add_state *pState = (mz_zip_writer_add_state *)pUser;
+ if ((int)pState->m_pZip->m_pWrite(pState->m_pZip->m_pIO_opaque, pState->m_cur_archive_file_ofs, pBuf, len) != len)
+ return MZ_FALSE;
+
+ pState->m_cur_archive_file_ofs += len;
+ pState->m_comp_size += len;
+ return MZ_TRUE;
+}
+
+#define MZ_ZIP64_MAX_LOCAL_EXTRA_FIELD_SIZE (sizeof(mz_uint16) * 2 + sizeof(mz_uint64) * 2)
+#define MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE (sizeof(mz_uint16) * 2 + sizeof(mz_uint64) * 3)
+static mz_uint32 mz_zip_writer_create_zip64_extra_data(mz_uint8 *pBuf, mz_uint64 *pUncomp_size, mz_uint64 *pComp_size, mz_uint64 *pLocal_header_ofs)
+{
+ mz_uint8 *pDst = pBuf;
+ mz_uint32 field_size = 0;
+
+ MZ_WRITE_LE16(pDst + 0, MZ_ZIP64_EXTENDED_INFORMATION_FIELD_HEADER_ID);
+ MZ_WRITE_LE16(pDst + 2, 0);
+ pDst += sizeof(mz_uint16) * 2;
+
+ if (pUncomp_size)
+ {
+ MZ_WRITE_LE64(pDst, *pUncomp_size);
+ pDst += sizeof(mz_uint64);
+ field_size += sizeof(mz_uint64);
+ }
+
+ if (pComp_size)
+ {
+ MZ_WRITE_LE64(pDst, *pComp_size);
+ pDst += sizeof(mz_uint64);
+ field_size += sizeof(mz_uint64);
+ }
+
+ if (pLocal_header_ofs)
+ {
+ MZ_WRITE_LE64(pDst, *pLocal_header_ofs);
+ pDst += sizeof(mz_uint64);
+ field_size += sizeof(mz_uint64);
+ }
+
+ MZ_WRITE_LE16(pBuf + 2, field_size);
+
+ return (mz_uint32)(pDst - pBuf);
+}
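+
+/* The field produced above is laid out as a 2-byte header ID (0x0001), a
+   2-byte data size, then up to three 8-byte values - uncompressed size,
+   compressed size, local header offset - with only the values whose 32-bit
+   slots overflowed actually present, in that order, per the zip64 APPNOTE. */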
+
+static mz_bool mz_zip_writer_create_local_dir_header(mz_zip_archive *pZip, mz_uint8 *pDst, mz_uint16 filename_size, mz_uint16 extra_size, mz_uint64 uncomp_size, mz_uint64 comp_size, mz_uint32 uncomp_crc32, mz_uint16 method, mz_uint16 bit_flags, mz_uint16 dos_time, mz_uint16 dos_date)
+{
+ (void)pZip;
+ memset(pDst, 0, MZ_ZIP_LOCAL_DIR_HEADER_SIZE);
+ MZ_WRITE_LE32(pDst + MZ_ZIP_LDH_SIG_OFS, MZ_ZIP_LOCAL_DIR_HEADER_SIG);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_VERSION_NEEDED_OFS, method ? 20 : 0);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_BIT_FLAG_OFS, bit_flags);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_METHOD_OFS, method);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_FILE_TIME_OFS, dos_time);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_FILE_DATE_OFS, dos_date);
+ MZ_WRITE_LE32(pDst + MZ_ZIP_LDH_CRC32_OFS, uncomp_crc32);
+ MZ_WRITE_LE32(pDst + MZ_ZIP_LDH_COMPRESSED_SIZE_OFS, MZ_MIN(comp_size, MZ_UINT32_MAX));
+ MZ_WRITE_LE32(pDst + MZ_ZIP_LDH_DECOMPRESSED_SIZE_OFS, MZ_MIN(uncomp_size, MZ_UINT32_MAX));
+ MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_FILENAME_LEN_OFS, filename_size);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_EXTRA_LEN_OFS, extra_size);
+ return MZ_TRUE;
+}
+
+static mz_bool mz_zip_writer_create_central_dir_header(mz_zip_archive *pZip, mz_uint8 *pDst,
+ mz_uint16 filename_size, mz_uint16 extra_size, mz_uint16 comment_size,
+ mz_uint64 uncomp_size, mz_uint64 comp_size, mz_uint32 uncomp_crc32,
+ mz_uint16 method, mz_uint16 bit_flags, mz_uint16 dos_time, mz_uint16 dos_date,
+ mz_uint64 local_header_ofs, mz_uint32 ext_attributes)
+{
+ (void)pZip;
+ memset(pDst, 0, MZ_ZIP_CENTRAL_DIR_HEADER_SIZE);
+ MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_SIG_OFS, MZ_ZIP_CENTRAL_DIR_HEADER_SIG);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_VERSION_NEEDED_OFS, method ? 20 : 0);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_BIT_FLAG_OFS, bit_flags);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_METHOD_OFS, method);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_FILE_TIME_OFS, dos_time);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_FILE_DATE_OFS, dos_date);
+ MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_CRC32_OFS, uncomp_crc32);
+ MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS, MZ_MIN(comp_size, MZ_UINT32_MAX));
+ MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS, MZ_MIN(uncomp_size, MZ_UINT32_MAX));
+ MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_FILENAME_LEN_OFS, filename_size);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_EXTRA_LEN_OFS, extra_size);
+ MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_COMMENT_LEN_OFS, comment_size);
+ MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_EXTERNAL_ATTR_OFS, ext_attributes);
+ MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_LOCAL_HEADER_OFS, MZ_MIN(local_header_ofs, MZ_UINT32_MAX));
+ return MZ_TRUE;
+}
+
+static mz_bool mz_zip_writer_add_to_central_dir(mz_zip_archive *pZip, const char *pFilename, mz_uint16 filename_size,
+ const void *pExtra, mz_uint16 extra_size, const void *pComment, mz_uint16 comment_size,
+ mz_uint64 uncomp_size, mz_uint64 comp_size, mz_uint32 uncomp_crc32,
+ mz_uint16 method, mz_uint16 bit_flags, mz_uint16 dos_time, mz_uint16 dos_date,
+ mz_uint64 local_header_ofs, mz_uint32 ext_attributes,
+ const char *user_extra_data, mz_uint user_extra_data_len)
+{
+ mz_zip_internal_state *pState = pZip->m_pState;
+ mz_uint32 central_dir_ofs = (mz_uint32)pState->m_central_dir.m_size;
+ size_t orig_central_dir_size = pState->m_central_dir.m_size;
+ mz_uint8 central_dir_header[MZ_ZIP_CENTRAL_DIR_HEADER_SIZE];
+
+ if (!pZip->m_pState->m_zip64)
+ {
+ if (local_header_ofs > 0xFFFFFFFF)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_TOO_LARGE);
+ }
+
+ /* miniz doesn't support central dirs >= MZ_UINT32_MAX bytes yet */
+ if (((mz_uint64)pState->m_central_dir.m_size + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + filename_size + extra_size + user_extra_data_len + comment_size) >= MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_CDIR_SIZE);
+
+ if (!mz_zip_writer_create_central_dir_header(pZip, central_dir_header, filename_size, (mz_uint16)(extra_size + user_extra_data_len), comment_size, uncomp_size, comp_size, uncomp_crc32, method, bit_flags, dos_time, dos_date, local_header_ofs, ext_attributes))
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+
+ if ((!mz_zip_array_push_back(pZip, &pState->m_central_dir, central_dir_header, MZ_ZIP_CENTRAL_DIR_HEADER_SIZE)) ||
+ (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pFilename, filename_size)) ||
+ (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pExtra, extra_size)) ||
+ (!mz_zip_array_push_back(pZip, &pState->m_central_dir, user_extra_data, user_extra_data_len)) ||
+ (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pComment, comment_size)) ||
+ (!mz_zip_array_push_back(pZip, &pState->m_central_dir_offsets, &central_dir_ofs, 1)))
+ {
+ /* Try to resize the central directory array back into its original state. */
+ mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ return MZ_TRUE;
+}
+
+static mz_bool mz_zip_writer_validate_archive_name(const char *pArchive_name)
+{
+ /* Basic ZIP archive filename validity checks: Valid filenames cannot start with a forward slash, cannot contain a drive letter, and cannot use DOS-style backward slashes. */
+ if (*pArchive_name == '/')
+ return MZ_FALSE;
+
+ /* Making sure the name does not contain drive letters or DOS-style backward slashes is the responsibility of the program using miniz. */
+
+ return MZ_TRUE;
+}
+
+static mz_uint mz_zip_writer_compute_padding_needed_for_file_alignment(mz_zip_archive *pZip)
+{
+ mz_uint32 n;
+ if (!pZip->m_file_offset_alignment)
+ return 0;
+ n = (mz_uint32)(pZip->m_archive_size & (pZip->m_file_offset_alignment - 1));
+ return (mz_uint)((pZip->m_file_offset_alignment - n) & (pZip->m_file_offset_alignment - 1));
+}
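+
+/* Worked example: with m_file_offset_alignment == 512 and m_archive_size ==
+   1300, n == 1300 & 511 == 276, so 236 padding bytes are returned and the
+   next local header starts at offset 1536 == 3 * 512. */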
+
+static mz_bool mz_zip_writer_write_zeros(mz_zip_archive *pZip, mz_uint64 cur_file_ofs, mz_uint32 n)
+{
+ char buf[4096];
+ memset(buf, 0, MZ_MIN(sizeof(buf), n));
+ while (n)
+ {
+ mz_uint32 s = MZ_MIN(sizeof(buf), n);
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_file_ofs, buf, s) != s)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_file_ofs += s;
+ n -= s;
+ }
+ return MZ_TRUE;
+}
+
+mz_bool mz_zip_writer_add_mem_ex(mz_zip_archive *pZip, const char *pArchive_name, const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags,
+ mz_uint64 uncomp_size, mz_uint32 uncomp_crc32)
+{
+ return mz_zip_writer_add_mem_ex_v2(pZip, pArchive_name, pBuf, buf_size, pComment, comment_size, level_and_flags, uncomp_size, uncomp_crc32, NULL, NULL, 0, NULL, 0);
+}
+
+mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_name, const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size,
+ mz_uint level_and_flags, mz_uint64 uncomp_size, mz_uint32 uncomp_crc32, MZ_TIME_T *last_modified,
+ const char *user_extra_data, mz_uint user_extra_data_len, const char *user_extra_data_central, mz_uint user_extra_data_central_len)
+{
+ mz_uint16 method = 0, dos_time = 0, dos_date = 0;
+ mz_uint level, ext_attributes = 0, num_alignment_padding_bytes;
+ mz_uint64 local_dir_header_ofs = pZip->m_archive_size, cur_archive_file_ofs = pZip->m_archive_size, comp_size = 0;
+ size_t archive_name_size;
+ mz_uint8 local_dir_header[MZ_ZIP_LOCAL_DIR_HEADER_SIZE];
+ tdefl_compressor *pComp = NULL;
+ mz_bool store_data_uncompressed;
+ mz_zip_internal_state *pState;
+ mz_uint8 *pExtra_data = NULL;
+ mz_uint32 extra_size = 0;
+ mz_uint8 extra_data[MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE];
+ mz_uint16 bit_flags = 0;
+
+ if ((int)level_and_flags < 0)
+ level_and_flags = MZ_DEFAULT_LEVEL;
+
+ if (uncomp_size || (buf_size && !(level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)))
+ bit_flags |= MZ_ZIP_LDH_BIT_FLAG_HAS_LOCATOR;
+
+ if (!(level_and_flags & MZ_ZIP_FLAG_ASCII_FILENAME))
+ bit_flags |= MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_UTF8;
+
+ level = level_and_flags & 0xF;
+ store_data_uncompressed = ((!level) || (level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA));
+
+ if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || ((buf_size) && (!pBuf)) || (!pArchive_name) || ((comment_size) && (!pComment)) || (level > MZ_UBER_COMPRESSION))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ pState = pZip->m_pState;
+
+ if (pState->m_zip64)
+ {
+ if (pZip->m_total_files == MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+ }
+ else
+ {
+ if (pZip->m_total_files == MZ_UINT16_MAX)
+ {
+ pState->m_zip64 = MZ_TRUE;
+ /*return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES); */
+ }
+ if ((buf_size > 0xFFFFFFFF) || (uncomp_size > 0xFFFFFFFF))
+ {
+ pState->m_zip64 = MZ_TRUE;
+ /*return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE); */
+ }
+ }
+
+ if ((!(level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) && (uncomp_size))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (!mz_zip_writer_validate_archive_name(pArchive_name))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_FILENAME);
+
+#ifndef MINIZ_NO_TIME
+ if (last_modified != NULL)
+ {
+ mz_zip_time_t_to_dos_time(*last_modified, &dos_time, &dos_date);
+ }
+ else
+ {
+ MZ_TIME_T cur_time;
+ time(&cur_time);
+ mz_zip_time_t_to_dos_time(cur_time, &dos_time, &dos_date);
+ }
+#endif /* #ifndef MINIZ_NO_TIME */
+
+ if (!(level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA))
+ {
+ uncomp_crc32 = (mz_uint32)mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, buf_size);
+ uncomp_size = buf_size;
+ if (uncomp_size <= 3)
+ {
+ level = 0;
+ store_data_uncompressed = MZ_TRUE;
+ }
+ }
+
+ archive_name_size = strlen(pArchive_name);
+ if (archive_name_size > MZ_UINT16_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_FILENAME);
+
+ num_alignment_padding_bytes = mz_zip_writer_compute_padding_needed_for_file_alignment(pZip);
+
+ /* miniz doesn't support central dirs >= MZ_UINT32_MAX bytes yet */
+ if (((mz_uint64)pState->m_central_dir.m_size + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE + comment_size) >= MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_CDIR_SIZE);
+
+ if (!pState->m_zip64)
+ {
+ /* Bail early if the archive would obviously become too large */
+ if ((pZip->m_archive_size + num_alignment_padding_bytes + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + archive_name_size
+ + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + comment_size + user_extra_data_len +
+ pState->m_central_dir.m_size + MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE + user_extra_data_central_len
+ + MZ_ZIP_DATA_DESCRIPTER_SIZE32) > 0xFFFFFFFF)
+ {
+ pState->m_zip64 = MZ_TRUE;
+ /*return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE); */
+ }
+ }
+
+ if ((archive_name_size) && (pArchive_name[archive_name_size - 1] == '/'))
+ {
+ /* Set DOS Subdirectory attribute bit. */
+ ext_attributes |= MZ_ZIP_DOS_DIR_ATTRIBUTE_BITFLAG;
+
+ /* Subdirectories cannot contain data. */
+ if ((buf_size) || (uncomp_size))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ }
+
+ /* Try to do any allocations before writing to the archive, so if an allocation fails the file remains unmodified. (A good idea if we're doing an in-place modification.) */
+ if ((!mz_zip_array_ensure_room(pZip, &pState->m_central_dir, MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + comment_size + (pState->m_zip64 ? MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE : 0))) || (!mz_zip_array_ensure_room(pZip, &pState->m_central_dir_offsets, 1)))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ if ((!store_data_uncompressed) && (buf_size))
+ {
+ if (NULL == (pComp = (tdefl_compressor *)pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, sizeof(tdefl_compressor))))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ if (!mz_zip_writer_write_zeros(pZip, cur_archive_file_ofs, num_alignment_padding_bytes))
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+ return MZ_FALSE;
+ }
+
+ local_dir_header_ofs += num_alignment_padding_bytes;
+ if (pZip->m_file_offset_alignment)
+ {
+ MZ_ASSERT((local_dir_header_ofs & (pZip->m_file_offset_alignment - 1)) == 0);
+ }
+ cur_archive_file_ofs += num_alignment_padding_bytes;
+
+ MZ_CLEAR_OBJ(local_dir_header);
+
+ if (!store_data_uncompressed || (level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA))
+ {
+ method = MZ_DEFLATED;
+ }
+
+ if (pState->m_zip64)
+ {
+ if (uncomp_size >= MZ_UINT32_MAX || local_dir_header_ofs >= MZ_UINT32_MAX)
+ {
+ pExtra_data = extra_data;
+ extra_size = mz_zip_writer_create_zip64_extra_data(extra_data, (uncomp_size >= MZ_UINT32_MAX) ? &uncomp_size : NULL,
+ (uncomp_size >= MZ_UINT32_MAX) ? &comp_size : NULL, (local_dir_header_ofs >= MZ_UINT32_MAX) ? &local_dir_header_ofs : NULL);
+ }
+
+ if (!mz_zip_writer_create_local_dir_header(pZip, local_dir_header, (mz_uint16)archive_name_size, (mz_uint16)(extra_size + user_extra_data_len), 0, 0, 0, method, bit_flags, dos_time, dos_date))
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, local_dir_header_ofs, local_dir_header, sizeof(local_dir_header)) != sizeof(local_dir_header))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_file_ofs += sizeof(local_dir_header);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pArchive_name, archive_name_size) != archive_name_size)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+ cur_archive_file_ofs += archive_name_size;
+
+ if (pExtra_data != NULL)
+ {
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, extra_data, extra_size) != extra_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_file_ofs += extra_size;
+ }
+ }
+ else
+ {
+ if ((comp_size > MZ_UINT32_MAX) || (cur_archive_file_ofs > MZ_UINT32_MAX))
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+ if (!mz_zip_writer_create_local_dir_header(pZip, local_dir_header, (mz_uint16)archive_name_size, (mz_uint16)user_extra_data_len, 0, 0, 0, method, bit_flags, dos_time, dos_date))
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, local_dir_header_ofs, local_dir_header, sizeof(local_dir_header)) != sizeof(local_dir_header))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_file_ofs += sizeof(local_dir_header);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pArchive_name, archive_name_size) != archive_name_size)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+ cur_archive_file_ofs += archive_name_size;
+ }
+
+ if (user_extra_data_len > 0)
+ {
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, user_extra_data, user_extra_data_len) != user_extra_data_len)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_file_ofs += user_extra_data_len;
+ }
+
+ if (store_data_uncompressed)
+ {
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pBuf, buf_size) != buf_size)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+
+ cur_archive_file_ofs += buf_size;
+ comp_size = buf_size;
+ }
+ else if (buf_size)
+ {
+ mz_zip_writer_add_state state;
+
+ state.m_pZip = pZip;
+ state.m_cur_archive_file_ofs = cur_archive_file_ofs;
+ state.m_comp_size = 0;
+
+ if ((tdefl_init(pComp, mz_zip_writer_add_put_buf_callback, &state, tdefl_create_comp_flags_from_zip_params(level, -15, MZ_DEFAULT_STRATEGY)) != TDEFL_STATUS_OKAY) ||
+ (tdefl_compress_buffer(pComp, pBuf, buf_size, TDEFL_FINISH) != TDEFL_STATUS_DONE))
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+ return mz_zip_set_error(pZip, MZ_ZIP_COMPRESSION_FAILED);
+ }
+
+ comp_size = state.m_comp_size;
+ cur_archive_file_ofs = state.m_cur_archive_file_ofs;
+ }
+
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+ pComp = NULL;
+
+ if (uncomp_size)
+ {
+ mz_uint8 local_dir_footer[MZ_ZIP_DATA_DESCRIPTER_SIZE64];
+ mz_uint32 local_dir_footer_size = MZ_ZIP_DATA_DESCRIPTER_SIZE32;
+
+ MZ_ASSERT(bit_flags & MZ_ZIP_LDH_BIT_FLAG_HAS_LOCATOR);
+
+ MZ_WRITE_LE32(local_dir_footer + 0, MZ_ZIP_DATA_DESCRIPTOR_ID);
+ MZ_WRITE_LE32(local_dir_footer + 4, uncomp_crc32);
+ if (pExtra_data == NULL)
+ {
+ if (comp_size > MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+
+ MZ_WRITE_LE32(local_dir_footer + 8, comp_size);
+ MZ_WRITE_LE32(local_dir_footer + 12, uncomp_size);
+ }
+ else
+ {
+ MZ_WRITE_LE64(local_dir_footer + 8, comp_size);
+ MZ_WRITE_LE64(local_dir_footer + 16, uncomp_size);
+ local_dir_footer_size = MZ_ZIP_DATA_DESCRIPTER_SIZE64;
+ }
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, local_dir_footer, local_dir_footer_size) != local_dir_footer_size)
+ return MZ_FALSE;
+
+ cur_archive_file_ofs += local_dir_footer_size;
+ }
+
+ if (pExtra_data != NULL)
+ {
+ extra_size = mz_zip_writer_create_zip64_extra_data(extra_data, (uncomp_size >= MZ_UINT32_MAX) ? &uncomp_size : NULL,
+ (uncomp_size >= MZ_UINT32_MAX) ? &comp_size : NULL, (local_dir_header_ofs >= MZ_UINT32_MAX) ? &local_dir_header_ofs : NULL);
+ }
+
+ if (!mz_zip_writer_add_to_central_dir(pZip, pArchive_name, (mz_uint16)archive_name_size, pExtra_data, (mz_uint16)extra_size, pComment,
+ comment_size, uncomp_size, comp_size, uncomp_crc32, method, bit_flags, dos_time, dos_date, local_dir_header_ofs, ext_attributes,
+ user_extra_data_central, user_extra_data_central_len))
+ return MZ_FALSE;
+
+ pZip->m_total_files++;
+ pZip->m_archive_size = cur_archive_file_ofs;
+
+ return MZ_TRUE;
+}
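+
+/* Illustrative usage sketch (not part of miniz): adding a buffer with an
+   explicit timestamp and a per-entry comment through the _v2 variant above.
+   The names and values are hypothetical; time()/MZ_TIME_T require
+   MINIZ_NO_TIME to be undefined. */
+#if 0
+static mz_bool example_add_with_metadata(mz_zip_archive *pZip, const void *pBuf, size_t len)
+{
+    MZ_TIME_T mtime = time(NULL);
+    return mz_zip_writer_add_mem_ex_v2(pZip, "data.bin", pBuf, len,
+                                       "imported", 8,          /* comment and its length */
+                                       MZ_DEFAULT_LEVEL, 0, 0, /* level/flags; size and crc unused here */
+                                       &mtime,
+                                       NULL, 0, NULL, 0);      /* no user extra data */
+}
+#endif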
+
+mz_bool mz_zip_writer_add_read_buf_callback(mz_zip_archive *pZip, const char *pArchive_name, mz_file_read_func read_callback, void* callback_opaque, mz_uint64 max_size, const MZ_TIME_T *pFile_time, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags,
+ const char *user_extra_data, mz_uint user_extra_data_len, const char *user_extra_data_central, mz_uint user_extra_data_central_len)
+{
+ mz_uint16 gen_flags = (level_and_flags & MZ_ZIP_FLAG_WRITE_HEADER_SET_SIZE) ? 0 : MZ_ZIP_LDH_BIT_FLAG_HAS_LOCATOR;
+ mz_uint uncomp_crc32 = MZ_CRC32_INIT, level, num_alignment_padding_bytes;
+ mz_uint16 method = 0, dos_time = 0, dos_date = 0, ext_attributes = 0;
+ mz_uint64 local_dir_header_ofs, cur_archive_file_ofs = pZip->m_archive_size, uncomp_size = 0, comp_size = 0;
+ size_t archive_name_size;
+ mz_uint8 local_dir_header[MZ_ZIP_LOCAL_DIR_HEADER_SIZE];
+ mz_uint8 *pExtra_data = NULL;
+ mz_uint32 extra_size = 0;
+ mz_uint8 extra_data[MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE];
+ mz_zip_internal_state *pState;
+ mz_uint64 file_ofs = 0, cur_archive_header_file_ofs;
+
+ if (!(level_and_flags & MZ_ZIP_FLAG_ASCII_FILENAME))
+ gen_flags |= MZ_ZIP_GENERAL_PURPOSE_BIT_FLAG_UTF8;
+
+ if ((int)level_and_flags < 0)
+ level_and_flags = MZ_DEFAULT_LEVEL;
+ level = level_and_flags & 0xF;
+
+ /* Sanity checks */
+ if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || (!pArchive_name) || ((comment_size) && (!pComment)) || (level > MZ_UBER_COMPRESSION))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ pState = pZip->m_pState;
+
+ if ((!pState->m_zip64) && (max_size > MZ_UINT32_MAX))
+ {
+ /* Source file is too large for non-zip64 */
+ /*return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE); */
+ pState->m_zip64 = MZ_TRUE;
+ }
+
+ /* We could support this, but why? */
+ if (level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (!mz_zip_writer_validate_archive_name(pArchive_name))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_FILENAME);
+
+ if (pState->m_zip64)
+ {
+ if (pZip->m_total_files == MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+ }
+ else
+ {
+ if (pZip->m_total_files == MZ_UINT16_MAX)
+ {
+ pState->m_zip64 = MZ_TRUE;
+ /*return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES); */
+ }
+ }
+
+ archive_name_size = strlen(pArchive_name);
+ if (archive_name_size > MZ_UINT16_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_FILENAME);
+
+ num_alignment_padding_bytes = mz_zip_writer_compute_padding_needed_for_file_alignment(pZip);
+
+ /* miniz doesn't support central dirs >= MZ_UINT32_MAX bytes yet */
+ if (((mz_uint64)pState->m_central_dir.m_size + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE + comment_size) >= MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_CDIR_SIZE);
+
+ if (!pState->m_zip64)
+ {
+ /* Bail early if the archive would obviously become too large */
+ if ((pZip->m_archive_size + num_alignment_padding_bytes + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + archive_name_size + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE
+ + archive_name_size + comment_size + user_extra_data_len + pState->m_central_dir.m_size + MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE + 1024
+ + MZ_ZIP_DATA_DESCRIPTER_SIZE32 + user_extra_data_central_len) > 0xFFFFFFFF)
+ {
+ pState->m_zip64 = MZ_TRUE;
+ /*return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE); */
+ }
+ }
+
+#ifndef MINIZ_NO_TIME
+ if (pFile_time)
+ {
+ mz_zip_time_t_to_dos_time(*pFile_time, &dos_time, &dos_date);
+ }
+#endif
+
+ if (max_size <= 3)
+ level = 0;
+
+ if (!mz_zip_writer_write_zeros(pZip, cur_archive_file_ofs, num_alignment_padding_bytes))
+ {
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+
+ cur_archive_file_ofs += num_alignment_padding_bytes;
+ local_dir_header_ofs = cur_archive_file_ofs;
+
+ if (pZip->m_file_offset_alignment)
+ {
+ MZ_ASSERT((cur_archive_file_ofs & (pZip->m_file_offset_alignment - 1)) == 0);
+ }
+
+ if (max_size && level)
+ {
+ method = MZ_DEFLATED;
+ }
+
+ MZ_CLEAR_OBJ(local_dir_header);
+ if (pState->m_zip64)
+ {
+ if (max_size >= MZ_UINT32_MAX || local_dir_header_ofs >= MZ_UINT32_MAX)
+ {
+ pExtra_data = extra_data;
+ if (level_and_flags & MZ_ZIP_FLAG_WRITE_HEADER_SET_SIZE)
+ extra_size = mz_zip_writer_create_zip64_extra_data(extra_data, (max_size >= MZ_UINT32_MAX) ? &uncomp_size : NULL,
+ (max_size >= MZ_UINT32_MAX) ? &comp_size : NULL,
+ (local_dir_header_ofs >= MZ_UINT32_MAX) ? &local_dir_header_ofs : NULL);
+ else
+ extra_size = mz_zip_writer_create_zip64_extra_data(extra_data, NULL,
+ NULL,
+ (local_dir_header_ofs >= MZ_UINT32_MAX) ? &local_dir_header_ofs : NULL);
+ }
+
+ if (!mz_zip_writer_create_local_dir_header(pZip, local_dir_header, (mz_uint16)archive_name_size, (mz_uint16)(extra_size + user_extra_data_len), 0, 0, 0, method, gen_flags, dos_time, dos_date))
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, local_dir_header, sizeof(local_dir_header)) != sizeof(local_dir_header))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_file_ofs += sizeof(local_dir_header);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pArchive_name, archive_name_size) != archive_name_size)
+ {
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+
+ cur_archive_file_ofs += archive_name_size;
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, extra_data, extra_size) != extra_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_file_ofs += extra_size;
+ }
+ else
+ {
+ if ((comp_size > MZ_UINT32_MAX) || (cur_archive_file_ofs > MZ_UINT32_MAX))
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+ if (!mz_zip_writer_create_local_dir_header(pZip, local_dir_header, (mz_uint16)archive_name_size, (mz_uint16)user_extra_data_len, 0, 0, 0, method, gen_flags, dos_time, dos_date))
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, local_dir_header, sizeof(local_dir_header)) != sizeof(local_dir_header))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_file_ofs += sizeof(local_dir_header);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pArchive_name, archive_name_size) != archive_name_size)
+ {
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+
+ cur_archive_file_ofs += archive_name_size;
+ }
+
+ if (user_extra_data_len > 0)
+ {
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, user_extra_data, user_extra_data_len) != user_extra_data_len)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_file_ofs += user_extra_data_len;
+ }
+
+ if (max_size)
+ {
+ void *pRead_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, MZ_ZIP_MAX_IO_BUF_SIZE);
+ if (!pRead_buf)
+ {
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ if (!level)
+ {
+ while (1)
+ {
+ size_t n = read_callback(callback_opaque, file_ofs, pRead_buf, MZ_ZIP_MAX_IO_BUF_SIZE);
+ if (n == 0)
+ break;
+
+ if ((n > MZ_ZIP_MAX_IO_BUF_SIZE) || (file_ofs + n > max_size))
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ }
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pRead_buf, n) != n)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+ file_ofs += n;
+ uncomp_crc32 = (mz_uint32)mz_crc32(uncomp_crc32, (const mz_uint8 *)pRead_buf, n);
+ cur_archive_file_ofs += n;
+ }
+ uncomp_size = file_ofs;
+ comp_size = uncomp_size;
+ }
+ else
+ {
+ mz_bool result = MZ_FALSE;
+ mz_zip_writer_add_state state;
+ tdefl_compressor *pComp = (tdefl_compressor *)pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, sizeof(tdefl_compressor));
+ if (!pComp)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ state.m_pZip = pZip;
+ state.m_cur_archive_file_ofs = cur_archive_file_ofs;
+ state.m_comp_size = 0;
+
+ if (tdefl_init(pComp, mz_zip_writer_add_put_buf_callback, &state, tdefl_create_comp_flags_from_zip_params(level, -15, MZ_DEFAULT_STRATEGY)) != TDEFL_STATUS_OKAY)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+ }
+
+ for (;;)
+ {
+ tdefl_status status;
+ tdefl_flush flush = TDEFL_NO_FLUSH;
+
+ size_t n = read_callback(callback_opaque, file_ofs, pRead_buf, MZ_ZIP_MAX_IO_BUF_SIZE);
+ if ((n > MZ_ZIP_MAX_IO_BUF_SIZE) || (file_ofs + n > max_size))
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ break;
+ }
+
+ file_ofs += n;
+ uncomp_crc32 = (mz_uint32)mz_crc32(uncomp_crc32, (const mz_uint8 *)pRead_buf, n);
+
+ if (pZip->m_pNeeds_keepalive != NULL && pZip->m_pNeeds_keepalive(pZip->m_pIO_opaque))
+ flush = TDEFL_FULL_FLUSH;
+
+ if (n == 0)
+ flush = TDEFL_FINISH;
+
+ status = tdefl_compress_buffer(pComp, pRead_buf, n, flush);
+ if (status == TDEFL_STATUS_DONE)
+ {
+ result = MZ_TRUE;
+ break;
+ }
+ else if (status != TDEFL_STATUS_OKAY)
+ {
+ mz_zip_set_error(pZip, MZ_ZIP_COMPRESSION_FAILED);
+ break;
+ }
+ }
+
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+
+ if (!result)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+ return MZ_FALSE;
+ }
+
+ uncomp_size = file_ofs;
+ comp_size = state.m_comp_size;
+ cur_archive_file_ofs = state.m_cur_archive_file_ofs;
+ }
+
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+ }
+
+ if (!(level_and_flags & MZ_ZIP_FLAG_WRITE_HEADER_SET_SIZE))
+ {
+ mz_uint8 local_dir_footer[MZ_ZIP_DATA_DESCRIPTER_SIZE64];
+ mz_uint32 local_dir_footer_size = MZ_ZIP_DATA_DESCRIPTER_SIZE32;
+
+ MZ_WRITE_LE32(local_dir_footer + 0, MZ_ZIP_DATA_DESCRIPTOR_ID);
+ MZ_WRITE_LE32(local_dir_footer + 4, uncomp_crc32);
+ if (pExtra_data == NULL)
+ {
+ if (comp_size > MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+
+ MZ_WRITE_LE32(local_dir_footer + 8, comp_size);
+ MZ_WRITE_LE32(local_dir_footer + 12, uncomp_size);
+ }
+ else
+ {
+ MZ_WRITE_LE64(local_dir_footer + 8, comp_size);
+ MZ_WRITE_LE64(local_dir_footer + 16, uncomp_size);
+ local_dir_footer_size = MZ_ZIP_DATA_DESCRIPTER_SIZE64;
+ }
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, local_dir_footer, local_dir_footer_size) != local_dir_footer_size)
+ return MZ_FALSE;
+
+ cur_archive_file_ofs += local_dir_footer_size;
+ }
+
+ if (level_and_flags & MZ_ZIP_FLAG_WRITE_HEADER_SET_SIZE)
+ {
+ if (pExtra_data != NULL)
+ {
+ extra_size = mz_zip_writer_create_zip64_extra_data(extra_data, (max_size >= MZ_UINT32_MAX) ? &uncomp_size : NULL,
+ (max_size >= MZ_UINT32_MAX) ? &comp_size : NULL, (local_dir_header_ofs >= MZ_UINT32_MAX) ? &local_dir_header_ofs : NULL);
+ }
+
+ if (!mz_zip_writer_create_local_dir_header(pZip, local_dir_header,
+ (mz_uint16)archive_name_size, (mz_uint16)(extra_size + user_extra_data_len),
+ (max_size >= MZ_UINT32_MAX) ? MZ_UINT32_MAX : uncomp_size,
+ (max_size >= MZ_UINT32_MAX) ? MZ_UINT32_MAX : comp_size,
+ uncomp_crc32, method, gen_flags, dos_time, dos_date))
+ return mz_zip_set_error(pZip, MZ_ZIP_INTERNAL_ERROR);
+
+ cur_archive_header_file_ofs = local_dir_header_ofs;
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_header_file_ofs, local_dir_header, sizeof(local_dir_header)) != sizeof(local_dir_header))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ if (pExtra_data != NULL)
+ {
+ cur_archive_header_file_ofs += sizeof(local_dir_header);
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_header_file_ofs, pArchive_name, archive_name_size) != archive_name_size)
+ {
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+
+ cur_archive_header_file_ofs += archive_name_size;
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_header_file_ofs, extra_data, extra_size) != extra_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_archive_header_file_ofs += extra_size;
+ }
+ }
+
+ if (pExtra_data != NULL)
+ {
+ extra_size = mz_zip_writer_create_zip64_extra_data(extra_data, (uncomp_size >= MZ_UINT32_MAX) ? &uncomp_size : NULL,
+ (uncomp_size >= MZ_UINT32_MAX) ? &comp_size : NULL, (local_dir_header_ofs >= MZ_UINT32_MAX) ? &local_dir_header_ofs : NULL);
+ }
+
+ if (!mz_zip_writer_add_to_central_dir(pZip, pArchive_name, (mz_uint16)archive_name_size, pExtra_data, (mz_uint16)extra_size, pComment, comment_size,
+ uncomp_size, comp_size, uncomp_crc32, method, gen_flags, dos_time, dos_date, local_dir_header_ofs, ext_attributes,
+ user_extra_data_central, user_extra_data_central_len))
+ return MZ_FALSE;
+
+ pZip->m_total_files++;
+ pZip->m_archive_size = cur_archive_file_ofs;
+
+ return MZ_TRUE;
+}
+
+#ifndef MINIZ_NO_STDIO
+
+static size_t mz_file_read_func_stdio(void *pOpaque, mz_uint64 file_ofs, void *pBuf, size_t n)
+{
+ MZ_FILE *pSrc_file = (MZ_FILE *)pOpaque;
+ mz_int64 cur_ofs = MZ_FTELL64(pSrc_file);
+
+ if (((mz_int64)file_ofs < 0) || (((cur_ofs != (mz_int64)file_ofs)) && (MZ_FSEEK64(pSrc_file, (mz_int64)file_ofs, SEEK_SET))))
+ return 0;
+
+ return MZ_FREAD(pBuf, 1, n, pSrc_file);
+}
+
+mz_bool mz_zip_writer_add_cfile(mz_zip_archive *pZip, const char *pArchive_name, MZ_FILE *pSrc_file, mz_uint64 max_size, const MZ_TIME_T *pFile_time, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags,
+ const char *user_extra_data, mz_uint user_extra_data_len, const char *user_extra_data_central, mz_uint user_extra_data_central_len)
+{
+ return mz_zip_writer_add_read_buf_callback(pZip, pArchive_name, mz_file_read_func_stdio, pSrc_file, max_size, pFile_time, pComment, comment_size, level_and_flags,
+ user_extra_data, user_extra_data_len, user_extra_data_central, user_extra_data_central_len);
+}
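+
+/* Editor's illustration (not part of miniz): the read callback lets
+   mz_zip_writer_add_read_buf_callback() pull entry data from any source. This hypothetical
+   sketch streams an in-memory buffer into an entry; mem_src_t, mem_read_func, example_add_buf
+   and "blob.bin" are made-up names. */
+#if 0
+typedef struct { const unsigned char *p; size_t size; } mem_src_t;
+
+static size_t mem_read_func(void *pOpaque, mz_uint64 file_ofs, void *pBuf, size_t n)
+{
+    mem_src_t *pSrc = (mem_src_t *)pOpaque;
+    size_t avail = (file_ofs < pSrc->size) ? (size_t)(pSrc->size - file_ofs) : 0;
+    size_t k = (n < avail) ? n : avail;
+    memcpy(pBuf, pSrc->p + (size_t)file_ofs, k);
+    return k;
+}
+
+static mz_bool example_add_buf(mz_zip_archive *pZip, const unsigned char *pData, size_t data_size)
+{
+    mem_src_t src;
+    src.p = pData;
+    src.size = data_size;
+    return mz_zip_writer_add_read_buf_callback(pZip, "blob.bin", mem_read_func, &src,
+        (mz_uint64)data_size, NULL, NULL, 0, (mz_uint)MZ_DEFAULT_LEVEL, NULL, 0, NULL, 0);
+}
+#endif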
+
+mz_bool mz_zip_writer_add_file(mz_zip_archive *pZip, const char *pArchive_name, const char *pSrc_filename, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags)
+{
+ MZ_FILE *pSrc_file = NULL;
+ mz_uint64 uncomp_size = 0;
+ MZ_TIME_T file_modified_time;
+ MZ_TIME_T *pFile_time = NULL;
+ mz_bool status;
+
+ memset(&file_modified_time, 0, sizeof(file_modified_time));
+
+#if !defined(MINIZ_NO_TIME) && !defined(MINIZ_NO_STDIO)
+ pFile_time = &file_modified_time;
+ if (!mz_zip_get_file_modified_time(pSrc_filename, &file_modified_time))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_STAT_FAILED);
+#endif
+
+ pSrc_file = MZ_FOPEN(pSrc_filename, "rb");
+ if (!pSrc_file)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_OPEN_FAILED);
+
+ MZ_FSEEK64(pSrc_file, 0, SEEK_END);
+ uncomp_size = MZ_FTELL64(pSrc_file);
+ MZ_FSEEK64(pSrc_file, 0, SEEK_SET);
+
+ status = mz_zip_writer_add_cfile(pZip, pArchive_name, pSrc_file, uncomp_size, pFile_time, pComment, comment_size, level_and_flags, NULL, 0, NULL, 0);
+
+ MZ_FCLOSE(pSrc_file);
+
+ return status;
+}
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+static mz_bool mz_zip_writer_update_zip64_extension_block(mz_zip_array *pNew_ext, mz_zip_archive *pZip, const mz_uint8 *pExt, uint32_t ext_len, mz_uint64 *pComp_size, mz_uint64 *pUncomp_size, mz_uint64 *pLocal_header_ofs, mz_uint32 *pDisk_start)
+{
+ /* + 64 should be enough for any new zip64 data */
+ if (!mz_zip_array_reserve(pZip, pNew_ext, ext_len + 64, MZ_FALSE))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ mz_zip_array_resize(pZip, pNew_ext, 0, MZ_FALSE);
+
+ if ((pUncomp_size) || (pComp_size) || (pLocal_header_ofs) || (pDisk_start))
+ {
+ mz_uint8 new_ext_block[64];
+ mz_uint8 *pDst = new_ext_block;
+ mz_write_le16(pDst, MZ_ZIP64_EXTENDED_INFORMATION_FIELD_HEADER_ID);
+ mz_write_le16(pDst + sizeof(mz_uint16), 0);
+ pDst += sizeof(mz_uint16) * 2;
+
+ if (pUncomp_size)
+ {
+ mz_write_le64(pDst, *pUncomp_size);
+ pDst += sizeof(mz_uint64);
+ }
+
+ if (pComp_size)
+ {
+ mz_write_le64(pDst, *pComp_size);
+ pDst += sizeof(mz_uint64);
+ }
+
+ if (pLocal_header_ofs)
+ {
+ mz_write_le64(pDst, *pLocal_header_ofs);
+ pDst += sizeof(mz_uint64);
+ }
+
+ if (pDisk_start)
+ {
+ mz_write_le32(pDst, *pDisk_start);
+ pDst += sizeof(mz_uint32);
+ }
+
+ mz_write_le16(new_ext_block + sizeof(mz_uint16), (mz_uint16)((pDst - new_ext_block) - sizeof(mz_uint16) * 2));
+
+ if (!mz_zip_array_push_back(pZip, pNew_ext, new_ext_block, pDst - new_ext_block))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ if ((pExt) && (ext_len))
+ {
+ mz_uint32 extra_size_remaining = ext_len;
+ const mz_uint8 *pExtra_data = pExt;
+
+ do
+ {
+ mz_uint32 field_id, field_data_size, field_total_size;
+
+ if (extra_size_remaining < (sizeof(mz_uint16) * 2))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ field_id = MZ_READ_LE16(pExtra_data);
+ field_data_size = MZ_READ_LE16(pExtra_data + sizeof(mz_uint16));
+ field_total_size = field_data_size + sizeof(mz_uint16) * 2;
+
+ if (field_total_size > extra_size_remaining)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ if (field_id != MZ_ZIP64_EXTENDED_INFORMATION_FIELD_HEADER_ID)
+ {
+ if (!mz_zip_array_push_back(pZip, pNew_ext, pExtra_data, field_total_size))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ pExtra_data += field_total_size;
+ extra_size_remaining -= field_total_size;
+ } while (extra_size_remaining);
+ }
+
+ return MZ_TRUE;
+}
+
+/* TODO: This func is now pretty freakin complex due to zip64, split it up? */
+mz_bool mz_zip_writer_add_from_zip_reader(mz_zip_archive *pZip, mz_zip_archive *pSource_zip, mz_uint src_file_index)
+{
+ mz_uint n, bit_flags, num_alignment_padding_bytes, src_central_dir_following_data_size;
+ mz_uint64 src_archive_bytes_remaining, local_dir_header_ofs;
+ mz_uint64 cur_src_file_ofs, cur_dst_file_ofs;
+ mz_uint32 local_header_u32[(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) / sizeof(mz_uint32)];
+ mz_uint8 *pLocal_header = (mz_uint8 *)local_header_u32;
+ mz_uint8 new_central_header[MZ_ZIP_CENTRAL_DIR_HEADER_SIZE];
+ size_t orig_central_dir_size;
+ mz_zip_internal_state *pState;
+ void *pBuf;
+ const mz_uint8 *pSrc_central_header;
+ mz_zip_archive_file_stat src_file_stat;
+ mz_uint32 src_filename_len, src_comment_len, src_ext_len;
+ mz_uint32 local_header_filename_size, local_header_extra_len;
+ mz_uint64 local_header_comp_size, local_header_uncomp_size;
+ mz_bool found_zip64_ext_data_in_ldir = MZ_FALSE;
+
+ /* Sanity checks */
+ if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || (!pSource_zip->m_pRead))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ pState = pZip->m_pState;
+
+ /* Don't support copying files from zip64 archives to non-zip64, even though in some cases this is possible */
+ if ((pSource_zip->m_pState->m_zip64) && (!pZip->m_pState->m_zip64))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ /* Get pointer to the source central dir header and crack it */
+ if (NULL == (pSrc_central_header = mz_zip_get_cdh(pSource_zip, src_file_index)))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (MZ_READ_LE32(pSrc_central_header + MZ_ZIP_CDH_SIG_OFS) != MZ_ZIP_CENTRAL_DIR_HEADER_SIG)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ src_filename_len = MZ_READ_LE16(pSrc_central_header + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+ src_comment_len = MZ_READ_LE16(pSrc_central_header + MZ_ZIP_CDH_COMMENT_LEN_OFS);
+ src_ext_len = MZ_READ_LE16(pSrc_central_header + MZ_ZIP_CDH_EXTRA_LEN_OFS);
+ src_central_dir_following_data_size = src_filename_len + src_ext_len + src_comment_len;
+
+ /* TODO: We don't support central dir's >= MZ_UINT32_MAX bytes right now (+32 fudge factor in case we need to add more extra data) */
+ if ((pState->m_central_dir.m_size + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + src_central_dir_following_data_size + 32) >= MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_CDIR_SIZE);
+
+ num_alignment_padding_bytes = mz_zip_writer_compute_padding_needed_for_file_alignment(pZip);
+
+ if (!pState->m_zip64)
+ {
+ if (pZip->m_total_files == MZ_UINT16_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+ }
+ else
+ {
+ /* TODO: Our zip64 support still has some 32-bit limits that may not be worth fixing. */
+ if (pZip->m_total_files == MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+ }
+
+ if (!mz_zip_file_stat_internal(pSource_zip, src_file_index, pSrc_central_header, &src_file_stat, NULL))
+ return MZ_FALSE;
+
+ cur_src_file_ofs = src_file_stat.m_local_header_ofs;
+ cur_dst_file_ofs = pZip->m_archive_size;
+
+ /* Read the source archive's local dir header */
+ if (pSource_zip->m_pRead(pSource_zip->m_pIO_opaque, cur_src_file_ofs, pLocal_header, MZ_ZIP_LOCAL_DIR_HEADER_SIZE) != MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+
+ if (MZ_READ_LE32(pLocal_header) != MZ_ZIP_LOCAL_DIR_HEADER_SIG)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+
+ cur_src_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE;
+
+ /* Compute the total size we need to copy (filename+extra data+compressed data) */
+ local_header_filename_size = MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_FILENAME_LEN_OFS);
+ local_header_extra_len = MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_EXTRA_LEN_OFS);
+ local_header_comp_size = MZ_READ_LE32(pLocal_header + MZ_ZIP_LDH_COMPRESSED_SIZE_OFS);
+ local_header_uncomp_size = MZ_READ_LE32(pLocal_header + MZ_ZIP_LDH_DECOMPRESSED_SIZE_OFS);
+ src_archive_bytes_remaining = local_header_filename_size + local_header_extra_len + src_file_stat.m_comp_size;
+
+ /* Try to find a zip64 extended information field */
+ if ((local_header_extra_len) && ((local_header_comp_size == MZ_UINT32_MAX) || (local_header_uncomp_size == MZ_UINT32_MAX)))
+ {
+ mz_zip_array file_data_array;
+ const mz_uint8 *pExtra_data;
+ mz_uint32 extra_size_remaining = local_header_extra_len;
+
+ mz_zip_array_init(&file_data_array, 1);
+ if (!mz_zip_array_resize(pZip, &file_data_array, local_header_extra_len, MZ_FALSE))
+ {
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ if (pSource_zip->m_pRead(pSource_zip->m_pIO_opaque, src_file_stat.m_local_header_ofs + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + local_header_filename_size, file_data_array.m_p, local_header_extra_len) != local_header_extra_len)
+ {
+ mz_zip_array_clear(pZip, &file_data_array);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ }
+
+ pExtra_data = (const mz_uint8 *)file_data_array.m_p;
+
+ do
+ {
+ mz_uint32 field_id, field_data_size, field_total_size;
+
+ if (extra_size_remaining < (sizeof(mz_uint16) * 2))
+ {
+ mz_zip_array_clear(pZip, &file_data_array);
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ }
+
+ field_id = MZ_READ_LE16(pExtra_data);
+ field_data_size = MZ_READ_LE16(pExtra_data + sizeof(mz_uint16));
+ field_total_size = field_data_size + sizeof(mz_uint16) * 2;
+
+ if (field_total_size > extra_size_remaining)
+ {
+ mz_zip_array_clear(pZip, &file_data_array);
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ }
+
+ if (field_id == MZ_ZIP64_EXTENDED_INFORMATION_FIELD_HEADER_ID)
+ {
+ const mz_uint8 *pSrc_field_data = pExtra_data + sizeof(mz_uint32);
+
+ if (field_data_size < sizeof(mz_uint64) * 2)
+ {
+ mz_zip_array_clear(pZip, &file_data_array);
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_HEADER_OR_CORRUPTED);
+ }
+
+ local_header_uncomp_size = MZ_READ_LE64(pSrc_field_data);
+ local_header_comp_size = MZ_READ_LE64(pSrc_field_data + sizeof(mz_uint64)); /* may be 0 if there's a descriptor */
+
+ found_zip64_ext_data_in_ldir = MZ_TRUE;
+ break;
+ }
+
+ pExtra_data += field_total_size;
+ extra_size_remaining -= field_total_size;
+ } while (extra_size_remaining);
+
+ mz_zip_array_clear(pZip, &file_data_array);
+ }
+
+ if (!pState->m_zip64)
+ {
+ /* Try to detect if the new archive will most likely wind up too big and bail early (+(sizeof(mz_uint32) * 4) is for the optional descriptor which could be present, +64 is a fudge factor). */
+ /* We also check when the archive is finalized so this doesn't need to be perfect. */
+ mz_uint64 approx_new_archive_size = cur_dst_file_ofs + num_alignment_padding_bytes + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + src_archive_bytes_remaining + (sizeof(mz_uint32) * 4) +
+ pState->m_central_dir.m_size + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + src_central_dir_following_data_size + MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE + 64;
+
+ if (approx_new_archive_size >= MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+ }
+
+ /* Write dest archive padding */
+ if (!mz_zip_writer_write_zeros(pZip, cur_dst_file_ofs, num_alignment_padding_bytes))
+ return MZ_FALSE;
+
+ cur_dst_file_ofs += num_alignment_padding_bytes;
+
+ local_dir_header_ofs = cur_dst_file_ofs;
+ if (pZip->m_file_offset_alignment)
+ {
+ MZ_ASSERT((local_dir_header_ofs & (pZip->m_file_offset_alignment - 1)) == 0);
+ }
+
+ /* The original zip's local header+ext block doesn't change, even with zip64, so we can just copy it over to the dest zip */
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_dst_file_ofs, pLocal_header, MZ_ZIP_LOCAL_DIR_HEADER_SIZE) != MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ cur_dst_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE;
+
+ /* Copy over the source archive bytes to the dest archive, also ensure we have enough buf space to handle optional data descriptor */
+ if (NULL == (pBuf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, (size_t)MZ_MAX(32U, MZ_MIN((mz_uint64)MZ_ZIP_MAX_IO_BUF_SIZE, src_archive_bytes_remaining)))))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ while (src_archive_bytes_remaining)
+ {
+ n = (mz_uint)MZ_MIN((mz_uint64)MZ_ZIP_MAX_IO_BUF_SIZE, src_archive_bytes_remaining);
+ if (pSource_zip->m_pRead(pSource_zip->m_pIO_opaque, cur_src_file_ofs, pBuf, n) != n)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ }
+ cur_src_file_ofs += n;
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_dst_file_ofs, pBuf, n) != n)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+ cur_dst_file_ofs += n;
+
+ src_archive_bytes_remaining -= n;
+ }
+
+ /* Now deal with the optional data descriptor */
+ bit_flags = MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_BIT_FLAG_OFS);
+ if (bit_flags & 8)
+ {
+ /* Copy data descriptor */
+ if ((pSource_zip->m_pState->m_zip64) || (found_zip64_ext_data_in_ldir))
+ {
+ /* src is zip64, dest must be zip64 */
+
+ /* name          uint32_t's */
+ /* id            1 (optional in zip64?) */
+ /* crc           1 */
+ /* comp_size     2 */
+ /* uncomp_size   2 */
+ if (pSource_zip->m_pRead(pSource_zip->m_pIO_opaque, cur_src_file_ofs, pBuf, (sizeof(mz_uint32) * 6)) != (sizeof(mz_uint32) * 6))
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ }
+
+ n = sizeof(mz_uint32) * ((MZ_READ_LE32(pBuf) == MZ_ZIP_DATA_DESCRIPTOR_ID) ? 6 : 5);
+ }
+ else
+ {
+ /* src is NOT zip64 */
+ mz_bool has_id;
+
+ if (pSource_zip->m_pRead(pSource_zip->m_pIO_opaque, cur_src_file_ofs, pBuf, sizeof(mz_uint32) * 4) != sizeof(mz_uint32) * 4)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_READ_FAILED);
+ }
+
+ has_id = (MZ_READ_LE32(pBuf) == MZ_ZIP_DATA_DESCRIPTOR_ID);
+
+ if (pZip->m_pState->m_zip64)
+ {
+ /* dest is zip64, so upgrade the data descriptor */
+ const mz_uint32 *pSrc_descriptor = (const mz_uint32 *)((const mz_uint8 *)pBuf + (has_id ? sizeof(mz_uint32) : 0));
+ const mz_uint32 src_crc32 = pSrc_descriptor[0];
+ const mz_uint64 src_comp_size = pSrc_descriptor[1];
+ const mz_uint64 src_uncomp_size = pSrc_descriptor[2];
+
+ mz_write_le32((mz_uint8 *)pBuf, MZ_ZIP_DATA_DESCRIPTOR_ID);
+ mz_write_le32((mz_uint8 *)pBuf + sizeof(mz_uint32) * 1, src_crc32);
+ mz_write_le64((mz_uint8 *)pBuf + sizeof(mz_uint32) * 2, src_comp_size);
+ mz_write_le64((mz_uint8 *)pBuf + sizeof(mz_uint32) * 4, src_uncomp_size);
+
+ n = sizeof(mz_uint32) * 6;
+ }
+ else
+ {
+ /* dest is NOT zip64, just copy it as-is */
+ n = sizeof(mz_uint32) * (has_id ? 4 : 3);
+ }
+ }
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_dst_file_ofs, pBuf, n) != n)
+ {
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+ }
+
+ cur_src_file_ofs += n;
+ cur_dst_file_ofs += n;
+ }
+ pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+
+ /* Finally, add the new central dir header */
+ orig_central_dir_size = pState->m_central_dir.m_size;
+
+ memcpy(new_central_header, pSrc_central_header, MZ_ZIP_CENTRAL_DIR_HEADER_SIZE);
+
+ if (pState->m_zip64)
+ {
+ /* This is the painful part: We need to write a new central dir header + ext block with updated zip64 fields, and ensure the old fields (if any) are not included. */
+ const mz_uint8 *pSrc_ext = pSrc_central_header + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + src_filename_len;
+ mz_zip_array new_ext_block;
+
+ mz_zip_array_init(&new_ext_block, sizeof(mz_uint8));
+
+ MZ_WRITE_LE32(new_central_header + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS, MZ_UINT32_MAX);
+ MZ_WRITE_LE32(new_central_header + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS, MZ_UINT32_MAX);
+ MZ_WRITE_LE32(new_central_header + MZ_ZIP_CDH_LOCAL_HEADER_OFS, MZ_UINT32_MAX);
+
+ if (!mz_zip_writer_update_zip64_extension_block(&new_ext_block, pZip, pSrc_ext, src_ext_len, &src_file_stat.m_comp_size, &src_file_stat.m_uncomp_size, &local_dir_header_ofs, NULL))
+ {
+ mz_zip_array_clear(pZip, &new_ext_block);
+ return MZ_FALSE;
+ }
+
+ MZ_WRITE_LE16(new_central_header + MZ_ZIP_CDH_EXTRA_LEN_OFS, new_ext_block.m_size);
+
+ if (!mz_zip_array_push_back(pZip, &pState->m_central_dir, new_central_header, MZ_ZIP_CENTRAL_DIR_HEADER_SIZE))
+ {
+ mz_zip_array_clear(pZip, &new_ext_block);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ if (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pSrc_central_header + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE, src_filename_len))
+ {
+ mz_zip_array_clear(pZip, &new_ext_block);
+ mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ if (!mz_zip_array_push_back(pZip, &pState->m_central_dir, new_ext_block.m_p, new_ext_block.m_size))
+ {
+ mz_zip_array_clear(pZip, &new_ext_block);
+ mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ if (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pSrc_central_header + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + src_filename_len + src_ext_len, src_comment_len))
+ {
+ mz_zip_array_clear(pZip, &new_ext_block);
+ mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ mz_zip_array_clear(pZip, &new_ext_block);
+ }
+ else
+ {
+ /* sanity checks */
+ if (cur_dst_file_ofs > MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+
+ if (local_dir_header_ofs >= MZ_UINT32_MAX)
+ return mz_zip_set_error(pZip, MZ_ZIP_ARCHIVE_TOO_LARGE);
+
+ MZ_WRITE_LE32(new_central_header + MZ_ZIP_CDH_LOCAL_HEADER_OFS, local_dir_header_ofs);
+
+ if (!mz_zip_array_push_back(pZip, &pState->m_central_dir, new_central_header, MZ_ZIP_CENTRAL_DIR_HEADER_SIZE))
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+
+ if (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pSrc_central_header + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE, src_central_dir_following_data_size))
+ {
+ mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+ }
+
+ /* This shouldn't trigger unless we screwed up during the initial sanity checks */
+ if (pState->m_central_dir.m_size >= MZ_UINT32_MAX)
+ {
+ /* TODO: Support central dirs >= 32-bits in size */
+ mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_UNSUPPORTED_CDIR_SIZE);
+ }
+
+ n = (mz_uint32)orig_central_dir_size;
+ if (!mz_zip_array_push_back(pZip, &pState->m_central_dir_offsets, &n, 1))
+ {
+ mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size, MZ_FALSE);
+ return mz_zip_set_error(pZip, MZ_ZIP_ALLOC_FAILED);
+ }
+
+ pZip->m_total_files++;
+ pZip->m_archive_size = cur_dst_file_ofs;
+
+ return MZ_TRUE;
+}
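+
+/* Editor's illustration (not part of miniz): cloning every entry of an existing archive into a
+   new one without recompressing, via mz_zip_writer_add_from_zip_reader(). example_clone_zip is
+   a made-up name. */
+#if 0
+static mz_bool example_clone_zip(const char *pSrc_name, const char *pDst_name)
+{
+    mz_zip_archive src, dst;
+    mz_uint i;
+    mz_bool ok = MZ_TRUE;
+    mz_zip_zero_struct(&src);
+    mz_zip_zero_struct(&dst);
+    if (!mz_zip_reader_init_file(&src, pSrc_name, 0))
+        return MZ_FALSE;
+    if (!mz_zip_writer_init_file(&dst, pDst_name, 0))
+    {
+        mz_zip_reader_end(&src);
+        return MZ_FALSE;
+    }
+    for (i = 0; ok && (i < mz_zip_reader_get_num_files(&src)); i++)
+        ok = mz_zip_writer_add_from_zip_reader(&dst, &src, i);
+    ok = mz_zip_writer_finalize_archive(&dst) && ok;
+    mz_zip_writer_end(&dst);
+    mz_zip_reader_end(&src);
+    return ok;
+}
+#endif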
+
+mz_bool mz_zip_writer_finalize_archive(mz_zip_archive *pZip)
+{
+ mz_zip_internal_state *pState;
+ mz_uint64 central_dir_ofs, central_dir_size;
+ mz_uint8 hdr[256];
+
+ if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ pState = pZip->m_pState;
+
+ if (pState->m_zip64)
+ {
+ if ((pZip->m_total_files > MZ_UINT32_MAX) || (pState->m_central_dir.m_size >= MZ_UINT32_MAX))
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+ }
+ else
+ {
+ if ((pZip->m_total_files > MZ_UINT16_MAX) || ((pZip->m_archive_size + pState->m_central_dir.m_size + MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE) > MZ_UINT32_MAX))
+ return mz_zip_set_error(pZip, MZ_ZIP_TOO_MANY_FILES);
+ }
+
+ central_dir_ofs = 0;
+ central_dir_size = 0;
+ if (pZip->m_total_files)
+ {
+ /* Write central directory */
+ central_dir_ofs = pZip->m_archive_size;
+ central_dir_size = pState->m_central_dir.m_size;
+ pZip->m_central_directory_file_ofs = central_dir_ofs;
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, central_dir_ofs, pState->m_central_dir.m_p, (size_t)central_dir_size) != central_dir_size)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ pZip->m_archive_size += central_dir_size;
+ }
+
+ if (pState->m_zip64)
+ {
+ /* Write zip64 end of central directory header */
+ mz_uint64 rel_ofs_to_zip64_ecdr = pZip->m_archive_size;
+
+ MZ_CLEAR_OBJ(hdr);
+ MZ_WRITE_LE32(hdr + MZ_ZIP64_ECDH_SIG_OFS, MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIG);
+ MZ_WRITE_LE64(hdr + MZ_ZIP64_ECDH_SIZE_OF_RECORD_OFS, MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE - sizeof(mz_uint32) - sizeof(mz_uint64));
+ MZ_WRITE_LE16(hdr + MZ_ZIP64_ECDH_VERSION_MADE_BY_OFS, 0x031E); /* TODO: always Unix */
+ MZ_WRITE_LE16(hdr + MZ_ZIP64_ECDH_VERSION_NEEDED_OFS, 0x002D);
+ MZ_WRITE_LE64(hdr + MZ_ZIP64_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS, pZip->m_total_files);
+ MZ_WRITE_LE64(hdr + MZ_ZIP64_ECDH_CDIR_TOTAL_ENTRIES_OFS, pZip->m_total_files);
+ MZ_WRITE_LE64(hdr + MZ_ZIP64_ECDH_CDIR_SIZE_OFS, central_dir_size);
+ MZ_WRITE_LE64(hdr + MZ_ZIP64_ECDH_CDIR_OFS_OFS, central_dir_ofs);
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, pZip->m_archive_size, hdr, MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE) != MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ pZip->m_archive_size += MZ_ZIP64_END_OF_CENTRAL_DIR_HEADER_SIZE;
+
+ /* Write zip64 end of central directory locator */
+ MZ_CLEAR_OBJ(hdr);
+ MZ_WRITE_LE32(hdr + MZ_ZIP64_ECDL_SIG_OFS, MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIG);
+ MZ_WRITE_LE64(hdr + MZ_ZIP64_ECDL_REL_OFS_TO_ZIP64_ECDR_OFS, rel_ofs_to_zip64_ecdr);
+ MZ_WRITE_LE32(hdr + MZ_ZIP64_ECDL_TOTAL_NUMBER_OF_DISKS_OFS, 1);
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, pZip->m_archive_size, hdr, MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE) != MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+ pZip->m_archive_size += MZ_ZIP64_END_OF_CENTRAL_DIR_LOCATOR_SIZE;
+ }
+
+ /* Write end of central directory record */
+ MZ_CLEAR_OBJ(hdr);
+ MZ_WRITE_LE32(hdr + MZ_ZIP_ECDH_SIG_OFS, MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIG);
+ MZ_WRITE_LE16(hdr + MZ_ZIP_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS, MZ_MIN(MZ_UINT16_MAX, pZip->m_total_files));
+ MZ_WRITE_LE16(hdr + MZ_ZIP_ECDH_CDIR_TOTAL_ENTRIES_OFS, MZ_MIN(MZ_UINT16_MAX, pZip->m_total_files));
+ MZ_WRITE_LE32(hdr + MZ_ZIP_ECDH_CDIR_SIZE_OFS, MZ_MIN(MZ_UINT32_MAX, central_dir_size));
+ MZ_WRITE_LE32(hdr + MZ_ZIP_ECDH_CDIR_OFS_OFS, MZ_MIN(MZ_UINT32_MAX, central_dir_ofs));
+
+ if (pZip->m_pWrite(pZip->m_pIO_opaque, pZip->m_archive_size, hdr, MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE) != MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_WRITE_FAILED);
+
+#ifndef MINIZ_NO_STDIO
+ if ((pState->m_pFile) && (MZ_FFLUSH(pState->m_pFile) == EOF))
+ return mz_zip_set_error(pZip, MZ_ZIP_FILE_CLOSE_FAILED);
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+ pZip->m_archive_size += MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE;
+
+ pZip->m_zip_mode = MZ_ZIP_MODE_WRITING_HAS_BEEN_FINALIZED;
+ return MZ_TRUE;
+}
+
+mz_bool mz_zip_writer_finalize_heap_archive(mz_zip_archive *pZip, void **ppBuf, size_t *pSize)
+{
+ if ((!ppBuf) || (!pSize))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ *ppBuf = NULL;
+ *pSize = 0;
+
+ if ((!pZip) || (!pZip->m_pState))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (pZip->m_pWrite != mz_zip_heap_write_func)
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ if (!mz_zip_writer_finalize_archive(pZip))
+ return MZ_FALSE;
+
+ *ppBuf = pZip->m_pState->m_pMem;
+ *pSize = pZip->m_pState->m_mem_size;
+ pZip->m_pState->m_pMem = NULL;
+ pZip->m_pState->m_mem_size = pZip->m_pState->m_mem_capacity = 0;
+
+ return MZ_TRUE;
+}
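+
+/* Editor's illustration (not part of miniz): building an archive entirely in memory. After
+   mz_zip_writer_finalize_heap_archive() the caller owns the returned buffer and must release it
+   with mz_free(). example_build_heap_zip and "hello.txt" are made-up names. */
+#if 0
+static void *example_build_heap_zip(size_t *pOut_size)
+{
+    mz_zip_archive zip;
+    void *pBuf = NULL;
+    mz_zip_zero_struct(&zip);
+    if (!mz_zip_writer_init_heap(&zip, 0, 0))
+        return NULL;
+    if (!mz_zip_writer_add_mem(&zip, "hello.txt", "hello", 5, (mz_uint)MZ_DEFAULT_LEVEL) ||
+        !mz_zip_writer_finalize_heap_archive(&zip, &pBuf, pOut_size))
+        pBuf = NULL; /* finalize only hands out the buffer on success */
+    mz_zip_writer_end(&zip);
+    return pBuf; /* caller releases with mz_free() */
+}
+#endif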
+
+mz_bool mz_zip_writer_end(mz_zip_archive *pZip)
+{
+ return mz_zip_writer_end_internal(pZip, MZ_TRUE);
+}
+
+#ifndef MINIZ_NO_STDIO
+mz_bool mz_zip_add_mem_to_archive_file_in_place(const char *pZip_filename, const char *pArchive_name, const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags)
+{
+ return mz_zip_add_mem_to_archive_file_in_place_v2(pZip_filename, pArchive_name, pBuf, buf_size, pComment, comment_size, level_and_flags, NULL);
+}
+
+mz_bool mz_zip_add_mem_to_archive_file_in_place_v2(const char *pZip_filename, const char *pArchive_name, const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags, mz_zip_error *pErr)
+{
+ mz_bool status, created_new_archive = MZ_FALSE;
+ mz_zip_archive zip_archive;
+ struct MZ_FILE_STAT_STRUCT file_stat;
+ mz_zip_error actual_err = MZ_ZIP_NO_ERROR;
+
+ mz_zip_zero_struct(&zip_archive);
+ if ((int)level_and_flags < 0)
+ level_and_flags = MZ_DEFAULT_LEVEL;
+
+ if ((!pZip_filename) || (!pArchive_name) || ((buf_size) && (!pBuf)) || ((comment_size) && (!pComment)) || ((level_and_flags & 0xF) > MZ_UBER_COMPRESSION))
+ {
+ if (pErr)
+ *pErr = MZ_ZIP_INVALID_PARAMETER;
+ return MZ_FALSE;
+ }
+
+ if (!mz_zip_writer_validate_archive_name(pArchive_name))
+ {
+ if (pErr)
+ *pErr = MZ_ZIP_INVALID_FILENAME;
+ return MZ_FALSE;
+ }
+
+ /* Important: The regular non-64 bit version of stat() can fail here if the file is very large, which could cause the archive to be overwritten. */
+ /* So be sure to compile with _LARGEFILE64_SOURCE 1 */
+ if (MZ_FILE_STAT(pZip_filename, &file_stat) != 0)
+ {
+ /* Create a new archive. */
+ if (!mz_zip_writer_init_file_v2(&zip_archive, pZip_filename, 0, level_and_flags))
+ {
+ if (pErr)
+ *pErr = zip_archive.m_last_error;
+ return MZ_FALSE;
+ }
+
+ created_new_archive = MZ_TRUE;
+ }
+ else
+ {
+ /* Append to an existing archive. */
+ if (!mz_zip_reader_init_file_v2(&zip_archive, pZip_filename, level_and_flags | MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY, 0, 0))
+ {
+ if (pErr)
+ *pErr = zip_archive.m_last_error;
+ return MZ_FALSE;
+ }
+
+ if (!mz_zip_writer_init_from_reader_v2(&zip_archive, pZip_filename, level_and_flags))
+ {
+ if (pErr)
+ *pErr = zip_archive.m_last_error;
+
+ mz_zip_reader_end_internal(&zip_archive, MZ_FALSE);
+
+ return MZ_FALSE;
+ }
+ }
+
+ status = mz_zip_writer_add_mem_ex(&zip_archive, pArchive_name, pBuf, buf_size, pComment, comment_size, level_and_flags, 0, 0);
+ actual_err = zip_archive.m_last_error;
+
+ /* Always finalize, even if adding failed for some reason, so we have a valid central directory. (This may not always succeed, but we can try.) */
+ if (!mz_zip_writer_finalize_archive(&zip_archive))
+ {
+ if (!actual_err)
+ actual_err = zip_archive.m_last_error;
+
+ status = MZ_FALSE;
+ }
+
+ if (!mz_zip_writer_end_internal(&zip_archive, status))
+ {
+ if (!actual_err)
+ actual_err = zip_archive.m_last_error;
+
+ status = MZ_FALSE;
+ }
+
+ if ((!status) && (created_new_archive))
+ {
+ /* It's a new archive and something went wrong, so just delete it. */
+ int ignoredStatus = MZ_DELETE_FILE(pZip_filename);
+ (void)ignoredStatus;
+ }
+
+ if (pErr)
+ *pErr = actual_err;
+
+ return status;
+}
+
+void *mz_zip_extract_archive_file_to_heap_v2(const char *pZip_filename, const char *pArchive_name, const char *pComment, size_t *pSize, mz_uint flags, mz_zip_error *pErr)
+{
+ mz_uint32 file_index;
+ mz_zip_archive zip_archive;
+ void *p = NULL;
+
+ if (pSize)
+ *pSize = 0;
+
+ if ((!pZip_filename) || (!pArchive_name))
+ {
+ if (pErr)
+ *pErr = MZ_ZIP_INVALID_PARAMETER;
+
+ return NULL;
+ }
+
+ mz_zip_zero_struct(&zip_archive);
+ if (!mz_zip_reader_init_file_v2(&zip_archive, pZip_filename, flags | MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY, 0, 0))
+ {
+ if (pErr)
+ *pErr = zip_archive.m_last_error;
+
+ return NULL;
+ }
+
+ if (mz_zip_reader_locate_file_v2(&zip_archive, pArchive_name, pComment, flags, &file_index))
+ {
+ p = mz_zip_reader_extract_to_heap(&zip_archive, file_index, pSize, flags);
+ }
+
+ mz_zip_reader_end_internal(&zip_archive, p != NULL);
+
+ if (pErr)
+ *pErr = zip_archive.m_last_error;
+
+ return p;
+}
+
+void *mz_zip_extract_archive_file_to_heap(const char *pZip_filename, const char *pArchive_name, size_t *pSize, mz_uint flags)
+{
+ return mz_zip_extract_archive_file_to_heap_v2(pZip_filename, pArchive_name, NULL, pSize, flags, NULL);
+}
+
+#endif /* #ifndef MINIZ_NO_STDIO */
+
+#endif /* #ifndef MINIZ_NO_ARCHIVE_WRITING_APIS */
+
+/* ------------------- Misc utils */
+
+mz_zip_mode mz_zip_get_mode(mz_zip_archive *pZip)
+{
+ return pZip ? pZip->m_zip_mode : MZ_ZIP_MODE_INVALID;
+}
+
+mz_zip_type mz_zip_get_type(mz_zip_archive *pZip)
+{
+ return pZip ? pZip->m_zip_type : MZ_ZIP_TYPE_INVALID;
+}
+
+mz_zip_error mz_zip_set_last_error(mz_zip_archive *pZip, mz_zip_error err_num)
+{
+ mz_zip_error prev_err;
+
+ if (!pZip)
+ return MZ_ZIP_INVALID_PARAMETER;
+
+ prev_err = pZip->m_last_error;
+
+ pZip->m_last_error = err_num;
+ return prev_err;
+}
+
+mz_zip_error mz_zip_peek_last_error(mz_zip_archive *pZip)
+{
+ if (!pZip)
+ return MZ_ZIP_INVALID_PARAMETER;
+
+ return pZip->m_last_error;
+}
+
+mz_zip_error mz_zip_clear_last_error(mz_zip_archive *pZip)
+{
+ return mz_zip_set_last_error(pZip, MZ_ZIP_NO_ERROR);
+}
+
+mz_zip_error mz_zip_get_last_error(mz_zip_archive *pZip)
+{
+ mz_zip_error prev_err;
+
+ if (!pZip)
+ return MZ_ZIP_INVALID_PARAMETER;
+
+ prev_err = pZip->m_last_error;
+
+ pZip->m_last_error = MZ_ZIP_NO_ERROR;
+ return prev_err;
+}
+
+const char *mz_zip_get_error_string(mz_zip_error mz_err)
+{
+ switch (mz_err)
+ {
+ case MZ_ZIP_NO_ERROR:
+ return "no error";
+ case MZ_ZIP_UNDEFINED_ERROR:
+ return "undefined error";
+ case MZ_ZIP_TOO_MANY_FILES:
+ return "too many files";
+ case MZ_ZIP_FILE_TOO_LARGE:
+ return "file too large";
+ case MZ_ZIP_UNSUPPORTED_METHOD:
+ return "unsupported method";
+ case MZ_ZIP_UNSUPPORTED_ENCRYPTION:
+ return "unsupported encryption";
+ case MZ_ZIP_UNSUPPORTED_FEATURE:
+ return "unsupported feature";
+ case MZ_ZIP_FAILED_FINDING_CENTRAL_DIR:
+ return "failed finding central directory";
+ case MZ_ZIP_NOT_AN_ARCHIVE:
+ return "not a ZIP archive";
+ case MZ_ZIP_INVALID_HEADER_OR_CORRUPTED:
+ return "invalid header or archive is corrupted";
+ case MZ_ZIP_UNSUPPORTED_MULTIDISK:
+ return "unsupported multidisk archive";
+ case MZ_ZIP_DECOMPRESSION_FAILED:
+ return "decompression failed or archive is corrupted";
+ case MZ_ZIP_COMPRESSION_FAILED:
+ return "compression failed";
+ case MZ_ZIP_UNEXPECTED_DECOMPRESSED_SIZE:
+ return "unexpected decompressed size";
+ case MZ_ZIP_CRC_CHECK_FAILED:
+ return "CRC-32 check failed";
+ case MZ_ZIP_UNSUPPORTED_CDIR_SIZE:
+ return "unsupported central directory size";
+ case MZ_ZIP_ALLOC_FAILED:
+ return "allocation failed";
+ case MZ_ZIP_FILE_OPEN_FAILED:
+ return "file open failed";
+ case MZ_ZIP_FILE_CREATE_FAILED:
+ return "file create failed";
+ case MZ_ZIP_FILE_WRITE_FAILED:
+ return "file write failed";
+ case MZ_ZIP_FILE_READ_FAILED:
+ return "file read failed";
+ case MZ_ZIP_FILE_CLOSE_FAILED:
+ return "file close failed";
+ case MZ_ZIP_FILE_SEEK_FAILED:
+ return "file seek failed";
+ case MZ_ZIP_FILE_STAT_FAILED:
+ return "file stat failed";
+ case MZ_ZIP_INVALID_PARAMETER:
+ return "invalid parameter";
+ case MZ_ZIP_INVALID_FILENAME:
+ return "invalid filename";
+ case MZ_ZIP_BUF_TOO_SMALL:
+ return "buffer too small";
+ case MZ_ZIP_INTERNAL_ERROR:
+ return "internal error";
+ case MZ_ZIP_FILE_NOT_FOUND:
+ return "file not found";
+ case MZ_ZIP_ARCHIVE_TOO_LARGE:
+ return "archive is too large";
+ case MZ_ZIP_VALIDATION_FAILED:
+ return "validation failed";
+ case MZ_ZIP_WRITE_CALLBACK_FAILED:
+ return "write calledback failed";
+ default:
+ break;
+ }
+
+ return "unknown error";
+}
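+
+/* Editor's note: the typical error-handling pattern is to check the mz_bool result of a call,
+   then fetch and print the recorded error. A sketch (assumes <stdio.h>; "archive.zip" is a
+   placeholder):
+
+   mz_zip_archive zip;
+   mz_zip_zero_struct(&zip);
+   if (!mz_zip_reader_init_file(&zip, "archive.zip", 0))
+       fprintf(stderr, "open failed: %s\n", mz_zip_get_error_string(mz_zip_get_last_error(&zip)));
+*/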
+
+/* Note: even when an archive is not marked zip64 it can still contain a Zip64 extended information extra field, argh. */
+mz_bool mz_zip_is_zip64(mz_zip_archive *pZip)
+{
+ if ((!pZip) || (!pZip->m_pState))
+ return MZ_FALSE;
+
+ return pZip->m_pState->m_zip64;
+}
+
+size_t mz_zip_get_central_dir_size(mz_zip_archive *pZip)
+{
+ if ((!pZip) || (!pZip->m_pState))
+ return 0;
+
+ return pZip->m_pState->m_central_dir.m_size;
+}
+
+mz_uint mz_zip_reader_get_num_files(mz_zip_archive *pZip)
+{
+ return pZip ? pZip->m_total_files : 0;
+}
+
+mz_uint64 mz_zip_get_archive_size(mz_zip_archive *pZip)
+{
+ if (!pZip)
+ return 0;
+ return pZip->m_archive_size;
+}
+
+mz_uint64 mz_zip_get_archive_file_start_offset(mz_zip_archive *pZip)
+{
+ if ((!pZip) || (!pZip->m_pState))
+ return 0;
+ return pZip->m_pState->m_file_archive_start_ofs;
+}
+
+MZ_FILE *mz_zip_get_cfile(mz_zip_archive *pZip)
+{
+ if ((!pZip) || (!pZip->m_pState))
+ return 0;
+ return pZip->m_pState->m_pFile;
+}
+
+size_t mz_zip_read_archive_data(mz_zip_archive *pZip, mz_uint64 file_ofs, void *pBuf, size_t n)
+{
+ if ((!pZip) || (!pZip->m_pState) || (!pBuf) || (!pZip->m_pRead))
+ return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+
+ return pZip->m_pRead(pZip->m_pIO_opaque, file_ofs, pBuf, n);
+}
+
+mz_uint mz_zip_reader_get_filename(mz_zip_archive *pZip, mz_uint file_index, char *pFilename, mz_uint filename_buf_size)
+{
+ mz_uint n;
+ const mz_uint8 *p = mz_zip_get_cdh(pZip, file_index);
+ if (!p)
+ {
+ if (filename_buf_size)
+ pFilename[0] = '\0';
+ mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER);
+ return 0;
+ }
+ n = MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+ if (filename_buf_size)
+ {
+ n = MZ_MIN(n, filename_buf_size - 1);
+ memcpy(pFilename, p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE, n);
+ pFilename[n] = '\0';
+ }
+ return n + 1;
+}
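+
+/* Editor's note: calling mz_zip_reader_get_filename() with filename_buf_size == 0 returns the
+   required buffer size (filename length plus the terminating zero), enabling a query-then-fetch
+   pattern:
+
+   mz_uint needed = mz_zip_reader_get_filename(&zip, file_index, NULL, 0);
+   char *pName = (char *)malloc(needed);
+   if (pName)
+       mz_zip_reader_get_filename(&zip, file_index, pName, needed);
+   ... use pName, then free(pName) ...
+*/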
+
+mz_bool mz_zip_reader_file_stat(mz_zip_archive *pZip, mz_uint file_index, mz_zip_archive_file_stat *pStat)
+{
+ return mz_zip_file_stat_internal(pZip, file_index, mz_zip_get_cdh(pZip, file_index), pStat, NULL);
+}
+
+mz_bool mz_zip_end(mz_zip_archive *pZip)
+{
+ if (!pZip)
+ return MZ_FALSE;
+
+ if (pZip->m_zip_mode == MZ_ZIP_MODE_READING)
+ return mz_zip_reader_end(pZip);
+#ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+ else if ((pZip->m_zip_mode == MZ_ZIP_MODE_WRITING) || (pZip->m_zip_mode == MZ_ZIP_MODE_WRITING_HAS_BEEN_FINALIZED))
+ return mz_zip_writer_end(pZip);
+#endif
+
+ return MZ_FALSE;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*#ifndef MINIZ_NO_ARCHIVE_APIS*/
diff --git a/source/luametatex/source/libraries/miniz/miniz.h b/source/luametatex/source/libraries/miniz/miniz.h
new file mode 100644
index 000000000..0e65e38b1
--- /dev/null
+++ b/source/luametatex/source/libraries/miniz/miniz.h
@@ -0,0 +1,1350 @@
+#define MINIZ_EXPORT
+/* miniz.c 2.2.0 - public domain deflate/inflate, zlib-subset, ZIP reading/writing/appending, PNG writing
+ See "unlicense" statement at the end of this file.
+ Rich Geldreich <richgel99@gmail.com>, last updated Oct. 13, 2013
+ Implements RFC 1950: http://www.ietf.org/rfc/rfc1950.txt and RFC 1951: http://www.ietf.org/rfc/rfc1951.txt
+
+ Most APIs defined in miniz.c are optional. For example, to disable the archive-related functions just define
+ MINIZ_NO_ARCHIVE_APIS, or to get rid of all stdio usage define MINIZ_NO_STDIO (see the list below for more macros).
+
+ * Low-level Deflate/Inflate implementation notes:
+
+ Compression: Use the "tdefl" APIs. The compressor supports raw, static, and dynamic blocks, lazy or
+ greedy parsing, match length filtering, RLE-only, and Huffman-only streams. Its speed and compression
+ ratio are approximately on par with zlib.
+
+ Decompression: Use the "tinfl" APIs. The entire decompressor is implemented as a single function
+ coroutine: see tinfl_decompress(). It supports decompression into a 32KB (or larger power of 2) wrapping buffer, or into a memory
+ block large enough to hold the entire file.
+
+ The low-level tdefl/tinfl APIs do not make any use of dynamic memory allocation.
+
+ * zlib-style API notes:
+
+ miniz.c implements a fairly large subset of zlib. There's enough functionality present for it to be a drop-in
+ zlib replacement in many apps:
+ The z_stream struct, optional memory allocation callbacks
+ deflateInit/deflateInit2/deflate/deflateReset/deflateEnd/deflateBound
+ inflateInit/inflateInit2/inflate/inflateReset/inflateEnd
+ compress, compress2, compressBound, uncompress
+ CRC-32, Adler-32 - Using modern, minimal code size, CPU cache friendly routines.
+ Supports raw deflate streams or standard zlib streams with adler-32 checking.
+
+ Limitations:
+ The callback APIs are not implemented yet. No support for gzip headers or zlib static dictionaries.
+ I've tried to closely emulate zlib's various flavors of stream flushing and return status codes, but
+ there are no guarantees that miniz.c pulls this off perfectly.
+
+ * PNG writing: See the tdefl_write_image_to_png_file_in_memory() function, originally written by
+ Alex Evans. Supports 1-4 bytes/pixel images.
+
+ * ZIP archive API notes:
+
+ The ZIP archive APIs were designed with simplicity and efficiency in mind, with just enough abstraction to
+ get the job done with minimal fuss. There are simple APIs to retrieve file information, read files from
+ existing archives, create new archives, append new files to existing archives, or clone archive data from
+ one archive to another. It supports archives located in memory or on the heap, on disk (using stdio.h),
+ or accessed through custom file read/write callbacks.
+
+ - Archive reading: Just call this function to read a single file from a disk archive:
+
+ void *mz_zip_extract_archive_file_to_heap(const char *pZip_filename, const char *pArchive_name,
+ size_t *pSize, mz_uint zip_flags);
+
+ For more complex cases, use the "mz_zip_reader" functions. Upon opening an archive, the entire central
+ directory is located and read as-is into memory, and subsequent file access only occurs when reading individual files.
+
+ - Archive file scanning: The simple way is to use this function to scan a loaded archive for a specific file:
+
+ int mz_zip_reader_locate_file(mz_zip_archive *pZip, const char *pName, const char *pComment, mz_uint flags);
+
+ The locate operation can optionally check file comments too, which (as one example) can be used to identify
+ multiple versions of the same file in an archive. This function uses a simple linear search through the central
+ directory, so it's not very fast.
+
+ Alternately, you can iterate through all the files in an archive (using mz_zip_reader_get_num_files()) and
+ retrieve detailed info on each file by calling mz_zip_reader_file_stat().
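+
+ For example (an editor's sketch; assumes an mz_zip_archive 'zip' already opened for reading, and <stdio.h>):
+
+ mz_uint i;
+ for (i = 0; i < mz_zip_reader_get_num_files(&zip); i++) {
+ mz_zip_archive_file_stat st;
+ if (mz_zip_reader_file_stat(&zip, i, &st))
+ printf("%s: %llu bytes\n", st.m_filename, (unsigned long long)st.m_uncomp_size);
+ }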
+
+ - Archive creation: Use the "mz_zip_writer" functions. The ZIP writer immediately writes compressed file data
+ to disk and builds an exact image of the central directory in memory. The central directory image is written
+ all at once at the end of the archive file when the archive is finalized.
+
+ The archive writer can optionally align each file's local header and file data to any power of 2 alignment,
+ which can be useful when the archive will be read from optical media. Also, the writer supports placing
+ arbitrary data blobs at the very beginning of ZIP archives. Archives written using either feature are still
+ readable by any ZIP tool.
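+
+ A minimal creation sketch (an editor's illustration; "out.zip" and the entry name are placeholders):
+
+ mz_zip_archive zip;
+ mz_zip_zero_struct(&zip);
+ if (mz_zip_writer_init_file(&zip, "out.zip", 0)) {
+ mz_zip_writer_add_mem(&zip, "dir/hello.txt", "hello", 5, MZ_BEST_COMPRESSION);
+ mz_zip_writer_finalize_archive(&zip);
+ }
+ mz_zip_writer_end(&zip);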
+
+ - Archive appending: The simple way to add a single file to an archive is to call this function:
+
+ mz_bool mz_zip_add_mem_to_archive_file_in_place(const char *pZip_filename, const char *pArchive_name,
+ const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags);
+
+ The archive will be created if it doesn't already exist, otherwise it'll be appended to.
+ Note the appending is done in-place and is not an atomic operation, so if something goes wrong
+ during the operation it's possible the archive could be left without a central directory (although the local
+ file headers and file data will be fine, so the archive will be recoverable).
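+
+ For example (an editor's sketch; names are placeholders):
+ mz_zip_add_mem_to_archive_file_in_place("out.zip", "log.txt", "data", 4, NULL, 0, MZ_DEFAULT_LEVEL);
+ creates out.zip if it doesn't exist yet, otherwise appends the new entry in place.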
+
+ For more complex archive modification scenarios:
+ 1. The safest way is to use a mz_zip_reader to read the existing archive, cloning only those bits you want to
+ preserve into a new archive using the mz_zip_writer_add_from_zip_reader() function (which copies the
+ compressed file data as-is). When you're done, delete the old archive and rename the newly written one.
+ This is safe but requires a bunch of temporary disk space or heap memory.
+
+ 2. Or, you can convert an mz_zip_reader in-place to an mz_zip_writer using mz_zip_writer_init_from_reader(),
+ append new files as needed, then finalize the archive which will write an updated central directory to the
+ original archive. (This is basically what mz_zip_add_mem_to_archive_file_in_place() does.) There's a
+ possibility that the archive's central directory could be lost with this method if anything goes wrong, though.
+
+ - ZIP archive support limitations:
+ No spanning support. Extraction functions can only handle unencrypted, stored or deflated files.
+ Requires streams capable of seeking.
+
+ * This is a header file library, like stb_image.c. To get only a header file, either cut and paste the
+ below header, or create miniz.h, #define MINIZ_HEADER_FILE_ONLY, and then include miniz.c from it.
+
+ * Important: For best performance, be sure to customize the below macros for your target platform:
+ #define MINIZ_USE_UNALIGNED_LOADS_AND_STORES 1
+ #define MINIZ_LITTLE_ENDIAN 1
+ #define MINIZ_HAS_64BIT_REGISTERS 1
+
+ * On platforms using glibc, be sure to "#define _LARGEFILE64_SOURCE 1" before including miniz.c to ensure miniz
+ uses the 64-bit variants: fopen64(), stat64(), etc. Otherwise you won't be able to process large files
+ (e.g. the 32-bit stat() fails on files > 0x7FFFFFFF bytes).
+*/
+#pragma once
+
+
+
+/* Defines to completely disable specific portions of miniz.c:
+ If all macros here are defined the only functionality remaining will be CRC-32, adler-32, tinfl, and tdefl. */
+
+/* Define MINIZ_NO_STDIO to disable all use of, and any functions which rely on, stdio for file I/O. */
+/*#define MINIZ_NO_STDIO */
+
+/* If MINIZ_NO_TIME is specified then the ZIP archive functions will not be able to get the current time, or */
+/* get/set file times, and the C run-time funcs that get/set times won't be called. */
+/* The current downside is that the times written to your archives will be from 1979. */
+/*#define MINIZ_NO_TIME */
+
+/* Define MINIZ_NO_ARCHIVE_APIS to disable all ZIP archive APIs. */
+/*#define MINIZ_NO_ARCHIVE_APIS */
+
+/* Define MINIZ_NO_ARCHIVE_WRITING_APIS to disable all writing-related ZIP archive APIs. */
+/*#define MINIZ_NO_ARCHIVE_WRITING_APIS */
+
+/* Define MINIZ_NO_ZLIB_APIS to remove all ZLIB-style compression/decompression APIs. */
+/*#define MINIZ_NO_ZLIB_APIS */
+
+/* Define MINIZ_NO_ZLIB_COMPATIBLE_NAMES to disable zlib-compatible names, to prevent conflicts against stock zlib. */
+/*#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES */
+
+/* Define MINIZ_NO_MALLOC to disable all calls to malloc, free, and realloc.
+ Note if MINIZ_NO_MALLOC is defined then the user must always provide custom user alloc/free/realloc
+ callbacks to the zlib and archive API's, and a few stand-alone helper API's which don't provide custom user
+ functions (such as tdefl_compress_mem_to_heap() and tinfl_decompress_mem_to_heap()) won't work. */
+/*#define MINIZ_NO_MALLOC */
+
+#if defined(__TINYC__) && (defined(__linux) || defined(__linux__))
+/* TODO: Work around "error: include file 'sys\utime.h'" when compiling with tcc on Linux */
+#define MINIZ_NO_TIME
+#endif
+
+#include <stddef.h>
+
+#if !defined(MINIZ_NO_TIME) && !defined(MINIZ_NO_ARCHIVE_APIS)
+#include <time.h>
+#endif
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__i386) || defined(__i486__) || defined(__i486) || defined(i386) || defined(__ia64__) || defined(__x86_64__)
+/* MINIZ_X86_OR_X64_CPU is only used to help set the below macros. */
+#define MINIZ_X86_OR_X64_CPU 1
+#else
+#define MINIZ_X86_OR_X64_CPU 0
+#endif
+
+#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || MINIZ_X86_OR_X64_CPU
+/* Set MINIZ_LITTLE_ENDIAN to 1 if the processor is little endian. */
+#define MINIZ_LITTLE_ENDIAN 1
+#else
+#define MINIZ_LITTLE_ENDIAN 0
+#endif
+
+/* Set MINIZ_USE_UNALIGNED_LOADS_AND_STORES only if not set */
+#if !defined(MINIZ_USE_UNALIGNED_LOADS_AND_STORES)
+#if MINIZ_X86_OR_X64_CPU
+/* Set MINIZ_USE_UNALIGNED_LOADS_AND_STORES to 1 on CPU's that permit efficient integer loads and stores from unaligned addresses. */
+#define MINIZ_USE_UNALIGNED_LOADS_AND_STORES 1
+#define MINIZ_UNALIGNED_USE_MEMCPY
+#else
+#define MINIZ_USE_UNALIGNED_LOADS_AND_STORES 0
+#endif
+#endif
+
+#if defined(_M_X64) || defined(_WIN64) || defined(__MINGW64__) || defined(_LP64) || defined(__LP64__) || defined(__ia64__) || defined(__x86_64__)
+/* Set MINIZ_HAS_64BIT_REGISTERS to 1 if operations on 64-bit integers are reasonably fast (and don't involve compiler generated calls to helper functions). */
+#define MINIZ_HAS_64BIT_REGISTERS 1
+#else
+#define MINIZ_HAS_64BIT_REGISTERS 0
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------- zlib-style API Definitions. */
+
+/* For more compatibility with zlib, miniz.c uses unsigned long for some parameters/struct members. Beware: mz_ulong can be either 32 or 64-bits! */
+typedef unsigned long mz_ulong;
+
+/* mz_free() internally uses the MZ_FREE() macro (which by default calls free() unless you've modified the MZ_MALLOC macro) to release a block allocated from the heap. */
+MINIZ_EXPORT void mz_free(void *p);
+
+#define MZ_ADLER32_INIT (1)
+/* mz_adler32() returns the initial adler-32 value to use when called with ptr==NULL. */
+MINIZ_EXPORT mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len);
+
+#define MZ_CRC32_INIT (0)
+/* mz_crc32() returns the initial CRC-32 value to use when called with ptr==NULL. */
+MINIZ_EXPORT mz_ulong mz_crc32(mz_ulong crc, const unsigned char *ptr, size_t buf_len);
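+
+/* Both checksums are incremental (editor's note): start from the *_INIT value and feed
+   successive chunks, e.g. mz_ulong crc = MZ_CRC32_INIT;
+                           crc = mz_crc32(crc, pChunk1, chunk1_len);
+                           crc = mz_crc32(crc, pChunk2, chunk2_len);
+   (pChunk1/pChunk2 and their lengths are placeholders.) */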
+
+/* Compression strategies. */
+enum
+{
+ MZ_DEFAULT_STRATEGY = 0,
+ MZ_FILTERED = 1,
+ MZ_HUFFMAN_ONLY = 2,
+ MZ_RLE = 3,
+ MZ_FIXED = 4
+};
+
+/* Method */
+#define MZ_DEFLATED 8
+
+/* Heap allocation callbacks.
+Note that mz_alloc_func parameter types purposely differ from zlib's: items/size is size_t, not unsigned long. */
+typedef void *(*mz_alloc_func)(void *opaque, size_t items, size_t size);
+typedef void (*mz_free_func)(void *opaque, void *address);
+typedef void *(*mz_realloc_func)(void *opaque, void *address, size_t items, size_t size);
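+
+/* An editor's sketch of matching custom callbacks (the names my_alloc/my_free are hypothetical):
+   static void *my_alloc(void *opaque, size_t items, size_t size) { (void)opaque; return malloc(items * size); }
+   static void my_free(void *opaque, void *address) { (void)opaque; free(address); }
+   Such callbacks can be plugged into mz_stream's zalloc/zfree members or an mz_zip_archive's
+   m_pAlloc/m_pFree members. */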
+
+/* Compression levels: 0-9 are the standard zlib-style levels, 10 is best possible compression (not zlib compatible, and may be very slow), MZ_DEFAULT_COMPRESSION=MZ_DEFAULT_LEVEL. */
+enum
+{
+ MZ_NO_COMPRESSION = 0,
+ MZ_BEST_SPEED = 1,
+ MZ_BEST_COMPRESSION = 9,
+ MZ_UBER_COMPRESSION = 10,
+ MZ_DEFAULT_LEVEL = 6,
+ MZ_DEFAULT_COMPRESSION = -1
+};
+
+#define MZ_VERSION "10.2.0"
+#define MZ_VERNUM 0xA100
+#define MZ_VER_MAJOR 10
+#define MZ_VER_MINOR 2
+#define MZ_VER_REVISION 0
+#define MZ_VER_SUBREVISION 0
+
+#ifndef MINIZ_NO_ZLIB_APIS
+
+/* Flush values. For typical usage you only need MZ_NO_FLUSH and MZ_FINISH. The other values are for advanced use (refer to the zlib docs). */
+enum
+{
+ MZ_NO_FLUSH = 0,
+ MZ_PARTIAL_FLUSH = 1,
+ MZ_SYNC_FLUSH = 2,
+ MZ_FULL_FLUSH = 3,
+ MZ_FINISH = 4,
+ MZ_BLOCK = 5
+};
+
+/* Return status codes. MZ_PARAM_ERROR is non-standard. */
+enum
+{
+ MZ_OK = 0,
+ MZ_STREAM_END = 1,
+ MZ_NEED_DICT = 2,
+ MZ_ERRNO = -1,
+ MZ_STREAM_ERROR = -2,
+ MZ_DATA_ERROR = -3,
+ MZ_MEM_ERROR = -4,
+ MZ_BUF_ERROR = -5,
+ MZ_VERSION_ERROR = -6,
+ MZ_PARAM_ERROR = -10000
+};
+
+/* Window bits */
+#define MZ_DEFAULT_WINDOW_BITS 15
+
+struct mz_internal_state;
+
+/* Compression/decompression stream struct. */
+typedef struct mz_stream_s
+{
+ const unsigned char *next_in; /* pointer to next byte to read */
+ unsigned int avail_in; /* number of bytes available at next_in */
+ mz_ulong total_in; /* total number of bytes consumed so far */
+
+ unsigned char *next_out; /* pointer to next byte to write */
+ unsigned int avail_out; /* number of bytes that can be written to next_out */
+ mz_ulong total_out; /* total number of bytes produced so far */
+
+ char *msg; /* error msg (unused) */
+ struct mz_internal_state *state; /* internal state, allocated by zalloc/zfree */
+
+ mz_alloc_func zalloc; /* optional heap allocation function (defaults to malloc) */
+ mz_free_func zfree; /* optional heap free function (defaults to free) */
+ void *opaque; /* heap alloc function user pointer */
+
+ int data_type; /* data_type (unused) */
+ mz_ulong adler; /* adler32 of the source or uncompressed data */
+ mz_ulong reserved; /* not used */
+} mz_stream;
+
+typedef mz_stream *mz_streamp;
+
+/* Returns the version string of miniz.c. */
+MINIZ_EXPORT const char *mz_version(void);
+
+/* mz_deflateInit() initializes a compressor with default options: */
+/* Parameters: */
+/* pStream must point to an initialized mz_stream struct. */
+/* level must be between [MZ_NO_COMPRESSION, MZ_BEST_COMPRESSION]. */
+/* level 1 enables a specially optimized compression function that's been optimized purely for performance, not ratio. */
+/* (This special func. is currently only enabled when MINIZ_USE_UNALIGNED_LOADS_AND_STORES and MINIZ_LITTLE_ENDIAN are defined.) */
+/* Return values: */
+/* MZ_OK on success. */
+/* MZ_STREAM_ERROR if the stream is bogus. */
+/* MZ_PARAM_ERROR if the input parameters are bogus. */
+/* MZ_MEM_ERROR on out of memory. */
+MINIZ_EXPORT int mz_deflateInit(mz_streamp pStream, int level);
+
+/* mz_deflateInit2() is like mz_deflateInit(), except with more control: */
+/* Additional parameters: */
+/* method must be MZ_DEFLATED */
+/* window_bits must be MZ_DEFAULT_WINDOW_BITS (to wrap the deflate stream with zlib header/adler-32 footer) or -MZ_DEFAULT_WINDOW_BITS (raw deflate/no header or footer) */
+/* mem_level must be between [1, 9] (it's checked but ignored by miniz.c) */
+MINIZ_EXPORT int mz_deflateInit2(mz_streamp pStream, int level, int method, int window_bits, int mem_level, int strategy);
+
+/* Quickly resets a compressor without having to reallocate anything. Same as calling mz_deflateEnd() followed by mz_deflateInit()/mz_deflateInit2(). */
+MINIZ_EXPORT int mz_deflateReset(mz_streamp pStream);
+
+/* mz_deflate() compresses the input to output, consuming as much of the input and producing as much output as possible. */
+/* Parameters: */
+/* pStream is the stream to read from and write to. You must initialize/update the next_in, avail_in, next_out, and avail_out members. */
+/* flush may be MZ_NO_FLUSH, MZ_PARTIAL_FLUSH/MZ_SYNC_FLUSH, MZ_FULL_FLUSH, or MZ_FINISH. */
+/* Return values: */
+/* MZ_OK on success (when flushing, or if more input is needed but not available, and/or there's more output to be written but the output buffer is full). */
+/* MZ_STREAM_END if all input has been consumed and all output bytes have been written. Don't call mz_deflate() on the stream anymore. */
+/* MZ_STREAM_ERROR if the stream is bogus. */
+/* MZ_PARAM_ERROR if one of the parameters is invalid. */
+/* MZ_BUF_ERROR if no forward progress is possible because the input and/or output buffers are empty. (Fill up the input buffer or free up some output space and try again.) */
+MINIZ_EXPORT int mz_deflate(mz_streamp pStream, int flush);
+
+/* mz_deflateEnd() deinitializes a compressor: */
+/* Return values: */
+/* MZ_OK on success. */
+/* MZ_STREAM_ERROR if the stream is bogus. */
+MINIZ_EXPORT int mz_deflateEnd(mz_streamp pStream);
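+
+/* Illustrative sketch (not part of the upstream header): a minimal init/deflate/end
+   round trip that compresses one whole buffer with a single MZ_FINISH call. The
+   function and buffer names are placeholders. */
+#if 0
+static int example_deflate_buffer(const unsigned char *src, size_t src_len,
+                                  unsigned char *dst, size_t dst_cap, size_t *pOut_len)
+{
+    mz_stream stream;
+    int status;
+    memset(&stream, 0, sizeof(stream)); /* zalloc/zfree stay NULL -> malloc/free */
+    stream.next_in = src;
+    stream.avail_in = (unsigned int)src_len;
+    stream.next_out = dst;
+    stream.avail_out = (unsigned int)dst_cap;
+    if ((status = mz_deflateInit(&stream, MZ_DEFAULT_LEVEL)) != MZ_OK)
+        return status;
+    status = mz_deflate(&stream, MZ_FINISH);
+    *pOut_len = (size_t)stream.total_out;
+    mz_deflateEnd(&stream);
+    return (status == MZ_STREAM_END) ? MZ_OK : MZ_BUF_ERROR;
+}
+#endif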
+
+/* mz_deflateBound() returns a (very) conservative upper bound on the amount of data that could be generated by deflate(), assuming flush is set to only MZ_NO_FLUSH or MZ_FINISH. */
+MINIZ_EXPORT mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len);
+
+/* Single-call compression functions mz_compress() and mz_compress2(): */
+/* Returns MZ_OK on success, or one of the error codes from mz_deflate() on failure. */
+MINIZ_EXPORT int mz_compress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len);
+MINIZ_EXPORT int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len, int level);
+
+/* mz_compressBound() returns a (very) conservative upper bound on the amount of data that could be generated by calling mz_compress(). */
+MINIZ_EXPORT mz_ulong mz_compressBound(mz_ulong source_len);
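+
+/* Illustrative sketch (not part of the upstream header): single-call compression
+   into a heap buffer sized with mz_compressBound(); names are placeholders. */
+#if 0
+static unsigned char *example_compress_to_heap(const unsigned char *src, mz_ulong src_len,
+                                               mz_ulong *pOut_len)
+{
+    mz_ulong cap = mz_compressBound(src_len);
+    unsigned char *dst = (unsigned char *)malloc(cap);
+    if (!dst)
+        return NULL;
+    *pOut_len = cap; /* in: capacity, out: actual compressed size */
+    if (mz_compress(dst, pOut_len, src, src_len) != MZ_OK)
+    {
+        free(dst);
+        return NULL;
+    }
+    return dst; /* caller free()s the block */
+}
+#endif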
+
+/* Initializes a decompressor. */
+MINIZ_EXPORT int mz_inflateInit(mz_streamp pStream);
+
+/* mz_inflateInit2() is like mz_inflateInit() with an additional option that controls the window size and whether or not the stream has been wrapped with a zlib header/footer: */
+/* window_bits must be MZ_DEFAULT_WINDOW_BITS (to parse zlib header/footer) or -MZ_DEFAULT_WINDOW_BITS (raw deflate). */
+MINIZ_EXPORT int mz_inflateInit2(mz_streamp pStream, int window_bits);
+
+/* Quickly resets a decompressor without having to reallocate anything. Same as calling mz_inflateEnd() followed by mz_inflateInit()/mz_inflateInit2(). */
+MINIZ_EXPORT int mz_inflateReset(mz_streamp pStream);
+
+/* Decompresses the input stream to the output, consuming only as much of the input as needed, and writing as much to the output as possible. */
+/* Parameters: */
+/* pStream is the stream to read from and write to. You must initialize/update the next_in, avail_in, next_out, and avail_out members. */
+/* flush may be MZ_NO_FLUSH, MZ_SYNC_FLUSH, or MZ_FINISH. */
+/* On the first call, if flush is MZ_FINISH it's assumed the input and output buffers are both sized large enough to decompress the entire stream in a single call (this is slightly faster). */
+/* MZ_FINISH implies that there are no more source bytes available beside what's already in the input buffer, and that the output buffer is large enough to hold the rest of the decompressed data. */
+/* Return values: */
+/* MZ_OK on success. Either more input is needed but not available, and/or there's more output to be written but the output buffer is full. */
+/* MZ_STREAM_END if all needed input has been consumed and all output bytes have been written. For zlib streams, the adler-32 of the decompressed data has also been verified. */
+/* MZ_STREAM_ERROR if the stream is bogus. */
+/* MZ_DATA_ERROR if the deflate stream is invalid. */
+/* MZ_PARAM_ERROR if one of the parameters is invalid. */
+/* MZ_BUF_ERROR if no forward progress is possible because the input buffer is empty but the inflater needs more input to continue, or if the output buffer is not large enough. Call mz_inflate() again */
+/* with more input data, or with more room in the output buffer (except when using single call decompression, described above). */
+MINIZ_EXPORT int mz_inflate(mz_streamp pStream, int flush);
+
+/* Deinitializes a decompressor. */
+MINIZ_EXPORT int mz_inflateEnd(mz_streamp pStream);
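+
+/* Illustrative sketch (not part of the upstream header): single-call style
+   decompression of a zlib stream whose decompressed size is known up front, so
+   MZ_FINISH on the first call is legal. Names are placeholders. */
+#if 0
+static int example_inflate_buffer(const unsigned char *src, size_t src_len,
+                                  unsigned char *dst, size_t dst_cap, size_t *pOut_len)
+{
+    mz_stream stream;
+    int status;
+    memset(&stream, 0, sizeof(stream));
+    stream.next_in = src;
+    stream.avail_in = (unsigned int)src_len;
+    stream.next_out = dst;
+    stream.avail_out = (unsigned int)dst_cap;
+    if ((status = mz_inflateInit(&stream)) != MZ_OK)
+        return status;
+    status = mz_inflate(&stream, MZ_FINISH);
+    *pOut_len = (size_t)stream.total_out;
+    mz_inflateEnd(&stream);
+    return (status == MZ_STREAM_END) ? MZ_OK : ((status == MZ_OK) ? MZ_BUF_ERROR : status);
+}
+#endif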
+
+/* Single-call decompression. */
+/* Returns MZ_OK on success, or one of the error codes from mz_inflate() on failure. */
+MINIZ_EXPORT int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len);
+MINIZ_EXPORT int mz_uncompress2(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong *pSource_len);
+
+/* Returns a string description of the specified error code, or NULL if the error code is invalid. */
+MINIZ_EXPORT const char *mz_error(int err);
+
+/* Redefine zlib-compatible names to miniz equivalents, so miniz.c can be used as a drop-in replacement for the subset of zlib that miniz.c supports. */
+/* Define MINIZ_NO_ZLIB_COMPATIBLE_NAMES to disable zlib-compatibility if you use zlib in the same project. */
+#ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+typedef unsigned char Byte;
+typedef unsigned int uInt;
+typedef mz_ulong uLong;
+typedef Byte Bytef;
+typedef uInt uIntf;
+typedef char charf;
+typedef int intf;
+typedef void *voidpf;
+typedef uLong uLongf;
+typedef void *voidp;
+typedef void *const voidpc;
+#define Z_NULL 0
+#define Z_NO_FLUSH MZ_NO_FLUSH
+#define Z_PARTIAL_FLUSH MZ_PARTIAL_FLUSH
+#define Z_SYNC_FLUSH MZ_SYNC_FLUSH
+#define Z_FULL_FLUSH MZ_FULL_FLUSH
+#define Z_FINISH MZ_FINISH
+#define Z_BLOCK MZ_BLOCK
+#define Z_OK MZ_OK
+#define Z_STREAM_END MZ_STREAM_END
+#define Z_NEED_DICT MZ_NEED_DICT
+#define Z_ERRNO MZ_ERRNO
+#define Z_STREAM_ERROR MZ_STREAM_ERROR
+#define Z_DATA_ERROR MZ_DATA_ERROR
+#define Z_MEM_ERROR MZ_MEM_ERROR
+#define Z_BUF_ERROR MZ_BUF_ERROR
+#define Z_VERSION_ERROR MZ_VERSION_ERROR
+#define Z_PARAM_ERROR MZ_PARAM_ERROR
+#define Z_NO_COMPRESSION MZ_NO_COMPRESSION
+#define Z_BEST_SPEED MZ_BEST_SPEED
+#define Z_BEST_COMPRESSION MZ_BEST_COMPRESSION
+#define Z_DEFAULT_COMPRESSION MZ_DEFAULT_COMPRESSION
+#define Z_DEFAULT_STRATEGY MZ_DEFAULT_STRATEGY
+#define Z_FILTERED MZ_FILTERED
+#define Z_HUFFMAN_ONLY MZ_HUFFMAN_ONLY
+#define Z_RLE MZ_RLE
+#define Z_FIXED MZ_FIXED
+#define Z_DEFLATED MZ_DEFLATED
+#define Z_DEFAULT_WINDOW_BITS MZ_DEFAULT_WINDOW_BITS
+#define alloc_func mz_alloc_func
+#define free_func mz_free_func
+#define internal_state mz_internal_state
+#define z_stream mz_stream
+#define deflateInit mz_deflateInit
+#define deflateInit2 mz_deflateInit2
+#define deflateReset mz_deflateReset
+#define deflate mz_deflate
+#define deflateEnd mz_deflateEnd
+#define deflateBound mz_deflateBound
+#define compress mz_compress
+#define compress2 mz_compress2
+#define compressBound mz_compressBound
+#define inflateInit mz_inflateInit
+#define inflateInit2 mz_inflateInit2
+#define inflateReset mz_inflateReset
+#define inflate mz_inflate
+#define inflateEnd mz_inflateEnd
+#define uncompress mz_uncompress
+#define uncompress2 mz_uncompress2
+#define crc32 mz_crc32
+#define adler32 mz_adler32
+#define MAX_WBITS 15
+#define MAX_MEM_LEVEL 9
+#define zError mz_error
+#define ZLIB_VERSION MZ_VERSION
+#define ZLIB_VERNUM MZ_VERNUM
+#define ZLIB_VER_MAJOR MZ_VER_MAJOR
+#define ZLIB_VER_MINOR MZ_VER_MINOR
+#define ZLIB_VER_REVISION MZ_VER_REVISION
+#define ZLIB_VER_SUBREVISION MZ_VER_SUBREVISION
+#define zlibVersion mz_version
+#define zlib_version mz_version()
+#endif /* #ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES */
+
+#endif /* MINIZ_NO_ZLIB_APIS */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+
+
+#pragma once
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+
+/* ------------------- Types and macros */
+typedef unsigned char mz_uint8;
+typedef signed short mz_int16;
+typedef unsigned short mz_uint16;
+typedef unsigned int mz_uint32;
+typedef unsigned int mz_uint;
+typedef int64_t mz_int64;
+typedef uint64_t mz_uint64;
+typedef int mz_bool;
+
+#define MZ_FALSE (0)
+#define MZ_TRUE (1)
+
+/* Works around MSVC's spammy "warning C4127: conditional expression is constant" message. */
+#ifdef _MSC_VER
+#define MZ_MACRO_END while (0, 0)
+#else
+#define MZ_MACRO_END while (0)
+#endif
+
+#ifdef MINIZ_NO_STDIO
+#define MZ_FILE void *
+#else
+#include <stdio.h>
+#define MZ_FILE FILE
+#endif /* #ifdef MINIZ_NO_STDIO */
+
+#ifdef MINIZ_NO_TIME
+typedef struct mz_dummy_time_t_tag
+{
+ int m_dummy;
+} mz_dummy_time_t;
+#define MZ_TIME_T mz_dummy_time_t
+#else
+#define MZ_TIME_T time_t
+#endif
+
+#define MZ_ASSERT(x) assert(x)
+
+#ifdef MINIZ_NO_MALLOC
+#define MZ_MALLOC(x) NULL
+#define MZ_FREE(x) (void)x, ((void)0)
+#define MZ_REALLOC(p, x) NULL
+#else
+#define MZ_MALLOC(x) malloc(x)
+#define MZ_FREE(x) free(x)
+#define MZ_REALLOC(p, x) realloc(p, x)
+#endif
+
+#define MZ_MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MZ_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MZ_CLEAR_OBJ(obj) memset(&(obj), 0, sizeof(obj))
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+#define MZ_READ_LE16(p) *((const mz_uint16 *)(p))
+#define MZ_READ_LE32(p) *((const mz_uint32 *)(p))
+#else
+#define MZ_READ_LE16(p) ((mz_uint32)(((const mz_uint8 *)(p))[0]) | ((mz_uint32)(((const mz_uint8 *)(p))[1]) << 8U))
+#define MZ_READ_LE32(p) ((mz_uint32)(((const mz_uint8 *)(p))[0]) | ((mz_uint32)(((const mz_uint8 *)(p))[1]) << 8U) | ((mz_uint32)(((const mz_uint8 *)(p))[2]) << 16U) | ((mz_uint32)(((const mz_uint8 *)(p))[3]) << 24U))
+#endif
+
+#define MZ_READ_LE64(p) (((mz_uint64)MZ_READ_LE32(p)) | (((mz_uint64)MZ_READ_LE32((const mz_uint8 *)(p) + sizeof(mz_uint32))) << 32U))
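+
+/* Illustrative sketch (not part of the upstream header): the macros read
+   little-endian integers from arbitrary byte positions, e.g. fields of an
+   on-disk record. The offsets below are placeholders. */
+#if 0
+static void example_read_le_fields(const mz_uint8 *p)
+{
+    mz_uint32 sig = MZ_READ_LE32(p);                    /* bytes 0..3 */
+    mz_uint16 version = (mz_uint16)MZ_READ_LE16(p + 4); /* bytes 4..5 */
+    mz_uint64 big = MZ_READ_LE64(p + 6);                /* bytes 6..13 */
+    (void)sig; (void)version; (void)big;
+}
+#endif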
+
+#ifdef _MSC_VER
+#define MZ_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+#define MZ_FORCEINLINE __inline__ __attribute__((__always_inline__))
+#else
+#define MZ_FORCEINLINE inline
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern MINIZ_EXPORT void *miniz_def_alloc_func(void *opaque, size_t items, size_t size);
+extern MINIZ_EXPORT void miniz_def_free_func(void *opaque, void *address);
+extern MINIZ_EXPORT void *miniz_def_realloc_func(void *opaque, void *address, size_t items, size_t size);
+
+#define MZ_UINT16_MAX (0xFFFFU)
+#define MZ_UINT32_MAX (0xFFFFFFFFU)
+
+#ifdef __cplusplus
+}
+#endif
+ #pragma once
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* ------------------- Low-level Compression API Definitions */
+
+/* Set TDEFL_LESS_MEMORY to 1 to use less memory (compression will be slightly slower, and raw/dynamic blocks will be output more frequently). */
+#define TDEFL_LESS_MEMORY 0
+
+/* tdefl_init() compression flags logically OR'd together (low 12 bits contain the max. number of probes per dictionary search): */
+/* TDEFL_DEFAULT_MAX_PROBES: The compressor defaults to 128 dictionary probes per dictionary search. 0=Huffman only, 1=Huffman+LZ (fastest/crap compression), 4095=Huffman+LZ (slowest/best compression). */
+enum
+{
+ TDEFL_HUFFMAN_ONLY = 0,
+ TDEFL_DEFAULT_MAX_PROBES = 128,
+ TDEFL_MAX_PROBES_MASK = 0xFFF
+};
+
+/* TDEFL_WRITE_ZLIB_HEADER: If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data. */
+/* TDEFL_COMPUTE_ADLER32: Always compute the adler-32 of the input data (even when not writing zlib headers). */
+/* TDEFL_GREEDY_PARSING_FLAG: Set to use faster greedy parsing, instead of more efficient lazy parsing. */
+/* TDEFL_NONDETERMINISTIC_PARSING_FLAG: Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory). */
+/* TDEFL_RLE_MATCHES: Only look for RLE matches (matches with a distance of 1) */
+/* TDEFL_FILTER_MATCHES: Discards matches <= 5 chars if enabled. */
+/* TDEFL_FORCE_ALL_STATIC_BLOCKS: Disable usage of optimized Huffman tables. */
+/* TDEFL_FORCE_ALL_RAW_BLOCKS: Only use raw (uncompressed) deflate blocks. */
+/* The low 12 bits are reserved to control the max # of hash probes per dictionary lookup (see TDEFL_MAX_PROBES_MASK). */
+enum
+{
+ TDEFL_WRITE_ZLIB_HEADER = 0x01000,
+ TDEFL_COMPUTE_ADLER32 = 0x02000,
+ TDEFL_GREEDY_PARSING_FLAG = 0x04000,
+ TDEFL_NONDETERMINISTIC_PARSING_FLAG = 0x08000,
+ TDEFL_RLE_MATCHES = 0x10000,
+ TDEFL_FILTER_MATCHES = 0x20000,
+ TDEFL_FORCE_ALL_STATIC_BLOCKS = 0x40000,
+ TDEFL_FORCE_ALL_RAW_BLOCKS = 0x80000
+};
+
+/* High level compression functions: */
+/* tdefl_compress_mem_to_heap() compresses a block in memory to a heap block allocated via malloc(). */
+/* On entry: */
+/* pSrc_buf, src_buf_len: Pointer and size of source block to compress. */
+/* flags: The max match finder probes (default is 128) logically OR'd against the above flags. Higher probes are slower but improve compression. */
+/* On return: */
+/* Function returns a pointer to the compressed data, or NULL on failure. */
+/* *pOut_len will be set to the compressed data's size, which could be larger than src_buf_len on uncompressible data. */
+/* The caller must free() the returned block when it's no longer needed. */
+MINIZ_EXPORT void *tdefl_compress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags);
+
+/* tdefl_compress_mem_to_mem() compresses a block in memory to another block in memory. */
+/* Returns 0 on failure. */
+MINIZ_EXPORT size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags);
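+
+/* Illustrative sketch (not part of the upstream header): one-shot compression to a
+   malloc()'d block, requesting a zlib wrapper plus the default probe count. */
+#if 0
+static void *example_tdefl_to_heap(const void *pSrc, size_t src_len, size_t *pOut_len)
+{
+    /* Caller must free() the returned block. */
+    return tdefl_compress_mem_to_heap(pSrc, src_len, pOut_len,
+                                      TDEFL_DEFAULT_MAX_PROBES | TDEFL_WRITE_ZLIB_HEADER);
+}
+#endif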
+
+/* Compresses an image to a compressed PNG file in memory. */
+/* On entry: */
+/* pImage, w, h, and num_chans describe the image to compress. num_chans may be 1, 2, 3, or 4. */
+/* The image pitch in bytes per scanline will be w*num_chans. The leftmost pixel on the top scanline is stored first in memory. */
+/* level may range from [0,10]; use MZ_NO_COMPRESSION, MZ_BEST_SPEED, MZ_BEST_COMPRESSION, etc., or MZ_DEFAULT_LEVEL as a decent default */
+/* If flip is true, the image will be flipped on the Y axis (useful for OpenGL apps). */
+/* On return: */
+/* Function returns a pointer to the compressed data, or NULL on failure. */
+/* *pLen_out will be set to the size of the PNG image file. */
+/* The caller must mz_free() the returned heap block (which will typically be larger than *pLen_out) when it's no longer needed. */
+MINIZ_EXPORT void *tdefl_write_image_to_png_file_in_memory_ex(const void *pImage, int w, int h, int num_chans, size_t *pLen_out, mz_uint level, mz_bool flip);
+MINIZ_EXPORT void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h, int num_chans, size_t *pLen_out);
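+
+/* Illustrative sketch (not part of the upstream header): encoding a raw RGBA
+   image to an in-memory PNG file; the pixel buffer is a placeholder. */
+#if 0
+static void example_save_png(const mz_uint8 *rgba, int w, int h)
+{
+    size_t png_len = 0;
+    void *png = tdefl_write_image_to_png_file_in_memory_ex(rgba, w, h, 4 /* RGBA */,
+                                                           &png_len, MZ_DEFAULT_LEVEL, MZ_FALSE);
+    if (png)
+    {
+        /* write png[0..png_len) to disk, then release the heap block */
+        mz_free(png);
+    }
+}
+#endif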
+
+/* Output stream interface. The compressor uses this interface to write compressed data. It'll typically be called with TDEFL_OUT_BUF_SIZE bytes at a time. */
+typedef mz_bool (*tdefl_put_buf_func_ptr)(const void *pBuf, int len, void *pUser);
+
+/* tdefl_compress_mem_to_output() compresses a block to an output stream. The above helpers use this function internally. */
+MINIZ_EXPORT mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
+
+enum
+{
+ TDEFL_MAX_HUFF_TABLES = 3,
+ TDEFL_MAX_HUFF_SYMBOLS_0 = 288,
+ TDEFL_MAX_HUFF_SYMBOLS_1 = 32,
+ TDEFL_MAX_HUFF_SYMBOLS_2 = 19,
+ TDEFL_LZ_DICT_SIZE = 32768,
+ TDEFL_LZ_DICT_SIZE_MASK = TDEFL_LZ_DICT_SIZE - 1,
+ TDEFL_MIN_MATCH_LEN = 3,
+ TDEFL_MAX_MATCH_LEN = 258
+};
+
+/* TDEFL_OUT_BUF_SIZE MUST be large enough to hold a single entire compressed output block (using static/fixed Huffman codes). */
+#if TDEFL_LESS_MEMORY
+enum
+{
+ TDEFL_LZ_CODE_BUF_SIZE = 24 * 1024,
+ TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13) / 10,
+ TDEFL_MAX_HUFF_SYMBOLS = 288,
+ TDEFL_LZ_HASH_BITS = 12,
+ TDEFL_LEVEL1_HASH_SIZE_MASK = 4095,
+ TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3,
+ TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS
+};
+#else
+enum
+{
+ TDEFL_LZ_CODE_BUF_SIZE = 64 * 1024,
+ TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13) / 10,
+ TDEFL_MAX_HUFF_SYMBOLS = 288,
+ TDEFL_LZ_HASH_BITS = 15,
+ TDEFL_LEVEL1_HASH_SIZE_MASK = 4095,
+ TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3,
+ TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS
+};
+#endif
+
+/* The low-level tdefl functions below may be used directly if the above helper functions aren't flexible enough. The low-level functions don't make any heap allocations, unlike the above helper functions. */
+typedef enum {
+ TDEFL_STATUS_BAD_PARAM = -2,
+ TDEFL_STATUS_PUT_BUF_FAILED = -1,
+ TDEFL_STATUS_OKAY = 0,
+ TDEFL_STATUS_DONE = 1
+} tdefl_status;
+
+/* Must map to MZ_NO_FLUSH, MZ_SYNC_FLUSH, etc. enums */
+typedef enum {
+ TDEFL_NO_FLUSH = 0,
+ TDEFL_SYNC_FLUSH = 2,
+ TDEFL_FULL_FLUSH = 3,
+ TDEFL_FINISH = 4
+} tdefl_flush;
+
+/* tdefl's compression state structure. */
+typedef struct
+{
+ tdefl_put_buf_func_ptr m_pPut_buf_func;
+ void *m_pPut_buf_user;
+ mz_uint m_flags, m_max_probes[2];
+ int m_greedy_parsing;
+ mz_uint m_adler32, m_lookahead_pos, m_lookahead_size, m_dict_size;
+ mz_uint8 *m_pLZ_code_buf, *m_pLZ_flags, *m_pOutput_buf, *m_pOutput_buf_end;
+ mz_uint m_num_flags_left, m_total_lz_bytes, m_lz_code_buf_dict_pos, m_bits_in, m_bit_buffer;
+ mz_uint m_saved_match_dist, m_saved_match_len, m_saved_lit, m_output_flush_ofs, m_output_flush_remaining, m_finished, m_block_index, m_wants_to_finish;
+ tdefl_status m_prev_return_status;
+ const void *m_pIn_buf;
+ void *m_pOut_buf;
+ size_t *m_pIn_buf_size, *m_pOut_buf_size;
+ tdefl_flush m_flush;
+ const mz_uint8 *m_pSrc;
+ size_t m_src_buf_left, m_out_buf_ofs;
+ mz_uint8 m_dict[TDEFL_LZ_DICT_SIZE + TDEFL_MAX_MATCH_LEN - 1];
+ mz_uint16 m_huff_count[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+ mz_uint16 m_huff_codes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+ mz_uint8 m_huff_code_sizes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+ mz_uint8 m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE];
+ mz_uint16 m_next[TDEFL_LZ_DICT_SIZE];
+ mz_uint16 m_hash[TDEFL_LZ_HASH_SIZE];
+ mz_uint8 m_output_buf[TDEFL_OUT_BUF_SIZE];
+} tdefl_compressor;
+
+/* Initializes the compressor. */
+/* There is no corresponding deinit() function because the tdefl API's do not dynamically allocate memory. */
+/* pPut_buf_func: If non-NULL, output data will be supplied to the specified callback. In this case, the user should call the tdefl_compress_buffer() API for compression. */
+/* If pPut_buf_func is NULL the user should always call the tdefl_compress() API. */
+/* flags: See the above enums (TDEFL_HUFFMAN_ONLY, TDEFL_WRITE_ZLIB_HEADER, etc.) */
+MINIZ_EXPORT tdefl_status tdefl_init(tdefl_compressor *d, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
+
+/* Compresses a block of data, consuming as much of the specified input buffer as possible, and writing as much compressed data to the specified output buffer as possible. */
+MINIZ_EXPORT tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf, size_t *pIn_buf_size, void *pOut_buf, size_t *pOut_buf_size, tdefl_flush flush);
+
+/* tdefl_compress_buffer() is only usable when tdefl_init() is called with a non-NULL tdefl_put_buf_func_ptr. */
+/* tdefl_compress_buffer() always consumes the entire input buffer. */
+MINIZ_EXPORT tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf, size_t in_buf_size, tdefl_flush flush);
+
+MINIZ_EXPORT tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d);
+MINIZ_EXPORT mz_uint32 tdefl_get_adler32(tdefl_compressor *d);
+
+/* Create tdefl_compress() flags given zlib-style compression parameters. */
+/* level may range from [0,10] (where 10 is absolute max compression, but may be much slower on some files) */
+/* window_bits may be -15 (raw deflate) or 15 (zlib) */
+/* strategy may be either MZ_DEFAULT_STRATEGY, MZ_FILTERED, MZ_HUFFMAN_ONLY, MZ_RLE, or MZ_FIXED */
+MINIZ_EXPORT mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits, int strategy);
+
+#ifndef MINIZ_NO_MALLOC
+/* Allocate the tdefl_compressor structure in C so that */
+/* non-C language bindings to tdefl_ API don't need to worry about */
+/* structure size and allocation mechanism. */
+MINIZ_EXPORT tdefl_compressor *tdefl_compressor_alloc(void);
+MINIZ_EXPORT void tdefl_compressor_free(tdefl_compressor *pComp);
+#endif
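+
+/* Illustrative sketch (not part of the upstream header): driving the low-level
+   compressor directly, with flags derived from zlib-style parameters. Assumes
+   MINIZ_NO_MALLOC is undefined; all names are placeholders. */
+#if 0
+static tdefl_status example_tdefl_one_shot(const void *pSrc, size_t src_len,
+                                           void *pDst, size_t *pDst_len)
+{
+    tdefl_status status;
+    mz_uint comp_flags = tdefl_create_comp_flags_from_zip_params(
+        MZ_DEFAULT_LEVEL, MZ_DEFAULT_WINDOW_BITS, MZ_DEFAULT_STRATEGY);
+    tdefl_compressor *d = tdefl_compressor_alloc();
+    if (!d)
+        return TDEFL_STATUS_BAD_PARAM;
+    tdefl_init(d, NULL, NULL, (int)comp_flags); /* no callback -> use tdefl_compress() */
+    status = tdefl_compress(d, pSrc, &src_len, pDst, pDst_len, TDEFL_FINISH);
+    tdefl_compressor_free(d);
+    return status; /* TDEFL_STATUS_DONE on success */
+}
+#endif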
+
+#ifdef __cplusplus
+}
+#endif
+ #pragma once
+
+/* ------------------- Low-level Decompression API Definitions */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Decompression flags used by tinfl_decompress(). */
+/* TINFL_FLAG_PARSE_ZLIB_HEADER: If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream. */
+/* TINFL_FLAG_HAS_MORE_INPUT: If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input. */
+/* TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF: If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB). */
+/* TINFL_FLAG_COMPUTE_ADLER32: Force adler-32 checksum computation of the decompressed bytes. */
+enum
+{
+ TINFL_FLAG_PARSE_ZLIB_HEADER = 1,
+ TINFL_FLAG_HAS_MORE_INPUT = 2,
+ TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF = 4,
+ TINFL_FLAG_COMPUTE_ADLER32 = 8
+};
+
+/* High level decompression functions: */
+/* tinfl_decompress_mem_to_heap() decompresses a block in memory to a heap block allocated via malloc(). */
+/* On entry: */
+/* pSrc_buf, src_buf_len: Pointer and size of the Deflate or zlib source data to decompress. */
+/* On return: */
+/* Function returns a pointer to the decompressed data, or NULL on failure. */
+/* *pOut_len will be set to the decompressed data's size, which will typically be larger than src_buf_len. */
+/* The caller must call mz_free() on the returned block when it's no longer needed. */
+MINIZ_EXPORT void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags);
+
+/* tinfl_decompress_mem_to_mem() decompresses a block in memory to another block in memory. */
+/* Returns TINFL_DECOMPRESS_MEM_TO_MEM_FAILED on failure, or the number of bytes written on success. */
+#define TINFL_DECOMPRESS_MEM_TO_MEM_FAILED ((size_t)(-1))
+MINIZ_EXPORT size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags);
+
+/* tinfl_decompress_mem_to_callback() decompresses a block in memory to an internal 32KB buffer, and a user provided callback function will be called to flush the buffer. */
+/* Returns 1 on success or 0 on failure. */
+typedef int (*tinfl_put_buf_func_ptr)(const void *pBuf, int len, void *pUser);
+MINIZ_EXPORT int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size, tinfl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
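+
+/* Illustrative sketch (not part of the upstream header): one-shot decompression of
+   a zlib-wrapped buffer to a heap block; names are placeholders. */
+#if 0
+static void *example_tinfl_to_heap(const void *pSrc, size_t src_len, size_t *pOut_len)
+{
+    /* Caller must mz_free() the returned block. */
+    return tinfl_decompress_mem_to_heap(pSrc, src_len, pOut_len, TINFL_FLAG_PARSE_ZLIB_HEADER);
+}
+#endif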
+
+struct tinfl_decompressor_tag;
+typedef struct tinfl_decompressor_tag tinfl_decompressor;
+
+#ifndef MINIZ_NO_MALLOC
+/* Allocate the tinfl_decompressor structure in C so that */
+/* non-C language bindings to tinfl_ API don't need to worry about */
+/* structure size and allocation mechanism. */
+MINIZ_EXPORT tinfl_decompressor *tinfl_decompressor_alloc(void);
+MINIZ_EXPORT void tinfl_decompressor_free(tinfl_decompressor *pDecomp);
+#endif
+
+/* Max size of LZ dictionary. */
+#define TINFL_LZ_DICT_SIZE 32768
+
+/* Return status. */
+typedef enum {
+ /* This status indicates the inflator needs 1 or more input bytes to make forward progress, but the caller is indicating that no more are available. The compressed data */
+ /* is probably corrupted. If you call the inflator again with more bytes it'll try to continue processing the input but this is a BAD sign (either the data is corrupted or you called it incorrectly). */
+ /* If you call it again with no input you'll just get TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS again. */
+ TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS = -4,
+
+ /* This status indicates that one or more of the input parameters were obviously bogus. (You can try calling it again, but if you get this error the calling code is wrong.) */
+ TINFL_STATUS_BAD_PARAM = -3,
+
+ /* This status indicates the inflator has finished but the adler32 check of the uncompressed data didn't match. If you call it again it'll return TINFL_STATUS_DONE. */
+ TINFL_STATUS_ADLER32_MISMATCH = -2,
+
+ /* This status indicates the inflator has somehow failed (bad code, corrupted input, etc.). If you call it again without resetting via tinfl_init() it'll just keep on returning the same failure status code. */
+ TINFL_STATUS_FAILED = -1,
+
+ /* Any status code less than TINFL_STATUS_DONE must indicate a failure. */
+
+ /* This status indicates the inflator has returned every byte of uncompressed data that it can, has consumed every byte that it needed, has successfully reached the end of the deflate stream, and, */
+ /* if zlib headers and adler32 checking are enabled, that it has successfully checked the uncompressed data's adler32. If you call it again you'll just get TINFL_STATUS_DONE over and over again. */
+ TINFL_STATUS_DONE = 0,
+
+ /* This status indicates the inflator MUST have more input data (even 1 byte) before it can make any more forward progress, or you need to clear the TINFL_FLAG_HAS_MORE_INPUT */
+ /* flag on the next call if you don't have any more source data. If the source data was somehow corrupted it's also possible (but unlikely) for the inflator to keep on demanding input to */
+ /* proceed, so be sure to properly set the TINFL_FLAG_HAS_MORE_INPUT flag. */
+ TINFL_STATUS_NEEDS_MORE_INPUT = 1,
+
+ /* This status indicates the inflator definitely has 1 or more bytes of uncompressed data available, but it cannot write this data into the output buffer. */
+ /* Note if the source compressed data was corrupted it's possible for the inflator to return a lot of uncompressed data to the caller. I've been assuming you know how much uncompressed data to expect */
+ /* (either exact or worst case) and will stop calling the inflator and fail after receiving too much. In pure streaming scenarios where you have no idea how many bytes to expect this may not be possible */
+ /* so I may need to add some code to address this. */
+ TINFL_STATUS_HAS_MORE_OUTPUT = 2
+} tinfl_status;
+
+/* Initializes the decompressor to its initial state. */
+#define tinfl_init(r) \
+ do \
+ { \
+ (r)->m_state = 0; \
+ } \
+ MZ_MACRO_END
+#define tinfl_get_adler32(r) (r)->m_check_adler32
+
+/* Main low-level decompressor coroutine function. This is the only function actually needed for decompression. All the other functions are just high-level helpers for improved usability. */
+/* This is a universal API, i.e. it can be used as a building block to build any desired higher level decompression API. In the limit case, it can be called once per every byte input or output. */
+MINIZ_EXPORT tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn_buf_next, size_t *pIn_buf_size, mz_uint8 *pOut_buf_start, mz_uint8 *pOut_buf_next, size_t *pOut_buf_size, const mz_uint32 decomp_flags);
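+
+/* Illustrative sketch (not part of the upstream header): a streaming inflate loop
+   into a wrapping TINFL_LZ_DICT_SIZE output window. All input is assumed to already
+   be in memory (so TINFL_FLAG_HAS_MORE_INPUT is not set), and MINIZ_NO_MALLOC is
+   assumed to be undefined; names are placeholders. */
+#if 0
+static tinfl_status example_tinfl_stream(const mz_uint8 *pSrc, size_t src_len,
+                                         mz_uint8 *dict /* TINFL_LZ_DICT_SIZE bytes */)
+{
+    size_t dict_ofs = 0;
+    tinfl_status status;
+    tinfl_decompressor *inflator = tinfl_decompressor_alloc();
+    if (!inflator)
+        return TINFL_STATUS_FAILED;
+    tinfl_init(inflator);
+    for (;;)
+    {
+        size_t in_bytes = src_len, out_bytes = TINFL_LZ_DICT_SIZE - dict_ofs;
+        status = tinfl_decompress(inflator, pSrc, &in_bytes, dict, dict + dict_ofs,
+                                  &out_bytes, TINFL_FLAG_PARSE_ZLIB_HEADER);
+        pSrc += in_bytes;
+        src_len -= in_bytes;
+        /* consume dict[dict_ofs .. dict_ofs + out_bytes) here */
+        dict_ofs = (dict_ofs + out_bytes) & (TINFL_LZ_DICT_SIZE - 1);
+        if (status != TINFL_STATUS_HAS_MORE_OUTPUT)
+            break;
+    }
+    tinfl_decompressor_free(inflator);
+    return status; /* TINFL_STATUS_DONE on success */
+}
+#endif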
+
+/* Internal/private bits follow. */
+enum
+{
+ TINFL_MAX_HUFF_TABLES = 3,
+ TINFL_MAX_HUFF_SYMBOLS_0 = 288,
+ TINFL_MAX_HUFF_SYMBOLS_1 = 32,
+ TINFL_MAX_HUFF_SYMBOLS_2 = 19,
+ TINFL_FAST_LOOKUP_BITS = 10,
+ TINFL_FAST_LOOKUP_SIZE = 1 << TINFL_FAST_LOOKUP_BITS
+};
+
+typedef struct
+{
+ mz_uint8 m_code_size[TINFL_MAX_HUFF_SYMBOLS_0];
+ mz_int16 m_look_up[TINFL_FAST_LOOKUP_SIZE], m_tree[TINFL_MAX_HUFF_SYMBOLS_0 * 2];
+} tinfl_huff_table;
+
+#if MINIZ_HAS_64BIT_REGISTERS
+#define TINFL_USE_64BIT_BITBUF 1
+#else
+#define TINFL_USE_64BIT_BITBUF 0
+#endif
+
+#if TINFL_USE_64BIT_BITBUF
+typedef mz_uint64 tinfl_bit_buf_t;
+#define TINFL_BITBUF_SIZE (64)
+#else
+typedef mz_uint32 tinfl_bit_buf_t;
+#define TINFL_BITBUF_SIZE (32)
+#endif
+
+struct tinfl_decompressor_tag
+{
+ mz_uint32 m_state, m_num_bits, m_zhdr0, m_zhdr1, m_z_adler32, m_final, m_type, m_check_adler32, m_dist, m_counter, m_num_extra, m_table_sizes[TINFL_MAX_HUFF_TABLES];
+ tinfl_bit_buf_t m_bit_buf;
+ size_t m_dist_from_out_buf_start;
+ tinfl_huff_table m_tables[TINFL_MAX_HUFF_TABLES];
+ mz_uint8 m_raw_header[4], m_len_codes[TINFL_MAX_HUFF_SYMBOLS_0 + TINFL_MAX_HUFF_SYMBOLS_1 + 137];
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#pragma once
+
+
+/* ------------------- ZIP archive reading/writing */
+
+#ifndef MINIZ_NO_ARCHIVE_APIS
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum
+{
+ /* Note: These enums can be reduced as needed to save memory or stack space - they are pretty conservative. */
+ MZ_ZIP_MAX_IO_BUF_SIZE = 64 * 1024,
+ MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE = 512,
+ MZ_ZIP_MAX_ARCHIVE_FILE_COMMENT_SIZE = 512
+};
+
+typedef struct
+{
+ /* Central directory file index. */
+ mz_uint32 m_file_index;
+
+ /* Byte offset of this entry in the archive's central directory. Note we currently only support central directories of up to UINT_MAX bytes. */
+ mz_uint64 m_central_dir_ofs;
+
+ /* These fields are copied directly from the zip's central dir. */
+ mz_uint16 m_version_made_by;
+ mz_uint16 m_version_needed;
+ mz_uint16 m_bit_flag;
+ mz_uint16 m_method;
+
+#ifndef MINIZ_NO_TIME
+ MZ_TIME_T m_time;
+#endif
+
+ /* CRC-32 of uncompressed data. */
+ mz_uint32 m_crc32;
+
+ /* File's compressed size. */
+ mz_uint64 m_comp_size;
+
+ /* File's uncompressed size. Note, I've seen some old archives where directory entries had 512 bytes for their uncompressed sizes, but when you try to unpack them you actually get 0 bytes. */
+ mz_uint64 m_uncomp_size;
+
+ /* Zip internal and external file attributes. */
+ mz_uint16 m_internal_attr;
+ mz_uint32 m_external_attr;
+
+ /* Entry's local header file offset in bytes. */
+ mz_uint64 m_local_header_ofs;
+
+ /* Size of comment in bytes. */
+ mz_uint32 m_comment_size;
+
+ /* MZ_TRUE if the entry appears to be a directory. */
+ mz_bool m_is_directory;
+
+ /* MZ_TRUE if the entry uses encryption/strong encryption (which miniz_zip doesn't support) */
+ mz_bool m_is_encrypted;
+
+ /* MZ_TRUE if the file is not encrypted, a patch file, and if it uses a compression method we support. */
+ mz_bool m_is_supported;
+
+ /* Filename. If string ends in '/' it's a subdirectory entry. */
+ /* Guaranteed to be zero terminated, may be truncated to fit. */
+ char m_filename[MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE];
+
+ /* Comment field. */
+ /* Guaranteed to be zero terminated, may be truncated to fit. */
+ char m_comment[MZ_ZIP_MAX_ARCHIVE_FILE_COMMENT_SIZE];
+
+} mz_zip_archive_file_stat;
+
+typedef size_t (*mz_file_read_func)(void *pOpaque, mz_uint64 file_ofs, void *pBuf, size_t n);
+typedef size_t (*mz_file_write_func)(void *pOpaque, mz_uint64 file_ofs, const void *pBuf, size_t n);
+typedef mz_bool (*mz_file_needs_keepalive)(void *pOpaque);
+
+struct mz_zip_internal_state_tag;
+typedef struct mz_zip_internal_state_tag mz_zip_internal_state;
+
+typedef enum {
+ MZ_ZIP_MODE_INVALID = 0,
+ MZ_ZIP_MODE_READING = 1,
+ MZ_ZIP_MODE_WRITING = 2,
+ MZ_ZIP_MODE_WRITING_HAS_BEEN_FINALIZED = 3
+} mz_zip_mode;
+
+typedef enum {
+ MZ_ZIP_FLAG_CASE_SENSITIVE = 0x0100,
+ MZ_ZIP_FLAG_IGNORE_PATH = 0x0200,
+ MZ_ZIP_FLAG_COMPRESSED_DATA = 0x0400,
+ MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY = 0x0800,
+ MZ_ZIP_FLAG_VALIDATE_LOCATE_FILE_FLAG = 0x1000, /* if enabled, mz_zip_reader_locate_file() will be called on each file as it's validated to ensure the func finds the file in the central dir (intended for testing) */
+ MZ_ZIP_FLAG_VALIDATE_HEADERS_ONLY = 0x2000, /* validate the local headers, but don't decompress the entire file and check the crc32 */
+ MZ_ZIP_FLAG_WRITE_ZIP64 = 0x4000, /* always use the zip64 file format, instead of the original zip file format with automatic switch to zip64. Use as flags parameter with mz_zip_writer_init*_v2 */
+ MZ_ZIP_FLAG_WRITE_ALLOW_READING = 0x8000,
+ MZ_ZIP_FLAG_ASCII_FILENAME = 0x10000,
+ /*After adding a compressed file, seek back
+ to local file header and set the correct sizes*/
+ MZ_ZIP_FLAG_WRITE_HEADER_SET_SIZE = 0x20000
+} mz_zip_flags;
+
+typedef enum {
+ MZ_ZIP_TYPE_INVALID = 0,
+ MZ_ZIP_TYPE_USER,
+ MZ_ZIP_TYPE_MEMORY,
+ MZ_ZIP_TYPE_HEAP,
+ MZ_ZIP_TYPE_FILE,
+ MZ_ZIP_TYPE_CFILE,
+ MZ_ZIP_TOTAL_TYPES
+} mz_zip_type;
+
+/* miniz error codes. Be sure to update mz_zip_get_error_string() if you add or modify this enum. */
+typedef enum {
+ MZ_ZIP_NO_ERROR = 0,
+ MZ_ZIP_UNDEFINED_ERROR,
+ MZ_ZIP_TOO_MANY_FILES,
+ MZ_ZIP_FILE_TOO_LARGE,
+ MZ_ZIP_UNSUPPORTED_METHOD,
+ MZ_ZIP_UNSUPPORTED_ENCRYPTION,
+ MZ_ZIP_UNSUPPORTED_FEATURE,
+ MZ_ZIP_FAILED_FINDING_CENTRAL_DIR,
+ MZ_ZIP_NOT_AN_ARCHIVE,
+ MZ_ZIP_INVALID_HEADER_OR_CORRUPTED,
+ MZ_ZIP_UNSUPPORTED_MULTIDISK,
+ MZ_ZIP_DECOMPRESSION_FAILED,
+ MZ_ZIP_COMPRESSION_FAILED,
+ MZ_ZIP_UNEXPECTED_DECOMPRESSED_SIZE,
+ MZ_ZIP_CRC_CHECK_FAILED,
+ MZ_ZIP_UNSUPPORTED_CDIR_SIZE,
+ MZ_ZIP_ALLOC_FAILED,
+ MZ_ZIP_FILE_OPEN_FAILED,
+ MZ_ZIP_FILE_CREATE_FAILED,
+ MZ_ZIP_FILE_WRITE_FAILED,
+ MZ_ZIP_FILE_READ_FAILED,
+ MZ_ZIP_FILE_CLOSE_FAILED,
+ MZ_ZIP_FILE_SEEK_FAILED,
+ MZ_ZIP_FILE_STAT_FAILED,
+ MZ_ZIP_INVALID_PARAMETER,
+ MZ_ZIP_INVALID_FILENAME,
+ MZ_ZIP_BUF_TOO_SMALL,
+ MZ_ZIP_INTERNAL_ERROR,
+ MZ_ZIP_FILE_NOT_FOUND,
+ MZ_ZIP_ARCHIVE_TOO_LARGE,
+ MZ_ZIP_VALIDATION_FAILED,
+ MZ_ZIP_WRITE_CALLBACK_FAILED,
+ MZ_ZIP_TOTAL_ERRORS
+} mz_zip_error;
+
+typedef struct
+{
+ mz_uint64 m_archive_size;
+ mz_uint64 m_central_directory_file_ofs;
+
+ /* We only support up to UINT32_MAX files in zip64 mode. */
+ mz_uint32 m_total_files;
+ mz_zip_mode m_zip_mode;
+ mz_zip_type m_zip_type;
+ mz_zip_error m_last_error;
+
+ mz_uint64 m_file_offset_alignment;
+
+ mz_alloc_func m_pAlloc;
+ mz_free_func m_pFree;
+ mz_realloc_func m_pRealloc;
+ void *m_pAlloc_opaque;
+
+ mz_file_read_func m_pRead;
+ mz_file_write_func m_pWrite;
+ mz_file_needs_keepalive m_pNeeds_keepalive;
+ void *m_pIO_opaque;
+
+ mz_zip_internal_state *m_pState;
+
+} mz_zip_archive;
+
+typedef struct
+{
+ mz_zip_archive *pZip;
+ mz_uint flags;
+
+ int status;
+#ifndef MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
+ mz_uint file_crc32;
+#endif
+ mz_uint64 read_buf_size, read_buf_ofs, read_buf_avail, comp_remaining, out_buf_ofs, cur_file_ofs;
+ mz_zip_archive_file_stat file_stat;
+ void *pRead_buf;
+ void *pWrite_buf;
+
+ size_t out_blk_remain;
+
+ tinfl_decompressor inflator;
+
+} mz_zip_reader_extract_iter_state;
+
+/* -------- ZIP reading */
+
+/* Inits a ZIP archive reader. */
+/* These functions read and validate the archive's central directory. */
+MINIZ_EXPORT mz_bool mz_zip_reader_init(mz_zip_archive *pZip, mz_uint64 size, mz_uint flags);
+
+MINIZ_EXPORT mz_bool mz_zip_reader_init_mem(mz_zip_archive *pZip, const void *pMem, size_t size, mz_uint flags);
+
+#ifndef MINIZ_NO_STDIO
+/* Read an archive from a disk file. */
+/* file_start_ofs is the file offset where the archive actually begins, or 0. */
+/* archive_size is the true total size of the archive, which may be smaller than the file's actual size on disk. If zero the entire file is treated as the archive. */
+MINIZ_EXPORT mz_bool mz_zip_reader_init_file(mz_zip_archive *pZip, const char *pFilename, mz_uint32 flags);
+MINIZ_EXPORT mz_bool mz_zip_reader_init_file_v2(mz_zip_archive *pZip, const char *pFilename, mz_uint flags, mz_uint64 file_start_ofs, mz_uint64 archive_size);
+
+/* Read an archive from an already opened FILE, beginning at the current file position. */
+/* The archive is assumed to be archive_size bytes long. If archive_size is 0, then the entire rest of the file is assumed to contain the archive. */
+/* The FILE will NOT be closed when mz_zip_reader_end() is called. */
+MINIZ_EXPORT mz_bool mz_zip_reader_init_cfile(mz_zip_archive *pZip, MZ_FILE *pFile, mz_uint64 archive_size, mz_uint flags);
+#endif
+
+/* Ends archive reading, freeing all allocations, and closing the input archive file if mz_zip_reader_init_file() was used. */
+MINIZ_EXPORT mz_bool mz_zip_reader_end(mz_zip_archive *pZip);
+
+/* -------- ZIP reading or writing */
+
+/* Clears a mz_zip_archive struct to all zeros. */
+/* Important: This must be done before passing the struct to any mz_zip functions. */
+MINIZ_EXPORT void mz_zip_zero_struct(mz_zip_archive *pZip);
+
+MINIZ_EXPORT mz_zip_mode mz_zip_get_mode(mz_zip_archive *pZip);
+MINIZ_EXPORT mz_zip_type mz_zip_get_type(mz_zip_archive *pZip);
+
+/* Returns the total number of files in the archive. */
+MINIZ_EXPORT mz_uint mz_zip_reader_get_num_files(mz_zip_archive *pZip);
+
+MINIZ_EXPORT mz_uint64 mz_zip_get_archive_size(mz_zip_archive *pZip);
+MINIZ_EXPORT mz_uint64 mz_zip_get_archive_file_start_offset(mz_zip_archive *pZip);
+MINIZ_EXPORT MZ_FILE *mz_zip_get_cfile(mz_zip_archive *pZip);
+
+/* Reads n bytes of raw archive data, starting at file offset file_ofs, to pBuf. */
+MINIZ_EXPORT size_t mz_zip_read_archive_data(mz_zip_archive *pZip, mz_uint64 file_ofs, void *pBuf, size_t n);
+
+/* All mz_zip funcs set the m_last_error field in the mz_zip_archive struct. These functions retrieve/manipulate this field. */
+/* Note that the m_last_error functionality is not thread safe. */
+MINIZ_EXPORT mz_zip_error mz_zip_set_last_error(mz_zip_archive *pZip, mz_zip_error err_num);
+MINIZ_EXPORT mz_zip_error mz_zip_peek_last_error(mz_zip_archive *pZip);
+MINIZ_EXPORT mz_zip_error mz_zip_clear_last_error(mz_zip_archive *pZip);
+MINIZ_EXPORT mz_zip_error mz_zip_get_last_error(mz_zip_archive *pZip);
+MINIZ_EXPORT const char *mz_zip_get_error_string(mz_zip_error mz_err);
+
+/* MZ_TRUE if the archive file entry is a directory entry. */
+MINIZ_EXPORT mz_bool mz_zip_reader_is_file_a_directory(mz_zip_archive *pZip, mz_uint file_index);
+
+/* MZ_TRUE if the file is encrypted/strong encrypted. */
+MINIZ_EXPORT mz_bool mz_zip_reader_is_file_encrypted(mz_zip_archive *pZip, mz_uint file_index);
+
+/* MZ_TRUE if the compression method is supported, and the file is not encrypted, and the file is not a compressed patch file. */
+MINIZ_EXPORT mz_bool mz_zip_reader_is_file_supported(mz_zip_archive *pZip, mz_uint file_index);
+
+/* Retrieves the filename of an archive file entry. */
+/* Returns the number of bytes written to pFilename, or if filename_buf_size is 0 this function returns the number of bytes needed to fully store the filename. */
+MINIZ_EXPORT mz_uint mz_zip_reader_get_filename(mz_zip_archive *pZip, mz_uint file_index, char *pFilename, mz_uint filename_buf_size);
+
+/* Attempts to locate a file in the archive's central directory. */
+/* Valid flags: MZ_ZIP_FLAG_CASE_SENSITIVE, MZ_ZIP_FLAG_IGNORE_PATH */
+/* Returns -1 if the file cannot be found. */
+MINIZ_EXPORT int mz_zip_reader_locate_file(mz_zip_archive *pZip, const char *pName, const char *pComment, mz_uint flags);
+MINIZ_EXPORT mz_bool mz_zip_reader_locate_file_v2(mz_zip_archive *pZip, const char *pName, const char *pComment, mz_uint flags, mz_uint32 *file_index);
+
+/* Returns detailed information about an archive file entry. */
+MINIZ_EXPORT mz_bool mz_zip_reader_file_stat(mz_zip_archive *pZip, mz_uint file_index, mz_zip_archive_file_stat *pStat);
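+
+/* Illustrative sketch (not part of the upstream header): enumerating an archive's
+   entries. Assumes stdio support (MINIZ_NO_STDIO undefined); names are placeholders. */
+#if 0
+static mz_bool example_list_archive(const char *pZip_filename)
+{
+    mz_uint i, n;
+    mz_zip_archive zip;
+    mz_zip_zero_struct(&zip); /* required before any other mz_zip call */
+    if (!mz_zip_reader_init_file(&zip, pZip_filename, 0))
+        return MZ_FALSE;
+    n = mz_zip_reader_get_num_files(&zip);
+    for (i = 0; i < n; i++)
+    {
+        mz_zip_archive_file_stat st;
+        if (mz_zip_reader_file_stat(&zip, i, &st))
+            printf("%s: %llu bytes\n", st.m_filename, (unsigned long long)st.m_uncomp_size);
+    }
+    return mz_zip_reader_end(&zip);
+}
+#endif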
+
+/* MZ_TRUE if the file is in zip64 format. */
+/* A file is considered zip64 if it contained a zip64 end of central directory marker, or if it contained any zip64 extended file information fields in the central directory. */
+MINIZ_EXPORT mz_bool mz_zip_is_zip64(mz_zip_archive *pZip);
+
+/* Returns the total central directory size in bytes. */
+/* The current max supported size is <= MZ_UINT32_MAX. */
+MINIZ_EXPORT size_t mz_zip_get_central_dir_size(mz_zip_archive *pZip);
+
+/* Extracts an archive file to a memory buffer using no memory allocation. */
+/* There must be at least enough room on the stack to store the inflator's state (~34KB or so). */
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_to_mem_no_alloc(mz_zip_archive *pZip, mz_uint file_index, void *pBuf, size_t buf_size, mz_uint flags, void *pUser_read_buf, size_t user_read_buf_size);
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_file_to_mem_no_alloc(mz_zip_archive *pZip, const char *pFilename, void *pBuf, size_t buf_size, mz_uint flags, void *pUser_read_buf, size_t user_read_buf_size);
+
+/* Extracts an archive file to a memory buffer. */
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_to_mem(mz_zip_archive *pZip, mz_uint file_index, void *pBuf, size_t buf_size, mz_uint flags);
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_file_to_mem(mz_zip_archive *pZip, const char *pFilename, void *pBuf, size_t buf_size, mz_uint flags);
+
+/* Extracts an archive file to a dynamically allocated heap buffer. */
+/* The memory will be allocated via the mz_zip_archive's alloc/realloc functions. */
+/* Returns NULL and sets the last error on failure. */
+MINIZ_EXPORT void *mz_zip_reader_extract_to_heap(mz_zip_archive *pZip, mz_uint file_index, size_t *pSize, mz_uint flags);
+MINIZ_EXPORT void *mz_zip_reader_extract_file_to_heap(mz_zip_archive *pZip, const char *pFilename, size_t *pSize, mz_uint flags);
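+
+/* Illustrative sketch (not part of the upstream header): locate an entry by name and
+   extract it to a heap block, released via the archive's free callback (mz_free()
+   under the default allocators); names are placeholders. */
+#if 0
+static void *example_extract_entry(mz_zip_archive *pZip, const char *pName, size_t *pSize)
+{
+    int file_index = mz_zip_reader_locate_file(pZip, pName, NULL, 0);
+    if (file_index < 0)
+        return NULL;
+    return mz_zip_reader_extract_to_heap(pZip, (mz_uint)file_index, pSize, 0);
+}
+#endif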
+
+/* Extracts an archive file using a callback function to output the file's data. */
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_to_callback(mz_zip_archive *pZip, mz_uint file_index, mz_file_write_func pCallback, void *pOpaque, mz_uint flags);
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_file_to_callback(mz_zip_archive *pZip, const char *pFilename, mz_file_write_func pCallback, void *pOpaque, mz_uint flags);
+
+/* Extracts a file iteratively. */
+MINIZ_EXPORT mz_zip_reader_extract_iter_state* mz_zip_reader_extract_iter_new(mz_zip_archive *pZip, mz_uint file_index, mz_uint flags);
+MINIZ_EXPORT mz_zip_reader_extract_iter_state* mz_zip_reader_extract_file_iter_new(mz_zip_archive *pZip, const char *pFilename, mz_uint flags);
+MINIZ_EXPORT size_t mz_zip_reader_extract_iter_read(mz_zip_reader_extract_iter_state* pState, void* pvBuf, size_t buf_size);
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_iter_free(mz_zip_reader_extract_iter_state* pState);
+
+#ifndef MINIZ_NO_STDIO
+/* Extracts an archive file to a disk file and sets its last accessed and modified times. */
+/* This function only extracts files, not archive directory records. */
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_to_file(mz_zip_archive *pZip, mz_uint file_index, const char *pDst_filename, mz_uint flags);
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_file_to_file(mz_zip_archive *pZip, const char *pArchive_filename, const char *pDst_filename, mz_uint flags);
+
+/* Extracts an archive file starting at the current position in the destination FILE stream. */
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_to_cfile(mz_zip_archive *pZip, mz_uint file_index, MZ_FILE *File, mz_uint flags);
+MINIZ_EXPORT mz_bool mz_zip_reader_extract_file_to_cfile(mz_zip_archive *pZip, const char *pArchive_filename, MZ_FILE *pFile, mz_uint flags);
+#endif
+
+#if 0
+/* TODO */
+ typedef void *mz_zip_streaming_extract_state_ptr;
+ mz_zip_streaming_extract_state_ptr mz_zip_streaming_extract_begin(mz_zip_archive *pZip, mz_uint file_index, mz_uint flags);
+ uint64_t mz_zip_streaming_extract_get_size(mz_zip_archive *pZip, mz_zip_streaming_extract_state_ptr pState);
+ uint64_t mz_zip_streaming_extract_get_cur_ofs(mz_zip_archive *pZip, mz_zip_streaming_extract_state_ptr pState);
+ mz_bool mz_zip_streaming_extract_seek(mz_zip_archive *pZip, mz_zip_streaming_extract_state_ptr pState, uint64_t new_ofs);
+ size_t mz_zip_streaming_extract_read(mz_zip_archive *pZip, mz_zip_streaming_extract_state_ptr pState, void *pBuf, size_t buf_size);
+ mz_bool mz_zip_streaming_extract_end(mz_zip_archive *pZip, mz_zip_streaming_extract_state_ptr pState);
+#endif
+
+/* This function compares the archive's local headers, the optional local zip64 extended information block, and the optional descriptor following the compressed data vs. the data in the central directory. */
+/* It also validates that each file can be successfully uncompressed unless the MZ_ZIP_FLAG_VALIDATE_HEADERS_ONLY is specified. */
+MINIZ_EXPORT mz_bool mz_zip_validate_file(mz_zip_archive *pZip, mz_uint file_index, mz_uint flags);
+
+/* Validates an entire archive by calling mz_zip_validate_file() on each file. */
+MINIZ_EXPORT mz_bool mz_zip_validate_archive(mz_zip_archive *pZip, mz_uint flags);
+
+/* Misc utils/helpers, valid for ZIP reading or writing */
+MINIZ_EXPORT mz_bool mz_zip_validate_mem_archive(const void *pMem, size_t size, mz_uint flags, mz_zip_error *pErr);
+MINIZ_EXPORT mz_bool mz_zip_validate_file_archive(const char *pFilename, mz_uint flags, mz_zip_error *pErr);
+
+/* Universal end function - calls either mz_zip_reader_end() or mz_zip_writer_end(). */
+MINIZ_EXPORT mz_bool mz_zip_end(mz_zip_archive *pZip);
+
+/* -------- ZIP writing */
+
+#ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+
+/* Inits a ZIP archive writer. */
+/*Set pZip->m_pWrite (and pZip->m_pIO_opaque) before calling mz_zip_writer_init or mz_zip_writer_init_v2*/
+/*The output is streamable, i.e. file_ofs in mz_file_write_func always increases only by n*/
+MINIZ_EXPORT mz_bool mz_zip_writer_init(mz_zip_archive *pZip, mz_uint64 existing_size);
+MINIZ_EXPORT mz_bool mz_zip_writer_init_v2(mz_zip_archive *pZip, mz_uint64 existing_size, mz_uint flags);
+
+MINIZ_EXPORT mz_bool mz_zip_writer_init_heap(mz_zip_archive *pZip, size_t size_to_reserve_at_beginning, size_t initial_allocation_size);
+MINIZ_EXPORT mz_bool mz_zip_writer_init_heap_v2(mz_zip_archive *pZip, size_t size_to_reserve_at_beginning, size_t initial_allocation_size, mz_uint flags);
+
+#ifndef MINIZ_NO_STDIO
+MINIZ_EXPORT mz_bool mz_zip_writer_init_file(mz_zip_archive *pZip, const char *pFilename, mz_uint64 size_to_reserve_at_beginning);
+MINIZ_EXPORT mz_bool mz_zip_writer_init_file_v2(mz_zip_archive *pZip, const char *pFilename, mz_uint64 size_to_reserve_at_beginning, mz_uint flags);
+MINIZ_EXPORT mz_bool mz_zip_writer_init_cfile(mz_zip_archive *pZip, MZ_FILE *pFile, mz_uint flags);
+#endif
+
+/* Converts a ZIP archive reader object into a writer object, to allow efficient in-place file appends to occur on an existing archive. */
+/* For archives opened using mz_zip_reader_init_file, pFilename must be the archive's filename so it can be reopened for writing. If the file can't be reopened, mz_zip_reader_end() will be called. */
+/* For archives opened using mz_zip_reader_init_mem, the memory block must be growable using the realloc callback (which defaults to realloc unless you've overridden it). */
+/* Finally, for archives opened using mz_zip_reader_init, the mz_zip_archive's user provided m_pWrite function cannot be NULL. */
+/* Note: In-place archive modification is not recommended unless you know what you're doing, because if execution stops or something goes wrong before */
+/* the archive is finalized the file's central directory will be hosed. */
+MINIZ_EXPORT mz_bool mz_zip_writer_init_from_reader(mz_zip_archive *pZip, const char *pFilename);
+MINIZ_EXPORT mz_bool mz_zip_writer_init_from_reader_v2(mz_zip_archive *pZip, const char *pFilename, mz_uint flags);
+
+/* Adds the contents of a memory buffer to an archive. These functions record the current local time into the archive. */
+/* To add a directory entry, call this method with an archive name ending in a forward slash and an empty buffer. */
+/* level_and_flags - compression level (0-10, see MZ_BEST_SPEED, MZ_BEST_COMPRESSION, etc.) logically OR'd with zero or more mz_zip_flags, or just set to MZ_DEFAULT_COMPRESSION. */
+MINIZ_EXPORT mz_bool mz_zip_writer_add_mem(mz_zip_archive *pZip, const char *pArchive_name, const void *pBuf, size_t buf_size, mz_uint level_and_flags);
+
+/* Like mz_zip_writer_add_mem(), except you can specify a file comment field, and optionally supply the function with already compressed data. */
+/* uncomp_size/uncomp_crc32 are only used if the MZ_ZIP_FLAG_COMPRESSED_DATA flag is specified. */
+MINIZ_EXPORT mz_bool mz_zip_writer_add_mem_ex(mz_zip_archive *pZip, const char *pArchive_name, const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags,
+ mz_uint64 uncomp_size, mz_uint32 uncomp_crc32);
+
+MINIZ_EXPORT mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_name, const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags,
+ mz_uint64 uncomp_size, mz_uint32 uncomp_crc32, MZ_TIME_T *last_modified, const char *user_extra_data_local, mz_uint user_extra_data_local_len,
+ const char *user_extra_data_central, mz_uint user_extra_data_central_len);
+
+/* Adds the contents of a file to an archive. This function also records the disk file's modified time into the archive. */
+/* File data is supplied via a read callback function. Use mz_zip_writer_add_(c)file to add a file directly. */
+MINIZ_EXPORT mz_bool mz_zip_writer_add_read_buf_callback(mz_zip_archive *pZip, const char *pArchive_name, mz_file_read_func read_callback, void* callback_opaque, mz_uint64 max_size,
+ const MZ_TIME_T *pFile_time, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags, const char *user_extra_data_local, mz_uint user_extra_data_local_len,
+ const char *user_extra_data_central, mz_uint user_extra_data_central_len);
+
+
+#ifndef MINIZ_NO_STDIO
+/* Adds the contents of a disk file to an archive. This function also records the disk file's modified time into the archive. */
+/* level_and_flags - compression level (0-10, see MZ_BEST_SPEED, MZ_BEST_COMPRESSION, etc.) logically OR'd with zero or more mz_zip_flags, or just set to MZ_DEFAULT_COMPRESSION. */
+MINIZ_EXPORT mz_bool mz_zip_writer_add_file(mz_zip_archive *pZip, const char *pArchive_name, const char *pSrc_filename, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags);
+
+/* Like mz_zip_writer_add_file(), except the file data is read from the specified FILE stream. */
+MINIZ_EXPORT mz_bool mz_zip_writer_add_cfile(mz_zip_archive *pZip, const char *pArchive_name, MZ_FILE *pSrc_file, mz_uint64 max_size,
+ const MZ_TIME_T *pFile_time, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags, const char *user_extra_data_local, mz_uint user_extra_data_local_len,
+ const char *user_extra_data_central, mz_uint user_extra_data_central_len);
+#endif
+
+/* Adds a file to an archive by fully cloning the data from another archive. */
+/* This function fully clones the source file's compressed data (no recompression), along with its full filename, extra data (it may add or modify the zip64 local header extra data field), and the optional descriptor following the compressed data. */
+MINIZ_EXPORT mz_bool mz_zip_writer_add_from_zip_reader(mz_zip_archive *pZip, mz_zip_archive *pSource_zip, mz_uint src_file_index);
+
+/* Finalizes the archive by writing the central directory records followed by the end of central directory record. */
+/* After an archive is finalized, the only valid call on the mz_zip_archive struct is mz_zip_writer_end(). */
+/* An archive must be manually finalized by calling this function for it to be valid. */
+MINIZ_EXPORT mz_bool mz_zip_writer_finalize_archive(mz_zip_archive *pZip);
+
+/* Finalizes a heap archive, returning a pointer to the heap block and its size. */
+/* The heap block will be allocated using the mz_zip_archive's alloc/realloc callbacks. */
+MINIZ_EXPORT mz_bool mz_zip_writer_finalize_heap_archive(mz_zip_archive *pZip, void **ppBuf, size_t *pSize);
+
+/* Ends archive writing, freeing all allocations, and closing the output file if mz_zip_writer_init_file() was used. */
+/* Note for the archive to be valid, it *must* have been finalized before ending (this function will not do it for you). */
+MINIZ_EXPORT mz_bool mz_zip_writer_end(mz_zip_archive *pZip);
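+
+/* Illustrative sketch (not part of the upstream header): creating a new archive with
+   one entry. Assumes stdio support; the filenames and payload are placeholders. */
+#if 0
+static mz_bool example_create_archive(const char *pZip_filename)
+{
+    static const char payload[] = "hello, zip";
+    mz_zip_archive zip;
+    mz_zip_zero_struct(&zip);
+    if (!mz_zip_writer_init_file(&zip, pZip_filename, 0))
+        return MZ_FALSE;
+    if (!mz_zip_writer_add_mem(&zip, "hello.txt", payload, sizeof(payload) - 1, MZ_DEFAULT_LEVEL) ||
+        !mz_zip_writer_finalize_archive(&zip)) /* without finalizing, the archive is invalid */
+    {
+        mz_zip_writer_end(&zip);
+        return MZ_FALSE;
+    }
+    return mz_zip_writer_end(&zip);
+}
+#endif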
+
+/* -------- Misc. high-level helper functions: */
+
+/* mz_zip_add_mem_to_archive_file_in_place() efficiently (but not atomically) appends a memory blob to a ZIP archive. */
+/* Note this is NOT a fully safe operation. If it crashes or dies in some way your archive can be left in a screwed up state (without a central directory). */
+/* level_and_flags - compression level (0-10, see MZ_BEST_SPEED, MZ_BEST_COMPRESSION, etc.) logically OR'd with zero or more mz_zip_flags, or just set to MZ_DEFAULT_COMPRESSION. */
+/* TODO: Perhaps add an option to leave the existing central dir in place in case the add dies? We could then truncate the file (so the old central dir would be at the end) if something goes wrong. */
+MINIZ_EXPORT mz_bool mz_zip_add_mem_to_archive_file_in_place(const char *pZip_filename, const char *pArchive_name, const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags);
+MINIZ_EXPORT mz_bool mz_zip_add_mem_to_archive_file_in_place_v2(const char *pZip_filename, const char *pArchive_name, const void *pBuf, size_t buf_size, const void *pComment, mz_uint16 comment_size, mz_uint level_and_flags, mz_zip_error *pErr);
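+
+/* Illustrative sketch (not part of the upstream header): appending one entry to an
+   existing (or new) archive file in place; the names are placeholders, and per the
+   note above the operation is not crash-safe. */
+#if 0
+static mz_bool example_append_entry(const char *pZip_filename)
+{
+    static const char payload[] = "appended data";
+    return mz_zip_add_mem_to_archive_file_in_place(pZip_filename, "notes/entry.txt",
+                                                   payload, sizeof(payload) - 1,
+                                                   NULL, 0, MZ_DEFAULT_LEVEL);
+}
+#endif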
+
+/* Reads a single file from an archive into a heap block. */
+/* If pComment is not NULL, only the file with the specified comment will be extracted. */
+/* Returns NULL on failure. */
+MINIZ_EXPORT void *mz_zip_extract_archive_file_to_heap(const char *pZip_filename, const char *pArchive_name, size_t *pSize, mz_uint flags);
+MINIZ_EXPORT void *mz_zip_extract_archive_file_to_heap_v2(const char *pZip_filename, const char *pArchive_name, const char *pComment, size_t *pSize, mz_uint flags, mz_zip_error *pErr);
+
+#endif /* #ifndef MINIZ_NO_ARCHIVE_WRITING_APIS */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MINIZ_NO_ARCHIVE_APIS */
diff --git a/source/luametatex/source/libraries/miniz/readme.md b/source/luametatex/source/libraries/miniz/readme.md
new file mode 100644
index 000000000..3f8fd7324
--- /dev/null
+++ b/source/luametatex/source/libraries/miniz/readme.md
@@ -0,0 +1,34 @@
+## Miniz
+
+Miniz is a lossless, high performance data compression library in a single source file that implements the zlib (RFC 1950) and Deflate (RFC 1951) compressed data format specification standards. It supports the most commonly used functions exported by the zlib library, but is a completely independent implementation so zlib's licensing requirements do not apply. Miniz also contains simple to use functions for writing .PNG format image files and reading/writing/appending .ZIP format archives. Miniz's compression speed has been tuned to be comparable to zlib's, and it also has a specialized real-time compressor function designed to compare well against fastlz/minilzo.
+
+## Usage
+
+Please use the files from the [releases page](https://github.com/richgel999/miniz/releases) in your projects. Do not use the git checkout directly! The different source and header files are [amalgamated](https://www.sqlite.org/amalgamation.html) into one `miniz.c`/`miniz.h` pair in a build step (`amalgamate.sh`). Include `miniz.c` and `miniz.h` in your project to use Miniz.
+
+## Features
+
+* MIT licensed
+* A portable, single source and header file library written in plain C. Tested with GCC, clang and Visual Studio.
+* Easily tuned and trimmed down by defines
+* A drop-in replacement for zlib's most used API's (tested in several open source projects that use zlib, such as libpng and libzip).
+* Fills a single threaded performance vs. compression ratio gap between several popular real-time compressors and zlib. For example, at level 1, miniz.c compresses around 5-9% better than minilzo, but is approx. 35% slower. At levels 2-9, miniz.c is designed to compare favorably against zlib's ratio and speed. See the miniz performance comparison page for example timings.
+* Not a block based compressor: miniz.c fully supports stream based processing using a coroutine-style implementation. The zlib-style API functions can be called a single byte at a time if that's all you've got.
+* Easy to use. The low-level compressor (tdefl) and decompressor (tinfl) have simple state structs which can be saved/restored as needed with simple memcpy's. The low-level codec API's don't use the heap in any way.
+* Entire inflater (including optional zlib header parsing and Adler-32 checking) is implemented in a single function as a coroutine, which is separately available in a small (~550 line) source file: miniz_tinfl.c
+* A fairly complete (but totally optional) set of .ZIP archive manipulation and extraction API's. The archive functionality is intended to solve common problems encountered in embedded, mobile, or game development situations. (The archive API's are purposely just powerful enough to write an entire archiver given a bit of additional higher-level logic.)
+
+## Known Problems
+
+* No support for encrypted archives. Not sure how useful this stuff is in practice.
+* Minimal documentation. The assumption is that the user is already familiar with the basic zlib API. I need to write an API wiki - for now I've tried to place key comments before each enum/API, and I've included 6 examples that demonstrate how to use the module's major features.
+
+## Special Thanks
+
+Thanks to Alex Evans for the PNG writer function. Also, thanks to Paul Holden and Thorsten Scheuermann for feedback and testing, Matt Pritchard for all his encouragement, and Sean Barrett's various public domain libraries for inspiration (and encouraging me to write miniz.c in C, which was much more enjoyable and less painful than I thought it would be considering I've been programming in C++ for so long).
+
+Thanks to Bruce Dawson for reporting a problem with the level_and_flags archive API parameter (which is fixed in v1.12) and general feedback, and Janez Zemva for indirectly encouraging me into writing more examples.
+
+## Patents
+
+I was recently asked if miniz avoids patent issues. miniz purposely uses the same core algorithms as the ones used by zlib. The compressor uses vanilla hash chaining as described [here](https://datatracker.ietf.org/doc/html/rfc1951#section-4). Also see the [gzip FAQ](https://web.archive.org/web/20160308045258/http://www.gzip.org/#faq11). In my opinion, if miniz falls prey to a patent attack then zlib/gzip are likely to be at serious risk too.
diff --git a/source/luametatex/source/libraries/miniz/readme.txt b/source/luametatex/source/libraries/miniz/readme.txt
new file mode 100644
index 000000000..8a5e6979e
--- /dev/null
+++ b/source/luametatex/source/libraries/miniz/readme.txt
@@ -0,0 +1,8 @@
+Remark
+
+Conforming to the recommendation, we use the official merged (release) files, not the github files. Also,
+we only use part of that single file: we do all file handling ourselves, because we operate within the
+file name regime of LuaMetaTeX, which is aware of operating system specifics (like wide filenames on
+MSWindows). We don't drop in updates without carefully checking them first for potential clashes.
+
+release url: https://github.com/richgel999/miniz/releases \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/html.zip b/source/luametatex/source/libraries/pplib/html.zip
new file mode 100644
index 000000000..3139244dd
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/html.zip
Binary files differ
diff --git a/source/luametatex/source/libraries/pplib/ppapi.h b/source/luametatex/source/libraries/pplib/ppapi.h
new file mode 100644
index 000000000..e9ced5718
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppapi.h
@@ -0,0 +1,404 @@
+
+#ifndef PP_API_H
+#define PP_API_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "ppconf.h"
+
+#define pplib_version "v2.1"
+#define pplib_author "p.jackowski@gust.org.pl"
+
+/* types */
+
+typedef int64_t ppint;
+typedef size_t ppuint; // machine word
+
+typedef char ppbyte;
+
+typedef double ppnum;
+
+typedef struct ppname ppname;
+typedef struct ppstring ppstring;
+
+struct ppname {
+ ppbyte *data;
+ size_t size;
+ ppname *alterego;
+ int flags;
+};
+
+struct ppstring {
+ ppbyte *data;
+ size_t size;
+ ppstring *alterego;
+ int flags;
+};
+
+typedef struct ppobj ppobj;
+typedef struct ppref ppref;
+
+typedef struct {
+ ppobj *data;
+ size_t size;
+} pparray;
+
+typedef struct {
+ ppobj *data;
+ ppname **keys;
+ size_t size;
+} ppdict;
+
+typedef enum {
+ PPSTREAM_BASE16 = 0,
+ PPSTREAM_BASE85,
+ PPSTREAM_RUNLENGTH,
+ PPSTREAM_FLATE,
+ PPSTREAM_LZW,
+ PPSTREAM_CCITT,
+ PPSTREAM_DCT,
+ PPSTREAM_JBIG2,
+ PPSTREAM_JPX,
+ PPSTREAM_CRYPT
+} ppstreamtp;
+
+typedef struct {
+ ppstreamtp *filters;
+ ppdict **params;
+ size_t count;
+} ppstream_filter;
+
+typedef struct {
+ ppdict *dict;
+ void *input, *I;
+ size_t offset;
+ size_t length;
+ ppstream_filter filter;
+ ppobj *filespec;
+ ppstring *cryptkey;
+ int flags;
+} ppstream;
+
+PPDEF extern const char * ppstream_filter_name[];
+PPAPI int ppstream_filter_type (ppname *filtername, ppstreamtp *filtertype);
+PPAPI void ppstream_filter_info (ppstream *stream, ppstream_filter *info, int decode);
+
+#define PPSTREAM_FILTER (1<<0)
+#define PPSTREAM_IMAGE (1<<1)
+#define PPSTREAM_ENCRYPTED_AES (1<<2)
+#define PPSTREAM_ENCRYPTED_RC4 (1<<3)
+#define PPSTREAM_ENCRYPTED (PPSTREAM_ENCRYPTED_AES|PPSTREAM_ENCRYPTED_RC4)
+#define PPSTREAM_ENCRYPTED_OWN (1<<4)
+#define PPSTREAM_NOT_SUPPORTED (1<<6)
+
+#define ppstream_compressed(stream) ((stream)->flags & (PPSTREAM_FILTER|PPSTREAM_IMAGE))
+#define ppstream_filtered(stream) ((stream)->flags & PPSTREAM_FILTER)
+#define ppstream_image(stream) ((stream)->flags & PPSTREAM_IMAGE)
+#define ppstream_encrypted(stream) ((stream)->flags & PPSTREAM_ENCRYPTED)
+
+typedef enum {
+ PPNONE = 0,
+ PPNULL,
+ PPBOOL,
+ PPINT,
+ PPNUM,
+ PPNAME,
+ PPSTRING,
+ PPARRAY,
+ PPDICT,
+ PPSTREAM,
+ PPREF
+} ppobjtp;
+
+PPDEF extern const char * ppobj_kind[];
+
+struct ppobj {
+ union {
+ ppint integer;
+ ppnum number;
+ ppname *name;
+ ppstring *string;
+ pparray *array;
+ ppdict *dict;
+ ppstream *stream;
+ ppref *ref;
+ void *any;
+ };
+ ppobjtp type;
+};
+
+typedef struct ppxref ppxref;
+
+struct ppref {
+ ppobj object;
+ ppuint number, version;
+ size_t offset;
+ size_t length;
+ ppxref *xref;
+};
+
+typedef struct ppdoc ppdoc;
+
+/* object */
+
+#define ppobj_get_null(o) ((o)->type == PPNULL ? 1 : 0)
+#define ppobj_get_bool(o, v) ((o)->type == PPBOOL ? ((v = ((o)->integer != 0)), 1) : 0)
+#define ppobj_get_int(o, v) ((o)->type == PPINT ? ((v = (o)->integer), 1) : 0)
+#define ppobj_get_uint(o, v) ((o)->type == PPINT && (o)->integer >= 0 ? ((v = (ppuint)((o)->integer)), 1) : 0)
+#define ppobj_get_num(o, v) ((o)->type == PPNUM ? ((v = (o)->number), 1) : (((o)->type == PPINT ? ((v = (ppnum)((o)->integer)), 1) : 0)))
+#define ppobj_get_name(o) ((o)->type == PPNAME ? (o)->name : NULL)
+#define ppobj_get_string(o) ((o)->type == PPSTRING ? (o)->string : NULL)
+#define ppobj_get_array(o) ((o)->type == PPARRAY ? (o)->array : NULL)
+#define ppobj_get_dict(o) ((o)->type == PPDICT ? (o)->dict : NULL)
+#define ppobj_get_stream(o) ((o)->type == PPSTREAM ? (o)->stream : NULL)
+#define ppobj_get_ref(o) ((o)->type == PPREF ? (o)->ref : NULL)
+
+#define ppobj_rget_obj(o) ((o)->type == PPREF ? ppref_obj((o)->ref) : o)
+#define ppobj_rget_null(o) ((o)->type == PPNULL ? 1 : ((o)->type == PPREF ? ppobj_get_null(ppref_obj((o)->ref)) : 0))
+#define ppobj_rget_bool(o, v) ((o)->type == PPBOOL ? ((v = ((o)->integer != 0)), 1) : ((o)->type == PPREF ? ppobj_get_bool(ppref_obj((o)->ref), v) : 0))
+#define ppobj_rget_int(o, v) ((o)->type == PPINT ? ((v = (o)->integer), 1) : ((o)->type == PPREF ? ppobj_get_int(ppref_obj((o)->ref), v) : 0))
+#define ppobj_rget_uint(o, v) ((o)->type == PPINT && (o)->integer >= 0 ? ((v = (ppuint)((o)->integer)), 1) : ((o)->type == PPREF ? ppobj_get_uint(ppref_obj((o)->ref), v) : 0))
+#define ppobj_rget_num(o, v) ((o)->type == PPNUM ? ((v = (o)->number), 1) : (((o)->type == PPINT ? ((v = (ppnum)((o)->integer)), 1) : ((o)->type == PPREF ? ppobj_get_num(ppref_obj((o)->ref), v) : 0))))
+#define ppobj_rget_name(o) ((o)->type == PPNAME ? (o)->name : ((o)->type == PPREF ? ppobj_get_name(ppref_obj((o)->ref)) : NULL))
+#define ppobj_rget_string(o) ((o)->type == PPSTRING ? (o)->string : ((o)->type == PPREF ? ppobj_get_string(ppref_obj((o)->ref)) : NULL))
+#define ppobj_rget_array(o) ((o)->type == PPARRAY ? (o)->array : ((o)->type == PPREF ? ppobj_get_array(ppref_obj((o)->ref)) : NULL))
+#define ppobj_rget_dict(o) ((o)->type == PPDICT ? (o)->dict : ((o)->type == PPREF ? ppobj_get_dict(ppref_obj((o)->ref)) : NULL))
+#define ppobj_rget_stream(o) ((o)->type == PPSTREAM ? (o)->stream : ((o)->type == PPREF ? ppobj_get_stream(ppref_obj((o)->ref)) : NULL))
+#define ppobj_rget_ref(o) ((o)->type == PPREF ? (o)->ref : ((o)->type == PPREF ? ppobj_get_ref(ppref_obj((o)->ref)) : NULL))
+
+#define ppobj_get_bool_value(o) ((o)->type == PPBOOL ? ((o)->integer != 0) : 0)
+#define ppobj_get_int_value(o) ((o)->type == PPINT ? (o)->integer : 0)
+#define ppobj_get_num_value(o) ((o)->type == PPNUM ? (o)->number : ((o)->type == PPINT ? (ppnum)((o)->integer) : 0.0))
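
The rget macros above differ from the plain get variants in exactly one respect: when the object is a
PPREF they first resolve it through ppref_obj() and test the target instead. A small sketch (the helper
name is made up for illustration):

    /* Assuming "ppapi.h": fetch an integer whether it is stored directly
       or behind an indirect reference. */
    static ppint int_value_or(ppobj *obj, ppint fallback)
    {
        ppint value;
        if (ppobj_rget_int(obj, value)) /* PPINT, or a PPREF pointing at a PPINT */
            return value;
        return fallback; /* any other type */
    }
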
+
+/* name */
+
+#define ppname_is(name, s) (memcmp((name)->data, s, sizeof("" s) - 1) == 0)
+#define ppname_eq(name, s) (memcmp((name)->data, s, (name)->size) == 0)
+
+#define ppname_size(name) ((name)->size)
+#define ppname_exec(name) ((name)->flags & PPNAME_EXEC)
+#define ppname_data(name) ((name)->data)
+
+#define PPNAME_ENCODED (1 << 0)
+#define PPNAME_DECODED (1 << 1)
+#define PPNAME_EXEC (1 << 2)
+
+PPAPI ppname * ppname_decoded (ppname *name);
+PPAPI ppname * ppname_encoded (ppname *name);
+
+PPAPI ppbyte * ppname_decoded_data (ppname *name);
+PPAPI ppbyte * ppname_encoded_data (ppname *name);
+
+/* string */
+
+#define ppstring_size(string) ((string)->size)
+#define ppstring_data(string) ((string)->data)
+
+#define PPSTRING_ENCODED (1 << 0)
+#define PPSTRING_DECODED (1 << 1)
+//#define PPSTRING_EXEC (1 << 2) // postscript only
+#define PPSTRING_PLAIN 0
+#define PPSTRING_BASE16 (1 << 3)
+#define PPSTRING_BASE85 (1 << 4)
+#define PPSTRING_UTF16BE (1 << 5)
+#define PPSTRING_UTF16LE (1 << 6)
+
+#define ppstring_type(string) ((string)->flags & (PPSTRING_BASE16|PPSTRING_BASE85))
+#define ppstring_hex(string) ((string)->flags & PPSTRING_BASE16)
+#define ppstring_utf(string) ((string)->flags & (PPSTRING_UTF16BE|PPSTRING_UTF16LE))
+
+PPAPI ppstring * ppstring_decoded (ppstring *string);
+PPAPI ppstring * ppstring_encoded (ppstring *string);
+
+PPAPI ppbyte * ppstring_decoded_data (ppstring *string);
+PPAPI ppbyte * ppstring_encoded_data (ppstring *string);
+
+/* array */
+
+#define pparray_size(array) ((array)->size)
+#define pparray_at(array, index) ((array)->data + (index))
+
+#define pparray_first(array, index, obj) ((index) = 0, (obj) = pparray_at(array, 0))
+#define pparray_next(index, obj) (++(index), ++(obj))
+
+#define pparray_get(array, index) ((index) < (array)->size ? pparray_at(array, index) : NULL)
+
+PPAPI ppobj * pparray_get_obj (pparray *array, size_t index);
+PPAPI int pparray_get_bool (pparray *array, size_t index, int *v);
+PPAPI int pparray_get_int (pparray *array, size_t index, ppint *v);
+PPAPI int pparray_get_uint (pparray *array, size_t index, ppuint *v);
+PPAPI int pparray_get_num (pparray *array, size_t index, ppnum *v);
+PPAPI ppname * pparray_get_name (pparray *array, size_t index);
+PPAPI ppstring * pparray_get_string (pparray *array, size_t index);
+PPAPI pparray * pparray_get_array (pparray *array, size_t index);
+PPAPI ppdict * pparray_get_dict (pparray *array, size_t index);
+//PPAPI ppstream * pparray_get_stream (pparray *array, size_t index);
+PPAPI ppref * pparray_get_ref (pparray *array, size_t index);
+
+PPAPI ppobj * pparray_rget_obj (pparray *array, size_t index);
+PPAPI int pparray_rget_bool (pparray *array, size_t index, int *v);
+PPAPI int pparray_rget_int (pparray *array, size_t index, ppint *v);
+PPAPI int pparray_rget_uint (pparray *array, size_t index, ppuint *v);
+PPAPI int pparray_rget_num (pparray *array, size_t index, ppnum *v);
+PPAPI ppname * pparray_rget_name (pparray *array, size_t index);
+PPAPI ppstring * pparray_rget_string (pparray *array, size_t index);
+PPAPI pparray * pparray_rget_array (pparray *array, size_t index);
+PPAPI ppdict * pparray_rget_dict (pparray *array, size_t index);
+PPAPI ppstream * pparray_rget_stream (pparray *array, size_t index);
+PPAPI ppref * pparray_rget_ref (pparray *array, size_t index);
+
+/* dict */
+
+#define ppdict_size(dict) ((dict)->size)
+#define ppdict_at(dict, index) ((dict)->data + (index))
+#define ppdict_key(dict, index) ((dict)->keys[(index)])
+
+PPAPI ppobj * ppdict_get_obj (ppdict *dict, const char *name);
+PPAPI int ppdict_get_bool (ppdict *dict, const char *name, int *v);
+PPAPI int ppdict_get_int (ppdict *dict, const char *name, ppint *v);
+PPAPI int ppdict_get_uint (ppdict *dict, const char *name, ppuint *v);
+PPAPI int ppdict_get_num (ppdict *dict, const char *name, ppnum *v);
+PPAPI ppname * ppdict_get_name (ppdict *dict, const char *name);
+PPAPI ppstring * ppdict_get_string (ppdict *dict, const char *name);
+PPAPI pparray * ppdict_get_array (ppdict *dict, const char *name);
+PPAPI ppdict * ppdict_get_dict (ppdict *dict, const char *name);
+//PPAPI ppstream * ppdict_get_stream (ppdict *dict, const char *name);
+PPAPI ppref * ppdict_get_ref (ppdict *dict, const char *name);
+
+PPAPI ppobj * ppdict_rget_obj (ppdict *dict, const char *name);
+PPAPI int ppdict_rget_bool (ppdict *dict, const char *name, int *v);
+PPAPI int ppdict_rget_int (ppdict *dict, const char *name, ppint *v);
+PPAPI int ppdict_rget_uint (ppdict *dict, const char *name, ppuint *v);
+PPAPI int ppdict_rget_num (ppdict *dict, const char *name, ppnum *v);
+PPAPI ppname * ppdict_rget_name (ppdict *dict, const char *name);
+PPAPI ppstring * ppdict_rget_string (ppdict *dict, const char *name);
+PPAPI pparray * ppdict_rget_array (ppdict *dict, const char *name);
+PPAPI ppdict * ppdict_rget_dict (ppdict *dict, const char *name);
+PPAPI ppstream * ppdict_rget_stream (ppdict *dict, const char *name);
+PPAPI ppref * ppdict_rget_ref (ppdict *dict, const char *name);
+
+#define ppdict_first(dict, pkey, obj) (pkey = (dict)->keys, obj = (dict)->data)
+#define ppdict_next(pkey, obj) (++(pkey), ++(obj))
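
Both containers iterate by bumping raw pointers; for dicts the keys vector carries a trailing NULL
(written by ppdict_create() later in this patch), so the key pointer itself is the loop guard. A
sketch, assuming <stdio.h> and "ppapi.h"; the helper names are made up:

    static void dump_dict(ppdict *dict)
    {
        ppname **pkey;
        ppobj *obj;
        for (ppdict_first(dict, pkey, obj); *pkey != NULL; ppdict_next(pkey, obj))
            printf("/%s -> %s\n", ppname_data(*pkey), ppobj_kind[obj->type]);
    }

    static void dump_array(pparray *array)
    {
        size_t index;
        ppobj *obj;
        for (pparray_first(array, index, obj); index < pparray_size(array); pparray_next(index, obj))
            printf("[%lu] %s\n", (unsigned long)index, ppobj_kind[obj->type]);
    }
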
+
+/* stream */
+
+#define ppstream_dict(stream) ((stream)->dict)
+
+PPAPI uint8_t * ppstream_first (ppstream *stream, size_t *size, int decode);
+PPAPI uint8_t * ppstream_next (ppstream *stream, size_t *size);
+PPAPI uint8_t * ppstream_all (ppstream *stream, size_t *size, int decode);
+PPAPI void ppstream_done (ppstream *stream);
+
+PPAPI void ppstream_init_buffers (void);
+PPAPI void ppstream_free_buffers (void);
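
Stream data is pulled in chunks: ppstream_first() starts a pass (decode = 1 requests the unfiltered
bytes), ppstream_next() continues it until it returns NULL, and ppstream_done() releases the reader
state. A sketch of draining one stream, assuming <stdio.h> and "ppapi.h":

    static void dump_stream(ppstream *stream)
    {
        uint8_t *data;
        size_t size;
        for (data = ppstream_first(stream, &size, 1); data != NULL;
             data = ppstream_next(stream, &size))
            fwrite(data, 1, size, stdout); /* write each decoded chunk */
        ppstream_done(stream);
    }
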
+
+/* ref */
+
+#define ppref_obj(ref) (&(ref)->object)
+
+/* xref */
+
+PPAPI ppxref * ppdoc_xref (ppdoc *pdf);
+PPAPI ppxref * ppxref_prev (ppxref *xref);
+PPAPI ppdict * ppxref_trailer (ppxref *xref);
+PPAPI ppdict * ppxref_catalog (ppxref *xref);
+PPAPI ppdict * ppxref_info (ppxref *xref);
+PPAPI ppref * ppxref_pages (ppxref *xref);
+PPAPI ppref * ppxref_find (ppxref *xref, ppuint refnumber);
+
+/* doc */
+
+PPAPI ppdoc * ppdoc_load (const char *filename);
+PPAPI ppdoc * ppdoc_filehandle (FILE *file, int closefile);
+#define ppdoc_file(file) ppdoc_filehandle(file, 1)
+PPAPI ppdoc * ppdoc_mem (const void *data, size_t size);
+PPAPI void ppdoc_free (ppdoc *pdf);
+
+#define ppdoc_trailer(pdf) ppxref_trailer(ppdoc_xref(pdf))
+#define ppdoc_catalog(pdf) ppxref_catalog(ppdoc_xref(pdf))
+#define ppdoc_info(pdf) ppxref_info(ppdoc_xref(pdf))
+#define ppdoc_pages(pdf) ppxref_pages(ppdoc_xref(pdf))
+
+PPAPI ppuint ppdoc_page_count (ppdoc *pdf);
+PPAPI ppref * ppdoc_page (ppdoc *pdf, ppuint index);
+PPAPI ppref * ppdoc_first_page (ppdoc *pdf);
+PPAPI ppref * ppdoc_next_page (ppdoc *pdf);
+
+PPAPI ppstream * ppcontents_first (ppdict *dict);
+PPAPI ppstream * ppcontents_next (ppdict *dict, ppstream *stream);
+
+/* crypt */
+
+typedef enum {
+ PPCRYPT_NONE = 0,
+ PPCRYPT_DONE = 1,
+ PPCRYPT_FAIL = -1,
+ PPCRYPT_PASS = -2
+} ppcrypt_status;
+
+PPAPI ppcrypt_status ppdoc_crypt_status (ppdoc *pdf);
+PPAPI ppcrypt_status ppdoc_crypt_pass (ppdoc *pdf, const void *userpass, size_t userpasslength, const void *ownerpass, size_t ownerpasslength);
+
+/* permission flags, effect in Acrobat File -> Properties -> Security tab */
+
+PPAPI ppint ppdoc_permissions (ppdoc *pdf);
+
+#define PPDOC_ALLOW_PRINT (1<<2) // printing
+#define PPDOC_ALLOW_MODIFY (1<<3) // filling form fields, signing, creating template pages
+#define PPDOC_ALLOW_COPY (1<<4) // copying, copying for accessibility
+#define PPDOC_ALLOW_ANNOTS (1<<5) // filling form fields, copying, signing
+#define PPDOC_ALLOW_EXTRACT (1<<9) // contents copying for accessibility
+#define PPDOC_ALLOW_ASSEMBLY (1<<10) // (no effect)
+#define PPDOC_ALLOW_PRINT_HIRES (1<<11) // (no effect)
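
The expected flow: probe ppdoc_crypt_status() after loading, hand over a password if one is required,
then gate features on the permission bits. A sketch ("secret" is a placeholder password):

    static int may_print(ppdoc *pdf)
    {
        switch (ppdoc_crypt_status(pdf))
        {
            case PPCRYPT_NONE: /* not encrypted at all */
            case PPCRYPT_DONE: /* already authenticated */
                break;
            case PPCRYPT_PASS: /* a password is required */
                if (ppdoc_crypt_pass(pdf, "secret", 6, NULL, 0) != PPCRYPT_DONE)
                    return 0;
                break;
            default: /* PPCRYPT_FAIL: malformed encryption info */
                return 0;
        }
        return (ppdoc_permissions(pdf) & PPDOC_ALLOW_PRINT) != 0;
    }
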
+
+/* context */
+
+typedef struct ppcontext ppcontext;
+
+PPAPI ppcontext * ppcontext_new (void);
+PPAPI void ppcontext_done (ppcontext *context);
+PPAPI void ppcontext_free (ppcontext *context);
+
+/* contents parser */
+
+PPAPI ppobj * ppcontents_first_op (ppcontext *context, ppstream *stream, size_t *psize, ppname **pname);
+PPAPI ppobj * ppcontents_next_op (ppcontext *context, ppstream *stream, size_t *psize, ppname **pname);
+PPAPI ppobj * ppcontents_parse (ppcontext *context, ppstream *stream, size_t *psize);
+
+/* boxes and transforms */
+
+typedef struct {
+ ppnum lx, ly, rx, ry;
+} pprect;
+
+PPAPI pprect * pparray_to_rect (pparray *array, pprect *rect);
+PPAPI pprect * ppdict_get_rect (ppdict *dict, const char *name, pprect *rect);
+PPAPI pprect * ppdict_get_box (ppdict *dict, const char *name, pprect *rect);
+
+typedef struct {
+ ppnum xx, xy, yx, yy, x, y;
+} ppmatrix;
+
+PPAPI ppmatrix * pparray_to_matrix (pparray *array, ppmatrix *matrix);
+PPAPI ppmatrix * ppdict_get_matrix (ppdict *dict, const char *name, ppmatrix *matrix);
+
+/* logger */
+
+typedef void (*pplogger_callback) (const char *message, void *alien);
+PPAPI void pplog_callback (pplogger_callback logger, void *alien);
+PPAPI int pplog_prefix (const char *prefix);
+
+/* version */
+
+PPAPI const char * ppdoc_version_string (ppdoc *pdf);
+PPAPI int ppdoc_version_number (ppdoc *pdf, int *minor);
+
+/* doc info */
+
+PPAPI size_t ppdoc_file_size (ppdoc *pdf);
+PPAPI ppuint ppdoc_objects (ppdoc *pdf);
+PPAPI size_t ppdoc_memory (ppdoc *pdf, size_t *waste);
+
+#endif
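
Putting the document-level calls together; a minimal sketch, assuming <stdio.h> ("file.pdf" is a
placeholder):

    int main(void)
    {
        ppdoc *pdf;
        ppref *pageref;
        ppuint n;

        if ((pdf = ppdoc_load("file.pdf")) == NULL)
            return 1;
        printf("pdf %s, " PPUINTF " pages\n",
               ppdoc_version_string(pdf), ppdoc_page_count(pdf));
        for (pageref = ppdoc_first_page(pdf), n = 1; pageref != NULL;
             pageref = ppdoc_next_page(pdf), ++n)
        {
            ppdict *pagedict = ppobj_get_dict(ppref_obj(pageref));
            if (pagedict == NULL || ppdict_get_obj(pagedict, "Contents") == NULL)
                printf("page " PPUINTF " has no contents\n", n);
        }
        ppdoc_free(pdf);
        return 0;
    }
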
diff --git a/source/luametatex/source/libraries/pplib/pparray.c b/source/luametatex/source/libraries/pplib/pparray.c
new file mode 100644
index 000000000..944596bdc
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/pparray.c
@@ -0,0 +1,145 @@
+
+#include "pplib.h"
+
+pparray * pparray_create (const ppobj *stackpos, size_t size, ppheap *heap)
+{
+ pparray *array;
+ array = (pparray *)ppstruct_take(heap, sizeof(pparray));
+  array->data = (ppobj *)ppstruct_take(heap, size * sizeof(ppobj)); // separate chunk, otherwise a warning about alignment requirements
+ array->size = size;
+ memcpy(array->data, stackpos, size * sizeof(ppobj));
+ return array;
+}
+
+ppobj * pparray_get_obj (pparray *array, size_t index)
+{
+ return pparray_get(array, index);
+}
+
+ppobj * pparray_rget_obj (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_obj(obj) : NULL;
+}
+
+int pparray_get_bool (pparray *array, size_t index, int *v)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_bool(obj, *v) : 0;
+}
+
+int pparray_rget_bool (pparray *array, size_t index, int *v)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_bool(obj, *v) : 0;
+}
+
+int pparray_get_int (pparray *array, size_t index, ppint *v)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_int(obj, *v) : 0;
+}
+
+int pparray_rget_int (pparray *array, size_t index, ppint *v)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_int(obj, *v) : 0;
+}
+
+int pparray_get_uint (pparray *array, size_t index, ppuint *v)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_uint(obj, *v) : 0;
+}
+
+int pparray_rget_uint (pparray *array, size_t index, ppuint *v)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_uint(obj, *v) : 0;
+}
+
+int pparray_get_num (pparray *array, size_t index, ppnum *v)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_num(obj, *v) : 0;
+}
+
+int pparray_rget_num (pparray *array, size_t index, ppnum *v)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_num(obj, *v) : 0;
+}
+
+ppname * pparray_get_name (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_name(obj) : NULL;
+}
+
+ppname * pparray_rget_name (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_name(obj) : NULL;
+}
+
+ppstring * pparray_get_string (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_string(obj) : NULL;
+}
+
+ppstring * pparray_rget_string (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_string(obj) : NULL;
+}
+
+pparray * pparray_get_array (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_array(obj) : NULL;
+}
+
+pparray * pparray_rget_array (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_array(obj) : NULL;
+}
+
+ppdict * pparray_get_dict (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_dict(obj) : NULL;
+}
+
+ppdict * pparray_rget_dict (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_dict(obj) : NULL;
+}
+
+/*
+ppstream * pparray_get_stream (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_stream(obj) : NULL;
+}
+*/
+
+ppstream * pparray_rget_stream (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_stream(obj) : NULL;
+}
+
+ppref * pparray_get_ref (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_get_ref(obj) : NULL;
+}
+
+ppref * pparray_rget_ref (pparray *array, size_t index)
+{
+ ppobj *obj;
+ return (obj = pparray_get(array, index)) != NULL ? ppobj_rget_ref(obj) : NULL;
+}
diff --git a/source/luametatex/source/libraries/pplib/pparray.h b/source/luametatex/source/libraries/pplib/pparray.h
new file mode 100644
index 000000000..df0d8e8b2
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/pparray.h
@@ -0,0 +1,7 @@
+
+#ifndef PP_ARRAY_H
+#define PP_ARRAY_H
+
+pparray * pparray_create (const ppobj *stack, size_t size, ppheap *heap);
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/ppconf.h b/source/luametatex/source/libraries/pplib/ppconf.h
new file mode 100644
index 000000000..0211eb51e
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppconf.h
@@ -0,0 +1,76 @@
+
+#ifndef PP_CONF_H
+#define PP_CONF_H
+
+/*
+Aux flags:
+ PPDLL -- indicates a part of a shared library
+ PPEXE -- indicates a host program using shared library functions
+*/
+
+#if defined(_WIN32) || defined(_WIN64)
+# ifdef PPDLL
+# define PPAPI __declspec(dllexport)
+# define PPDEF __declspec(dllexport)
+# else
+# ifdef PPEXE
+# define PPAPI __declspec(dllimport)
+# define PPDEF
+# else
+# define PPAPI
+# define PPDEF
+# endif
+# endif
+#else
+# define PPAPI
+# define PPDEF
+#endif
+
+/* platform vs integers */
+
+#if defined(_WIN32) || defined(WIN32)
+# ifdef _MSC_VER
+# if defined(_M_64) || defined(_WIN64)
+# define MSVC64
+# else
+# define MSVC32
+# endif
+# else
+# if defined(__MINGW64__)
+# define MINGW64
+# else
+# if defined(__MINGW32__)
+# define MINGW32
+# endif
+# endif
+# endif
+#endif
+
+#if defined(_WIN64) || defined(__MINGW32__)
+# define PPINT64F "%I64d"
+# define PPUINT64F "%I64u"
+#else
+# define PPINT64F "%lld"
+# define PPUINT64F "%llu"
+#endif
+
+#if defined(MSVC64)
+# define PPINT(N) N##I64
+# define PPUINT(N) N##UI64
+# define PPINTF PPINT64F
+# define PPUINTF PPUINT64F
+#elif defined(MINGW64)
+# define PPINT(N) N##LL
+# define PPUINT(N) N##ULL
+# define PPINTF PPINT64F
+# define PPUINTF PPUINT64F
+#else // 32bit or sane 64bit (LP64, where long is long indeed)
+# define PPINT(N) N##L
+# define PPUINT(N) N##UL
+# define PPINTF "%ld"
+# define PPUINTF "%lu"
+#endif
+
+#define PPSIZEF PPUINTF
+
+#endif
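
The net effect of the block above: PPINT()/PPUINT() suffix integer literals portably, and
PPINTF/PPUINTF/PPSIZEF pick the matching printf formats on MSVC, MinGW and LP64 systems. A tiny
sketch, assuming <stdio.h> (ppint/ppuint themselves are typedef'd in ppapi.h, which includes this
header):

    static void report(ppint offset, ppuint objects)
    {
        printf("offset " PPINTF ", " PPUINTF " objects\n", offset, objects);
    }
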
diff --git a/source/luametatex/source/libraries/pplib/ppcrypt.c b/source/luametatex/source/libraries/pplib/ppcrypt.c
new file mode 100644
index 000000000..ce63e7cab
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppcrypt.c
@@ -0,0 +1,748 @@
+
+#include "utilmd5.h"
+#include "utilsha.h"
+
+#include "pplib.h"
+
+/* crypt struct */
+
+static ppcrypt * ppcrypt_create (ppheap *heap)
+{
+ ppcrypt *crypt;
+ crypt = (ppcrypt *)ppstruct_take(heap, sizeof(ppcrypt));
+ memset(crypt, 0, sizeof(ppcrypt));
+ return crypt;
+}
+
+int ppcrypt_type (ppcrypt *crypt, ppname *cryptname, ppuint *length, int *cryptflags)
+{
+ ppdict *filterdict;
+ ppname *filtertype;
+ int cryptmd = 0, default256 = 0;
+
+ if (crypt->map == NULL || (filterdict = ppdict_rget_dict(crypt->map, cryptname->data)) == NULL)
+ return 0;
+ if ((filtertype = ppdict_get_name(filterdict, "CFM")) == NULL)
+ return 0;
+ *cryptflags = 0;
+ if (ppname_is(filtertype, "V2"))
+ *cryptflags |= PPCRYPT_INFO_RC4;
+ else if (ppname_is(filtertype, "AESV2"))
+ *cryptflags |= PPCRYPT_INFO_AES;
+ else if (ppname_is(filtertype, "AESV3"))
+ *cryptflags |= PPCRYPT_INFO_AES, default256 = 1;
+ else
+ return 0;
+  /* pdf spec p. 134: /Length is said to be the optional bit length of the key, but that seems to be a mistake, as Acrobat
+     produces a /Length key in bytes, opposite to the /Length key of the main encrypt dict. */
+ if (length != NULL)
+ if (!ppdict_get_uint(filterdict, "Length", length))
+ *length = (*cryptflags & PPCRYPT_INFO_RC4) ? 5 : (default256 ? 32 : 16);
+ /* one of metadata flags is set iff there is an explicit EncryptMetadata key */
+ if (ppdict_get_bool(filterdict, "EncryptMetadata", &cryptmd))
+ *cryptflags |= (cryptmd ? PPCRYPT_INFO_MD : PPCRYPT_INFO_NOMD);
+ return 1;
+}
+
+/* V1..4 algorithms */
+
+/* V1..4 unicode to PDFDocEncoding */
+
+typedef struct {
+ uint32_t unicode;
+ uint32_t code;
+ uint32_t count;
+} map_range_t;
+
+static const map_range_t unicode_to_pdf_doc_encoding_map[] = {
+ { 32, 32, 95 },
+ { 161, 161, 12 },
+ { 174, 174, 82 },
+ { 305, 154, 1 },
+ { 321, 149, 1 },
+ { 322, 155, 1 },
+ { 338, 150, 1 },
+ { 339, 156, 1 },
+ { 352, 151, 1 },
+ { 353, 157, 1 },
+ { 376, 152, 1 },
+ { 381, 153, 1 },
+ { 382, 158, 1 },
+ { 402, 134, 1 },
+ { 710, 26, 1 },
+ { 711, 25, 1 },
+ { 728, 24, 1 },
+ { 729, 27, 1 },
+ { 730, 30, 1 },
+ { 731, 29, 1 },
+ { 732, 31, 1 },
+ { 733, 28, 1 },
+ { 8211, 133, 1 },
+ { 8212, 132, 1 },
+ { 8216, 143, 3 },
+ { 8220, 141, 2 },
+ { 8222, 140, 1 },
+ { 8224, 129, 2 },
+ { 8226, 128, 1 },
+ { 8230, 131, 1 },
+ { 8240, 139, 1 },
+ { 8249, 136, 2 },
+ { 8260, 135, 1 },
+ { 8364, 160, 1 },
+ { 8482, 146, 1 },
+ { 8722, 138, 1 },
+ { 64257, 147, 2 }
+};
+
+#define unicode_to_pdf_doc_encoding_entries (sizeof(unicode_to_pdf_doc_encoding_map) / sizeof(map_range_t))
+
+static int unicode_to_pdf_doc_encoding (uint32_t unicode, uint8_t *pcode)
+{
+ const map_range_t *left, *right, *mid;
+
+ left = &unicode_to_pdf_doc_encoding_map[0];
+ right = &unicode_to_pdf_doc_encoding_map[unicode_to_pdf_doc_encoding_entries - 1];
+ for ( ; left <= right; )
+ {
+ mid = left + ((right - left) / 2);
+ if (unicode > mid->unicode + mid->count - 1)
+ left = mid + 1;
+ else if (unicode < mid->unicode)
+ right = mid - 1;
+ else
+ {
+ *pcode = (uint8_t)(mid->code + (unicode - mid->unicode));
+ return 1;
+ }
+ }
+ return 0;
+}
+
+#define utf8_unicode2(p) (((p[0]&31)<<6)|(p[1]&63))
+#define utf8_unicode3(p) (((p[0]&15)<<12)|((p[1]&63)<<6)|(p[2]&63))
+#define utf8_unicode4(p) (((p[0]&7)<<18)|((p[1]&63)<<12)|((p[2]&63)<<6)|(p[3]&63))
+
+#define utf8_get1(p, e, unicode) ((unicode = p[0]), p + 1)
+#define utf8_get2(p, e, unicode) (p + 1 < e ? ((unicode = utf8_unicode2(p)), p + 2) : NULL)
+#define utf8_get3(p, e, unicode) (p + 2 < e ? ((unicode = utf8_unicode3(p)), p + 3) : NULL)
+#define utf8_get4(p, e, unicode) (p + 3 < e ? ((unicode = utf8_unicode4(p)), p + 4) : NULL)
+
+#define utf8_get(p, e, unicode) \
+ (p[0] < 0x80 ? utf8_get1(p, e, unicode) : \
+ p[0] < 0xC0 ? NULL : \
+ p[0] < 0xE0 ? utf8_get2(p, e, unicode) : \
+ p[0] < 0xF0 ? utf8_get3(p, e, unicode) : utf8_get4(p, e, unicode))
+
+static int ppcrypt_password_encoding (uint8_t *password, size_t *passwordlength)
+{
+ uint8_t *p, newpassword[PPCRYPT_MAX_PASSWORD], *n;
+ const uint8_t *e;
+ uint32_t unicode;
+
+ for (n = &newpassword[0], p = &password[0], e = p + *passwordlength; p < e; ++n)
+ {
+ p = utf8_get(p, e, unicode);
+ if (p == NULL)
+ return 0;
+ if (unicode_to_pdf_doc_encoding(unicode, n) == 0)
+ return 0;
+ }
+ *passwordlength = n - &newpassword[0];
+ memcpy(password, newpassword, *passwordlength);
+ return 1;
+}
+
+/* setup passwords */
+
+static const uint8_t password_padding[] = {
+ 0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01, 0x08,
+ 0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53, 0x69, 0x7A
+};
+
+static void ppcrypt_set_user_password (ppcrypt *crypt, const void *userpass, size_t userpasslength)
+{
+ crypt->userpasslength = userpasslength > PPCRYPT_MAX_PASSWORD ? PPCRYPT_MAX_PASSWORD : userpasslength;
+ memcpy(crypt->userpass, userpass, crypt->userpasslength);
+ if (crypt->algorithm_variant < 5)
+ {
+ if (ppcrypt_password_encoding(crypt->userpass, &crypt->userpasslength) == 0)
+ return;
+ if (crypt->userpasslength > 32)
+ crypt->userpasslength = 32;
+ else if (crypt->userpasslength < 32)
+ memcpy(&crypt->userpass[crypt->userpasslength], password_padding, 32 - crypt->userpasslength);
+ }
+ crypt->flags |= PPCRYPT_USER_PASSWORD;
+}
+
+static void ppcrypt_set_owner_password (ppcrypt *crypt, const void *ownerpass, size_t ownerpasslength)
+{
+ crypt->ownerpasslength = ownerpasslength > PPCRYPT_MAX_PASSWORD ? PPCRYPT_MAX_PASSWORD : ownerpasslength;
+ memcpy(crypt->ownerpass, ownerpass, crypt->ownerpasslength);
+ if (crypt->algorithm_variant < 5)
+ {
+ if (ppcrypt_password_encoding(crypt->ownerpass, &crypt->ownerpasslength) == 0)
+ return;
+ if (crypt->ownerpasslength > 32)
+ crypt->ownerpasslength = 32;
+ else if (crypt->ownerpasslength < 32)
+ memcpy(&crypt->ownerpass[crypt->ownerpasslength], password_padding, 32 - crypt->ownerpasslength);
+ }
+ crypt->flags |= PPCRYPT_OWNER_PASSWORD;
+}
+
+/* V1..4 retrieving user password from owner password and owner key (variant < 5) */
+
+static void ppcrypt_user_password_from_owner_key (ppcrypt *crypt, const void *ownerkey, size_t ownerkeysize)
+{
+ uint8_t temp[16], rc4key[32], rc4key2[32];
+ uint8_t i;
+ ppuint k;
+ md5_state md5;
+
+ md5_digest_init(&md5);
+ md5_digest_add(&md5, crypt->ownerpass, 32);
+ md5_digest_get(&md5, rc4key, MD5_BYTES);
+ if (crypt->algorithm_revision >= 3)
+ {
+ for (i = 0; i < 50; ++i)
+ {
+ md5_digest(rc4key, 16, temp, MD5_BYTES);
+ memcpy(rc4key, temp, 16);
+ }
+ }
+ rc4_decode_data(ownerkey, ownerkeysize, crypt->userpass, rc4key, crypt->filekeylength);
+ if (crypt->algorithm_revision >= 3)
+ {
+ for (i = 1; i <= 19; ++i)
+ {
+ for (k = 0; k < crypt->filekeylength; ++k)
+ rc4key2[k] = rc4key[k] ^ i;
+ rc4_decode_data(crypt->userpass, 32, crypt->userpass, rc4key2, crypt->filekeylength);
+ }
+ }
+ //crypt->userpasslength = 32;
+ for (crypt->userpasslength = 0; crypt->userpasslength < 32; ++crypt->userpasslength)
+ if (memcmp(&crypt->userpass[crypt->userpasslength], password_padding, 32 - crypt->userpasslength) == 0)
+ break;
+ crypt->flags |= PPCRYPT_USER_PASSWORD;
+}
+
+/* V1..4 generating file key; pdf spec p. 125 */
+
+static void ppcrypt_compute_file_key (ppcrypt *crypt, const void *ownerkey, size_t ownerkeysize, const void *id, size_t idsize)
+{
+ uint32_t p;
+ uint8_t permissions[4], temp[16];
+ int i;
+ md5_state md5;
+
+ md5_digest_init(&md5);
+ md5_digest_add(&md5, crypt->userpass, 32);
+ md5_digest_add(&md5, ownerkey, ownerkeysize);
+ p = (uint32_t)crypt->permissions;
+ permissions[0] = get_number_byte1(p);
+ permissions[1] = get_number_byte2(p);
+ permissions[2] = get_number_byte3(p);
+ permissions[3] = get_number_byte4(p);
+ md5_digest_add(&md5, permissions, 4);
+ md5_digest_add(&md5, id, idsize);
+ if (crypt->algorithm_revision >= 4 && (crypt->flags & PPCRYPT_NO_METADATA))
+ md5_digest_add(&md5, "\xFF\xFF\xFF\xFF", 4);
+ md5_digest_get(&md5, crypt->filekey, MD5_BYTES);
+ if (crypt->algorithm_revision >= 3)
+ {
+ for (i = 0; i < 50; ++i)
+ {
+ md5_digest(crypt->filekey, (size_t)crypt->filekeylength, temp, MD5_BYTES);
+ memcpy(crypt->filekey, temp, 16);
+ }
+ }
+}
+
+/* V1..4 generating userkey for comparison with /U; requires a general file key and id; pdf spec page 126-127 */
+
+static void ppcrypt_compute_user_key (ppcrypt *crypt, const void *id, size_t idsize, uint8_t password_hash[32])
+{
+ uint8_t rc4key2[32];
+ uint8_t i;
+ ppuint k;
+
+ if (crypt->algorithm_revision <= 2)
+ {
+ rc4_encode_data(password_padding, 32, password_hash, crypt->filekey, crypt->filekeylength);
+ }
+ else
+ {
+ md5_state md5;
+ md5_digest_init(&md5);
+ md5_digest_add(&md5, password_padding, 32);
+ md5_digest_add(&md5, id, idsize);
+ md5_digest_get(&md5, password_hash, MD5_BYTES);
+ rc4_encode_data(password_hash, 16, password_hash, crypt->filekey, crypt->filekeylength);
+ for (i = 1; i <= 19; ++i)
+ {
+ for (k = 0; k < crypt->filekeylength; ++k)
+ rc4key2[k] = crypt->filekey[k] ^ i;
+ rc4_encode_data(password_hash, 16, password_hash, rc4key2, crypt->filekeylength);
+ }
+ for (i = 16; i < 32; ++i)
+ password_hash[i] = password_hash[i - 16] ^ i; /* arbitrary 16-bytes padding */
+ }
+}
+
+static ppcrypt_status ppcrypt_authenticate_legacy (ppcrypt *crypt, ppstring *userkey, ppstring *ownerkey, ppstring *id)
+{
+ uint8_t password_hash[32];
+
+ if ((crypt->flags & PPCRYPT_USER_PASSWORD) == 0 && (crypt->flags & PPCRYPT_OWNER_PASSWORD) != 0)
+ ppcrypt_user_password_from_owner_key(crypt, ownerkey, ownerkey->size);
+ ppcrypt_compute_file_key(crypt, ownerkey->data, ownerkey->size, id->data, id->size);
+ ppcrypt_compute_user_key(crypt, id->data, id->size, password_hash); /* needs file key */
+ return memcmp(userkey->data, password_hash, (crypt->algorithm_revision >= 3 ? 16 : 32)) == 0 ? PPCRYPT_DONE : PPCRYPT_PASS;
+}
+
+/* V5 */
+
+static const uint8_t nulliv[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* AES-256 initialization vector */
+
+/* V5 R5..6 password hash */
+
+#define PPCRYPT_MAX_MANGLED ((127+64+48)*64) // 127 password, 64 hash, 48 /U key
+
+static void ppcrypt_password_hash_indeed (const uint8_t *password, size_t passwordlength, const uint8_t *userkey, uint8_t hash[64])
+{
+ size_t hashlength, datalength;
+ uint8_t data[PPCRYPT_MAX_MANGLED], *pdata;
+ uint8_t *key, *iv;
+ uint8_t round, i;
+ uint32_t div3;
+
+ hashlength = 32; /* initial hash is sha256 */
+ round = 0;
+ do
+ {
+ /* concat password, hash, and /U value 64 times */
+ pdata = &data[0];
+ memcpy(pdata, password, passwordlength);
+ pdata += passwordlength;
+ memcpy(pdata, hash, hashlength);
+ pdata += hashlength;
+ if (userkey != NULL)
+ {
+ memcpy(pdata, userkey, 48);
+ pdata += 48;
+ }
+ datalength = pdata - &data[0];
+ for (i = 1; i < 64; ++i, pdata += datalength)
+ memcpy(pdata, &data[0], datalength);
+ datalength *= 64;
+
+    /* encrypt the data with aes128, using hash bytes 1..16 as the key and bytes 17..32 as the initialization vector;
+       encryption in place, CBC, no padding, no change to datalength */
+ key = &hash[0]; iv = &hash[16];
+ aes_encode_data(data, datalength, data, key, 16, iv, AES_NULL_PADDING);
+
+ /* get modulo 3 of first 16 bytes number of encrypted data (sum of digits modulo 3) */
+ for (i = 0, div3 = 0; i < 16; ++i)
+ div3 += data[i];
+
+ /* compute new hash using sha256/384/512 */
+ switch (div3 % 3)
+ {
+ case 0:
+ sha256_digest(data, datalength, hash, SHA_BYTES);
+ hashlength = 32;
+ break;
+ case 1:
+ sha384_digest(data, datalength, hash, SHA_BYTES);
+ hashlength = 48;
+ break;
+ case 2:
+ sha512_digest(data, datalength, hash, SHA_BYTES);
+ hashlength = 64;
+ break;
+ }
+
+ /* do 64 times, then keep going until the last byte of data <= round - 32 */
+ } while (++round < 64 || round < data[datalength - 1] + 32);
+
+}
+
+static void ppcrypt_password_hash (ppcrypt *crypt, const uint8_t *password, size_t passwordlength, const uint8_t *salt, const uint8_t *userkey, uint8_t password_hash[32])
+{
+ sha256_state sha;
+ uint8_t hash[64]; /* result password_hash is 32 bytes, but we need 64 for R6 procedure */
+
+ /* take sha256 of password, salt and /U */
+ sha256_digest_init(&sha);
+ sha256_digest_add(&sha, password, passwordlength);
+ sha256_digest_add(&sha, salt, 8);
+ if (userkey != NULL)
+ sha256_digest_add(&sha, userkey, 48);
+ sha256_digest_get(&sha, hash, SHA_BYTES);
+
+ /* V5 R5 - password_hash is the digest, V5 R6 - password_hash is mangled */
+ if (crypt->algorithm_revision >= 6)
+ ppcrypt_password_hash_indeed(password, passwordlength, userkey, hash);
+
+ memcpy(password_hash, hash, 32);
+}
+
+/* V5 permissions */
+
+static ppcrypt_status ppcrypt_authenticate_permissions (ppcrypt *crypt, ppstring *perms)
+{
+ uint8_t permsdata[16];
+
+ aes_decode_data(perms->data, perms->size, permsdata, crypt->filekey, crypt->filekeylength, nulliv, AES_NULL_PADDING);
+
+ if (permsdata[9] != 'a' || permsdata[10] != 'd' || permsdata[11] != 'b')
+ return PPCRYPT_FAIL;
+
+ /* do not check/update permissions flags here; they might be different inside crypt string */
+ if (0)
+ {
+ int64_t p;
+ int i;
+ for (p = 0, i = 0; i < 8; ++i)
+ p = p + (permsdata[i] << (i << 3)); /* low order bytes first */
+ crypt->permissions = (ppint)((int32_t)(p & 0x00000000FFFFFFFFLL)); /* unset bits 33..64, treat as 32-bit signed int */
+ }
+
+ if (permsdata[8] == 'T')
+ crypt->flags &= ~PPCRYPT_NO_METADATA;
+ else if (permsdata[8] == 'F')
+ crypt->flags |= PPCRYPT_NO_METADATA;
+
+ return PPCRYPT_DONE;
+}
+
+/* V5 authentication */
+
+static ppcrypt_status ppcrypt_authenticate_user (ppcrypt *crypt, ppstring *u, ppstring *ue, ppstring *perms)
+{
+ uint8_t password_hash[32], *salt;
+
+ salt = (uint8_t *)&u->data[32]; /* validation salt */
+ ppcrypt_password_hash(crypt, crypt->userpass, crypt->userpasslength, salt, NULL, password_hash);
+ if (memcmp(u->data, password_hash, 32) != 0)
+ return PPCRYPT_PASS;
+
+ salt = (uint8_t *)&u->data[40]; /* key salt */
+ ppcrypt_password_hash(crypt, crypt->userpass, crypt->userpasslength, salt, NULL, password_hash);
+ aes_decode_data(ue->data, 32, crypt->filekey, password_hash, 32, nulliv, AES_NULL_PADDING);
+
+ return ppcrypt_authenticate_permissions(crypt, perms);
+}
+
+static ppcrypt_status ppcrypt_authenticate_owner (ppcrypt *crypt, ppstring *u, ppstring *o, ppstring *oe, ppstring *perms)
+{
+ uint8_t password_hash[32], *salt;
+
+ salt = (uint8_t *)&o->data[32]; /* validation salt */
+ ppcrypt_password_hash(crypt, crypt->ownerpass, crypt->ownerpasslength, salt, (uint8_t *)u->data, password_hash);
+ if (memcmp(o->data, password_hash, 32) != 0)
+ return PPCRYPT_PASS;
+
+ salt = (uint8_t *)&o->data[40]; /* key salt */
+ ppcrypt_password_hash(crypt, crypt->ownerpass, crypt->ownerpasslength, salt, (uint8_t *)u->data, password_hash);
+ aes_decode_data(oe->data, 32, crypt->filekey, password_hash, 32, nulliv, AES_NULL_PADDING);
+
+ return ppcrypt_authenticate_permissions(crypt, perms);
+}
+
+
+/* authentication */
+
+static ppcrypt_status ppcrypt_authenticate (ppcrypt *crypt, ppstring *u, ppstring *ue, ppstring *o, ppstring *oe, ppstring *id, ppstring *perms)
+{
+ /* V1..V4 */
+ if (crypt->algorithm_variant < 5)
+ return ppcrypt_authenticate_legacy(crypt, u, o, id);
+
+ /* V5 */
+ if (crypt->flags & PPCRYPT_USER_PASSWORD)
+ if (ppcrypt_authenticate_user(crypt, u, ue, perms) == PPCRYPT_DONE)
+ return PPCRYPT_DONE;
+ if (crypt->flags & PPCRYPT_OWNER_PASSWORD)
+ return ppcrypt_authenticate_owner(crypt, u, o, oe, perms);
+
+ return PPCRYPT_PASS;
+}
+
+/**/
+
+ppcrypt_status ppdoc_crypt_init (ppdoc *pdf, const void *userpass, size_t userpasslength, const void *ownerpass, size_t ownerpasslength)
+{
+ ppcrypt *crypt;
+ ppdict *trailer, *encrypt;
+ ppobj *obj;
+ ppname *name, **pkey;
+ ppstring *userkey, *ownerkey, *userkey_e = NULL, *ownerkey_e = NULL;
+ size_t hashlength;
+ pparray *idarray;
+ ppstring *id = NULL, *perms = NULL;
+ int cryptflags, encryptmd;
+ size_t strkeylength, stmkeylength;
+
+ trailer = ppxref_trailer(pdf->xref);
+ if ((obj = ppdict_get_obj(trailer, "Encrypt")) == NULL)
+ return PPCRYPT_NONE;
+
+  /* this happens early, before loading the body, so if /Encrypt is an indirect reference, it points to nothing yet */
+ obj = ppobj_preloaded(pdf, obj);
+ if (obj->type != PPDICT)
+ return PPCRYPT_FAIL;
+ encrypt = obj->dict;
+ for (ppdict_first(encrypt, pkey, obj); *pkey != NULL; ppdict_next(pkey, obj))
+ (void)ppobj_preloaded(pdf, obj);
+
+ if ((name = ppdict_get_name(encrypt, "Filter")) != NULL && !ppname_is(name, "Standard"))
+ return PPCRYPT_FAIL;
+
+ if ((crypt = pdf->crypt) == NULL)
+ crypt = pdf->crypt = ppcrypt_create(&pdf->heap);
+
+ /* get /V /R /P */
+ if (!ppdict_get_uint(encrypt, "V", &crypt->algorithm_variant))
+ crypt->algorithm_variant = 0;
+ if (crypt->algorithm_variant < 1 || crypt->algorithm_variant > 5)
+ return PPCRYPT_FAIL;
+ if (!ppdict_get_uint(encrypt, "R", &crypt->algorithm_revision))
+ return PPCRYPT_FAIL;
+ if (!ppdict_get_int(encrypt, "P", &crypt->permissions))
+ return PPCRYPT_FAIL;
+
+ /* get /O /U /ID /OE /UE */
+ if ((userkey = ppdict_get_string(encrypt, "U")) == NULL || (ownerkey = ppdict_get_string(encrypt, "O")) == NULL)
+ return PPCRYPT_FAIL;
+ userkey = ppstring_decoded(userkey);
+ ownerkey = ppstring_decoded(ownerkey);
+
+  /* for some reason acrobat pads /O and /U to 127 bytes with NULL, so we don't check the exact length, only ensure the minimum */
+ hashlength = crypt->algorithm_variant < 5 ? 32 : 48;
+ if (userkey->size < hashlength || ownerkey->size < hashlength)
+ return PPCRYPT_FAIL;
+ if (crypt->algorithm_variant < 5)
+ { // get first string from /ID (must not be ref)
+ if ((idarray = ppdict_get_array(trailer, "ID")) == NULL || (id = pparray_get_string(idarray, 0)) == NULL)
+ return PPCRYPT_FAIL;
+ id = ppstring_decoded(id);
+ }
+ else
+ {
+ if ((userkey_e = ppdict_get_string(encrypt, "UE")) == NULL || (ownerkey_e = ppdict_get_string(encrypt, "OE")) == NULL)
+ return PPCRYPT_FAIL;
+ userkey_e = ppstring_decoded(userkey_e);
+ ownerkey_e = ppstring_decoded(ownerkey_e);
+ if (userkey_e->size < 32 || ownerkey_e->size < 32)
+ return PPCRYPT_FAIL;
+ if ((perms = ppdict_get_string(encrypt, "Perms")) == NULL)
+ return PPCRYPT_FAIL;
+ perms = ppstring_decoded(perms);
+ if (perms->size != 16)
+ return PPCRYPT_FAIL;
+ }
+
+ /* collect flags and keylength */
+ switch (crypt->algorithm_revision)
+ {
+ case 1:
+ crypt->filekeylength = 5;
+ crypt->flags |= PPCRYPT_RC4;
+ break;
+ case 2: case 3:
+ if (ppdict_get_uint(encrypt, "Length", &crypt->filekeylength))
+ crypt->filekeylength >>= 3; /* 40..256 bits, 5..32 bytes*/
+ else
+ crypt->filekeylength = 5; /* 40 bits, 5 bytes */
+ crypt->flags |= PPCRYPT_RC4;
+ break;
+ case 4: case 5: case 6:
+ if ((crypt->map = ppdict_rget_dict(encrypt, "CF")) == NULL)
+ return PPCRYPT_FAIL;
+ for (ppdict_first(crypt->map, pkey, obj); *pkey != NULL; ppdict_next(pkey, obj))
+ (void)ppobj_preloaded(pdf, obj);
+      /* /EncryptMetadata is relevant only for version >= 4 and may also be provided in the crypt filter dictionary; which takes precedence then?
+         we assume that an explicit EncryptMetadata key overrides the main encrypt dict flag or the default (the default is true,
+         meaning that the Metadata stream is encrypted like the others) */
+ if (ppdict_get_bool(encrypt, "EncryptMetadata", &encryptmd) && !encryptmd)
+ crypt->flags |= PPCRYPT_NO_METADATA;
+
+ strkeylength = stmkeylength = 0;
+ /* streams filter */
+ if ((name = ppdict_get_name(encrypt, "StmF")) != NULL && ppcrypt_type(crypt, name, &stmkeylength, &cryptflags))
+ {
+ if (cryptflags & PPCRYPT_INFO_AES)
+ crypt->flags |= PPCRYPT_STREAM_AES;
+ else if (cryptflags & PPCRYPT_INFO_RC4)
+ crypt->flags |= PPCRYPT_STREAM_RC4;
+ if (cryptflags & PPCRYPT_INFO_NOMD)
+ crypt->flags |= PPCRYPT_NO_METADATA;
+ else if (cryptflags & PPCRYPT_INFO_MD)
+ crypt->flags &= ~PPCRYPT_NO_METADATA;
+ } /* else identity */
+ /* strings filter */
+ if ((name = ppdict_get_name(encrypt, "StrF")) != NULL && ppcrypt_type(crypt, name, &strkeylength, &cryptflags))
+ {
+ if (cryptflags & PPCRYPT_INFO_AES)
+ crypt->flags |= PPCRYPT_STRING_AES;
+ else if (cryptflags & PPCRYPT_INFO_RC4)
+ crypt->flags |= PPCRYPT_STRING_RC4;
+ } /* else identity */
+
+      /* the /Length of the encrypt dict is irrelevant here; theoretically every crypt filter may have its own length... That would mean we should
+         actually keep a different file key for streams and strings. But it leads to nonsense, as the /U and /O entries refer to a single
+         keylength, without a distinction for strings/streams. So we have to assume /Length is consistent. To expose the limitation: */
+ if ((crypt->flags & PPCRYPT_STREAM) && (crypt->flags & PPCRYPT_STRING))
+ if (strkeylength != stmkeylength)
+ return PPCRYPT_FAIL;
+ crypt->filekeylength = stmkeylength ? stmkeylength : strkeylength;
+ if ((crypt->flags & PPCRYPT_STREAM) || (crypt->flags & PPCRYPT_STRING))
+ if (crypt->filekeylength == 0)
+ return PPCRYPT_FAIL;
+ break;
+ default:
+ return PPCRYPT_FAIL;
+ }
+
+ /* setup passwords */
+ if (userpass != NULL)
+ ppcrypt_set_user_password(crypt, userpass, userpasslength);
+ if (ownerpass != NULL)
+ ppcrypt_set_owner_password(crypt, ownerpass, ownerpasslength);
+ if ((crypt->flags & (PPCRYPT_USER_PASSWORD|PPCRYPT_OWNER_PASSWORD)) == 0)
+ return PPCRYPT_PASS;
+
+ return ppcrypt_authenticate(crypt, userkey, userkey_e, ownerkey, ownerkey_e, id, perms);
+}
+
+/* decrypting strings */
+
+/*
+Strings are generally rare, but might occur in mass (name trees), so we generate the decryption key only when needed.
+All strings within the same reference are crypted with the same key. Both the RC4 and AES algorithms expand
+the crypt key in some way, and the result of the expansion is the same for the same crypt key. Instead of recreating
+the key for every string, we backup the initial decryption state.
+*/
+
+static void ppcrypt_strkey (ppcrypt *crypt, ppref *ref, int aes)
+{
+ if (crypt->cryptkeylength > 0)
+ { /* crypt key already generated, just reinitialize crypt states */
+ if (aes)
+    { /* aes codecs that work on c-strings do not modify aes_state flags at all, so we don't actually need to revitalize the state;
+         we only rewrite the initialization vector, which is modified during the crypt procedure */
+ }
+ else
+    { /* the rc4 crypt map is modified during the crypt procedure, so here we reinitialize the rc4 bytes map */
+ rc4_map_restore(&crypt->rc4state, &crypt->rc4copy);
+ }
+ return;
+ }
+
+ if (crypt->algorithm_variant < 5)
+ {
+ crypt->filekey[crypt->filekeylength + 0] = get_number_byte1(ref->number);
+ crypt->filekey[crypt->filekeylength + 1] = get_number_byte2(ref->number);
+ crypt->filekey[crypt->filekeylength + 2] = get_number_byte3(ref->number);
+ crypt->filekey[crypt->filekeylength + 3] = get_number_byte1(ref->version);
+ crypt->filekey[crypt->filekeylength + 4] = get_number_byte2(ref->version);
+
+ if (aes)
+ {
+ crypt->filekey[crypt->filekeylength + 5] = 0x73; // s
+ crypt->filekey[crypt->filekeylength + 6] = 0x41; // A
+ crypt->filekey[crypt->filekeylength + 7] = 0x6C; // l
+ crypt->filekey[crypt->filekeylength + 8] = 0x54; // T
+ }
+
+ md5_digest(crypt->filekey, crypt->filekeylength + (aes ? 9 : 5), crypt->cryptkey, MD5_BYTES);
+ crypt->cryptkeylength = crypt->filekeylength + 5 >= 16 ? 16 : crypt->filekeylength + 5;
+ }
+ else
+ {
+ memcpy(crypt->cryptkey, crypt->filekey, 32);
+ crypt->cryptkeylength = 32;
+ }
+
+ if (aes)
+ {
+ aes_decode_initialize(&crypt->aesstate, &crypt->aeskeyblock, crypt->cryptkey, crypt->cryptkeylength, NULL);
+ aes_pdf_mode(&crypt->aesstate);
+ }
+ else
+ {
+ rc4_state_initialize(&crypt->rc4state, &crypt->rc4map, crypt->cryptkey, crypt->cryptkeylength);
+ rc4_map_save(&crypt->rc4state, &crypt->rc4copy);
+ }
+}
+
+int ppstring_decrypt (ppcrypt *crypt, const void *input, size_t size, void *output, size_t *newsize)
+{
+ int aes, rc4;
+ aes = crypt->flags & PPCRYPT_STRING_AES;
+ rc4 = crypt->flags & PPCRYPT_STRING_RC4;
+ if (aes || rc4)
+ {
+ ppcrypt_strkey(crypt, crypt->ref, aes);
+ if (aes)
+ *newsize = aes_decode_state_data(&crypt->aesstate, input, size, output);
+ else // if (rc4)
+ *newsize = rc4_decode_state_data(&crypt->rc4state, input, size, output);
+ return 1;
+ }
+ return 0; // identity crypt
+}
+
+/* decrypting streams */
+
+/*
+Streams are decrypted every time the stream data is accessed. We need to be able to get or make
+the key for decryption as long as the stream is alive. And to get the key we need the reference
+number and version, plus the document crypt info. The first thought was to keep the reference to which
+the stream belongs (stream->ref) and access the crypt info as stream->ref->xref->pdf->crypt.
+It would be ok as long as absolutely nothing happens to ref and crypt. At some point pplib
+may drift into rewriting support, which would imply modifications of the ref/xref/crypt/pdf structures.
+So I feel better generating a crypt key for every stream in an encrypted document, paying the cost
+of md5 for all streams, not only those actually read.
+
+Key generation is the same as for strings, but differs between the distinct encryption methods (rc4 vs aes),
+since streams and strings might theoretically be encrypted with different filters. There is no reason to cache
+the decryption state here.
+*/
+
+ppstring * ppcrypt_stmkey (ppcrypt *crypt, ppref *ref, int aes, ppheap *heap)
+{
+ ppstring *cryptkeystring;
+ //if (crypt->cryptkeylength > 0)
+ // return;
+
+ if (crypt->algorithm_variant < 5)
+ {
+ crypt->filekey[crypt->filekeylength + 0] = get_number_byte1(ref->number);
+ crypt->filekey[crypt->filekeylength + 1] = get_number_byte2(ref->number);
+ crypt->filekey[crypt->filekeylength + 2] = get_number_byte3(ref->number);
+ crypt->filekey[crypt->filekeylength + 3] = get_number_byte1(ref->version);
+ crypt->filekey[crypt->filekeylength + 4] = get_number_byte2(ref->version);
+
+ if (aes)
+ {
+ crypt->filekey[crypt->filekeylength + 5] = 0x73;
+ crypt->filekey[crypt->filekeylength + 6] = 0x41;
+ crypt->filekey[crypt->filekeylength + 7] = 0x6C;
+ crypt->filekey[crypt->filekeylength + 8] = 0x54;
+ }
+
+ md5_digest(crypt->filekey, crypt->filekeylength + (aes ? 9 : 5), crypt->cryptkey, MD5_BYTES);
+ crypt->cryptkeylength = crypt->filekeylength + 5 >= 16 ? 16 : crypt->filekeylength + 5; // how about 256bits AES??
+ }
+ else
+  { // we could actually generate this string once, but... aes itself is way more expensive than what we could save here
+ memcpy(crypt->cryptkey, crypt->filekey, 32); // just for the record
+ crypt->cryptkeylength = 32;
+ }
+ cryptkeystring = ppstring_internal(crypt->cryptkey, crypt->cryptkeylength, heap);
+ return ppstring_decoded(cryptkeystring);
+}
diff --git a/source/luametatex/source/libraries/pplib/ppcrypt.h b/source/luametatex/source/libraries/pplib/ppcrypt.h
new file mode 100644
index 000000000..9fa52d878
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppcrypt.h
@@ -0,0 +1,70 @@
+
+#ifndef PP_CRYPT_H
+#define PP_CRYPT_H
+
+#include "ppfilter.h"
+#include "utilcrypt.h"
+#include "utilcryptdef.h"
+
+#define PPCRYPT_MAX_PASSWORD 127
+#define PPCRYPT_MAX_KEY 32
+
+typedef struct {
+ ppuint algorithm_variant; /* /V entry of encrypt dict */
+ ppuint algorithm_revision; /* /R entry of encrypt dict */
+ ppint permissions; /* /P entry of encrypt dict */
+ ppdict *map; /* /CF filters map of encrypt dict */
+ uint8_t userpass[PPCRYPT_MAX_PASSWORD]; /* user password */
+ size_t userpasslength; /* user password length */
+ uint8_t ownerpass[PPCRYPT_MAX_PASSWORD]; /* owner password */
+ size_t ownerpasslength; /* owner password length */
+ uint8_t filekey[PPCRYPT_MAX_KEY+5+4]; /* file key with an extra space for salt */
+ size_t filekeylength; /* key length; usually 5, 16 or 32 bytes */
+ uint8_t cryptkey[PPCRYPT_MAX_KEY]; /* crypt key for a recent reference */
+ size_t cryptkeylength; /* crypt key length; usually keylength + 5 */
+ //ppstring *cryptkeystring; /* todo: cached cryptkey string for V5, where all refs has the same */
+ ppref *ref; /* recent reference */
+ union { /* cached crypt states for strings encrypted/decrypted with the same key */
+ struct {
+ rc4_state rc4state;
+ rc4_map rc4map;
+ rc4_map rc4copy;
+ };
+ struct {
+ aes_state aesstate;
+ aes_keyblock aeskeyblock;
+ uint8_t ivcopy[16];
+ };
+ };
+ int flags;
+} ppcrypt;
+
+#define PPCRYPT_NO_METADATA (1<<0)
+#define PPCRYPT_USER_PASSWORD (1<<1)
+#define PPCRYPT_OWNER_PASSWORD (1<<2)
+#define PPCRYPT_STREAM_RC4 (1<<3)
+#define PPCRYPT_STRING_RC4 (1<<4)
+#define PPCRYPT_STREAM_AES (1<<5)
+#define PPCRYPT_STRING_AES (1<<6)
+
+#define PPCRYPT_STREAM (PPCRYPT_STREAM_AES|PPCRYPT_STREAM_RC4)
+#define PPCRYPT_STRING (PPCRYPT_STRING_AES|PPCRYPT_STRING_RC4)
+#define PPCRYPT_RC4 (PPCRYPT_STREAM_RC4|PPCRYPT_STRING_RC4)
+#define PPCRYPT_AES (PPCRYPT_STREAM_AES|PPCRYPT_STRING_AES)
+
+#define PPCRYPT_INFO_AES (1<<0)
+#define PPCRYPT_INFO_RC4 (1<<1)
+#define PPCRYPT_INFO_MD (1<<2)
+#define PPCRYPT_INFO_NOMD (1<<3)
+
+ppcrypt_status ppdoc_crypt_init (ppdoc *pdf, const void *userpass, size_t userpasslength, const void *ownerpass, size_t ownerpasslength);
+int ppstring_decrypt (ppcrypt *crypt, const void *input, size_t size, void *output, size_t *newsize);
+
+#define ppcrypt_start_ref(crypt, r) ((crypt)->ref = r, (crypt)->cryptkeylength = 0)
+#define ppcrypt_end_ref(crypt) ((crypt)->ref = NULL, (crypt)->cryptkeylength = 0)
+#define ppcrypt_ref(pdf, crypt) ((crypt = (pdf)->crypt) != NULL && crypt->ref != NULL)
+
+int ppcrypt_type (ppcrypt *crypt, ppname *cryptname, ppuint *length, int *cryptflags);
+ppstring * ppcrypt_stmkey (ppcrypt *crypt, ppref *ref, int aes, ppheap *heap);
+
+#endif
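
How the pieces above are meant to interlock, as a sketch (the helper is made up; assuming the pplib
internal headers): the parser brackets each indirect object with ppcrypt_start_ref()/ppcrypt_end_ref(),
and strings inside it pass through ppstring_decrypt(), which reports identity crypt by returning 0.

    static size_t decrypt_string(ppdoc *pdf, const void *input, size_t size, void *output)
    {
        ppcrypt *crypt;
        size_t newsize;
        if (ppcrypt_ref(pdf, crypt) && ppstring_decrypt(crypt, input, size, output, &newsize))
            return newsize;
        memcpy(output, input, size); /* identity crypt, or no reference in scope */
        return size;
    }
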
diff --git a/source/luametatex/source/libraries/pplib/ppdict.c b/source/luametatex/source/libraries/pplib/ppdict.c
new file mode 100644
index 000000000..95ea96b9f
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppdict.c
@@ -0,0 +1,166 @@
+
+#include "pplib.h"
+
+ppdict * ppdict_create (const ppobj *stackpos, size_t size, ppheap *heap)
+{
+ ppdict *dict;
+ ppobj *data;
+ ppname **pkey;
+ size_t i;
+
+ size >>= 1; // num of key-value pairs
+ dict = (ppdict *)ppstruct_take(heap, sizeof(ppdict));
+ dict->data = data = (ppobj *)ppstruct_take(heap, size * sizeof(ppobj));
+ dict->keys = pkey = (ppname **)ppstruct_take(heap, (size + 1) * sizeof(ppname **));
+ dict->size = 0;
+
+ for (i = 0; i < size; ++i, stackpos += 2)
+ {
+    if (stackpos->type != PPNAME) // we need this check at least for the trailer hack
+ continue;
+ *pkey = stackpos->name;
+ *data = *(stackpos + 1);
+ ++pkey, ++data, ++dict->size;
+ }
+  *pkey = NULL; // sentinel for convenient iteration
+ return dict;
+}
+
+ppobj * ppdict_get_obj (ppdict *dict, const char *name)
+{
+ ppname **pkey;
+ ppobj *obj;
+
+ for (ppdict_first(dict, pkey, obj); *pkey != NULL; ppdict_next(pkey, obj))
+ if (strcmp((*pkey)->data, name) == 0) // not ppname_eq() or ppname_is()!!
+ return obj;
+ return NULL;
+}
+
+ppobj * ppdict_rget_obj (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_obj(obj) : NULL;
+}
+
+int ppdict_get_bool (ppdict *dict, const char *name, int *v)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_bool(obj, *v) : 0;
+}
+
+int ppdict_rget_bool (ppdict *dict, const char *name, int *v)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_bool(obj, *v) : 0;
+}
+
+int ppdict_get_int (ppdict *dict, const char *name, ppint *v)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_int(obj, *v) : 0;
+}
+
+int ppdict_rget_int (ppdict *dict, const char *name, ppint *v)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_int(obj, *v) : 0;
+}
+
+int ppdict_get_uint (ppdict *dict, const char *name, ppuint *v)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_uint(obj, *v) : 0;
+}
+
+int ppdict_rget_uint (ppdict *dict, const char *name, ppuint *v)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_uint(obj, *v) : 0;
+}
+
+int ppdict_get_num (ppdict *dict, const char *name, ppnum *v)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_num(obj, *v) : 0;
+}
+
+int ppdict_rget_num (ppdict *dict, const char *name, ppnum *v)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_num(obj, *v) : 0;
+}
+
+ppname * ppdict_get_name (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_name(obj) : NULL;
+}
+
+ppname * ppdict_rget_name (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_name(obj) : NULL;
+}
+
+ppstring * ppdict_get_string (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_string(obj) : NULL;
+}
+
+ppstring * ppdict_rget_string (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_string(obj) : NULL;
+}
+
+pparray * ppdict_get_array (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_array(obj) : NULL;
+}
+
+pparray * ppdict_rget_array (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_array(obj) : NULL;
+}
+
+ppdict * ppdict_get_dict (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_dict(obj) : NULL;
+}
+
+ppdict * ppdict_rget_dict (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_dict(obj) : NULL;
+}
+
+/*
+ppstream * ppdict_get_stream (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_stream(obj) : NULL;
+}
+*/
+
+ppstream * ppdict_rget_stream (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_stream(obj) : NULL;
+}
+
+ppref * ppdict_get_ref (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_get_ref(obj) : NULL;
+}
+
+ppref * ppdict_rget_ref (ppdict *dict, const char *name)
+{
+ ppobj *obj;
+ return (obj = ppdict_get_obj(dict, name)) != NULL ? ppobj_rget_ref(obj) : NULL;
+}
diff --git a/source/luametatex/source/libraries/pplib/ppdict.h b/source/luametatex/source/libraries/pplib/ppdict.h
new file mode 100644
index 000000000..b13ff8eb2
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppdict.h
@@ -0,0 +1,7 @@
+
+#ifndef PP_DICT_H
+#define PP_DICT_H
+
+ppdict * ppdict_create (const ppobj *stack, size_t size, ppheap *heap);
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/ppfilter.h b/source/luametatex/source/libraries/pplib/ppfilter.h
new file mode 100644
index 000000000..583aa8cf4
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppfilter.h
@@ -0,0 +1,10 @@
+
+#ifndef PP_FILTER_H
+#define PP_FILTER_H
+
+#include "utilbasexx.h"
+#include "utilflate.h"
+#include "utillzw.h"
+#include "utilfpred.h"
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/ppheap.c b/source/luametatex/source/libraries/pplib/ppheap.c
new file mode 100644
index 000000000..f2fbc2b7e
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppheap.c
@@ -0,0 +1,40 @@
+
+#include "pplib.h"
+
+#define PPBYTES_HEAP_BLOCK 0xFFF
+#define PPBYTES_HEAP_LARGE (PPBYTES_HEAP_BLOCK >> 2)
+#define PPSTRUCT_HEAP_BLOCK 0xFFF
+#define PPSTRUCT_HEAP_LARGE (PPSTRUCT_HEAP_BLOCK >> 2)
+
+void ppheap_init (ppheap *heap)
+{
+ ppstruct_heap_init(heap, PPSTRUCT_HEAP_BLOCK, PPSTRUCT_HEAP_LARGE, 0);
+ ppbytes_heap_init(heap, PPBYTES_HEAP_BLOCK, PPBYTES_HEAP_LARGE, 0);
+}
+
+void ppheap_free (ppheap *heap)
+{
+ ppstruct_heap_free(heap);
+ ppbytes_heap_free(heap);
+}
+
+void ppheap_renew (ppheap *heap)
+{
+ ppstruct_heap_clear(heap);
+ ppbytes_heap_clear(heap);
+ ppbytes_buffer_init(heap);
+}
+
+ppbyte * ppbytes_flush (ppheap *heap, iof *O, size_t *psize)
+{
+ ppbyte *data;
+ size_t size;
+
+ //ASSERT(&heap->bytesheap == O->link);
+ iof_put(O, '\0');
+ data = (ppbyte *)O->buf;
+ size = (size_t)iof_size(O);
+ ppbytes_heap_done(heap, data, size);
+ *psize = size - 1;
+ return data;
+} \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/ppheap.h b/source/luametatex/source/libraries/pplib/ppheap.h
new file mode 100644
index 000000000..85a59ee0a
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppheap.h
@@ -0,0 +1,46 @@
+
+#ifndef PP_HEAP_H
+#define PP_HEAP_H
+
+#include "utilmem.h"
+
+#define pp_malloc util_malloc
+//#define pp_callic util_calloc
+//#define pp_realloc util_realloc
+#define pp_free util_free
+
+#include "utilmemheapiof.h"
+//#include "utilmeminfo.h"
+
+#define ppbytes_heap heap16
+#define ppbytes_heap_init(heap, space, large, flags) (heap16_init(&(heap)->bytesheap, space, large, flags), heap16_head(&(heap)->bytesheap))
+//#define ppbytes_heap_some(heap, size, pspace) _heap16_some(&(heap)->bytesheap, size, pspace)
+#define ppbytes_heap_done(heap, data, written) heap16_done(&(heap)->bytesheap, data, written)
+#define ppbytes_heap_clear(heap) heap16_clear(&(heap)->bytesheap)
+#define ppbytes_heap_free(heap) heap16_free(&(heap)->bytesheap)
+#define ppbytes_heap_info(heap, info, append) heap16_stats(&(heap)->bytesheap, info, append)
+
+#define ppbytes_take(heap, size) _heap16_take(&(heap)->bytesheap, size)
+#define ppbytes_buffer_init(heap) heap16_buffer_init(&(heap)->bytesheap, &(heap)->bytesbuffer)
+#define ppbytes_buffer(heap, atleast) _heap16_buffer_some(&(heap)->bytesheap, &(heap)->bytesbuffer, atleast)
+
+#define ppstruct_heap heap64
+#define ppstruct_heap_init(heap, space, large, flags) (heap64_init(&(heap)->structheap, space, large, flags), heap64_head(&(heap)->structheap))
+#define ppstruct_heap_clear(heap) heap64_clear(&(heap)->structheap)
+#define ppstruct_heap_free(heap) heap64_free(&(heap)->structheap)
+#define ppstruct_heap_info(heap, info, append) heap64_stats(&(heap)->structheap, info, append)
+#define ppstruct_take(heap, size) _heap64_take(&(heap)->structheap, size)
+
+typedef struct {
+ ppbytes_heap bytesheap;
+ ppstruct_heap structheap;
+ iof bytesbuffer;
+} ppheap;
+
+ppbyte * ppbytes_flush (ppheap *heap, iof *O, size_t *psize);
+
+void ppheap_init (ppheap *heap);
+void ppheap_free (ppheap *heap);
+void ppheap_renew (ppheap *heap);
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/pplib.h b/source/luametatex/source/libraries/pplib/pplib.h
new file mode 100644
index 000000000..e753cfa05
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/pplib.h
@@ -0,0 +1,22 @@
+
+#ifndef PP_LIB_H
+#define PP_LIB_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "utiliof.h"
+#include "utillog.h"
+
+#include "ppapi.h"
+#include "ppheap.h"
+#include "ppdict.h"
+#include "ppstream.h"
+#include "pparray.h"
+#include "ppcrypt.h"
+#include "ppxref.h"
+#include "ppload.h"
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/ppload.c b/source/luametatex/source/libraries/pplib/ppload.c
new file mode 100644
index 000000000..0e72039d8
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppload.c
@@ -0,0 +1,2769 @@
+
+#include "pplib.h"
+
+const char * ppobj_kind[] = { "none", "null", "bool", "integer", "number", "name", "string", "array", "dict", "stream", "ref" };
+
+#define ignored_char(c) (c == 0x20 || c == 0x0A || c == 0x0D || c == 0x09 || c == 0x00)
+#define newline_char(c) (c == 0x0A || c == 0x0D)
+#define IGNORED_CHAR_CASE 0x20: case 0x0A: case 0x0D: case 0x09: case 0x00
+#define NEWLINE_CHAR_CASE 0x0A: case 0x0D
+#define DIGIT_CHAR_CASE '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9'
+#define OCTAL_CHAR_CASE '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7'
+
+#define MAX_INT_DIGITS 32
+
+#define PP_LENGTH_UNKNOWN ((size_t)-1)
+
+static const char * ppref_str (ppuint refnumber, ppuint refversion)
+{
+ static char buffer[MAX_INT_DIGITS + 1 + MAX_INT_DIGITS + 1 + 1 + 1];
+ sprintf(buffer, "%lu %lu R", (unsigned long)(refnumber), (unsigned long)(refversion));
+ return buffer;
+}
+
+/* name */
+
+/*
+pdf spec page 57:
+"The name may include any regular characters, but not delimiter or white-space characters (see Section 3.1, “Lexical Conventions”)."
+"The token / (a slash followed by no regular characters) is a valid name"
+"Beginning with PDF 1.2, any character except null (character code 0) may be included in a name by writing its 2-digit hexadecimal code,
+preceded by the number sign character (#); see implementation notes 3 and 4 in Appendix H. This syntax is required to represent any of the
+delimiter or white-space characters or the number sign character itself; it is recommended but not required for characters whose codes
+are outside the range 33 (!) to 126 (~)."
+
+This suggests we should accept bytes 128..255 as a part of the name.
+*/
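+
+/*
+A worked example (illustrative, not part of the parser): the encoded name
+
+  /Lime#20Green
+
+decodes to "Lime Green", since #20 is the 2-digit hex code of a space; ppscan_name() below
+keeps both forms and links them via ->alterego.
+*/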
+
+// pdf name delimiters: 0..32, ()<>[]{}/%
+// # treated specially
+// .+- are a valid part of a name; keep in mind names such as -| | |- .notdef ABCDEF+Font etc.
+static const int8_t ppname_byte_lookup[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, '#', 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+/*
+20190827: The end of the name is any byte with 0 lookup value. When reading a ref object or objstm stream containing
+a single name, we may get input byte IOFEOF (-1), which must not be treated as 255. So a check for (c >= 0) is needed,
+otherwise we would keep writing byte 255 to the output buffer until we run out of memory.
+*/
+
+#define ppnamebyte(c) (c >= 0 && ppname_byte_lookup[(uint8_t)(c)])
+
+static const int8_t pphex_byte_lookup[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
+ -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+/* no need for (c >= 0) check here */
+
+#define pphex(c) pphex_byte_lookup[(uint8_t)(c)]
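+
+/* Examples (illustrative): pphex('0') == 0 and pphex('f') == pphex('F') == 15, while any other
+   byte, including IOFEOF cast to uint8_t, maps to -1; hence no (c >= 0) guard is needed above. */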
+
+#define PPNAME_INIT (7 + 1)
+
+static ppname * ppscan_name (iof *I, ppheap *heap)
+{
+ ppname *encoded, *decoded;
+ iof *O;
+ int decode, c;
+ uint8_t *p, *e;
+ int8_t h1, h2;
+
+ O = ppbytes_buffer(heap, PPNAME_INIT);
+ for (decode = 0, c = iof_char(I); ppnamebyte(c); c = iof_next(I))
+ {
+ if (c == '#') decode = 1;
+ iof_put(O, c);
+ }
+ encoded = (ppname *)ppstruct_take(heap, sizeof(ppname));
+ encoded->data = ppbytes_flush(heap, O, &encoded->size);
+ if (decode)
+ {
+ O = ppbytes_buffer(heap, encoded->size); // decoded always a bit smaller
+ for (p = (uint8_t *)encoded->data, e = p + encoded->size; p < e; ++p)
+ {
+ if (*p == '#' && p + 2 < e && (h1 = pphex(p[1])) >= 0 && (h2 = pphex(p[2])) >= 0)
+ {
+ iof_set(O, ((h1 << 4)|h2));
+ p += 2;
+ }
+ else
+ iof_set(O, *p);
+ }
+ decoded = (ppname *)ppstruct_take(heap, sizeof(ppname));
+ decoded->data = ppbytes_flush(heap, O, &decoded->size);
+ encoded->flags = PPNAME_ENCODED;
+ decoded->flags = PPNAME_DECODED;
+ encoded->alterego = decoded, decoded->alterego = encoded;
+ }
+ else
+ {
+ encoded->flags = 0;
+ encoded->alterego = encoded;
+ }
+ return encoded;
+}
+
+static ppname * ppscan_exec (iof *I, ppheap *heap, uint8_t firstbyte)
+{
+ ppname *encoded, *decoded;
+ iof *O;
+ int decode, c;
+ uint8_t *p, *e;
+ int8_t h1, h2;
+
+ O = ppbytes_buffer(heap, PPNAME_INIT);
+ iof_put(O, firstbyte);
+ for (decode = 0, c = iof_char(I); ppnamebyte(c); c = iof_next(I))
+ {
+ if (c == '#') decode = 1;
+ iof_put(O, c);
+ }
+ encoded = (ppname *)ppstruct_take(heap, sizeof(ppname));
+ encoded->data = ppbytes_flush(heap, O, &encoded->size);
+ if (decode)
+ {
+ O = ppbytes_buffer(heap, encoded->size);
+ for (p = (uint8_t *)encoded->data, e = p + encoded->size; p < e; ++p)
+ {
+ if (*p == '#' && p + 2 < e && (h1 = pphex(p[1])) >= 0 && (h2 = pphex(p[2])) >= 0)
+ {
+ iof_set(O, ((h1 << 4)|h2));
+ p += 2;
+ }
+ else
+ iof_set(O, *p);
+ }
+ decoded = (ppname *)ppstruct_take(heap, sizeof(ppname));
+ decoded->data = ppbytes_flush(heap, O, &decoded->size);
+ encoded->flags = PPNAME_EXEC|PPNAME_ENCODED;
+ decoded->flags = PPNAME_EXEC|PPNAME_DECODED;
+ encoded->alterego = decoded, decoded->alterego = encoded;
+ }
+ else
+ {
+ encoded->flags = PPNAME_EXEC;
+ encoded->alterego = encoded;
+ }
+ return encoded;
+}
+
+static ppname * ppname_internal (const void *data, size_t size, int flags, ppheap *heap)
+{ // so far needed only for 'EI' operator
+ ppname *encoded;
+ encoded = (ppname *)ppstruct_take(heap, sizeof(ppname));
+ encoded->data = (ppbyte *)ppbytes_take(heap, size + 1);
+ memcpy(encoded->data, data, size);
+ encoded->data[size] = '\0';
+ encoded->size = size;
+ encoded->alterego = encoded;
+ encoded->flags = flags;
+ return encoded;
+}
+
+#define ppexec_internal(data, size, heap) ppname_internal(data, size, PPNAME_EXEC, heap)
+
+ppname * ppname_decoded (ppname *name)
+{
+ return (name->flags & PPNAME_ENCODED) ? name->alterego : name;
+}
+
+ppname * ppname_encoded (ppname *name)
+{
+ return (name->flags & PPNAME_DECODED) ? name->alterego : name;
+}
+
+ppbyte * ppname_decoded_data (ppname *name)
+{
+ return (name->flags & PPNAME_ENCODED) ? name->alterego->data : name->data;
+}
+
+ppbyte * ppname_encoded_data (ppname *name)
+{
+ return (name->flags & PPNAME_DECODED) ? name->alterego->data : name->data;
+}
+
+/* string */
+
+static const int8_t ppstring_byte_escape[] = { /* -1 escaped with octal, >0 escaped with \\, 0 left intact */
+ -1,-1,-1,-1,-1,-1,-1,-1,'b','t','n',-1,'f','r',-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ 0, 0, 0, 0, 0, 0, 0, 0,'(',')', 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'\\', 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
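+
+/*
+How to read the table above (illustrative): ppstring_byte_escape['\n'] == 'n', so a line feed is
+written back as "\n"; ppstring_byte_escape['('] == '(', so it is escaped as "\("; bytes mapping
+to -1, e.g. 0xA9, are re-encoded as 3-digit octal: "\251".
+*/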
+
+//// pp string
+
+#define PPSTRING_INIT (7 + 1)
+
+#define ppstring_check_bom(decoded) ((void)\
+ (decoded->size >= 2 ? (ppstring_utf16be_bom(decoded->data) ? (decoded->flags |= PPSTRING_UTF16BE) : \
+ (ppstring_utf16le_bom(decoded->data) ? (decoded->flags |= PPSTRING_UTF16LE) : 0)) : 0))
+
+#define ppstring_check_bom2(decoded, encoded) ((void)\
+ (decoded->size >= 2 ? (ppstring_utf16be_bom(decoded->data) ? ((decoded->flags |= PPSTRING_UTF16BE), (encoded->flags |= PPSTRING_UTF16BE)) : \
+ (ppstring_utf16le_bom(decoded->data) ? ((decoded->flags |= PPSTRING_UTF16LE), (encoded->flags |= PPSTRING_UTF16LE)) : 0)) : 0))
+
+#define ppstring_utf16be_bom(data) (data[0] == '\xFE' && data[1] == '\xFF')
+#define ppstring_utf16le_bom(data) (data[0] == '\xFF' && data[1] == '\xFE')
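+
+/* Example (illustrative): a decoded string starting with bytes 0xFE 0xFF is flagged
+   PPSTRING_UTF16BE, 0xFF 0xFE gives PPSTRING_UTF16LE; shorter strings are left as they are. */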
+
+#define ppstringesc(c) ppstring_byte_escape[(uint8_t)(c)]
+
+static ppstring * ppscan_string (iof *I, ppheap *heap)
+{
+ ppstring *encoded, *decoded;
+ iof *O;
+ int c, decode, balance;
+ uint8_t *p, *e;
+
+ O = ppbytes_buffer(heap, PPSTRING_INIT);
+ for (decode = 0, balance = 0, c = iof_char(I); c >= 0; )
+ {
+ switch (c)
+ {
+ case '\\':
+ decode = 1;
+ iof_put(O, '\\');
+ if ((c = iof_next(I)) >= 0)
+ {
+ iof_put(O, c);
+ c = iof_next(I);
+ }
+ break;
+ case '(': // may be unescaped if balanced
+ ++balance;
+ iof_put(O, '(');
+ c = iof_next(I);
+ break;
+ case ')':
+ if (balance == 0)
+ {
+ c = IOFEOF;
+ ++I->pos;
+ break;
+ }
+ --balance;
+ iof_put(O, ')');
+ c = iof_next(I);
+ break;
+ default:
+ iof_put(O, c);
+ c = iof_next(I);
+ }
+ }
+ encoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ encoded->data = ppbytes_flush(heap, O, &encoded->size);
+ if (decode)
+ {
+ O = ppbytes_buffer(heap, encoded->size); // decoded can only be smaller
+ for (p = (uint8_t *)encoded->data, e = p + encoded->size; p < e; ++p)
+ {
+ if (*p == '\\')
+ {
+ if (++p >= e)
+ break;
+ switch (*p)
+ {
+ case OCTAL_CHAR_CASE:
+ c = *p - '0';
+ if (++p < e && *p >= '0' && *p <= '7')
+ {
+ c = (c << 3) + *p - '0';
+ if (++p < e && *p >= '0' && *p <= '7')
+ c = (c << 3) + *p - '0';
+ }
+ iof_set(O, c);
+ break;
+ case 'n':
+ iof_set(O, '\n');
+ break;
+ case 'r':
+ iof_set(O, '\r');
+ break;
+ case 't':
+ iof_set(O, '\t');
+ break;
+ case 'b':
+ iof_set(O, '\b');
+ break;
+ case 'f':
+ iof_set(O, '\f');
+ break;
+ case NEWLINE_CHAR_CASE: // not a part of the string, ignore (pdf spec page 55)
+ break;
+ case '(': case ')': case '\\':
+ default: // for anything else backslash is ignored (pdf spec page 54)
+ iof_set(O, *p);
+ break;
+ }
+ }
+ else
+ iof_set(O, *p);
+ }
+ decoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ decoded->data = ppbytes_flush(heap, O, &decoded->size);
+ encoded->flags = PPSTRING_ENCODED;
+ decoded->flags = PPSTRING_DECODED;
+ encoded->alterego = decoded, decoded->alterego = encoded;
+ ppstring_check_bom2(decoded, encoded);
+ }
+ else
+ {
+ encoded->flags = 0;
+ encoded->alterego = encoded;
+ ppstring_check_bom(encoded);
+ }
+ return encoded;
+}
+
+static ppstring * ppscan_base16 (iof *I, ppheap *heap)
+{
+ ppstring *encoded, *decoded;
+ iof *O;
+ int c;
+ uint8_t *p, *e;
+ int8_t h1, h2;
+
+ O = ppbytes_buffer(heap, PPSTRING_INIT);
+ for (c = iof_char(I); (pphex(c) >= 0 || ignored_char(c)); c = iof_next(I))
+ iof_put(O, c);
+ if (c == '>')
+ ++I->pos;
+ encoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ encoded->data = ppbytes_flush(heap, O, &encoded->size);
+
+ O = ppbytes_buffer(heap, ((encoded->size + 1) >> 1) + 1); // decoded can only be smaller
+ for (p = (uint8_t *)encoded->data, e = p + encoded->size; p < e; ++p)
+ {
+ if ((h1 = pphex(*p)) < 0) // ignored
+ continue;
+ for (h2 = 0, ++p; p < e && (h2 = pphex(*p)) < 0; ++p);
+ iof_set(O, (h1 << 4)|h2);
+ }
+ decoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ decoded->data = ppbytes_flush(heap, O, &decoded->size);
+
+ encoded->flags = PPSTRING_BASE16|PPSTRING_ENCODED;
+ decoded->flags = PPSTRING_BASE16|PPSTRING_DECODED;
+ encoded->alterego = decoded, decoded->alterego = encoded;
+
+ ppstring_check_bom2(decoded, encoded);
+ return encoded;
+}
+
+static ppstring * ppstring_buffer (iof *O, ppheap *heap)
+{
+ ppstring *encoded, *decoded;
+ uint8_t *p, *e;
+
+ decoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ decoded->data = ppbytes_flush(heap, O, &decoded->size);
+
+ O = ppbytes_buffer(heap, (decoded->size << 1) + 1); // the exact size known
+ for (p = (uint8_t *)decoded->data, e = p + decoded->size; p < e; ++p)
+ iof_set2(O, base16_uc_alphabet[(*p) >> 4], base16_uc_alphabet[(*p) & 0xF]);
+ encoded = ppstruct_take(heap, sizeof(ppstring));
+ encoded->data = ppbytes_flush(heap, O, &encoded->size);
+ encoded->flags = PPSTRING_BASE16|PPSTRING_ENCODED;
+ decoded->flags = PPSTRING_BASE16|PPSTRING_DECODED;
+ encoded->alterego = decoded, decoded->alterego = encoded;
+ // ppstring_check_bom2(decoded, encoded); // ?
+ return encoded;
+}
+
+ppstring * ppstring_internal (const void *data, size_t size, ppheap *heap)
+{ // so far used only for crypt key
+ iof *O;
+ O = ppbytes_buffer(heap, size);
+ memcpy(O->buf, data, size);
+ O->pos = O->buf + size;
+ return ppstring_buffer(O, heap);
+}
+
+/* base85; a local decoder, to keep this part independent of utilbasexx */
+
+static const int8_t ppstring_base85_lookup[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+ 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,
+ 31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,
+ 47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,
+ 63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,
+ 79,80,81,82,83,84,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+#define base85_value(c) ppstring_base85_lookup[(uint8_t)(c)]
+
+#define base85_code(c1, c2, c3, c4, c5) ((((c1 * 85 + c2) * 85 + c3) * 85 + c4) * 85 + c5)
+#define base85_eof(c) (c == '~' || c < 0)
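+
+/*
+A worked example of base85_code() (illustrative): the group "9jqo^" has digit values
+'9'-33 == 24, 'j'-33 == 73, 'q'-33 == 80, 'o'-33 == 78, '^'-33 == 61, and
+
+  base85_code(24, 73, 80, 78, 61) == 1298230816 == 0x4D616E20
+
+whose four big-endian bytes are 'M' 'a' 'n' ' '. The 'z' and 'y' shortcuts below bypass this
+arithmetic for all-zero and all-space groups.
+*/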
+
+static iof_status ppscan_base85_decode (iof *I, iof *O)
+{
+ int c1, c2, c3, c4, c5;
+ uint32_t code;
+ while (iof_ensure(O, 4))
+ {
+ do { c1 = iof_get(I); } while (ignored_char(c1));
+ if (base85_eof(c1))
+ return IOFEOF;
+ switch (c1)
+ {
+ case 'z':
+ iof_set4(O, '\0', '\0', '\0', '\0');
+ continue;
+ case 'y':
+ iof_set4(O, ' ', ' ', ' ', ' ');
+ continue;
+ }
+ do { c2 = iof_get(I); } while (ignored_char(c2));
+ if (base85_eof(c2))
+ return IOFERR;
+ do { c3 = iof_get(I); } while (ignored_char(c3));
+ if (base85_eof(c3))
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, 84, 84, 84); /* padding with 'u' (117); 117-33 = 84 */
+ iof_set(O, (code >> 24));
+ return IOFEOF;
+ }
+ do { c4 = iof_get(I); } while (ignored_char(c4));
+ if (base85_eof(c4))
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0 || (c3 = base85_value(c3)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, c3, 84, 84);
+ iof_set2(O, code>>24, ((code>>16) & 0xff));
+ return IOFEOF;
+ }
+ do { c5 = iof_get(I); } while (ignored_char(c5));
+ if (base85_eof(c5))
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0 ||
+ (c3 = base85_value(c3)) < 0 || (c4 = base85_value(c4)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, c3, c4, 84);
+ iof_set3(O, (code >> 24), ((code >> 16) & 0xff), ((code >> 8) & 0xff));
+ return IOFEOF;
+ }
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0 || (c3 = base85_value(c3)) < 0 ||
+ (c4 = base85_value(c4)) < 0 || (c5 = base85_value(c5)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, c3, c4, c5);
+ iof_set4(O, (code >> 24), ((code >> 16) & 0xff), ((code >> 8) & 0xff), (code & 0xff));
+ }
+ return IOFFULL;
+}
+
+static ppstring * ppscan_base85 (iof *I, ppheap *heap)
+{ // base85 alphabet is 33..117; 'z' and 'y' are extra shorthands for four zero bytes and four spaces
+ ppstring *encoded, *decoded;
+ iof *O, B;
+ int c;
+
+ O = ppbytes_buffer(heap, PPSTRING_INIT);
+ for (c = iof_char(I); (c >= '!' && c <= 'u') || c == 'z' || c == 'y'; c = iof_next(I))
+ iof_put(O, c);
+ if (c == '~')
+ if ((c = iof_next(I)) == '>')
+ ++I->pos;
+ encoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ encoded->data = ppbytes_flush(heap, O, &encoded->size);
+
+ iof_string_reader(&B, encoded->data, encoded->size);
+ O = ppbytes_buffer(heap, (encoded->size * 5 / 4) + 1); // may be larger than that because of 'z' and 'y'
+ ppscan_base85_decode(&B, O);
+ decoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ decoded->data = ppbytes_flush(heap, O, &decoded->size);
+
+ encoded->flags = PPSTRING_BASE85|PPSTRING_ENCODED;
+ decoded->flags = PPSTRING_BASE85|PPSTRING_DECODED;
+ encoded->alterego = decoded, decoded->alterego = encoded;
+
+ ppstring_check_bom2(decoded, encoded);
+ return encoded;
+}
+
+ppstring * ppstring_decoded (ppstring *string)
+{
+ return (string->flags & PPSTRING_ENCODED) ? string->alterego : string;
+}
+
+ppstring * ppstring_encoded (ppstring *string)
+{
+ return (string->flags & PPSTRING_DECODED) ? string->alterego : string;
+}
+
+ppbyte * ppstring_decoded_data (ppstring *string)
+{
+ return (string->flags & PPSTRING_ENCODED) ? string->alterego->data : string->data;
+}
+
+ppbyte * ppstring_encoded_data (ppstring *string)
+{
+ return (string->flags & PPSTRING_DECODED) ? string->alterego->data : string->data;
+}
+
+
+/* encrypted string */
+
+static ppstring * ppscan_crypt_string (iof *I, ppcrypt *crypt, ppheap *heap)
+{
+ ppstring *encoded, *decoded;
+ iof *O;
+ int c, b, balance, encode;
+ uint8_t *p, *e;
+ size_t size;
+
+ O = ppbytes_buffer(heap, PPSTRING_INIT);
+ for (balance = 0, encode = 0, c = iof_char(I); c >= 0; )
+ {
+ switch (c)
+ {
+ case '\\':
+ if ((c = iof_next(I)) < 0)
+ break;
+ encode = 1;
+ switch (c)
+ {
+ case OCTAL_CHAR_CASE:
+ b = c - '0';
+ if ((c = iof_next(I)) >= 0 && c >= '0' && c <= '7')
+ {
+ b = (b << 3) + c - '0';
+ if ((c = iof_next(I)) >= 0 && c >= '0' && c <= '7')
+ {
+ b = (b << 3) + c - '0';
+ c = iof_next(I);
+ }
+ }
+ iof_put(O, b);
+ // c is set to the next char
+ break;
+ case 'n':
+ iof_put(O, '\n');
+ c = iof_next(I);
+ break;
+ case 'r':
+ iof_put(O, '\r');
+ c = iof_next(I);
+ break;
+ case 't':
+ iof_put(O, '\t');
+ c = iof_next(I);
+ break;
+ case 'b':
+ iof_put(O, '\b');
+ c = iof_next(I);
+ break;
+ case 'f':
+ iof_put(O, '\f');
+ c = iof_next(I);
+ break;
+ case NEWLINE_CHAR_CASE: // not a part of the string, ignore (pdf spec page 55)
+ c = iof_next(I);
+ break;
+ case '(': case ')': case '\\':
+ default: // for anything else backslash is ignored (pdf spec page 54)
+ iof_put(O, c);
+ c = iof_next(I);
+ break;
+ }
+ break;
+ case '(':
+ ++balance;
+ encode = 1;
+ iof_put(O, '(');
+ c = iof_next(I);
+ break;
+ case ')':
+ if (balance == 0)
+ {
+ c = IOFEOF;
+ ++I->pos;
+ }
+ else
+ {
+ --balance;
+ //encode = 1;
+ iof_put(O, ')');
+ c = iof_next(I);
+ }
+ break;
+ default:
+ if (ppstringesc(c) != 0)
+ encode = 1;
+ iof_put(O, c);
+ c = iof_next(I);
+ }
+ }
+ /* decrypt the buffer in place, update size */
+ if (ppstring_decrypt(crypt, O->buf, iof_size(O), O->buf, &size))
+ O->pos = O->buf + size;
+ decoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ decoded->data = ppbytes_flush(heap, O, &decoded->size);
+ /* make encoded counterpart */
+ if (encode)
+ {
+ O = ppbytes_buffer(heap, decoded->size + 1); // we don't know the final encoded size
+ for (p = (uint8_t *)decoded->data, e = p + decoded->size; p < e; ++p)
+ {
+ b = ppstringesc(*p);
+ switch (b)
+ {
+ case 0:
+ iof_put(O, *p);
+ break;
+ case -1:
+ iof_put4(O, '\\', ((*p) >> 6) + '0', (((*p) >> 3) & 7) + '0', ((*p) & 7) + '0');
+ break;
+ default:
+ iof_put2(O, '\\', b);
+ break;
+ }
+ }
+ encoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ encoded->data = ppbytes_flush(heap, O, &encoded->size);
+ encoded->flags = PPSTRING_ENCODED;
+ decoded->flags = PPSTRING_DECODED;
+ encoded->alterego = decoded, decoded->alterego = encoded;
+ ppstring_check_bom2(decoded, encoded);
+ }
+ else
+ {
+ decoded->flags = 0;
+ decoded->alterego = decoded;
+ ppstring_check_bom(decoded);
+ encoded = decoded;
+ }
+ return encoded;
+}
+
+static ppstring * ppscan_crypt_base16 (iof *I, ppcrypt *crypt, ppheap *heap)
+{
+ ppstring *encoded, *decoded;
+ iof *O;
+ int c;
+ uint8_t *p, *e;
+ int8_t h1, h2;
+ size_t size;
+
+ O = ppbytes_buffer(heap, PPSTRING_INIT);
+ for (c = iof_char(I); c != '>'; )
+ {
+ if ((h1 = pphex(c)) < 0)
+ {
+ if (ignored_char(c))
+ {
+ c = iof_next(I);
+ continue;
+ }
+ break;
+ }
+ do {
+ c = iof_next(I);
+ if ((h2 = pphex(c)) >= 0)
+ {
+ c = iof_next(I);
+ break;
+ }
+ if (!ignored_char(c)) // c == '>' || c < 0 or some crap
+ {
+ h2 = 0;
+ break;
+ }
+ } while (1);
+ iof_put(O, (h1 << 4)|h2);
+ }
+ if (c == '>')
+ ++I->pos;
+ /* decrypt the buffer in place, update size */
+ if (ppstring_decrypt(crypt, O->buf, iof_size(O), O->buf, &size))
+ O->pos = O->buf + size;
+ decoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ decoded->data = ppbytes_flush(heap, O, &decoded->size);
+
+ O = ppbytes_buffer(heap, (decoded->size << 1) + 1);
+ for (p = (uint8_t *)decoded->data, e = p + decoded->size; p < e; ++p)
+ iof_set2(O, base16_uc_alphabet[(*p) >> 4], base16_uc_alphabet[(*p) & 0xF]);
+ encoded = (ppstring *)ppstruct_take(heap, sizeof(ppstring));
+ encoded->data = ppbytes_flush(heap, O, &encoded->size);
+
+ encoded->flags = PPSTRING_BASE16|PPSTRING_ENCODED;
+ decoded->flags = PPSTRING_BASE16|PPSTRING_DECODED;
+ encoded->alterego = decoded, decoded->alterego = encoded;
+
+ ppstring_check_bom2(decoded, encoded);
+ return encoded;
+}
+
+/* scanner stack */
+
+#define PPSTACK_BUFFER 512
+
+static void ppstack_init (ppstack *stack, ppheap *heap)
+{
+ stack->buf = stack->pos = (ppobj *)pp_malloc(PPSTACK_BUFFER * sizeof(ppobj));
+ stack->size = 0;
+ stack->space = PPSTACK_BUFFER;
+ stack->heap = heap;
+}
+
+#define ppstack_free_buffer(stack) (pp_free((stack)->buf))
+
+static void ppstack_resize (ppstack *stack)
+{
+ ppobj *newbuffer;
+ stack->space <<= 1;
+ newbuffer = (ppobj *)pp_malloc(stack->space * sizeof(ppobj));
+ memcpy(newbuffer, stack->buf, stack->size * sizeof(ppobj));
+ ppstack_free_buffer(stack);
+ stack->buf = newbuffer;
+ stack->pos = newbuffer + stack->size;
+}
+
+#define ppstack_push(stack) ((void)((stack)->size < (stack)->space || (ppstack_resize(stack), 0)), ++(stack)->size, (stack)->pos++)
+#define ppstack_pop(stack, n) ((stack)->size -= (n), (stack)->pos -= (n))
+#define ppstack_at(stack, i) ((stack)->buf + i)
+#define ppstack_clear(stack) ((stack)->pos = (stack)->buf, (stack)->size = 0)
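+
+/*
+The usual pattern built from these macros (an illustrative sketch; the real thing is in
+ppscan_obj() below):
+
+  mark = stack->size;            // remember the depth before a compound object
+  obj = ppstack_push(stack);     // reserve a slot; the buffer may grow here
+  ...
+  obj = ppstack_at(stack, mark); // re-fetch, as the buffer may have been reallocated
+  ppstack_pop(stack, size);      // drop the items, leaving the compound object on top
+*/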
+
+/* scanner commons */
+
+#define ppscan_uint(I, u) iof_get_usize(I, u)
+#define ppread_uint(s, u) string_to_usize((const char *)(s), u)
+
+static ppobj * ppscan_numobj (iof *I, ppobj *obj, int negative)
+{
+ ppint integer;
+ ppnum number;
+ int exponent;
+ int c;
+ c = iof_char(I);
+ iof_scan_integer(I, c, integer);
+ switch(c)
+ {
+ case '.':
+ {
+ number = (ppnum)integer;
+ c = iof_next(I);
+ iof_scan_fraction(I, c, number, exponent);
+ double_negative_exp10(number, exponent);
+ obj->type = PPNUM, obj->number = negative ? -number : number;
+ break;
+ }
+ default:
+ obj->type = PPINT, obj->integer = negative ? -integer : integer;
+ break;
+ }
+ return obj;
+}
+
+static ppobj * ppscan_numobj_frac (iof *I, ppobj *obj, int negative)
+{
+ ppnum number;
+ int c, exponent;
+
+ number = 0.0;
+ c = iof_next(I);
+ iof_scan_fraction(I, c, number, exponent);
+ double_negative_exp10(number, exponent);
+ obj->type = PPNUM, obj->number = negative ? -number : number;
+ return obj;
+}
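+
+/* Examples (illustrative): "42" yields PPINT 42 via ppscan_numobj(); "3.25" switches to PPNUM
+   at the '.'; a bare ".5" is handled by ppscan_numobj_frac(); the sign is stripped by the
+   callers, which pass it in as the negative flag. */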
+
+static int ppscan_find (iof *I)
+{ // skips whitechars and comments
+ int c;
+ for (c = iof_char(I); ; c = iof_next(I))
+ {
+ switch (c)
+ {
+ case IGNORED_CHAR_CASE:
+ break;
+ case '%': {
+ do {
+ if ((c = iof_next(I)) < 0)
+ return c;
+ } while (!newline_char(c));
+ break;
+ }
+ default:
+ return c;
+ }
+ }
+ return c; // never reached
+}
+
+static int ppscan_keyword (iof *I, const char *keyword, size_t size)
+{
+ size_t i;
+ int c;
+ if ((size_t)iof_left(I) >= size)
+ {
+ if (memcmp(I->pos, keyword, size) != 0)
+ return 0;
+ I->pos += size;
+ return 1;
+ }
+ // sticky case, we can't go back
+ for (i = 0, c = iof_char(I); i < size; ++i, ++keyword, c = iof_next(I))
+ if (c < 0 || *keyword != c) /* PJ20190503 bugfix: there was (i!=c), we actually never get here anyway */
+ return 0;
+ return 1;
+}
+
+#define ppscan_key(I, literal) ppscan_keyword(I, "" literal, sizeof(literal) - 1)
+
+/* objects parser */
+
+static ppref * ppref_unresolved (ppheap *heap, ppuint refnumber, ppuint refversion)
+{
+ ppref *ref = (ppref *)ppstruct_take(heap, sizeof(ppref));
+ memset(ref, 0, sizeof(ppref));
+ ref->object.type = PPNONE;
+ ref->number = refnumber;
+ ref->version = refversion;
+ return ref;
+}
+
+#define PPMARK PPNONE
+
+static ppobj * ppscan_obj (iof *I, ppdoc *pdf, ppxref *xref)
+{
+ int c;
+ ppobj *obj;
+ size_t mark, size;
+ ppuint refnumber, refversion;
+ ppref *ref;
+ ppstack *stack;
+ ppcrypt *crypt;
+
+ stack = &pdf->stack;
+ c = iof_char(I);
+ switch (c)
+ {
+ case DIGIT_CHAR_CASE:
+ return ppscan_numobj(I, ppstack_push(stack), 0);
+ case '.':
+ return ppscan_numobj_frac(I, ppstack_push(stack), 0);
+ case '+':
+ ++I->pos;
+ return ppscan_numobj(I, ppstack_push(stack), 0);
+ case '-':
+ ++I->pos;
+ return ppscan_numobj(I, ppstack_push(stack), 1);
+ case '/':
+ ++I->pos;
+ obj = ppstack_push(stack);
+ obj->type = PPNAME;
+ obj->name = ppscan_name(I, &pdf->heap);
+ return obj;
+ case '(':
+ ++I->pos;
+ obj = ppstack_push(stack);
+ obj->type = PPSTRING;
+ if (ppcrypt_ref(pdf, crypt))
+ obj->string = ppscan_crypt_string(I, crypt, &pdf->heap);
+ else
+ obj->string = ppscan_string(I, &pdf->heap);
+ return obj;
+ case '[':
+ mark = stack->size;
+ obj = ppstack_push(stack);
+ obj->type = PPMARK; // ppscan_obj() checks types backward for 'R', so set the type immediately (reserved for PPARRAY)
+ obj->any = NULL;
+ ++I->pos;
+ for (c = ppscan_find(I); c != ']'; c = ppscan_find(I))
+ {
+ if (ppscan_obj(I, pdf, xref) == NULL)
+ { // callers assume that a NULL return means nothing was pushed
+ size = stack->size - mark; // pop items AND the obj reserved for array
+ ppstack_pop(stack, size);
+ return NULL;
+ }
+ }
+ ++I->pos;
+ size = stack->size - mark - 1;
+ obj = ppstack_at(stack, mark); // stack might have been reallocated
+ obj->type = PPARRAY;
+ obj->array = pparray_create(ppstack_at(stack, mark + 1), size, &pdf->heap);
+ ppstack_pop(stack, size); // pop array items, leave the array on top
+ return obj;
+ case '<':
+ if ((c = iof_next(I)) == '<')
+ {
+ mark = stack->size;
+ obj = ppstack_push(stack);
+ obj->type = PPMARK;
+ obj->any = NULL;
+ ++I->pos;
+ for (c = ppscan_find(I); c != '>'; c = ppscan_find(I))
+ {
+ if (ppscan_obj(I, pdf, xref) == NULL)
+ {
+ size = stack->size - mark;
+ ppstack_pop(stack, size);
+ return NULL;
+ }
+ }
+ if (iof_next(I) == '>')
+ ++I->pos;
+ size = stack->size - mark - 1;
+ obj = ppstack_at(stack, mark);
+ obj->type = PPDICT;
+ obj->dict = ppdict_create(ppstack_at(stack, mark + 1), size, &pdf->heap);
+ ppstack_pop(stack, size);
+ return obj;
+ }
+ obj = ppstack_push(stack);
+ obj->type = PPSTRING;
+ if (ppcrypt_ref(pdf, crypt))
+ obj->string = ppscan_crypt_base16(I, crypt, &pdf->heap);
+ else
+ obj->string = ppscan_base16(I, &pdf->heap);
+ return obj;
+ case 'R':
+ if (stack->size >= 2 && stack->pos[-1].type == PPINT && stack->pos[-2].type == PPINT)
+ {
+ ++I->pos;
+ obj = &stack->pos[-2];
+ refnumber = (ppuint)obj->integer;
+ ppstack_pop(stack, 1); // pop version number, retype obj to a reference
+ if (xref == NULL || (ref = ppxref_find(xref, refnumber)) == NULL)
+ { /* pdf spec page 64: unresolvable reference is not an error, should just be treated as a reference to null.
+ we also need this to read trailer, where refs can't be resolved yet */
+ refversion = (obj + 1)->integer;
+ //if (xref != NULL)
+ // loggerf("unresolved reference %s", ppref_str(refnumber, refversion));
+ ref = ppref_unresolved(stack->heap, refnumber, refversion);
+ }
+ obj->type = PPREF;
+ obj->ref = ref;
+ return obj;
+ }
+ break;
+ case 't':
+ if (iof_next(I) == 'r' && iof_next(I) == 'u' && iof_next(I) == 'e')
+ {
+ ++I->pos;
+ obj = ppstack_push(stack);
+ obj->type = PPBOOL;
+ obj->integer = 1;
+ return obj;
+ }
+ break;
+ case 'f':
+ if (iof_next(I) == 'a' && iof_next(I) == 'l' && iof_next(I) == 's' && iof_next(I) == 'e')
+ {
+ ++I->pos;
+ obj = ppstack_push(stack);
+ obj->type = PPBOOL;
+ obj->integer = 0;
+ return obj;
+ }
+ break;
+ case 'n':
+ if (iof_next(I) == 'u' && iof_next(I) == 'l' && iof_next(I) == 'l')
+ {
+ ++I->pos;
+ obj = ppstack_push(stack);
+ obj->type = PPNULL;
+ obj->any = NULL;
+ return obj;
+ }
+ break;
+ }
+ return NULL;
+}
+
+/*
+A variant for contents streams (aka postscript); aware of operators, blind to references.
+We are still parsing PDF, so we don't care about postscript-specific stuff such as radix numbers
+and scientific number notation. It takes a ppstack * as context (no ppdoc *) so that the
+contents parser can be run beyond the scope of a ppdoc heap.
+*/
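+
+/*
+For instance (illustrative), scanning the contents snippet "1 0 0 1 72 720 cm" pushes six
+numeric objects (PPINT here) followed by the executable name 'cm' (PPNAME with PPNAME_EXEC);
+nothing is resolved or evaluated here.
+*/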
+
+static ppstring * ppstring_inline (iof *I, ppdict *imagedict, ppheap *heap);
+
+static ppobj * ppscan_psobj (iof *I, ppstack *stack)
+{
+ int c;
+ ppobj *obj, *op;
+ size_t size, mark;
+ ppname *exec;
+ ppbyte *data;
+
+ c = iof_char(I);
+ switch (c)
+ {
+ case DIGIT_CHAR_CASE:
+ return ppscan_numobj(I, ppstack_push(stack), 0);
+ case '.':
+ return ppscan_numobj_frac(I, ppstack_push(stack), 0);
+ case '+':
+ c = iof_next(I);
+ if (base10_digit(c)) // '+.abc' is probably an executable name, but we are not in postscript
+ return ppscan_numobj(I, ppstack_push(stack), 0);
+ else if (c == '.')
+ return ppscan_numobj_frac(I, ppstack_push(stack), 0);
+ obj = ppstack_push(stack);
+ obj->type = PPNAME;
+ obj->name = ppscan_exec(I, stack->heap, '+');
+ return obj;
+ case '-':
+ c = iof_next(I);
+ if (base10_digit(c)) // ditto, we would handle type1 '-|' '|-' operators though
+ return ppscan_numobj(I, ppstack_push(stack), 1);
+ else if (c == '.')
+ return ppscan_numobj_frac(I, ppstack_push(stack), 1);
+ obj = ppstack_push(stack);
+ obj->type = PPNAME;
+ obj->name = ppscan_exec(I, stack->heap, '-');
+ return obj;
+ case '/':
+ ++I->pos;
+ obj = ppstack_push(stack);
+ obj->type = PPNAME;
+ obj->name = ppscan_name(I, stack->heap);
+ return obj;
+ case '(':
+ ++I->pos;
+ obj = ppstack_push(stack);
+ obj->type = PPSTRING;
+ obj->string = ppscan_string(I, stack->heap);
+ return obj;
+ case '[':
+ mark = stack->size;
+ obj = ppstack_push(stack);
+ obj->type = PPMARK;
+ obj->any = NULL;
+ ++I->pos;
+ for (c = ppscan_find(I); c != ']'; c = ppscan_find(I))
+ {
+ if (ppscan_psobj(I, stack) == NULL)
+ {
+ size = stack->size - mark;
+ ppstack_pop(stack, size);
+ return NULL;
+ }
+ }
+ ++I->pos;
+ size = stack->size - mark - 1;
+ obj = ppstack_at(stack, mark);
+ obj->type = PPARRAY;
+ obj->array = pparray_create(ppstack_at(stack, mark + 1), size, stack->heap);
+ ppstack_pop(stack, size);
+ return obj;
+ case '<':
+ if ((c = iof_next(I)) == '<')
+ {
+ mark = stack->size;
+ obj = ppstack_push(stack);
+ obj->type = PPMARK;
+ obj->any = NULL;
+ ++I->pos;
+ for (c = ppscan_find(I); c != '>'; c = ppscan_find(I))
+ {
+ if (ppscan_psobj(I, stack) == NULL)
+ {
+ size = stack->size - mark;
+ ppstack_pop(stack, size);
+ return NULL;
+ }
+ }
+ if (iof_next(I) == '>')
+ ++I->pos;
+ size = stack->size - mark - 1;
+ obj = ppstack_at(stack, mark);
+ obj->type = PPDICT;
+ obj->dict = ppdict_create(ppstack_at(stack, mark + 1), size, stack->heap);
+ ppstack_pop(stack, size);
+ return obj;
+ }
+ obj = ppstack_push(stack);
+ obj->type = PPSTRING;
+ if (c == '~')
+ ++I->pos, obj->string = ppscan_base85(I, stack->heap);
+ else
+ obj->string = ppscan_base16(I, stack->heap);
+ return obj;
+ default:
+ if (!ppnamebyte(c))
+ break; // forbid empty names; dead loop otherwise
+ ++I->pos;
+ /* true false null practically don't occur in streams, so it makes sense to assume that we get an operator name here.
+ If it happens to be a keyword we could give back those several bytes to the heap, but the heap buffer is tricky enough. */
+ exec = ppscan_exec(I, stack->heap, (uint8_t)c);
+ data = exec->data;
+ obj = ppstack_push(stack);
+ switch (data[0])
+ {
+ case 't':
+ if (data[1] == 'r' && data[2] == 'u' && data[3] == 'e' && data[4] == '\0')
+ {
+ obj->type = PPBOOL;
+ obj->integer = 1;
+ return obj;
+ }
+ break;
+ case 'f':
+ if (data[1] == 'a' && data[2] == 'l' && data[3] == 's' && data[4] == 'e' && data[5] == '\0')
+ {
+ obj->type = PPBOOL;
+ obj->integer = 0;
+ return obj;
+ }
+ break;
+ case 'n':
+ if (data[1] == 'u' && data[2] == 'l' && data[3] == 'l' && data[4] == '\0')
+ {
+ obj->type = PPNULL;
+ obj->any = NULL;
+ return obj;
+ }
+ break;
+ case 'B':
+ /*
+ Inline images break rules of operand/operator syntax, so 'BI/ID' operators need to be treated as special syntactic keywords.
+
+ BI <keyval pairs> ID<whitechar?><imagedata><whitechar?>EI
+
+ We treat the image as a single syntactic token; BI starts collecting a dict, ID is the beginning of the data. Effectively EI
+ operator obtains two operands - dict and string. It is ok to put three items onto the stack; callers don't assume there is just one.
+ */
+ if (data[1] == 'I' && data[2] == '\0')
+ {
+ ppdict *imagedict;
+ ppname *name;
+ /* key val pairs -> dict */
+ mark = stack->size - 1;
+ obj->type = PPMARK;
+ obj->any = NULL;
+ for (c = ppscan_find(I); ; c = ppscan_find(I))
+ {
+ if ((op = ppscan_psobj(I, stack)) == NULL)
+ {
+ size = stack->size - mark;
+ ppstack_pop(stack, size);
+ return NULL;
+ }
+ if (op->type == PPNAME)
+ {
+ name = op->name;
+ if (name->flags & PPNAME_EXEC)
+ {
+ if (name->size != 2 || name->data[0] != 'I' || name->data[1] != 'D')
+ { // weird
+ size = stack->size - mark;
+ ppstack_pop(stack, size);
+ return NULL;
+ }
+ break;
+ }
+ }
+ }
+ size = stack->size - mark - 1;
+ obj = ppstack_at(stack, mark);
+ obj->type = PPDICT;
+ obj->dict = imagedict = ppdict_create(ppstack_at(stack, mark + 1), size, stack->heap);
+ ppstack_pop(stack, size);
+ /* put image data string */
+ obj = ppstack_push(stack);
+ obj->type = PPSTRING;
+ obj->string = ppstring_inline(I, imagedict, stack->heap);
+ /* put EI operator name */
+ obj = ppstack_push(stack);
+ obj->type = PPNAME;
+ obj->name = ppexec_internal("EI", 2, stack->heap);
+ return obj;
+ }
+ break;
+ }
+ obj->type = PPNAME;
+ obj->name = exec;
+ return obj;
+ }
+ return NULL;
+}
+
+/*
+We try to get the exact inline image length from its dict params. If we cannot predict the length, we have to scan the input until 'EI'.
+I've checked on many examples that it gives the same results, but one can never be sure, as 'EI' might happen to be a part of the data.
+Stripping the white char is also very heuristic; \0 is a white char in PDF and very likely to be a data byte, so this is a weak method (pdf spec page 352).
+
+Revision 20190327: inline images may be compressed, in which case we can't predict the length.
+*/
+
+static size_t inline_image_length (ppdict *dict)
+{
+ ppuint w, h, bpc, colors;
+ ppname *cs;
+
+ if (ppdict_get_name(dict, "F") == NULL)
+ {
+ if (ppdict_get_uint(dict, "W", &w) && ppdict_get_uint(dict, "H", &h) && ppdict_get_uint(dict, "BPC", &bpc) && (cs = ppdict_get_name(dict, "CS")) != NULL)
+ {
+ if (ppname_is(cs, "DeviceGray"))
+ colors = 1;
+ else if (ppname_is(cs, "DeviceRGB"))
+ colors = 3;
+ else if (ppname_is(cs, "DeviceCMYK"))
+ colors = 4;
+ else
+ return PP_LENGTH_UNKNOWN;
+ return (w * h * bpc * colors + 7) >> 3;
+ }
+ }
+ return PP_LENGTH_UNKNOWN;
+}
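+
+/*
+A worked example of the computation above (illustrative): for /W 16 /H 8 /BPC 4 /CS /DeviceRGB
+(colors == 3) the payload is (16 * 8 * 4 * 3 + 7) >> 3 == 192 bytes; any /F (filter) entry makes
+the length unpredictable and we fall back to scanning for 'EI'.
+*/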
+
+static ppstring * ppstring_inline (iof *I, ppdict *imagedict, ppheap *heap)
+{
+ iof *O;
+ int c, d, e;
+ size_t length, leftin, leftout, bytes;
+
+ c = iof_char(I);
+ if (ignored_char(c))
+ c = iof_next(I);
+
+ length = inline_image_length(imagedict);
+ if (length != PP_LENGTH_UNKNOWN)
+ {
+ O = ppbytes_buffer(heap, length);
+ while (length > 0 && iof_readable(I) && iof_writable(O))
+ {
+ leftin = iof_left(I);
+ leftout = iof_left(O);
+ bytes = length;
+ if (bytes > leftin) bytes = leftin;
+ if (bytes > leftout) bytes = leftout;
+ memcpy(O->pos, I->pos, bytes);
+ I->pos += bytes;
+ O->pos += bytes;
+ length -= bytes;
+ }
+ // gobble EI
+ if (ppscan_find(I) == 'E')
+ if (iof_next(I) == 'I')
+ ++I->pos;
+ }
+ else
+ {
+ O = ppbytes_buffer(heap, PPSTRING_INIT); // ?
+ while (c >= 0)
+ {
+ if (c == 'E')
+ {
+ d = iof_next(I);
+ if (d == 'I')
+ {
+ e = iof_next(I);
+ if (!ppnamebyte(e))
+ { /* strip one newline from the end and stop */
+ if (O->pos - 2 >= O->buf) // sanity
+ {
+ c = *(O->pos - 1);
+ if (ignored_char(c))
+ {
+ if (c == 0x0A && *(O->pos - 2) == 0x0D)
+ O->pos -= 2;
+ else
+ O->pos -= 1;
+ }
+ }
+ break;
+ }
+ iof_put2(O, c, d);
+ c = e;
+ }
+ else
+ {
+ iof_put(O, c);
+ c = d;
+ }
+ }
+ else
+ {
+ iof_put(O, c);
+ c = iof_next(I);
+ }
+ }
+ }
+ return ppstring_buffer(O, heap);
+}
+
+/* input reader */
+
+/*
+PDF input is a pseudo file that keeps either a FILE * or data. The reader iof * is a proxy to the input
+that provides a byte-by-byte interface. Our iof structure is capable of linking an iof_file *input,
+but to avoid redundant checks on the IOF_DATA flag, here we link iof *I directly to the FILE * or mem buffer.
+When reading from a file we need an internal buffer, which should be kept rather small, as it is
+only used to parse xrefs and objects (no streams). We allocate the buffer from a private heap
+(not static) to avoid conflicts when processing more than one pdf at once. Besides, the input buffer may be
+needed after loading the document, e.g. to access references' raw data.
+*/
+
+#define PPDOC_BUFFER 0xFFF // keep that small, it is only used to parse body objects
+
+static void ppdoc_reader_init (ppdoc *pdf, iof_file *input)
+{
+ iof *I;
+ pdf->input = *input;
+ input = &pdf->input;
+ input->refcount = 1;
+ I = &pdf->reader;
+ if (input->flags & IOF_DATA)
+ {
+ pdf->buffer = NULL; // input iof_file is the buffer
+ iof_string_reader(I, NULL, 0);
+ }
+ else
+ {
+ pdf->buffer = (uint8_t *)ppbytes_take(&pdf->heap, PPDOC_BUFFER);
+ iof_setup_file_handle_reader(I, NULL, 0, iof_file_get_fh(input)); // gets IOF_FILE_HANDLE flag and FILE *
+ I->space = PPDOC_BUFFER; // used on refill
+ }
+}
+
+/*
+Whenever we need to read the input file, we fseek to the given offset and fread into the private buffer.
+The length we need is not always predictable, in which case PPDOC_BUFFER bytes are read (keep it small).
+I->buf = I->pos is set to the beginning, I->end set to the end (end is the first byte one shouldn't read).
+*/
+
+static iof * ppdoc_reader (ppdoc *pdf, size_t offset, size_t length)
+{
+ iof_file *input;
+ iof *I;
+ input = &pdf->input;
+ I = &pdf->reader;
+ if (iof_file_seek(input, (long)offset, SEEK_SET) != 0)
+ return NULL;
+ I->flags &= ~IOF_STOPPED;
+ if (input->flags & IOF_DATA)
+ {
+ I->buf = I->pos = input->pos;
+ I->end = (length == PP_LENGTH_UNKNOWN || I->pos + length >= input->end) ? input->end : (I->pos + length);
+ }
+ else
+ {
+ I->buf = I->pos = pdf->buffer; // ->buf is actually permanently equal to pdf->buffer, but we might need some tricks
+ if (length == PP_LENGTH_UNKNOWN || length > PPDOC_BUFFER)
+ length = PPDOC_BUFFER;
+ length = fread(I->buf, 1, length, I->file);
+ I->end = I->buf + length;
+ }
+ return I;
+}
+
+/* The position from the beginning of input
+- for data buffer: (pdf->input.pos - pdf->input.buf) + (I->pos - I->buf)
+ I->buf == pdf->input.pos, so this resolves to (I->pos - pdf->input.buf), independent from I->buf
+- for file buffer: ftell(pdf->input.file) - (I->end - I->pos)
+*/
+
+#define ppdoc_reader_tell(pdf, I) ((size_t)(((pdf)->input.flags & IOF_DATA) ? ((I)->pos - (pdf)->input.buf) : (ftell(iof_file_get_fh(&(pdf)->input)) - ((I)->end - (I)->pos))))
+
+/* pdf */
+
+#define PPDOC_HEADER 10 // "%PDF-?.??\n"
+
+static int ppdoc_header (ppdoc *pdf, uint8_t header[PPDOC_HEADER])
+{
+ size_t i;
+ if (memcmp(header, "%PDF-", 5) != 0)
+ return 0;
+ for (i = 5; i < PPDOC_HEADER - 1 && !ignored_char(header[i]); ++i)
+ pdf->version[i - 5] = header[i];
+ pdf->version[i - 5] = '\0';
+ return 1;
+}
+
+static int ppdoc_tail (ppdoc *pdf, iof_file *input, size_t *pxrefoffset)
+{
+ int c;
+ uint8_t tail[4*10], *p, back, tailbytes;
+
+ if (iof_file_seek(input, 0, SEEK_END) != 0)
+ return 0;
+ pdf->filesize = (size_t)iof_file_tell(input);
+ // a simple heuristic to avoid the fgetc() / fseek(-2) hiccup: keep seeking back by len(startxref) + 1 == 10
+ // until a letter is found (assuming liberal white characters and tail length)
+ for (back = 1, tailbytes = 0; ; ++back)
+ {
+ if (iof_file_seek(input, -10, SEEK_CUR) != 0)
+ return 0;
+ tailbytes += 10;
+ c = iof_file_getc(input);
+ tailbytes -= 1;
+ switch (c)
+ {
+ case IGNORED_CHAR_CASE:
+ case DIGIT_CHAR_CASE:
+ case '%': case 'E': case 'O': case 'F':
+ if (back > 4) // 2 should be enough
+ return 0;
+ continue;
+ case 's': case 't': case 'a': case 'r': case 'x': case 'e': case 'f':
+ if (iof_file_read(tail, 1, tailbytes, input) != tailbytes)
+ return 0;
+ tail[tailbytes] = '\0';
+ for (p = &tail[0]; ; ++p)
+ {
+ if (*p == '\0')
+ return 0;
+ if ((c = base10_value(*p)) >= 0)
+ break;
+ }
+ ppread_uint(p, pxrefoffset);
+ return 1;
+ default:
+ return 0;
+ }
+ }
+ return 0; // never reached
+}
+
+/* xref/body */
+
+static int ppscan_start_entry (iof *I, ppref *ref)
+{
+ ppuint u;
+ ppscan_find(I); if (!ppscan_uint(I, &u) || u != ref->number) return 0;
+ ppscan_find(I); if (!ppscan_uint(I, &u) || u != ref->version) return 0;
+ ppscan_find(I); if (!ppscan_key(I, "obj")) return 0;
+ ppscan_find(I);
+ return 1;
+}
+
+static int ppscan_skip_entry (iof *I)
+{
+ ppuint u;
+ ppscan_find(I); if (!ppscan_uint(I, &u)) return 0;
+ ppscan_find(I); if (!ppscan_uint(I, &u)) return 0;
+ ppscan_find(I); if (!ppscan_key(I, "obj")) return 0;
+ ppscan_find(I);
+ return 1;
+}
+
+static int ppscan_start_stream (iof *I, ppdoc *pdf, size_t *streamoffset)
+{
+ int c;
+ ppscan_find(I);
+ if (ppscan_key(I, "stream"))
+ { // PJ20180912 bugfix: we were gobbling white characters (also null byte), while "stream" may be followed by EOL
+ // pdf spec page 60: "CARRIAGE RETURN and a LINE FEED or just a LINE FEED, and not by a CARRIAGE RETURN alone"
+ c = iof_char(I);
+ if (c == 0x0D)
+ {
+ if (iof_next(I) == 0x0A) // should be
+ ++I->pos;
+ }
+ else if (c == 0x0A)
+ {
+ ++I->pos;
+ }
+ *streamoffset = ppdoc_reader_tell(pdf, I);
+ return 1;
+ }
+ return 0;
+}
+
+static ppxref * ppxref_load (ppdoc *pdf, size_t xrefoffset);
+static ppxref * ppxref_load_chain (ppdoc *pdf, ppxref *xref);
+
+/* Parsing xref table
+
+ 1 10 // first ref number and refs count
+ 0000000000 00000 n // 10-digits offset, 5 digits version, type identifier
+ 0000000000 00000 n // 'n' stands for normal (in use)
+ 0000000000 00000 f // 'f' stands for free (not used)
+ ...
+
+Free entries seem to be a relic of ancient times, completely useless for us. To avoid parsing the xref table twice,
+we waste some space on free entries by allocating one plane of refs for each section. Later on we slice sections,
+so that effectively free entries are not involved in the map.
+
+Subsequent refs get a number, version and offset. Other fields are initialized when parsing the PDF body.
+
+Having the xref table loaded, we sort sections for future binary search (an xref with objects count == 0 is considered invalid).
+
+Then we have to deal with the trailer dict. In general, to load objects and resolve references we need a complete chain
+of xrefs (not only the top). To load the previous xref, we need its offset, which is given in the trailer. So we have to
+parse the trailer ignoring references, which might be unresolvable at this point (the objects parser makes a dummy check
+for xref != NULL when resolving refs in ppscan_obj(), which irritates me, but I don't want a separate parser for the trailer..).
+The same applies to xref streams, in which we have to parse the trailer without having the xref map at all. So the procedure is:
+
+ - load xref map, initialize references, make it ready to search
+ - parse trailer ignoring references
+ - get /Prev xref offset and load older xref (linked list via ->prev)
+ - sort all refs in all xrefs by offset
+ - parse refs in order resolving references in contained objects
+ - fix trailer references
+
+The first created xref becomes pdf->xref (the top xref). We link it early to control offsets already read (insane loops?).
+*/
+
+// Every xref table item "0000000000 00000 n" is said to be terminated with a 2-byte EOL but we don't like relying on whites.
+#define xref_item_length (10 + 1 + 5 + 1 + 1)
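+
+/*
+A worked example (illustrative): for the item "0000001234 00007 n" the loop below skips the
+leading zeros, reads offset 1234, skips the space and zeros, reads version 7, and buffer[17] == 'n'
+selects the in-use branch.
+*/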
+
+static ppxref * ppxref_load_table (iof *I, ppdoc *pdf, size_t xrefoffset)
+{
+ ppxref *xref;
+ ppxsec *xrefsection;
+ ppref *ref;
+ ppuint first, count, refindex;
+ uint8_t buffer[xref_item_length + 1];
+ const char *p;
+ const ppobj *obj;
+
+ buffer[xref_item_length] = '\0';
+ xref = ppxref_create(pdf, 0, xrefoffset);
+ if (pdf->xref == NULL) pdf->xref = xref;
+
+ for (ppscan_find(I); ppscan_uint(I, &first); ppscan_find(I))
+ {
+ ppscan_find(I);
+ if (!ppscan_uint(I, &count))
+ return NULL;
+ if (count == 0) // weird
+ continue;
+ xref->count += count;
+ xrefsection = NULL;
+ ref = (ppref *)ppstruct_take(&pdf->heap, count * sizeof(ppref));
+ for (refindex = 0; refindex < count; ++refindex, ++ref)
+ {
+ ref->xref = xref;
+ ref->number = first + refindex;
+ ppscan_find(I);
+ iof_read(I, buffer, xref_item_length);
+ switch (buffer[xref_item_length - 1])
+ {
+ case 'n':
+ if (xrefsection == NULL)
+ {
+ xrefsection = ppxref_push_section(xref, &pdf->heap);
+ xrefsection->first = ref->number;
+ xrefsection->refs = ref;
+ }
+ xrefsection->last = ref->number;
+ for (p = (const char *)buffer; *p == '0'; ++p);
+ p = ppread_uint(p, &ref->offset);
+ for ( ; *p == ' ' || *p == '0'; ++p);
+ p = ppread_uint(p, &ref->version);
+ ref->object.type = PPNONE; // init for sanity
+ ref->object.any = NULL;
+ ref->length = 0;
+ break;
+ case 'f':
+ default:
+ --ref;
+ xrefsection = NULL;
+ --xref->count;
+ }
+ }
+ }
+ /* sort section */
+ ppxref_sort(xref); // case of xref->size == 0 handled by ppxref_load_chain()
+ /* get trailer ignoring refs */
+ if (!ppscan_key(I, "trailer"))
+ return NULL;
+ ppscan_find(I);
+ if ((obj = ppscan_obj(I, pdf, NULL)) == NULL)
+ return NULL;
+ ppstack_pop(&pdf->stack, 1);
+ if (obj->type != PPDICT)
+ return NULL;
+ xref->trailer = *obj;
+ return ppxref_load_chain(pdf, xref);
+}
+
+/* Parsing xref stream
+First we load the trailer, ignoring references. The dict defines sections and field lengths:
+
+ /Size % max ref number plus 1
+ /Index [ first count first count ... ] % a pair of numbers for every section, defaults to [0 Size]
+ /W [w1 w2 w3] % fields lengths, 0 states for omitted field
+
+xref stream data is a continuous stream of binary number triplets. The first number is a type:
+
+ 0 - free entry (as 'f' in xref table)
+ 1 - normal entry, followed by offset and version (as 'n' in xref table)
+ 2 - compressed entry, followed by parent object stream number and entry index
+
+0 and 1 are handled as the 'f' and 'n' entries of an xref table. For type 2 we normally initialize
+ref->number and ref->version (the latter is implicitly 0). ref->offset is set to 0 (an invalid offset),
+which is recognized by objects loader.
+*/
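+
+/*
+A worked example (illustrative): with /W [1 2 1] each entry takes 4 bytes, and the bytes
+01 03 E8 00 decode to f1 == 1 (in use), f2 == 0x03E8 == 1000 (offset) and f3 == 0 (version);
+with w1 == 0 the type f1 defaults to 1, as in the loader below.
+*/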
+
+#define XREF_STREAM_MAX_FIELD 4
+
+static ppxref * ppxref_load_stream (iof *I, ppdoc *pdf, size_t xrefoffset)
+{
+ ppxref *xref;
+ ppxsec *xrefsection;
+ ppref *ref;
+ ppobj *obj;
+ ppstream *xrefstream;
+ size_t streamoffset;
+ ppuint w1, w2, w3, w, bufferbytes;
+ uint8_t buffer[3 * XREF_STREAM_MAX_FIELD], *b;
+ ppuint first, count, f1, f2, f3;
+ pparray *fieldwidths, *sectionindices;
+ ppobj sectionmock[2], *sectionfirst, *sectioncount;
+ size_t sections, sectionindex, refindex;
+
+ if (!ppscan_skip_entry(I))
+ return NULL;
+ if ((obj = ppscan_obj(I, pdf, NULL)) == NULL)
+ return NULL;
+ ppstack_pop(&pdf->stack, 1);
+ if (obj->type != PPDICT || !ppscan_start_stream(I, pdf, &streamoffset))
+ return NULL;
+ xrefstream = ppstream_create(pdf, obj->dict, streamoffset);
+ ppstream_info(xrefstream, pdf);
+ if ((fieldwidths = ppdict_get_array(xrefstream->dict, "W")) != NULL)
+ {
+ if (!pparray_get_uint(fieldwidths, 0, &w1)) w1 = 0;
+ if (!pparray_get_uint(fieldwidths, 1, &w2)) w2 = 0;
+ if (!pparray_get_uint(fieldwidths, 2, &w3)) w3 = 0;
+ }
+ else
+ w1 = w2 = w3 = 0;
+ if (w1 > XREF_STREAM_MAX_FIELD || w2 > XREF_STREAM_MAX_FIELD || w3 > XREF_STREAM_MAX_FIELD)
+ return NULL;
+ bufferbytes = w1 + w2 + w3;
+ if ((sectionindices = ppdict_get_array(xrefstream->dict, "Index")) != NULL)
+ {
+ sections = sectionindices->size >> 1;
+ sectionfirst = sectionindices->data;
+ }
+ else
+ {
+ sections = 1;
+ sectionmock[0].type = PPINT;
+ sectionmock[0].integer = 0;
+ sectionmock[1].type = PPINT;
+ if (!ppdict_get_int(xrefstream->dict, "Size", &sectionmock[1].integer))
+ sectionmock[1].integer = 0;
+ sectionfirst = &sectionmock[0];
+ }
+ if ((I = ppstream_read(xrefstream, 1, 0)) == NULL)
+ return NULL; // we fseek() so original I is useless anyway
+ xref = ppxref_create(pdf, sections, xrefoffset);
+ if (pdf->xref == NULL) pdf->xref = xref;
+ xref->trailer.type = PPSTREAM;
+ xref->trailer.stream = xrefstream;
+ for (sectionindex = 0; sectionindex < sections; ++sectionindex, sectionfirst += 2)
+ {
+ sectioncount = sectionfirst + 1;
+ first = 0, count = 0; // quiet compiler warnings
+ if (!ppobj_get_uint(sectionfirst, first) || !ppobj_get_uint(sectioncount, count))
+ goto xref_stream_error;
+ if (count == 0)
+ continue;
+ xref->count += count;
+ xrefsection = NULL;
+ ref = (ppref *)ppstruct_take(&pdf->heap, count * sizeof(ppref));
+ for (refindex = 0; refindex < count; ++refindex, ++ref)
+ {
+ ref->xref = xref;
+ ref->number = first + refindex;
+ if (iof_read(I, buffer, bufferbytes) != bufferbytes)
+ goto xref_stream_error;
+ b = buffer;
+ if (w1 == 0)
+ f1 = 1; // default type is 1
+ else
+ for (f1 = 0, w = 0; w < w1; f1 = (f1 << 8)|(*b), ++w, ++b);
+ for (f2 = 0, w = 0; w < w2; f2 = (f2 << 8)|(*b), ++w, ++b);
+ for (f3 = 0, w = 0; w < w3; f3 = (f3 << 8)|(*b), ++w, ++b);
+ switch (f1)
+ {
+ case 0:
+ //--ref;
+ xrefsection = NULL;
+ --xref->count;
+ break;
+ case 1:
+ if (xrefsection == NULL)
+ {
+ xrefsection = ppxref_push_section(xref, &pdf->heap);
+ xrefsection->first = ref->number;
+ xrefsection->refs = ref;
+ }
+ xrefsection->last = ref->number;
+ ref->offset = f2;
+ ref->version = f3;
+ ref->object.type = PPNONE;
+ ref->object.any = NULL;
+ ref->length = 0;
+ break;
+ case 2:
+ if (xrefsection == NULL)
+ {
+ xrefsection = ppxref_push_section(xref, &pdf->heap);
+ xrefsection->first = ref->number;
+ xrefsection->refs = ref;
+ }
+ xrefsection->last = ref->number;
+ ref->offset = 0; // f2 is parent objstm, f3 is index in parent, both useless
+ ref->version = 0; // compressed objects have an implicit version == 0
+ ref->object.type = PPNONE;
+ ref->object.any = NULL;
+ ref->length = 0;
+ break;
+ default:
+ goto xref_stream_error;
+ }
+ }
+ }
+ /* sort sections */
+ ppxref_sort(xref); // case of xref->size == 0 handled by ppxref_load_chain()
+ /* close the stream _before_ loading prev xref */
+ ppstream_done(xrefstream);
+ /* load prev and return */
+ return ppxref_load_chain(pdf, xref);
+xref_stream_error:
+ ppstream_done(xrefstream);
+ return NULL;
+}
+
+/*
+The following procedure loads the xref /Prev, links xref->prev, and typically returns xref.
+Some docs contain an empty xref (one section with zero objects) that is actually a proxy
+to an xref stream referred to as /XRefStm (a genuine concept: old- and new-style xrefs in
+the same doc). In case of a 0-length xref we ignore the proxy and return the target xref
+(otherwise we would need an annoying sanity check for xref->size > 0 on every ref search).
+*/
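+
+/* For illustration (made-up numbers), a hybrid trailer may look like:
+
+     trailer
+     << /Size 100 /Root 1 0 R /Prev 117 /XRefStm 54321 >>
+
+   We follow /Prev only and deliberately ignore /XRefStm; if the table turns
+   out to be an empty proxy, the procedure below returns the previous xref. */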
+
+static ppxref * ppxref_load_chain (ppdoc *pdf, ppxref *xref)
+{
+ ppdict *trailer;
+ ppuint xrefoffset;
+ ppxref *prevxref, *nextxref;
+
+ trailer = ppxref_trailer(xref);
+ if (!ppdict_get_uint(trailer, "Prev", &xrefoffset)) // XRefStm is useless
+ return xref;
+ for (nextxref = pdf->xref; nextxref != NULL; nextxref = nextxref->prev)
+ if (nextxref->offset == xrefoffset) // insane
+ return NULL;
+ if ((prevxref = ppxref_load(pdf, (size_t)xrefoffset)) == NULL)
+ return NULL;
+ if (xref->size > 0)
+ {
+ xref->prev = prevxref;
+ return xref;
+ }
+ if (pdf->xref == xref)
+ pdf->xref = prevxref;
+ return prevxref;
+}
+
+static ppxref * ppxref_load (ppdoc *pdf, size_t xrefoffset)
+{
+ iof *I;
+
+ if ((I = ppdoc_reader(pdf, xrefoffset, PP_LENGTH_UNKNOWN)) == NULL)
+ return NULL;
+ ppscan_find(I);
+ if (ppscan_key(I, "xref"))
+ return ppxref_load_table(I, pdf, xrefoffset);
+ return ppxref_load_stream(I, pdf, xrefoffset);
+ // iof_close(I) does nothing here
+}
+
+static void ppoffmap_sort (ppref **left, ppref **right)
+{
+ ppref **l, **r, *t;
+ ppuint pivot;
+ l = left, r = right;
+ pivot = (*(l + ((r - l) / 2)))->offset;
+ do
+ { // don't reread the pivot through a pointer; elements get swapped below
+ while ((*l)->offset < pivot) ++l;
+ while ((*r)->offset > pivot) --r;
+ if (l <= r)
+ {
+ t = *l;
+ *l = *r;
+ *r = t;
+ ++l, --r;
+ }
+ } while (l <= r);
+ if (left < r)
+ ppoffmap_sort(left, r);
+ if (l < right)
+ ppoffmap_sort(l, right);
+}
+
+
+static void fix_trailer_references (ppdoc *pdf)
+{
+ ppxref *xref;
+ ppdict *trailer;
+ ppname **pkey;
+ ppobj *obj;
+ ppref *ref;
+ for (xref = pdf->xref; xref != NULL; xref = xref->prev)
+ {
+ if ((trailer = ppxref_trailer(xref)) == NULL)
+ continue;
+ for (ppdict_first(trailer, pkey, obj); *pkey != NULL; ppdict_next(pkey, obj))
+ { // no need to go deeper into structs; all items in the trailer except info and root must be direct refs
+ if (obj->type != PPREF)
+ continue;
+ ref = obj->ref;
+ if (ref->offset == 0) // unresolved?
+ if ((ref = ppxref_find(xref, ref->number)) != NULL)
+ obj->ref = ref; // at this moment the reference still points to nothing, but it should be the one with the proper offset
+ }
+ }
+}
+
+/*
+Here comes a procedure that loads all entries from all document bodies. We resolve references while
+parsing objects, and to make resolving correct we need a complete chain of xref maps and knowledge
+about a possible linearized dict (first offset). So loading refs sorted by offset makes sense (not sure
+if it matters nowadays, but we also avoid fseek() over large offsets).
+
+Here is the proc:
+
+ - create a list of all refs in all bodies
+ - sort the list by offsets
+ - for every ref from the sorted list:
+ - estimate object length to avoid fread-ing more than necessary (not perfect but enough)
+ - fseek() to the proper offset, fread() entry data or its part
+ - parse the object with ppscan_obj(I, pdf, xref), where xref is not necessarily top pdf->xref
+ (since v0.98 xref actually no longer matters, see xref_find() notes)
+ - save the actual ref->length (not used so far, but we keep that so..)
+ - make a stream if a dict is followed by "stream" keyword, also save the stream offset
+ - free the list
+
+PJ20180916: Luigi and Hans fixed a bug (rev 6491); a document had a stream with /Length being
+a reference that was stored in an /ObjStm, and therefore not yet resolved when caching the /Length key
+value as stream->offset (ppstream_info()). In the end references were resolved properly, but
+the stream was not readable; stream->offset == 0. Since rev 6491 ObjStm streams are loaded before
+other streams.
+*/
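+
+/* A minimal sketch of the length estimate used below (valid for sane, sorted
+   offsets): every ref is bounded by the next one, the last by the file size:
+
+     ref->length = next->offset - ref->offset;   // middle entries
+     ref->length = pdf->filesize - ref->offset;  // the last entry
+
+   The estimate may overshoot (trailing whitespace, comments), which is harmless. */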
+
+static int ppdoc_load_objstm (ppstream *stream, ppdoc *pdf, ppxref *xref);
+
+#define ppref_is_objstm(ref, stream, type) \
+ ((ref)->xref->trailer.type == PPSTREAM && (type = ppdict_get_name((stream)->dict, "Type")) != NULL && ppname_is(type, "ObjStm"))
+
+
+static void ppdoc_load_entries (ppdoc *pdf)
+{
+ size_t objects, sectionindex, refnumber, offindex;
+ size_t streams = 0, object_streams = 0, redundant_indirections = 0;
+ ppnum linearized;
+ ppref **offmap, **pref, *ref;
+ ppxref *xref;
+ ppxsec *xsec;
+ ppobj *obj;
+ ppname *type;
+ ppcrypt *crypt;
+ ppstream *stream;
+
+ if ((objects = (size_t)ppdoc_objects(pdf)) == 0) // can't happen
+ return;
+ pref = offmap = (ppref **)pp_malloc(objects * sizeof(ppref *));
+ objects = 0; // recount refs with offset > 0
+ for (xref = pdf->xref; xref != NULL; xref = xref->prev)
+ for (sectionindex = 0, xsec = xref->sects; sectionindex < xref->size; ++sectionindex, ++xsec)
+ for (refnumber = xsec->first, ref = xsec->refs; refnumber <= xsec->last; ++refnumber, ++ref)
+ if (ref->offset > 0) // 0 means compressed or insane
+ *pref++ = ref, ++objects;
+ ppoffmap_sort(offmap, offmap + objects - 1);
+
+ crypt = pdf->crypt;
+ for (offindex = 0, pref = offmap; offindex < objects; )
+ {
+ ref = *pref;
+ ++pref;
+ ++offindex;
+ if (ref->object.type != PPNONE) // might be preloaded already (/Encrypt dict, stream filter dicts, stream /Length..)
+ continue;
+ if (offindex < objects)
+ ref->length = (*pref)->offset - ref->offset;
+ else
+ ref->length = pdf->filesize > ref->offset ? pdf->filesize - ref->offset : 0;
+ if (crypt != NULL)
+ {
+ ppcrypt_start_ref(crypt, ref);
+ obj = ppdoc_load_entry(pdf, ref);
+ ppcrypt_end_ref(crypt);
+ }
+ else
+ {
+ obj = ppdoc_load_entry(pdf, ref);
+ }
+ switch (obj->type)
+ {
+ case PPDICT: /* Check if the object at first offset is linearized dict. We need that to resolve all references properly. */
+ if (offindex == 1 && ppdict_get_num(obj->dict, "Linearized", &linearized)) // /Linearized value is a version number, default 1.0
+ pdf->flags |= PPDOC_LINEARIZED;
+ break;
+ case PPSTREAM:
+ ++streams;
+ if (ppref_is_objstm(ref, obj->stream, type))
+ ++object_streams;
+ break;
+ case PPREF:
+ ++redundant_indirections;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* cut references pointing to references (rare); doing it for all of them effectively cuts insane chains */
+ for (pref = offmap; redundant_indirections > 0; )
+ {
+ ref = *pref++;
+ if (ref->object.type == PPREF)
+ {
+ --redundant_indirections;
+ ref->object = ref->object.ref->object;
+ }
+ }
+
+ /* load pdf 1.5 object streams _before_ other streams */
+ for (pref = offmap; object_streams > 0; )
+ {
+ ref = *pref++;
+ obj = &ref->object;
+ if (obj->type != PPSTREAM)
+ continue;
+ stream = obj->stream;
+ if (ppref_is_objstm(ref, stream, type))
+ {
+ --object_streams;
+ if (crypt != NULL)
+ {
+ ppcrypt_start_ref(crypt, ref);
+ ppstream_info(stream, pdf);
+ ppcrypt_end_ref(crypt);
+ }
+ else
+ {
+ ppstream_info(stream, pdf);
+ }
+ if (!ppdoc_load_objstm(stream, pdf, ref->xref))
+ loggerf("invalid objects stream %s at offset " PPSIZEF, ppref_str(ref->number, ref->version), ref->offset);
+
+ }
+ }
+
+ /* now handle other streams */
+ for (pref = offmap; streams > 0; )
+ {
+ ref = *pref++;
+ obj = &ref->object;
+ if (obj->type != PPSTREAM)
+ continue;
+ --streams;
+ stream = obj->stream;
+ if (crypt != NULL)
+ {
+ ppcrypt_start_ref(crypt, ref);
+ ppstream_info(stream, pdf);
+ ppcrypt_end_ref(crypt);
+ }
+ else
+ {
+ ppstream_info(stream, pdf);
+ }
+ }
+ pp_free(offmap);
+}
+
+ppobj * ppdoc_load_entry (ppdoc *pdf, ppref *ref)
+{
+ iof *I;
+ size_t length;
+ ppxref *xref;
+ ppobj *obj;
+ ppstack *stack;
+ size_t streamoffset;
+ ppref *refref;
+ ppuint refnumber, refversion;
+
+ length = ref->length > 0 ? ref->length : PP_LENGTH_UNKNOWN; // estimated or unknown
+ if ((I = ppdoc_reader(pdf, ref->offset, length)) == NULL || !ppscan_start_entry(I, ref))
+ {
+ loggerf("invalid %s offset " PPSIZEF, ppref_str(ref->number, ref->version), ref->offset);
+ return &ref->object; // PPNONE
+ }
+ stack = &pdf->stack;
+ xref = ref->xref;
+ if ((obj = ppscan_obj(I, pdf, xref)) == NULL)
+ {
+ loggerf("invalid %s object at offset " PPSIZEF, ppref_str(ref->number, ref->version), ref->offset);
+ return &ref->object; // PPNONE
+ }
+ ref->object = *obj;
+ ppstack_pop(stack, 1);
+ obj = &ref->object;
+ ref->length = ppdoc_reader_tell(pdf, I) - ref->offset;
+ if (obj->type == PPDICT)
+ {
+ if (ppscan_start_stream(I, pdf, &streamoffset))
+ {
+ obj->type = PPSTREAM;
+ obj->stream = ppstream_create(pdf, obj->dict, streamoffset);
+ }
+ }
+ else if (obj->type == PPINT)
+ {
+ ppscan_find(I);
+ if (ppscan_uint(I, &refversion) && ppscan_find(I) == 'R')
+ {
+ refnumber = (ppuint)obj->integer;
+ if ((refref = ppxref_find(xref, refnumber)) != NULL)
+ {
+ obj->type = PPREF;
+ obj->ref = refref;
+ }
+ else
+ {
+ obj->type = PPNONE; // as ppref_unresolved()
+ obj->any = NULL;
+ }
+ }
+ }
+ return obj;
+}
+
+/* Loading entries from object stream
+
+ /N is the number of contained entries
+ /First is the offset of the first item
+
+The stream consists of N pairs of numbers <objnum> <offset> <objnum> <offset> ...
+Offsets are ascending (relative to the first), but ref numbers order is arbitrary.
+PDF spec says there might be some additional data between objects, so we should obey offsets.
+Which means we should basically load the stream at once (may be needed anyway to grab the stream [...]).
+*/
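+
+/* A hypothetical stream with /N 3 /First 14 would start like:
+
+     12 0 13 54 14 110
+     <object 12> ... <object 13> ... <object 14>
+
+   i.e. object 12 lives at stream byte 14 + 0, object 13 at 14 + 54, and so on;
+   the loop below seeks to firstdata + offset for every index pair. */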
+
+static int ppdoc_load_objstm (ppstream *stream, ppdoc *pdf, ppxref *xref)
+{
+ ppdict *dict; // stream dict, actually still on stack
+ ppref *ref;
+ ppobj *obj;
+ ppuint items, firstoffset, offset, objnum, i, invalid = 0;
+ iof *I;
+ uint8_t *firstdata, *indexdata;
+ ppstack *stack;
+
+ dict = stream->dict;
+ if (!ppdict_rget_uint(dict, "N", &items) || !ppdict_rget_uint(dict, "First", &firstoffset))
+ return 0;
+ if ((I = ppstream_read(stream, 1, 1)) == NULL)
+ return 0;
+ firstdata = I->pos + firstoffset;
+ if (firstdata >= I->end)
+ goto invalid_objstm;
+ stack = &pdf->stack;
+ //if (pdf->crypt != NULL)
+ // ppcrypt_end_ref(pdf->crypt); // objects are not encrypted, pdf->crypt->ref ensured NULL
+ for (i = 0; i < items; ++i)
+ {
+ ppscan_find(I);
+ if (!ppscan_uint(I, &objnum))
+ goto invalid_objstm;
+ ppscan_find(I);
+ if (!ppscan_uint(I, &offset))
+ goto invalid_objstm;
+ if ((ref = ppxref_find_local(xref, objnum)) == NULL || ref->object.type != PPNONE)
+ {
+ loggerf("invalid compressed object number " PPUINTF " at position " PPUINTF, objnum, i);
+ ++invalid;
+ continue;
+ }
+ if (firstdata + offset >= I->end)
+ {
+ loggerf("invalid compressed object offset " PPUINTF " at position " PPUINTF, offset, i);
+ ++invalid;
+ continue;
+ }
+ indexdata = I->pos; // save position
+ I->pos = firstdata + offset; // go to the object
+ ppscan_find(I);
+ if ((obj = ppscan_obj(I, pdf, xref)) != NULL)
+ {
+ ref->object = *obj;
+ ppstack_pop(stack, 1);
+ // nothing more needed, as obj can never be indirect ref or stream
+ }
+ else
+ {
+ ++invalid;
+ loggerf("invalid compressed object %s at stream offset " PPUINTF, ppref_str(objnum, 0), offset);
+ }
+ I->pos = indexdata; // restore position and read next from index
+ }
+ ppstream_done(stream);
+ return invalid == 0;
+invalid_objstm:
+ ppstream_done(stream);
+ return 0;
+}
+
+/* main PDF loader proc */
+
+ppcrypt_status ppdoc_crypt_pass (ppdoc *pdf, const void *userpass, size_t userpasslength, const void *ownerpass, size_t ownerpasslength)
+{
+ switch (pdf->cryptstatus)
+ {
+ case PPCRYPT_NONE:
+ case PPCRYPT_DONE:
+ case PPCRYPT_FAIL:
+ break;
+ case PPCRYPT_PASS: // initial status or really needs password
+ pdf->cryptstatus = ppdoc_crypt_init(pdf, userpass, userpasslength, ownerpass, ownerpasslength);
+ switch (pdf->cryptstatus)
+ {
+ case PPCRYPT_NONE:
+ case PPCRYPT_DONE:
+ ppdoc_load_entries(pdf);
+ break;
+ case PPCRYPT_PASS: // user needs to check ppdoc_crypt_status() and recall ppdoc_crypt_pass() with the proper password
+ case PPCRYPT_FAIL: // hopeless..
+ break;
+ }
+ break;
+ }
+ return pdf->cryptstatus;
+}
+
+static ppdoc * ppdoc_read (ppdoc *pdf, iof_file *input)
+{
+ uint8_t header[PPDOC_HEADER];
+ size_t xrefoffset;
+
+ input = &pdf->input;
+ if (iof_file_read(header, 1, PPDOC_HEADER, input) != PPDOC_HEADER || !ppdoc_header(pdf, header))
+ return NULL;
+ if (!ppdoc_tail(pdf, input, &xrefoffset))
+ return NULL;
+ if (ppxref_load(pdf, xrefoffset) == NULL)
+ return NULL;
+ fix_trailer_references(pdf); // after loading xrefs but before accessing trailer refs (/Encrypt might be a reference)
+ // check encryption, if any, try empty password
+ switch (ppdoc_crypt_pass(pdf, "", 0, NULL, 0))
+ {
+ case PPCRYPT_NONE: // no encryption
+ case PPCRYPT_DONE: // encryption with an empty password
+ case PPCRYPT_PASS: // the user needs to check ppdoc_crypt_status() and call ppdoc_crypt_pass()
+ break;
+ case PPCRYPT_FAIL: // hopeless
+ //loggerf("decryption failed");
+ //return NULL;
+ break;
+ }
+ return pdf;
+}
+
+static void ppdoc_pages_init (ppdoc *pdf);
+
+/*
+20191214: We used to allocate ppdoc, as all other structs, from the internal heap:
+
+ ppheap heap;
+ ppheap_init(&heap);
+ pdf = (ppdoc *)ppstruct_take(&heap, sizeof(ppdoc));
+ pdf->heap = heap;
+ ppbytes_buffer_init(&pdf->heap);
+ ...
+
+So the ppdoc pdf was allocated from the heap owned by the pdf itself. Somewhat tricky, but it should work fine,
+as from that point nothing referred to the local heap variable's address. For some reason that caused a crash
+on openbsd.
+*/
+
+static ppdoc * ppdoc_create (iof_file *input)
+{
+ ppdoc *pdf;
+
+ pdf = (ppdoc *)pp_malloc(sizeof(ppdoc));
+ ppheap_init(&pdf->heap);
+ ppbytes_buffer_init(&pdf->heap);
+ ppstack_init(&pdf->stack, &pdf->heap);
+ ppdoc_reader_init(pdf, input);
+ ppdoc_pages_init(pdf);
+ pdf->xref = NULL;
+ pdf->crypt = NULL;
+ pdf->cryptstatus = PPCRYPT_PASS; // check on ppdoc_read() -> ppdoc_crypt_pass()
+ pdf->flags = 0;
+ pdf->version[0] = '\0';
+ if (ppdoc_read(pdf, &pdf->input) != NULL)
+ return pdf;
+ ppdoc_free(pdf);
+ return NULL;
+}
+
+ppdoc * ppdoc_load (const char *filename)
+{
+ FILE *file;
+ iof_file input;
+ if ((file = fopen(filename, "rb")) == NULL)
+ return NULL;
+ iof_file_init(&input, file);
+ input.flags |= IOF_CLOSE_FILE;
+ return ppdoc_create(&input);
+}
+
+ppdoc * ppdoc_filehandle (FILE *file, int closefile)
+{
+ iof_file input;
+ if (file == NULL)
+ return NULL;
+ iof_file_init(&input, file);
+ if (closefile)
+ input.flags |= IOF_CLOSE_FILE;
+ return ppdoc_create(&input);
+}
+
+ppdoc * ppdoc_mem (const void *data, size_t size)
+{
+ iof_file input;
+ iof_file_rdata_init(&input, data, size);
+ input.flags |= IOF_BUFFER_ALLOC;
+ return ppdoc_create(&input);
+}
+
+void ppdoc_free (ppdoc *pdf)
+{
+ iof_file_decref(&pdf->input);
+ ppstack_free_buffer(&pdf->stack);
+ ppheap_free(&pdf->heap);
+ pp_free(pdf);
+}
+
+ppcrypt_status ppdoc_crypt_status (ppdoc *pdf)
+{
+ return pdf->cryptstatus;
+}
+
+ppint ppdoc_permissions (ppdoc *pdf)
+{
+ return pdf->crypt != NULL ? pdf->crypt->permissions : (ppint)0xFFFFFFFFFFFFFFFF;
+}
+
+/* pages access */
+
+static pparray * pppage_node (ppdict *dict, ppuint *count, ppname **type)
+{
+ ppname **pkey, *key;
+ ppobj *obj;
+ pparray *kids = NULL;
+ *count = 0;
+ *type = NULL;
+ for (ppdict_first(dict, pkey, obj); (key = *pkey) != NULL; ppdict_next(pkey, obj))
+ {
+ switch (key->data[0])
+ {
+ case 'T':
+ if (ppname_is(key, "Type"))
+ *type = ppobj_get_name(obj);
+ break;
+ case 'C':
+ if (ppname_is(key, "Count"))
+ ppobj_rget_uint(obj, *count);
+ break;
+ case 'K':
+ if (ppname_is(key, "Kids"))
+ kids = ppobj_rget_array(obj);
+ break;
+ }
+ }
+ return kids;
+}
+
+#define ppname_is_page(type) (type != NULL && ppname_is(type, "Page"))
+
+ppuint ppdoc_page_count (ppdoc *pdf)
+{
+ ppref *ref;
+ ppname *type;
+ ppuint count;
+ if ((ref = ppxref_pages(pdf->xref)) == NULL)
+ return 0;
+ if (pppage_node(ref->object.dict, &count, &type) == NULL)
+ return ppname_is_page(type) ? 1 : 0; // acrobat and ghostscript accept documents with root /Pages entry being a reference to a sole /Page object
+ return count;
+}
+
+ppref * ppdoc_page (ppdoc *pdf, ppuint index)
+{
+ ppdict *dict;
+ ppuint count;
+ pparray *kids;
+ size_t size, i;
+ ppobj *r, *o;
+ ppref *ref;
+ ppname *type;
+
+ if ((ref = ppxref_pages(pdf->xref)) == NULL)
+ return NULL;
+ dict = ref->object.dict;
+ if ((kids = pppage_node(dict, &count, &type)) != NULL)
+ {
+ if (index < 1 || index > count)
+ return NULL;
+ }
+ else
+ {
+ return index == 1 && ppname_is_page(type) ? ref : NULL;
+ }
+scan_array:
+ if (index <= count / 2)
+ { // probably shorter way from the beginning
+ for (i = 0, size = kids->size, r = pparray_at(kids, 0); i < size; ++i, ++r)
+ {
+ if (r->type != PPREF)
+ return NULL;
+ o = &r->ref->object;
+ if (o->type != PPDICT)
+ return NULL;
+ dict = o->dict;
+ if ((kids = pppage_node(dict, &count, &type)) != NULL)
+ {
+ if (index <= count)
+ goto scan_array;
+ index -= count;
+ continue;
+ }
+ if (index == 1 && ppname_is_page(type))
+ return r->ref;
+ --index;
+ }
+ }
+ else if ((size = kids->size) > 0) // so that (size - 1) below is safe
+ { // probably shorter way from the end
+ index = count - index + 1;
+ for (i = 0, r = pparray_at(kids, size - 1); i < size; ++i, --r)
+ {
+ if (r->type != PPREF)
+ return NULL;
+ o = &r->ref->object;
+ if (o->type != PPDICT)
+ return NULL;
+ dict = o->dict;
+ if ((kids = pppage_node(dict, &count, &type)) != NULL)
+ {
+ if (index <= count) {
+ index = count - index + 1;
+ goto scan_array;
+ }
+ index -= count;
+ continue;
+ }
+ if (index == 1 && ppname_is_page(type))
+ return r->ref;
+ --index;
+ }
+ }
+ return NULL;
+}
+
+/*
+Through-pages iterator. Iterating over the pages tree just on the basis of /Kids and /Parent keys
+is inefficient, as to get the next pageref we would need to take the parent, find the pageref in /Kids,
+and take the next one (or go upper).. Annoying. We use a dedicated stack for the pages iterator. This could
+actually be done with pdf->stack, but some operations may clear it, so it is safer to keep it independent.
+Besides, its depth is constant (set on first use), so no need for allocs.
+*/
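+
+/* Typical use of the iterator (as exercised in pptest2.c):
+
+     ppref *pageref;
+     for (pageref = ppdoc_first_page(pdf); pageref != NULL; pageref = ppdoc_next_page(pdf))
+       process(pageref->object.dict); // process() is a placeholder
+*/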
+
+static void ppdoc_pages_init (ppdoc *pdf)
+{
+ pppages *pages;
+ pages = &pdf->pages;
+ pages->root = pages->parent = &pages->buffer[0];
+ pages->depth = 0;
+ pages->space = PPPAGES_STACK_DEPTH;
+}
+
+static ppkids * pppages_push (ppdoc *pdf, pparray *kids)
+{
+ ppkids *newroot, *bounds;
+ pppages *pages;
+ pages = &pdf->pages;
+ if (pages->depth == pages->space)
+ {
+ pages->space <<= 1;
+ newroot = (ppkids *)ppstruct_take(&pdf->heap, pages->space * sizeof(ppkids));
+ memcpy(newroot, pages->root, pages->depth * sizeof(ppkids));
+ pages->root = newroot;
+ }
+ bounds = pages->parent = &pages->root[pages->depth++];
+ bounds->current = pparray_at(kids, 0);
+ bounds->sentinel = pparray_at(kids, kids->size);
+ return bounds;
+}
+
+#define pppages_pop(pages) (--((pages)->parent), --((pages)->depth))
+
+static ppref * ppdoc_pages_group_first (ppdoc *pdf, ppref *ref)
+{
+ ppdict *dict;
+ pparray *kids;
+ ppuint count;
+ ppname *type;
+ ppobj *o;
+
+ dict = ref->object.dict; // typecheck made by callers
+ while ((kids = pppage_node(dict, &count, &type)) != NULL)
+ {
+ if ((o = pparray_get_obj(kids, 0)) == NULL) // empty /Kids
+ return ppdoc_next_page(pdf);
+ if ((ref = ppobj_get_ref(o)) == NULL || ref->object.type != PPDICT)
+ return NULL;
+ pppages_push(pdf, kids);
+ dict = ref->object.dict;
+ }
+ return ppname_is_page(type) ? ref : NULL;
+}
+
+ppref * ppdoc_first_page (ppdoc *pdf)
+{
+ ppref *ref;
+ pppages *pages;
+ if ((ref = ppdoc_pages(pdf)) == NULL)
+ return NULL;
+ pages = &pdf->pages;
+ pages->parent = pages->root;
+ pages->depth = 0;
+ return ppdoc_pages_group_first(pdf, ref);
+}
+
+ppref * ppdoc_next_page (ppdoc *pdf)
+{
+ pppages *pages;
+ ppkids *bounds;
+ ppref *ref;
+ ppobj *obj;
+ pages = &pdf->pages;
+ while (pages->depth > 0)
+ {
+ bounds = pages->parent;
+ obj = ++bounds->current;
+ if (obj < bounds->sentinel)
+ {
+ if (obj->type != PPREF)
+ return NULL;
+ ref = obj->ref;
+ if (ref->object.type != PPDICT)
+ return NULL;
+ return ppdoc_pages_group_first(pdf, ref);
+ }
+ else
+ { // no next node, go upper
+ pppages_pop(pages);
+ }
+ }
+ return NULL;
+}
+
+/* context */
+
+ppcontext * ppcontext_new (void)
+{
+ ppcontext *context;
+ context = (ppcontext *)pp_malloc(sizeof(ppcontext));
+ ppheap_init(&context->heap);
+ ppbytes_buffer_init(&context->heap);
+ ppstack_init(&context->stack, &context->heap);
+ return context;
+}
+
+void ppcontext_done (ppcontext *context)
+{
+ ppheap_renew(&context->heap);
+ ppstack_clear(&context->stack);
+}
+
+void ppcontext_free (ppcontext *context)
+{
+ ppstack_free_buffer(&context->stack);
+ ppheap_free(&context->heap);
+ pp_free(context);
+}
+
+/* page contents streams */
+
+//#define ppcontents_first_stream(array) pparray_rget_stream(array, 0)
+
+static ppstream * ppcontents_first_stream (pparray *array)
+{
+ size_t i;
+ ppobj *obj;
+ ppref *ref;
+ for (pparray_first(array, i, obj); i < array->size; pparray_next(i, obj))
+ if ((ref = ppobj_get_ref(obj)) != NULL && ref->object.type == PPSTREAM)
+ return ref->object.stream;
+ return NULL;
+}
+
+static ppstream * ppcontents_next_stream (pparray *array, ppstream *stream)
+{
+ size_t i;
+ ppobj *obj;
+ ppref *ref;
+ for (pparray_first(array, i, obj); i < array->size; pparray_next(i, obj))
+ if ((ref = ppobj_get_ref(obj)) != NULL && ref->object.type == PPSTREAM && ref->object.stream == stream)
+ if (++i < array->size && (ref = ppobj_get_ref(obj + 1)) != NULL && ref->object.type == PPSTREAM)
+ return ref->object.stream;
+ return NULL;
+}
+
+ppstream * ppcontents_first (ppdict *dict)
+{
+ ppobj *contentsobj;
+ if ((contentsobj = ppdict_rget_obj(dict, "Contents")) == NULL)
+ return NULL;
+ switch (contentsobj->type)
+ {
+ case PPARRAY:
+ return ppcontents_first_stream(contentsobj->array);
+ case PPSTREAM:
+ return contentsobj->stream;
+ default:
+ break;
+ }
+ return NULL;
+}
+
+ppstream * ppcontents_next (ppdict *dict, ppstream *stream)
+{
+ ppobj *contentsobj;
+ if ((contentsobj = ppdict_rget_obj(dict, "Contents")) == NULL)
+ return NULL;
+ switch (contentsobj->type)
+ {
+ case PPARRAY:
+ return ppcontents_next_stream(contentsobj->array, stream);
+ case PPSTREAM:
+ break;
+ default:
+ break;
+ }
+ return NULL;
+}
+
+static ppobj * ppcontents_op (iof *I, ppstack *stack, size_t *psize, ppname **pname)
+{
+ ppobj *obj;
+ ppstack_clear(stack);
+ do {
+ if (ppscan_find(I) < 0)
+ return NULL;
+ if ((obj = ppscan_psobj(I, stack)) == NULL)
+ return NULL;
+ } while (obj->type != PPNAME || !ppname_exec(obj->name));
+ *pname = obj->name;
+ *psize = stack->size - 1;
+ return stack->buf;
+}
+
+ppobj * ppcontents_first_op (ppcontext *context, ppstream *stream, size_t *psize, ppname **pname)
+{
+ iof *I;
+ if ((I = ppstream_read(stream, 1, 0)) == NULL)
+ return NULL;
+ return ppcontents_op(I, &context->stack, psize, pname);
+}
+
+ppobj * ppcontents_next_op (ppcontext *context, ppstream *stream, size_t *psize, ppname **pname)
+{
+ return ppcontents_op(ppstream_iof(stream), &context->stack, psize, pname);
+}
+
+ppobj * ppcontents_parse (ppcontext *context, ppstream *stream, size_t *psize)
+{
+ iof *I;
+ ppstack *stack;
+ ppobj *obj;
+ stack = &context->stack;
+ ppstack_clear(stack);
+ if ((I = ppstream_read(stream, 1, 0)) == NULL)
+ return NULL;
+ while (ppscan_find(I) >= 0)
+ if ((obj = ppscan_psobj(I, stack)) == NULL)
+ goto error;
+ *psize = stack->size;
+ ppstream_done(stream);
+ return stack->buf;
+error:
+ ppstream_done(stream);
+ return NULL;
+}
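+
+/* Example operator scan (as exercised in pptest2.c); psize receives the number
+   of operands on the stack, pname the operator name:
+
+     ppobj *obj; ppname *op; size_t size;
+     for (obj = ppcontents_first_op(context, stream, &size, &op); obj != NULL;
+          obj = ppcontents_next_op(context, stream, &size, &op))
+       ; // inspect op->data and the size operands in obj[0..size-1]
+     ppstream_done(stream);
+*/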
+
+/* boxes */
+
+pprect * pparray_to_rect (pparray *array, pprect *rect)
+{
+ ppobj *obj;
+ if (array->size != 4)
+ return NULL;
+ obj = pparray_at(array, 0);
+ if (!ppobj_get_num(obj, rect->lx)) return NULL;
+ obj = pparray_at(array, 1);
+ if (!ppobj_get_num(obj, rect->ly)) return NULL;
+ obj = pparray_at(array, 2);
+ if (!ppobj_get_num(obj, rect->rx)) return NULL;
+ obj = pparray_at(array, 3);
+ if (!ppobj_get_num(obj, rect->ry)) return NULL;
+ return rect;
+}
+
+pprect * ppdict_get_rect (ppdict *dict, const char *name, pprect *rect)
+{
+ pparray *array;
+ return (array = ppdict_rget_array(dict, name)) != NULL ? pparray_to_rect(array, rect) : NULL;
+}
+
+pprect * ppdict_get_box (ppdict *dict, const char *name, pprect *rect)
+{
+ do {
+ if (ppdict_get_rect(dict, name, rect) != NULL)
+ return rect;
+ dict = ppdict_rget_dict(dict, "Parent");
+ } while (dict != NULL);
+ return NULL;
+}
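+
+/* Example (cf. pptest2.c): fetch a page box, with the /Parent fallback handled
+   by ppdict_get_box() above:
+
+     pprect rect;
+     if (ppdict_get_box(pagedict, "MediaBox", &rect) != NULL)
+       printf("[%f %f %f %f]\n", rect.lx, rect.ly, rect.rx, rect.ry);
+*/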
+
+ppmatrix * pparray_to_matrix (pparray *array, ppmatrix *matrix)
+{
+ ppobj *obj;
+ if (array->size != 6)
+ return NULL;
+ obj = pparray_at(array, 0);
+ if (!ppobj_get_num(obj, matrix->xx)) return NULL;
+ obj = pparray_at(array, 1);
+ if (!ppobj_get_num(obj, matrix->xy)) return NULL;
+ obj = pparray_at(array, 2);
+ if (!ppobj_get_num(obj, matrix->yx)) return NULL;
+ obj = pparray_at(array, 3);
+ if (!ppobj_get_num(obj, matrix->yy)) return NULL;
+ obj = pparray_at(array, 4);
+ if (!ppobj_get_num(obj, matrix->x)) return NULL;
+ obj = pparray_at(array, 5);
+ if (!ppobj_get_num(obj, matrix->y)) return NULL;
+ return matrix;
+}
+
+ppmatrix * ppdict_get_matrix (ppdict *dict, const char *name, ppmatrix *matrix)
+{
+ pparray *array;
+ return (array = ppdict_rget_array(dict, name)) != NULL ? pparray_to_matrix(array, matrix) : NULL;
+}
+
+/* logger */
+
+void pplog_callback (pplogger_callback logger, void *alien)
+{
+ logger_callback((logger_function)logger, alien);
+}
+
+int pplog_prefix (const char *prefix)
+{
+ return logger_prefix(prefix);
+}
+
+/* version */
+
+const char * ppdoc_version_string (ppdoc *pdf)
+{
+ return pdf->version;
+}
+
+int ppdoc_version_number (ppdoc *pdf, int *minor)
+{
+ *minor = pdf->version[2] - '0';
+ return pdf->version[0] - '0';
+}
+
+/* doc info */
+
+size_t ppdoc_file_size (ppdoc *pdf)
+{
+ return pdf->filesize;
+}
+
+ppuint ppdoc_objects (ppdoc *pdf)
+{
+ ppuint count;
+ ppxref *xref;
+ for (count = 0, xref = pdf->xref; xref != NULL; xref = xref->prev)
+ count += xref->count;
+ return count;
+}
+
+size_t ppdoc_memory (ppdoc *pdf, size_t *waste)
+{
+ mem_info info;
+ size_t used;
+ ppbytes_heap_info(&pdf->heap, &info, 0);
+ ppstruct_heap_info(&pdf->heap, &info, 1);
+
+ *waste = info.ghosts + info.blockghosts + info.left; // info.ghosts == 0
+ used = info.used + *waste;
+ used += pdf->stack.space * sizeof(ppobj);
+ return used;
+}
diff --git a/source/luametatex/source/libraries/pplib/ppload.h b/source/luametatex/source/libraries/pplib/ppload.h
new file mode 100644
index 000000000..f9ecca3b9
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppload.h
@@ -0,0 +1,58 @@
+
+#ifndef PP_LOAD_H
+#define PP_LOAD_H
+
+typedef struct {
+ ppobj *buf; // ppobjects buffer (allocated, not from our heap)
+ ppobj *pos; // current ppobj *
+ size_t size; // stack size
+ size_t space; // available space
+ ppheap *heap; // allocator (parent pdf->stack->heap or parent context)
+} ppstack;
+
+typedef struct {
+ ppobj *current;
+ ppobj *sentinel;
+} ppkids;
+
+#define PPPAGES_STACK_DEPTH 4
+
+typedef struct {
+ ppkids buffer[PPPAGES_STACK_DEPTH];
+ ppkids *root;
+ ppkids *parent;
+ ppuint depth;
+ ppuint space;
+} pppages;
+
+struct ppdoc {
+ /* input */
+ iof_file input;
+ iof reader;
+ uint8_t *buffer;
+ size_t filesize;
+ /* heap */
+ ppheap heap;
+ ppstack stack;
+ /* output struct */
+ ppxref *xref;
+ pppages pages;
+ ppcrypt *crypt;
+ ppcrypt_status cryptstatus;
+ int flags;
+ char version[5];
+};
+
+#define PPDOC_LINEARIZED (1 << 0)
+
+ppobj * ppdoc_load_entry (ppdoc *pdf, ppref *ref);
+#define ppobj_preloaded(pdf, obj) ((obj)->type != PPREF ? (obj) : ((obj)->ref->object.type == PPNONE ? ppdoc_load_entry(pdf, (obj)->ref) : &(obj)->ref->object))
+
+ppstring * ppstring_internal (const void *data, size_t size, ppheap *heap);
+
+struct ppcontext {
+ ppheap heap;
+ ppstack stack;
+};
+
+#endif
\ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/ppstream.c b/source/luametatex/source/libraries/pplib/ppstream.c
new file mode 100644
index 000000000..c88d7e7fc
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppstream.c
@@ -0,0 +1,491 @@
+
+#include "ppfilter.h"
+#include "pplib.h"
+
+ppstream * ppstream_create (ppdoc *pdf, ppdict *dict, size_t offset)
+{
+ ppstream *stream;
+ stream = (ppstream *)ppstruct_take(&pdf->heap, sizeof(ppstream));
+ stream->dict = dict;
+ stream->offset = offset;
+ //if (!ppdict_rget_uint(dict, "Length", &stream->length)) // may be indirect pointing PPNONE at this moment
+ // stream->length = 0;
+ stream->length = 0;
+ stream->filespec = NULL;
+ stream->filter.filters = NULL;
+ stream->filter.params = NULL;
+ stream->filter.count = 0;
+ stream->input = &pdf->input;
+ stream->I = NULL;
+ stream->cryptkey = NULL;
+ stream->flags = 0;
+ return stream;
+}
+
+static iof * ppstream_predictor (ppdict *params, iof *N)
+{
+ ppint predictor, rowsamples, components, samplebits;
+
+ if (!ppdict_get_int(params, "Predictor", &predictor) || predictor <= 1)
+ return N;
+ if (!ppdict_get_int(params, "Columns", &rowsamples) || rowsamples == 0) // sanity, filter probably expects >0
+ rowsamples = 1;
+ if (!ppdict_get_int(params, "Colors", &components) || components == 0) // ditto
+ components = 1;
+ if (!ppdict_get_int(params, "BitsPerComponent", &samplebits) || samplebits == 0)
+ samplebits = 8;
+ return iof_filter_predictor_decoder(N, (int)predictor, (int)rowsamples, (int)components, (int)samplebits);
+}
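+
+/* A common real-world case (hypothetical values, not from any file here):
+   flate-compressed xref streams typically carry
+   /DecodeParms << /Predictor 12 /Columns 4 >>, i.e. the PNG "Up" predictor
+   over 4-byte rows, with /Colors and /BitsPerComponent left at the defaults
+   assumed above (1 and 8). */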
+
+static iof * ppstream_decoder (ppstream *stream, ppstreamtp filtertype, ppdict *params, iof *N)
+{
+ int flags;
+ iof *F, *P;
+ ppint earlychange;
+ ppstring *cryptkey;
+
+ switch (filtertype)
+ {
+ case PPSTREAM_BASE16:
+ return iof_filter_base16_decoder(N);
+ case PPSTREAM_BASE85:
+ return iof_filter_base85_decoder(N);
+ case PPSTREAM_RUNLENGTH:
+ return iof_filter_runlength_decoder(N);
+ case PPSTREAM_FLATE:
+ if ((F = iof_filter_flate_decoder(N)) != NULL)
+ {
+ if (params != NULL)
+ {
+ if ((P = ppstream_predictor(params, F)) != NULL)
+ return P;
+ iof_close(F);
+ break;
+ }
+ return F;
+ }
+ break;
+ case PPSTREAM_LZW:
+ flags = LZW_DECODER_DEFAULTS;
+ if (params != NULL && ppdict_get_int(params, "EarlyChange", &earlychange) && earlychange == 0) // integer, not boolean
+ flags &= ~LZW_EARLY_INDEX;
+ if ((F = iof_filter_lzw_decoder(N, flags)) != NULL)
+ {
+ if (params != NULL)
+ {
+ if ((P = ppstream_predictor(params, F)) != NULL)
+ return P;
+ iof_close(F);
+ break;
+ }
+ return F;
+ }
+ break;
+ case PPSTREAM_CRYPT:
+ if ((cryptkey = stream->cryptkey) == NULL)
+ return N; // /Identity crypt
+ if (stream->flags & PPSTREAM_ENCRYPTED_AES)
+ return iof_filter_aes_decoder(N, cryptkey->data, cryptkey->size);
+ if (stream->flags & PPSTREAM_ENCRYPTED_RC4)
+ return iof_filter_rc4_decoder(N, cryptkey->data, cryptkey->size);
+ return NULL; // if neither AES nor RC4 but a cryptkey is present, something went wrong; see ppstream_info()
+ case PPSTREAM_CCITT:
+ case PPSTREAM_DCT:
+ case PPSTREAM_JBIG2:
+ case PPSTREAM_JPX:
+ break;
+ }
+ return NULL;
+}
+
+#define ppstream_source(stream) iof_filter_stream_coreader((iof_file *)((stream)->input), (size_t)((stream)->offset), (size_t)((stream)->length))
+#define ppstream_auxsource(filename) iof_filter_file_reader(filename)
+
+static ppname * ppstream_get_filter_name (ppobj *filterobj, size_t index)
+{
+ if (filterobj->type == PPNAME)
+ return index == 0 ? filterobj->name : NULL;
+ if (filterobj->type == PPARRAY)
+ return pparray_get_name(filterobj->array, index);
+ return NULL;
+}
+
+static ppdict * ppstream_get_filter_params (ppobj *paramsobj, size_t index)
+{
+ if (paramsobj->type == PPDICT)
+ return index == 0 ? paramsobj->dict : NULL;
+ if (paramsobj->type == PPARRAY)
+ return pparray_rget_dict(paramsobj->array, index);
+ return NULL;
+}
+
+static const char * ppstream_aux_filename (ppobj *filespec)
+{ // mockup, here we should decode the string
+ if (filespec->type == PPSTRING)
+ {
+ return (const char *)(filespec->string);
+ }
+ // else might be a dict - todo
+ return NULL;
+}
+
+#define ppstream_image_filter(fcode) (fcode == PPSTREAM_DCT || fcode == PPSTREAM_CCITT || fcode == PPSTREAM_JBIG2 || fcode == PPSTREAM_JPX)
+
+iof * ppstream_read (ppstream *stream, int decode, int all)
+{
+ iof *I, *F;
+ ppstreamtp *filtertypes, filtertype;
+ int owncrypt;
+ ppdict **filterparams, *fparams;
+ size_t index, filtercount;
+ const char *filename;
+
+ if (ppstream_iof(stream) != NULL)
+ return NULL; // usage error
+
+ if (stream->filespec != NULL)
+ {
+ filename = ppstream_aux_filename(stream->filespec); // mockup, basic support
+ I = filename != NULL ? ppstream_auxsource(filename) : NULL;
+ }
+ else
+ {
+ I = ppstream_source(stream);
+ }
+ if (I == NULL)
+ return NULL;
+
+ /* If the stream is encrypted, decipher is the first to be applied */
+ owncrypt = (stream->flags & PPSTREAM_ENCRYPTED_OWN) != 0;
+ if (!owncrypt)
+ {
+ if (stream->cryptkey != NULL && stream->filespec == NULL)
+ { /* implied global crypt; does not apply to external files (pdf spec page 115), except for embedded file streams (not supported so far) */
+ if ((F = ppstream_decoder(stream, PPSTREAM_CRYPT, NULL, I)) == NULL)
+ goto stream_error;
+ I = F;
+ } /* otherwise no crypt at all or /Identity */
+ }
+
+ if (decode || owncrypt)
+ {
+ if ((filtercount = stream->filter.count) > 0)
+ {
+ filtertypes = stream->filter.filters;
+ filterparams = stream->filter.params;
+ for (index = 0; index < filtercount; ++index)
+ {
+ fparams = filterparams != NULL ? filterparams[index] : NULL;
+ filtertype = filtertypes[index];
+ if ((F = ppstream_decoder(stream, filtertype, fparams, I)) != NULL)
+ {
+ I = F;
+ if (owncrypt && !decode && filtertype == PPSTREAM_CRYPT)
+ break; // /Crypt filter should always be first, so in practice we return decrypted but still compressed data
+ continue;
+ }
+ if (!ppstream_image_filter(filtertype))
+ goto stream_error; // failed to create non-image filter, something unexpected
+ break;
+ }
+ }
+ }
+ if (all)
+ iof_load(I);
+ else
+ iof_input(I);
+ stream->I = I;
+ return I;
+stream_error:
+ iof_close(I);
+ return NULL;
+}
+
+uint8_t * ppstream_first (ppstream *stream, size_t *size, int decode)
+{
+ iof *I;
+ if ((I = ppstream_read(stream, decode, 0)) != NULL)
+ {
+ *size = (size_t)iof_left(I);
+ return I->pos;
+ }
+ *size = 0;
+ return NULL;
+}
+
+uint8_t * ppstream_next (ppstream *stream, size_t *size)
+{
+ iof *I;
+ if ((I = ppstream_iof(stream)) != NULL)
+ {
+ I->pos = I->end;
+ if ((*size = iof_input(I)) > 0)
+ return I->pos;
+ }
+ *size = 0;
+ return NULL;
+}
+
+uint8_t * ppstream_all (ppstream *stream, size_t *size, int decode)
+{
+ iof *I;
+ if ((I = ppstream_read(stream, decode, 1)) != NULL)
+ {
+ *size = (size_t)iof_left(I);
+ return I->pos;
+ }
+ *size = 0;
+ return NULL;
+}
+
+void ppstream_done (ppstream *stream)
+{
+ iof *I;
+ if ((I = ppstream_iof(stream)) != NULL)
+ {
+ iof_close(I);
+ stream->I = NULL;
+ }
+}
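+
+/* The chunked reading idiom (as exercised in pptest2.c and pptest3.c):
+
+     uint8_t *data; size_t size;
+     for (data = ppstream_first(stream, &size, 1); data != NULL;
+          data = ppstream_next(stream, &size))
+       fwrite(data, size, 1, fh); // fh is a caller-provided FILE *
+     ppstream_done(stream);
+*/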
+
+/* fetching stream info
+PJ20180916: it turned out to make sense to do a little bit more just after parsing the stream entry, to simplify
+stream operations and extend the ppstream api.
+*/
+
+/* stream filters */
+
+const char * ppstream_filter_name[] = {
+ "ASCIIHexDecode",
+ "ASCII85Decode",
+ "RunLengthDecode",
+ "FlateDecode",
+ "LZWDecode",
+ "CCITTFaxDecode",
+ "DCTDecode",
+ "JBIG2Decode",
+ "JPXDecode",
+ "Crypt"
+};
+
+int ppstream_filter_type (ppname *name, ppstreamtp *filtertype)
+{
+ switch (name->data[0])
+ {
+ case 'A':
+ if (ppname_is(name, "ASCIIHexDecode")) { *filtertype = PPSTREAM_BASE16; return 1; }
+ if (ppname_is(name, "ASCII85Decode")) { *filtertype = PPSTREAM_BASE85; return 1; }
+ break;
+ case 'R':
+ if (ppname_is(name, "RunLengthDecode")) { *filtertype = PPSTREAM_RUNLENGTH; return 1; }
+ break;
+ case 'F':
+ if (ppname_is(name, "FlateDecode")) { *filtertype = PPSTREAM_FLATE; return 1; }
+ break;
+ case 'L':
+ if (ppname_is(name, "LZWDecode")) { *filtertype = PPSTREAM_LZW; return 1; }
+ break;
+ case 'D':
+ if (ppname_is(name, "DCTDecode")) { *filtertype = PPSTREAM_DCT; return 1; }
+ break;
+ case 'C':
+ if (ppname_is(name, "CCITTFaxDecode")) { *filtertype = PPSTREAM_CCITT; return 1; }
+ if (ppname_is(name, "Crypt")) { *filtertype = PPSTREAM_CRYPT; return 1; }
+ break;
+ case 'J':
+ if (ppname_is(name, "JPXDecode")) { *filtertype = PPSTREAM_JPX; return 1; }
+ if (ppname_is(name, "JBIG2Decode")) { *filtertype = PPSTREAM_JBIG2; return 1; }
+ break;
+ }
+ return 0;
+}
+
+void ppstream_info (ppstream *stream, ppdoc *pdf)
+{ // called in ppdoc_load_entries() for every stream, but after loading non-stream objects (e.g. /Length..)
+ ppdict *dict, *fparams;
+ ppobj *fobj, *pobj;
+ ppname *fname, *tname, *owncryptfilter = NULL;
+ ppcrypt *crypt;
+ ppref *ref;
+ size_t i;
+ int cflags;
+
+ ppstreamtp *filtertypes = NULL, filtertype;
+ ppdict **filterparams = NULL;
+ size_t filtercount = 0, farraysize = 0;
+
+ const char *filterkey, *paramskey;
+
+ dict = stream->dict;
+ ppdict_rget_uint(dict, "Length", &stream->length);
+
+ if ((stream->filespec = ppdict_get_obj(dict, "F")) != NULL)
+ {
+ stream->flags |= PPSTREAM_NOT_SUPPORTED;
+ filterkey = "FFilter", paramskey = "FDecodeParms";
+ }
+ else
+ filterkey = "Filter", paramskey = "DecodeParms";
+
+ if ((fobj = ppdict_rget_obj(dict, filterkey)) != NULL)
+ {
+ switch (fobj->type)
+ {
+ case PPNAME:
+ farraysize = 1;
+ break;
+ case PPARRAY:
+ farraysize = fobj->array->size;
+ break;
+ default:
+ break;
+ }
+ if (farraysize > 0)
+ {
+ filtertypes = (ppstreamtp *)ppstruct_take(&pdf->heap, farraysize * sizeof(ppstreamtp));
+ if ((pobj = ppdict_rget_obj(dict, paramskey)) != NULL)
+ {
+ filterparams = (ppdict **)ppstruct_take(&pdf->heap, farraysize * sizeof(ppdict *));
+ }
+ for (i = 0; i < farraysize; ++i)
+ {
+ if ((fname = ppstream_get_filter_name(fobj, i)) != NULL && ppstream_filter_type(fname, &filtertype))
+ {
+ filtertypes[filtercount] = filtertype;
+ if (pobj != NULL)
+ {
+ fparams = ppstream_get_filter_params(pobj, i);
+ filterparams[filtercount] = fparams;
+ }
+ else
+ fparams = NULL;
+ switch (filtertype)
+ {
+ case PPSTREAM_BASE16:
+ case PPSTREAM_BASE85:
+ case PPSTREAM_RUNLENGTH:
+ case PPSTREAM_FLATE:
+ case PPSTREAM_LZW:
+ stream->flags |= PPSTREAM_FILTER;
+ break;
+ case PPSTREAM_CCITT:
+ case PPSTREAM_DCT:
+ case PPSTREAM_JBIG2:
+ case PPSTREAM_JPX:
+ stream->flags |= PPSTREAM_IMAGE;
+ break;
+ case PPSTREAM_CRYPT:
+ stream->flags |= PPSTREAM_ENCRYPTED_OWN;
+ owncryptfilter = fparams != NULL ? ppdict_get_name(fparams, "Name") : NULL; // /Type /CryptFilterDecodeParms /Name ...
+ if (i != 0) // we assume it is first
+ stream->flags |= PPSTREAM_NOT_SUPPORTED;
+ break;
+ }
+ ++filtercount;
+ }
+ else
+ {
+ stream->flags |= PPSTREAM_NOT_SUPPORTED;
+ }
+ }
+ }
+ }
+ stream->filter.filters = filtertypes;
+ stream->filter.params = filterparams;
+ stream->filter.count = filtercount;
+
+ if ((crypt = pdf->crypt) == NULL || (ref = crypt->ref) == NULL)
+ return;
+ if (stream->flags & PPSTREAM_ENCRYPTED_OWN)
+ {
+ /* It seems a common habit to use just the /Crypt filter name with no params, which defaults to /Identity.
+ A real example with uncompressed metadata: <</Filter[/Crypt]/Length 4217/Subtype/XML/Type/Metadata>> */
+ if (owncryptfilter != NULL && !ppname_is(owncryptfilter, "Identity") && stream->filespec == NULL) // ?
+ {
+ if (crypt->map != NULL && ppcrypt_type(crypt, owncryptfilter, NULL, &cflags))
+ {
+ if (cflags & PPCRYPT_INFO_AES)
+ stream->flags |= PPSTREAM_ENCRYPTED_AES;
+ else if (cflags & PPCRYPT_INFO_RC4)
+ stream->flags |= PPSTREAM_ENCRYPTED_RC4;
+ }
+ }
+ }
+ else
+ {
+ if ((crypt->flags & PPCRYPT_NO_METADATA) && (tname = ppdict_get_name(dict, "Type")) != NULL && ppname_is(tname, "Metadata"))
+ ; /* special treatment of metadata stream; we assume that explicit /Filter /Crypt setup overrides document level setup of EncryptMetadata. */
+ else if (stream->filespec == NULL) /* external files are not encrypted, except embedded files (not supported yet) */
+ {
+ if (crypt->flags & PPCRYPT_STREAM_RC4)
+ stream->flags |= PPSTREAM_ENCRYPTED_RC4;
+ else if (crypt->flags & PPCRYPT_STREAM_AES)
+ stream->flags |= PPSTREAM_ENCRYPTED_AES;
+ }
+ }
+
+ /* finally, if the stream is encrypted with non-identity crypt (implicit or explicit), make and save the crypt key */
+ if (stream->flags & PPSTREAM_ENCRYPTED)
+ stream->cryptkey = ppcrypt_stmkey(crypt, ref, ((stream->flags & PPSTREAM_ENCRYPTED_AES) != 0), &pdf->heap);
+}
+
+void ppstream_filter_info (ppstream *stream, ppstream_filter *info, int decode)
+{
+ size_t from, index;
+ ppstreamtp filtertype;
+ ppdict *params;
+
+ *info = stream->filter;
+ if (info->count > 0)
+ {
+ from = (stream->flags & PPSTREAM_ENCRYPTED_OWN) && info->filters[0] == PPSTREAM_CRYPT ? 1 : 0;
+ if (decode)
+ {
+ for (index = from; index < info->count; ++index)
+ {
+ filtertype = info->filters[index];
+ if (ppstream_image_filter(filtertype))
+ {
+ break;
+ }
+ }
+ }
+ else
+ {
+ index = from;
+ }
+ if (index > 0) {
+ info->count -= index;
+ if (info->count > 0)
+ {
+ info->filters += index;
+ if (info->params != NULL)
+ {
+ info->params += index;
+ for (index = 0, params = NULL; index < info->count; ++index)
+ if ((params = info->params[index]) != NULL)
+ break;
+ if (params == NULL)
+ info->params = NULL;
+ }
+ }
+ else
+ {
+ info->filters = NULL;
+ info->params = NULL;
+ }
+ }
+ }
+}
+
+/* */
+
+void ppstream_init_buffers (void)
+{
+ iof_filters_init();
+}
+
+void ppstream_free_buffers (void)
+{
+ iof_filters_free();
+}
diff --git a/source/luametatex/source/libraries/pplib/ppstream.h b/source/luametatex/source/libraries/pplib/ppstream.h
new file mode 100644
index 000000000..37e34c56a
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppstream.h
@@ -0,0 +1,10 @@
+
+#ifndef PP_STREAM_H
+#define PP_STREAM_H
+
+ppstream * ppstream_create (ppdoc *pdf, ppdict *dict, size_t offset);
+iof * ppstream_read (ppstream *stream, int decode, int all);
+#define ppstream_iof(stream) ((iof *)((stream)->I))
+void ppstream_info (ppstream *stream, ppdoc *pdf);
+
+#endif
\ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/pptest1.c b/source/luametatex/source/libraries/pplib/pptest1.c
new file mode 100644
index 000000000..eabb0eae9
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/pptest1.c
@@ -0,0 +1,104 @@
+
+#include <stdio.h>
+#include "ppapi.h"
+#include "util/utiliof.h"
+
+static const char * sizenum (size_t s)
+{
+ static char buffer[32];
+ if (s < 1000)
+ sprintf(buffer, "%uB", (unsigned)s);
+ else if (s < 1000000)
+ sprintf(buffer, "%.2fkB", (double)(s) / 1000);
+ else
+ sprintf(buffer, "%.2fMB", (double)(s) / 1000000);
+ return buffer;
+}
+
+static const char * crypt_info (ppdoc *pdf)
+{
+ switch (ppdoc_crypt_status(pdf))
+ {
+ case PPCRYPT_NONE:
+ return "none";
+ case PPCRYPT_DONE:
+ return "empty password";
+ case PPCRYPT_PASS:
+ return "nonempty password";
+ default:
+ break;
+ }
+ return "this shouldn't happen";
+}
+
+static void print_info (ppdoc *pdf)
+{
+ ppdict *info;
+ ppstring *creator, *producer;
+ size_t memused, memwaste;
+
+ if ((info = ppdoc_info(pdf)) != NULL)
+ {
+ if ((creator = ppdict_rget_string(info, "Creator")) != NULL)
+ printf(" creator: %s\n", ppstring_decoded_data(creator));
+ if ((producer = ppdict_rget_string(info, "Producer")) != NULL)
+ printf(" producer: %s\n", ppstring_decoded_data(producer));
+ }
+ printf(" version: %s\n", ppdoc_version_string(pdf));
+ printf(" protection: %s\n", crypt_info(pdf));
+ printf(" filesize: %s\n", sizenum(ppdoc_file_size(pdf)));
+ printf(" objects: %lu\n", (unsigned long)ppdoc_objects(pdf));
+ printf(" pagecount: %lu\n", (unsigned long)ppdoc_page_count(pdf));
+ memused = ppdoc_memory(pdf, &memwaste);
+ printf(" memused: %s\n", sizenum(memused));
+ printf(" memwaste: %s\n", sizenum(memwaste));
+}
+
+static int usage (const char *argv0)
+{
+ printf("pplib " pplib_version ", " pplib_author "\n");
+ printf("usage: %s file1.pdf file2.pdf ...\n", argv0);
+ return 0;
+}
+
+int main (int argc, const char **argv)
+{
+ const char *filepath;
+ int a;
+ ppdoc *pdf;
+ const void *data;
+ size_t size;
+
+ if (argc < 2)
+ return usage(argv[0]);
+ for (a = 1; a < argc; ++a)
+ {
+ filepath = argv[a];
+ printf("loading %s... ", filepath);
+ pdf = ppdoc_load(filepath);
+ if (pdf == NULL)
+ {
+ printf("failed\n");
+ continue;
+ }
+ printf("done.\n");
+ print_info(pdf);
+ ppdoc_free(pdf);
+ /* now loading from memory buffer */
+ printf("loading %s from mem buffer... ", filepath);
+ data = iof_copy_file_data(filepath, &size);
+ if (data != NULL)
+ {
+ pdf = ppdoc_mem(data, size);
+ if (pdf == NULL)
+ {
+ printf("failed\n");
+ continue;
+ }
+ printf("done.\n");
+ //print_info(pdf);
+ ppdoc_free(pdf);
+ }
+ }
+ return 0;
+}
diff --git a/source/luametatex/source/libraries/pplib/pptest2.c b/source/luametatex/source/libraries/pplib/pptest2.c
new file mode 100644
index 000000000..5dff63afd
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/pptest2.c
@@ -0,0 +1,170 @@
+
+#include <stdio.h>
+#include <assert.h>
+#include "ppapi.h"
+
+/*
+static const char * get_file_name (const char *path)
+{
+ const char *fn, *p;
+ for (fn = p = path; *p != '\0'; ++p)
+ if (*p == '\\' || *p == '/')
+ fn = p + 1;
+ return fn;
+}
+*/
+
+static void box_info (ppdict *pagedict, FILE *fh)
+{
+ const char *boxes[] = {"MediaBox", "CropBox", "BleedBox", "TrimBox", "ArtBox"};
+ pprect rect;
+ size_t i;
+ for (i = 0; i < sizeof(boxes) / sizeof(const char *); ++i)
+ if (ppdict_get_box(pagedict, boxes[i], &rect))
+ fprintf(fh, "%%%% %s [%f %f %f %f]\n", boxes[i], rect.lx, rect.ly, rect.rx, rect.ry);
+}
+
+static int usage (const char *argv0)
+{
+ printf("pplib " pplib_version ", " pplib_author "\n");
+ printf("usage: %s file1.pdf file2.pdf ...\n", argv0);
+ printf(" %s file.pdf -u userpassword\n", argv0);
+ printf(" %s file.pdf -o ownerpassword\n", argv0);
+ printf(" %s file.pdf -p bothpasswords\n", argv0);
+ return 0;
+}
+
+static void log_callback (const char *message, void *alien)
+{
+ fprintf((FILE *)alien, "\nooops: %s\n", message);
+}
+
+static const char * get_next_argument (const char *opt, int *a, int argc, const char **argv)
+{
+ const char *next;
+ if ((*a) + 2 < argc)
+ {
+ next = argv[*a + 1];
+ if (strcmp(next, opt) == 0)
+ {
+ *a += 2;
+ return argv[*a];
+ }
+ }
+ return NULL;
+}
+
+int main (int argc, const char **argv)
+{
+ const char *filepath, *password;
+ int a;
+ ppdoc *pdf;
+ ppcrypt_status cryptstatus;
+ ppref *pageref;
+ ppdict *pagedict;
+ int pageno;
+ char outname[1024];
+ FILE *fh;
+ ppstream *stream;
+ uint8_t *data;
+ size_t size;
+ ppcontext *context;
+ ppobj *obj;
+ ppname *op;
+ size_t operators;
+
+ if (argc < 2)
+ return usage(argv[0]);
+ ppstream_init_buffers();
+ pplog_callback(log_callback, stderr);
+ context = ppcontext_new();
+ for (a = 1; a < argc; ++a)
+ {
+ /* load */
+ filepath = argv[a];
+ printf("loading %s... ", filepath);
+ pdf = ppdoc_load(filepath);
+ if (pdf == NULL)
+ {
+ printf("failed\n");
+ continue;
+ }
+ printf("done\n");
+
+ /* decrypt */
+ if ((password = get_next_argument("-u", &a, argc, argv)) != NULL)
+ cryptstatus = ppdoc_crypt_pass(pdf, password, strlen(password), NULL, 0);
+ else if ((password = get_next_argument("-o", &a, argc, argv)) != NULL)
+ cryptstatus = ppdoc_crypt_pass(pdf, NULL, 0, password, strlen(password));
+ else if ((password = get_next_argument("-p", &a, argc, argv)) != NULL)
+ cryptstatus = ppdoc_crypt_pass(pdf, password, strlen(password), password, strlen(password));
+ else
+ cryptstatus = ppdoc_crypt_status(pdf);
+ switch (cryptstatus)
+ {
+ case PPCRYPT_NONE:
+ break;
+ case PPCRYPT_DONE:
+ printf("opened with password '%s'\n", password != NULL ? password : "");
+ break;
+ case PPCRYPT_PASS:
+ printf("invalid password\n");
+ ppdoc_free(pdf);
+ continue;
+ case PPCRYPT_FAIL:
+ printf("invalid encryption\n");
+ ppdoc_free(pdf);
+ continue;
+ }
+
+ /* process */
+ sprintf(outname, "%s.out", filepath);
+ fh = fopen(outname, "wb");
+ if (fh == NULL)
+ {
+ printf("can't open %s for writing\n", outname);
+ continue;
+ }
+ for (pageref = ppdoc_first_page(pdf), pageno = 1;
+ pageref != NULL;
+ pageref = ppdoc_next_page(pdf), ++pageno)
+ {
+ pagedict = pageref->object.dict;
+ /* decompress contents data */
+ fprintf(fh, "%%%% PAGE %d\n", pageno);
+ box_info(pagedict, fh);
+ for (stream = ppcontents_first(pagedict);
+ stream != NULL;
+ stream = ppcontents_next(pagedict, stream))
+ {
+ for (data = ppstream_first(stream, &size, 1);
+ data != NULL;
+ data = ppstream_next(stream, &size))
+ fwrite(data, size, 1, fh);
+ ppstream_done(stream);
+ }
+ /* now parse contents */
+ for (stream = ppcontents_first(pagedict);
+ stream != NULL;
+ stream = ppcontents_next(pagedict, stream))
+ {
+ operators = 0;
+ for (obj = ppcontents_first_op(context, stream, &size, &op);
+ obj != NULL;
+ obj = ppcontents_next_op(context, stream, &size, &op))
+ ++operators;
+ fprintf(fh, "%%%% OPERATORS count %lu\n", (unsigned long)operators);
+ ppstream_done(stream);
+ //obj = ppcontents_parse(context, stream, &size);
+ //fprintf(fh, "%%%% items count %lu\n", (unsigned long)size);
+ fprintf(fh, "\n");
+ }
+ ppcontext_done(context);
+ }
+ fclose(fh);
+ ppdoc_free(pdf);
+ }
+ ppcontext_free(context);
+ ppstream_free_buffers();
+ return 0;
+}
diff --git a/source/luametatex/source/libraries/pplib/pptest3.c b/source/luametatex/source/libraries/pplib/pptest3.c
new file mode 100644
index 000000000..815ed51b6
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/pptest3.c
@@ -0,0 +1,123 @@
+
+#include <stdio.h>
+//#include "ppapi.h"
+#include "pplib.h"
+#include "assert.h"
+
+static int usage (const char *argv0)
+{
+ printf("pplib " pplib_version ", " pplib_author "\n");
+ printf("usage: %s file1.pdf file2.pdf ...\n", argv0);
+ return 0;
+}
+
+static void print_result_filter (ppstream *stream, int decode)
+{
+ ppstream_filter info;
+ size_t i;
+
+ ppstream_filter_info(stream, &info, decode);
+ printf(" when %s: /Filter [", decode ? "uncompressed" : "compressed");
+ for (i = 0; i < info.count; ++i)
+ printf(" /%s", ppstream_filter_name[info.filters[i]]);
+ printf(" ]");
+ if (info.params != NULL)
+ {
+ printf(" /DecodeParms [");
+ for (i = 0; i < info.count; ++i)
+ printf(" %s", info.params[i] != NULL ? "<<...>>" : "null");
+ printf(" ]");
+ }
+ printf("\n");
+}
+
+static void print_stream_info (ppref *ref, ppstream *stream)
+{
+ size_t length;
+ printf("object %lu %lu R\n", (unsigned long)ref->number, (unsigned long)ref->version);
+ if (stream->flags & PPSTREAM_FILTER)
+ printf(" filtered ");
+ else
+ printf(" plain ");
+ if (stream->flags & PPSTREAM_IMAGE)
+ printf("image ");
+ else
+ printf("stream ");
+ if (stream->flags & PPSTREAM_ENCRYPTED)
+ printf("encrypted ");
+ if (stream->flags & PPSTREAM_NOT_SUPPORTED)
+ printf("invalid ");
+ if (!ppdict_rget_uint(stream->dict, "Length", &length))
+ length = 0;
+ assert(stream->length == length);
+ printf("length %lu (/Length %lu)\n", (unsigned long)stream->length, (unsigned long)length);
+ print_result_filter(stream, 0);
+ print_result_filter(stream, 1);
+}
+
+static void check_stream_chunks (ppstream *stream)
+{
+ size_t sum, size;
+ uint8_t *data;
+ const int decode[2] = {0, 1};
+ int d;
+
+ for (d = 0; d < 2; ++d)
+ {
+ for (sum = 0, data = ppstream_first(stream, &size, decode[d]); data != NULL; data = ppstream_next(stream, &size))
+ sum += size;
+ ppstream_done(stream);
+ ppstream_all(stream, &size, decode[d]);
+ ppstream_done(stream);
+ assert(sum == size);
+ printf(" %s chunks size [%lu]\n", (decode[d] ? "decoded" : "raw"), (unsigned long)size);
+ }
+}
+
+#define USE_BUFFERS_POOL 1
+
+int main (int argc, const char **argv)
+{
+ const char *filepath;
+ int a;
+ ppdoc *pdf;
+ ppxref *xref;
+ ppxsec *xsec;
+ size_t xi;
+ ppuint refnum;
+ ppref *ref;
+
+ if (argc < 2)
+ return usage(argv[0]);
+ if (USE_BUFFERS_POOL)
+ ppstream_init_buffers();
+ for (a = 1; a < argc; ++a)
+ {
+ filepath = argv[a];
+ printf("loading %s... ", filepath);
+ pdf = ppdoc_load(filepath);
+ if (pdf == NULL)
+ {
+ printf("failed\n");
+ continue;
+ }
+ printf("done.\n");
+ for (xref = ppdoc_xref(pdf); xref != NULL; xref = ppxref_prev(xref))
+ {
+ for (xi = 0, xsec = xref->sects; xi < xref->size; ++xi, ++xsec)
+ {
+ for (refnum = xsec->first, ref = xsec->refs; refnum <= xsec->last; ++refnum, ++ref)
+ {
+ if (ref->object.type != PPSTREAM)
+ continue;
+ print_stream_info(ref, ref->object.stream);
+ check_stream_chunks(ref->object.stream);
+ }
+ }
+ }
+ ppdoc_free(pdf);
+ }
+ if (USE_BUFFERS_POOL)
+ ppstream_free_buffers();
+ return 0;
+}
diff --git a/source/luametatex/source/libraries/pplib/ppxref.c b/source/luametatex/source/libraries/pplib/ppxref.c
new file mode 100644
index 000000000..fa03fd6c9
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppxref.c
@@ -0,0 +1,215 @@
+
+#include "pplib.h"
+
+#define PPXREF_MAP_INIT 16 // number of xref sections
+
+ppxref * ppxref_create (ppdoc *pdf, size_t initsize, size_t xrefoffset)
+{
+ ppxref *xref;
+
+ if (initsize == 0) // unknown
+ initsize = PPXREF_MAP_INIT;
+ xref = (ppxref *)ppstruct_take(&pdf->heap, sizeof(ppxref));
+ xref->sects = (ppxsec *)ppstruct_take(&pdf->heap, initsize * sizeof(ppxsec));
+ xref->size = 0;
+ xref->space = initsize;
+ xref->count = 0;
+ xref->trailer.type = PPNONE;
+ xref->trailer.dict = NULL;
+ xref->prev = NULL;
+ xref->pdf = pdf;
+ xref->offset = xrefoffset;
+ return xref;
+}
+
+ppxsec * ppxref_push_section (ppxref *xref, ppheap *heap)
+{
+ ppxsec *sects;
+ if (xref->size < xref->space)
+ return &xref->sects[xref->size++];
+ xref->space <<= 1;
+ sects = xref->sects;
+ xref->sects = (ppxsec *)ppstruct_take(heap, xref->space * sizeof(ppxsec)); // waste but rare
+ memcpy(xref->sects, sects, xref->size * sizeof(ppxsec));
+ return &xref->sects[xref->size++];
+}
+
+/* When loading an xref table, we don't know how many sections there are. We assume 16, which is
+   more than usual (waste). But if there are more, we double the size, wasting again. This
+   could be done better with a dedicated allocator for xref sections (heap or generic malloc),
+   or an ephemeral malloced C-array copied into the heap once ready (analogous to the stack used for dicts/arrays).
+   For xref streams we have an explicit number of sections. */
+
+/*
+void ppxref_done_sections (ppxref *xref, ppheap *heap)
+{ // if xref->sects was initialized with a malloced array we could do
+ ppxsec *sects;
+ size_t size;
+ sects = xref->sects;
+ size = xref->size * sizeof(ppxsec);
+ xref->sects = (ppxsec *)ppstruct_take(heap, size);
+ memcpy(xref->sects, sects, size);
+ pp_free(sects);
+ xref->space = xref->size;
+}
+*/
+
+static void ppxref_sort_sects (ppxsec *left, ppxsec *right)
+{
+ ppxsec *l, *r, *m, t;
+ ppuint first, last;
+ l = left, r = right, m = l + ((r - l) / 2);
+ first = m->first, last = m->last;
+ do
+ { // don't take first/last from pointer
+ while (l->first < first) ++l;
+ while (r->first > last) --r;
+ if (l <= r)
+ {
+ t = *l;
+ *l = *r;
+ *r = t;
+ ++l, --r;
+ }
+ } while (l <= r);
+ if (l < right)
+ ppxref_sort_sects(l, right);
+ if (r > left)
+ ppxref_sort_sects(left, r);
+}
+
+int ppxref_sort (ppxref *xref)
+{
+ if (xref->size == 0)
+ return 0;
+ ppxref_sort_sects(xref->sects, xref->sects + xref->size - 1);
+ return 1;
+}
+
+ppref * ppxref_find_local (ppxref *xref, ppuint refnumber)
+{
+ ppxsec *left, *right, *mid;
+ //if (xref->size == 0) // we don't allow that
+ // return NULL;
+ left = xref->sects;
+ right = xref->sects + xref->size - 1;
+ do
+ {
+ mid = left + ((right - left) / 2);
+ if (refnumber > mid->last)
+ left = mid + 1;
+ else if (refnumber < mid->first)
+ right = mid - 1;
+ else
+ return &mid->refs[refnumber - mid->first];
+ } while (left <= right);
+ return NULL;
+}
+
+/*
+PJ 20180910
+
+So far we were resolving references in the context of the current xref:
+
+- if a given object is found in this xref, then this is the object
+- otherwise older xrefs are queried in order
+- only in linearized documents may an older body refer to objects from a newer xref
+
+Hans sent a document where an incremental update (newer body) has only an updated page object
+(plus /Metadata and /Info), but /Root (catalog) and /Pages dict refs are defined only in the older body.
+If we resolve references using the approach so far, we actually drop the update; newer objects are parsed
+and linked to the newest xref, but never linked to the objects tree. Assuming we will never need to interpret
+older versions, it makes sense to assume that the newest object version is always the correct one.
+
+*/
+
+#if 0
+
+ppref * ppxref_find (ppxref *xref, ppuint refnumber)
+{
+ ppref *ref;
+ ppxref *other;
+
+ if ((ref = ppxref_find_local(xref, refnumber)) != NULL)
+ return ref;
+ if (xref->pdf->flags & PPDOC_LINEARIZED)
+ {
+ for (other = xref->pdf->xref; other != NULL; other = other->prev)
+ if (other != xref && (ref = ppxref_find_local(other, refnumber)) != NULL)
+ return ref;
+ }
+ else
+ {
+ for (other = xref->prev; other != NULL; other = other->prev)
+ if ((ref = ppxref_find_local(other, refnumber)) != NULL)
+ return ref;
+ /* This shouldn't happen, but I've met documents that have no linearized dict,
+ but their xrefs are prepared as for linearized; with "older" xrefs referring
+ to "newer". */
+ for (other = xref->pdf->xref; other != NULL && other != xref; other = other->prev)
+ if ((ref = ppxref_find_local(other, refnumber)) != NULL)
+ return ref;
+ }
+ return NULL;
+}
+
+#else
+
+ppref * ppxref_find (ppxref *xref, ppuint refnumber)
+{
+ ppref *ref;
+ ppxref *other;
+
+ for (other = xref->pdf->xref; other != NULL; other = other->prev)
+ if ((ref = ppxref_find_local(other, refnumber)) != NULL)
+ return ref;
+ return NULL;
+}
+
+#endif
+
+ppdict * ppxref_trailer (ppxref *xref)
+{
+ switch (xref->trailer.type)
+ {
+ case PPDICT:
+ return xref->trailer.dict;
+ case PPSTREAM:
+ return xref->trailer.stream->dict;
+ default:
+ break;
+ }
+ return NULL;
+}
+
+ppxref * ppdoc_xref (ppdoc *pdf)
+{
+ return pdf->xref;
+}
+
+ppxref * ppxref_prev (ppxref *xref)
+{
+ return xref->prev;
+}
+
+ppdict * ppxref_catalog (ppxref *xref)
+{
+ ppdict *trailer;
+ return (trailer = ppxref_trailer(xref)) != NULL ? ppdict_rget_dict(trailer, "Root") : NULL;
+}
+
+ppdict * ppxref_info (ppxref *xref)
+{
+ ppdict *trailer;
+ return (trailer = ppxref_trailer(xref)) != NULL ? ppdict_rget_dict(trailer, "Info") : NULL;
+}
+
+ppref * ppxref_pages (ppxref *xref)
+{
+ ppdict *dict;
+ ppref *ref;
+
+ if ((dict = ppxref_catalog(xref)) == NULL || (ref = ppdict_get_ref(dict, "Pages")) == NULL)
+ return NULL;
+ return ref->object.type == PPDICT ? ref : NULL;
+}
diff --git a/source/luametatex/source/libraries/pplib/ppxref.h b/source/luametatex/source/libraries/pplib/ppxref.h
new file mode 100644
index 000000000..fbb83bece
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/ppxref.h
@@ -0,0 +1,35 @@
+
+#ifndef PP_XREF_H
+#define PP_XREF_H
+
+/*
+What we call xref is actually an "xref section" in the PDF spec, and what we call section is an "xref subsection".
+Our ppxref is a list of sections, sorted by xrefsection->first and xrefsection->last bounds. Every section
+keeps a list of ppref *refs, enumerated from xrefsection->first to xrefsection->last. To find a reference
+by number we do a binary search over section bounds, then jump to the proper ppref *ref.
+*/
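+
+/* A minimal lookup sketch (illustrative only; assumes a loaded ppdoc *pdf):
+
+     ppxref *xref = ppdoc_xref(pdf);          // the newest xref
+     ppref *ref = ppxref_find_local(xref, 7); // binary search for object number 7
+     // if ref != NULL, ref->object holds the parsed object
+*/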
+
+typedef struct {
+ ppuint first; // first reference number in section
+ ppuint last; // last reference number in section
+ ppref *refs; // references list
+} ppxsec;
+
+struct ppxref {
+ ppxsec *sects; // subsections list
+ size_t size; // number of sections in use
+ size_t space; // number of sections allocated
+ ppobj trailer; // trailer dict or stream
+ ppuint count; // count of references in all sections
+ ppxref *prev; // previous xref
+ ppdoc *pdf; // parent pdf to access entries in linearized docs
+ size_t offset; // file offset of xref
+ //ppcrypt *crypt; // per xref encryption state?
+};
+
+ppxref * ppxref_create (ppdoc *pdf, size_t initsize, size_t xrefoffset);
+ppxsec * ppxref_push_section (ppxref *xref, ppheap *heap);
+int ppxref_sort (ppxref *xref);
+ppref * ppxref_find_local (ppxref *xref, ppuint refnumber);
+
+#endif
\ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/readme.txt b/source/luametatex/source/libraries/pplib/readme.txt
new file mode 100644
index 000000000..ee5d141dc
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/readme.txt
@@ -0,0 +1,3 @@
+This is (to be) added to util/utilflate.c:
+
+# include "../../utilities/auxzlib.h"
diff --git a/source/luametatex/source/libraries/pplib/util/README.md b/source/luametatex/source/libraries/pplib/util/README.md
new file mode 100644
index 000000000..28f18ca65
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/README.md
@@ -0,0 +1,8 @@
+# pplib util
+
+This part is a toolbox. It contains utilities that are used by `pplib`
+but aren't tightly related to `PDF`. I use the toolbox in different
+projects and repos, so it is important to me to keep this part in
+perfect sync, at the cost of some redundant code (not used in `pplib`).
+`pplib` is hopefully not a subject for eternal development, so once
+it becomes final, we will make some cleanups here.
diff --git a/source/luametatex/source/libraries/pplib/util/utilbasexx.c b/source/luametatex/source/libraries/pplib/util/utilbasexx.c
new file mode 100644
index 000000000..cfe148840
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilbasexx.c
@@ -0,0 +1,1742 @@
+
+#include "utilnumber.h"
+#include "utilmem.h"
+#include "utilbasexx.h"
+
+/* filters state structs */
+
+struct basexx_state {
+ size_t line, maxline;
+ size_t left;
+ int tail[5];
+ int flush;
+};
+
+struct runlength_state {
+ int run;
+ int flush;
+ int c1, c2;
+ uint8_t *pos;
+};
+
+typedef union { basexx_state *basexxstate; runlength_state *runlengthstate; void *voidstate; } basexx_state_pointer; // to avoid 'dereferencing type-punned pointer ...' warnings
+
+/* config */
+
+#if defined(BASEXX_PDF)
+# define ignored(c) (c == 0x20 || c == 0x0A || c == 0x0C || c == 0x0D || c == 0x09 || c == 0x00)
+# define base16_eof(c) (c == '>' || c < 0)
+# define base85_eof(c) (c == '~' || c < 0)
+#else
+# define ignored(c) (c == 0x20 || c == 0x0A || c == 0x0D || c == 0x09)
+# define base16_eof(c) (c < 0)
+# define base85_eof(c) (c < 0)
+#endif
+
+#define base64_eof(c) (c == '=' || c < 0)
+
+#define basexx_nl '\x0A'
+//#define put_nl(O, line, maxline, n) ((void)((line += n) > maxline && ((line = n), iof_set(O, basexx_nl)))) // assignment in conditional warning
+#define put_nl(O, line, maxline, n) do { line += n; if (line > maxline) { line = n; iof_set(O, basexx_nl); }} while (0)
+
+/* tail macros */
+
+#define set_tail1(state, c1) (state->left = 1, state->tail[0] = c1)
+#define set_tail2(state, c1, c2) (state->left = 2, state->tail[0] = c1, state->tail[1] = c2)
+#define set_tail3(state, c1, c2, c3) (state->left = 3, state->tail[0] = c1, state->tail[1] = c2, state->tail[2] = c3)
+#define set_tail4(state, c1, c2, c3, c4) (state->left = 4, state->tail[0] = c1, state->tail[1] = c2, state->tail[2] = c3, state->tail[3] = c4)
+#define set_tail5(state, c1, c2, c3, c4, c5) \
+ (state->left = 5, state->tail[0] = c1, state->tail[1] = c2, state->tail[2] = c3, state->tail[3] = c4, state->tail[4] = c5)
+
+#define get_tail1(state, c1) (state->left = 0, c1 = state->tail[0])
+#define get_tail2(state, c1, c2) (state->left = 0, c1 = state->tail[0], c2 = state->tail[1])
+#define get_tail3(state, c1, c2, c3) (state->left = 0, c1 = state->tail[0], c2 = state->tail[1], c3 = state->tail[2])
+#define get_tail4(state, c1, c2, c3, c4) (state->left = 0, c1 = state->tail[0], c2 = state->tail[1], c3 = state->tail[2], c4 = state->tail[3])
+
+/* basexx state initialization */
+
+void basexx_state_init_ln (basexx_state *state, size_t line, size_t maxline)
+{
+ state->line = line;
+ state->maxline = maxline;
+ state->left = 0;
+ state->flush = 0;
+}
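+
+/* Note on the state codecs below (a sketch of the convention, as read from the code): with
+   state->flush == 0 a codec returns IOFEMPTY when input runs out and stashes unconsumed input
+   bytes in state->tail, so it can resume once more input arrives; with state->flush == 1 the
+   end of input is final and the codec emits the pending tail (or IOFERR if the tail is invalid). */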
+
+/* base 16; xxxx|xxxx */
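+
+/* For example (illustrative): the two bytes 0xAB 0x0F encode to "AB0F" ("ab0f" in the lc variants);
+   on decoding, a lone trailing digit is treated as if followed by '0', so "A>" decodes to 0xA0. */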
+
+iof_status base16_encoded_uc (const void *data, size_t size, iof *O)
+{
+ const uint8_t *s, *e;
+ for (s = (const uint8_t *)data, e = s + size; s < e; ++s)
+ {
+ if (!iof_ensure(O, 2))
+ return IOFFULL;
+ iof_set_uc_hex(O, *s);
+ }
+ return IOFEOF;
+}
+
+iof_status base16_encoded_lc (const void *data, size_t size, iof *O)
+{
+ const uint8_t *s, *e;
+ for (s = (const uint8_t *)data, e = s + size; s < e; ++s)
+ {
+ if (!iof_ensure(O, 2))
+ return IOFFULL;
+ iof_set_lc_hex(O, *s);
+ }
+ return IOFEOF;
+}
+
+iof_status base16_encoded_uc_ln (const void *data, size_t size, iof *O, size_t line, size_t maxline)
+{
+ const uint8_t *s, *e;
+ for (s = (const uint8_t *)data, e = s + size; s < e; ++s)
+ {
+ if (!iof_ensure(O, 3))
+ return IOFFULL;
+ put_nl(O, line, maxline, 2);
+ iof_set_uc_hex(O, *s);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encoded_lc_ln (const void *data, size_t size, iof *O, size_t line, size_t maxline)
+{
+ const uint8_t *s, *e;
+ for (s = (const uint8_t *)data, e = s + size; s < e; ++s)
+ {
+ if (!iof_ensure(O, 3))
+ return IOFFULL;
+ put_nl(O, line, maxline, 2);
+ iof_set_lc_hex(O, *s);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encode_uc (iof *I, iof *O)
+{
+ register int c;
+ while (iof_ensure(O, 2))
+ {
+ if ((c = iof_get(I)) < 0)
+ return IOFEOF;
+ iof_set_uc_hex(O, c);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encode_state_uc (iof *I, iof *O, basexx_state *state)
+{
+ register int c;
+ while (iof_ensure(O, 2))
+ {
+ if ((c = iof_get(I)) < 0)
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ iof_set_uc_hex(O, c);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encode_lc (iof *I, iof *O)
+{
+ register int c;
+ while (iof_ensure(O, 2))
+ {
+ if ((c = iof_get(I)) < 0)
+ return IOFEOF;
+ iof_set_lc_hex(O, c);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encode_state_lc (iof *I, iof *O, basexx_state *state)
+{
+ register int c;
+ while (iof_ensure(O, 2))
+ {
+ if ((c = iof_get(I)) < 0)
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ iof_set_lc_hex(O, c);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encode_uc_ln (iof *I, iof *O, size_t line, size_t maxline)
+{
+ register int c;
+ while (iof_ensure(O, 3))
+ {
+ if ((c = iof_get(I)) < 0)
+ return IOFEOF;
+ put_nl(O, line, maxline, 2);
+ iof_set_uc_hex(O, c);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encode_state_uc_ln (iof *I, iof *O, basexx_state *state)
+{
+ register int c;
+ while (iof_ensure(O, 3))
+ {
+ if ((c = iof_get(I)) < 0)
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ put_nl(O, state->line, state->maxline, 2);
+ iof_set_uc_hex(O, c);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encode_lc_ln (iof *I, iof *O, size_t line, size_t maxline)
+{
+ register int c;
+ while (iof_ensure(O, 3))
+ {
+ if ((c = iof_get(I)) < 0)
+ return IOFEOF;
+ put_nl(O, line, maxline, 2);
+ iof_set_lc_hex(O, c);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_encode_state_lc_ln (iof *I, iof *O, basexx_state *state)
+{
+ register int c;
+ while (iof_ensure(O, 3))
+ {
+ if ((c = iof_get(I)) < 0)
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ put_nl(O, state->line, state->maxline, 2);
+ iof_set_lc_hex(O, c);
+ }
+ return IOFFULL;
+}
+
+int base16_getc (iof *I)
+{
+ register int c1, c2;
+ do { c1 = iof_get(I); } while (ignored(c1));
+ if (base16_eof(c1))
+ return IOFEOF;
+ do { c2 = iof_get(I); } while (ignored(c2));
+ if (base16_eof(c2))
+ {
+ if ((c1 = base16_value(c1)) < 0)
+ return IOFERR;
+ return c1<<4;
+ }
+ if ((c1 = base16_value(c1)) < 0 || (c2 = base16_value(c2)) < 0)
+ return IOFERR;
+ return (c1<<4)|c2;
+}
+
+int base16_lc_putc (iof *O, int c)
+{
+ if (iof_ensure(O, 2))
+ iof_set_lc_hex(O, c);
+ return IOFFULL;
+}
+
+int base16_uc_putc (iof *O, int c)
+{
+ if (iof_ensure(O, 2))
+ iof_set_uc_hex(O, c);
+ return IOFFULL;
+}
+
+
+iof_status base16_decode (iof *I, iof *O)
+{
+ register int c1, c2;
+ while (iof_ensure(O, 1))
+ {
+ do { c1 = iof_get(I); } while (ignored(c1));
+ if (base16_eof(c1))
+ return IOFEOF;
+ do { c2 = iof_get(I); } while (ignored(c2));
+ if (base16_eof(c2))
+ {
+ if ((c1 = base16_value(c1)) < 0)
+ return IOFERR;
+ iof_set(O, c1<<4); // c2 := '0'
+ return IOFEOF;
+ }
+ if ((c1 = base16_value(c1)) < 0 || (c2 = base16_value(c2)) < 0)
+ return IOFERR;
+ iof_set(O, (c1<<4)|c2);
+ }
+ return IOFFULL;
+}
+
+iof_status base16_decode_state (iof *I, iof *O, basexx_state *state)
+{
+ register int c1, c2, d1, d2;
+ if (!(iof_ensure(O, 1)))
+ return IOFFULL;
+ switch(state->left)
+ {
+ case 0: goto byte0;
+ case 1: get_tail1(state, c1); goto byte1;
+ }
+ while (iof_ensure(O, 1))
+ {
+ byte0:
+ do { c1 = iof_get(I); } while (ignored(c1));
+ if (base16_eof(c1))
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ byte1:
+ do { c2 = iof_get(I); } while (ignored(c2));
+ if (base16_eof(c2))
+ {
+ set_tail1(state, c1); /* set tail to let the caller display invalid chars */
+ if (state->flush)
+ {
+ if ((c1 = base16_value(c1)) < 0)
+ return IOFERR;
+ iof_set(O, c1<<4); // c2 := '0'
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ if ((d1 = base16_value(c1)) < 0 || (d2 = base16_value(c2)) < 0)
+ {
+ set_tail2(state, c1, c2);
+ return IOFERR;
+ }
+ iof_set(O, (d1<<4)|d2);
+ }
+ return IOFFULL;
+}
+
+/* base 64; xxxxxx|xx xxxx|xxxx xx|xxxxxx */
+
+const char base64_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+const int base64_lookup[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
+ 52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,
+ -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,9 ,10,11,12,13,14,
+ 15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
+ -1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+ 41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+#define base64_value(c) base64_lookup[(uint8_t)(c)]
+
+#define base64_digit1(c1) base64_alphabet[c1>>2]
+#define base64_digit2(c1, c2) base64_alphabet[((c1&3)<<4)|(c2>>4)]
+#define base64_digit3(c2, c3) base64_alphabet[((c2&15)<<2)|(c3>>6)]
+#define base64_digit4(c3) base64_alphabet[c3&63]
+
+#define base64_encode_word(O, c1, c2, c3) \
+ iof_set4(O, base64_digit1(c1), base64_digit2(c1, c2), base64_digit3(c2, c3), base64_digit4(c3))
+
+#define base64_encode_tail2(O, c1, c2) \
+ iof_set3(O, base64_digit1(c1), base64_digit2(c1, c2), base64_digit3(c2, 0))
+
+#define base64_encode_tail1(O, c1) \
+ iof_set2(O, base64_digit1(c1), base64_digit2(c1, 0))
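+
+/* Worked example (illustrative): the three bytes "Man" (0x4D 0x61 0x6E) form the 24-bit word
+   010011010110000101101110, which splits into the four 6-bit digits 19|22|5|46; indexing
+   base64_alphabet gives "TWFu". A 1- or 2-byte tail yields 2 or 3 digits (the tail macros above). */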
+
+iof_status base64_encoded (const void *data, size_t size, iof *O)
+{
+ const uint8_t *s, *e;
+ uint8_t c1, c2, c3;
+ for (s = (const uint8_t *)data, e = s + size; s + 2 < e; )
+ {
+ if (!iof_ensure(O, 4))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s++;
+ c3 = *s++;
+ base64_encode_word(O, c1, c2, c3);
+ }
+ switch (e - s)
+ {
+ case 0:
+ break;
+ case 1:
+ if (!iof_ensure(O, 2))
+ return IOFFULL;
+ c1 = *s;
+ base64_encode_tail1(O, c1);
+ break;
+ case 2:
+ if (!iof_ensure(O, 3))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s;
+ base64_encode_tail2(O, c1, c2);
+ break;
+ }
+ return IOFEOF;
+}
+
+iof_status base64_encoded_ln (const void *data, size_t size, iof *O, size_t line, size_t maxline)
+{
+ const uint8_t *s, *e;
+ uint8_t c1, c2, c3;
+ for (s = (const uint8_t *)data, e = s + size; s + 2 < e; )
+ {
+ if (!iof_ensure(O, 5))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s++;
+ c3 = *s++;
+ put_nl(O, line, maxline, 4);
+ base64_encode_word(O, c1, c2, c3);
+ }
+ switch (e - s)
+ {
+ case 0:
+ break;
+ case 1:
+ if (!iof_ensure(O, 3))
+ return IOFFULL;
+ c1 = *s;
+ put_nl(O, line, maxline, 2);
+ base64_encode_tail1(O, c1);
+ break;
+ case 2:
+ if (!iof_ensure(O, 4))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s;
+ put_nl(O, line, maxline, 3);
+ base64_encode_tail2(O, c1, c2);
+ break;
+ }
+ return IOFEOF;
+}
+
+iof_status base64_encode (iof *I, iof *O)
+{
+ register int c1, c2, c3;
+ while(iof_ensure(O, 4))
+ {
+ if ((c1 = iof_get(I)) < 0)
+ return IOFEOF;
+ if ((c2 = iof_get(I)) < 0)
+ {
+ base64_encode_tail1(O, c1);
+ return IOFEOF;
+ }
+ if ((c3 = iof_get(I)) < 0)
+ {
+ base64_encode_tail2(O, c1, c2);
+ return IOFEOF;
+ }
+ base64_encode_word(O, c1, c2, c3);
+ }
+ return IOFFULL;
+}
+
+iof_status base64_encode_state (iof *I, iof *O, basexx_state *state)
+{
+ register int c1, c2, c3;
+ if (!(iof_ensure(O, 4)))
+ return IOFFULL;
+ switch(state->left)
+ {
+ case 0: goto byte0;
+ case 1: get_tail1(state, c1); goto byte1;
+ case 2: get_tail2(state, c1, c2); goto byte2;
+ }
+ while(iof_ensure(O, 4))
+ {
+ byte0:
+ if ((c1 = iof_get(I)) < 0)
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ byte1:
+ if ((c2 = iof_get(I)) < 0)
+ return (state->flush ? (base64_encode_tail1(O, c1), IOFEOF) : (set_tail1(state, c1), IOFEMPTY));
+ byte2:
+ if ((c3 = iof_get(I)) < 0)
+ return (state->flush ? (base64_encode_tail2(O, c1, c2), IOFEOF) : (set_tail2(state, c1, c2), IOFEMPTY));
+ base64_encode_word(O, c1, c2, c3);
+ }
+ return IOFFULL;
+}
+
+iof_status base64_encode_ln (iof *I, iof *O, size_t line, size_t maxline)
+{
+ register int c1, c2, c3;
+ while(iof_ensure(O, 5))
+ {
+ if ((c1 = iof_get(I)) < 0)
+ return IOFEOF;
+ if ((c2 = iof_get(I)) < 0)
+ {
+ put_nl(O, line, maxline, 2);
+ base64_encode_tail1(O, c1);
+ return IOFEOF;
+ }
+ if ((c3 = iof_get(I)) < 0)
+ {
+ put_nl(O, line, maxline, 3);
+ base64_encode_tail2(O, c1, c2);
+ return IOFEOF;
+ }
+ put_nl(O, line, maxline, 4);
+ base64_encode_word(O, c1, c2, c3);
+ }
+ return IOFFULL;
+}
+
+iof_status base64_encode_state_ln (iof *I, iof *O, basexx_state *state)
+{
+ register int c1, c2, c3;
+ if (!(iof_ensure(O, 5)))
+ return IOFFULL;
+ switch(state->left)
+ {
+ case 0: goto byte0;
+ case 1: get_tail1(state, c1); goto byte1;
+ case 2: get_tail2(state, c1, c2); goto byte2;
+ }
+ while(iof_ensure(O, 5))
+ {
+ byte0:
+ if ((c1 = iof_get(I)) < 0)
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ byte1:
+ if ((c2 = iof_get(I)) < 0)
+ {
+ if (state->flush)
+ {
+ put_nl(O, state->line, state->maxline, 2);
+ base64_encode_tail1(O, c1);
+ return IOFEOF;
+ }
+ set_tail1(state, c1);
+ return IOFEMPTY;
+ }
+ byte2:
+ if ((c3 = iof_get(I)) < 0)
+ {
+ if (state->flush)
+ {
+ put_nl(O, state->line, state->maxline, 3);
+ base64_encode_tail2(O, c1, c2);
+ return IOFEOF;
+ }
+ set_tail2(state, c1, c2);
+ return IOFEMPTY;
+ }
+ put_nl(O, state->line, state->maxline, 4);
+ base64_encode_word(O, c1, c2, c3);
+ }
+ return IOFFULL;
+}
+
+// #define base64_code(c1, c2, c3, c4) ((c1<<18)|(c2<<12)|(c3<<6)|c4)
+
+#define base64_decode_word(O, c1, c2, c3, c4) \
+ iof_set3(O, (c1<<2)|(c2>>4), ((c2&15)<<4)|(c3>>2), ((c3&3)<<6)|c4)
+
+#define base64_decode_tail3(O, c1, c2, c3) \
+ iof_set2(O, (c1<<2)|(c2>>4), ((c2&15)<<4)|(c3>>2))
+
+#define base64_decode_tail2(O, c1, c2) \
+ iof_set(O, (c1<<2)|(c2>>4))
+
+iof_status base64_decode (iof *I, iof *O)
+{
+ register int c1, c2, c3, c4;
+ while(iof_ensure(O, 3))
+ {
+ do { c1 = iof_get(I); } while (ignored(c1));
+ if (base64_eof(c1))
+ return IOFEOF;
+ do { c2 = iof_get(I); } while (ignored(c2));
+ if (base64_eof(c2))
+ return IOFERR;
+ do { c3 = iof_get(I); } while (ignored(c3));
+ if (base64_eof(c3))
+ {
+ if ((c1 = base64_value(c1)) < 0 || (c2 = base64_value(c2)) < 0)
+ return IOFERR;
+ base64_decode_tail2(O, c1, c2);
+ return IOFEOF;
+ }
+ do { c4 = iof_get(I); } while (ignored(c4));
+ if (base64_eof(c4))
+ {
+ if ((c1 = base64_value(c1)) < 0 || (c2 = base64_value(c2)) < 0 || (c3 = base64_value(c3)) < 0)
+ return IOFERR;
+ base64_decode_tail3(O, c1, c2, c3);
+ return IOFEOF;
+ }
+ if ((c1 = base64_value(c1)) < 0 || (c2 = base64_value(c2)) < 0 ||
+ (c3 = base64_value(c3)) < 0 || (c4 = base64_value(c4)) < 0)
+ return IOFERR;
+ base64_decode_word(O, c1, c2, c3, c4);
+ }
+ return IOFFULL;
+}
+
+iof_status base64_decode_state (iof *I, iof *O, basexx_state *state)
+{
+ register int c1, c2, c3, c4;
+ register int d1, d2, d3, d4;
+ switch(state->left)
+ {
+ case 0: goto byte0;
+ case 1: get_tail1(state, c1); goto byte1;
+ case 2: get_tail2(state, c1, c2); goto byte2;
+ case 3: get_tail3(state, c1, c2, c3); goto byte3;
+ }
+ while(iof_ensure(O, 3))
+ {
+ byte0:
+ do { c1 = iof_get(I); } while (ignored(c1));
+ if (base64_eof(c1))
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ byte1:
+ do { c2 = iof_get(I); } while (ignored(c2));
+ if (base64_eof(c2))
+ {
+ set_tail1(state, c1); /* set tail to let the caller make padding or display invalid char in case of error */
+ return (state->flush ? IOFERR : IOFEMPTY); /* if state->flush then error; tail must have at least two bytes */
+ }
+ byte2:
+ do { c3 = iof_get(I); } while (ignored(c3));
+ if (base64_eof(c3))
+ {
+ set_tail2(state, c1, c2);
+ if (state->flush)
+ {
+ if ((c1 = base64_value(c1)) < 0 || (c2 = base64_value(c2)) < 0)
+ return IOFERR;
+ base64_decode_tail2(O, c1, c2);
+ return IOFEOF;
+ }
+ else
+ return IOFEMPTY;
+ }
+ byte3:
+ do { c4 = iof_get(I); } while (ignored(c4));
+ if (base64_eof(c4))
+ {
+ set_tail3(state, c1, c2, c3);
+ if (state->flush)
+ {
+ if ((c1 = base64_value(c1)) < 0 || (c2 = base64_value(c2)) < 0 || (c3 = base64_value(c3)) < 0)
+ return IOFERR;
+ base64_decode_tail3(O, c1, c2, c3);
+ return IOFEOF;
+ }
+ else
+ return IOFEMPTY;
+ }
+ if ((d1 = base64_value(c1)) < 0 || (d2 = base64_value(c2)) < 0 ||
+ (d3 = base64_value(c3)) < 0 || (d4 = base64_value(c4)) < 0)
+ {
+ set_tail4(state, c1, c2, c3, c4);
+ return IOFERR;
+ }
+ base64_decode_word(O, d1, d2, d3, d4);
+ }
+ return IOFFULL;
+}
+
+/* base85 */
+
+const char base85_alphabet[] = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu"; /* for completeness, not used below */
+
+const int base85_lookup[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
+ 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,
+ 31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,
+ 47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,
+ 63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,
+ 79,80,81,82,83,84,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+#define base85_value(c) base85_lookup[(uint8_t)(c)]
+
+#define base85_encode_word(O, code) \
+ (*(O->pos+4) = '!' + code%85, code /= 85, *(O->pos+3) = '!' + code%85, code /= 85, \
+ *(O->pos+2) = '!' + code%85, code /= 85, *(O->pos+1) = '!' + code%85, code /= 85, \
+ *(O->pos) = '!' + code, \
+ O->pos += 5)
+
+#define base85_encode_tail3(O, code) \
+ (*(O->pos+3) = '!' + code%85, code /= 85, *(O->pos+2) = '!' + code%85, code /= 85, \
+ *(O->pos+1) = '!' + code%85, code /= 85, *(O->pos) = '!' + code, \
+ O->pos += 4)
+
+#define base85_encode_tail2(O, code) \
+ (*(O->pos+2) = '!' + code%85, code /= 85, *(O->pos+1) = '!' + code%85, code /= 85, \
+ *(O->pos) = '!' + code, \
+ O->pos += 3)
+
+#define base85_encode_tail1(O, code) \
+ (*(O->pos+1) = '!' + code%85, code /= 85, *(O->pos) = '!' + code, \
+ O->pos += 2)
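+
+/* Worked example (illustrative): the four bytes "Man " (0x4D 0x61 0x6E 0x20) form the 32-bit word
+   1298230816, whose base-85 digits are 24,73,80,78,61; adding '!' (33) to each digit gives the
+   five characters "9jqo^". A zero word is abbreviated to a single 'z' by the encoders below. */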
+
+iof_status base85_encoded (const void *data, size_t size, iof *O)
+{
+ unsigned int code;
+ const uint8_t *s, *e;
+ uint8_t c1, c2, c3, c4;
+ for (s = (const uint8_t *)data, e = s + size; s + 3 < e; )
+ {
+ if (!iof_ensure(O, 5))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s++;
+ c3 = *s++;
+ c4 = *s++;
+ code = (c1<<24)|(c2<<16)|(c3<<8)|c4;
+ if (code == 0)
+ {
+ iof_set(O, 'z');
+ continue;
+ }
+ base85_encode_word(O, code);
+ }
+ switch (e - s)
+ {
+ case 0:
+ break;
+ case 1:
+ if (!iof_ensure(O, 2))
+ return IOFFULL;
+ c1 = *s;
+ code = (c1<<24)/85/85/85;
+ base85_encode_tail1(O, code);
+ break;
+ case 2:
+ if (!iof_ensure(O, 3))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s;
+ code = ((c1<<24)|(c2<<16))/85/85;
+ base85_encode_tail2(O, code);
+ break;
+ case 3:
+ if (!iof_ensure(O, 4))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s++;
+ c3 = *s;
+ code = ((c1<<24)|(c2<<16)|(c3<<8))/85;
+ base85_encode_tail3(O, code);
+ break;
+ }
+ return IOFEOF;
+}
+
+iof_status base85_encoded_ln (const void *data, size_t size, iof *O, size_t line, size_t maxline)
+{
+ unsigned int code;
+ const uint8_t *s, *e;
+ uint8_t c1, c2, c3, c4;
+ for (s = (const uint8_t *)data, e = s + size; s + 3 < e; )
+ {
+ if (!iof_ensure(O, 6))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s++;
+ c3 = *s++;
+ c4 = *s++;
+ code = (c1<<24)|(c2<<16)|(c3<<8)|c4;
+ if (code == 0)
+ {
+ put_nl(O, line, maxline, 1);
+ iof_set(O, 'z');
+ continue;
+ }
+ put_nl(O, line, maxline, 5);
+ base85_encode_word(O, code);
+ }
+ switch (e - s)
+ {
+ case 0:
+ break;
+ case 1:
+ if (!iof_ensure(O, 3))
+ return IOFFULL;
+ c1 = *s;
+ code = (c1<<24)/85/85/85;
+ put_nl(O, line, maxline, 2);
+ base85_encode_tail1(O, code);
+ break;
+ case 2:
+ if (!iof_ensure(O, 4))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s;
+ code = ((c1<<24)|(c2<<16))/85/85;
+ put_nl(O, line, maxline, 3);
+ base85_encode_tail2(O, code);
+ break;
+ case 3:
+ if (!iof_ensure(O, 5))
+ return IOFFULL;
+ c1 = *s++;
+ c2 = *s++;
+ c3 = *s;
+ code = ((c1<<24)|(c2<<16)|(c3<<8))/85;
+ put_nl(O, line, maxline, 4);
+ base85_encode_tail3(O, code);
+ break;
+ }
+ return IOFEOF;
+}
+
+iof_status base85_encode (iof *I, iof *O)
+{
+ register int c1, c2, c3, c4;
+ register unsigned int code;
+ while(iof_ensure(O, 5))
+ {
+ if ((c1 = iof_get(I)) < 0)
+ return IOFEOF;
+ if ((c2 = iof_get(I)) < 0)
+ {
+ code = (c1<<24)/85/85/85;
+ base85_encode_tail1(O, code);
+ return IOFEOF;
+ }
+ if ((c3 = iof_get(I)) < 0)
+ {
+ code = ((c1<<24)|(c2<<16))/85/85;
+ base85_encode_tail2(O, code);
+ return IOFEOF;
+ }
+ if ((c4 = iof_get(I)) < 0)
+ {
+ code = ((c1<<24)|(c2<<16)|(c3<<8))/85;
+ base85_encode_tail3(O, code);
+ return IOFEOF;
+ }
+ code = (c1<<24)|(c2<<16)|(c3<<8)|c4;
+ if (code == 0)
+ {
+ iof_set(O, 'z');
+ continue;
+ }
+ /* in btoa the 'y' character stands for 0x20202020, but pdf does not support this */
+ /* if (code == 0x20202020)
+ {
+ iof_set(O, 'y');
+ continue;
+ } */
+ base85_encode_word(O, code);
+ }
+ return IOFFULL;
+}
+
+iof_status base85_encode_state (iof *I, iof *O, basexx_state *state)
+{
+ register int c1, c2, c3, c4;
+ register unsigned int code;
+ if (!(iof_ensure(O, 5)))
+ return IOFFULL;
+ switch(state->left)
+ {
+ case 0: goto byte0;
+ case 1: get_tail1(state, c1); goto byte1;
+ case 2: get_tail2(state, c1, c2); goto byte2;
+ case 3: get_tail3(state, c1, c2, c3); goto byte3;
+ }
+ while(iof_ensure(O, 5))
+ {
+ byte0:
+ if ((c1 = iof_get(I)) < 0)
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ byte1:
+ if ((c2 = iof_get(I)) < 0)
+ {
+ set_tail1(state, c1);
+ if (state->flush)
+ {
+ code = (c1<<24)/85/85/85;
+ base85_encode_tail1(O, code);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ byte2:
+ if ((c3 = iof_get(I)) < 0)
+ {
+ set_tail2(state, c1, c2);
+ if (state->flush)
+ {
+ code = ((c1<<24)|(c2<<16))/85/85;
+ base85_encode_tail2(O, code);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ byte3:
+ if ((c4 = iof_get(I)) < 0)
+ {
+ set_tail3(state, c1, c2, c3);
+ if (state->flush)
+ {
+ code = ((c1<<24)|(c2<<16)|(c3<<8))/85;
+ base85_encode_tail3(O, code);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ code = (c1<<24)|(c2<<16)|(c3<<8)|c4;
+ if (code == 0)
+ {
+ iof_set(O, 'z');
+ continue;
+ }
+ base85_encode_word(O, code);
+ }
+ return IOFFULL;
+}
+
+iof_status base85_encode_ln (iof *I, iof *O, size_t line, size_t maxline)
+{
+ register int c1, c2, c3, c4;
+ register unsigned int code;
+ while(iof_ensure(O, 6))
+ {
+ if ((c1 = iof_get(I)) < 0)
+ return IOFEOF;
+ if ((c2 = iof_get(I)) < 0)
+ {
+ code = (c1<<24)/85/85/85;
+ put_nl(O, line, maxline, 2);
+ base85_encode_tail1(O, code);
+ return IOFEOF;
+ }
+ if ((c3 = iof_get(I)) < 0)
+ {
+ code = ((c1<<24)|(c2<<16))/85/85;
+ put_nl(O, line, maxline, 3);
+ base85_encode_tail2(O, code);
+ return IOFEOF;
+ }
+ if ((c4 = iof_get(I)) < 0)
+ {
+ code = ((c1<<24)|(c2<<16)|(c3<<8))/85;
+ put_nl(O, line, maxline, 4);
+ base85_encode_tail3(O, code);
+ return IOFEOF;
+ }
+ code = (c1<<24)|(c2<<16)|(c3<<8)|c4;
+ if (code == 0)
+ {
+ put_nl(O, line, maxline, 1);
+ iof_set(O, 'z');
+ continue;
+ }
+ put_nl(O, line, maxline, 5);
+ base85_encode_word(O, code);
+ }
+ return IOFFULL;
+}
+
+iof_status base85_encode_state_ln (iof *I, iof *O, basexx_state *state)
+{
+ register int c1, c2, c3, c4;
+ register unsigned int code;
+ if (!(iof_ensure(O, 6)))
+ return IOFFULL;
+ switch(state->left)
+ {
+ case 0: goto byte0;
+ case 1: get_tail1(state, c1); goto byte1;
+ case 2: get_tail2(state, c1, c2); goto byte2;
+ case 3: get_tail3(state, c1, c2, c3); goto byte3;
+ }
+ while(iof_ensure(O, 6))
+ {
+ byte0:
+ if ((c1 = iof_get(I)) < 0)
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ byte1:
+ if ((c2 = iof_get(I)) < 0)
+ {
+ set_tail1(state, c1);
+ if (state->flush)
+ {
+ code = (c1<<24)/85/85/85;
+ put_nl(O, state->line, state->maxline, 2);
+ base85_encode_tail1(O, code);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ byte2:
+ if ((c3 = iof_get(I)) < 0)
+ {
+ set_tail2(state, c1, c2);
+ if (state->flush)
+ {
+ code = ((c1<<24)|(c2<<16))/85/85;
+ put_nl(O, state->line, state->maxline, 3);
+ base85_encode_tail2(O, code);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ byte3:
+ if ((c4 = iof_get(I)) < 0)
+ {
+ set_tail3(state, c1, c2, c3);
+ if (state->flush)
+ {
+ code = ((c1<<24)|(c2<<16)|(c3<<8))/85;
+ put_nl(O, state->line, state->maxline, 4);
+ base85_encode_tail3(O, code);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ code = (c1<<24)|(c2<<16)|(c3<<8)|c4;
+ if (code == 0)
+ {
+ put_nl(O, state->line, state->maxline, 1);
+ iof_set(O, 'z');
+ continue;
+ }
+ put_nl(O, state->line, state->maxline, 5);
+ base85_encode_word(O, code);
+ }
+ return IOFFULL;
+}
+
+#define base85_code(c1, c2, c3, c4, c5) ((((c1*85+c2)*85+c3)*85+c4)*85+c5)
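+
+/* Decoding pads a short final group with 'u' (value 84), the highest digit, before truncating the
+   decoded word; e.g. (illustrative) the two-character group "9j" decodes to the single byte 0x4D. */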
+
+iof_status base85_decode (iof *I, iof *O)
+{
+ register int c1, c2, c3, c4, c5;
+ register unsigned int code;
+ while (iof_ensure(O, 4))
+ {
+ do { c1 = iof_get(I); } while (ignored(c1));
+ if (base85_eof(c1))
+ return IOFEOF;
+ switch (c1)
+ {
+ case 'z':
+ iof_set4(O, '\0', '\0', '\0', '\0');
+ continue;
+ case 'y':
+ iof_set4(O, ' ', ' ', ' ', ' ');
+ continue;
+ }
+ do { c2 = iof_get(I); } while (ignored(c2));
+ if (base85_eof(c2))
+ return IOFERR;
+ do { c3 = iof_get(I); } while (ignored(c3));
+ if (base85_eof(c3))
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, 84, 84, 84); /* padding with 'u' (117); 117-33 = 84 */
+ iof_set(O, code>>24);
+ return IOFEOF;
+ }
+ do { c4 = iof_get(I); } while (ignored(c4));
+ if (base85_eof(c4))
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0 || (c3 = base85_value(c3)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, c3, 84, 84);
+ iof_set2(O, code>>24, (code>>16)&255);
+ return IOFEOF;
+ }
+ do { c5 = iof_get(I); } while (ignored(c5));
+ if (base85_eof(c5))
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0 ||
+ (c3 = base85_value(c3)) < 0 || (c4 = base85_value(c4)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, c3, c4, 84);
+ iof_set3(O, code>>24, (code>>16)&255, (code>>8)&255);
+ return IOFEOF;
+ }
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0 || (c3 = base85_value(c3)) < 0 ||
+ (c4 = base85_value(c4)) < 0 || (c5 = base85_value(c5)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, c3, c4, c5);
+ iof_set4(O, code>>24, (code>>16)&255, (code>>8)&255, code&255);
+ }
+ return IOFFULL;
+}
+
+iof_status base85_decode_state (iof *I, iof *O, basexx_state *state)
+{
+ register int c1, c2, c3, c4, c5;
+ register int d1, d2, d3, d4, d5;
+ register unsigned int code;
+ if (!(iof_ensure(O, 4)))
+ return IOFFULL;
+ switch(state->left)
+ {
+ case 0: goto byte0;
+ case 1: get_tail1(state, c1); goto byte1;
+ case 2: get_tail2(state, c1, c2); goto byte2;
+ case 3: get_tail3(state, c1, c2, c3); goto byte3;
+ case 4: get_tail4(state, c1, c2, c3, c4); goto byte4;
+ }
+ while (iof_ensure(O, 4))
+ {
+ byte0:
+ do { c1 = iof_get(I); } while (ignored(c1));
+ if (base85_eof(c1))
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ switch (c1)
+ {
+ case 'z':
+ iof_set4(O, '\0', '\0', '\0', '\0');
+ continue;
+ case 'y':
+ iof_set4(O, ' ', ' ', ' ', ' ');
+ continue;
+ }
+ byte1:
+ do { c2 = iof_get(I); } while (ignored(c2));
+ if (base85_eof(c2))
+ {
+ set_tail1(state, c1);
+ return (state->flush ? IOFERR : IOFEMPTY); /* if state->flush then error; tail must have at least two bytes */
+ }
+ byte2:
+ do { c3 = iof_get(I); } while (ignored(c3));
+ if (base85_eof(c3))
+ {
+ set_tail2(state, c1, c2);
+ if (state->flush)
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, 84, 84, 84);
+ iof_set(O, code>>24);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ byte3:
+ do { c4 = iof_get(I); } while (ignored(c4));
+ if (base85_eof(c4))
+ {
+ set_tail3(state, c1, c2, c3);
+ if (state->flush)
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0 || (c3 = base85_value(c3)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, c3, 84, 84);
+ iof_set2(O, code>>24, (code>>16)&255);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ byte4:
+ do { c5 = iof_get(I); } while (ignored(c5));
+ if (base85_eof(c5))
+ {
+ set_tail4(state, c1, c2, c3, c4);
+ if (state->flush)
+ {
+ if ((c1 = base85_value(c1)) < 0 || (c2 = base85_value(c2)) < 0 ||
+ (c3 = base85_value(c3)) < 0 || (c4 = base85_value(c4)) < 0)
+ return IOFERR;
+ code = base85_code(c1, c2, c3, c4, 84);
+ iof_set3(O, code>>24, (code>>16)&255, (code>>8)&255);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+ }
+ if ((d1 = base85_value(c1)) < 0 || (d2 = base85_value(c2)) < 0 || (d3 = base85_value(c3)) < 0 ||
+ (d4 = base85_value(c4)) < 0 || (d5 = base85_value(c5)) < 0)
+ {
+ set_tail5(state, c1, c2, c3, c4, c5);
+ return IOFERR;
+ }
+ code = base85_code(d1, d2, d3, d4, d5);
+ iof_set4(O, code>>24, (code>>16)&255, (code>>8)&255, code&255);
+ }
+ return IOFFULL;
+}
+
+/* postscript run length */
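+
+/* The format (as implemented below): a length byte L < 128 is followed by L+1 literal bytes to copy;
+   L > 128 is followed by a single byte to repeat 257-L times; L == 128 is the end-of-data marker.
+   For example (illustrative), "aaaa" encodes to the bytes 253 'a' 128, and "abc" to 2 'a' 'b' 'c' 128. */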
+
+void runlength_state_init (runlength_state *state)
+{
+ state->run = -1;
+ state->flush = 0;
+ state->c1 = 0;
+ state->c2 = 0;
+ state->pos = NULL;
+}
+
+iof_status runlength_encode (iof *I, iof *O)
+{
+ register int c1, c2, run = -1;
+ uint8_t *pos;
+ c1 = 0, c2 = 0; /* avoid warning */
+ while (iof_ensure(O, 1+128+1))
+ { /* ensured space for single length byte, up to 128 bytes to be copied, possible eod marker */
+ pos = O->pos++;
+ switch (run)
+ {
+ case -1: /* initial state; get first byte */
+ if ((c1 = iof_get(I)) < 0)
+ return (*pos = 128, IOFEOF);
+ run = 0;
+ FALLTHRU // fall through
+ case 0: /* `repeat' state; get another byte and compare */
+ if ((c2 = iof_get(I)) < 0)
+ return (*pos = 0, iof_set2(O, c1, 128), IOFEOF);
+ run = (c1 == c2 ? 257-2 : 0);
+ break;
+ }
+ if (run < 128)
+ { /* single length byte, up to 128 bytes to be copied, possible eod marker */
+ iof_set(O, c1);
+ for (c1 = c2, c2 = iof_char(I); c1 != c2 && run < 127; c1 = c2, c2 = iof_next(I))
+ {
+ if (c2 < 0) /* O->pos must not change until next call to calling encoder!!! */
+ return (*pos = (uint8_t)run+1, iof_set2(O, c1, 128), IOFEOF);
+ iof_set(O, c1);
+ ++run;
+ }
+ }
+ else // if run > 128
+ {
+ for (c2 = iof_get(I); c1 == c2 && run > 129; c2 = iof_get(I))
+ --run;
+ if (c2 < 0)
+ return (*pos = (uint8_t)run, iof_set2(O, c1, 128), IOFEOF);
+ iof_set(O, c1);
+ }
+ *pos = (uint8_t)run;
+ c1 = c2;
+ run = 0;
+ }
+ return IOFFULL;
+}
+
+iof_status runlength_encode_state (iof *I, iof *O, runlength_state *state)
+{
+ while (iof_ensure(O, 3)) /* single length byte, the byte to be repeated and eod */
+ {
+ state->pos = O->pos++;
+ switch (state->run)
+ {
+ case -1: /* initial state; get first byte */
+ if ((state->c1 = iof_get(I)) < 0)
+ return (state->flush ? (*state->pos = 128, IOFEOF) : IOFEMPTY);
+ state->run = 0;
+ FALLTHRU // fall through
+ case 0: /* `repeat' state; get another byte and compare */
+ if ((state->c2 = iof_get(I)) < 0)
+ return (state->flush ? (*state->pos = 0, iof_set2(O, state->c1, 128), IOFEOF) : IOFEMPTY);
+ state->run = (state->c1 == state->c2 ? 257-2 : 0);
+ break;
+ }
+ if (state->run < 128)
+ { /* ensure space for single length byte, up to 128 bytes to be copied, plus possible eod marker, minus those already copied */
+ if (!iof_ensure(O, 1+128+1-state->run))
+ return IOFFULL;
+ iof_set(O, state->c1);
+ for (state->c1 = state->c2, state->c2 = iof_char(I);
+ state->c1 != state->c2 && state->run < 127;
+ state->c1 = state->c2, state->c2 = iof_next(I))
+ {
+ if (state->c2 < 0) /* O->pos must not change until next call to calling encoder!!! */
+ return (state->flush ? (*state->pos = (uint8_t)state->run+1, iof_set2(O, state->c1, 128), IOFEOF) : IOFEMPTY);
+ iof_set(O, state->c1);
+ ++state->run;
+ }
+ }
+ else // if run > 128
+ {
+ for (state->c2 = iof_get(I); state->c1 == state->c2 && state->run > 129; state->c2 = iof_get(I))
+ --state->run;
+ if (state->c2 < 0)
+ return (state->flush ? (*state->pos = (uint8_t)state->run, iof_set2(O, state->c1, 128), IOFEOF) : IOFEMPTY);
+ iof_set(O, state->c1);
+ }
+ *state->pos = (uint8_t)state->run;
+ state->c1 = state->c2;
+ state->run = 0;
+ }
+ return IOFFULL;
+}
+
+iof_status runlength_decode (iof *I, iof *O)
+{
+ register int c, run = -1;
+ while (1)
+ {
+ if (run == -1) /* initial state */
+ {
+ if ((run = iof_get(I)) < 0)
+ {
+ run = -1; /* don't assume IOFEOF == -1 */
+ return IOFEOF;
+ }
+ }
+ if (run < 128)
+ { /* copy (run + 1) following bytes */
+ while (run > -1)
+ {
+ if (iof_ensure(O, 1))
+ {
+ if ((c = iof_get(I)) < 0)
+ return IOFERR;
+ iof_set(O, c);
+ --run;
+ continue;
+ }
+ return IOFFULL;
+ }
+ }
+ else if (run > 128)
+ { /* replicate the following byte (257 - run) times */
+ if ((c = iof_get(I)) < 0) /* cf. state-wise version; don't change input position until we got this byte */
+ return IOFERR;
+ while (run < 257)
+ {
+ if (iof_ensure(O, 1))
+ {
+ iof_set(O, c);
+ ++run;
+ continue;
+ }
+ return IOFFULL;
+ }
+ run = -1;
+ }
+ else // run == 128
+ return IOFEOF;
+ }
+ // return IOFFULL;
+}
+
+iof_status runlength_decode_state (iof *I, iof *O, runlength_state *state)
+{
+ register int c;
+ while (1)
+ {
+ if (state->run == -1) /* initial state */
+ {
+ if ((state->run = iof_char(I)) < 0)
+ {
+ state->run = -1; /* don't assume IOFEOF == -1 */
+ return (state->flush ? IOFEOF : IOFEMPTY);
+ }
+ ++I->pos;
+ }
+ if (state->run < 128)
+ { /* copy (state->run + 1) following bytes */
+ while (state->run > -1)
+ {
+ if (iof_ensure(O, 1))
+ {
+ if ((c = iof_char(I)) < 0)
+ return (state->flush ? IOFERR : IOFEMPTY);
+ ++I->pos;
+ iof_set(O, c);
+ --state->run;
+ continue;
+ }
+ return IOFFULL;
+ }
+ }
+ else if (state->run > 128)
+ { /* replicate the following byte (257 - state->run) times */
+ if ((c = iof_char(I)) < 0)
+ return (state->flush ? IOFERR : IOFEMPTY);
+ ++I->pos;
+ while (state->run < 257)
+ {
+ if (iof_ensure(O, 1))
+ {
+ iof_set(O, c);
+ ++state->run;
+ continue;
+ }
+ return IOFFULL;
+ }
+ state->run = -1;
+ }
+ else // state->run == 128
+ return IOFEOF;
+ }
+ // return IOFFULL;
+}
+
+/* filters */
+
+// base16 decoder function
+
+static size_t base16_decoder (iof *F, iof_mode mode)
+{
+ basexx_state *state;
+ iof_status status;
+ size_t tail;
+
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ state = iof_filter_state(basexx_state *, F);
+ do {
+ status = base16_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "base16", status);
+ case IOFCLOSE:
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// base16 encoder function
+
+static size_t base16_encoder (iof *F, iof_mode mode)
+{
+ basexx_state *state;
+ iof_status status;
+
+ state = iof_filter_state(basexx_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = base16_encode_state_ln(F, F->next, state);
+ return iof_encoder_retval(F, "base16", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ base16_encoder(F, IOFFLUSH);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// base64 decoder function
+
+static size_t base64_decoder (iof *F, iof_mode mode)
+{
+ basexx_state *state;
+ iof_status status;
+ size_t tail;
+
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ state = iof_filter_state(basexx_state *, F);
+ do {
+ status = base64_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "base64", status);
+ case IOFCLOSE:
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// base64 encoder function
+
+static size_t base64_encoder (iof *F, iof_mode mode)
+{
+ basexx_state *state;
+ iof_status status;
+
+ state = iof_filter_state(basexx_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = base64_encode_state_ln(F, F->next, state);
+ return iof_encoder_retval(F, "base64", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ base64_encoder(F, IOFFLUSH);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// base85 decoder function
+
+static size_t base85_decoder (iof *F, iof_mode mode)
+{
+ basexx_state *state;
+ iof_status status;
+ size_t tail;
+
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ state = iof_filter_state(basexx_state *, F);
+ do {
+ status = base85_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "base85", status);
+ case IOFCLOSE:
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// base85 encoder function
+
+static size_t base85_encoder (iof *F, iof_mode mode)
+{
+ basexx_state *state;
+ iof_status status;
+
+ state = iof_filter_state(basexx_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = base85_encode_state_ln(F, F->next, state);
+ return iof_encoder_retval(F, "base85", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ base85_encoder(F, IOFFLUSH);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// runlength decoder function
+
+static size_t runlength_decoder (iof *F, iof_mode mode)
+{
+ runlength_state *state;
+ iof_status status;
+ size_t tail;
+
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ state = iof_filter_state(runlength_state *, F);
+ do {
+ status = runlength_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "runlength", status);
+ case IOFCLOSE:
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// runlength encoder function
+
+static size_t runlength_encoder (iof *F, iof_mode mode)
+{
+ runlength_state *state;
+ iof_status status;
+
+ state = iof_filter_state(runlength_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = runlength_encode_state(F, F->next, state);
+ return iof_encoder_retval(F, "runlength", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ runlength_encoder(F, IOFFLUSH);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+//
+
+int iof_filter_basexx_encoder_ln (iof *F, size_t line, size_t maxline)
+{
+ basexx_state *state;
+ if (maxline > 8 && line < maxline)
+ {
+ state = iof_filter_state(basexx_state *, F);
+ state->line = line;
+ state->maxline = maxline;
+ return 1;
+ }
+ return 0;
+}
+
+/* base 16 */
+
+iof * iof_filter_base16_decoder (iof *N)
+{
+ iof *I;
+ basexx_state_pointer P;
+ I = iof_filter_reader(base16_decoder, sizeof(basexx_state), &P.voidstate);
+ iof_setup_next(I, N);
+ basexx_state_init(P.basexxstate);
+ P.basexxstate->flush = 1; // means N is supposed to be continuous input
+ return I;
+}
+
+iof * iof_filter_base16_encoder (iof *N)
+{
+ iof *O;
+ basexx_state_pointer P;
+ O = iof_filter_writer(base16_encoder, sizeof(basexx_state), &P.voidstate);
+ iof_setup_next(O, N);
+ basexx_state_init(P.basexxstate);
+ return O;
+}
+
+/* base 64 */
+
+iof * iof_filter_base64_decoder (iof *N)
+{
+ iof *I;
+ basexx_state_pointer P;
+ I = iof_filter_reader(base64_decoder, sizeof(basexx_state), &P.voidstate);
+ iof_setup_next(I, N);
+ basexx_state_init(P.basexxstate);
+ P.basexxstate->flush = 1;
+ return I;
+}
+
+iof * iof_filter_base64_encoder (iof *N)
+{
+ iof *O;
+ basexx_state_pointer P;
+ O = iof_filter_writer(base64_encoder, sizeof(basexx_state), &P.voidstate);
+ iof_setup_next(O, N);
+ basexx_state_init(P.basexxstate);
+ return O;
+}
+
+/* base 85 */
+
+iof * iof_filter_base85_decoder (iof *N)
+{
+ iof *I;
+ basexx_state_pointer P;
+ I = iof_filter_reader(base85_decoder, sizeof(basexx_state), &P.voidstate);
+ iof_setup_next(I, N);
+ basexx_state_init(P.basexxstate);
+ P.basexxstate->flush = 1;
+ return I;
+}
+
+iof * iof_filter_base85_encoder (iof *N)
+{
+ iof *O;
+ basexx_state_pointer P;
+ O = iof_filter_writer(base85_encoder, sizeof(basexx_state), &P.voidstate);
+ iof_setup_next(O, N);
+ basexx_state_init(P.basexxstate);
+ return O;
+}
+
+/* runlength stream filter */
+
+iof * iof_filter_runlength_decoder (iof *N)
+{
+ iof *I;
+ basexx_state_pointer P;
+ I = iof_filter_reader(runlength_decoder, sizeof(runlength_state), &P.voidstate);
+ iof_setup_next(I, N);
+ runlength_state_init(P.runlengthstate);
+ P.runlengthstate->flush = 1;
+ return I;
+}
+
+iof * iof_filter_runlength_encoder (iof *N)
+{
+ iof *O;
+ basexx_state_pointer P;
+ O = iof_filter_writer(runlength_encoder, sizeof(runlength_state), &P.voidstate);
+ iof_setup_next(O, N);
+ runlength_state_init(P.runlengthstate);
+ return O;
+}
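+
+/* Filter usage sketch (illustrative only): decoders wrap a source iof, encoders wrap a sink iof;
+   handle() below is a hypothetical consumer, and closing the filter (IOFCLOSE) flushes and frees it.
+
+     iof *I = iof_filter_base85_decoder(N); // N is some source iof
+     int c;
+     while ((c = iof_get(I)) >= 0)
+       handle(c);
+*/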
diff --git a/source/luametatex/source/libraries/pplib/util/utilbasexx.h b/source/luametatex/source/libraries/pplib/util/utilbasexx.h
new file mode 100644
index 000000000..81891b549
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilbasexx.h
@@ -0,0 +1,111 @@
+
+/* base encodings */
+
+#ifndef UTIL_BASEXX_H
+#define UTIL_BASEXX_H
+
+#include "utiliof.h"
+
+/* base codecs state */
+
+typedef struct basexx_state basexx_state;
+
+#define BASEXX_MAXLINE 80
+#define BASEXX_PDF
+
+void basexx_state_init_ln (basexx_state *state, size_t line, size_t maxline);
+#define basexx_state_init(state) basexx_state_init_ln(state, 0, BASEXX_MAXLINE)
+
+/* base16 */
+
+int base16_getc (iof *I);
+int base16_uc_putc (iof *O, int c);
+int base16_lc_putc (iof *O, int c);
+#define base16_putc base16_uc_putc
+
+iof_status base16_encoded_uc (const void *data, size_t size, iof *O);
+iof_status base16_encoded_lc (const void *data, size_t size, iof *O);
+iof_status base16_encoded_uc_ln (const void *data, size_t size, iof *O, size_t line, size_t maxline);
+iof_status base16_encoded_lc_ln (const void *data, size_t size, iof *O, size_t line, size_t maxline);
+
+iof_status base16_encode_uc (iof *I, iof *O);
+iof_status base16_encode_lc (iof *I, iof *O);
+iof_status base16_encode_uc_ln (iof *I, iof *O, size_t line, size_t maxline);
+iof_status base16_encode_lc_ln (iof *I, iof *O, size_t line, size_t maxline);
+iof_status base16_decode (iof *I, iof *O);
+
+#define base16_encoded base16_encoded_uc
+#define base16_encoded_ln base16_encoded_uc_ln
+#define base16_encode base16_encode_uc
+#define base16_encode_ln base16_encode_uc_ln
+
+iof_status base16_encode_state_uc (iof *I, iof *O, basexx_state *state);
+iof_status base16_encode_state_lc (iof *I, iof *O, basexx_state *state);
+iof_status base16_encode_state_uc_ln (iof *I, iof *O, basexx_state *state);
+iof_status base16_encode_state_lc_ln (iof *I, iof *O, basexx_state *state);
+iof_status base16_decode_state (iof *I, iof *O, basexx_state *state);
+
+#define base16_encode_state base16_encode_state_uc
+#define base16_encode_state_ln base16_encode_state_uc_ln
+
+/* base64 */
+
+extern const char base64_alphabet[];
+extern const int base64_lookup[];
+
+iof_status base64_encoded (const void *data, size_t size, iof *O);
+iof_status base64_encoded_ln (const void *data, size_t size, iof *O, size_t line, size_t maxline);
+
+iof_status base64_encode (iof *I, iof *O);
+iof_status base64_encode_ln (iof *I, iof *O, size_t line, size_t maxline);
+iof_status base64_decode (iof *I, iof *O);
+
+iof_status base64_encode_state (iof *I, iof *O, basexx_state *state);
+iof_status base64_encode_state_ln (iof *I, iof *O, basexx_state *state);
+iof_status base64_decode_state (iof *I, iof *O, basexx_state *state);
+
+/* base85 */
+
+extern const char base85_alphabet[];
+extern const int base85_lookup[];
+
+iof_status base85_encoded (const void *data, size_t size, iof *O);
+iof_status base85_encoded_ln (const void *data, size_t size, iof *O, size_t line, size_t maxline);
+
+iof_status base85_encode (iof *I, iof *O);
+iof_status base85_encode_ln (iof *I, iof *O, size_t line, size_t maxline);
+iof_status base85_decode (iof *I, iof *O);
+
+iof_status base85_encode_state (iof *I, iof *O, basexx_state *state);
+iof_status base85_encode_state_ln (iof *I, iof *O, basexx_state *state);
+iof_status base85_decode_state (iof *I, iof *O, basexx_state *state);
+
+/* run length */
+
+typedef struct runlength_state runlength_state;
+
+void runlength_state_init (runlength_state *state);
+
+iof_status runlength_encode (iof *I, iof *O);
+iof_status runlength_encode_state (iof *I, iof *O, runlength_state *state);
+
+iof_status runlength_decode (iof *I, iof *O);
+iof_status runlength_decode_state (iof *I, iof *O, runlength_state *state);
+
+/* filters */
+
+int iof_filter_basexx_encoder_ln (iof *N, size_t line, size_t maxline);
+
+iof * iof_filter_base16_decoder (iof *N);
+iof * iof_filter_base16_encoder (iof *N);
+
+iof * iof_filter_base64_decoder (iof *N);
+iof * iof_filter_base64_encoder (iof *N);
+
+iof * iof_filter_base85_decoder (iof *N);
+iof * iof_filter_base85_encoder (iof *N);
+
+iof * iof_filter_runlength_decoder (iof *N);
+iof * iof_filter_runlength_encoder (iof *N);
+
+#endif
diff --git a/source/luametatex/source/libraries/pplib/util/utilcrypt.c b/source/luametatex/source/libraries/pplib/util/utilcrypt.c
new file mode 100644
index 000000000..2c77e42a4
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilcrypt.c
@@ -0,0 +1,1190 @@
+
+#include "utilmem.h"
+#include "utilcrypt.h"
+#include "utilcryptdef.h"
+#include "utilmd5.h"
+
+/* rc4 */
+
+/*
+Initializer arguments:
+- state - crypt state
+- map - a space for the rc4 bytes map; may be left NULL, in which case it will be allocated
+- vkey - crypt key; may be left NULL iff map is provided and properly initialized
+- keylength - the length of the crypt key (from 5 to 16 bytes)
+*/
+
+rc4_state * rc4_state_initialize (rc4_state *state, rc4_map *map, const void *vkey, size_t keylength)
+{
+ int i, j;
+ uint8_t tmp;
+ const uint8_t *key;
+ key = (const uint8_t *)vkey;
+ if (keylength == 0 || keylength > 256)
+ return NULL;
+ state->flags = 0;
+ if (map != NULL)
+ {
+ state->map = map;
+ }
+ else
+ {
+ state->map = (rc4_map *)util_malloc(sizeof(rc4_map));
+ state->flags |= RC4_STATE_ALLOC;
+ }
+
+ if (key != NULL)
+ {
+ for (i = 0; i < 256; ++i)
+ state->smap[i] = (uint8_t)i;
+ for (i = 0, j = 0; i < 256; ++i)
+ {
+ j = (j + state->smap[i] + key[i % keylength]) & 255;
+ tmp = state->smap[i];
+ state->smap[i] = state->smap[j];
+ state->smap[j] = tmp;
+ }
+ }
+ state->i = 0;
+ state->j = 0;
+ state->flush = 0; /* caller is responsible to override if necessary */
+ return state;
+}
+
+void rc4_map_save (rc4_state *state, rc4_map *map)
+{
+ memcpy(map, state->map, sizeof(rc4_map));
+}
+
+void rc4_map_restore (rc4_state *state, rc4_map *map)
+{
+ memcpy(state->map, map, sizeof(rc4_map));
+ //state->flags = 0;
+ //state->flush = 0;
+ state->i = 0;
+ state->j = 0;
+}
+
+static uint8_t rc4_next_random_byte (rc4_state *state)
+{
+ uint8_t tmp;
+ state->i = (state->i + 1) & 255;
+ state->j = (state->j + state->smap[state->i]) & 255;
+ tmp = state->smap[state->i];
+ state->smap[state->i] = state->smap[state->j];
+ state->smap[state->j] = tmp;
+ return state->smap[(state->smap[state->i] + state->smap[state->j]) & 255];
+}
+
+iof_status rc4_crypt_state (iof *I, iof *O, rc4_state *state)
+{
+ uint8_t r;
+ int c;
+ while (iof_ensure(O, 1))
+ {
+ if ((c = iof_get(I)) < 0)
+ return c == IOFERR ? IOFERR : (state->flush ? IOFEOF : IOFEMPTY);
+ r = rc4_next_random_byte(state);
+ //r = r ^ ((uint8_t)c);
+ //iof_set(O, r);
+ iof_set(O, r ^ ((uint8_t)c));
+ }
+ return IOFFULL;
+}
+
+iof_status rc4_crypt (iof *I, iof *O, const void *key, size_t keylength)
+{
+ int ret;
+ rc4_state state;
+ rc4_map map;
+ if (rc4_state_initialize(&state, &map, key, keylength) == NULL)
+ return IOFERR;
+ state.flush = 1;
+ ret = rc4_crypt_state(I, O, &state);
+ rc4_state_close(&state);
+ return ret;
+}
+
+/*
+Variants that operate on C-strings can work in place, so output and input can be the same address.
+The variant that takes an rc4_state pointer expects the state to be properly initialized. Keep in mind
+that the crypt procedure modifies the rc4 bytes map. All of them return the size of the encrypted/decrypted
+data, which for rc4 is the same as the input data length.
+*/
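+
+/* Round-trip sketch (illustrative only; the buffers and key are made up): rc4 is symmetric, so
+   crypting twice with the same key restores the input.
+
+     uint8_t data[5] = { 'h', 'e', 'l', 'l', 'o' }, work[5];
+     rc4_crypt_data(data, 5, work, "pplib", 5); // encrypt
+     rc4_crypt_data(work, 5, work, "pplib", 5); // decrypt in place; work holds "hello" again
+*/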
+
+size_t rc4_crypt_data (const void *input, size_t length, void *output, const void *key, size_t keylength)
+{
+ rc4_state state;
+ rc4_map map;
+ if (rc4_state_initialize(&state, &map, key, keylength) == NULL)
+ return 0;
+ return rc4_crypt_state_data(&state, input, length, output);
+ // no need to call rc4_state_close()
+}
+
+size_t rc4_crypt_state_data (rc4_state *state, const void *input, size_t length, void *output)
+{ /* the state is assumed to be initialized, with smap in the proper state */
+ const uint8_t *inp;
+ uint8_t r, *out;
+ size_t size;
+ inp = (const uint8_t *)input;
+ out = (uint8_t *)output;
+ for (size = 0; size < length; ++size, ++inp, ++out)
+ {
+ r = rc4_next_random_byte(state);
+ *out = r ^ *inp;
+ }
+ return length;
+}
+
+void rc4_state_close (rc4_state *state)
+{
+ if (state->smap != NULL && (state->flags & RC4_STATE_ALLOC))
+ {
+ util_free(state->smap);
+ state->smap = NULL;
+ }
+}
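+
+/*
+An illustrative round trip (not part of the library): rc4 is symmetric, so the same call
+encrypts and decrypts, and it may run in place:
+
+void rc4_example (void)
+{
+  const uint8_t key[5] = { 0x01, 0x02, 0x03, 0x04, 0x05 };
+  uint8_t data[11];
+  memcpy(data, "hello world", 11);
+  rc4_crypt_data(data, 11, data, key, 5); // encrypt in place
+  rc4_crypt_data(data, 11, data, key, 5); // decrypt in place; data reads "hello world" again
+}
+*/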
+
+/* aes; parts of code excerpted from https://github.com/kokke/tiny-AES128-C */
+
+static const uint8_t sbox[256] = {
+ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 };
+
+static const uint8_t rsbox[256] =
+{ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d };
+
+/*
+The round constant word array, rcon[i], contains the values given by
+x to the power (i-1), with x denoted as {02}, in the field GF(2^8).
+Note that i starts at 1, not 0.
+*/
+
+static const uint8_t rcon[255] = {
+ 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a,
+ 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39,
+ 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a,
+ 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8,
+ 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef,
+ 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc,
+ 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b,
+ 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3,
+ 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94,
+ 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20,
+ 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35,
+ 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f,
+ 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04,
+ 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63,
+ 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd,
+ 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb };
+
+/* block copying */
+
+#define aes_copy_block(output, input) memcpy(output, input, 16)
+
+static void aes_copy_cbc (uint8_t *data, const uint8_t *input)
+{
+ uint8_t i;
+ for (i = 0; i < 16; ++i)
+ data[i] ^= input[i];
+}
+
+static void aes_copy_xor (uint8_t *data, const uint8_t *input, const uint8_t *iv)
+{
+ uint8_t i;
+ for (i = 0; i < 16; ++i)
+ data[i] = input[i] ^ iv[i];
+}
+
+/* key expansion */
+
+#define AES_COLUMNS 4 // constant in aes
+
+static void key_expansion (aes_state *state, const uint8_t *key)
+{
+ uint32_t i, j;
+ uint8_t t[4], temp;
+ uint8_t *keydata, keywords, columns;
+
+ keywords = (uint8_t)(state->keylength >> 2);
+ keydata = (uint8_t *)state->keyblock;
+
+ /* the first round key is the key itself */
+ for(i = 0; i < keywords; ++i)
+ {
+ keydata[(i * 4) + 0] = key[(i * 4) + 0];
+ keydata[(i * 4) + 1] = key[(i * 4) + 1];
+ keydata[(i * 4) + 2] = key[(i * 4) + 2];
+ keydata[(i * 4) + 3] = key[(i * 4) + 3];
+ }
+
+ /* others derived from the first */
+ for(columns = AES_COLUMNS * (state->rounds + 1); i < columns; ++i)
+ {
+ for(j = 0; j < 4; ++j)
+ t[j] = keydata[(i - 1) * 4 + j];
+ if (i % keywords == 0)
+ {
+ /* rotate the 4 bytes in a word to the left once; [a0,a1,a2,a3] becomes [a1,a2,a3,a0] */
+ temp = t[0];
+ t[0] = t[1];
+ t[1] = t[2];
+ t[2] = t[3];
+ t[3] = temp;
+
+ /* take a four-byte input word and apply the S-box to each of the four bytes to produce an output word */
+ t[0] = sbox[t[0]];
+ t[1] = sbox[t[1]];
+ t[2] = sbox[t[2]];
+ t[3] = sbox[t[3]];
+
+ t[0] = t[0] ^ rcon[i / keywords];
+ }
+ else if (keywords > 6 && i % keywords == 4)
+ {
+ t[0] = sbox[t[0]];
+ t[1] = sbox[t[1]];
+ t[2] = sbox[t[2]];
+ t[3] = sbox[t[3]];
+ }
+ keydata[i * 4 + 0] = keydata[(i - keywords) * 4 + 0] ^ t[0];
+ keydata[i * 4 + 1] = keydata[(i - keywords) * 4 + 1] ^ t[1];
+ keydata[i * 4 + 2] = keydata[(i - keywords) * 4 + 2] ^ t[2];
+ keydata[i * 4 + 3] = keydata[(i - keywords) * 4 + 3] ^ t[3];
+ }
+
+}
+
+/*
+The original implementation uses no private buffers except a keyblock. We need private buffers to
+keep the CBC vector between calls and to be able to read input data in chunks other than 16-byte blocks.
+The encrypter would actually require only one such buffer, as the CBC vector is applied to the input data before
+the actual cipher procedure, and the CBC vector for the next chunk is simply the output from the previous one.
+The decrypter, however, applies the cipher first, then applies CBC to the output with a buffered init
+vector, and the vector for the next call is the raw input before the cipher. Hence we need two 16-byte
+buffers for the decrypter.
+*/
+
+/*
+aes_state * aes_state_initialize_ecb (aes_state *state, uint8_t *keyblock, const uint8_t *key)
+{
+ state->flags = 0;
+
+ state->flags |= AES_ECB_MODE;
+
+ if (keyblock == NULL)
+ {
+ keyblock = util_malloc(sizeof(aes_keyblock));
+ state->flags |= AES_STATE_ALLOC;
+ }
+ state->keyblock = keyblock;
+ key_expansion(state, key);
+ state->flush = 0;
+ return state;
+}
+*/
+
+void aes_pdf_mode (aes_state *state)
+{
+ state->flags |= AES_INLINE_IV;
+ state->flags &= ~AES_NULL_PADDING;
+}
+
+/*
+Initializer arguments:
+- state - crypt state
+- keyblock - a space for aes key expansion; can be left NULL, in which case it will be allocated
+- key - crypt key; can be left NULL iff keyblock is given and properly initialized
+- keylength - the length of the key (16, 24 or 32 bytes)
+- iv - 16-byte CBC initialization vector;
+  - if left NULL for the encoder, one is generated and stored as state->iv
+  - can also be left NULL for the decoder, but then AES_INLINE_IV must be set, which tells the decoder to take
+    the initialization vector from the beginning of the encrypted stream
+
+In the first approach, the initialization vector was copied to the state block during initialization, and encoders
+assumed that the state block holds the current initialization vector. This simplifies the encrypting procedure,
+as the output from every 16-byte chunk encryption is the initialization vector for the next chunk. However,
+it makes api usage cumbersome, as the user has to know that the iv may need to be copied to the state block
+before each call.
+*/
+
+static int aes_key_length (aes_state *state, size_t keylength)
+{
+ state->keylength = keylength;
+ switch (keylength)
+ {
+ case 16:
+ state->rounds = 10;
+ break;
+ case 24:
+ state->rounds = 12;
+ break;
+ case 32:
+ state->rounds = 14;
+ break;
+ default:
+ return 0;
+ }
+ return 1;
+}
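+
+/* e.g. a 16-byte key yields rounds = 10; key_expansion() then uses keywords = 16 >> 2 = 4
+   and fills AES_COLUMNS * (10 + 1) = 44 four-byte words of the keyblock */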
+
+aes_state * aes_encode_initialize (aes_state *state, aes_keyblock *keyblock, const void *key, size_t keylength, const void *iv)
+{
+ state->flags = 0;
+ if (!aes_key_length(state, keylength))
+ return NULL;
+ if (iv != NULL)
+ aes_copy_block(state->iv, iv);
+ else
+ aes_generate_iv(state->iv);
+ state->flags |= AES_HAS_IV;
+
+ if (keyblock == NULL)
+ {
+ keyblock = (aes_keyblock *)util_malloc(sizeof(aes_keyblock));
+ state->flags |= AES_STATE_ALLOC;
+ }
+ state->keyblock = keyblock;
+ if (key != NULL) /* if NULL we assume keyblock is given and already expanded */
+ key_expansion(state, (const uint8_t *)key);
+ state->flush = 0;
+ return state;
+}
+
+aes_state * aes_decode_initialize (aes_state *state, aes_keyblock *keyblock, const void *key, size_t keylength, const void *iv)
+{
+ state->flags = 0;
+ if (!aes_key_length(state, keylength))
+ return NULL;
+ if (iv != NULL)
+ {
+ aes_copy_block(state->iv, iv);
+ state->flags |= AES_HAS_IV;
+ }
+ /* else if AES_INLINE_IV flag is set will be read from input */
+
+ if (keyblock == NULL)
+ {
+ keyblock = (aes_keyblock *)util_malloc(sizeof(aes_keyblock));
+ state->flags |= AES_STATE_ALLOC;
+ }
+ state->keyblock = keyblock;
+ if (key != NULL) /* otherwise keyblock is assumed present and properly initialized */
+ key_expansion(state, (const uint8_t *)key);
+ state->flush = 0;
+ return state;
+}
+
+void aes_state_close (aes_state *state)
+{
+ if (state->keyblock != NULL && (state->flags & AES_STATE_ALLOC))
+ util_free(state->keyblock);
+}
+
+/* add round key */
+
+static void aes_round_key (aes_block block, aes_block keyblock)
+{
+ uint8_t i, j;
+ for(i = 0; i < 4; ++i)
+ for(j = 0; j < 4; ++j)
+ block[i][j] ^= keyblock[i][j];
+}
+
+#define aes_add_key(block, keyblock, round) aes_round_key(block, (*keyblock)[round])
+
+/* substitution */
+
+static void aes_encode_sub (aes_block block)
+{
+ uint8_t i, j, v;
+ for(i = 0; i < 4; ++i)
+ for(j = 0; j < 4; ++j)
+ v = block[i][j], block[i][j] = sbox[v];
+}
+
+/* row shift; the row index is the shift offset; the first row is not shifted */
+
+static void aes_encode_shift (aes_block block)
+{
+ uint8_t tmp;
+
+ /* 1st row rotated once */
+ tmp = block[0][1];
+ block[0][1] = block[1][1];
+ block[1][1] = block[2][1];
+ block[2][1] = block[3][1];
+ block[3][1] = tmp;
+
+ /* 2nd row rotated twice */
+ tmp = block[0][2];
+ block[0][2] = block[2][2];
+ block[2][2] = tmp;
+ tmp = block[1][2];
+ block[1][2] = block[3][2];
+ block[3][2] = tmp;
+
+ /* 3rd row rotated 3 times */
+ tmp = block[0][3];
+ block[0][3] = block[3][3];
+ block[3][3] = block[2][3];
+ block[2][3] = block[1][3];
+ block[1][3] = tmp;
+}
+
+static uint8_t xtime (uint8_t x)
+{
+ return ((x << 1) ^ (((x >> 7) & 1) * 0x1b));
+}
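+
+/* e.g. xtime(0x57) == 0xae and xtime(0xae) == 0x47, i.e. {57}*{02} and {57}*{04} in GF(2^8) (FIPS-197) */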
+
+/* mix columns */
+
+static void aes_encode_mix (aes_block block)
+{
+ uint8_t i, tmp, tm, t;
+
+ for(i = 0; i < 4; ++i)
+ {
+ t = block[i][0];
+ tmp = block[i][0] ^ block[i][1] ^ block[i][2] ^ block[i][3] ;
+ tm = block[i][0] ^ block[i][1]; tm = xtime(tm); block[i][0] ^= tm ^ tmp;
+ tm = block[i][1] ^ block[i][2]; tm = xtime(tm); block[i][1] ^= tm ^ tmp;
+ tm = block[i][2] ^ block[i][3]; tm = xtime(tm); block[i][2] ^= tm ^ tmp;
+ tm = block[i][3] ^ t ; tm = xtime(tm); block[i][3] ^= tm ^ tmp;
+ }
+}
+
+/* multiply is used to multiply numbers in the field GF(2^8) */
+
+#define multiply(x, y) \
+ ( ((y & 1) * x) ^ \
+ ((y>>1 & 1) * xtime(x)) ^ \
+ ((y>>2 & 1) * xtime(xtime(x))) ^ \
+ ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^ \
+ ((y>>4 & 1) * xtime(xtime(xtime(xtime(x)))))) \
+
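+/* e.g. multiply(0x57, 0x13) == 0xfe: {57}*{13} = {57}*({01}^{02}^{10}) = {57}^{ae}^{07} = {fe} (FIPS-197 4.2.1) */
+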
+/* mix columns */
+
+static void aes_decode_mix (aes_block block)
+{
+ int i;
+ uint8_t a, b, c, d;
+
+ for(i = 0; i < 4; ++i)
+ {
+ a = block[i][0];
+ b = block[i][1];
+ c = block[i][2];
+ d = block[i][3];
+ block[i][0] = multiply(a, 0x0e) ^ multiply(b, 0x0b) ^ multiply(c, 0x0d) ^ multiply(d, 0x09);
+ block[i][1] = multiply(a, 0x09) ^ multiply(b, 0x0e) ^ multiply(c, 0x0b) ^ multiply(d, 0x0d);
+ block[i][2] = multiply(a, 0x0d) ^ multiply(b, 0x09) ^ multiply(c, 0x0e) ^ multiply(d, 0x0b);
+ block[i][3] = multiply(a, 0x0b) ^ multiply(b, 0x0d) ^ multiply(c, 0x09) ^ multiply(d, 0x0e);
+ }
+}
+
+/* inverse substitution */
+
+static void aes_decode_sub (aes_block block)
+{
+ uint8_t i, j, v;
+ for(i = 0; i < 4; ++i)
+ for(j = 0; j < 4; ++j)
+ v = block[i][j], block[i][j] = rsbox[v];
+}
+
+/* inverse shift rows */
+
+static void aes_decode_shift (aes_block block)
+{
+ uint8_t tmp;
+
+ /* 1st row rotated once right */
+ tmp = block[3][1];
+ block[3][1] = block[2][1];
+ block[2][1] = block[1][1];
+ block[1][1] = block[0][1];
+ block[0][1] = tmp;
+
+  /* 2nd row rotated twice right */
+ tmp = block[0][2];
+ block[0][2] = block[2][2];
+ block[2][2] = tmp;
+ tmp = block[1][2];
+ block[1][2] = block[3][2];
+ block[3][2] = tmp;
+
+ /* 3rd row rotated 3 times right */
+ tmp = block[0][3];
+ block[0][3] = block[1][3];
+ block[1][3] = block[2][3];
+ block[2][3] = block[3][3];
+ block[3][3] = tmp;
+}
+
+/* aes block encoder */
+
+static void aes_encode_cipher (aes_state *state)
+{
+ uint8_t round;
+ aes_add_key(state->block, state->keyblock, 0);
+ for (round = 1; round < state->rounds; ++round)
+ {
+ aes_encode_sub(state->block);
+ aes_encode_shift(state->block);
+ aes_encode_mix(state->block);
+ aes_add_key(state->block, state->keyblock, round);
+ }
+ aes_encode_sub(state->block);
+ aes_encode_shift(state->block);
+ aes_add_key(state->block, state->keyblock, state->rounds);
+}
+
+/* aes block decoder */
+
+static void aes_decode_cipher (aes_state *state)
+{
+ uint8_t round;
+ aes_add_key(state->block, state->keyblock, state->rounds);
+ for(round = state->rounds - 1; round > 0; --round)
+ {
+ aes_decode_shift(state->block);
+ aes_decode_sub(state->block);
+ aes_add_key(state->block, state->keyblock, round);
+ aes_decode_mix(state->block);
+ }
+ aes_decode_shift(state->block);
+ aes_decode_sub(state->block);
+ aes_add_key(state->block, state->keyblock, 0);
+}
+
+/* tail block padding; RFC 2898, PKCS #5: Password-Based Cryptography Specification Version 2.0; pdf spec p. 119 */
+
+#define aes_padding(state) ((state->flags & AES_NULL_PADDING) == 0)
+
+static void aes_put_padding (aes_state *state, uint8_t length)
+{
+ uint8_t pad;
+ pad = (aes_padding(state)) ? 16 - length : 0;
+ for (; length < 16; ++length)
+ state->data[length] = state->iv[length] ^ pad;
+}
+
+static int aes_remove_padding (aes_state *state, uint8_t *data, uint8_t *length)
+{
+ uint8_t pad;
+ *length = 16; /* block length 16 means leave intact */
+ if (aes_padding(state))
+ {
+ pad = data[16 - 1];
+ if (pad > 16)
+ return IOFERR;
+ for ( ; *length > 16 - pad; --(*length))
+ if (data[*length - 1] != pad)
+ return IOFERR;
+ }
+ else
+ {
+ for ( ; *length > 0; --(*length))
+ if (data[*length - 1] != '\0')
+ break;
+ }
+ return IOFEOF;
+}
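+
+/*
+For example, with default padding a 5-byte tail is completed with eleven 0x0b bytes, and an input
+length divisible by 16 gets a whole extra block of sixteen 0x10 bytes; aes_remove_padding() reads
+the last byte and strips that many bytes back.
+*/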
+
+/* aes codec */
+
+/* make the cipher on input xor-ed with iv, save the output as a new iv, write the output */
+#define aes_encode_output(state, output) \
+ (aes_encode_cipher(state), aes_copy_block(state->iv, state->data), aes_copy_block(output, state->data), output += 16)
+
+iof_status aes_encode_state (iof *I, iof *O, aes_state *state)
+{
+ int c;
+
+  if (!(state->flags & AES_HAS_IV)) // should not happen; encoder initialization always sets an iv
+ return IOFERR;
+ if ((state->flags & AES_INLINE_IV) && !(state->flags & AES_CONTINUE))
+ { /* write iv at the beginning of encrypted data */
+ if (!iof_ensure(O, 16))
+ return IOFFULL;
+ aes_copy_block(O->pos, state->iv);
+ O->pos += 16;
+ state->flags |= AES_CONTINUE;
+ }
+ while (iof_ensure(O, 16))
+ {
+ while (state->buffered < 16)
+ {
+ if ((c = iof_get(I)) != IOFEOF)
+ { /* get input byte XORed with iv */
+ state->data[state->buffered] = state->iv[state->buffered] ^ ((uint8_t)c);
+ ++state->buffered;
+ }
+ else
+ {
+ if (state->flush)
+ {
+ if (state->buffered > 0 || aes_padding(state))
+          { /* pad the last input chunk; for input divisible by 16, add 16 bytes of value 0x10 */
+ aes_put_padding(state, state->buffered);
+ state->buffered = 16;
+ aes_encode_output(state, O->pos);
+ }
+ return IOFEOF;
+ }
+ else
+ return IOFEMPTY;
+ }
+ }
+ aes_encode_output(state, O->pos);
+ state->buffered = 0;
+ }
+ return IOFFULL;
+}
+
+/* write iv to the output, save the raw input just buffered as iv for the next chunk, make the cipher, write out xoring with iv */
+#define aes_decode_output(state, output) \
+ (aes_copy_block(output, state->iv), aes_copy_block(state->iv, state->data), aes_decode_cipher(state), aes_copy_cbc(output, state->data), output += 16)
+
+iof_status aes_decode_state (iof *I, iof *O, aes_state *state)
+{
+ int c, ret;
+ uint8_t lastlength;
+
+ if ((state->flags & AES_INLINE_IV) && !(state->flags & AES_CONTINUE))
+ {
+ while (state->buffered < 16)
+ {
+ if ((c = iof_get(I)) != IOFEOF)
+ state->iv[state->buffered++] = (uint8_t)c;
+ else
+ return state->flush ? IOFERR : IOFEMPTY;
+ }
+ state->flags |= AES_CONTINUE|AES_HAS_IV;
+ state->buffered = 0;
+ }
+ while (iof_ensure(O, 16))
+ {
+ while (state->buffered < 16)
+ {
+ if ((c = iof_get(I)) != IOFEOF)
+ state->data[state->buffered++] = (uint8_t)c;
+ else
+ return state->flush ? IOFERR : IOFEMPTY;
+ }
+ aes_decode_output(state, O->pos);
+ if (state->flush)
+    { /* we have to check for EOF here, to remove any padding */
+ if ((c = iof_get(I)) < 0)
+ { /* end of input at 16-bytes boundary; remove padding and quit */
+ ret = aes_remove_padding(state, O->pos - 16, &lastlength);
+ O->pos -= 16 - lastlength;
+ return ret;
+ }
+ else
+ { /* beginning of the next block */
+ state->buffered = 1;
+ state->data[0] = (uint8_t)c;
+ }
+ }
+ else
+ state->buffered = 0;
+ }
+ return IOFFULL;
+}
+
+/* variants that work on C-strings; can work in place (output==input), except the encoder in pdf flavour */
+
+/*
+Codecs operating on C-strings can generally work in place (output==input), except the encoder with the AES_INLINE_IV flag set,
+which outputs 16 bytes of initialization vector at the beginning of the encrypted data. All return the size of encrypted/decrypted
+data. The encoder's output is the original length padded to a multiple of 16 bytes (plus an extra 16 bytes of initialization
+vector, if AES_INLINE_IV is used). Default padding is unambiguously removed during decryption. The AES_NULL_PADDING flag
+forces (ambiguous) NULL-byte padding, applied only if the input length modulo 16 is greater than zero.
+
+The input is supposed to be the complete data to be encrypted or decrypted. It is possible, however, to use these
+codecs for scattered data chunks by manipulating the AES_INLINE_IV, AES_NULL_PADDING, AES_CONTINUE flags and the data length.
+The caller may assume that C-string codecs do not modify state flags.
+
+The encoder could actually be optimized by writing the initialization vector to the state block once. After every chunk encryption,
+the output is the initialization vector for the next chunk. Since we use the C-string codec variants on short strings,
+the gain is negligible in comparison with the weight of the aes crypt procedure.
+*/
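+
+/*
+An illustrative round trip (not part of the library; an all-zero key, for brevity):
+
+void aes_example (void)
+{
+  const uint8_t key[16] = { 0 };
+  uint8_t cipher[48], plain[32]; // 16 inline iv bytes + 20 input bytes padded to 32; the decoder needs room for the padded length
+  size_t ciphersize, plainsize;
+  ciphersize = aes_encode_data("twenty bytes of data", 20, cipher, key, 16, NULL, AES_INLINE_IV);
+  plainsize = aes_decode_data(cipher, ciphersize, plain, key, 16, NULL, AES_INLINE_IV);
+  // ciphersize == 48, plainsize == 20, plain holds the original bytes
+}
+*/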
+
+size_t aes_encode_data (const void *input, size_t length, void *output, const void *key, size_t keylength, const void *iv, int flags)
+{
+ aes_state state;
+ aes_keyblock keyblock;
+
+ if (aes_encode_initialize(&state, &keyblock, key, keylength, iv) == NULL)
+ return 0;
+ state.flags |= flags;
+ return aes_encode_state_data(&state, input, length, output);
+ // aes_state_close(&state);
+}
+
+size_t aes_encode_state_data (aes_state *state, const void *input, size_t length, void *output)
+{
+ const uint8_t *inp;
+ uint8_t *out, tail, t;
+ size_t size;
+
+ inp = (const uint8_t *)input;
+ out = (uint8_t *)output;
+
+ if (!(state->flags & AES_HAS_IV))
+ return 0;
+ if ((state->flags & AES_INLINE_IV) && !(state->flags & AES_CONTINUE))
+ {
+ aes_copy_block(out, state->iv);
+ out += 16;
+ }
+ // state->flags |= AES_CONTINUE; // do not modify state flags
+
+ for (size = 0; size + 16 <= length; size += 16)
+ {
+ aes_copy_xor(state->data, inp, state->iv);
+ aes_encode_output(state, out);
+ inp += 16;
+ }
+
+ if ((tail = (length % 16)) > 0 || aes_padding(state))
+ {
+ for (t = 0; t < tail; ++t)
+ state->data[t] = inp[t] ^ state->iv[t];
+ aes_put_padding(state, tail);
+ aes_encode_output(state, out);
+ size += 16;
+ }
+ if (state->flags & AES_INLINE_IV)
+ size += 16; /* iv written at the beginning of encoded data */
+
+ return size;
+}
+
+size_t aes_decode_data (const void *input, size_t length, void *output, const void *key, size_t keylength, const void *iv, int flags)
+{
+ aes_state state;
+ aes_keyblock keyblock;
+
+ if (aes_decode_initialize(&state, &keyblock, key, keylength, iv) == NULL)
+ return 0;
+ state.flags |= flags;
+ return aes_decode_state_data(&state, input, length, output);
+ // aes_state_close(&state);
+}
+
+size_t aes_decode_state_data (aes_state *state, const void *input, size_t length, void *output)
+{
+ const uint8_t *inp;
+ uint8_t *out, lastlength;
+ size_t size;
+
+ inp = (const uint8_t *)input;
+ out = (uint8_t *)output;
+
+ if ((state->flags & AES_INLINE_IV) && !(state->flags & AES_CONTINUE))
+ {
+ aes_copy_block(state->iv, inp);
+ // state->flags |= AES_HAS_IV; // do not modify state flags
+ inp += 16;
+ length = length >= 16 ? length - 16 : 0;
+ }
+ else if (!(state->flags & AES_HAS_IV))
+ return 0;
+ // state->flags |= AES_CONTINUE; // do not modify state flags
+ for (size = 0; size + 16 <= length; size += 16)
+ {
+ aes_copy_block(state->data, inp);
+ aes_decode_output(state, out);
+ inp += 16;
+ }
+
+ if (size >= 16)
+ {
+ aes_remove_padding(state, out - 16, &lastlength);
+ size = size - 16 + lastlength;
+ }
+
+ return size;
+}
+
+/*
+Pseudo-random bytes chain excerpted from eexec; not expected to have strong cryptographic properties.
+We only expect it to be (reasonably) unique and different for each call (not only each function call, but also
+each program run). The current trick of mangling a pointer value gives satisfactory results, generally different
+for every function call and program run. Note that the pseudo-input bytes start from some inner address
+bits, as those vary better; without that, the first byte tends to be "lazy".
+*/
+
+void random_bytes (uint8_t *output, size_t size)
+{
+ size_t i;
+ uint8_t p;
+ static uint16_t k = 55665;
+ for (i = 0; i < size; ++i)
+ {
+ p = ((uint8_t *)(&output))[(i + 2) % sizeof(uint8_t *)] ^ (uint8_t)size; // pseudo input byte ;)
+ k = (((p + k) * 52845 + 22719) & 65535); // xor-ed with pseudo-random sequence (kept between calls)
+ output[i] = p ^ (k >> 8);
+ }
+}
+
+void aes_generate_iv (uint8_t output[16])
+{
+ random_bytes(output, 16);
+}
+
+/* filters */
+
+// rc4 decoder function
+
+static size_t rc4_decoder (iof *F, iof_mode mode)
+{
+ rc4_state *state;
+ iof_status status;
+ size_t tail;
+
+ state = iof_filter_state(rc4_state *, F);
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ do {
+ status = rc4_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "rc4", status);
+ case IOFCLOSE:
+ rc4_state_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// rc4 encoder function
+
+static size_t rc4_encoder (iof *F, iof_mode mode)
+{
+ rc4_state *state;
+ iof_status status;
+
+ state = iof_filter_state(rc4_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = rc4_encode_state(F, F->next, state);
+ return iof_encoder_retval(F, "rc4", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ rc4_encoder(F, IOFFLUSH);
+ rc4_state_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// aes decoder function
+
+static size_t aes_decoder (iof *F, iof_mode mode)
+{
+ aes_state *state;
+ iof_status status;
+ size_t tail;
+
+ state = iof_filter_state(aes_state *, F);
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ do {
+ status = aes_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "aes", status);
+ case IOFCLOSE:
+ aes_state_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// aes encoder function
+
+static size_t aes_encoder (iof *F, iof_mode mode)
+{
+ aes_state *state;
+ iof_status status;
+
+ state = iof_filter_state(aes_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = aes_encode_state(F, F->next, state);
+ return iof_encoder_retval(F, "aes", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ aes_encoder(F, IOFFLUSH);
+ aes_state_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+iof * iof_filter_rc4_decoder (iof *N, const void *key, size_t keylength)
+{
+ iof *I;
+ crypt_state_pointer P;
+
+ I = iof_filter_reader(rc4_decoder, sizeof(rc4_state), &P.voidstate);
+ iof_setup_next(I, N);
+ if (rc4_state_init(P.rc4state, key, keylength) == NULL)
+ {
+ iof_discard(I);
+ return NULL;
+ }
+ P.rc4state->flush = 1;
+ return I;
+}
+
+iof * iof_filter_rc4_encoder (iof *N, const void *key, size_t keylength)
+{
+ iof *O;
+ crypt_state_pointer P;
+
+ O = iof_filter_writer(rc4_encoder, sizeof(rc4_state), &P.voidstate);
+ iof_setup_next(O, N);
+ if (rc4_state_init(P.rc4state, key, keylength) == NULL)
+ {
+ iof_discard(O);
+ return NULL;
+ }
+ // P.rc4state->flush = 1;
+ return O;
+}
+
+/* aes crypt filters */
+
+iof * iof_filter_aes_decoder (iof *N, const void *key, size_t keylength)
+{
+ iof *I;
+ crypt_state_pointer P;
+
+ I = iof_filter_reader(aes_decoder, sizeof(aes_state), &P.voidstate);
+ iof_setup_next(I, N);
+ if (aes_decode_init(P.aesstate, key, keylength) == NULL)
+ {
+ iof_discard(I);
+ return NULL;
+ }
+ aes_pdf_mode(P.aesstate);
+ P.aesstate->flush = 1;
+ return I;
+}
+
+iof * iof_filter_aes_encoder (iof *N, const void *key, size_t keylength)
+{
+ iof *O;
+ crypt_state_pointer P;
+
+ O = iof_filter_writer(aes_encoder, sizeof(aes_state), &P.voidstate);
+ iof_setup_next(O, N);
+ if (aes_encode_init(P.aesstate, key, keylength) == NULL)
+ {
+ iof_discard(O);
+ return NULL;
+ }
+ aes_pdf_mode(P.aesstate);
+ // P.aesstate->flush = 1;
+ return O;
+}
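+
+/*
+An illustrative usage sketch (not part of the library; consume() is a hypothetical callback):
+decrypting an encrypted source is a matter of chaining through it,
+
+  iof *I = iof_filter_aes_decoder(N, key, keylength); // N is the raw encrypted source
+  int c;
+  if (I != NULL)
+    while ((c = iof_get(I)) >= 0)
+      consume((uint8_t)c); // process decrypted bytes one by one
+
+with the filter state disposed when the handler receives IOFCLOSE.
+*/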
+
+/* test */
+
+/*
+static void show (void *p, size_t size, uint8_t round, uint8_t sym)
+{
+ uint8_t i;
+ printf("%c%c:", round, sym);
+ for (i = 0; i < size; ++i)
+ printf("%02x", ((uint8_t *)p)[i]);
+ printf("\n");
+}
+
+void aes_test (void)
+{
+ const uint8_t key[] = { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c };
+ const uint8_t iv[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
+ const uint8_t inp[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 };
+ const uint8_t out[] = {
+ 0x76, 0x49, 0xab, 0xac, 0x81, 0x19, 0xb2, 0x46, 0xce, 0xe9, 0x8e, 0x9b, 0x12, 0xe9, 0x19, 0x7d,
+ 0x50, 0x86, 0xcb, 0x9b, 0x50, 0x72, 0x19, 0xee, 0x95, 0xdb, 0x11, 0x3a, 0x91, 0x76, 0x78, 0xb2,
+ 0x73, 0xbe, 0xd6, 0xb8, 0xe3, 0xc1, 0x74, 0x3b, 0x71, 0x16, 0xe6, 0x9e, 0x22, 0x22, 0x95, 0x16,
+ 0x3f, 0xf1, 0xca, 0xa1, 0x68, 0x1f, 0xac, 0x09, 0x12, 0x0e, 0xca, 0x30, 0x75, 0x86, 0xe1, 0xa7 };
+
+ uint8_t input[64], output[64];
+ size_t inpsize, outsize;
+ int flags = AES_NULL_PADDING;
+
+ ////////////////////////////////////////////////////////////////////////////
+
+//#define ENCODETO output
+#define ENCODETO input // inplace
+
+ inpsize = 64;
+ memcpy(input, inp, inpsize);
+ show(input, inpsize, '>', '>');
+ outsize = aes_encode_data(input, inpsize, ENCODETO, key, 16, iv, flags);
+ show(ENCODETO, outsize, '<', '<');
+ if (outsize == inpsize && memcmp(ENCODETO, out, outsize) == 0)
+ printf("ENCODER SUCCESS\n");
+ else
+ printf("ENCODER FAILURE\n");
+
+ ////////////////////////////////////////////////////////////////////////////
+
+//#define DECODETO input
+#define DECODETO output // in place
+
+ outsize = 64;
+ memcpy(output, out, outsize);
+ show(output, outsize, '<', '<');
+ inpsize = aes_decode_data(output, outsize, DECODETO, key, 16, iv, flags);
+ show(DECODETO, inpsize, '>', '>');
+ if (inpsize == outsize && memcmp(DECODETO, inp, inpsize) == 0)
+ printf("DECODER SUCCESS\n");
+ else
+ printf("DECODER FAILURE\n");
+}
+*/
+
+/*
+Some example vectors
+
+================================ AES ECB 128-bit encryption mode ================================
+
+Encryption key: 2b7e151628aed2a6abf7158809cf4f3c
+
+Test vector Cipher text
+6bc1bee22e409f96e93d7e117393172a 3ad77bb40d7a3660a89ecaf32466ef97
+ae2d8a571e03ac9c9eb76fac45af8e51 f5d3d58503b9699de785895a96fdbaaf
+30c81c46a35ce411e5fbc1191a0a52ef 43b1cd7f598ece23881b00e3ed030688
+f69f2445df4f9b17ad2b417be66c3710 7b0c785e27e8ad3f8223207104725dd4
+
+
+================================ AES ECB 192-bit encryption mode ================================
+
+Encryption key: 8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b
+
+Test vector Cipher text
+6bc1bee22e409f96e93d7e117393172a bd334f1d6e45f25ff712a214571fa5cc
+ae2d8a571e03ac9c9eb76fac45af8e51 974104846d0ad3ad7734ecb3ecee4eef
+30c81c46a35ce411e5fbc1191a0a52ef ef7afd2270e2e60adce0ba2face6444e
+f69f2445df4f9b17ad2b417be66c3710 9a4b41ba738d6c72fb16691603c18e0e
+
+
+================================ AES ECB 256-bit encryption mode ================================
+
+Encryption key: 603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4
+
+Test vector Cipher text
+6bc1bee22e409f96e93d7e117393172a f3eed1bdb5d2a03c064b5a7e3db181f8
+ae2d8a571e03ac9c9eb76fac45af8e51 591ccb10d410ed26dc5ba74a31362870
+30c81c46a35ce411e5fbc1191a0a52ef b6ed21b99ca6f4f9f153e7b1beafed1d
+f69f2445df4f9b17ad2b417be66c3710 23304b7a39f9f3ff067d8d8f9e24ecc7
+
+================================ AES CBC 128-bit encryption mode ================================
+
+Encryption key: 2b7e151628aed2a6abf7158809cf4f3c
+
+Initialization vector Test vector Cipher text
+000102030405060708090A0B0C0D0E0F 6bc1bee22e409f96e93d7e117393172a 7649abac8119b246cee98e9b12e9197d
+7649ABAC8119B246CEE98E9B12E9197D ae2d8a571e03ac9c9eb76fac45af8e51 5086cb9b507219ee95db113a917678b2
+5086CB9B507219EE95DB113A917678B2 30c81c46a35ce411e5fbc1191a0a52ef 73bed6b8e3c1743b7116e69e22229516
+73BED6B8E3C1743B7116E69E22229516 f69f2445df4f9b17ad2b417be66c3710 3ff1caa1681fac09120eca307586e1a7
+
+================================ AES CBC 192-bit encryption mode ================================
+
+Encryption key: 8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b
+
+Initialization vector Test vector Cipher text
+000102030405060708090A0B0C0D0E0F 6bc1bee22e409f96e93d7e117393172a 4f021db243bc633d7178183a9fa071e8
+4F021DB243BC633D7178183A9FA071E8 ae2d8a571e03ac9c9eb76fac45af8e51 b4d9ada9ad7dedf4e5e738763f69145a
+B4D9ADA9AD7DEDF4E5E738763F69145A 30c81c46a35ce411e5fbc1191a0a52ef 571b242012fb7ae07fa9baac3df102e0
+571B242012FB7AE07FA9BAAC3DF102E0 f69f2445df4f9b17ad2b417be66c3710 08b0e27988598881d920a9e64f5615cd
+
+================================ AES CBC 256-bit encryption mode ================================
+
+Encryption key: 603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4
+
+Initialization vector Test vector Cipher text
+000102030405060708090A0B0C0D0E0F 6bc1bee22e409f96e93d7e117393172a f58c4c04d6e5f1ba779eabfb5f7bfbd6
+F58C4C04D6E5F1BA779EABFB5F7BFBD6 ae2d8a571e03ac9c9eb76fac45af8e51 9cfc4e967edb808d679f777bc6702c7d
+9CFC4E967EDB808D679F777BC6702C7D 30c81c46a35ce411e5fbc1191a0a52ef 39f23369a9d9bacfa530e26304231461
+39F23369A9D9BACFA530E26304231461 f69f2445df4f9b17ad2b417be66c3710 b2eb05e2c39be9fcda6c19078c6a9d1b
+*/ \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilcrypt.h b/source/luametatex/source/libraries/pplib/util/utilcrypt.h
new file mode 100644
index 000000000..e5bf53cc5
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilcrypt.h
@@ -0,0 +1,90 @@
+#ifndef UTIL_CRYPT_H
+#define UTIL_CRYPT_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "utiliof.h"
+
+#ifndef UTIL_CRYPT_TIME
+# define UTIL_CRYPT_TIME 0
+#endif
+
+/* RC4 */
+
+typedef uint8_t rc4_map[256];
+
+typedef struct rc4_state rc4_state;
+
+#define RC4_STATE_ALLOC (1<<0)
+
+UTILAPI rc4_state * rc4_state_initialize (rc4_state *state, rc4_map *map, const void *vkey, size_t keylength);
+#define rc4_state_init(state, vkey, keylength) rc4_state_initialize(state, NULL, vkey, keylength)
+UTILAPI void rc4_map_save (rc4_state *state, rc4_map *map);
+UTILAPI void rc4_map_restore (rc4_state *state, rc4_map *map);
+
+/* Codecs operating on iof */
+
+UTILAPI iof_status rc4_crypt_state (iof *I, iof *O, rc4_state *state);
+#define rc4_encode_state(I, O, state) rc4_crypt_state(I, O, state)
+#define rc4_decode_state(I, O, state) rc4_crypt_state(I, O, state)
+
+UTILAPI iof_status rc4_crypt (iof *I, iof *O, const void *key, size_t keylength);
+#define rc4_encode(I, O, key, keylength) rc4_crypt(I, O, key, keylength)
+#define rc4_decode(I, O, key, keylength) rc4_crypt(I, O, key, keylength)
+
+UTILAPI size_t rc4_crypt_data (const void *input, size_t length, void *output, const void *key, size_t keylength);
+UTILAPI size_t rc4_crypt_state_data (rc4_state *state, const void *input, size_t length, void *output);
+#define rc4_encode_data(input, length, output, key, keylength) rc4_crypt_data(input, length, output, key, keylength)
+#define rc4_decode_data(input, length, output, key, keylength) rc4_crypt_data(input, length, output, key, keylength)
+#define rc4_encode_state_data(state, input, length, output) rc4_crypt_state_data(state, input, length, output)
+#define rc4_decode_state_data(state, input, length, output) rc4_crypt_state_data(state, input, length, output)
+
+UTILAPI void rc4_state_close (rc4_state *state);
+
+/* AES */
+
+typedef uint8_t aes_block[4][4];
+typedef aes_block aes_keyblock[15]; // aes128 - 10+1, aes192 - 12+1, aes256 - 14+1
+
+typedef struct aes_state aes_state;
+
+#define AES_STATE_ALLOC (1<<0)
+//#define AES_ECB_MODE (1<<2)
+#define AES_HAS_IV (1<<3)
+#define AES_INLINE_IV (1<<4)
+#define AES_CONTINUE (1<<5)
+#define AES_NULL_PADDING (1<<6)
+
+UTILAPI void aes_pdf_mode (aes_state *state);
+//UTILAPI aes_state * aes_state_initialize_ecb (aes_state *State, uint8_t *roundkey, const uint8_t *key);
+UTILAPI aes_state * aes_encode_initialize (aes_state *state, aes_keyblock *keyblock, const void *key, size_t keylength, const void *iv);
+UTILAPI aes_state * aes_decode_initialize (aes_state *state, aes_keyblock *keyblock, const void *key, size_t keylength, const void *iv);
+#define aes_encode_init(state, key, keylength) aes_encode_initialize(state, NULL, key, keylength, NULL)
+#define aes_decode_init(state, key, keylength) aes_decode_initialize(state, NULL, key, keylength, NULL)
+
+UTILAPI void aes_state_close (aes_state *state);
+
+/* Codecs operating on iof */
+
+UTILAPI iof_status aes_encode_state (iof *I, iof *O, aes_state *state);
+UTILAPI iof_status aes_decode_state (iof *I, iof *O, aes_state *state);
+
+UTILAPI size_t aes_encode_data (const void *input, size_t length, void *output, const void *key, size_t keylength, const void *iv, int flags);
+UTILAPI size_t aes_encode_state_data (aes_state *state, const void *input, size_t length, void *output);
+UTILAPI size_t aes_decode_data (const void *input, size_t length, void *output, const void *key, size_t keylength, const void *iv, int flags);
+UTILAPI size_t aes_decode_state_data (aes_state *state, const void *input, size_t length, void *output);
+
+/* random bytes generator */
+
+UTILAPI void random_bytes (uint8_t *output, size_t size);
+UTILAPI void aes_generate_iv (uint8_t output[16]);
+
+/* filters */
+
+iof * iof_filter_rc4_decoder (iof *N, const void *key, size_t keylength);
+iof * iof_filter_rc4_encoder (iof *N, const void *key, size_t keylength);
+
+iof * iof_filter_aes_decoder (iof *N, const void *key, size_t keylength);
+iof * iof_filter_aes_encoder (iof *N, const void *key, size_t keylength);
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilcryptdef.h b/source/luametatex/source/libraries/pplib/util/utilcryptdef.h
new file mode 100644
index 000000000..d43ea2e5b
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilcryptdef.h
@@ -0,0 +1,32 @@
+
+#ifndef UTIL_CRYPTDEF_H
+#define UTIL_CRYPTDEF_H
+
+struct rc4_state {
+ union {
+ rc4_map *map;
+ uint8_t *smap;
+ };
+ int i, j;
+ int flush;
+ int flags;
+};
+
+struct aes_state {
+ size_t keylength;
+ int rounds;
+ //int keywords;
+ union {
+ aes_block block;
+ uint8_t data[16];
+ };
+ aes_keyblock *keyblock;
+ uint8_t iv[16];
+ uint8_t buffered;
+ int flush;
+ int flags;
+};
+
+typedef union { rc4_state *rc4state; aes_state *aesstate; void *voidstate; } crypt_state_pointer; // to avoid 'dereferencing type-punned ...' warnings
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utildecl.h b/source/luametatex/source/libraries/pplib/util/utildecl.h
new file mode 100644
index 000000000..b11e5b884
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utildecl.h
@@ -0,0 +1,28 @@
+
+#ifndef UTIL_DECL_H
+#define UTIL_DECL_H
+
+/*
+UTILDLL - when building .dll
+UTILEXE - when building .exe to import symbols from .dll
+*/
+
+#if defined (_WIN32) || defined(_WIN64)
+# ifdef UTILDLL
+# define UTILAPI __declspec(dllexport)
+# define UTILDEF __declspec(dllexport)
+# else
+# ifdef UTILEXE
+# define UTILAPI __declspec(dllimport)
+# define UTILDEF
+# else
+# define UTILAPI
+# define UTILDEF
+# endif
+# endif
+#else
+# define UTILAPI
+# define UTILDEF
+#endif
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilflate.c b/source/luametatex/source/libraries/pplib/util/utilflate.c
new file mode 100644
index 000000000..eaff44cce
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilflate.c
@@ -0,0 +1,322 @@
+
+#include "../../utilities/auxzlib.h"
+
+#include "utilmem.h"
+#include "utillog.h"
+#include "utilflate.h"
+
+/* flate codec */
+
+/*
+Flate codec example provided at http://www.zlib.net/zpipe.c (http://www.zlib.net/zlib_how.html) uses the following scheme:
+- provide input data buffer
+- keep providing output until codec function uses it
+
+For encoder:
+
+ z->zalloc = z->zfree = z->zopaque = NULL;
+ deflateInit(z, compression_level);
+ do {
+ z->next_in = <input buffer>
+ z->avail_in = <input buffer bytes>
+ do {
+ z->next_out = <output buffer>
+ z->avail_out = <output buffer bytes>
+ deflate(z, flush);
+ // write obtained output from deflate
+ } while (z->avail_out == 0);
+ assert(z->avail_in == 0);
+ } while (flush != Z_FINISH);
+ deflateEnd(z);
+
+'z' is an internal codec state of type z_stream, 'flush' is either Z_NO_FLUSH or Z_FINISH at the end of data.
+deflate() is guaranteed to consume the entire input if there are no obstacles to writing output. The inner loop
+provides output space as long as it is used by deflate(). When deflate() has written everything it could,
+it leaves z->avail_out > 0, which breaks the inner loop. At this point z->avail_in should also be zero.
+The example documentation claims that the return codes from deflate() don't really need to be checked,
+as checking z->avail_out for zero is enough.
+
+The scheme for the decoder is pretty similar, but with substantial differences:
+- the end of stream is found automatically by the decoder, so using the Z_FINISH flag to indicate the end of stream
+  is not necessary, but if provided, it MUST be given only if the EOF marker actually occurs in the input chunk,
+  and subsequent calls to inflate() must consequently use Z_FINISH
+- calling inflate() as long as it uses the output buffer provided still works for the decoder, but inflate()
+  does not guarantee consuming the entire input, as it reads only until the end-of-stream marker
+- the return code from inflate() must be checked to ensure the proper reaction to an invalid data stream and
+  to end-of-stream signals
+- initialization must set the input buffer to NULL or to some existing chunk (the latter helps zlib to perform
+  better on inflate(), but inflate() does the necessary setup on the first call anyway)
+
+ z->zalloc = z->zfree = z->zopaque = NULL;
+ z->next_in = NULL, z->avail_in = 0;
+ inflateInit(z);
+ do {
+ z->next_in = <input buffer>
+ z->avail_in = <input buffer bytes>
+ do {
+ z->next_out = <output buffer>
+ z->avail_out = <output buffer bytes>
+ status = inflate(z, flush);
+ // check return status
+ // write obtained output from inflate
+ } while (z->avail_out == 0);
+ } while (status != Z_STREAM_END);
+ inflateEnd(z);
+
+Our wrapper generally follows the "prepare input, keep pumping output" scheme, but we need to support handler function
+breaks on IOFEMPTY and IOFFULL. For a consistent comeback from those on subsequent calls to the handler function,
+we use 3 states:
+- FLATE_IN - get input; once we have something, go to FLATE_OUT
+- FLATE_OUT - set z_stream buffers and keep writing output as long as there is anything to write, then go to FLATE_IN or FLATE_DONE
+- FLATE_DONE - we are done, no return from that state
+The distinction between the FLATE_IN and FLATE_OUT states guarantees that we will not fetch more input until zlib consumes the stuff
+from the previous feed, possibly interrupted by an IOFFULL return on filling the output buffer. This distinction is not
+critical, but it makes the filter run according to the scheme described above. Note that we set the zlib input buffer
+(z->next_in, z->avail_in) at the beginning of the FLATE_OUT state. Also note that we always update our buffers according
+to the updated avail_in / avail_out values, just after a call to inflate() / deflate(). So no matter what happens
+between handler calls, the zlib input buffer stays in sync with ours.
+*/
+
+struct flate_state {
+ z_stream z;
+ int flush;
+ int status;
+ int level; /* encoder compression level -1..9 */
+};
+
+typedef union { flate_state *flatestate; void *voidstate; } flate_state_pointer; // to avoid 'dereferencing type-punned ...' warnings
+
+enum {
+ FLATE_IN,
+ FLATE_OUT,
+ FLATE_DONE
+};
+
+flate_state * flate_decoder_init (flate_state *state)
+{ /* initialize zlib */
+ z_stream *z = &state->z;
+ z->zalloc = &lmt_zlib_alloc; /* Z_NULL */
+ z->zfree = &lmt_zlib_free; /* Z_NULL */
+ z->opaque = Z_NULL;
+ z->avail_in = 0; /* must be initialized before inflateInit() */
+ z->next_in = Z_NULL; /* ditto */
+ if (inflateInit(z) != Z_OK)
+ return NULL;
+ state->status = FLATE_IN;
+ return state;
+}
+
+flate_state * flate_encoder_init (flate_state *state)
+{
+ z_stream *z = &state->z;
+ z->zalloc = &lmt_zlib_alloc; /* Z_NULL */
+ z->zfree = &lmt_zlib_free; /* Z_NULL */
+ z->opaque = Z_NULL;
+ z->avail_in = 0;
+ z->next_in = Z_NULL;
+ state->level = Z_DEFAULT_COMPRESSION; // will probably be moved upward
+ if (deflateInit(z, state->level) != Z_OK)
+ return NULL;
+ state->status = FLATE_IN;
+ return state;
+}
+
+static const char * zmess (int zstatus)
+{
+ switch (zstatus)
+ {
+ case Z_OK: return "ok";
+ case Z_STREAM_END: return "end of stream";
+ case Z_BUF_ERROR: return "buffer error";
+ case Z_STREAM_ERROR: return "stream error";
+ case Z_NEED_DICT: return "need dict";
+ case Z_DATA_ERROR: return "data error";
+ case Z_MEM_ERROR: return "memory error";
+ case Z_VERSION_ERROR: return "version error";
+ case Z_ERRNO: return "io error";
+ default:
+ break;
+ }
+ return "unknown error";
+}
+
+iof_status flate_decode_state (iof *I, iof *O, flate_state *state)
+{
+ z_stream *z;
+ int zstatus = Z_OK;
+ z = &state->z;
+ while (state->status != FLATE_DONE)
+ {
+ if (state->status == FLATE_IN)
+ {
+ if (!iof_readable(I))
+ return state->flush ? IOFERR : IOFEMPTY;
+ state->status = FLATE_OUT;
+ }
+ z->next_in = (Bytef *)I->pos;
+ z->avail_in = (uInt)iof_left(I);
+ do {
+ if (!iof_writable(O))
+ return IOFFULL;
+ z->next_out = (Bytef *)O->pos;
+ z->avail_out = (uInt)iof_left(O);
+ zstatus = inflate(z, Z_NO_FLUSH);
+ I->pos += iof_left(I) - z->avail_in;
+ O->pos += iof_left(O) - z->avail_out;
+ switch (zstatus)
+ {
+ case Z_OK:
+ case Z_STREAM_END:
+ break;
+ default:
+ loggerf("flate decoder %s (%d)", zmess(zstatus), zstatus);
+ return IOFERR;
+ }
+ } while (z->avail_out == 0);
+ state->status = zstatus == Z_STREAM_END ? FLATE_DONE : FLATE_IN;
+ }
+ return IOFEOF;
+}
+
+iof_status flate_encode_state (iof *I, iof *O, flate_state *state)
+{
+ z_stream *z;
+ int zstatus;
+ z = &state->z;
+ while (state->status != FLATE_DONE)
+ {
+ if (state->status == FLATE_IN)
+ {
+ if (!iof_readable(I))
+ if (!state->flush)
+ return IOFEMPTY;
+ state->status = FLATE_OUT;
+ }
+ z->next_in = (Bytef *)I->pos;
+ z->avail_in = (uInt)iof_left(I);
+ do {
+ if (!iof_writable(O))
+ return IOFFULL;
+ z->next_out = (Bytef *)O->pos;
+ z->avail_out = (uInt)iof_left(O);
+ zstatus = deflate(z, state->flush ? Z_FINISH : Z_NO_FLUSH);
+ I->pos += iof_left(I) - z->avail_in;
+ O->pos += iof_left(O) - z->avail_out;
+ switch (zstatus)
+ {
+ case Z_OK:
+ case Z_STREAM_END:
+ break;
+ default:
+ loggerf("flate encoder %s (%d)", zmess(zstatus), zstatus);
+ return IOFERR;
+ }
+ } while (z->avail_out == 0);
+ state->status = state->flush ? FLATE_DONE : FLATE_IN;
+ }
+ return IOFEOF;
+}
+
+
+void flate_decoder_close (flate_state *state)
+{
+ inflateEnd(&state->z);
+}
+
+void flate_encoder_close (flate_state *state)
+{
+ deflateEnd(&state->z);
+}
+
+/* filter */
+
+// flate decoder function
+
+static size_t flate_decoder (iof *F, iof_mode mode)
+{
+ flate_state *state;
+ iof_status status;
+ size_t tail;
+
+ state = iof_filter_state(flate_state *, F);
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ do {
+ status = flate_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "flate", status);
+ case IOFCLOSE:
+ flate_decoder_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// flate encoder function
+
+static size_t flate_encoder (iof *F, iof_mode mode)
+{
+ flate_state *state;
+ iof_status status;
+
+ state = iof_filter_state(flate_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = flate_encode_state(F, F->next, state);
+ return iof_encoder_retval(F, "flate", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ flate_encoder(F, IOFFLUSH);
+ flate_encoder_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+iof * iof_filter_flate_decoder (iof *N)
+{
+ iof *I;
+ flate_state_pointer P;
+ I = iof_filter_reader(flate_decoder, sizeof(flate_state), &P.voidstate);
+ iof_setup_next(I, N);
+ if (flate_decoder_init(P.flatestate) == NULL)
+ {
+ iof_discard(I);
+ return NULL;
+ }
+ P.flatestate->flush = 1;
+ return I;
+}
+
+iof * iof_filter_flate_encoder (iof *N)
+{
+ iof *O;
+ flate_state_pointer P;
+ O = iof_filter_writer(flate_encoder, sizeof(flate_state), &P.voidstate);
+ iof_setup_next(O, N);
+ if (flate_encoder_init(P.flatestate) == NULL)
+ {
+ iof_discard(O);
+ return NULL;
+ }
+ return O;
+}
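+
+/*
+An illustrative usage sketch (not part of the library): decompression mirrors the crypt filters,
+
+  iof *I = iof_filter_flate_decoder(N); // N delivers the raw deflate stream
+  // read inflated bytes with iof_get(I) until it returns a negative status
+*/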
diff --git a/source/luametatex/source/libraries/pplib/util/utilflate.h b/source/luametatex/source/libraries/pplib/util/utilflate.h
new file mode 100644
index 000000000..09bdd6661
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilflate.h
@@ -0,0 +1,21 @@
+#ifndef UTIL_FLATE_H
+#define UTIL_FLATE_H
+
+#include "utiliof.h"
+
+typedef struct flate_state flate_state;
+
+flate_state * flate_decoder_init (flate_state *state);
+flate_state * flate_encoder_init (flate_state *state);
+
+iof_status flate_decode_state (iof *I, iof *O, flate_state *state);
+iof_status flate_encode_state (iof *I, iof *O, flate_state *state);
+
+void flate_decoder_close (flate_state *state);
+void flate_encoder_close (flate_state *state);
+
+iof * iof_filter_flate_decoder (iof *N);
+iof * iof_filter_flate_encoder (iof *N);
+
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilfpred.c b/source/luametatex/source/libraries/pplib/util/utilfpred.c
new file mode 100644
index 000000000..9203c5e07
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilfpred.c
@@ -0,0 +1,778 @@
+/* predictor filters; common for flate and lzw */
+
+#include "utilmem.h"
+#include "utillog.h"
+#include "utilfpred.h"
+
+/*
+Here we implement predictor filters used with flate and lzw compressions in PDF streams. The main idea of data prediction
+is to compute and output the differences between data records instead of those records. Adjacent pixels in images are usually
+similar, so differences between pixel values tend to be zero. And both Flate and LZW perform better when the input
+is rather smooth. Although the primary use of predictors relates to bitmap data, the actual need for a predictor filter
+came from the fact that xref streams may also be predicted (usually with the PNG up-predictor).
+
+The PDF specification allows using several predictor algorithms, specified by the /Predictor key in the /DecodeParms dictionary:
+
+ 1 - no predictor (default)
+ 2 - TIFF horizontal predictor
+ 10 - PNG none predictor
+ 11 - PNG sub predictor
+ 12 - PNG up predictor
+ 13 - PNG average predictor
+ 14 - PNG paeth predictor
+
+All PNG predictors work on bytes, regardless of the image color depth. While encoding, every input data byte is decreased
+by the appropriate byte of the previous pixel. Even if the pixel does not fit a full byte, PNG predictors use an artificial
+pixel size rounded up to a full byte. PNG predictors utilize the previous (left) pixel, the pixel above, and the pixel
+above the previous one. In PNG, the type of the predictor is written in a dedicated byte at the beginning of every scanline,
+which means all predictor functions must maintain information about the left, above and left-above pixels.
+
+Despite the same differencing idea, TIFF predictors are different. The prediction process is based on pixel components,
+which are not necessarily bytes (a component of a pixel is added to / subtracted from the relevant component of the previous
+pixel). In TIFF predictor 2, only the previous (left) pixel is taken into account, so there is no need to keep
+information about other surrounding pixels. Also, there is no explicit algorithm marker in the data; the same prediction
+method is applied to all input rows.
+
+Not surprisingly, predictor encoders and decoders are pretty similar. Encoders take some input value and the previous
+input value (or 0 at the beginning of the scanline) and output the difference between them. Decoders take an input value
+and the previously decoded value (or zero) and output their sum. When encoding, the result is cast to the proper unsigned integer;
+when decoding, modulo 256 (or as appropriate) is used, which makes encoding and decoding lossless.
+
+Some extra bit trickery is involved in the TIFF predictor function when components don't fit byte boundaries. In that case,
+the input is treated as a bit stream. Every input byte is "buffered" in a larger integer, as its lower (right) bits.
+Every output value is taken from its higher (left) bits. In the special case of bits-per-component equal to 1, we buffer all
+pixel bits and use XOR to compute the bit difference between pixels. I've excerpted that trick from poppler, but I'm not
+really sure if it works any better, especially when the number of components per pixel is 1. In that case we do the hard
+bit-by-bit work anyway.
+
+In PNG prediction, we record every pixel byte (in decoded form) in state->rowsave. At the end of a scanline
+we copy state->rowsave to state->rowup, so that in the next scanline we can access the up-pixel byte.
+The left pixel byte is accessed as state->rowsave (the byte recently stored, or the virtual left edge byte \0).
+The up-left pixel byte is accessed via state->rowup, but with a state->pixelsize offset (same as the left byte, possibly \0
+at the left edge of the row). Both state->rowup and state->rowsave have a safe span of pixelsize bytes on the left
+that are permanently \0.
+*/
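+
+/*
+As an illustration, the up-predictor (12) alone reduces to a one-liner per byte:
+
+  out[i] = (uint8_t)(raw[i] - up[i]); // encoding: difference with the byte above
+  raw[i] = (uint8_t)(in[i] + up[i]);  // decoding: sum with the byte above, modulo 256
+
+where up[] is the decoded previous scanline (all zeros for the first row); this is precisely
+the role of state->rowup below.
+*/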
+
+#define predictor_component_t uint16_t
+#define predictor_pixel1b_t uint32_t
+
+#define MAX_COMPONENTS 8
+
+struct predictor_state {
+ int default_predictor; /* default predictor indicator */
+ int current_predictor; /* current predictor, possibly taken from algorithm marker in PNG data */
+ int rowsamples; /* number of pixels in a scanline (/DecodeParms << /Columns ... >>) */
+ int compbits; /* number of bits per component (/DecodeParms << /BitsPerComponent ... >>) */
+ int components; /* number of components (/DecodeParms << /Colors ... >>) */
+ uint8_t *buffer; /* temporary private buffer area */
+ uint8_t *rowin; /* an input row buffer position */
+ int rowsize; /* size of a current scanline in bytes (rounded up) */
+ int rowend; /* an input buffer end position */
+ int rowindex; /* an output buffer position */
+ union {
+ struct { /* used by PNG predictor codecs */
+ uint8_t *rowup, *rowsave; /* previous scanline buffers */
+ int predictorbyte; /* flag indicating that algorithm byte is read/written */
+ int pixelsize; /* number of bytes per pixel (rounded up) */
+ };
+ struct { /* used by TIFF predictor codecs */
+ predictor_component_t compbuffer[MAX_COMPONENTS];
+ union {
+ predictor_component_t *prevcomp; /* an array of left pixel components, typically eq ->compbuffer */
+ predictor_pixel1b_t *prevpixel; /* left pixel value stored on a single integer (for 1bit color-depth) */
+ };
+ int compin, compout; /* bit stream buffers */
+ int bitsin, bitsout; /* bit stream counters */
+ int sampleindex; /* pixel counter */
+ int compindex; /* component counter */
+ int pixbufsize; /* size of pixel buffer in bytes */
+ };
+ };
+ int flush;
+ int status;
+};
+
+typedef union { predictor_state *predictorstate; void *voidstate; } predictor_state_pointer; // to avoid 'dereferencing type-punned ...' warnings
+
+enum {
+ STATUS_LAST = 0,
+  STATUS_CONTINUE = 1 // any value different from IOFEOF, IOFERR, ... which are < 0
+};
+
+/*
+Predictor type identifiers (pdf spec p. 76). lpdf doesn't use the codec if the predictor is 1. Predictor 15 indicates
+that the type of the PNG prediction algorithm may change in subsequent lines. We always check the algorithm marker anyway.
+*/
+
+enum predictor_code {
+ NONE_PREDICTOR = 1,
+ TIFF_PREDICTOR = 2,
+ PNG_NONE_PREDICTOR = 10,
+ PNG_SUB_PREDICTOR = 11,
+ PNG_UP_PREDICTOR = 12,
+ PNG_AVERAGE_PREDICTOR = 13,
+ PNG_PAETH_PREDICTOR = 14,
+ PNG_OPTIMUM_PREDICTOR = 15
+};
+
+predictor_state * predictor_decoder_init (predictor_state *state, int predictor, int rowsamples, int components, int compbits)
+{
+ int rowsize, pixelsize;
+#define storage_pos(b, p, size) ((b = p), (p += size))
+ uint8_t *buffer, *p;
+ size_t buffersize;
+
+ pixelsize = (components * compbits + 7) >> 3; // to bytes, rounded up
+ rowsize = (rowsamples * components * compbits + 7) >> 3;
+
+ state->default_predictor = state->current_predictor = predictor;
+ state->rowsamples = rowsamples;
+ state->components = components;
+ state->compbits = compbits;
+
+ if (predictor == TIFF_PREDICTOR)
+ { /* tiff predictor */
+ size_t compbuf, pixbuf;
+ compbuf = components * sizeof(predictor_component_t);
+ pixbuf = 1 * sizeof(predictor_pixel1b_t);
+ state->pixbufsize = (int)(compbuf > pixbuf ? compbuf : pixbuf);
+ buffersize = rowsize * sizeof(uint8_t);
+ buffer = (uint8_t *)util_calloc(buffersize, 1);
+ if ((size_t)state->pixbufsize > sizeof(state->compbuffer)) // components > MAX_COMPONENTS
+ state->prevcomp = (predictor_component_t *)util_calloc(state->pixbufsize, 1);
+ else
+ state->prevcomp = state->compbuffer;
+ // &state->prevcomp == &state->prevpixel
+ state->sampleindex = state->compindex = 0;
+ state->bitsin = state->bitsout = 0;
+ state->compin = state->compout = 0;
+ }
+ else
+ { /* png predictors */
+ buffersize = (3 * rowsize + 2 * pixelsize + 1) * sizeof(uint8_t);
+ p = buffer = (uint8_t *)util_calloc(buffersize, 1);
+ storage_pos(state->rowin, p, 1 + rowsize); // one extra byte for prediction algorithm tag
+ p += pixelsize; // pixelsize extra bytes for the virtual left pixel at the edge, e.g. rowup[-1] (permanently \0)
+ storage_pos(state->rowup, p, rowsize); // actual row byte
+ p += pixelsize; // ditto
+ storage_pos(state->rowsave, p, rowsize);
+ state->pixelsize = pixelsize;
+ state->predictorbyte = 0;
+ }
+ state->buffer = buffer;
+ state->rowsize = rowsize;
+ state->rowindex = 0;
+ state->rowend = 0;
+ state->status = STATUS_CONTINUE;
+ return state;
+}
+
+predictor_state * predictor_encoder_init (predictor_state *state, int predictor, int rowsamples, int components, int compbits)
+{
+ return predictor_decoder_init(state, predictor, rowsamples, components, compbits);
+}
+
+void predictor_decoder_close (predictor_state *state)
+{
+ util_free(state->buffer);
+ if (state->default_predictor == TIFF_PREDICTOR && state->prevcomp != NULL && state->prevcomp != state->compbuffer)
+ util_free(state->prevcomp);
+}
+
+void predictor_encoder_close (predictor_state *state)
+{
+ predictor_decoder_close(state);
+}
+
+/*
+All predictor codecs first read an entire data row into a buffer. This is not crucial for the process,
+but it lets us separate the read/write states. In particular, there is a single place in which the codec
+functions may return on EOD.
+*/
+
+#define start_row(state) (state->rowindex = 0, state->rowin = state->buffer)
+
+static int read_scanline (predictor_state *state, iof *I, int size)
+{
+ int rowtail, left;
+ while ((rowtail = size - state->rowend) > 0)
+ {
+ left = (int)iof_left(I);
+ if (left >= rowtail)
+ {
+ memcpy(state->buffer + state->rowend, I->pos, (size_t)rowtail);
+ state->rowend += rowtail;
+ I->pos += rowtail;
+ start_row(state);
+ break;
+ }
+ else
+ {
+ if ((rowtail = left) > 0)
+ {
+ memcpy(state->buffer + state->rowend, I->pos, (size_t)rowtail);
+ state->rowend += rowtail;
+ I->pos += rowtail;
+ }
+ if (iof_input(I) == 0)
+ {
+ if (state->rowend == 0) // no scanline to process, no more input
+ return state->flush ? IOFEOF : IOFEMPTY;
+ /* If we are here, there is an incomplete scanline in the buffer:
+ - if there is a chance for more input (state->flush == 0), then wait for more,
+ - otherwise encode/decode the last incomplete line.
+ pdf spec p. 76 says that "A row occupies a whole number of bytes",
+ so this situation should be considered abnormal (not encountered so far).
+ */
+ if (!state->flush)
+ return IOFEMPTY;
+ loggerf("incomplete scanline in predictor filter");
+ //return IOFERR;
+ state->status = STATUS_LAST;
+ state->rowsize -= size - state->rowend;
+ start_row(state);
+ break;
+ }
+ }
+ }
+ return STATUS_CONTINUE;
+}
+
+#define read_row(state, I, size, status) if ((status = read_scanline(state, I, size)) != STATUS_CONTINUE) return status
+
+#define ensure_output_bytes(O, n) if (!iof_ensure(O, n)) return IOFFULL
+
+#define tobyte(c) ((uint8_t)(c))
+#define tocomp(c) ((uint16_t)(c))
+
+#define row_byte(state) (state->rowin[state->rowindex])
+
+/* png predictor macros; on bytes */
+
+#define up_pixel_byte(state) (state->rowup[state->rowindex])
+#define upleft_pixel_byte(state) (state->rowup[state->rowindex - state->pixelsize])
+#define left_pixel_byte(state) (state->rowsave[state->rowindex - state->pixelsize])
+#define save_pixel_byte(state, c) (state->rowsave[state->rowindex] = (uint8_t)(c))
+
+/* tiff predictor macros; on components */
+
+#define left_pixel_component(state) (state->prevcomp[state->compindex]) // tiff predictor with 2, 4, 8, 16 components
+#define left_pixel_value(state) (state->prevpixel[0]) // tiff predictor with 1bit components
+
+/* assignment in conditional
+#define save_pixel_component(state, c) ((void)\
+ ((state->prevcomp[state->compindex] = (predictor_component_t)(c)), \
+ ++state->compindex, (state->compindex < state->components || (state->compindex = 0))))
+*/
+#define save_pixel_component(state, c) \
+ do { state->prevcomp[state->compindex] = (predictor_component_t)(c); if (++state->compindex >= state->components) state->compindex = 0; } while (0)
+
+#define save_pixel_value(state, c) (state->prevpixel[0] = (predictor_pixel1b_t)(c))
+
+/* Once the codec function is done with a scanline, we set the imaginary left-pixel data to zero and reset the row
+counters, so that another input scanline can be buffered. */
+
+#define reset_row(state) state->rowend = 0
+
+#define reset_png_row(state) (memcpy(state->rowup, state->rowsave, state->rowsize), state->predictorbyte = 0, reset_row(state))
+
+#define reset_tiff_row(state) \
+ memset(state->prevcomp, 0, state->pixbufsize), \
+ state->bitsin = state->bitsout = 0, \
+ state->compin = state->compout = 0, \
+ reset_row(state), \
+ state->sampleindex = state->compindex = 0
+
+/* PNG Paeth predictor function; http://www.libpng.org/pub/png/book/chapter09.html
+Compute the base value p := left + up - upleft, then choose the byte closest to the base value
+(the one with the smallest absolute difference). The left byte takes precedence on ties. */
+
+static int paeth (predictor_state *state)
+{
+ int p, p1, p2, p3;
+ p = left_pixel_byte(state) + up_pixel_byte(state) - upleft_pixel_byte(state);
+ p1 = p >= left_pixel_byte(state) ? (p - left_pixel_byte(state)) : (left_pixel_byte(state) - p);
+ p2 = p >= up_pixel_byte(state) ? (p - up_pixel_byte(state)) : (up_pixel_byte(state) - p);
+ p3 = p >= upleft_pixel_byte(state) ? (p - upleft_pixel_byte(state)) : (upleft_pixel_byte(state) - p);
+ return (p1 <= p2 && p1 <= p3) ? left_pixel_byte(state) : (p2 <= p3 ? up_pixel_byte(state) : upleft_pixel_byte(state));
+}
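+
+/*
+A worked example of the choice made by paeth(): with left = 100, up = 50, upleft = 25 the base value is
+p = 100 + 50 - 25 = 125; the distances are |125 - 100| = 25, |125 - 50| = 75 and |125 - 25| = 100,
+so the left byte (100) wins.
+*/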
+
+/* predictor decoder */
+
+iof_status predictor_decode_state (iof *I, iof *O, predictor_state *state)
+{
+ int status, c, d, outbytes;
+ while (state->status == STATUS_CONTINUE)
+ {
+ if (state->default_predictor >= 10) // PNG predictor?
+ {
+ read_row(state, I, state->rowsize + 1, status);
+ if (state->predictorbyte == 0)
+ { // we could actually check state->rowin != state->buffer, but we need this flag for the encoder anyway
+ state->current_predictor = row_byte(state) + 10;
+ state->predictorbyte = 1;
+ ++state->rowin;
+ }
+ }
+ else
+ {
+ read_row(state, I, state->rowsize, status);
+ }
+ switch (state->current_predictor)
+ {
+ case NONE_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ iof_set(O, c);
+ }
+ reset_row(state);
+ break;
+ case TIFF_PREDICTOR:
+ switch (state->compbits)
+ {
+ case 1:
+ outbytes = (state->components + 7) >> 3;
+ for ( ; state->sampleindex < state->rowsamples; ++state->sampleindex)
+ {
+ ensure_output_bytes(O, outbytes);
+ while (state->bitsin < state->components)
+ {
+ state->compin = (state->compin << 8) | row_byte(state);
+ state->bitsin += 8;
+ ++state->rowindex;
+ }
+ state->bitsin -= state->components;
+ d = state->compin >> state->bitsin;
+ state->compin &= (1 << state->bitsin) - 1;
+ c = d ^ left_pixel_value(state);
+ save_pixel_value(state, c);
+ state->compout = (state->compout << state->components) | c;
+ state->bitsout += state->components;
+ while (state->bitsout >= 8)
+ {
+ state->bitsout -= 8;
+ iof_set(O, state->compout >> state->bitsout);
+ state->compout &= (1 << state->bitsout) - 1;
+ }
+ }
+ if (state->bitsout > 0)
+ {
+ ensure_output_bytes(O, 1);
+ iof_set(O, state->compout << (8 - state->bitsout)); // flush leftover output bits, zero-padded on the right
+ }
+ break;
+ case 2: case 4:
+ for ( ; state->sampleindex < state->rowsamples; ++state->sampleindex)
+ {
+ for ( ; state->compindex < state->components; ) // state->compindex is ++ed by save_pixel_component()
+ {
+ ensure_output_bytes(O, 1);
+ if (state->bitsin < state->compbits)
+ {
+ state->compin = (state->compin << 8) | row_byte(state);
+ state->bitsin += 8;
+ ++state->rowindex;
+ }
+ state->bitsin -= state->compbits;
+ d = state->compin >> state->bitsin;
+ state->compin &= (1 << state->bitsin) - 1;
+ c = (d + left_pixel_component(state)) & 0xff;
+ save_pixel_component(state, c);
+ state->compout = (state->compout << state->compbits) | c;
+ state->bitsout += state->compbits;
+ if (state->bitsout >= 8)
+ {
+ state->bitsout -= 8;
+ iof_set(O, state->compout >> state->bitsout);
+ state->compout &= (1 << state->bitsout) - 1;
+ }
+ }
+ }
+ if (state->bitsout > 0)
+ {
+ ensure_output_bytes(O, 1);
+ iof_set(O, state->compout << (8 - state->bitsout)); // flush leftover output bits, zero-padded on the right
+ }
+ break;
+ case 8:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = (row_byte(state) + left_pixel_component(state)) & 0xff;
+ save_pixel_component(state, c);
+ iof_set(O, c);
+ }
+ break;
+ case 16:
+ for ( ; state->rowindex < state->rowsize - 1; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 2);
+ d = row_byte(state) << 8;
+ ++state->rowindex;
+ d |= row_byte(state);
+ c = (d + left_pixel_component(state)) & 0xffff;
+ save_pixel_component(state, c);
+ iof_set2(O, c >> 8, c & 0xff);
+ }
+ break;
+ default:
+ return IOFERR;
+ }
+ reset_tiff_row(state);
+ break;
+ case PNG_NONE_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ save_pixel_byte(state, c); // next row may need it
+ iof_set(O, c);
+ }
+ reset_png_row(state);
+ break;
+ case PNG_SUB_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = (row_byte(state) + left_pixel_byte(state)) & 0xff;
+ save_pixel_byte(state, c);
+ iof_set(O, c);
+ }
+ reset_png_row(state);
+ break;
+ case PNG_UP_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = (row_byte(state) + up_pixel_byte(state)) & 0xff;
+ save_pixel_byte(state, c);
+ iof_set(O, c);
+ }
+ reset_png_row(state);
+ break;
+ case PNG_AVERAGE_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = (row_byte(state) + ((up_pixel_byte(state) + left_pixel_byte(state)) / 2)) & 0xff;
+ save_pixel_byte(state, c);
+ iof_set(O, c);
+ }
+ reset_png_row(state);
+ break;
+ case PNG_PAETH_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = (row_byte(state) + paeth(state)) & 0xff;
+ save_pixel_byte(state, c);
+ iof_set(O, c);
+ }
+ reset_png_row(state);
+ break;
+ //case PNG_OPTIMUM_PREDICTOR: // valid as default_predictor, but not as an algorithm identifier byte
+ default:
+ return IOFERR;
+ }
+ }
+ return state->status == STATUS_LAST ? IOFERR : IOFEOF;
+}
+
+/* predictor encoder */
+
+iof_status predictor_encode_state (iof *I, iof *O, predictor_state *state)
+{
+ int status, c, d, outbytes;
+ while (state->status == STATUS_CONTINUE)
+ {
+ read_row(state, I, state->rowsize, status);
+ if (state->current_predictor >= 10 && state->predictorbyte == 0)
+ {
+ ensure_output_bytes(O, 1);
+ iof_set(O, state->current_predictor - 10);
+ state->predictorbyte = 1;
+ }
+ switch (state->current_predictor)
+ {
+ case NONE_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ iof_set(O, c);
+ }
+ reset_row(state);
+ break;
+ case TIFF_PREDICTOR:
+ switch (state->compbits)
+ {
+ case 1:
+ outbytes = (state->components + 7) >> 3;
+ for ( ; state->sampleindex < state->rowsamples; ++state->sampleindex)
+ {
+ ensure_output_bytes(O, outbytes);
+ while (state->bitsin < state->components)
+ {
+ state->compin = (state->compin << 8) | row_byte(state);
+ state->bitsin += 8;
+ ++state->rowindex;
+ }
+ state->bitsin -= state->components;
+ c = state->compin >> state->bitsin;
+ state->compin &= (1 << state->bitsin) - 1;
+ d = c ^ left_pixel_value(state);
+ save_pixel_value(state, c);
+ state->compout = (state->compout << state->components) | d;
+ state->bitsout += state->components;
+ while (state->bitsout >= 8)
+ {
+ state->bitsout -= 8;
+ iof_set(O, state->compout >> state->bitsout);
+ state->compout &= (1 << state->bitsout) - 1;
+ }
+ }
+ if (state->bitsout > 0)
+ {
+ ensure_output_bytes(O, 1);
+ iof_set(O, state->compout << (8 - state->bitsout)); // flush leftover output bits, zero-padded on the right
+ }
+ break;
+ case 2: case 4:
+ for ( ; state->sampleindex < state->rowsamples; ++state->sampleindex)
+ {
+ for ( ; state->compindex < state->components; )
+ {
+ ensure_output_bytes(O, 1);
+ if (state->bitsin < state->compbits)
+ {
+ state->compin = (state->compin << 8) | row_byte(state);
+ state->bitsin += 8;
+ ++state->rowindex;
+ }
+ state->bitsin -= state->compbits;
+ c = state->compin >> state->bitsin;
+ state->compin &= (1 << state->bitsin) - 1;
+ d = tocomp(c - left_pixel_component(state));
+ save_pixel_component(state, c);
+ state->compout = (state->compout << state->compbits) | d;
+ state->bitsout += state->compbits;
+ if (state->bitsout >= 8)
+ {
+ state->bitsout -= 8;
+ iof_set(O, state->compout >> state->bitsout);
+ state->compout &= (1 << state->bitsout) - 1;
+ }
+ }
+ }
+ if (state->bitsout > 0)
+ {
+ ensure_output_bytes(O, 1);
+ iof_set(O, state->compout << (8 - state->bitsout)); // flush leftover output bits, zero-padded on the right
+ }
+ break;
+ case 8:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ d = tobyte(c - left_pixel_component(state));
+ save_pixel_component(state, c);
+ iof_set(O, d);
+ }
+ break;
+ case 16:
+ for ( ; state->rowindex < state->rowsize - 1; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 2);
+ c = row_byte(state) << 8;
+ ++state->rowindex;
+ c |= row_byte(state);
+ d = tocomp(c - left_pixel_component(state));
+ save_pixel_component(state, c);
+ iof_set2(O, d >> 8, d & 0xff);
+ }
+ break;
+ default:
+ return IOFERR;
+ }
+ reset_tiff_row(state);
+ break;
+ case PNG_NONE_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ save_pixel_byte(state, c); // next row may need it
+ iof_set(O, c);
+ }
+ reset_png_row(state);
+ break;
+ case PNG_SUB_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ d = tobyte(c - left_pixel_byte(state));
+ save_pixel_byte(state, c);
+ iof_set(O, d);
+ }
+ reset_png_row(state);
+ break;
+ case PNG_OPTIMUM_PREDICTOR: // not worth performing the optimization; encode as UP
+ case PNG_UP_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ d = tobyte(c - up_pixel_byte(state));
+ save_pixel_byte(state, c);
+ iof_set(O, d);
+ }
+ reset_png_row(state);
+ break;
+ case PNG_AVERAGE_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ d = tobyte(c - ((up_pixel_byte(state) + left_pixel_byte(state)) >> 1));
+ save_pixel_byte(state, c);
+ iof_set(O, d);
+ }
+ reset_png_row(state);
+ break;
+ case PNG_PAETH_PREDICTOR:
+ for ( ; state->rowindex < state->rowsize; ++state->rowindex)
+ {
+ ensure_output_bytes(O, 1);
+ c = row_byte(state);
+ d = tobyte(c - paeth(state));
+ save_pixel_byte(state, c);
+ iof_set(O, d);
+ }
+ reset_png_row(state);
+ break;
+ default:
+ return IOFERR;
+ }
+ }
+ return state->status == STATUS_LAST ? IOFERR : IOFEOF;
+}
+
+iof_status predictor_decode (iof *I, iof *O, int predictor, int rowsamples, int components, int compbits)
+{
+ predictor_state state;
+ int ret;
+ predictor_decoder_init(&state, predictor, rowsamples, components, compbits);
+ state.flush = 1;
+ ret = predictor_decode_state(I, O, &state);
+ predictor_decoder_close(&state);
+ return ret;
+}
+
+iof_status predictor_encode (iof *I, iof *O, int predictor, int rowsamples, int components, int compbits)
+{
+ predictor_state state;
+ int ret;
+ predictor_encoder_init(&state, predictor, rowsamples, components, compbits);
+ state.flush = 1;
+ ret = predictor_encode_state(I, O, &state);
+ predictor_encoder_close(&state);
+ return ret;
+}
+
+/* filters */
+
+// predictor decoder function
+
+static size_t predictor_decoder (iof *F, iof_mode mode)
+{
+ predictor_state *state;
+ iof_status status;
+ size_t tail;
+
+ state = iof_filter_state(predictor_state *, F);
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ do {
+ status = predictor_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "predictor", status);
+ case IOFCLOSE:
+ predictor_decoder_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// predictor encoder function
+
+static size_t predictor_encoder (iof *F, iof_mode mode)
+{
+ predictor_state *state;
+ iof_status status;
+
+ state = iof_filter_state(predictor_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = predictor_encode_state(F, F->next, state);
+ return iof_encoder_retval(F, "predictor", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ predictor_encoder(F, IOFFLUSH);
+ predictor_encoder_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+iof * iof_filter_predictor_decoder (iof *N, int predictor, int rowsamples, int components, int compbits)
+{
+ iof *I;
+ predictor_state_pointer P;
+ I = iof_filter_reader(predictor_decoder, sizeof(predictor_state), &P.voidstate);
+ iof_setup_next(I, N);
+ if (predictor_decoder_init(P.predictorstate, predictor, rowsamples, components, compbits) == NULL)
+ {
+ iof_discard(I);
+ return NULL;
+ }
+ P.predictorstate->flush = 1;
+ return I;
+}
+
+iof * iof_filter_predictor_encoder (iof *N, int predictor, int rowsamples, int components, int compbits)
+{
+ iof *O;
+ predictor_state_pointer P;
+ O = iof_filter_writer(predictor_encoder, sizeof(predictor_state), &P.voidstate);
+ iof_setup_next(O, N);
+ if (predictor_encoder_init(P.predictorstate, predictor, rowsamples, components, compbits) == NULL)
+ {
+ iof_discard(O);
+ return NULL;
+ }
+ return O;
+}
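+
+/*
+A minimal usage sketch (the surrounding stream names are hypothetical): given a reader N that already
+delivers flate-decoded bytes, PNG prediction can be undone with
+
+  iof *I = iof_filter_predictor_decoder(N, 15, columns, colors, bits);
+
+after which decoded bytes are pulled from I with iof_getc()/iof_read(); the state is released by the
+filter's IOFCLOSE handler.
+*/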
diff --git a/source/luametatex/source/libraries/pplib/util/utilfpred.h b/source/luametatex/source/libraries/pplib/util/utilfpred.h
new file mode 100644
index 000000000..6ae2f8935
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilfpred.h
@@ -0,0 +1,23 @@
+#ifndef UTIL_FILTER_PREDICTOR_H
+#define UTIL_FILTER_PREDICTOR_H
+
+#include "utiliof.h"
+
+typedef struct predictor_state predictor_state;
+
+predictor_state * predictor_decoder_init (predictor_state *state, int predictor, int rowsamples, int components, int compbits);
+predictor_state * predictor_encoder_init (predictor_state *state, int predictor, int rowsamples, int components, int compbits);
+
+void predictor_decoder_close (predictor_state *state);
+void predictor_encoder_close (predictor_state *state);
+
+iof_status predictor_decode_state (iof *I, iof *O, predictor_state *state);
+iof_status predictor_encode_state (iof *I, iof *O, predictor_state *state);
+
+iof_status predictor_decode (iof *I, iof *O, int predictor, int rowsamples, int components, int compbits);
+iof_status predictor_encode (iof *I, iof *O, int predictor, int rowsamples, int components, int compbits);
+
+iof * iof_filter_predictor_decoder (iof *N, int predictor, int rowsamples, int components, int compbits);
+iof * iof_filter_predictor_encoder (iof *N, int predictor, int rowsamples, int components, int compbits);
+
+#endif
\ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utiliof.c b/source/luametatex/source/libraries/pplib/util/utiliof.c
new file mode 100644
index 000000000..41d6fba38
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utiliof.c
@@ -0,0 +1,2993 @@
+/* input/output stream */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "utilmem.h"
+#include "utillog.h"
+#include "utiliof.h"
+
+/* commons */
+
+void * iof_copy_data (const void *data, size_t size)
+{
+ return memcpy(util_malloc(size), data, size);
+}
+
+uint8_t * iof_copy_file_data (const char *filename, size_t *psize)
+{
+ FILE *file;
+ size_t size;
+ uint8_t *data;
+ if ((file = fopen(filename, "rb")) == NULL)
+ return NULL;
+ fseek(file, 0, SEEK_END);
+ size = (size_t)ftell(file);
+ data = (uint8_t *)util_malloc(size);
+ fseek(file, 0, SEEK_SET);
+ if ((*psize = fread(data, 1, size, file)) != size)
+ {
+ util_free(data);
+ data = NULL;
+ }
+ fclose(file);
+ return data;
+}
+
+uint8_t * iof_copy_file_handle_data (FILE *file, size_t *psize)
+{
+ size_t size;
+ uint8_t *data;
+ //long offset = ftell(file); // keep offset intact?
+ fseek(file, 0, SEEK_END);
+ size = (size_t)ftell(file);
+ data = (uint8_t *)util_malloc(size);
+ fseek(file, 0, SEEK_SET);
+ if ((*psize = fread(data, 1, size, file)) != size)
+ {
+ util_free(data);
+ data = NULL;
+ }
+ //fseek(file, offset, SEEK_SET)
+ return data;
+}
+
+FILE * iof_get_file (iof *F)
+{
+ if (F->flags & IOF_FILE)
+ return iof_file_get_file(F->iofile);
+ if (F->flags & IOF_FILE_HANDLE)
+ return F->file;
+ return NULL;
+}
+
+const char * iof_status_kind (iof_status status)
+{
+ switch (status)
+ {
+ case IOFEOF:
+ return "IOFEOF";
+ case IOFERR:
+ return "IOFERR";
+ case IOFEMPTY:
+ return "IOFEMPTY";
+ case IOFFULL:
+ return "IOFFULL";
+ default:
+ break;
+ }
+ return "(unknown)";
+}
+
+/* shared pseudofile */
+
+#define IOF_FILE_DEFAULTS 0
+
+iof_file * iof_file_new (FILE *file)
+{
+ iof_file *iofile = (iof_file *)util_malloc(sizeof(iof_file));
+ iof_file_set_fh(iofile, file);
+ iofile->offset = NULL;
+ iofile->size = 0;
+ iofile->name = NULL;
+ iofile->refcount = 0;
+ iofile->flags = IOF_FILE_DEFAULTS|IOF_ALLOC;
+ return iofile;
+}
+
+iof_file * iof_file_init (iof_file *iofile, FILE *file)
+{
+ iof_file_set_fh(iofile, file);
+ iofile->offset = NULL;
+ iofile->size = 0;
+ iofile->name = NULL;
+ iofile->refcount = 0;
+ iofile->flags = IOF_FILE_DEFAULTS;
+ return iofile;
+}
+
+iof_file * iof_file_rdata (const void *data, size_t size)
+{
+ iof_file *iofile = (iof_file *)util_malloc(sizeof(iof_file));
+ iofile->rbuf = iofile->rpos = (const uint8_t *)data;
+ iofile->rend = iofile->rbuf + size;
+ iofile->offset = NULL;
+ iofile->size = 0;
+ iofile->name = NULL;
+ iofile->refcount = 0;
+ iofile->flags = IOF_FILE_DEFAULTS|IOF_ALLOC|IOF_DATA;
+ return iofile;
+}
+
+iof_file * iof_file_rdata_init (iof_file *iofile, const void *data, size_t size)
+{
+ iofile->rbuf = iofile->rpos = (const uint8_t *)data;
+ iofile->rend = iofile->rbuf + size;
+ iofile->offset = NULL;
+ iofile->size = 0; // let's keep it consistently set to zero (it is only for the user's disposal)
+ iofile->name = NULL;
+ iofile->refcount = 0;
+ iofile->flags = IOF_FILE_DEFAULTS|IOF_DATA;
+ return iofile;
+}
+
+iof_file * iof_file_wdata (void *data, size_t size)
+{
+ return iof_file_rdata((const void *)data, size);
+}
+
+iof_file * iof_file_wdata_init (iof_file *iofile, void *data, size_t size)
+{
+ return iof_file_rdata_init(iofile, (const void *)data, size);
+}
+
+/* typical uses so far */
+
+iof_file * iof_file_reader_from_file_handle (iof_file *iofile, const char *filename, FILE *file, int preload, int closefile)
+{
+ uint8_t *data;
+ size_t size;
+
+ if (preload)
+ {
+ if ((data = iof_copy_file_handle_data(file, &size)) == NULL)
+ {
+ if (closefile) // callers expect the file to be closed even on failure
+ fclose(file);
+ return NULL;
+ }
+ if (iofile == NULL)
+ iofile = iof_file_rdata(data, size);
+ else
+ iof_file_rdata_init(iofile, data, size);
+ iofile->flags |= IOF_BUFFER_ALLOC;
+ if (closefile)
+ fclose(file);
+ }
+ else
+ {
+ if (iofile == NULL)
+ iofile = iof_file_new(file);
+ else
+ iof_file_init(iofile, file);
+ if (closefile)
+ iofile->flags |= IOF_CLOSE_FILE;
+ }
+ if (filename != NULL)
+ iof_file_set_name(iofile, filename);
+ return iofile;
+}
+
+iof_file * iof_file_reader_from_file (iof_file *iofile, const char *filename, int preload)
+{
+ FILE *file;
+ if ((file = fopen(filename, "rb")) == NULL)
+ return NULL;
+ return iof_file_reader_from_file_handle(iofile, filename, file, preload, 1); // takes care to fclose() on failure
+}
+
+iof_file * iof_file_reader_from_data (iof_file *iofile, const void *data, size_t size, int preload, int freedata)
+{
+ void *newdata;
+ if (data == NULL)
+ return NULL;
+ if (preload)
+ {
+ newdata = iof_copy_data(data, size);
+ if (iofile == NULL)
+ iofile = iof_file_rdata(newdata, size);
+ else
+ iof_file_rdata_init(iofile, newdata, size);
+ iofile->flags |= IOF_BUFFER_ALLOC;
+ //if (freedata) // hardly makes sense... we can't free const void *
+ // util_free((void *)data);
+ }
+ else
+ {
+ if (iofile == NULL)
+ iofile = iof_file_rdata(data, size);
+ else
+ iof_file_rdata_init(iofile, data, size);
+ if (freedata)
+ iofile->flags |= IOF_BUFFER_ALLOC;
+ }
+ return iofile;
+}
+
+/*
+iof_file * iof_file_writer_from_file (iof_file *iofile, const char *filename)
+{
+ FILE *file;
+ if ((file = fopen(filename, "wb")) == NULL)
+ return NULL;
+ if (iofile == NULL)
+ iofile = iof_file_new(file);
+ else
+ iof_file_init(iofile, file);
+ iofile->flags |= IOF_CLOSE_FILE;
+ iof_file_set_name(iofile, filename);
+ return iofile;
+}
+*/
+
+/*
+Because of the limited number of FILE* handles available, we may need to close/reopen a file handle
+when accessing it. In the applications so far (fonts, images) we typically need the entire source to
+parse the file on object creation, and to rewrite or reload the data on dump. All iof_file API
+functions assume that the iofile has its FILE* opened. Reopening it on every access (ftell, fseek, read/write)
+makes no sense. So if the caller invalidates the iofile by closing and NULLing its file handle,
+it is also responsible for reopening it when necessary.
+*/
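+
+/*
+A sketch of that caller-side discipline (hypothetical call site):
+
+  iof_file_reclose_input(iofile);      // release the FILE* when handles run short
+  ...
+  if (iof_file_reopen_input(iofile))   // reopen by the stored name before the next access
+    iof_file_read(buffer, 1, bytes, iofile);
+*/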
+
+int iof_file_reclose_input (iof_file *iofile)
+{
+ FILE *file;
+ if (iofile->flags & IOF_DATA)
+ return 0;
+ if ((file = iof_file_get_fh(iofile)) == NULL)
+ return 0;
+ fclose(file);
+ iof_file_set_fh(iofile, NULL);
+ iofile->flags &= ~IOF_RECLOSE_FILE;
+ iofile->flags |= IOF_REOPEN_FILE;
+ return 1;
+}
+
+int iof_file_reopen_input (iof_file *iofile)
+{ // returns true if iofile is readable
+ FILE *file;
+ const char *filename;
+ if (iofile->flags & IOF_DATA)
+ return 1;
+ if ((file = iof_file_get_fh(iofile)) != NULL)
+ return 1; // if present, assumed readable
+ if ((filename = iofile->name) == NULL || (file = fopen(filename, "rb")) == NULL)
+ return 0;
+ iof_file_set_fh(iofile, file);
+ iofile->flags &= ~IOF_REOPEN_FILE;
+ iofile->flags |= IOF_RECLOSE_FILE;
+ return 1;
+}
+
+/* freeing iof_file */
+
+void iof_file_free (iof_file *iofile)
+{
+ FILE *file;
+ if (iofile->flags & IOF_DATA)
+ {
+ if (iofile->flags & IOF_BUFFER_ALLOC)
+ {
+ iofile->flags &= ~IOF_BUFFER_ALLOC;
+ if (iofile->buf != NULL)
+ {
+ util_free(iofile->buf);
+ iofile->buf = iofile->pos = iofile->end = NULL;
+ }
+ }
+ }
+ else if ((file = iof_file_get_fh(iofile)) != NULL)
+ {
+ if (iofile->flags & IOF_CLOSE_FILE)
+ fclose(file);
+ iof_file_set_fh(iofile, NULL);
+ }
+ iof_file_set_name(iofile, NULL);
+ if (iofile->flags & IOF_ALLOC)
+ util_free(iofile);
+}
+
+/*
+An attempt to close the iofile input while keeping things safe. In bindings we sometimes need to force
+closing the file handle; otherwise it is closed only when the garbage collector graciously calls the destructor.
+E.g. we are done with an object representing a pdf/image/font, but we can't move/replace the underlying file,
+as the host language still keeps the garbage that holds the file handle. When we call fclose(), we also have to
+set the handle to NULL. In many places we assume that if the iofile wraps a FILE *, then the handle
+is operable (no NULL checks). To close the handle while keeping the iofile alive and safe, we silently convert
+it to a dummy IOF_DATA buffer.
+*/
+
+void iof_file_close_input (iof_file *iofile)
+{
+ FILE *file;
+ if (iofile->flags & IOF_DATA)
+ {
+ if (iofile->flags & IOF_BUFFER_ALLOC)
+ {
+ iofile->flags &= ~IOF_BUFFER_ALLOC;
+ if (iofile->buf != NULL)
+ {
+ util_free(iofile->buf);
+ //iofile->buf = iofile->pos = iofile->end = NULL;
+ }
+ }
+ }
+ else if ((file = iof_file_get_fh(iofile)) != NULL)
+ {
+ iof_file_set_fh(iofile, NULL);
+ fclose(file);
+ }
+ iof_file_set_name(iofile, NULL);
+ /* now make it a dummy string iofile */
+ iofile->buf = iofile->pos = iofile->end = NULL;
+ iofile->flags |= IOF_DATA;
+}
+
+/* set filename for reopen */
+
+void iof_file_set_name (iof_file *iofile, const char *name)
+{
+ if (iofile->name != NULL)
+ util_free(iofile->name);
+ if (name != NULL)
+ iofile->name = iof_copy_data(name, strlen(name) + 1);
+ else
+ iofile->name = NULL;
+}
+
+/* seek */
+
+int iof_file_seek (iof_file *iofile, long offset, int whence)
+{
+ if (iofile->flags & IOF_DATA)
+ {
+ switch (whence)
+ {
+ case SEEK_SET:
+ if (offset >= 0 && iofile->buf + offset <= iofile->end)
+ {
+ iofile->pos = iofile->buf + offset;
+ return 0;
+ }
+ return -1;
+ case SEEK_CUR:
+ if ((offset >= 0 && iofile->pos + offset <= iofile->end) || (offset < 0 && iofile->pos + offset >= iofile->buf))
+ {
+ iofile->pos += offset;
+ return 0;
+ }
+ return -1;
+ case SEEK_END:
+ if (offset <= 0 && iofile->end + offset >= iofile->buf)
+ {
+ iofile->pos = iofile->end + offset;
+ return 0;
+ }
+ return -1;
+ }
+ return -1;
+ }
+ return fseek(iof_file_get_fh(iofile), offset, whence);
+}
+
+/* */
+
+long iof_file_tell (iof_file *iofile)
+{
+ return (iofile->flags & IOF_DATA) ? (long)(iofile->pos - iofile->buf) : ftell(iof_file_get_fh(iofile));
+}
+
+size_t iof_file_size (iof_file *iofile)
+{
+ long pos, size;
+ FILE *file;
+ if (iofile->flags & IOF_DATA)
+ return (size_t)iof_space(iofile);
+ file = iof_file_get_fh(iofile);
+ pos = ftell(file);
+ fseek(file, 0, SEEK_END);
+ size = ftell(file);
+ fseek(file, pos, SEEK_SET);
+ return size;
+}
+
+int iof_file_eof (iof_file *iofile)
+{
+ if (iofile->flags & IOF_DATA)
+ return iofile->pos == iofile->end ? -1 : 0;
+ return feof(iof_file_get_fh(iofile));
+}
+
+int iof_file_flush (iof_file *iofile)
+{
+ if (iofile->flags & IOF_DATA)
+ return 0;
+ return fflush(iof_file_get_fh(iofile));
+}
+
+size_t iof_file_read (void *ptr, size_t size, size_t items, iof_file *iofile)
+{
+ if (iofile->flags & IOF_DATA)
+ {
+ size_t bytes = size * items;
+ if (bytes > (size_t)iof_left(iofile))
+ bytes = (size_t)iof_left(iofile);
+ memcpy(ptr, iofile->pos, bytes);
+ iofile->pos += bytes;
+ return bytes / size; // number of elements read
+ }
+ return fread(ptr, size, items, iof_file_get_fh(iofile));
+}
+
+static size_t iof_file_data_resizeto (iof_file *iofile, size_t space)
+{
+ uint8_t *newbuf;
+ size_t size;
+ size = iof_size(iofile);
+ if (iofile->flags & IOF_BUFFER_ALLOC)
+ {
+ newbuf = (uint8_t *)util_realloc(iofile->buf, space);
+ }
+ else
+ {
+ newbuf = (uint8_t *)util_malloc(space);
+ if (size > 0)
+ memcpy(newbuf, iofile->buf, size);
+ iofile->flags |= IOF_BUFFER_ALLOC;
+ }
+ iofile->buf = newbuf;
+ iofile->pos = newbuf + size;
+ iofile->end = newbuf + space;
+ return space - size;
+}
+
+#define iof_file_data_resize(iofile) iof_file_data_resizeto(iofile, iof_space(iofile) << 1)
+
+size_t iof_file_write (const void *ptr, size_t size, size_t items, iof_file *iofile)
+{
+ if (iofile->flags & IOF_DATA)
+ {
+ size_t space, sizesofar, bytes;
+ bytes = size * items;
+ if (bytes > (size_t)iof_left(iofile))
+ {
+ if ((space = iof_space(iofile)) == 0) // allow iofile->buf/end initially NULL
+ space = BUFSIZ;
+ for (sizesofar = iof_size(iofile), space <<= 1; sizesofar + bytes > space; space <<= 1)
+ ;
+ if (iof_file_data_resizeto(iofile, space) == 0)
+ return 0;
+ }
+ memcpy(iofile->pos, ptr, bytes);
+ iofile->pos += bytes;
+ return bytes / size;
+ }
+ return fwrite(ptr, size, items, iof_file_get_fh(iofile));
+}
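+
+/*
+A growth example for the doubling loop above: with space = 512, size-so-far = 512 and a 10000-byte
+write, space doubles 1024 -> 2048 -> 4096 -> 8192 -> 16384, the first value that holds 512 + 10000
+bytes, and the buffer is resized once to that size.
+*/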
+
+size_t iof_file_ensure (iof_file *iofile, size_t bytes)
+{
+ if (iofile->flags & IOF_DATA)
+ {
+ size_t space, sizesofar, left;
+ left = (size_t)iof_left(iofile);
+ if (bytes > left)
+ {
+ if ((space = iof_space(iofile)) == 0) // allow iofile->buf/end initially NULL
+ space = BUFSIZ;
+ for (sizesofar = iof_size(iofile), space <<= 1; sizesofar + bytes > space; space <<= 1);
+ return iof_file_data_resizeto(iofile, space);
+ }
+ return left;
+ }
+ return 0;
+}
+
+int iof_file_getc (iof_file *iofile)
+{
+ if (iofile->flags & IOF_DATA)
+ return iofile->pos < iofile->end ? *iofile->pos++ : IOFEOF;
+ return fgetc(iof_file_get_fh(iofile));
+}
+
+int iof_file_putc (iof_file *iofile, int c)
+{
+ if (iofile->flags & IOF_DATA)
+ {
+ if (iofile->pos >= iofile->end)
+ if (iof_file_data_resize(iofile) == 0)
+ return IOFEOF;
+ *iofile->pos++ = (uint8_t)c;
+ return c;
+ }
+ return fputc(c, iof_file_get_fh(iofile));
+}
+
+static int iof_file_sync (iof_file *iofile, size_t *offset)
+{
+ if (iofile->offset != offset)
+ {
+ if (iofile->offset != NULL)
+ *iofile->offset = iof_file_tell(iofile);
+ iofile->offset = offset;
+ if (offset) // offset is allowed to be NULL
+ return iof_file_seek(iofile, (long)*offset, SEEK_SET);
+ }
+ return 0;
+}
+
+//#define iof_file_unsync(iofile, poffset) (void)((iofile)->offset == poffset && (((iofile)->offset = NULL), 0))
+#define iof_file_unsync(iofile, poffset) ((void)poffset, (iofile)->offset = NULL)
+
+/* iof seek */
+
+#define iof_reader_reset(I) ((I)->pos = (I)->end = (I)->buf)
+#define iof_reader_reseek_file(I, offset, whence) (fseek((I)->file, offset, whence) == 0 ? (iof_reader_reset(I), 0) : -1)
+#define iof_reader_reseek_iofile(I, offset, whence) (iof_file_seek((I)->iofile, offset, whence) == 0 ? (iof_reader_reset(I), 0) : -1)
+
+#define iof_writer_reset(O) ((O)->pos = (O)->buf)
+#define iof_writer_reseek_file(O, offset, whence) (iof_flush(O), (fseek((O)->file, offset, whence) == 0 ? (iof_writer_reset(O), 0) : -1))
+#define iof_writer_reseek_iofile(O, offset, whence) (iof_flush(O), (iof_file_seek((O)->iofile, offset, whence) == 0 ? (iof_writer_reset(O), 0) : -1))
+
+static int iof_reader_seek_data (iof *I, long offset, int whence)
+{
+ switch (whence)
+ {
+ case SEEK_SET:
+ if (offset >= 0 && I->buf + offset <= I->end)
+ {
+ I->pos = I->buf + offset;
+ return 0;
+ }
+ return -1;
+ case SEEK_CUR:
+ if ((offset >= 0 && I->pos + offset <= I->end) || (offset < 0 && I->pos + offset >= I->buf))
+ {
+ I->pos += offset;
+ return 0;
+ }
+ return -1;
+ case SEEK_END:
+ if (offset <= 0 && I->end + offset >= I->buf)
+ {
+ I->pos = I->end + offset;
+ return 0;
+ }
+ return -1;
+ }
+ return -1;
+}
+
+static int iof_reader_seek_iofile (iof *I, long offset, int whence)
+{
+ long fileoffset;
+ switch (whence)
+ {
+ case SEEK_SET:
+ fileoffset = iof_file_tell(I->iofile);
+ if (offset <= fileoffset && offset >= fileoffset - iof_space(I))
+ {
+ I->pos = I->end - (fileoffset - offset);
+ return 0;
+ }
+ return iof_reader_reseek_iofile(I, offset, SEEK_SET);
+ case SEEK_CUR:
+ if ((offset >= 0 && I->pos + offset <= I->end) || (offset < 0 && I->pos + offset >= I->buf))
+ {
+ I->pos += offset;
+ return 0;
+ }
+ return iof_reader_reseek_iofile(I, offset, SEEK_CUR);
+ case SEEK_END:
+ return iof_reader_reseek_iofile(I, offset, SEEK_END); // can we do better?
+ }
+ return -1;
+}
+
+static int iof_reader_seek_file (iof *I, long offset, int whence)
+{
+ long fileoffset;
+ switch (whence)
+ {
+ case SEEK_SET:
+ fileoffset = ftell(I->file);
+ if (offset <= fileoffset && offset >= fileoffset - iof_space(I))
+ {
+ I->pos = I->end - (fileoffset - offset);
+ return 0;
+ }
+ return iof_reader_reseek_file(I, offset, SEEK_SET);
+ case SEEK_CUR:
+ if ((offset >= 0 && I->pos + offset <= I->end) || (offset < 0 && I->pos + offset >= I->buf))
+ {
+ I->pos += offset;
+ return 0;
+ }
+ return iof_reader_reseek_file(I, offset, SEEK_CUR);
+ case SEEK_END:
+ return iof_reader_reseek_file(I, offset, SEEK_END); // can we do better?
+ }
+ return -1;
+}
+
+int iof_reader_seek (iof *I, long offset, int whence)
+{
+ I->flags &= ~IOF_STOPPED;
+ if (I->flags & IOF_FILE)
+ return iof_reader_seek_iofile(I, offset, whence);
+ if (I->flags & IOF_FILE_HANDLE)
+ return iof_reader_seek_file(I, offset, whence);
+ if (I->flags & IOF_DATA)
+ return iof_reader_seek_data(I, offset, whence);
+ return -1;
+}
+
+int iof_reader_reseek (iof *I, long offset, int whence)
+{
+ I->flags &= ~IOF_STOPPED;
+ if (I->flags & IOF_FILE)
+ return iof_reader_reseek_iofile(I, offset, whence);
+ if (I->flags & IOF_FILE_HANDLE)
+ return iof_reader_reseek_file(I, offset, whence);
+ if (I->flags & IOF_DATA)
+ return iof_reader_seek_data(I, offset, whence);
+ return -1;
+}
+
+static int iof_writer_seek_data (iof *O, long offset, int whence)
+{
+ /*
+ fseek() allows seeking past the end of a file. Seeking alone does not enlarge the output file;
+ no byte is written before fwrite(), which then seems to fill the gap with zeros. Until we really
+ need that, there is no seeking out of bounds for writers.
+ */
+ O->flags &= ~IOF_STOPPED;
+ return iof_reader_seek_data(O, offset, whence);
+}
+
+static int iof_writer_seek_iofile (iof *O, long offset, int whence)
+{
+ long fileoffset;
+ switch (whence)
+ {
+ case SEEK_SET:
+ fileoffset = iof_file_tell(O->iofile);
+ if (offset >= fileoffset && offset <= fileoffset + iof_space(O))
+ {
+ O->pos = O->buf + (offset - fileoffset);
+ return 0;
+ }
+ return iof_writer_reseek_iofile(O, offset, SEEK_SET);
+ case SEEK_CUR:
+ if ((offset >=0 && O->pos + offset <= O->end) || (offset < 0 && O->pos + offset >= O->buf))
+ {
+ O->pos += offset;
+ return 0;
+ }
+ return iof_writer_reseek_iofile(O, offset, SEEK_CUR);
+ case SEEK_END:
+ return iof_writer_reseek_iofile(O, offset, SEEK_END);
+ }
+ return -1;
+}
+
+static int iof_writer_seek_file (iof *O, long offset, int whence)
+{
+ long fileoffset;
+ switch (whence)
+ {
+ case SEEK_SET:
+ fileoffset = ftell(O->file);
+ if (offset >= fileoffset && offset <= fileoffset + iof_space(O))
+ {
+ O->pos = O->buf + (offset - fileoffset);
+ return 0;
+ }
+ return iof_writer_reseek_file(O, offset, SEEK_SET);
+ case SEEK_CUR:
+ if ((offset >=0 && O->pos + offset <= O->end) || (offset < 0 && O->pos + offset >= O->buf))
+ {
+ O->pos += offset;
+ return 0;
+ }
+ return iof_writer_reseek_file(O, offset, SEEK_CUR);
+ case SEEK_END:
+ return iof_writer_reseek_file(O, offset, SEEK_END);
+ }
+ return -1;
+}
+
+int iof_writer_seek (iof *I, long offset, int whence)
+{
+ I->flags &= ~IOF_STOPPED;
+ if (I->flags & IOF_FILE)
+ return iof_writer_seek_iofile(I, offset, whence);
+ if (I->flags & IOF_FILE_HANDLE)
+ return iof_writer_seek_file(I, offset, whence);
+ if (I->flags & IOF_DATA)
+ return iof_writer_seek_data(I, offset, whence);
+ return -1;
+}
+
+int iof_writer_reseek (iof *I, long offset, int whence)
+{
+ I->flags &= ~IOF_STOPPED;
+ if (I->flags & IOF_FILE)
+ return iof_writer_reseek_iofile(I, offset, whence);
+ if (I->flags & IOF_FILE_HANDLE)
+ return iof_writer_reseek_file(I, offset, whence);
+ if (I->flags & IOF_DATA)
+ return iof_writer_seek_data(I, offset, whence);
+ return -1;
+}
+
+int iof_seek (iof *F, long offset, int whence)
+{
+ return (F->flags & IOF_WRITER) ? iof_writer_seek(F, offset, whence) : iof_reader_seek(F, offset, whence);
+}
+
+int iof_reseek (iof *F, long offset, int whence)
+{
+ return (F->flags & IOF_WRITER) ? iof_writer_reseek(F, offset, whence) : iof_reader_reseek(F, offset, whence);
+}
+
+/* tell */
+
+long iof_reader_tell (iof *I)
+{
+ if (I->flags & IOF_FILE)
+ return iof_file_tell(I->iofile) - (long)iof_left(I);
+ if (I->flags & IOF_FILE_HANDLE)
+ return ftell(I->file) - (long)iof_left(I);
+ //if (I->flags & IOF_DATA)
+ return (long)iof_size(I);
+}
+
+long iof_writer_tell (iof *O)
+{
+ if (O->flags & IOF_FILE)
+ return iof_file_tell(O->iofile) + (long)iof_size(O);
+ if (O->flags & IOF_FILE_HANDLE)
+ return ftell(O->file) + (long)iof_size(O);
+ //if (I->flags & IOF_DATA)
+ return (long)iof_size(O);
+}
+
+long iof_tell (iof *I)
+{
+ return (I->flags & IOF_WRITER) ? iof_writer_tell(I) : iof_reader_tell(I);
+}
+
+size_t iof_fsize (iof *I)
+{
+ size_t pos, size;
+ if (I->flags & IOF_FILE)
+ return iof_file_size(I->iofile);
+ if (I->flags & IOF_FILE_HANDLE)
+ {
+ pos = (size_t)ftell(I->file);
+ fseek(I->file, 0, SEEK_END);
+ size = (size_t)ftell(I->file);
+ fseek(I->file, (long)pos, SEEK_SET);
+ return size;
+ }
+ //if (I->flags & IOF_DATA)
+ return (size_t)iof_space(I);
+}
+
+/* save reader tail */
+
+size_t iof_save_tail (iof *I)
+{
+ size_t size, left;
+ size = iof_size(I);
+ left = iof_left(I);
+ if (size >= left)
+ memcpy(I->buf, I->pos, left);
+ else
+ memmove(I->buf, I->pos, left);
+ return left;
+}
+
+size_t iof_input_save_tail (iof *I, size_t back)
+{
+ size_t size;
+ I->flags |= IOF_TAIL;
+ I->pos -= back;
+ size = iof_input(I);
+ I->pos += back;
+ I->flags &= ~IOF_TAIL;
+ return size; // + back - back
+}
+
+/* read from file */
+
+/* iof free */
+
+static size_t file_read (iof *I);
+static size_t file_load (iof *I);
+
+static size_t file_reader (iof *I, iof_mode mode)
+{
+ switch (mode)
+ {
+ case IOFREAD:
+ return file_read(I);
+ case IOFLOAD:
+ return file_load(I);
+ case IOFCLOSE:
+ iof_free(I);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+iof * iof_setup_file_handle_reader (iof *I, void *buffer, size_t space, FILE *f)
+{
+ iof_setup_reader(I, buffer, space);
+ iof_setup_file(I, f);
+ I->more = file_reader;
+ return I;
+}
+
+iof * iof_setup_file_reader (iof *I, void *buffer, size_t space, const char *filename)
+{
+ FILE *f;
+ if ((f = fopen(filename, "rb")) == NULL)
+ return NULL;
+ iof_setup_reader(I, buffer, space);
+ iof_setup_file(I, f);
+ I->flags |= IOF_CLOSE_FILE;
+ I->more = file_reader;
+ return I;
+}
+
+/* write to file */
+
+static size_t file_write (iof *O, int flush);
+
+static size_t file_writer (iof *O, iof_mode mode)
+{
+ switch (mode)
+ {
+ case IOFWRITE:
+ return file_write(O, 0);
+ case IOFFLUSH:
+ return file_write(O, 1);
+ case IOFCLOSE:
+ file_write(O, 1);
+ iof_free(O);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+iof * iof_setup_file_handle_writer (iof *O, void *buffer, size_t space, FILE *f)
+{
+ iof_setup_writer(O, buffer, space);
+ iof_setup_file(O, f);
+ O->more = file_writer;
+ return O;
+}
+
+iof * iof_setup_file_writer (iof *O, void *buffer, size_t space, const char *filename)
+{
+ FILE *f;
+ if ((f = fopen(filename, "wb")) == NULL)
+ return NULL;
+ iof_setup_writer(O, buffer, space);
+ iof_setup_file(O, f);
+ O->flags |= IOF_CLOSE_FILE;
+ O->more = file_writer;
+ return O;
+}
+
+/* a dedicated handler for stdout/stderr */
+
+static size_t stdout_writer (iof *O, iof_mode mode)
+{
+ switch(mode)
+ {
+ case IOFWRITE:
+ {
+ fwrite(O->buf, sizeof(uint8_t), iof_size(O), stdout);
+ O->pos = O->buf;
+ return O->space;
+ }
+ case IOFCLOSE:
+ case IOFFLUSH:
+ {
+ fwrite(O->buf, sizeof(uint8_t), iof_size(O), stdout);
+ fflush(stdout);
+ O->pos = O->buf;
+ return 0;
+ }
+ default:
+ break;
+ }
+ return 0;
+}
+
+static size_t stderr_writer (iof *O, iof_mode mode)
+{
+ switch(mode)
+ {
+ case IOFWRITE:
+ {
+ fwrite(O->buf, sizeof(uint8_t), iof_size(O), stderr);
+ O->pos = O->buf;
+ return O->space;
+ }
+ case IOFCLOSE:
+ case IOFFLUSH:
+ {
+ fwrite(O->buf, sizeof(uint8_t), iof_size(O), stderr);
+ fflush(stderr);
+ O->pos = O->buf;
+ return 0;
+ }
+ default:
+ break;
+ }
+ return 0;
+}
+
+static uint8_t iof_stdout_buffer[BUFSIZ];
+iof iof_stdout = IOF_WRITER_INIT(stdout_writer, NULL, iof_stdout_buffer, BUFSIZ, 0);
+
+static uint8_t iof_stderr_buffer[BUFSIZ];
+iof iof_stderr = IOF_WRITER_INIT(stderr_writer, NULL, iof_stderr_buffer, BUFSIZ, 0);
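+
+/* Usage sketch (assuming the iof_flush() macro from utiliof.h): iof_puts(&iof_stdout, "message\n"); iof_flush(&iof_stdout); */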
+
+/* read from somewhere */
+
+iof * iof_reader (iof *I, void *link, iof_handler reader, const void *m, size_t bytes)
+{
+ I->space = 0;
+ I->link = link;
+ I->more = reader;
+ I->flags = 0;
+ I->refcount = 0;
+ if (m != NULL)
+ {
+ I->rbuf = I->rpos = (const uint8_t *)m;
+ I->rend = (const uint8_t *)m + bytes;
+ return I;
+ }
+ return NULL;
+}
+
+iof * iof_string_reader (iof *I, const void *s, size_t bytes)
+{
+ I->space = 0;
+ I->link = NULL;
+ I->more = NULL;
+ I->flags = 0; // iof_string() sets IOF_DATA
+ I->refcount = 0;
+ if (s != NULL)
+ return iof_string(I, s, bytes);
+ return NULL;
+}
+
+/* write somewhere */
+
+iof * iof_writer (iof *O, void *link, iof_handler writer, void *m, size_t bytes)
+{
+ O->space = 0;
+ O->link = link;
+ O->more = writer;
+ O->flags = 0;
+ O->refcount = 0;
+ if (m != NULL && bytes > 0)
+ {
+ O->buf = O->pos = (uint8_t *)m;
+ O->end = (uint8_t *)m + bytes;
+ return O;
+ }
+ // return iof_null(O);
+ return NULL;
+}
+
+/* write to growing bytes buffer */
+
+static size_t iof_mem_handler (iof *O, iof_mode mode)
+{
+ switch(mode)
+ {
+ case IOFWRITE:
+ return iof_resize_buffer(O);
+ case IOFCLOSE:
+ iof_free(O);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+iof * iof_setup_buffer (iof *O, void *buffer, size_t space)
+{
+ iof_setup_writer(O, buffer, space);
+ O->link = NULL;
+ O->flags |= IOF_DATA;
+ O->more = iof_mem_handler;
+ return O;
+}
+
+iof * iof_setup_buffermin (iof *O, void *buffer, size_t space, size_t min)
+{
+ iof_setup_buffer(O, buffer, space);
+ if (space < min) // allocate min to avoid further rewriting
+ {
+ O->buf = O->pos = (uint8_t *)util_malloc(min);
+ O->flags |= IOF_BUFFER_ALLOC;
+ O->end = O->buf + min;
+ }
+ return O;
+}
+
+iof * iof_buffer_create (size_t space)
+{
+ uint8_t *buffer;
+ iof *O;
+ O = (iof *)util_malloc(space);
+ buffer = (uint8_t *)(O + 1);
+ iof_setup_buffer(O, buffer, space);
+ O->flags |= IOF_ALLOC;
+ return O;
+}
+
+/* set/get */
+
+int iof_getc (iof *I)
+{
+ if (iof_readable(I))
+ return *I->pos++;
+ return IOFEOF;
+}
+
+int iof_putc (iof *O, int u)
+{
+ if (iof_writable(O))
+ {
+ iof_set(O, u);
+ return (uint8_t)u;
+ }
+ return IOFFULL;
+}
+
+size_t iof_skip (iof *I, size_t bytes)
+{
+ while (bytes)
+ {
+ if (iof_readable(I))
+ ++I->pos;
+ else
+ break;
+ --bytes;
+ }
+ return bytes;
+}
+
+/* from iof to iof */
+
+iof_status iof_pass (iof *I, iof *O)
+{
+ size_t leftin, leftout;
+ if ((leftin = iof_left(I)) == 0)
+ leftin = iof_input(I);
+ while (leftin)
+ {
+ if ((leftout = iof_left(O)) == 0)
+ if ((leftout = iof_output(O)) == 0)
+ return IOFFULL;
+ while (leftin > leftout)
+ {
+ memcpy(O->pos, I->pos, leftout);
+ I->pos += leftout;
+ O->pos = O->end; /* eq. += leftout */
+ leftin -= leftout;
+ if ((leftout = iof_output(O)) == 0)
+ return IOFFULL;
+ }
+ if (leftin)
+ {
+ memcpy(O->pos, I->pos, leftin);
+ I->pos = I->end; /* eq. += leftin */
+ O->pos += leftin;
+ }
+ leftin = iof_input(I);
+ }
+ return IOFEOF;
+}
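+
+/* For instance (stream names hypothetical), a whole decoding pipeline can be expressed as
+   iof_pass(decoder, filewriter): the loop drains the input in buffer-sized chunks and returns
+   IOFEOF when the input is exhausted, or IOFFULL when the output refuses to accept more. */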
+
+/* read n-bytes */
+
+size_t iof_read (iof *I, void *to, size_t size)
+{
+ size_t leftin, done = 0;
+ char *s = (char *)to;
+
+ if ((leftin = iof_left(I)) == 0)
+ if ((leftin = iof_input(I)) == 0)
+ return done;
+ while (size > leftin)
+ {
+ memcpy(s, I->pos, leftin * sizeof(uint8_t));
+ size -= leftin;
+ done += leftin;
+ s += leftin;
+ I->pos = I->end;
+ if ((leftin = iof_input(I)) == 0)
+ return done;
+ }
+ if (size)
+ {
+ memcpy(s, I->pos, size * sizeof(uint8_t));
+ I->pos += size;
+ done += size;
+ }
+ return done;
+}
+
+/* rewrite FILE content (use fseek if needed) */
+
+size_t iof_write_file_handle (iof *O, FILE *file)
+{
+ size_t leftout, size, readout;
+ if ((leftout = iof_left(O)) == 0)
+ if ((leftout = iof_output(O)) == 0)
+ return 0;
+ size = 0;
+ do {
+ readout = fread(O->pos, 1, leftout, file);
+ O->pos += readout;
+ size += readout;
+ } while(readout == leftout && (leftout = iof_output(O)) > 0);
+ return size;
+}
+
+size_t iof_write_file (iof *O, const char *filename)
+{
+ FILE *file;
+ size_t size;
+ if ((file = fopen(filename, "rb")) == NULL)
+ return 0;
+ size = iof_write_file_handle(O, file);
+ fclose(file);
+ return size;
+}
+
+size_t iof_write_iofile (iof *O, iof_file *iofile, int savepos)
+{
+ long offset;
+ size_t size;
+ FILE *file;
+ if (iofile->flags & IOF_DATA)
+ return iof_write(O, iofile->pos, (size_t)(iofile->end - iofile->pos));
+ file = iof_file_get_fh(iofile);
+ if (savepos)
+ {
+ offset = ftell(file);
+ size = iof_write_file_handle(O, file);
+ fseek(file, offset, SEEK_SET);
+ return size;
+ }
+ return iof_write_file_handle(O, file);
+}
+
+/* write n-bytes */
+
+size_t iof_write (iof *O, const void *data, size_t size)
+{
+ size_t leftout, done = 0;
+ const char *s = (const char *)data;
+ if ((leftout = iof_left(O)) == 0)
+ if ((leftout = iof_output(O)) == 0)
+ return done;
+ while (size > leftout)
+ {
+ memcpy(O->pos, s, leftout * sizeof(uint8_t));
+ size -= leftout;
+ done += leftout;
+ s += leftout;
+ O->pos = O->end;
+ if ((leftout = iof_output(O)) == 0)
+ return done;
+ }
+ if (size)
+ {
+ memcpy(O->pos, s, size * sizeof(uint8_t));
+ O->pos += size;
+ done += size;
+ }
+ return done;
+}
+
+/* write '\0'-terminated string */
+
+iof_status iof_puts (iof *O, const void *data)
+{
+ const char *s = (const char *)data;
+ while (*s)
+ {
+ if (iof_writable(O))
+ iof_set(O, *s++);
+ else
+ return IOFFULL;
+ }
+ return IOFEOF; // ?
+}
+
+size_t iof_put_string (iof *O, const void *data)
+{
+ const char *p, *s = (const char *)data;
+ for (p = s; *p != '\0' && iof_writable(O); iof_set(O, *p++));
+ return p - s;
+}
+
+/* write byte n-times */
+
+/*
+iof_status iof_repc (iof *O, char c, size_t bytes)
+{
+ while (bytes)
+ {
+ if (iof_writable(O))
+ iof_set(O, c);
+ else
+ return IOFFULL;
+ --bytes;
+ }
+ return IOFEOF; // ?
+}
+*/
+
+size_t iof_repc (iof *O, char c, size_t bytes)
+{
+ size_t leftout, todo = bytes;
+ if ((leftout = iof_left(O)) == 0)
+ if ((leftout = iof_output(O)) == 0)
+ return 0;
+ while (bytes > leftout)
+ {
+ memset(O->pos, c, leftout);
+ bytes -= leftout;
+ O->pos = O->end;
+ if ((leftout = iof_output(O)) == 0)
+ return todo - bytes;
+ }
+ if (bytes)
+ {
+ memset(O->pos, c, bytes);
+ O->pos += bytes;
+ }
+ return todo;
+}
+
+/* putfs */
+
+#define IOF_FMT_SIZE 1024
+
+size_t iof_putfs (iof *O, const char *format, ...)
+{
+ static char buffer[IOF_FMT_SIZE];
+ va_list args;
+ va_start(args, format);
+ if (vsnprintf(buffer, IOF_FMT_SIZE, format, args) > 0)
+ {
+ va_end(args);
+ return iof_put_string(O, buffer);
+ }
+ else
+ {
+ va_end(args);
+ return iof_write(O, buffer, IOF_FMT_SIZE);
+ }
+}
+
+/* integer from iof; return 1 on success, 0 otherwise */
+
+int iof_get_int32 (iof *I, int32_t *number)
+{
+ int sign, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ if (!base10_digit(c)) return 0;
+ iof_read_integer(I, c, *number);
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_get_slong (iof *I, long *number)
+{
+ int sign, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ if (!base10_digit(c)) return 0;
+ iof_read_integer(I, c, *number);
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_get_int64 (iof *I, int64_t *number)
+{
+ int sign, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ if (!base10_digit(c)) return 0;
+ iof_read_integer(I, c, *number);
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_get_uint32 (iof *I, uint32_t *number)
+{
+ int c = iof_char(I);
+ if (!base10_digit(c)) return 0;
+ iof_read_integer(I, c, *number);
+ return 1;
+}
+
+int iof_get_ulong (iof *I, unsigned long *number)
+{
+ int c = iof_char(I);
+ if (!base10_digit(c)) return 0;
+ iof_read_integer(I, c, *number);
+ return 1;
+}
+
+int iof_get_usize (iof *I, size_t *number)
+{
+ int c = iof_char(I);
+ if (!base10_digit(c)) return 0;
+ iof_read_integer(I, c, *number);
+ return 1;
+}
+
+int iof_get_uint64 (iof *I, uint64_t *number)
+{
+ int c = iof_char(I);
+ if (!base10_digit(c)) return 0;
+ iof_read_integer(I, c, *number);
+ return 1;
+}
+
+int iof_get_int32_radix (iof *I, int32_t *number, int radix)
+{
+ int sign, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ if (!base10_digit(c)) return 0;
+ iof_read_radix(I, c, *number, radix);
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_get_slong_radix (iof *I, long *number, int radix)
+{
+ int sign, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ if (!base10_digit(c)) return 0;
+ iof_read_radix(I, c, *number, radix);
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_get_int64_radix (iof *I, int64_t *number, int radix)
+{
+ int sign, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ if (!base10_digit(c)) return 0;
+ iof_read_radix(I, c, *number, radix);
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_get_uint32_radix (iof *I, uint32_t *number, int radix)
+{
+ int c = iof_char(I);
+ if (!base10_digit(c)) return 0;
+ iof_read_radix(I, c, *number, radix);
+ return 1;
+}
+
+int iof_get_ulong_radix (iof *I, unsigned long *number, int radix)
+{
+ int c = iof_char(I);
+ if (!base10_digit(c)) return 0;
+ iof_read_radix(I, c, *number, radix);
+ return 1;
+}
+
+int iof_get_usize_radix (iof *I, size_t *number, int radix)
+{
+ int c = iof_char(I);
+ if (!base10_digit(c)) return 0;
+ iof_read_radix(I, c, *number, radix);
+ return 1;
+}
+
+int iof_get_uint64_radix (iof *I, uint64_t *number, int radix)
+{
+ int c = iof_char(I);
+ if (!base10_digit(c)) return 0;
+ iof_read_radix(I, c, *number, radix);
+ return 1;
+}
+
+/* get roman to uint16_t, cf. roman_to_uint16() from utilnumber.c */
+
+/* todo: some trick in place of this macro horror? */
+
+#define roman1000(c) (c == 'M' || c == 'm')
+#define roman500(c) (c == 'D' || c == 'd')
+#define roman100(c) (c == 'C' || c == 'c')
+#define roman50(c) (c == 'L' || c == 'l')
+#define roman10(c) (c == 'X' || c == 'x')
+#define roman5(c) (c == 'V' || c == 'v')
+#define roman1(c) (c == 'I' || c == 'i')
+
+#define roman100s(I, c) \
+ (roman100(c) ? (100 + ((c = iof_next(I), roman100(c)) ? (100 + ((c = iof_next(I), roman100(c)) ? (c = iof_next(I), 100) : 0)) : 0)) : 0)
+#define roman10s(I, c) \
+ (roman10(c) ? (10 + ((c = iof_next(I), roman10(c)) ? (10 + ((c = iof_next(I), roman10(c)) ? (c = iof_next(I), 10) : 0)) : 0)) : 0)
+#define roman1s(I, c) \
+ (roman1(c) ? (1 + ((c = iof_next(I), roman1(c)) ? (1 + ((c = iof_next(I), roman1(c)) ? (c = iof_next(I), 1) : 0)) : 0)) : 0)
+
+int iof_get_roman (iof *I, uint16_t *number)
+{
+ int c;
+ /* M */
+ for (*number = 0, c = iof_char(I); roman1000(c); *number += 1000, c = iof_next(I));
+ /* D C */
+ if (roman500(c))
+ {
+ c = iof_next(I);
+ *number += 500 + roman100s(I, c);
+ }
+ else if (roman100(c))
+ {
+ c = iof_next(I);
+ if (roman1000(c))
+ {
+ c = iof_next(I);
+ *number += 900;
+ }
+ else if (roman500(c))
+ {
+ c = iof_next(I);
+ *number += 400;
+ }
+ else
+ *number += 100 + roman100s(I, c);
+ }
+ /* L X */
+ if (roman50(c))
+ {
+ c = iof_next(I);
+ *number += 50 + roman10s(I, c);
+ }
+ else if (roman10(c))
+ {
+ c = iof_next(I);
+ if (roman100(c))
+ {
+ c = iof_next(I);
+ *number += 90;
+ }
+ else if (roman50(c))
+ {
+ c = iof_next(I);
+ *number += 40;
+ }
+ else
+ *number += 10 + roman10s(I, c);
+ }
+ /* V I */
+ if (roman5(c))
+ {
+ c = iof_next(I);
+ *number += 5 + roman1s(I, c);
+ }
+ else if (roman1(c))
+ {
+ c = iof_next(I);
+ if (roman10(c))
+ {
+ c = iof_next(I);
+ *number += 9;
+ }
+ else if (roman5(c))
+ {
+ c = iof_next(I);
+ *number += 4;
+ }
+ else
+ *number += 1 + roman1s(I, c);
+ }
+ return 1;
+}
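+
+/* Example: parsing "MCMXCIV" yields 1000 (M) + 900 (CM) + 90 (XC) + 4 (IV) = 1994. Note the scanner
+   is permissive: it accepts mixed-case letters, does not reject malformed sequences, and always returns 1. */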
+
+/* double from iof; return 1 on success */
+
+int iof_get_double (iof *I, double *number) // cf. string_to_double()
+{
+ int sign, exponent10, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ iof_scan_decimal(I, c, *number);
+ if (c == '.')
+ {
+ c = iof_next(I);
+ iof_scan_fraction(I, c, *number, exponent10);
+ }
+ else
+ exponent10 = 0;
+ if (c == 'e' || c == 'E')
+ {
+ c = iof_next(I);
+ iof_scan_exponent10(I, c, exponent10);
+ }
+ double_exp10(*number, exponent10);
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_get_float (iof *I, float *number) // cf. string_to_float()
+{
+ int sign, exponent10, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ iof_scan_decimal(I, c, *number);
+ if (c == '.')
+ {
+ c = iof_next(I);
+ iof_scan_fraction(I, c, *number, exponent10);
+ }
+ else
+ exponent10 = 0;
+ if (c == 'e' || c == 'E')
+ {
+ c = iof_next(I);
+ iof_scan_exponent10(I, c, exponent10);
+ }
+ float_exp10(*number, exponent10);
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_conv_double (iof *I, double *number) // cf. convert_to_double()
+{
+ int sign, exponent10, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ iof_scan_decimal(I, c, *number);
+ if (c == '.' || c == ',')
+ {
+ c = iof_next(I);
+ iof_scan_fraction(I, c, *number, exponent10);
+ if (exponent10 < 0)
+ double_negative_exp10(*number, exponent10);
+ }
+ if (sign) *number = -*number;
+ return 1;
+}
+
+int iof_conv_float (iof *I, float *number) // cf. convert_to_float()
+{
+ int sign, exponent10, c = iof_char(I);
+ iof_scan_sign(I, c, sign);
+ iof_scan_decimal(I, c, *number);
+ if (c == '.' || c == ',')
+ {
+ c = iof_next(I);
+ iof_scan_fraction(I, c, *number, exponent10);
+ if (exponent10 < 0)
+ float_negative_exp10(*number, exponent10);
+ }
+ if (sign) *number = -*number;
+ return 1;
+}
+
+/* integer to iof; return the number of written bytes */
+
+size_t iof_put_int32 (iof *O, int32_t number)
+{
+ const char *s;
+ size_t size;
+ s = int32_to_string(number, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_slong (iof *O, long number)
+{
+ const char *s;
+ size_t size;
+ s = slong_to_string(number, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_int64 (iof *O, int64_t number)
+{
+ const char *s;
+ size_t size;
+ s = int64_to_string(number, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_uint32 (iof *O, uint32_t number)
+{
+ const char *s;
+ size_t size;
+ s = uint32_to_string(number, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_ulong (iof *O, unsigned long number)
+{
+ const char *s;
+ size_t size;
+ s = ulong_to_string(number, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_usize (iof *O, size_t number)
+{
+ const char *s;
+ size_t size;
+ s = usize_to_string(number, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_uint64 (iof *O, uint64_t number)
+{
+ const char *s;
+ size_t size;
+ s = uint64_to_string(number, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_int32_radix (iof *O, int32_t number, int radix, int uc)
+{
+ const char *s;
+ size_t size;
+ s = int32_to_radix(number, radix, uc, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_slong_radix (iof *O, long number, int radix, int uc)
+{
+ const char *s;
+ size_t size;
+ s = slong_to_radix(number, radix, uc, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_int64_radix (iof *O, int64_t number, int radix, int uc)
+{
+ const char *s;
+ size_t size;
+ s = int64_to_radix(number, radix, uc, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_uint32_radix (iof *O, uint32_t number, int radix, int uc)
+{
+ const char *s;
+ size_t size;
+ s = uint32_to_radix(number, radix, uc, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_ulong_radix (iof *O, unsigned long number, int radix, int uc)
+{
+ const char *s;
+ size_t size;
+ s = ulong_to_radix(number, radix, uc, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_usize_radix (iof *O, size_t number, int radix, int uc)
+{
+ const char *s;
+ size_t size;
+ s = usize_to_radix(number, radix, uc, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_uint64_radix (iof *O, uint64_t number, int radix, int uc)
+{
+ const char *s;
+ size_t size;
+ s = uint64_to_radix(number, radix, uc, &size);
+ return iof_write(O, s, size);
+}
+
+/* roman numerals */
+
+size_t iof_put_roman (iof *O, uint16_t number, int uc)
+{
+ const char *s;
+ size_t size;
+ s = uint16_to_roman(number, uc, &size);
+ return iof_write(O, s, size);
+}
+
+/* double/float to iof; return the number of written bytes */
+
+size_t iof_put_double (iof *O, double number, int digits)
+{
+ const char *s;
+ size_t size;
+ s = double_to_string(number, digits, &size);
+ return iof_write(O, s, size);
+}
+
+size_t iof_put_float (iof *O, float number, int digits)
+{
+ const char *s;
+ size_t size;
+ s = float_to_string(number, digits, &size);
+ return iof_write(O, s, size);
+}
+
+/* iof to binary integer; pretty common */
+
+int iof_get_be_uint2 (iof *I, uint32_t *pnumber)
+{
+ int c1, c2;
+ if ((c1 = iof_get(I)) < 0 || (c2 = iof_get(I)) < 0)
+ return 0;
+ *pnumber = (c1<<8)|c2;
+ return 1;
+}
+
+int iof_get_be_uint3 (iof *I, uint32_t *pnumber)
+{
+ int c1, c2, c3;
+ if ((c1 = iof_get(I)) < 0 || (c2 = iof_get(I)) < 0 || (c3 = iof_get(I)) < 0)
+ return 0;
+ *pnumber = (c1<<16)|(c2<<8)|c3;
+ return 1;
+}
+
+int iof_get_be_uint4 (iof *I, uint32_t *pnumber)
+{
+ int c1, c2, c3, c4;
+ if ((c1 = iof_get(I)) < 0 || (c2 = iof_get(I)) < 0 || (c3 = iof_get(I)) < 0 || (c4 = iof_get(I)) < 0)
+ return 0;
+ *pnumber = (c1<<24)|(c2<<16)|(c3<<8)|c4;
+ return 1;
+}
+
+int iof_get_le_uint2 (iof *I, uint32_t *pnumber)
+{
+ int c1, c2;
+ if ((c1 = iof_get(I)) < 0 || (c2 = iof_get(I)) < 0)
+ return 0;
+ *pnumber = (c2<<8)|c1;
+ return 1;
+}
+
+int iof_get_le_uint3 (iof *I, uint32_t *pnumber)
+{
+ int c1, c2, c3;
+ if ((c1 = iof_get(I)) < 0 || (c2 = iof_get(I)) < 0 || (c3 = iof_get(I)) < 0)
+ return 0;
+ *pnumber = (c3<<16)|(c2<<8)|c1;
+ return 1;
+}
+
+int iof_get_le_uint4 (iof *I, uint32_t *pnumber)
+{
+ int c1, c2, c3, c4;
+ if ((c1 = iof_get(I)) < 0 || (c2 = iof_get(I)) < 0 || (c3 = iof_get(I)) < 0 || (c4 = iof_get(I)) < 0)
+ return 0;
+ *pnumber = (c4<<24)|(c3<<16)|(c2<<8)|c1;
+ return 1;
+}
+
+/* iof input data */
+
+uint8_t * iof_file_input_data (iof_file *iofile, size_t *psize, int *isnew)
+{
+ uint8_t *data;
+ if (iofile->flags & IOF_DATA)
+ {
+ data = iofile->buf;
+ *psize = iofile->end - iofile->buf;
+ *isnew = 0;
+ return data;
+ }
+ if (iof_file_reopen(iofile))
+ {
+ data = iof_copy_file_handle_data(iof_file_get_fh(iofile), psize);
+ *isnew = 1;
+ iof_file_reclose(iofile);
+ return data;
+ }
+ return NULL;
+}
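+
+/* A usage sketch (illustration only, not library code): consume the bytes of
+ an iof_file regardless of whether it is memory-backed or FILE-backed; the
+ isnew flag tells whether the returned block is a fresh copy we must free. */
+#if 0
+static void input_data_example (iof_file *iofile)
+{
+ size_t size;
+ int isnew;
+ uint8_t *data = iof_file_input_data(iofile, &size, &isnew);
+ if (data == NULL)
+ return;
+ fwrite(data, sizeof(uint8_t), size, stdout);
+ if (isnew)
+ util_free(data); /* freshly copied from the file handle, ours to free */
+}
+#endif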
+
+/*
+uint8_t * iof_file_reader_data (iof_file *iofile, size_t *size)
+{
+ uint8_t *data;
+ if (!(iofile->flags & IOF_DATA) || iofile->pos == NULL || (*size = (size_t)iof_left(iofile)) == 0)
+ return NULL;
+ if (iofile->flags & IOF_BUFFER_ALLOC)
+ {
+ data = iofile->buf; // iofile->pos; // returned must be freeable, makes sense when ->buf == ->pos
+ iofile->flags &= ~IOF_BUFFER_ALLOC;
+ iofile->buf = iofile->pos = iofile->end = NULL;
+ return data;
+ }
+ data = (uint8_t *)util_malloc(*size);
+ memcpy(data, iofile->buf, *size);
+ return data;
+}
+
+uint8_t * iof_file_writer_data (iof_file *iofile, size_t *size)
+{
+ uint8_t *data;
+ if (!(iofile->flags & IOF_DATA) || iofile->buf == NULL || (*size = (size_t)iof_size(iofile)) == 0)
+ return NULL;
+ if (iofile->flags & IOF_BUFFER_ALLOC)
+ {
+ iofile->flags &= ~IOF_BUFFER_ALLOC;
+ data = iofile->buf;
+ iofile->buf = iofile->pos = iofile->end = NULL;
+ return data;
+ }
+ data = (uint8_t *)util_malloc(*size);
+ memcpy(data, iofile->buf, *size);
+ return data;
+}
+*/
+
+uint8_t * iof_reader_data (iof *I, size_t *psize)
+{
+ uint8_t *data;
+ *psize = (size_t)iof_left(I);
+ if (I->flags & IOF_BUFFER_ALLOC)
+ {
+ data = I->buf; // actually I->pos, but we have to return something freeable
+ I->flags &= ~IOF_BUFFER_ALLOC;
+ I->buf = NULL;
+ }
+ else
+ {
+ data = util_malloc(*psize);
+ memcpy(data, I->pos, *psize);
+ }
+ iof_close(I);
+ return data;
+}
+
+
+uint8_t * iof_writer_data (iof *O, size_t *psize)
+{
+ uint8_t *data;
+ *psize = (size_t)iof_size(O);
+ if (O->flags & IOF_BUFFER_ALLOC)
+ {
+ data = O->buf;
+ O->flags &= ~IOF_BUFFER_ALLOC;
+ O->buf = NULL;
+ }
+ else
+ {
+ data = util_malloc(*psize);
+ memcpy(data, O->buf, *psize);
+ }
+ iof_close(O);
+ return data;
+}
+
+size_t iof_reader_to_file_handle (iof *I, FILE *file)
+{
+ size_t size;
+ for (size = 0; iof_readable(I); I->pos = I->end)
+ size += fwrite(I->pos, sizeof(uint8_t), iof_left(I), file); // write from pos; iof_left() counts from pos, not buf
+ return size;
+}
+
+size_t iof_reader_to_file (iof *I, const char *filename)
+{
+ FILE *file;
+ size_t size;
+ if ((file = fopen(filename, "wb")) == NULL)
+ return 0;
+ for (size = 0; iof_readable(I); I->pos = I->end)
+ size += fwrite(I->pos, sizeof(uint8_t), iof_left(I), file); // write from pos; iof_left() counts from pos, not buf
+ fclose(file);
+ return size;
+}
+
+/* debug */
+
+size_t iof_data_to_file (const void *data, size_t size, const char *filename)
+{
+ FILE *fh;
+ if ((fh = fopen(filename, "wb")) == NULL)
+ return 0;
+ // size = fwrite(data, size, sizeof(uint8_t), fh); // WRONG, this always returns 1, as fwrite returns the number of elements successfully written out
+ size = fwrite(data, sizeof(uint8_t), size, fh);
+ fclose(fh);
+ return size;
+}
+
+size_t iof_result_to_file_handle (iof *F, FILE *file)
+{
+ const void *data;
+ size_t size;
+ data = iof_result(F, size);
+ return iof_data_to_file_handle(data, size, file);
+}
+
+size_t iof_result_to_file (iof *F, const char *filename)
+{
+ const void *data;
+ size_t size;
+ data = iof_result(F, size);
+ return iof_data_to_file(data, size, filename);
+}
+
+void iof_debug (iof *I, const char *filename)
+{
+ FILE *file = fopen(filename, "wb");
+ if (file != NULL)
+ {
+ fprintf(file, ">>> buf %p <<<\n", I->buf);
+ fwrite(I->buf, sizeof(uint8_t), iof_size(I), file);
+ fprintf(file, "\n>>> pos %p (%ld) <<<\n", I->pos, (long)iof_size(I));
+ fwrite(I->pos, sizeof(uint8_t), iof_left(I), file);
+ fprintf(file, "\n>>> end %p (%ld) <<<\n", I->end, (long)iof_left(I));
+ fwrite(I->end, sizeof(uint8_t), I->space - iof_space(I), file);
+ fprintf(file, "\n>>> end of buffer %p (%ld) <<<\n", I->buf + I->space, (long)(I->buf + I->space - I->end));
+ fclose(file);
+ }
+}
+
+/* common filters api */
+
+/* sizes of filter states on x64
+size of iof_filter: 640 (no longer used; sizeof(iof) plus the largest state size)
+size of file_state: 16
+size of stream_state: 16
+size of flate_state: 104
+size of lzw_state: 56
+size of predictor_state: 104
+size of basexx_state: 48
+size of basexx_state: 48
+size of basexx_state: 48
+size of eexec_state: 40
+size of runlength_state: 24
+size of rc4_state: 24
+size of aes_state: 72
+size of img_state: 576
+size of img: 496
+*/
+
+typedef struct iof_heap iof_heap;
+
+typedef struct {
+ iof_heap *heap;
+} iof_heap_ghost;
+
+
+struct iof_heap {
+ union { uint8_t *data; iof_heap_ghost *gdata; }; // union instead of casts (ARM)
+ union { uint8_t *pos; iof_heap_ghost *gpos; };
+ size_t size, space;
+ iof_heap *next, *prev;
+ int refcount;
+ uint8_t dummy[4]; // pad to 8N bytes
+};
+
+/*
+We use a hidden heap pointer for every allocated buffer, so heap->data should be kept properly
+aligned. The dummy 4-byte pad doesn't really matter (the pad is there anyway), but iof_heap_take()
+must pad the requested size.
+*/
+
+static iof_heap * iof_buffers_heap = NULL;
+static iof_heap * iof_filters_heap = NULL;
+
+#define IOF_HEAP_FILTERS_COUNT 4
+#define IOF_BUFFER_SIZE 262144 // (1<<18)
+#define IOF_FILTER_SIZE 1024
+// sizeof(iof_filter) on x64 is now 640, img_state 576, img 496, others 16-104
+#define IOF_BUFFER_HEAP_SIZE (IOF_HEAP_FILTERS_COUNT * (IOF_BUFFER_SIZE + sizeof(iof_heap_ghost)))
+#define IOF_FILTER_HEAP_SIZE (IOF_HEAP_FILTERS_COUNT * (IOF_FILTER_SIZE + sizeof(iof_heap_ghost)))
+
+static iof_heap * iof_heap_new (size_t space)
+{
+ iof_heap *iofheap;
+ iofheap = (iof_heap *)util_malloc(sizeof(iof_heap) + space);
+ iofheap->gdata = iofheap->gpos = (iof_heap_ghost *)(iofheap + 1);
+ iofheap->size = iofheap->space = space;
+ iofheap->next = NULL;
+ iofheap->prev = NULL;
+ iofheap->refcount = 0;
+ return iofheap;
+}
+
+#define iof_heap_free(iofheap) util_free(iofheap)
+
+void iof_filters_init (void)
+{
+ if (iof_buffers_heap == NULL)
+ iof_buffers_heap = iof_heap_new(IOF_BUFFER_HEAP_SIZE);
+ if (iof_filters_heap == NULL)
+ iof_filters_heap = iof_heap_new(IOF_FILTER_HEAP_SIZE);
+}
+
+void iof_filters_free (void)
+{
+ iof_heap *heap, *next;
+ for (heap = iof_buffers_heap; heap != NULL; heap = next)
+ {
+ next = heap->next;
+ if (heap->refcount != 0)
+ loggerf("not closed iof buffers left (%d)", heap->refcount);
+ if (next != NULL)
+ loggerf("iof buffers heap left");
+ iof_heap_free(heap);
+ }
+ iof_buffers_heap = NULL;
+ for (heap = iof_filters_heap; heap != NULL; heap = next)
+ {
+ next = heap->next;
+ if (heap->refcount != 0)
+ loggerf("not closed iof filters left (%d)", heap->refcount);
+ if (next != NULL)
+ loggerf("iof filters heap left");
+ iof_heap_free(heap);
+ }
+ iof_filters_heap = NULL;
+}
+
+#define iof_heap_get(hp, ghost, data, siz) \
+ (ghost = (hp)->gpos, ghost->heap = (hp), data = (uint8_t *)(ghost + 1), (hp)->pos += siz, (hp)->size -= siz, ++(hp)->refcount)
+
+static void * iof_heap_take (iof_heap **pheap, size_t size)
+{
+ uint8_t *data;
+ iof_heap_ghost *ghost;
+ iof_heap *heap, *newheap, *next;
+
+ heap = *pheap;
+ if (size & 7)
+ size += 8 - (size & 7); // pad to 8N bytes so that (heap->pos + size) remains properly aligned
+ size += sizeof(iof_heap_ghost);
+ if (heap->size >= size)
+ { /* take cheap mem from main heap */
+ iof_heap_get(heap, ghost, data, size);
+ return data;
+ }
+ if (size <= (heap->space >> 1))
+ { /* make new cheap heap, make it front */
+ *pheap = newheap = iof_heap_new(heap->space);
+ newheap->next = heap;
+ heap->prev = newheap;
+ iof_heap_get(newheap, ghost, data, size);
+ return data;
+ }
+ /* size much larger than expected? should not happen.
+ make a single-item heap, keep the front heap intact. */
+ newheap = iof_heap_new(size);
+ if ((next = heap->next) != NULL)
+ {
+ newheap->next = next;
+ next->prev = newheap;
+ }
+ heap->next = newheap;
+ newheap->prev = heap;
+ iof_heap_get(newheap, ghost, data, size);
+ return data;
+}
+
+static void iof_heap_back (void *data)
+{
+ iof_heap_ghost *ghost;
+ iof_heap *heap, *next, *prev;
+
+ ghost = ((iof_heap_ghost *)data) - 1;
+ heap = ghost->heap;
+ if (heap->refcount == 0)
+ loggerf("invalid use of iof heap, refcount would drop below 0");
+ if (--heap->refcount <= 0)
+ {
+ if ((prev = heap->prev) != NULL)
+ { /* free the heap */
+ if ((next = heap->next) != NULL)
+ prev->next = next, next->prev = prev;
+ else
+ prev->next = NULL;
+ iof_heap_free(heap);
+ }
+ else
+ { /* this is the front heap, just reset */
+ heap->pos = heap->data;
+ heap->size = heap->space;
+ }
+ }
+}
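+
+/* A sketch of the intended take/back pairing (illustration only, not library
+ code): the ghost header written just before the returned pointer is what
+ lets iof_heap_back() find the owning heap with no lookup. */
+#if 0
+static void heap_take_back_example (void)
+{
+ void *data;
+ iof_filters_init();
+ data = iof_heap_take(&iof_filters_heap, 100); /* reserves ghost + 104 bytes (100 padded to 8N) */
+ /* ... use data, e.g. as a filter state ... */
+ iof_heap_back(data); /* ((iof_heap_ghost *)data) - 1 recovers the heap */
+}
+#endif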
+
+/**/
+
+/*
+void * iof_filter_new (size_t size)
+{
+ void *data;
+ iof_filters_init();
+ data = iof_heap_take(&iof_filters_heap, size);
+ return memset(data, 0, size);
+}
+*/
+
+iof * iof_filter_reader_new (iof_handler handler, size_t statesize, void **pstate)
+{
+ iof *F;
+ void *filter;
+ uint8_t *buffer;
+ size_t buffersize;
+
+ iof_filters_init();
+ filter = iof_heap_take(&iof_filters_heap, sizeof(iof) + statesize);
+ F = (iof *)memset(filter, 0, sizeof(iof) + statesize);
+ buffer = iof_heap_take(&iof_buffers_heap, IOF_BUFFER_SIZE);
+ buffersize = IOF_BUFFER_SIZE;
+ iof_setup_reader(F, buffer, buffersize);
+ F->flags |= IOF_HEAP|IOF_BUFFER_HEAP;
+ F->more = handler;
+ *pstate = (F + 1);
+ return F;
+}
+
+iof * iof_filter_reader_with_buffer_new (iof_handler handler, size_t statesize, void **pstate, void *buffer, size_t buffersize)
+{ // for filters that have their own buffer (string, some image filters)
+ iof *F;
+ void *filter;
+
+ iof_filters_init();
+ filter = iof_heap_take(&iof_filters_heap, sizeof(iof) + statesize);
+ F = (iof *)memset(filter, 0, sizeof(iof) + statesize);
+ iof_setup_reader(F, buffer, buffersize);
+ F->flags |= IOF_HEAP;
+ F->more = handler;
+ *pstate = (F + 1);
+ return F;
+}
+
+iof * iof_filter_writer_new (iof_handler handler, size_t statesize, void **pstate)
+{
+ iof *F;
+ void *filter;
+ uint8_t *buffer;
+ size_t buffersize;
+
+ iof_filters_init();
+ filter = iof_heap_take(&iof_filters_heap, sizeof(iof) + statesize);
+ F = (iof *)memset(filter, 0, sizeof(iof) + statesize);
+ buffer = iof_heap_take(&iof_buffers_heap, IOF_BUFFER_SIZE);
+ buffersize = IOF_BUFFER_SIZE;
+ iof_setup_writer(F, buffer, buffersize);
+ F->flags |= IOF_HEAP|IOF_BUFFER_HEAP;
+ F->more = handler;
+ *pstate = (F + 1);
+ return F;
+}
+
+iof * iof_filter_writer_with_buffer_new (iof_handler handler, size_t statesize, void **pstate, void *buffer, size_t buffersize)
+{
+ iof *F;
+ void *filter;
+
+ iof_filters_init();
+ filter = iof_heap_take(&iof_filters_heap, sizeof(iof) + statesize);
+ F = (iof *)memset(filter, 0, sizeof(iof) + statesize);
+ iof_setup_writer(F, buffer, buffersize);
+ F->flags |= IOF_HEAP;
+ F->more = handler;
+ *pstate = (F + 1);
+ return F;
+}
+
+/**/
+
+#define iof_filter_free(F) iof_heap_back(F)
+#define iof_filter_buffer_free(data) iof_heap_back(data)
+
+/* close */
+
+#define iof_close_next(F) ((void)(iof_decref((F)->next), (F)->next = NULL, 0))
+/* when filter creation fails, we should take care to destroy the filter but leave ->next intact */
+#define iof_clear_next(F) ((void)(iof_unref((F)->next), (F)->next = NULL, 0))
+
+#define iof_close_buffer(F) ((void)\
+ ((F)->buf != NULL ? \
+ ((F->flags & IOF_BUFFER_ALLOC) ? (util_free((F)->buf), (F)->buf = NULL, 0) : \
+ ((F->flags & IOF_BUFFER_HEAP) ? (iof_filter_buffer_free((F)->buf), (F)->buf = NULL, 0) : ((F)->buf = NULL, 0))) : 0))
+
+/* closing underlying file handle */
+
+static void iof_close_file (iof *F)
+{
+ FILE *file;
+ //if (F->flags & IOF_FILE_HANDLE)
+ //{
+ if ((file = F->file) != NULL)
+ {
+ if (F->flags & IOF_CLOSE_FILE)
+ fclose(F->file);
+ F->file = NULL;
+ }
+ //}
+}
+
+/* a very special variant for reader filters initiated with iof_file_reopen(). It also calls
+ iof_file_reclose(), which takes effect only if previously reopened; better to keep
+ all this thin ice separated. Used in filters: iofile_reader, iofile_stream_reader, image
+ decoders. */
+
+static void iof_close_iofile (iof *F)
+{
+ iof_file *iofile;
+ //if (F->flags & IOF_FILE)
+ //{
+ if ((iofile = F->iofile) != NULL)
+ {
+ iof_file_unsync(iofile, NULL);
+ iof_file_reclose(iofile); // takes effect iff previously reopened
+ iof_file_decref(iofile);
+ F->iofile = NULL;
+ }
+ //}
+}
+
+void iof_free (iof *F)
+{
+ if (F->flags & IOF_FILE_HANDLE)
+ iof_close_file(F);
+ else if (F->flags & IOF_FILE)
+ iof_close_iofile(F);
+ else if (F->flags & IOF_NEXT)
+ iof_close_next(F);
+ iof_close_buffer(F);
+ if (F->flags & IOF_HEAP)
+ iof_filter_free(F);
+ else if (F->flags & IOF_ALLOC)
+ util_free(F);
+}
+
+void iof_discard (iof *F)
+{ // so far used only on failed filter creation; like iof_free() but doesn't dare to release ->next
+ if (F->flags & IOF_FILE_HANDLE)
+ iof_close_file(F);
+ else if (F->flags & IOF_FILE)
+ iof_close_iofile(F);
+ //else if (F->flags & IOF_NEXT)
+ // iof_close_next(F);
+ iof_close_buffer(F);
+ if (F->flags & IOF_HEAP)
+ iof_filter_free(F);
+ else if (F->flags & IOF_ALLOC)
+ util_free(F);
+}
+
+/* resizing buffer */
+
+size_t iof_resize_buffer_to (iof *O, size_t space)
+{
+ uint8_t *buf;
+
+ if (O->flags & IOF_BUFFER_ALLOC)
+ {
+ buf = (uint8_t *)util_realloc(O->buf, space);
+ }
+ else
+ {
+ buf = (uint8_t *)util_malloc(space);
+ memcpy(buf, O->buf, iof_size(O));
+ if (O->flags & IOF_BUFFER_HEAP)
+ {
+ iof_filter_buffer_free(O->buf);
+ O->flags &= ~IOF_BUFFER_HEAP;
+ }
+ O->flags |= IOF_BUFFER_ALLOC;
+
+ }
+ O->pos = buf + iof_size(O);
+ O->end = buf + space;
+ O->buf = buf;
+ O->space = space;
+ return iof_left(O);
+}
+
+/* */
+
+size_t iof_decoder_retval (iof *I, const char *type, iof_status status)
+{
+ switch (status)
+ {
+ case IOFERR:
+ case IOFEMPTY: // should never happen as we set state.flush = 1 on decoders init
+ loggerf("%s decoder error (%d, %s)", type, status, iof_status_kind(status));
+ I->flags |= IOF_STOPPED;
+ return 0;
+ case IOFEOF: // this is the last chunk,
+ I->flags |= IOF_STOPPED; // so stop it and fall
+ FALLTHRU // fall through
+ case IOFFULL: // prepare pointers to read from I->buf
+ I->end = I->pos;
+ I->pos = I->buf;
+ return I->end - I->buf;
+ }
+ loggerf("%s decoder bug, invalid retval %d", type, status);
+ return 0;
+}
+
+size_t iof_encoder_retval (iof *O, const char *type, iof_status status)
+{
+ switch (status)
+ {
+ case IOFERR:
+ case IOFFULL:
+ loggerf("%s encoder error (%d, %s)", type, status, iof_status_kind(status));
+ return 0;
+ case IOFEMPTY:
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ return O->space;
+ case IOFEOF:
+ return 0;
+ }
+ loggerf("%s encoder bug, invalid retval %d", type, status);
+ return 0;
+}
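+
+/* A hypothetical handler sketch (illustration only; real decoders live in
+ utilbasexx.c and friends, and xx_state / xx_decode() are made-up names)
+ showing where iof_decoder_retval() belongs in a filter's IOFREAD branch: */
+#if 0
+static size_t filter_xx_decoder (iof *F, iof_mode mode)
+{
+ xx_state *state = iof_filter_state(xx_state *, F);
+ switch (mode)
+ {
+ case IOFREAD:
+ return iof_decoder_retval(F, "xx", xx_decode(F->next, F, state));
+ case IOFCLOSE:
+ iof_free(F);
+ return 0;
+ default:
+ return 0;
+ }
+}
+#endif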
+
+/* file/stream state */
+
+typedef struct {
+ size_t length;
+ size_t offset;
+} file_state;
+
+#define file_state_init(state, off, len) ((state)->offset = off, (state)->length = len)
+
+typedef struct {
+ size_t length;
+ size_t offset;
+} stream_state;
+
+#define stream_state_init(state, off, len) ((state)->offset = off, (state)->length = len)
+
+/* union type to avoid 'dereferencing type-punned .. ' warnings on (void **) case */
+
+typedef union { file_state *filestate; stream_state *streamstate; void *voidstate; } fs_state_pointer;
+
+/**/
+
+static size_t file_read (iof *I)
+{
+ size_t bytes, tail;
+ if (I->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(I);
+ if ((bytes = tail + fread(I->buf + tail, sizeof(uint8_t), I->space - tail, I->file)) < I->space)
+ I->flags |= IOF_STOPPED;
+ I->pos = I->buf;
+ I->end = I->buf + bytes;
+ return bytes;
+}
+
+static size_t iofile_read (iof *I, size_t *poffset)
+{
+ size_t bytes, tail;
+ if (I->flags & IOF_STOPPED)
+ return 0;
+ iof_file_sync(I->iofile, poffset);
+ tail = iof_tail(I);
+ if ((bytes = tail + iof_file_read(I->buf + tail, sizeof(uint8_t), I->space - tail, I->iofile)) < I->space)
+ {
+ I->flags |= IOF_STOPPED;
+ iof_file_unsync(I->iofile, poffset);
+ }
+ I->pos = I->buf;
+ I->end = I->buf + bytes;
+ return bytes;
+}
+
+static size_t file_load (iof *I)
+{
+ size_t bytes, left, tail;
+ if (I->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(I);
+ I->pos = I->buf + tail;
+ I->end = I->buf + I->space; /* don't assume it's done when initializing the filter */
+ left = I->space - tail;
+ do {
+ bytes = fread(I->pos, sizeof(uint8_t), left, I->file);
+ I->pos += bytes;
+ } while (bytes == left && (left = iof_resize_buffer(I)) > 0);
+ I->flags |= IOF_STOPPED;
+ return iof_loaded(I);
+}
+
+static size_t iofile_load (iof *I, size_t *poffset)
+{
+ size_t bytes, left, tail;
+ if (I->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(I);
+ I->pos = I->buf + tail;
+ I->end = I->buf + I->space; /* don't assume it's done when initializing the filter */
+ left = I->space - tail;
+ iof_file_sync(I->iofile, poffset);
+ do {
+ bytes = iof_file_read(I->pos, sizeof(uint8_t), left, I->iofile);
+ I->pos += bytes;
+ } while (bytes == left && (left = iof_resize_buffer(I)) > 0);
+ I->flags |= IOF_STOPPED;
+ iof_file_unsync(I->iofile, poffset);
+ return iof_loaded(I);
+}
+
+static size_t filter_file_reader (iof *I, iof_mode mode)
+{
+ switch (mode)
+ {
+ case IOFREAD:
+ return file_read(I);
+ case IOFLOAD:
+ return file_load(I);
+ case IOFCLOSE:
+ iof_free(I);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static size_t filter_iofile_reader (iof *I, iof_mode mode)
+{
+ file_state *state;
+ state = iof_filter_state(file_state *, I);
+ switch (mode)
+ {
+ case IOFREAD:
+ return iofile_read(I, &state->offset);
+ case IOFLOAD:
+ return iofile_load(I, &state->offset);
+ case IOFCLOSE:
+ iof_free(I);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static size_t file_write (iof *O, int flush)
+{
+ size_t bytes;
+ if ((bytes = iof_size(O)) > 0)
+ if (bytes != fwrite(O->buf, sizeof(uint8_t), bytes, O->file))
+ return 0;
+ if (flush)
+ fflush(O->file);
+ O->end = O->buf + O->space; // remains intact actually
+ O->pos = O->buf;
+ return O->space;
+}
+
+static size_t iofile_write (iof *O, size_t *poffset, int flush)
+{
+ size_t bytes;
+ iof_file_sync(O->iofile, poffset);
+ if ((bytes = iof_size(O)) > 0)
+ {
+ if (bytes != iof_file_write(O->buf, sizeof(uint8_t), bytes, O->iofile))
+ {
+ iof_file_unsync(O->iofile, poffset);
+ return 0;
+ }
+ }
+ if (flush)
+ iof_file_flush(O->iofile);
+ O->end = O->buf + O->space; // remains intact actually
+ O->pos = O->buf;
+ return O->space;
+}
+
+static size_t filter_file_writer (iof *O, iof_mode mode)
+{
+ switch (mode)
+ {
+ case IOFWRITE:
+ return file_write(O, 0);
+ case IOFFLUSH:
+ return file_write(O, 1);
+ case IOFCLOSE:
+ file_write(O, 1);
+ iof_free(O);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static size_t filter_iofile_writer (iof *O, iof_mode mode)
+{
+ file_state *state;
+ state = iof_filter_state(file_state *, O);
+ switch (mode)
+ {
+ case IOFWRITE:
+ return iofile_write(O, &state->offset, 0);
+ case IOFFLUSH:
+ return iofile_write(O, &state->offset, 1);
+ case IOFCLOSE:
+ iofile_write(O, &state->offset, 1);
+ iof_free(O);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+/* filter from FILE* */
+
+iof * iof_filter_file_handle_reader (FILE *file)
+{
+ iof *I;
+ fs_state_pointer P;
+ if (file == NULL)
+ return NULL;
+ I = iof_filter_reader(filter_file_reader, sizeof(file_state), &P.voidstate);
+ iof_setup_file(I, file);
+ file_state_init(P.filestate, 0, 0);
+ return I;
+}
+
+iof * iof_filter_file_handle_writer (FILE *file)
+{
+ iof *O;
+ fs_state_pointer P;
+ if (file == NULL)
+ return NULL;
+ O = iof_filter_writer(filter_file_writer, sizeof(file_state), &P.voidstate);
+ iof_setup_file(O, file);
+ file_state_init(P.filestate, 0, 0);
+ return O;
+}
+
+/* filter from iof_file * */
+
+iof * iof_filter_iofile_reader (iof_file *iofile, size_t offset)
+{
+ iof *I;
+ fs_state_pointer P;
+ if (!iof_file_reopen(iofile))
+ return NULL;
+ I = iof_filter_reader(filter_iofile_reader, sizeof(file_state), &P.voidstate);
+ iof_setup_iofile(I, iofile);
+ file_state_init(P.filestate, offset, 0);
+ return I;
+}
+
+iof * iof_filter_iofile_writer (iof_file *iofile, size_t offset)
+{
+ iof *O;
+ fs_state_pointer P;
+ O = iof_filter_writer(filter_iofile_writer, sizeof(file_state), &P.voidstate);
+ iof_setup_iofile(O, iofile);
+ file_state_init(P.filestate, offset, 0);
+ return O;
+}
+
+/* filter from filename */
+
+iof * iof_filter_file_reader (const char *filename)
+{
+ iof *I;
+ fs_state_pointer P;
+ FILE *file;
+ if ((file = fopen(filename, "rb")) == NULL)
+ return NULL;
+ I = iof_filter_reader(filter_file_reader, sizeof(file_state), &P.voidstate);
+ iof_setup_file(I, file);
+ file_state_init(P.filestate, 0, 0);
+ I->flags |= IOF_CLOSE_FILE;
+ return I;
+}
+
+iof * iof_filter_file_writer (const char *filename)
+{
+ iof *O;
+ fs_state_pointer P;
+ FILE *file;
+ if ((file = fopen(filename, "wb")) == NULL)
+ return NULL;
+ O = iof_filter_writer(filter_file_writer, sizeof(file_state), &P.voidstate);
+ iof_setup_file(O, file);
+ file_state_init(P.filestate, 0, 0);
+ O->flags |= IOF_CLOSE_FILE;
+ return O;
+}
+
+/* from string */
+
+static size_t dummy_handler (iof *I, iof_mode mode)
+{
+ switch (mode)
+ {
+ case IOFCLOSE:
+ iof_free(I);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+iof * iof_filter_string_reader (const void *s, size_t length)
+{
+ iof *I;
+ void *dummy;
+ I = iof_filter_reader_with_buffer(dummy_handler, 0, &dummy, NULL, 0);
+ I->rbuf = I->rpos = (const uint8_t *)s;
+ I->rend = (const uint8_t *)s + length;
+ // I->space = length;
+ return I;
+}
+
+iof * iof_filter_string_writer (void *s, size_t length)
+{
+ iof *O;
+ void *dummy;
+ // was set up as a reader over read-only pointers (a copy-paste of the reader above); a writer needs writer setup
+ O = iof_filter_writer_with_buffer(dummy_handler, 0, &dummy, s, length);
+ return O;
+}
+
+iof * iof_filter_buffer_writer (size_t size)
+{ // cmp iof_buffer_create()
+ iof *O;
+ fs_state_pointer dummy;
+ uint8_t *buffer;
+ if (size > IOF_BUFFER_SIZE)
+ {
+ buffer = (uint8_t *)util_malloc(size);
+ O = iof_filter_writer_with_buffer(iof_mem_handler, 0, &dummy.voidstate, buffer, size);
+ O->flags |= IOF_BUFFER_ALLOC;
+ return O;
+ }
+ return iof_filter_writer(iof_mem_handler, 0, &dummy.voidstate);
+}
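+
+/* A usage sketch (illustration only, not library code): build bytes in a
+ heap-backed writer and detach them; iof_writer_data() closes the filter and
+ hands over a block the caller owns (freeable with util_free()). */
+#if 0
+static uint8_t * example_payload (size_t *psize)
+{
+ iof *O = iof_filter_buffer_writer(0);
+ iof_puts(O, "answer: ");
+ iof_put_uint32(O, 42);
+ return iof_writer_data(O, psize);
+}
+#endif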
+
+/* stream */
+
+static size_t file_stream_read (iof *I, size_t *plength)
+{
+ size_t bytes, tail;
+ if (I->flags & IOF_STOPPED || *plength == 0)
+ return 0;
+ tail = iof_tail(I);
+ if (I->space - tail >= *plength)
+ {
+ bytes = tail + fread(I->buf + tail, sizeof(uint8_t), *plength, I->file);
+ I->flags |= IOF_STOPPED;
+ *plength = 0;
+ }
+ else
+ {
+ bytes = tail + fread(I->buf + tail, sizeof(uint8_t), I->space - tail, I->file);
+ *plength -= bytes - tail;
+ }
+ I->pos = I->buf;
+ I->end = I->buf + bytes;
+ return bytes;
+}
+
+static size_t iofile_stream_read (iof *I, size_t *plength, size_t *poffset)
+{
+ size_t bytes, tail;
+ if (I->flags & IOF_STOPPED || *plength == 0)
+ return 0;
+ tail = iof_tail(I);
+ iof_file_sync(I->iofile, poffset);
+ if (I->space - tail >= *plength)
+ {
+ bytes = tail + iof_file_read(I->buf + tail, sizeof(uint8_t), *plength, I->iofile);
+ iof_file_unsync(I->iofile, poffset);
+ I->flags |= IOF_STOPPED;
+ *plength = 0;
+ }
+ else
+ {
+ bytes = tail + iof_file_read(I->buf + tail, sizeof(uint8_t), I->space - tail, I->iofile);
+ *plength -= bytes - tail;
+ }
+ I->pos = I->buf;
+ I->end = I->buf + bytes;
+ return bytes;
+}
+
+static size_t file_stream_load (iof *I, size_t *plength)
+{
+ size_t bytes, tail;
+ if (I->flags & IOF_STOPPED || *plength == 0)
+ return 0;
+ tail = iof_tail(I);
+ if (I->space - tail < *plength)
+ if (iof_resize_buffer_to(I, tail + *plength) == 0)
+ return 0;
+ bytes = tail + fread(I->buf + tail, sizeof(uint8_t), *plength, I->file);
+ I->flags |= IOF_STOPPED;
+ *plength = 0;
+ I->pos = I->buf;
+ I->end = I->buf + bytes;
+ return bytes;
+}
+
+static size_t iofile_stream_load (iof *I, size_t *plength, size_t *poffset)
+{
+ size_t bytes, tail;
+ if (I->flags & IOF_STOPPED || *plength == 0)
+ return 0;
+ iof_file_sync(I->iofile, poffset);
+ tail = iof_tail(I);
+ if (I->space - tail < *plength)
+ if (iof_resize_buffer_to(I, tail + *plength) == 0)
+ return 0;
+ bytes = tail + iof_file_read(I->buf + tail, sizeof(uint8_t), *plength, I->iofile);
+ iof_file_unsync(I->iofile, poffset);
+ I->flags |= IOF_STOPPED;
+ *plength = 0;
+ I->pos = I->buf;
+ I->end = I->buf + bytes;
+ return bytes;
+}
+
+static size_t filter_file_stream_reader (iof *I, iof_mode mode)
+{
+ stream_state *state;
+ state = iof_filter_state(stream_state *, I);
+ switch(mode)
+ {
+ case IOFREAD:
+ return file_stream_read(I, &state->length);
+ case IOFLOAD:
+ return file_stream_load(I, &state->length);
+ case IOFCLOSE:
+ iof_free(I);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static size_t filter_iofile_stream_reader (iof *I, iof_mode mode)
+{
+ stream_state *state;
+ state = iof_filter_state(stream_state *, I);
+ switch(mode)
+ {
+ case IOFREAD:
+ return iofile_stream_read(I, &state->length, &state->offset);
+ case IOFLOAD:
+ return iofile_stream_load(I, &state->length, &state->offset);
+ case IOFCLOSE:
+ iof_free(I);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+iof * iof_filter_stream_reader (FILE *file, size_t offset, size_t length)
+{
+ iof *I;
+ fs_state_pointer P;
+ I = iof_filter_reader(filter_file_stream_reader, sizeof(stream_state), &P.voidstate);
+ iof_setup_file(I, file);
+ stream_state_init(P.streamstate, offset, length);
+ fseek(file, (long)offset, SEEK_SET); // or perhaps it should be called in file_stream_read(), like iof_file_sync()?
+ return I;
+}
+
+iof * iof_filter_stream_coreader (iof_file *iofile, size_t offset, size_t length)
+{
+ iof *I;
+ fs_state_pointer P;
+ if (!iof_file_reopen(iofile))
+ return NULL;
+ I = iof_filter_reader(filter_iofile_stream_reader, sizeof(stream_state), &P.voidstate);
+ iof_setup_iofile(I, iofile);
+ stream_state_init(P.streamstate, offset, length);
+ return I;
+}
+
+static size_t file_stream_write (iof *O, size_t *plength, int flush)
+{
+ size_t bytes;
+ if ((bytes = iof_size(O)) > 0)
+ {
+ if (bytes != fwrite(O->buf, sizeof(uint8_t), bytes, O->file))
+ {
+ *plength += bytes;
+ return 0;
+ }
+ }
+ if (flush)
+ fflush(O->file);
+ *plength += bytes;
+ O->end = O->buf + O->space; // remains intact
+ O->pos = O->buf;
+ return O->space;
+}
+
+static size_t iofile_stream_write (iof *O, size_t *plength, size_t *poffset, int flush)
+{
+ size_t bytes;
+ if ((bytes = iof_size(O)) > 0)
+ {
+ iof_file_sync(O->iofile, poffset);
+ if (bytes != iof_file_write(O->buf, sizeof(uint8_t), bytes, O->iofile))
+ {
+ *plength += bytes;
+ iof_file_unsync(O->iofile, poffset);
+ return 0;
+ }
+ }
+ if (flush)
+ iof_file_flush(O->iofile);
+ *plength += bytes;
+ O->end = O->buf + O->space; // remains intact
+ O->pos = O->buf;
+ return O->space;
+}
+
+static size_t filter_file_stream_writer (iof *O, iof_mode mode)
+{
+ stream_state *state;
+ state = iof_filter_state(stream_state *, O);
+ switch (mode)
+ {
+ case IOFWRITE:
+ return file_stream_write(O, &state->length, 0);
+ case IOFFLUSH:
+ return file_stream_write(O, &state->length, 1);
+ case IOFCLOSE:
+ file_stream_write(O, &state->length, 1);
+ iof_free(O);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static size_t filter_iofile_stream_writer (iof *O, iof_mode mode)
+{
+ stream_state *state;
+ state = iof_filter_state(stream_state *, O);
+ switch (mode)
+ {
+ case IOFWRITE:
+ return iofile_stream_write(O, &state->length, &state->offset, 0);
+ case IOFFLUSH:
+ return iofile_stream_write(O, &state->length, &state->offset, 1);
+ case IOFCLOSE:
+ iofile_stream_write(O, &state->length, &state->offset, 1);
+ iof_free(O);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+iof * iof_filter_stream_writer (FILE *file)
+{
+ iof *O;
+ fs_state_pointer P;
+ O = iof_filter_writer(filter_file_stream_writer, sizeof(stream_state), &P.voidstate);
+ iof_setup_file(O, file);
+ stream_state_init(P.streamstate, 0, 0);
+ return O;
+}
+
+iof * iof_filter_stream_cowriter (iof_file *iofile, size_t offset)
+{
+ iof *O;
+ fs_state_pointer P;
+ O = iof_filter_writer(filter_iofile_stream_writer, sizeof(stream_state), &P.voidstate);
+ iof_setup_iofile(O, iofile);
+ stream_state_init(P.streamstate, offset, 0);
+ return O;
+}
+
+/* very specific for images; get input from an already created stream filter, exchange the filter but keep the buffer */
+
+FILE * iof_filter_file_reader_source (iof *I, size_t *poffset, size_t *plength)
+{
+ fs_state_pointer P;
+ if (I->more == filter_file_stream_reader) // I is the result of iof_filter_stream_reader()
+ {
+ P.streamstate = iof_filter_state(stream_state *, I);
+ *poffset = P.streamstate->offset;
+ *plength = P.streamstate->length; // might be 0 but it is ok for file readers
+ return I->file;
+ }
+ if (I->more == filter_file_reader)
+ {
+ P.filestate = iof_filter_state(file_state *, I);
+ *poffset = P.filestate->offset;
+ *plength = P.filestate->length; // might be 0 but it is ok for file readers
+ return I->file;
+ }
+ return NULL;
+}
+
+iof_file * iof_filter_file_coreader_source (iof *I, size_t *poffset, size_t *plength)
+{
+ fs_state_pointer P;
+ if (I->more == filter_iofile_stream_reader) // I is the result of iof_filter_stream_coreader()
+ {
+ P.streamstate = iof_filter_state(stream_state *, I);
+ *poffset = P.streamstate->offset;
+ *plength = P.streamstate->length;
+ return I->iofile;
+ }
+ if (I->more == filter_iofile_reader)
+ {
+ P.filestate = iof_filter_state(file_state *, I);
+ *poffset = P.filestate->offset;
+ *plength = P.filestate->length;
+ return I->iofile;
+ }
+ return NULL;
+}
+
+iof * iof_filter_reader_replacement (iof *P, iof_handler handler, size_t statesize, void **pstate)
+{ // called after iof_filter_file_reader_source(); no need to check whether P is a filter from the iof heap or whether its buffer comes from the iof heap
+ iof *F;
+ F = iof_filter_reader_with_buffer(handler, statesize, pstate, P->buf, P->space);
+ F->flags |= IOF_BUFFER_HEAP;
+ //iof_setup_reader(P, NULL, 0);
+ //P->flags &= ~IOF_BUFFER_HEAP;
+ iof_filter_free(P);
+ return F;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/source/luametatex/source/libraries/pplib/util/utiliof.h b/source/luametatex/source/libraries/pplib/util/utiliof.h
new file mode 100644
index 000000000..bad43a773
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utiliof.h
@@ -0,0 +1,673 @@
+
+#ifndef UTIL_IOF_H
+#define UTIL_IOF_H
+
+#include <stdio.h> // for FILE *
+#include <errno.h> // for errno
+#include <string.h> // for strerror()
+#include <stdint.h> // for uintN_t
+#include <stdlib.h> // for abort(), used by the default iof_panic()
+
+#include "utildecl.h"
+#include "utilnumber.h"
+
+/* handler call modes */
+
+typedef enum {
+ IOFREAD = 0, /* read to buffer */
+ IOFLOAD = 1, /* read all to buffer */
+ IOFWRITE = 2, /* write buffer to the output */
+ IOFFLUSH = 3, /* flush buffer to the output */
+ IOFCLOSE = 4 /* (flush and) close */
+} iof_mode;
+
+/* return statuses */
+
+typedef enum {
+ IOFEOF = -1, /* end of input */
+ IOFEMPTY = -2, /* end of input buffer */
+ IOFFULL = -3, /* end of output buffer */
+ IOFERR = -4 /* error */
+} iof_status;
+
+const char * iof_status_kind (iof_status status);
+
+/* iof_file */
+
+typedef struct iof_file {
+ union {
+ FILE *iofh; // access via iof_file_get_fh / iof_file_set_fh (below)
+ union {
+ struct { uint8_t *buf, *pos, *end; };
+ struct { const uint8_t *rbuf, *rpos, *rend; }; // to trick compiler warnings about cast discarding const
+ };
+ };
+ size_t *offset;
+ char *name;
+ size_t size;
+ int refcount;
+ int flags;
+} iof_file;
+
+/* iof handler function */
+
+typedef struct iof iof;
+typedef size_t (*iof_handler) (iof *I, iof_mode mode);
+
+/* iof structure; keep 8N bytes */
+
+#define IOF_MEMBERS \
+ union { \
+ struct { uint8_t *buf, *pos, *end; }; \
+ struct { uint16_t *hbuf, *hpos, *hend; }; \
+ struct { uint32_t *ibuf, *ipos, *iend; }; \
+ struct { const uint8_t *rbuf, *rpos, *rend; }; \
+ }; \
+ size_t space; \
+ iof_handler more; \
+ union { void *link; iof *next; FILE *file; iof_file *iofile; }; \
+ int flags; \
+ int refcount
+
+/*
+ buf -- the beginning of buffer
+ pos -- the current position
+ end -- the end of buffer
+ space -- private space size, not always equal to (end - buf)
+ more -- handler function
+ next/file/iofile/link -- reader source or writer target (the source filter for chained readers)
+ flags -- private filter info
+ refcount -- reference count
+*/
+
+struct iof {
+ IOF_MEMBERS;
+};
+
+typedef void (*iof_dump_function) (const void *value, iof *O);
+
+/* flags */
+
+#define IOF_ALLOC (1<<0) // iof is allocated
+#define IOF_HEAP (1<<1) // iof taken from iof heap
+#define IOF_BUFFER_ALLOC (1<<2) // buffer allocated
+#define IOF_BUFFER_HEAP (1<<3) // buffer taken from iof heap
+
+#define IOF_SHORT (1<<4) // buffer uses 16bit integers
+#define IOF_LONG (1<<5) // buffer uses 32bit integers
+
+#define IOF_TAIL (1<<6) // preserve reader tail
+#define IOF_READER (1<<7) // is reader
+#define IOF_WRITER (1<<8) // is writer
+
+#define IOF_DATA (1<<9) // binds some memory
+#define IOF_FILE_HANDLE (1<<10) // links FILE *
+#define IOF_FILE (1<<11) // links iof_file *
+#define IOF_NEXT (1<<12) // links next iof *
+#define IOF_CLOSE_FILE (1<<13) // close FILE * on free
+#define IOF_REOPEN_FILE (1<<14) // close/reopen mode for iof_file
+#define IOF_RECLOSE_FILE (1<<15) // ditto
+
+#define IOF_STOPPED (1<<16) // stopped
+
+// #define IOF_CUSTOM (1<<17) // first custom flag
+
+#define IOF_BUFSIZ (sizeof(iof) + BUFSIZ*sizeof(uint8_t))
+
+/*
+reader -- buf, pos and end pointers are all initialized to the beginning of the private buffer;
+ the next call to a handler function moves the end pointer to buffer+space
+writer -- buf and pos pointers initialized to the beginning of the buffer, end initialized to buffer+space
+
+Every call to a handler returns the size_t number of bytes
+available (to write/read) or 0 if there is no more space.
+
+We usually align the data buffer just after the iof structure.
+This is convenient, especially when the memory for the structure
+and its buffer is to be allocated at once. In the case of growing output
+buffers we used to check if the memory of the buffer was allocated
+by the handler function using the test (O->buf != (O+1)). We no longer
+use it, so as not to rely on little secrets. Now there is an explicit
+IOF_BUFFER_ALLOC flag for that. IOF_ALLOC tells if the structure
+itself is taken from malloc (not used so far). We assume the buffer size
+is way larger than sizeof(iof).
+*/
+
+/* initializers */
+
+#define IOF_READER_INIT(handler, file, buffer, size, flags) \
+ { {{ (uint8_t *)(buffer), (uint8_t *)(buffer), (uint8_t *)(buffer) }}, size, handler, { file }, (flags)|IOF_READER, 0 }
+
+#define IOF_WRITER_INIT(handler, file, buffer, size, flags) \
+ { {{ (uint8_t *)(buffer), (uint8_t *)(buffer), (uint8_t *)(buffer) + size }}, size, handler, { file }, (flags)|IOF_WRITER, 0 }
+
+#define IOF_STRING_INIT(buffer, size) \
+ { {{ (uint8_t *)(buffer), (uint8_t *)(buffer), (uint8_t *)(buffer) + size }}, size, NULL, { NULL }, 0|IOF_READER|IOF_DATA, 0 }
+
+#define IOF_STRING() IOF_STRING_INIT(0, 0)
+
+/* refcount */
+
+#define iof_incref(I) (++(I)->refcount)
+#define iof_decref(I) ((void)(--(I)->refcount <= 0 && iof_close(I)))
+#define iof_unref(I) (--(I)->refcount)
+
+/* binding buffer of a given size */
+
+#define iof_setup_reader(I, buffer, size) \
+ ((I)->buf = (I)->pos = (I)->end = (uint8_t *)(buffer), \
+ (I)->space = size, (I)->flags = 0|IOF_READER, (I)->refcount = 0)
+
+#define iof_setup_writer(O, buffer, size) \
+ ((O)->buf = (O)->pos = (uint8_t *)(buffer), \
+ (O)->end = (uint8_t *)(buffer) + size, \
+ (O)->space = size, (O)->flags = 0|IOF_WRITER, (O)->refcount = 0)
+
+/* basics */
+
+#define iof_space(I) ((I)->end - (I)->buf)
+#define iof_left(I) ((I)->end - (I)->pos)
+#define iof_size(I) ((I)->pos - (I)->buf)
+
+#define iof_input(I) ((I)->more ? (I)->more((I), IOFREAD) : 0lu)
+#define iof_load(I) ((I)->more ? (I)->more((I), IOFLOAD) : 0lu)
+
+#define iof_output(O) ((O)->more ? (O)->more((O), IOFWRITE) : 0lu)
+//#define iof_flush(O) ((O)->pos > (O)->buf && (O)->more ? (O)->more((O), IOFFLUSH) : 0lu)
+// flush should be unconditional, because encoders emit EOD markers only on flush
+#define iof_flush(O) ((O)->more ? (O)->more((O), IOFFLUSH) : 0lu)
+#define iof_close(O) ((O)->more ? (O)->more((O), IOFCLOSE) : 0lu)
+
+#define iof_stop(F) ((void)((F)->pos = (F)->end = (F)->buf, (F)->flags |= IOF_STOPPED))
+
+/*
+Rewriting the reader tail to the beginning of a new data portion; readers reacting to IOFREAD
+mode must be aware of not-yet-read data, but need to preserve it only if the IOF_TAIL flag is set.
+Parsers using iof input may protect not-yet-read data when there may be a need to put bytes
+back into the stream. This is trivial when I->pos > I->buf, as we can move back by --I->pos.
+But when there is a need to put back more than one byte, we can protect the data tail, so that
+the reloader will rewrite it to the beginning of the new data chunk.
+
+ iof_tail(I) - internal, used by iof handlers in IOFREAD mode
+ iof_protect_tail(I) - used by parsers to keep some byte chunk in one piece
+
+*/
+
+size_t iof_save_tail (iof *I);
+#define iof_tail(I) (((I)->flags & IOF_TAIL) && (I)->pos < (I)->end ? iof_save_tail(I) : 0)
+
+size_t iof_input_save_tail (iof *I, size_t back);
+#define iof_protect_tail(I, back, length) ((iof_left(I) >= (length) - (back)) ? 1 : (iof_input_save_tail(I, back) >= (length) - (back)))
+
+//uint8_t * iof_tail_data (iof *I, size_t *ptail);
+//#define iof_tail_free(data) util_free(data)
+
+/* panic */
+
+// #define iof_panic(mess) return 0
+#ifndef iof_panic
+ #define iof_panic(mess) (fputs(mess, stderr), abort())
+#endif
+//#define iof_memory_error() iof_panic(strerror(errno))
+#define iof_fwrite_error() iof_panic(strerror(errno))
+
+/* generic helpers */
+
+UTILAPI uint8_t * iof_copy_file_data (const char *filename, size_t *psize);
+UTILAPI uint8_t * iof_copy_file_handle_data (FILE *file, size_t *psize);
+
+/* In the future we may need to release the file handle and restore it from iofile->name, so access the file handle via macros */
+
+#define iof_file_get_fh(iofile) ((iofile)->iofh)
+#define iof_file_set_fh(iofile, fh) ((iofile)->iofh = fh)
+#define iof_file_get_file(iofile) (((iofile)->flags & IOF_DATA) ? NULL : iof_file_get_fh(iofile))
+FILE * iof_get_file (iof *F);
+
+/* basic iof_file interface */
+
+iof_file * iof_file_new (FILE *file);
+iof_file * iof_file_init (iof_file *iofile, FILE *file);
+
+iof_file * iof_file_rdata (const void *data, size_t size);
+iof_file * iof_file_wdata (void *data, size_t size);
+
+iof_file * iof_file_rdata_init (iof_file *iofile, const void *data, size_t size);
+iof_file * iof_file_wdata_init (iof_file *iofile, void *data, size_t size);
+
+iof_file * iof_file_reader_from_file_handle (iof_file *iofile, const char *filename, FILE *file, int preload, int closefile);
+iof_file * iof_file_reader_from_file (iof_file *iofile, const char *filename, int preload);
+iof_file * iof_file_reader_from_data (iof_file *iofile, const void *data, size_t size, int preload, int freedata);
+//iof_file * iof_file_writer_from_file (iof_file *iofile, const char *filename);
+
+void * iof_copy_data (const void *data, size_t size);
+#define iof_data_free(data) util_free(data)
+#define iof_file_wdata_copy(data, size) iof_file_wdata(iof_copy_data(data, size), size)
+#define iof_file_rdata_copy(data, size) iof_file_rdata(iof_copy_data(data, size), size)
+
+void iof_file_free (iof_file *iofile);
+
+#define iof_file_get_name(iofile) ((iofile)->name)
+void iof_file_set_name (iof_file *iofile, const char *name);
+
+#define iof_file_incref(iofile) (++(iofile)->refcount)
+#define iof_file_decref(iofile) ((void)(--(iofile)->refcount <= 0 && (iof_file_free(iofile), 0)))
+
+int iof_file_seek (iof_file *iofile, long offset, int whence);
+long iof_file_tell (iof_file *iofile);
+size_t iof_file_size (iof_file *iofile);
+int iof_file_eof (iof_file *iofile);
+
+size_t iof_file_read (void *ptr, size_t size, size_t items, iof_file *iofile);
+size_t iof_file_write (const void *ptr, size_t size, size_t items, iof_file *iofile);
+size_t iof_file_ensure (iof_file *iofile, size_t bytes);
+int iof_file_flush (iof_file *iofile);
+
+int iof_file_getc (iof_file *iofile);
+int iof_file_putc (iof_file *iofile, int c);
+
+int iof_file_reclose_input (iof_file *iofile);
+int iof_file_reopen_input (iof_file *iofile);
+
+#define iof_file_reopen(iofile) (((iofile)->flags & IOF_REOPEN_FILE) ? iof_file_reopen_input(iofile) : 1)
+#define iof_file_reclose(iofile) (void)(((iofile)->flags & IOF_RECLOSE_FILE) ? iof_file_reclose_input(iofile) : 0)
+
+void iof_file_close_input (iof_file *iofile);
+
+/* wrappers of basic operations for iof */
+
+int iof_reader_seek (iof *I, long offset, int whence);
+int iof_reader_reseek (iof *I, long offset, int whence);
+int iof_writer_seek (iof *I, long offset, int whence);
+int iof_writer_reseek (iof *I, long offset, int whence);
+
+int iof_seek (iof *I, long offset, int whence);
+int iof_reseek (iof *I, long offset, int whence);
+
+long iof_reader_tell (iof *I);
+long iof_writer_tell (iof *I);
+long iof_tell (iof *I);
+size_t iof_fsize (iof *I);
+
+#define iof_setup_iofile(I, f) (iof_file_incref(f), (I)->iofile = f, (I)->flags |= IOF_FILE)
+#define iof_setup_file(I, fh) ((I)->file = fh, (I)->flags |= IOF_FILE_HANDLE)
+#define iof_setup_next(I, N) ((I)->next = N, iof_incref(N), (I)->flags |= IOF_NEXT)
+
+/* file handler reader and writer */
+
+UTILAPI iof * iof_setup_file_handle_reader (iof *I, void *buffer, size_t space, FILE *f);
+UTILAPI iof * iof_setup_file_handle_writer (iof *O, void *buffer, size_t space, FILE *f);
+
+/* file reader and writer */
+
+UTILAPI iof * iof_setup_file_reader (iof *I, void *buffer, size_t space, const char *filename);
+UTILAPI iof * iof_setup_file_writer (iof *O, void *buffer, size_t space, const char *filename);
+
+/* mem writer */
+
+UTILAPI iof * iof_setup_buffer (iof *O, void *buffer, size_t space);
+UTILAPI iof * iof_setup_buffermin (iof *O, void *buffer, size_t space, size_t min);
+
+UTILAPI iof * iof_buffer_create (size_t space);
+#define iof_buffer_new() iof_buffer_create(BUFSIZ)
+
+/* custom handler */
+
+UTILAPI iof * iof_reader (iof *I, void *link, iof_handler reader, const void *s, size_t bytes);
+UTILAPI iof * iof_writer (iof *O, void *link, iof_handler writer, void *s, size_t bytes);
+
+/* stdout wrapper */
+
+extern UTILAPI iof iof_stdout;
+extern UTILAPI iof iof_stderr;
+
+/* simple string reader */
+
+UTILAPI iof * iof_string_reader (iof *I, const void *s, size_t bytes);
+
+#define iof_string(I, s, bytes) \
+ (((I)->rbuf = (I)->rpos = (const uint8_t *)s), ((I)->rend = (I)->rbuf + (bytes)), ((I)->flags |= IOF_DATA), (I))
+
+/* dummies */
+
+UTILAPI iof * iof_dummy (void *buffer, size_t space);
+UTILAPI iof * iof_null (void *buffer, size_t space);
+
+/* checking available space */
+
+#define iof_loadable(I) ((I)->pos < (I)->end || iof_load(I))
+#define iof_readable(I) ((I)->pos < (I)->end || iof_input(I))
+#define iof_writable(O) ((O)->pos < (O)->end || iof_output(O))
+
+#define iof_hloadable iof_loadable
+#define iof_iloadable iof_loadable
+
+#define iof_hreadable iof_readable
+#define iof_ireadable iof_readable
+
+#define iof_hwritable iof_writable
+#define iof_iwritable iof_writable
+
+/* ensure space to write several bytes (several means less than I->space) */
+
+#define iof_ensure(O, n) ((O)->pos+(n)-1 < (O)->end || iof_output(O)) // iof_ensure(O, 1) eq iof_writable(O)
+#define iof_hensure(O, n) ((O)->hpos+(n)-1 < (O)->hend || iof_output(O))
+#define iof_iensure(O, n) ((O)->ipos+(n)-1 < (O)->iend || iof_output(O))
+
+/* reading */
+
+UTILAPI int iof_getc (iof *I);
+UTILAPI int iof_hgetc (iof *I);
+UTILAPI int iof_igetc (iof *I);
+
+// UTILAPI int iof_cmp (iof *I, const char *s);
+// UTILAPI int iof_cmpn (iof *I, const char *s, size_t bytes);
+
+UTILAPI iof_status iof_pass (iof *I, iof *O);
+#define iof_hpass iof_pass
+#define iof_ipass iof_pass
+
+/* readers helpers */
+
+UTILAPI size_t iof_read (iof *I, void *s, size_t bytes);
+UTILAPI size_t iof_hread (iof *I, void *s, size_t bytes);
+UTILAPI size_t iof_iread (iof *I, void *s, size_t bytes);
+
+UTILAPI size_t iof_skip (iof *I, size_t bytes);
+UTILAPI size_t iof_hskip (iof *I, size_t bytes);
+UTILAPI size_t iof_iskip (iof *I, size_t bytes);
+
+/* get */
+
+#define iof_pos(I) (*(I)->pos++)
+#define iof_hpos(I) (*(I)->hpos++)
+#define iof_ipos(I) (*(I)->ipos++)
+
+#define iof_get(I) (iof_readable(I) ? (int)(*(I)->pos++) : IOFEOF)
+#define iof_hget(I) (iof_hreadable(I) ? (int)(*(I)->hpos++) : IOFEOF)
+#define iof_iget(I) (iof_ireadable(I) ? (int)(*(I)->ipos++) : IOFEOF)
+
+#define iof_char(I) (iof_readable(I) ? (int)(*(I)->pos) : IOFEOF)
+#define iof_hcurr(I) (iof_hreadable(I) ? (int)(*(I)->hpos) : IOFEOF)
+#define iof_icurr(I) (iof_ireadable(I) ? (int)(*(I)->ipos) : IOFEOF)
+
+#define iof_next(I) (++(I)->pos, iof_char(I))
+#define iof_hnext(I) (++(I)->hpos, iof_hcurr(I))
+#define iof_inext(I) (++(I)->ipos, iof_icurr(I))
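+
+/* A minimal reading sketch (illustration only, not part of the API): iof_get()
+ consumes a byte, iof_char() peeks, iof_next() advances then peeks; all yield
+ IOFEOF (< 0) past the end of input. */
+#if 0
+static size_t count_digits (const char *s, size_t n)
+{
+ iof I = IOF_STRING();
+ size_t count = 0;
+ int c;
+ iof_string(&I, s, n);
+ for (c = iof_char(&I); c >= 0; c = iof_next(&I))
+ if (c >= '0' && c <= '9') ++count;
+ return count;
+}
+#endif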
+
+/* unget */
+
+/*
+If possible, we just move the position backward. If it is not possible to
+move backward, we call iof_backup(I, c), which sets all pointers to the end of
+a private backup space, then moves the buf AND pos pointers backward and puts c at
+pos (==buf). We can back up characters as long as there is private space. If
+several calls to iof_backup() are followed by iof_get(), the pos pointer
+increases in the normal way, so another iof_unget() works just fine
+by moving the position. Once we have swallowed all backup characters (when
+pos==end), the backup handler restores the previous pointers.
+
+Obviously we assume that the character provided to iof_unget() is always the
+character just obtained from iof_get(). We CAN'T just overwrite the character
+at a given position, as the space we read from may not be writable.
+
+When the backup is in use, we can only get bytes until it is automatically restored.
+*/
+
+/* backup */
+
+/*
+#define iof_uses_backup(I) ((I)->more == iof_unget_handler)
+
+#define iof_save(I, B) \
+ ((B)->buf = (I)->buf, (B)->pos = (I)->pos, (B)->end = (I)->end, (B)->space = (I)->space, \
+ (B)->link = I->link, (B)->more = (I)->more, (B)->flags = (I)->flags)
+#define iof_restore(B, I) iof_save(I, B)
+
+#define iof_unget(I, c) \
+ ((void)(c == (uint8_t)c ? ((I)->pos > (I)->buf ? --(I)->pos : iof_backup(I, c)) : 0)
+int iof_backup (iof *I, int c);
+*/
+
+/* writing */
+
+UTILAPI size_t iof_write_file_handle (iof *O, FILE *file);
+UTILAPI size_t iof_write_file (iof *O, const char *filename);
+UTILAPI size_t iof_write_iofile (iof *O, iof_file *iofile, int savepos);
+
+UTILAPI int iof_putc (iof *O, int u);
+UTILAPI int iof_hputc (iof *O, int u);
+UTILAPI int iof_iputc (iof *O, int u);
+
+UTILAPI size_t iof_write (iof *O, const void *data, size_t size);
+UTILAPI size_t iof_hwrite (iof *O, const void *data, size_t size);
+UTILAPI size_t iof_iwrite (iof *O, const void *data, size_t size);
+
+UTILAPI iof_status iof_puts (iof *O, const void *data);
+UTILAPI size_t iof_put_string (iof *O, const void *data);
+UTILAPI size_t iof_putfs (iof *O, const char *format, ...);
+UTILAPI size_t iof_repc (iof *O, char c, size_t bytes);
+
+#define iof_putl(O, s) iof_write(O, "" s, sizeof(s)-1)
+//#define iof_putl iof_puts
+
+#define iof_set(O, c) (*(O)->pos++ = (uint8_t)(c))
+#define iof_set2(O, c1, c2) (iof_set(O, c1), iof_set(O, c2))
+#define iof_set3(O, c1, c2, c3) (iof_set(O, c1), iof_set(O, c2), iof_set(O, c3))
+#define iof_set4(O, c1, c2, c3, c4) (iof_set(O, c1), iof_set(O, c2), iof_set(O, c3), iof_set(O, c4))
+#define iof_set5(O, c1, c2, c3, c4, c5) (iof_set(O, c1), iof_set(O, c2), iof_set(O, c3), iof_set(O, c4), iof_set(O, c5))
+
+#define iof_hset(O, c) (*(O)->hpos++ = (uint16_t)(c))
+#define iof_iset(O, c) (*(O)->ipos++ = (uint32_t)(c))
+
+#define iof_put(O, c) ((void)iof_ensure(O, 1), iof_set(O, c))
+#define iof_put2(O, c1, c2) ((void)iof_ensure(O, 2), iof_set2(O, c1, c2))
+#define iof_put3(O, c1, c2, c3) ((void)iof_ensure(O, 3), iof_set3(O, c1, c2, c3))
+#define iof_put4(O, c1, c2, c3, c4) ((void)iof_ensure(O, 4), iof_set4(O, c1, c2, c3, c4))
+#define iof_put5(O, c1, c2, c3, c4, c5) ((void)iof_ensure(O, 5), iof_set5(O, c1, c2, c3, c4, c5))
+
+#define iof_hput(O, c) ((void)iof_hensure(O, 1), iof_hset(O, c))
+#define iof_iput(O, c) ((void)iof_iensure(O, 1), iof_iset(O, c))
+
+#define iof_put_uc_hex(O, c) iof_put2(O, base16_uc_digit1(c), base16_uc_digit2(c))
+#define iof_put_lc_hex(O, c) iof_put2(O, base16_lc_digit1(c), base16_lc_digit2(c))
+#define iof_set_uc_hex(O, c) iof_set2(O, base16_uc_digit1(c), base16_uc_digit2(c))
+#define iof_set_lc_hex(O, c) iof_set2(O, base16_lc_digit1(c), base16_lc_digit2(c))
+#define iof_put_hex iof_put_uc_hex
+#define iof_set_hex iof_set_uc_hex
+
+/* number from iof; return 1 on success, 0 otherwise */
+
+#define iof_scan_sign(I, c, sign) _scan_sign(c, sign, iof_next(I))
+#define iof_scan_integer(I, c, number) _scan_integer(c, number, iof_next(I))
+#define iof_scan_radix(I, c, number, radix) _scan_radix(c, number, radix, iof_next(I))
+#define iof_read_integer(I, c, number) _read_integer(c, number, iof_next(I))
+#define iof_read_radix(I, c, number, radix) _read_radix(c, number, radix, iof_next(I))
+
+#define iof_scan_decimal(I, c, number) _scan_decimal(c, number, iof_next(I))
+#define iof_scan_fraction(I, c, number, exponent10) _scan_fraction(c, number, exponent10, iof_next(I))
+#define iof_scan_exponent10(I, c, exponent10) _scan_exponent10(c, exponent10, iof_next(I))
+
+UTILAPI int iof_get_int32 (iof *I, int32_t *number);
+UTILAPI int iof_get_slong (iof *I, long *number);
+UTILAPI int iof_get_int64 (iof *I, int64_t *number);
+
+UTILAPI int iof_get_uint32 (iof *I, uint32_t *number);
+UTILAPI int iof_get_ulong (iof *I, unsigned long *number);
+UTILAPI int iof_get_usize (iof *I, size_t *number);
+UTILAPI int iof_get_uint64 (iof *I, uint64_t *number);
+
+UTILAPI int iof_get_int32_radix (iof *I, int32_t *number, int radix);
+UTILAPI int iof_get_slong_radix (iof *I, long *number, int radix);
+UTILAPI int iof_get_int64_radix (iof *I, int64_t *number, int radix);
+
+UTILAPI int iof_get_uint32_radix (iof *I, uint32_t *number, int radix);
+UTILAPI int iof_get_ulong_radix (iof *I, unsigned long *number, int radix);
+UTILAPI int iof_get_usize_radix (iof *I, size_t *number, int radix);
+UTILAPI int iof_get_uint64_radix (iof *I, uint64_t *number, int radix);
+
+#if defined(INTLW_IS_INT64)
+# define iof_get_intlw(I, number) iof_get_int64(I, number)
+# define iof_get_uintlw(I, number) iof_get_uint64(I, number)
+# define iof_get_intlw_radix(I, number, radix) iof_get_int64_radix(I, number, radix)
+# define iof_get_uintlw_radix(I, number, radix) iof_get_uint64_radix(I, number, radix)
+#elif defined(INTLW_IS_LONG)
+# define iof_get_intlw(I, number) iof_get_slong(I, number)
+# define iof_get_uintlw(I, number) iof_get_ulong(I, number)
+# define iof_get_intlw_radix(I, number, radix) iof_get_slong_radix(I, number, radix)
+# define iof_get_uintlw_radix(I, number, radix) iof_get_ulong_radix(I, number, radix)
+#endif
+
+UTILAPI int iof_get_roman (iof *I, uint16_t *number);
+
+UTILAPI int iof_get_double (iof *I, double *number);
+UTILAPI int iof_get_float (iof *I, float *number);
+
+UTILAPI int iof_conv_double (iof *I, double *number);
+UTILAPI int iof_conv_float (iof *I, float *number);
+
+/* number to iof; return a number of written bytes */
+
+UTILAPI size_t iof_put_int32 (iof *O, int32_t number);
+UTILAPI size_t iof_put_slong (iof *O, long number);
+UTILAPI size_t iof_put_int64 (iof *O, int64_t number);
+
+UTILAPI size_t iof_put_uint32 (iof *O, uint32_t number);
+UTILAPI size_t iof_put_ulong (iof *O, unsigned long number);
+UTILAPI size_t iof_put_usize (iof *O, size_t number);
+UTILAPI size_t iof_put_uint64 (iof *O, uint64_t number);
+
+UTILAPI size_t iof_put_int32_radix (iof *O, int32_t number, int radix, int uc);
+UTILAPI size_t iof_put_slong_radix (iof *O, long number, int radix, int uc);
+UTILAPI size_t iof_put_int64_radix (iof *O, int64_t number, int radix, int uc);
+
+UTILAPI size_t iof_put_uint32_radix (iof *O, uint32_t number, int radix, int uc);
+UTILAPI size_t iof_put_ulong_radix (iof *O, unsigned long number, int radix, int uc);
+UTILAPI size_t iof_put_usize_radix (iof *O, size_t number, int radix, int uc);
+UTILAPI size_t iof_put_uint64_radix (iof *O, uint64_t number, int radix, int uc);
+
+#if defined(INTLW_IS_INT64)
+# define iof_put_intlw(O, number) iof_put_int64(O, number)
+# define iof_put_uintlw(O, number) iof_put_uint64(O, number)
+# define iof_put_intlw_radix(O, number, radix, uc) iof_put_int64_radix(O, number, radix, uc)
+# define iof_put_uintlw_radix(O, number, radix, uc) iof_put_uint64_radix(O, number, radix, uc)
+#elif defined(INTLW_IS_LONG)
+# define iof_put_intlw(O, number) iof_put_slong(O, number)
+# define iof_put_uintlw(O, number) iof_put_ulong(O, number)
+# define iof_put_intlw_radix(O, number, radix, uc) iof_put_slong_radix(O, number, radix, uc)
+# define iof_put_uintlw_radix(O, number, radix, uc) iof_put_ulong_radix(O, number, radix, uc)
+#endif
+
+UTILAPI size_t iof_put_roman (iof *O, uint16_t number, int uc);
+
+UTILAPI size_t iof_put_double(iof *O, double number, int digits);
+UTILAPI size_t iof_put_float(iof *O, float number, int digits);
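+
+/* Usage sketch (illustrative, not part of the API docs): reading a decimal
+   number from a reader I and writing it back to a writer O:
+
+     double d;
+     if (iof_get_double(I, &d))   // 1 on success, 0 otherwise
+       iof_put_double(O, d, 6);   // returns the number of bytes written
+*/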
+
+/* common helpers for binary parsers */
+
+UTILAPI int iof_get_be_uint2 (iof *I, uint32_t *pnumber);
+UTILAPI int iof_get_be_uint3 (iof *I, uint32_t *pnumber);
+UTILAPI int iof_get_be_uint4 (iof *I, uint32_t *pnumber);
+
+UTILAPI int iof_get_le_uint2 (iof *I, uint32_t *pnumber);
+UTILAPI int iof_get_le_uint3 (iof *I, uint32_t *pnumber);
+UTILAPI int iof_get_le_uint4 (iof *I, uint32_t *pnumber);
+
+// iof_set() and iof_put() suites cast their arguments to uint8_t, so we don't need an &0xff mask
+
+#define iof_set_be_uint1(O, u) iof_set(O, u)
+#define iof_set_be_uint2(O, u) iof_set2(O, (u)>>8, u)
+#define iof_set_be_uint3(O, u) iof_set3(O, (u)>>16, (u)>>8, u)
+#define iof_set_be_uint4(O, u) iof_set4(O, (u)>>24, (u)>>16, (u)>>8, u)
+
+#define iof_set_le_uint1(O, u) iof_set(O, u)
+#define iof_set_le_uint2(O, u) iof_set2(O, u, (u)>>8)
+#define iof_set_le_uint3(O, u) iof_set3(O, u, (u)>>8, (u)>>16)
+#define iof_set_le_uint4(O, u) iof_set4(O, u, (u)>>8, (u)>>16, (u)>>24)
+
+#define iof_put_be_uint1(O, u) iof_put(O, u)
+#define iof_put_be_uint2(O, u) iof_put2(O, (u)>>8, u)
+#define iof_put_be_uint3(O, u) iof_put3(O, (u)>>16, (u)>>8, u)
+#define iof_put_be_uint4(O, u) iof_put4(O, (u)>>24, (u)>>16, (u)>>8, u)
+
+#define iof_put_le_uint1(O, u) iof_put(O, u)
+#define iof_put_le_uint2(O, u) iof_put2(O, u, (u)>>8)
+#define iof_put_le_uint3(O, u) iof_put3(O, u, (u)>>8, (u)>>16)
+#define iof_put_le_uint4(O, u) iof_put4(O, u, (u)>>8, (u)>>16, (u)>>24)
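+
+// Example (illustrative): for u = 0x0102, iof_put_be_uint2(O, u) emits the
+// bytes 01 02, while iof_put_le_uint2(O, u) emits 02 01.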
+
+/* buffer results */
+
+#define iof_reader_result(I, size) ((size = (size_t)iof_left(I)), (I)->pos)
+#define iof_writer_result(I, size) ((size = (size_t)iof_size(I)), (I)->buf)
+#define iof_result(I, size) (((I)->flags & IOF_READER) ? iof_reader_result(I, size) : iof_writer_result(I, size))
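+
+// Usage sketch (illustrative): fetching the bytes collected so far, where F is
+// a reader or writer iof:
+//
+//   size_t size;
+//   uint8_t *data = iof_result(F, size); // size is assigned by the macro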
+
+uint8_t * iof_file_input_data (iof_file *iofile, size_t *psize, int *isnew);
+//uint8_t * iof_file_reader_data (iof_file *iofile, size_t *size);
+//uint8_t * iof_file_writer_data (iof_file *iofile, size_t *size);
+
+uint8_t * iof_reader_data (iof *I, size_t *psize);
+uint8_t * iof_writer_data (iof *O, size_t *psize);
+size_t iof_reader_to_file_handle (iof *I, FILE *file);
+size_t iof_reader_to_file (iof *I, const char *filename);
+
+#define iof_loaded(I) ((I)->end = (I)->pos, (I)->pos = (I)->buf, iof_left(I))
+
+#define iof_data_to_file_handle(data, size, file) fwrite(data, sizeof(uint8_t), size, file)
+UTILAPI size_t iof_data_to_file (const void *data, size_t size, const char *filename);
+
+UTILAPI size_t iof_result_to_file_handle (iof *F, FILE *file);
+UTILAPI size_t iof_result_to_file (iof *F, const char *filename);
+UTILAPI void iof_debug (iof *I, const char *filename);
+
+/* common filters allocator */
+
+void iof_filters_init (void);
+void iof_filters_free (void);
+
+iof * iof_filter_reader_new (iof_handler handler, size_t statesize, void **pstate);
+#define iof_filter_reader(handler, statesize, pstate) iof_filter_reader_new(handler, statesize, (void **)(pstate))
+iof * iof_filter_reader_with_buffer_new (iof_handler handler, size_t statesize, void **pstate, void *buffer, size_t buffersize);
+#define iof_filter_reader_with_buffer(handler, statesize, pstate, buffer, buffersize) iof_filter_reader_with_buffer_new(handler, statesize, (void **)(pstate), buffer, buffersize)
+iof * iof_filter_writer_new (iof_handler handler, size_t statesize, void **pstate);
+#define iof_filter_writer(handler, statesize, pstate) iof_filter_writer_new(handler, statesize, (void **)(pstate))
+iof * iof_filter_writer_with_buffer_new (iof_handler handler, size_t statesize, void **pstate, void *buffer, size_t buffersize);
+#define iof_filter_writer_with_buffer(handler, statesize, pstate, buffer, buffersize) iof_filter_writer_with_buffer_new(handler, statesize, (void **)(pstate), buffer, buffersize)
+
+#define iof_filter_state(statetype, F) (statetype)((void *)((F) + 1))
+
+void iof_free (iof *F);
+void iof_discard (iof *F);
+
+size_t iof_resize_buffer_to (iof *O, size_t space);
+#define iof_resize_buffer(O) iof_resize_buffer_to(O, (O)->space << 1)
+
+size_t iof_decoder_retval (iof *I, const char *type, iof_status status);
+size_t iof_encoder_retval (iof *O, const char *type, iof_status status);
+
+/* filters */
+
+iof * iof_filter_file_handle_reader (FILE *file);
+iof * iof_filter_file_handle_writer (FILE *file);
+
+iof * iof_filter_iofile_reader (iof_file *iofile, size_t offset);
+iof * iof_filter_iofile_writer (iof_file *iofile, size_t offset);
+
+iof * iof_filter_file_reader (const char *filename);
+iof * iof_filter_file_writer (const char *filename);
+
+iof * iof_filter_string_reader (const void *s, size_t length);
+iof * iof_filter_string_writer (const void *s, size_t length);
+
+iof * iof_filter_buffer_writer (size_t size);
+
+iof * iof_filter_stream_reader (FILE *file, size_t offset, size_t length);
+iof * iof_filter_stream_coreader (iof_file *iofile, size_t offset, size_t length);
+
+iof * iof_filter_stream_writer (FILE *file);
+iof * iof_filter_stream_cowriter (iof_file *iofile, size_t offset);
+
+FILE * iof_filter_file_reader_source (iof *I, size_t *poffset, size_t *plength);
+iof_file * iof_filter_file_coreader_source (iof *I, size_t *poffset, size_t *plength);
+iof * iof_filter_reader_replacement (iof *P, iof_handler handler, size_t statesize, void **pstate);
+#define iof_filter_reader_replace(P, handler, statesize, pstate) iof_filter_reader_replacement(P, handler, statesize, (void **)(pstate))
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utillog.c b/source/luametatex/source/libraries/pplib/util/utillog.c
new file mode 100644
index 000000000..6d32514a7
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utillog.c
@@ -0,0 +1,60 @@
+
+#include <stdio.h>
+#include <string.h> // strlen
+#include <stdarg.h>
+#include "utillog.h"
+
+#define LOGGER_BUFFER_SIZE 256
+#define LOGGER_PREFIX_SIZE 32
+
+typedef struct {
+ logger_function callback;
+ void *context;
+ size_t pfxlen;
+} logger_struct;
+
+static logger_struct logger = { 0, NULL, 0 };
+
+static char logger_buffer[LOGGER_BUFFER_SIZE+LOGGER_PREFIX_SIZE];
+
+void loggerf (const char *format, ...)
+{
+ va_list args;
+ int length;
+
+ va_start(args, format);
+ length = vsnprintf(logger_buffer + logger.pfxlen, LOGGER_BUFFER_SIZE, format, args);
+ if (length > 0)
+ {
+ if (length >= LOGGER_BUFFER_SIZE) // vsnprintf() reports the would-be length; the string itself
+ length = LOGGER_BUFFER_SIZE - 1; // was truncated to the buffer size minus the terminating null
+ }
+ else
+ {
+ loggerf("logger encoding error '%s'", format);
+ length = (int)strlen(logger_buffer);
+ }
+ length += (int)logger.pfxlen;
+ if (logger.callback)
+ logger.callback(logger_buffer, logger.context);
+ else
+ printf("\n%s\n", logger_buffer);
+ va_end(args);
+}
+
+void logger_callback (logger_function callback, void *context)
+{
+ logger.callback = callback;
+ logger.context = context;
+}
+
+int logger_prefix (const char *prefix)
+{
+ size_t pfxlen;
+ pfxlen = strlen(prefix);
+ if (pfxlen > LOGGER_PREFIX_SIZE)
+ return 0;
+ memcpy(logger_buffer, prefix, pfxlen);
+ logger.pfxlen = pfxlen;
+ return 1;
+}
diff --git a/source/luametatex/source/libraries/pplib/util/utillog.h b/source/luametatex/source/libraries/pplib/util/utillog.h
new file mode 100644
index 000000000..c30e0ff0f
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utillog.h
@@ -0,0 +1,10 @@
+
+#ifndef UTIL_LOG_H
+#define UTIL_LOG_H
+
+typedef void (*logger_function) (const char *message, void *alien);
+void loggerf (const char *format, ...);
+void logger_callback (logger_function callback, void *context);
+int logger_prefix (const char *prefix);
+
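+/* Usage sketch (illustrative; n stands for any int value); without a callback,
+   messages go to stdout:
+
+     logger_prefix("pplib: ");        // prefix, at most LOGGER_PREFIX_SIZE bytes
+     loggerf("loaded %d objects", n); // prefix + formatted message
+*/
+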
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utillzw.c b/source/luametatex/source/libraries/pplib/util/utillzw.c
new file mode 100644
index 000000000..e5134e794
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utillzw.c
@@ -0,0 +1,705 @@
+/* lzw implementation for postscript/pdf filters
+# Notes on LZW
+
+# Encoder
+
+Initially the table contains 256 entries for single bytes. The encoder consumes
+input bytes, trying to find the longest sequence stored so far in the table.
+Once it finds a sequence that is not present in the table, it outputs the table
+index of the longest sequence found (accumulated bytes except the last
+consumed) and pushes the new sequence (accumulated bytes including the last
+one) on the top of the table. The last taken byte is not yet written to the
+output; it becomes the beginning of the next sequence to accumulate. Initially,
+the encoder outputs 9-bit codes. As the table grows, the number of bits per
+code increases, up to 12. For example, after adding a table entry of index 511
+it is high time to switch to 10-bit codes. The /EarlyChange=true parameter in
+the stream dictionary (both postscript and pdf) instructs to increase the
+number of bits one code earlier than necessary. Looks pretty much like an early
+days bug that became a specification :) I have never found a PDF with the
+/EarlyChange key specified anyway.
+
+Once the table becomes full (or whenever the encoder decides it is worthwhile),
+a clear-table marker (code 256) purges the table and restores the code length
+to 9. An end-of-data marker (code 257) ends the stream. Conventionally, the
+stream begins with a clear-table marker.
+
+Postscript allows providing a /UnitLength which determines the bit length of
+codes. The above description assumes UnitLength=8 (the default). Allowed values
+are from 3 to 8. A different UnitLength also affects the markers; clear-table
+is then 2^UnitLength and the end-of-data marker is 2^UnitLength+1.
+
+The encoder outputs 9-12 bit codes that are packed into bytes using a
+high-bits-first scheme (the default) or a low-bits-first scheme.
+
+PDF spec p. 73 (PS spec p. 135 gives a mistaken output sequence and hence
+mistaken output bytes)
+
+Input character sequence (decimal)
+45 45 45 45 45 65 45 45 45 66
+
+Output 9bit codes (decimal)
+256 45 258 258 65 259 66 257
+
+Output 9bit codes (binary)
+100000000 000101101 100000010 100000010 001000001 100000011 001000010 100000001
+
+Output bytes (LowBitsFirst=false); the eight high-order bits of a code become
+the first byte, the remaining low-order bit of the code becomes the high-order
+bit of the next byte;
+10000000 00001011 01100000 01010000 00100010 00001100 00001100 10000101 00000001
+-> 80 0B 60 50 22 0C 0C 85 01
+
+Output bytes (binary, LowBitsFirst=true); the eight low-order bits of a code
+become the first byte, the remaining high-order bit of the code becomes the
+low-order bit of the next byte;
+00000000 01011011 00001000 00010100 00011000 01100100 10100000 10000000 10010000
+-> 00 5B 08 14 18 64 A0 80 90
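+
+For illustration only, the high-bits-first packing can be sketched as below
+(names are illustrative; the actual encoder keeps lastbyte/tailbits in
+lzw_state and does this work in lzw_put_code()):
+
+  int leftbits;                               // room left in the current byte
+  while ((leftbits = 8 - tailbits) <= codebits) {
+    emit(lastbyte | (code >> (codebits - leftbits))); // top bits fill the byte
+    codebits -= leftbits;
+    code &= (1 << codebits) - 1;              // drop the bits just emitted
+    lastbyte = 0; tailbits = 0;
+  }
+  lastbyte |= code << (leftbits - codebits);  // stash the rest, high-aligned
+  tailbits += codebits;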
+
+# Decoder
+
+The decoder consumes input bytes, transforming them into 9- to 12-bit codes.
+It starts with 9-bit codes and a table of 258 fixed codes (same as the
+encoder). Basically, it interprets incoming codes as table indices (except the
+256 and 257 markers) and outputs the byte sequences stored at those indices.
+It also builds up the table and changes the number of bits per code when
+necessary. The key point of lzw is that both encoder and decoder build the
+table synchronously.
+
+However, the decoder needs some "knowledge" about how the encoder works to be
+able to interpret a table index that it doesn't have yet. Look at the output
+from the encoder in the example above. The first output code is the
+conventional clear-table (256). Then comes code 45. So far so good, the
+decoder interprets code 45 as a (fixed) entry of the table, emitting byte 45.
+The next code is 258, which should be interpreted as an index into the table.
+Oops, the decoder doesn't have one yet. If that occurs, it means that the
+encoder was able to output the code of a new entry just after adding it to the
+table. It means that
+
+ sequence_before + next_byte == next_byte + sequence_after
+
+This may happen not only for sequences like 45 45 45, but also for symmetric
+series such as abcbabcba; abcb + a == a + bcba. The decoder must be aware of
+that, and if it gets a code one larger than the top table index, it should
+create that entry on the fly by appending the first byte of the last entry's
+sequence to the last entry's sequence.
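+
+To see this on the example stream above: after the clear-table code (256) the
+decoder reads 45 and emits byte 45. The next code, 258, equals the current top
+index, so the decoder synthesizes entry 258 = 45 45 (the last sequence plus
+its own first byte) and emits it; reading 258 again creates entry 259 =
+45 45 45, and so on, until the original input 45 45 45 45 45 65 45 45 45 66
+is reproduced.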
+
+# UnitLength
+
+The Postscript specification mentions a UnitLength parameter that can be used
+in the LZW decoder (not allowed in the encoder), with possible values from 3
+to 8. This parameter determines the number of bits per code; from
+UnitLength + 1 to 12. It also determines which codes are used for the
+clear-table marker (2^UnitLength) and the end-of-data marker
+((2^UnitLength)+1). The Postscript specification says (page 134):
+
+"Initially, the code length is (UnitLength + 1) bits and the table contains only
+entries for the (2^UnitLength + 2) fixed codes. As encoding proceeds, entries are
+appended to the table, associating new codes with longer and longer input character
+sequences. The encoding and decoding filters maintain identical copies of
+this table."
+
+Later, on page 136, the Postscript specification says:
+
+"Data that has been LZW-encoded with a UnitLength less than 8 consists only of
+codes in the range 0 to 2^UnitLength - 1; consequently, the LZWDecode filter produces
+only codes in that range when read. UnitLength also affects the encoded
+representation, as described above."
+
+UnitLength (Postscript only) and LowBitsFirst are used only by the decoder.
+EarlyChange should obviously be respected by both encoder and decoder. When
+the table index reaches the current bit length boundary (511, 1023, ...) the
+decoder must react by increasing the number of bits of the input code. But if
+the index reaches its maximum value (when the table is full), the decoder is
+NOT supposed to clear the table. When the table is full, the encoder must emit
+the clear-table marker; it emits this code using 12 bits and reinitializes the
+code bits after that. It means that, when the table is full, the decoder
+should get one more 12-bit code (which should be the clear-table marker),
+actually clear the table, and reinitialize the code bits after that.
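+
+For instance, with UnitLength 8 the switch from 9- to 10-bit codes happens
+when the table index reaches 511 if EarlyChange is in effect, and 512
+otherwise (compare the lzw_max_index() macro below).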
+
+# Clear-table vs last entry track (after tries and checks)
+
+It is also not quite clear what should actually happen when the encoder gets a
+full table and is supposed to emit the clear-table marker. When the table gets
+full, it means that the encoder has just appended another entry to it. And
+that happens only when the input sequence collected so far plus the last byte
+is not present in the table. The encoder is supposed to output the table index
+of the present sequence and set the recent byte as the start of the new
+sequence to be collected. Even if it is time to clear the table, the encoder
+is still supposed to keep track of the last table entry. The decoder, however,
+must drop the track of the last code on clear-table.
+
+# Decoder table vs encoder table
+
+While decoding we need to query the lzw table by (subsequent) numeric codes
+and output the character sequences stored in the table. While encoding we need
+to query the table on every input byte and fetch indices pointing to character
+sequences. Note that we never need to query the entire table for the longest
+sequence found so far. The encoder table does not need to access the longest
+character sequence in one piece. It is enough to keep track of the current
+table index and the very next byte. We organize the encoder table into a
+search tree, where every node contains its table index (value) and last byte
+(key). Except for the initial tree content, every node is created on the basis
+of a previous node; conceptually, the sequence represented by such a node
+consists of the previous node's sequence plus the next byte.
+
+Every new node is a descendant of the node it has been derived from. Every
+node has a map (a search subtree) indexed by suffix byte value, pointing to
+descendant nodes. Every node also has binary links (left/right fields)
+necessary to search the map (except for the initial nodes, every node lives in
+the map of some ancestor node). The key point is that on every input byte we
+don't search the entire tree, but only the map of the current node's children.
+The map tree is a simple binary tree with no balancing mechanism (it is not
+worth optimizing an ephemeral structure that may be rebuilt more often than it
+is queried).
+
+In our implementation, the decoder table requires 4096 entries (topmost index
+4095). The encoder table, however, needs 4097 entries to handle the case when
+the EarlyIndex parameter is 0 (I have never had a chance to test that in
+practice). The node of index 4096 might be added to the search tree, but its
+code is never emitted; the lookup is purged just after adding that node.
+
+todo:
+- support for LowBitsFirst encoding
+*/
+
+#include "utilmem.h"
+#include "utillzw.h"
+
+/* filter state struct */
+
+typedef struct lzw_entry {
+ union {
+ const char *rdata; // to be able to init with string literal
+ char *data;
+ };
+ int size;
+} lzw_entry;
+
+#define lzw_index short
+
+typedef struct lzw_node lzw_node;
+
+struct lzw_node {
+ lzw_index index;
+ unsigned char suffix;
+ lzw_node *left;
+ lzw_node *right;
+ lzw_node *map;
+};
+
+struct lzw_state {
+ union {
+ lzw_node *lookup; /* encoder table */
+ lzw_entry *table; /* decoder table */
+ };
+ lzw_index index; /* table index */
+ union {
+ lzw_node *lastnode; /* previous encoder table node */
+ struct {
+ lzw_entry *lastentry; /* previous decoder table entry */
+ int tailbytes; /* num of bytes of lastentry not yet written out */
+ };
+ };
+ int basebits; /* /UnitLength parameter (8) */
+ int codebits; /* current code bits */
+ int lastbyte; /* previously read byte */
+ int tailbits; /* lastbyte bits not yet consumed */
+ int flush; /* encoder */
+ int flags; /* options */
+};
+
+typedef union { lzw_state *lzwstate; void *voidstate; } lzw_state_pointer; // to avoid 'dereferencing type-punned ...' warnings
+
+#define LZW_INIT_STATE { { 0 }, 0, { 0 }, 0, 0, 0, 0, 0, 0 }
+
+/* macros */
+
+#define LZW_MIN_BITS 3
+#define LZW_MAX_BITS 12
+#define LZW_TABLE_SIZE (1 << LZW_MAX_BITS)
+#define LZW_LOOKUP_SIZE (LZW_TABLE_SIZE + 1)
+
+#define lzw_bit_range(bits) (bits >= LZW_MIN_BITS && bits <= LZW_BASE_BITS)
+#define lzw_base_bits(flags) (flags & ((1 << 4) - 1)) // the 4 low bits of flags are basebits (UnitLength)
+
+#define lzw_initial_codes(state) (1 << state->basebits)
+#define lzw_clear_code(state) lzw_initial_codes(state)
+#define lzw_eod_code(state) (lzw_initial_codes(state) + 1)
+#define lzw_initial_index(state) (lzw_initial_codes(state) + 2)
+
+#define lzw_max_index(state) ((1 << state->codebits) - ((state->flags & LZW_EARLY_INDEX) ? 1 : 0))
+#define lzw_check_bits(state) ((void)(state->index == lzw_max_index(state) && state->codebits < LZW_MAX_BITS && ++state->codebits))
+
+#define lzw_malloc util_malloc
+#define lzw_free util_free
+
+/* decoder */
+
+static struct lzw_entry lzw_initial_table[] = {
+ {{"\x00"}, 1}, {{"\x01"}, 1}, {{"\x02"}, 1}, {{"\x03"}, 1}, {{"\x04"}, 1}, {{"\x05"}, 1}, {{"\x06"}, 1}, {{"\x07"}, 1}, {{"\x08"}, 1}, {{"\x09"}, 1}, {{"\x0A"}, 1}, {{"\x0B"}, 1}, {{"\x0C"}, 1}, {{"\x0D"}, 1}, {{"\x0E"}, 1}, {{"\x0F"}, 1},
+ {{"\x10"}, 1}, {{"\x11"}, 1}, {{"\x12"}, 1}, {{"\x13"}, 1}, {{"\x14"}, 1}, {{"\x15"}, 1}, {{"\x16"}, 1}, {{"\x17"}, 1}, {{"\x18"}, 1}, {{"\x19"}, 1}, {{"\x1A"}, 1}, {{"\x1B"}, 1}, {{"\x1C"}, 1}, {{"\x1D"}, 1}, {{"\x1E"}, 1}, {{"\x1F"}, 1},
+ {{"\x20"}, 1}, {{"\x21"}, 1}, {{"\x22"}, 1}, {{"\x23"}, 1}, {{"\x24"}, 1}, {{"\x25"}, 1}, {{"\x26"}, 1}, {{"\x27"}, 1}, {{"\x28"}, 1}, {{"\x29"}, 1}, {{"\x2A"}, 1}, {{"\x2B"}, 1}, {{"\x2C"}, 1}, {{"\x2D"}, 1}, {{"\x2E"}, 1}, {{"\x2F"}, 1},
+ {{"\x30"}, 1}, {{"\x31"}, 1}, {{"\x32"}, 1}, {{"\x33"}, 1}, {{"\x34"}, 1}, {{"\x35"}, 1}, {{"\x36"}, 1}, {{"\x37"}, 1}, {{"\x38"}, 1}, {{"\x39"}, 1}, {{"\x3A"}, 1}, {{"\x3B"}, 1}, {{"\x3C"}, 1}, {{"\x3D"}, 1}, {{"\x3E"}, 1}, {{"\x3F"}, 1},
+ {{"\x40"}, 1}, {{"\x41"}, 1}, {{"\x42"}, 1}, {{"\x43"}, 1}, {{"\x44"}, 1}, {{"\x45"}, 1}, {{"\x46"}, 1}, {{"\x47"}, 1}, {{"\x48"}, 1}, {{"\x49"}, 1}, {{"\x4A"}, 1}, {{"\x4B"}, 1}, {{"\x4C"}, 1}, {{"\x4D"}, 1}, {{"\x4E"}, 1}, {{"\x4F"}, 1},
+ {{"\x50"}, 1}, {{"\x51"}, 1}, {{"\x52"}, 1}, {{"\x53"}, 1}, {{"\x54"}, 1}, {{"\x55"}, 1}, {{"\x56"}, 1}, {{"\x57"}, 1}, {{"\x58"}, 1}, {{"\x59"}, 1}, {{"\x5A"}, 1}, {{"\x5B"}, 1}, {{"\x5C"}, 1}, {{"\x5D"}, 1}, {{"\x5E"}, 1}, {{"\x5F"}, 1},
+ {{"\x60"}, 1}, {{"\x61"}, 1}, {{"\x62"}, 1}, {{"\x63"}, 1}, {{"\x64"}, 1}, {{"\x65"}, 1}, {{"\x66"}, 1}, {{"\x67"}, 1}, {{"\x68"}, 1}, {{"\x69"}, 1}, {{"\x6A"}, 1}, {{"\x6B"}, 1}, {{"\x6C"}, 1}, {{"\x6D"}, 1}, {{"\x6E"}, 1}, {{"\x6F"}, 1},
+ {{"\x70"}, 1}, {{"\x71"}, 1}, {{"\x72"}, 1}, {{"\x73"}, 1}, {{"\x74"}, 1}, {{"\x75"}, 1}, {{"\x76"}, 1}, {{"\x77"}, 1}, {{"\x78"}, 1}, {{"\x79"}, 1}, {{"\x7A"}, 1}, {{"\x7B"}, 1}, {{"\x7C"}, 1}, {{"\x7D"}, 1}, {{"\x7E"}, 1}, {{"\x7F"}, 1},
+ {{"\x80"}, 1}, {{"\x81"}, 1}, {{"\x82"}, 1}, {{"\x83"}, 1}, {{"\x84"}, 1}, {{"\x85"}, 1}, {{"\x86"}, 1}, {{"\x87"}, 1}, {{"\x88"}, 1}, {{"\x89"}, 1}, {{"\x8A"}, 1}, {{"\x8B"}, 1}, {{"\x8C"}, 1}, {{"\x8D"}, 1}, {{"\x8E"}, 1}, {{"\x8F"}, 1},
+ {{"\x90"}, 1}, {{"\x91"}, 1}, {{"\x92"}, 1}, {{"\x93"}, 1}, {{"\x94"}, 1}, {{"\x95"}, 1}, {{"\x96"}, 1}, {{"\x97"}, 1}, {{"\x98"}, 1}, {{"\x99"}, 1}, {{"\x9A"}, 1}, {{"\x9B"}, 1}, {{"\x9C"}, 1}, {{"\x9D"}, 1}, {{"\x9E"}, 1}, {{"\x9F"}, 1},
+ {{"\xA0"}, 1}, {{"\xA1"}, 1}, {{"\xA2"}, 1}, {{"\xA3"}, 1}, {{"\xA4"}, 1}, {{"\xA5"}, 1}, {{"\xA6"}, 1}, {{"\xA7"}, 1}, {{"\xA8"}, 1}, {{"\xA9"}, 1}, {{"\xAA"}, 1}, {{"\xAB"}, 1}, {{"\xAC"}, 1}, {{"\xAD"}, 1}, {{"\xAE"}, 1}, {{"\xAF"}, 1},
+ {{"\xB0"}, 1}, {{"\xB1"}, 1}, {{"\xB2"}, 1}, {{"\xB3"}, 1}, {{"\xB4"}, 1}, {{"\xB5"}, 1}, {{"\xB6"}, 1}, {{"\xB7"}, 1}, {{"\xB8"}, 1}, {{"\xB9"}, 1}, {{"\xBA"}, 1}, {{"\xBB"}, 1}, {{"\xBC"}, 1}, {{"\xBD"}, 1}, {{"\xBE"}, 1}, {{"\xBF"}, 1},
+ {{"\xC0"}, 1}, {{"\xC1"}, 1}, {{"\xC2"}, 1}, {{"\xC3"}, 1}, {{"\xC4"}, 1}, {{"\xC5"}, 1}, {{"\xC6"}, 1}, {{"\xC7"}, 1}, {{"\xC8"}, 1}, {{"\xC9"}, 1}, {{"\xCA"}, 1}, {{"\xCB"}, 1}, {{"\xCC"}, 1}, {{"\xCD"}, 1}, {{"\xCE"}, 1}, {{"\xCF"}, 1},
+ {{"\xD0"}, 1}, {{"\xD1"}, 1}, {{"\xD2"}, 1}, {{"\xD3"}, 1}, {{"\xD4"}, 1}, {{"\xD5"}, 1}, {{"\xD6"}, 1}, {{"\xD7"}, 1}, {{"\xD8"}, 1}, {{"\xD9"}, 1}, {{"\xDA"}, 1}, {{"\xDB"}, 1}, {{"\xDC"}, 1}, {{"\xDD"}, 1}, {{"\xDE"}, 1}, {{"\xDF"}, 1},
+ {{"\xE0"}, 1}, {{"\xE1"}, 1}, {{"\xE2"}, 1}, {{"\xE3"}, 1}, {{"\xE4"}, 1}, {{"\xE5"}, 1}, {{"\xE6"}, 1}, {{"\xE7"}, 1}, {{"\xE8"}, 1}, {{"\xE9"}, 1}, {{"\xEA"}, 1}, {{"\xEB"}, 1}, {{"\xEC"}, 1}, {{"\xED"}, 1}, {{"\xEE"}, 1}, {{"\xEF"}, 1},
+ {{"\xF0"}, 1}, {{"\xF1"}, 1}, {{"\xF2"}, 1}, {{"\xF3"}, 1}, {{"\xF4"}, 1}, {{"\xF5"}, 1}, {{"\xF6"}, 1}, {{"\xF7"}, 1}, {{"\xF8"}, 1}, {{"\xF9"}, 1}, {{"\xFA"}, 1}, {{"\xFB"}, 1}, {{"\xFC"}, 1}, {{"\xFD"}, 1}, {{"\xFE"}, 1}, {{"\xFF"}, 1}
+};
+
+#define lzw_entry_at(state, index) (&state->table[index])
+
+static lzw_state * lzw_decoder_init_table (lzw_state *state, lzw_entry *table, int flags)
+{
+ state->basebits = lzw_base_bits(flags); // first four bits of flags
+ if (!lzw_bit_range(state->basebits))
+ return NULL;
+ state->flags = flags;
+ if ((state->table = table) == NULL)
+ {
+ state->table = (lzw_entry *)lzw_malloc(LZW_TABLE_SIZE * sizeof(lzw_entry));
+ state->flags |= LZW_TABLE_ALLOC;
+ }
+ memcpy(state->table, lzw_initial_table, (size_t)lzw_initial_codes(state)*sizeof(lzw_entry));
+ // memset(&state->table[lzw_initial_codes(state)], 0, 2*sizeof(lzw_entry)); // eod and clear entries never accessed
+ state->codebits = state->basebits + 1;
+ state->index = lzw_initial_index(state);
+ state->lastentry = NULL;
+ state->tailbytes = 0;
+ state->lastbyte = 0;
+ state->tailbits = 0;
+ return state;
+}
+
+lzw_state * lzw_decoder_init (lzw_state *state, int flags)
+{
+ return lzw_decoder_init_table(state, NULL, flags);
+}
+
+static void lzw_decoder_clear (lzw_state *state)
+{
+ lzw_entry *entry;
+ lzw_index initindex = lzw_initial_index(state);
+ while (state->index > initindex)
+ {
+ entry = lzw_entry_at(state, --state->index);
+ lzw_free(entry->data);
+ // entry->data = NULL;
+ // entry->size = 0;
+ }
+ state->lastentry = NULL;
+ state->tailbytes = 0;
+ state->codebits = state->basebits + 1;
+}
+
+void lzw_decoder_close (lzw_state *state)
+{
+ lzw_decoder_clear(state);
+ if (state->flags & LZW_TABLE_ALLOC)
+ lzw_free(state->table);
+}
+
+static int lzw_next_entry (lzw_state *state, lzw_entry *nextentry)
+{
+ lzw_entry *lastentry, *newentry;
+ if ((lastentry = state->lastentry) == NULL)
+ return 1; /* it's ok */
+ if (state->index == LZW_TABLE_SIZE)
+ return 0; /* invalid input; eod marker expected earlier */
+ /* put the new entry on the top of the table */
+ newentry = lzw_entry_at(state, state->index++);
+ /* its size is the last entry's size plus 1 */
+ newentry->size = lastentry->size + 1;
+ /* its content is the content of the last entry, */
+ newentry->data = (char *)lzw_malloc((size_t)newentry->size);
+ memcpy(newentry->data, lastentry->data, lastentry->size);
+ /* plus the first byte of the new entry (usually fixed code entry) */
+ newentry->data[newentry->size - 1] = nextentry->data[0];
+ return 1;
+}
+
+#define lzw_write_bytes(O, state) ((state->tailbytes -= (int)iof_write(O, state->lastentry->data, (size_t)state->tailbytes)) == 0)
+
+iof_status lzw_decode_state (iof *I, iof *O, lzw_state *state)
+{
+ const lzw_index clear = lzw_clear_code(state), eod = lzw_eod_code(state);
+ lzw_index code;
+ lzw_entry *entry;
+ if (state->lastentry != NULL)
+ { /* write out the tail from the last call */
+ if (state->tailbytes > 0 && !lzw_write_bytes(O, state))
+ return IOFFULL;
+ /* do what we normally do at the end of the loop body below */
+ lzw_check_bits(state);
+ }
+ // if (state->flags & LZW_LOW_BITS_FIRST)
+ // return IOFERR;
+ while (1)
+ {
+ /* get input code of length state->codebits */
+ code = (state->lastbyte & ((1 << state->tailbits) - 1)) << (state->codebits - state->tailbits);
+ for (state->tailbits -= state->codebits; state->tailbits < 0; )
+ {
+ get_code:
+ if ((state->lastbyte = iof_get(I)) < 0)
+ return state->flush ? IOFEOF : state->lastbyte;
+ state->tailbits += 8;
+ if (state->tailbits < 0)
+ {
+ code |= (state->lastbyte << (-state->tailbits));
+ goto get_code;
+ }
+ else
+ {
+ code |= (state->lastbyte >> state->tailbits);
+ break;
+ }
+ }
+ /* interpret the code */
+ if (code < state->index)
+ { /* single byte code or special marker */
+ if (code == clear)
+ {
+ lzw_decoder_clear(state);
+ continue;
+ }
+ if (code == eod)
+ return IOFEOF;
+ entry = lzw_entry_at(state, code);
+ if (!lzw_next_entry(state, entry))
+ return IOFERR;
+ }
+ else if (code == state->index)
+ { /* apparently encoder has emitted the code of the key just created (see notes) */
+ if (!lzw_next_entry(state, state->lastentry))
+ return IOFERR;
+ entry = lzw_entry_at(state, state->index - 1);
+ }
+ else
+ { /* invalid input code */
+ return IOFERR;
+ }
+ /* record the entry found */
+ state->lastentry = entry;
+ /* emit the sequence pointed by that entry */
+ state->tailbytes = entry->size;
+ if (!lzw_write_bytes(O, state))
+ return IOFFULL;
+ /* check and update code bits */
+ lzw_check_bits(state);
+ }
+ return state->lastbyte; // never reached
+}
+
+/* encoder */
+
+#define lzw_node_at(state, index) (&state->lookup[index])
+
+#define lzw_node_init(node, i, c) (node->index = i, node->suffix = c, node->left = NULL, node->right = NULL, node->map = NULL)
+
+static lzw_state * lzw_encoder_init_table (lzw_state *state, lzw_node *lookup, int flags)
+{
+ lzw_index index;
+ lzw_node *node;
+ state->basebits = lzw_base_bits(flags); // the first four bits of flags are the base bits of the code (default 8)
+ if (!lzw_bit_range(state->basebits))
+ return NULL;
+ state->flags = flags;
+ if ((state->lookup = lookup) == NULL)
+ {
+ state->lookup = lzw_malloc(LZW_LOOKUP_SIZE*sizeof(lzw_node));
+ state->flags |= LZW_TABLE_ALLOC;
+ }
+ state->index = lzw_initial_index(state);
+ for (index = 0; index < lzw_initial_codes(state); ++index)
+ {
+ node = lzw_node_at(state, index);
+ lzw_node_init(node, index, (unsigned char)index);
+ }
+ state->codebits = state->basebits + 1;
+ state->lastnode = NULL;
+ state->lastbyte = 0;
+ state->tailbits = 0;
+ return state;
+}
+
+lzw_state * lzw_encoder_init (lzw_state *state, int flags)
+{
+ return lzw_encoder_init_table(state, NULL, flags);
+}
+
+void lzw_encoder_close (lzw_state *state)
+{
+ if (state->flags & LZW_TABLE_ALLOC)
+ lzw_free(state->lookup);
+}
+
+static void lzw_encoder_clear (lzw_state *state)
+{
+ lzw_node *node;
+ lzw_index index;
+ /* clear fixed nodes */
+ for (index = 0; index < lzw_initial_codes(state); ++index)
+ {
+ node = lzw_node_at(state, index);
+ lzw_node_init(node, index, (unsigned char)index);
+ }
+ /* reset table index */
+ state->index = lzw_initial_index(state);
+ /* reset code bits */
+ state->codebits = state->basebits + 1;
+}
+
+static void lzw_put_code (iof *O, lzw_state *state, lzw_index code, int todobits)
+{
+ int leftbits, rightbits;
+ do
+ {
+ leftbits = 8 - state->tailbits;
+ rightbits = todobits - leftbits;
+ if (rightbits >= 0)
+ {
+ state->lastbyte |= (code >> rightbits);
+ iof_put(O, state->lastbyte);
+ code = code & ((1 << rightbits) - 1);
+ todobits -= leftbits;
+ state->lastbyte = 0;
+ state->tailbits = 0;
+ }
+ else
+ {
+ state->lastbyte |= (code << (-rightbits));
+ state->tailbits += todobits;
+ return;
+ }
+ } while (1);
+}
+
+static iof_status lzw_encode_last (iof *O, lzw_state *state)
+{
+ if (state->flush)
+ {
+ /* put the last code if any */
+ if (state->lastnode != NULL)
+ lzw_put_code(O, state, state->lastnode->index, state->codebits);
+ /* put eod marker, */
+ lzw_put_code(O, state, lzw_eod_code(state), state->codebits);
+ /* with tail bits set to 0 */
+ if (state->tailbits > 0)
+ lzw_put_code(O, state, 0, 8 - state->tailbits);
+ return IOFEOF;
+ }
+ return IOFEMPTY;
+}
+
+static lzw_node * lzw_node_push (lzw_state *state, unsigned char suffix)
+{
+ lzw_node *node;
+ node = lzw_node_at(state, state->index);
+ lzw_node_init(node, state->index, suffix);
+ ++state->index;
+ return node;
+}
+
+static int lzw_next_node (lzw_state *state, unsigned char suffix)
+{
+ lzw_node *node;
+ if ((node = state->lastnode->map) == NULL)
+ {
+ state->lastnode->map = lzw_node_push(state, suffix);
+ return 0;
+ }
+ while (1)
+ {
+ if (suffix < node->suffix)
+ {
+ if (node->left == NULL)
+ {
+ node->left = lzw_node_push(state, suffix);
+ return 0;
+ }
+ node = node->left;
+ }
+ else if (suffix > node->suffix)
+ {
+ if (node->right == NULL)
+ {
+ node->right = lzw_node_push(state, suffix);
+ return 0;
+ }
+ node = node->right;
+ }
+ else
+ {
+ state->lastnode = node;
+ return 1;
+ }
+ }
+ return 0; // never reached
+}
+
+iof_status lzw_encode_state (iof *I, iof *O, lzw_state *state)
+{
+ int byte;
+ if (state->lastnode == NULL)
+ { /* first call only; following the convention, put a clear-table marker */
+ if (!iof_ensure(O, 2))
+ return IOFFULL;
+ lzw_put_code(O, state, lzw_clear_code(state), state->codebits);
+ /* get the first input byte and initialize the current table entry */
+ if ((byte = iof_get(I)) < 0)
+ return lzw_encode_last(O, state);
+ state->lastnode = lzw_node_at(state, byte);
+ }
+ while (iof_ensure(O, 2))
+ { /* we need to write at most 2 bytes on each iteration */
+ if ((byte = iof_get(I)) < 0)
+ return lzw_encode_last(O, state);
+ if (lzw_next_node(state, (unsigned char)byte) == 0)
+ { /* means that the key hasn't been found and the new entry has just been created */
+ /* output the code pointing to the longest sequence so far */
+ lzw_put_code(O, state, state->lastnode->index, state->codebits);
+ /* update code bits */
+ if (state->index == lzw_max_index(state) + 1)
+ {
+ if (state->codebits < LZW_MAX_BITS)
+ ++state->codebits;
+ else
+ {
+ /* put clear-table marker */
+ lzw_put_code(O, state, lzw_clear_code(state), state->codebits);
+ /* reset the table */
+ lzw_encoder_clear(state);
+ }
+ }
+ /* in any case, recent byte becomes the current table code */
+ state->lastnode = lzw_node_at(state, byte);
+ }
+ /* otherwise no new entry is appended and state->lastnode points to the longer sequence just found */
+ }
+ return IOFFULL;
+}
+
+/* single call codecs */
+
+iof_status lzw_decode (iof *I, iof *O, int flags)
+{
+ lzw_state state = LZW_INIT_STATE;
+ lzw_entry table[LZW_TABLE_SIZE];
+ int ret;
+ lzw_decoder_init_table(&state, table, flags);
+ state.flush = 1;
+ ret = lzw_decode_state(I, O, &state);
+ // iof_flush(O); // ?
+ lzw_decoder_close(&state);
+ return ret;
+}
+
+iof_status lzw_encode (iof *I, iof *O, int flags)
+{
+ lzw_state state = LZW_INIT_STATE;
+ lzw_node lookup[LZW_LOOKUP_SIZE];
+ int ret;
+ lzw_encoder_init_table(&state, lookup, flags);
+ state.flush = 1;
+ ret = lzw_encode_state(I, O, &state);
+ // iof_flush(O); // ?
+ lzw_encoder_close(&state);
+ return ret;
+}
+
+/* filters */
+
+// lzw decoder function
+
+static size_t lzw_decoder (iof *F, iof_mode mode)
+{
+ lzw_state *state;
+ iof_status status;
+ size_t tail;
+
+ state = iof_filter_state(lzw_state *, F);
+ switch(mode)
+ {
+ case IOFLOAD:
+ case IOFREAD:
+ if (F->flags & IOF_STOPPED)
+ return 0;
+ tail = iof_tail(F);
+ F->pos = F->buf + tail;
+ F->end = F->buf + F->space;
+ do {
+ status = lzw_decode_state(F->next, F, state);
+ } while (mode == IOFLOAD && status == IOFFULL && iof_resize_buffer(F));
+ return iof_decoder_retval(F, "lzw", status);
+ case IOFCLOSE:
+ lzw_decoder_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+// lzw encoder function
+
+static size_t lzw_encoder (iof *F, iof_mode mode)
+{
+ lzw_state *state;
+ iof_status status;
+
+ state = iof_filter_state(lzw_state *, F);
+ switch (mode)
+ {
+ case IOFFLUSH:
+ state->flush = 1;
+ FALLTHRU // fall through
+ case IOFWRITE:
+ F->end = F->pos;
+ F->pos = F->buf;
+ status = lzw_encode_state(F, F->next, state);
+ return iof_encoder_retval(F, "lzw", status);
+ case IOFCLOSE:
+ if (!state->flush)
+ lzw_encoder(F, IOFFLUSH);
+ lzw_encoder_close(state);
+ iof_free(F);
+ return 0;
+ default:
+ break;
+ }
+ return 0;
+}
+
+iof * iof_filter_lzw_decoder (iof *N, int flags)
+{
+ iof *I;
+ lzw_state_pointer P;
+ I = iof_filter_reader(lzw_decoder, sizeof(lzw_state), &P.voidstate);
+ iof_setup_next(I, N);
+ if (lzw_decoder_init(P.lzwstate, flags) == NULL)
+ {
+ iof_discard(I);
+ return NULL;
+ }
+ P.lzwstate->flush = 1;
+ return I;
+}
+
+iof * iof_filter_lzw_encoder (iof *N, int flags)
+{
+ iof *O;
+ lzw_state_pointer P;
+ O = iof_filter_writer(lzw_encoder, sizeof(lzw_state), &P.voidstate);
+ iof_setup_next(O, N);
+ if (lzw_encoder_init(P.lzwstate, flags) == NULL)
+ {
+ iof_discard(O);
+ return NULL;
+ }
+ return O;
+}
diff --git a/source/luametatex/source/libraries/pplib/util/utillzw.h b/source/luametatex/source/libraries/pplib/util/utillzw.h
new file mode 100644
index 000000000..9e3a085d4
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utillzw.h
@@ -0,0 +1,30 @@
+#ifndef UTIL_LZW_H
+#define UTIL_LZW_H
+
+#include "utiliof.h"
+
+typedef struct lzw_state lzw_state;
+
+#define LZW_BASE_BITS 8
+#define LZW_TABLE_ALLOC (1<<4)
+#define LZW_EARLY_INDEX (1<<5)
+//#define LZW_LOW_BITS_FIRST (1<<6)
+#define LZW_DECODER_DEFAULTS (LZW_BASE_BITS|LZW_EARLY_INDEX|0)
+#define LZW_ENCODER_DEFAULTS (LZW_BASE_BITS|LZW_EARLY_INDEX|0)
+
+lzw_state * lzw_decoder_init (lzw_state *state, int flags);
+lzw_state * lzw_encoder_init (lzw_state *state, int flags);
+
+void lzw_decoder_close (lzw_state *state);
+void lzw_encoder_close (lzw_state *state);
+
+iof_status lzw_encode_state (iof *I, iof *O, lzw_state *state);
+iof_status lzw_decode_state (iof *I, iof *O, lzw_state *state);
+
+iof_status lzw_encode (iof *I, iof *O, int flags);
+iof_status lzw_decode (iof *I, iof *O, int flags);
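+
+/* Usage sketch (illustrative): one-shot decoding between two iof streams,
+   where I and O come from elsewhere, e.g. iof_filter_file_reader()/_writer():
+
+     iof_status ret = lzw_decode(I, O, LZW_DECODER_DEFAULTS);
+     // typically IOFEOF on a complete stream, IOFERR on invalid input
+*/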
+
+iof * iof_filter_lzw_decoder (iof *N, int flags);
+iof * iof_filter_lzw_encoder (iof *N, int flags);
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilmd5.c b/source/luametatex/source/libraries/pplib/util/utilmd5.c
new file mode 100644
index 000000000..871984229
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmd5.c
@@ -0,0 +1,447 @@
+
+/* md5 implementation excerpted from code by Peter Deutsch */
+
+/* begin of md5.c */
+
+/*
+ Copyright (C) 1999, 2000, 2002 Aladdin Enterprises. All rights reserved.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ L. Peter Deutsch
+ ghost@aladdin.com
+
+ */
+/* $Id: md5.c,v 1.6 2002/04/13 19:20:28 lpd Exp $ */
+/*
+ Independent implementation of MD5 (RFC 1321).
+
+ This code implements the MD5 Algorithm defined in RFC 1321, whose
+ text is available at
+ http://www.ietf.org/rfc/rfc1321.txt
+ The code is derived from the text of the RFC, including the test suite
+ (section A.5) but excluding the rest of Appendix A. It does not include
+ any code or documentation that is identified in the RFC as being
+ copyrighted.
+
+ The original and principal author of md5.c is L. Peter Deutsch
+ <ghost@aladdin.com>. Other authors are noted in the change history
+ that follows (in reverse chronological order):
+
+ 2002-04-13 lpd Clarified derivation from RFC 1321; now handles byte order
+ either statically or dynamically; added missing #include <string.h>
+ in library.
+ 2002-03-11 lpd Corrected argument list for main(), and added int return
+ type, in test program and T value program.
+ 2002-02-21 lpd Added missing #include <stdio.h> in test program.
+ 2000-07-03 lpd Patched to eliminate warnings about "constant is
+ unsigned in ANSI C, signed in traditional"; made test program
+ self-checking.
+ 1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+ 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5).
+ 1999-05-03 lpd Original version.
+ */
+
+#include <string.h> // memcpy
+#include <stdio.h> // FILE
+
+#include "utilmd5.h"
+
+#undef BYTE_ORDER /* 1 = big-endian, -1 = little-endian, 0 = unknown */
+#ifdef ARCH_IS_BIG_ENDIAN
+# define BYTE_ORDER (ARCH_IS_BIG_ENDIAN ? 1 : -1)
+#else
+# define BYTE_ORDER 0
+#endif
+
+#define T_MASK ((uint32_t)~0)
+#define T1 /* 0xd76aa478 */ (T_MASK ^ 0x28955b87)
+#define T2 /* 0xe8c7b756 */ (T_MASK ^ 0x173848a9)
+#define T3 0x242070db
+#define T4 /* 0xc1bdceee */ (T_MASK ^ 0x3e423111)
+#define T5 /* 0xf57c0faf */ (T_MASK ^ 0x0a83f050)
+#define T6 0x4787c62a
+#define T7 /* 0xa8304613 */ (T_MASK ^ 0x57cfb9ec)
+#define T8 /* 0xfd469501 */ (T_MASK ^ 0x02b96afe)
+#define T9 0x698098d8
+#define T10 /* 0x8b44f7af */ (T_MASK ^ 0x74bb0850)
+#define T11 /* 0xffff5bb1 */ (T_MASK ^ 0x0000a44e)
+#define T12 /* 0x895cd7be */ (T_MASK ^ 0x76a32841)
+#define T13 0x6b901122
+#define T14 /* 0xfd987193 */ (T_MASK ^ 0x02678e6c)
+#define T15 /* 0xa679438e */ (T_MASK ^ 0x5986bc71)
+#define T16 0x49b40821
+#define T17 /* 0xf61e2562 */ (T_MASK ^ 0x09e1da9d)
+#define T18 /* 0xc040b340 */ (T_MASK ^ 0x3fbf4cbf)
+#define T19 0x265e5a51
+#define T20 /* 0xe9b6c7aa */ (T_MASK ^ 0x16493855)
+#define T21 /* 0xd62f105d */ (T_MASK ^ 0x29d0efa2)
+#define T22 0x02441453
+#define T23 /* 0xd8a1e681 */ (T_MASK ^ 0x275e197e)
+#define T24 /* 0xe7d3fbc8 */ (T_MASK ^ 0x182c0437)
+#define T25 0x21e1cde6
+#define T26 /* 0xc33707d6 */ (T_MASK ^ 0x3cc8f829)
+#define T27 /* 0xf4d50d87 */ (T_MASK ^ 0x0b2af278)
+#define T28 0x455a14ed
+#define T29 /* 0xa9e3e905 */ (T_MASK ^ 0x561c16fa)
+#define T30 /* 0xfcefa3f8 */ (T_MASK ^ 0x03105c07)
+#define T31 0x676f02d9
+#define T32 /* 0x8d2a4c8a */ (T_MASK ^ 0x72d5b375)
+#define T33 /* 0xfffa3942 */ (T_MASK ^ 0x0005c6bd)
+#define T34 /* 0x8771f681 */ (T_MASK ^ 0x788e097e)
+#define T35 0x6d9d6122
+#define T36 /* 0xfde5380c */ (T_MASK ^ 0x021ac7f3)
+#define T37 /* 0xa4beea44 */ (T_MASK ^ 0x5b4115bb)
+#define T38 0x4bdecfa9
+#define T39 /* 0xf6bb4b60 */ (T_MASK ^ 0x0944b49f)
+#define T40 /* 0xbebfbc70 */ (T_MASK ^ 0x4140438f)
+#define T41 0x289b7ec6
+#define T42 /* 0xeaa127fa */ (T_MASK ^ 0x155ed805)
+#define T43 /* 0xd4ef3085 */ (T_MASK ^ 0x2b10cf7a)
+#define T44 0x04881d05
+#define T45 /* 0xd9d4d039 */ (T_MASK ^ 0x262b2fc6)
+#define T46 /* 0xe6db99e5 */ (T_MASK ^ 0x1924661a)
+#define T47 0x1fa27cf8
+#define T48 /* 0xc4ac5665 */ (T_MASK ^ 0x3b53a99a)
+#define T49 /* 0xf4292244 */ (T_MASK ^ 0x0bd6ddbb)
+#define T50 0x432aff97
+#define T51 /* 0xab9423a7 */ (T_MASK ^ 0x546bdc58)
+#define T52 /* 0xfc93a039 */ (T_MASK ^ 0x036c5fc6)
+#define T53 0x655b59c3
+#define T54 /* 0x8f0ccc92 */ (T_MASK ^ 0x70f3336d)
+#define T55 /* 0xffeff47d */ (T_MASK ^ 0x00100b82)
+#define T56 /* 0x85845dd1 */ (T_MASK ^ 0x7a7ba22e)
+#define T57 0x6fa87e4f
+#define T58 /* 0xfe2ce6e0 */ (T_MASK ^ 0x01d3191f)
+#define T59 /* 0xa3014314 */ (T_MASK ^ 0x5cfebceb)
+#define T60 0x4e0811a1
+#define T61 /* 0xf7537e82 */ (T_MASK ^ 0x08ac817d)
+#define T62 /* 0xbd3af235 */ (T_MASK ^ 0x42c50dca)
+#define T63 0x2ad7d2bb
+#define T64 /* 0xeb86d391 */ (T_MASK ^ 0x14792c6e)
+
+static void md5_process (md5_state *state, const uint8_t *data /*[64]*/)
+{
+ uint32_t
+ a = state->words[0], b = state->words[1],
+ c = state->words[2], d = state->words[3];
+ uint32_t t;
+#if BYTE_ORDER > 0
+ /* Define storage only for big-endian CPUs. */
+ uint32_t X[16];
+#else
+ /* Define storage for little-endian or both types of CPUs. */
+ uint32_t xbuf[16];
+ const uint32_t *X;
+#endif
+
+ {
+#if BYTE_ORDER == 0
+ /*
+ * Determine dynamically whether this is a big-endian or
+ * little-endian machine, since we can use a more efficient
+ * algorithm on the latter.
+ */
+ static const int w = 1;
+
+ if (*((const uint8_t *)&w)) /* dynamic little-endian */
+#endif
+#if BYTE_ORDER <= 0 /* little-endian */
+ {
+ /*
+ * On little-endian machines, we can process properly aligned
+ * data without copying it.
+ */
+ if (!((data - (const uint8_t *)0) & 3)) {
+ /* data are properly aligned */
+ X = (const uint32_t *)((const void *)data); // avoid compiler warning
+ } else {
+ /* not aligned */
+ memcpy(xbuf, data, 64);
+ X = xbuf;
+ }
+ }
+#endif
+#if BYTE_ORDER == 0
+ else /* dynamic big-endian */
+#endif
+#if BYTE_ORDER >= 0 /* big-endian */
+ {
+ /*
+ * On big-endian machines, we must arrange the bytes in the
+ * right order.
+ */
+ const uint8_t *xp = data;
+ int i;
+# if BYTE_ORDER == 0
+ X = xbuf; /* (dynamic only) */
+# else
+# define xbuf X /* (static only) */
+# endif
+ for (i = 0; i < 16; ++i, xp += 4)
+ xbuf[i] = xp[0] + (xp[1] << 8) + (xp[2] << 16) + (xp[3] << 24);
+ }
+#endif
+ }
+
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+ /* Round 1. */
+ /* Let [abcd k s i] denote the operation
+ a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */
+#define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + F(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 0, 7, T1);
+ SET(d, a, b, c, 1, 12, T2);
+ SET(c, d, a, b, 2, 17, T3);
+ SET(b, c, d, a, 3, 22, T4);
+ SET(a, b, c, d, 4, 7, T5);
+ SET(d, a, b, c, 5, 12, T6);
+ SET(c, d, a, b, 6, 17, T7);
+ SET(b, c, d, a, 7, 22, T8);
+ SET(a, b, c, d, 8, 7, T9);
+ SET(d, a, b, c, 9, 12, T10);
+ SET(c, d, a, b, 10, 17, T11);
+ SET(b, c, d, a, 11, 22, T12);
+ SET(a, b, c, d, 12, 7, T13);
+ SET(d, a, b, c, 13, 12, T14);
+ SET(c, d, a, b, 14, 17, T15);
+ SET(b, c, d, a, 15, 22, T16);
+#undef SET
+
+ /* Round 2. */
+ /* Let [abcd k s i] denote the operation
+ a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */
+#define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + G(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 1, 5, T17);
+ SET(d, a, b, c, 6, 9, T18);
+ SET(c, d, a, b, 11, 14, T19);
+ SET(b, c, d, a, 0, 20, T20);
+ SET(a, b, c, d, 5, 5, T21);
+ SET(d, a, b, c, 10, 9, T22);
+ SET(c, d, a, b, 15, 14, T23);
+ SET(b, c, d, a, 4, 20, T24);
+ SET(a, b, c, d, 9, 5, T25);
+ SET(d, a, b, c, 14, 9, T26);
+ SET(c, d, a, b, 3, 14, T27);
+ SET(b, c, d, a, 8, 20, T28);
+ SET(a, b, c, d, 13, 5, T29);
+ SET(d, a, b, c, 2, 9, T30);
+ SET(c, d, a, b, 7, 14, T31);
+ SET(b, c, d, a, 12, 20, T32);
+#undef SET
+
+ /* Round 3. */
+ /* Let [abcd k s t] denote the operation
+ a = b + ((a + H(b,c,d) + X[k] + T[i]) <<< s). */
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + H(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 5, 4, T33);
+ SET(d, a, b, c, 8, 11, T34);
+ SET(c, d, a, b, 11, 16, T35);
+ SET(b, c, d, a, 14, 23, T36);
+ SET(a, b, c, d, 1, 4, T37);
+ SET(d, a, b, c, 4, 11, T38);
+ SET(c, d, a, b, 7, 16, T39);
+ SET(b, c, d, a, 10, 23, T40);
+ SET(a, b, c, d, 13, 4, T41);
+ SET(d, a, b, c, 0, 11, T42);
+ SET(c, d, a, b, 3, 16, T43);
+ SET(b, c, d, a, 6, 23, T44);
+ SET(a, b, c, d, 9, 4, T45);
+ SET(d, a, b, c, 12, 11, T46);
+ SET(c, d, a, b, 15, 16, T47);
+ SET(b, c, d, a, 2, 23, T48);
+#undef SET
+
+ /* Round 4. */
+ /* Let [abcd k s t] denote the operation
+ a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + I(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 0, 6, T49);
+ SET(d, a, b, c, 7, 10, T50);
+ SET(c, d, a, b, 14, 15, T51);
+ SET(b, c, d, a, 5, 21, T52);
+ SET(a, b, c, d, 12, 6, T53);
+ SET(d, a, b, c, 3, 10, T54);
+ SET(c, d, a, b, 10, 15, T55);
+ SET(b, c, d, a, 1, 21, T56);
+ SET(a, b, c, d, 8, 6, T57);
+ SET(d, a, b, c, 15, 10, T58);
+ SET(c, d, a, b, 6, 15, T59);
+ SET(b, c, d, a, 13, 21, T60);
+ SET(a, b, c, d, 4, 6, T61);
+ SET(d, a, b, c, 11, 10, T62);
+ SET(c, d, a, b, 2, 15, T63);
+ SET(b, c, d, a, 9, 21, T64);
+#undef SET
+
+ /* Then perform the following additions. (That is increment each
+ of the four registers by the value it had before this block
+ was started.) */
+ state->words[0] += a;
+ state->words[1] += b;
+ state->words[2] += c;
+ state->words[3] += d;
+}
+
+/* api */
+
+md5_state * md5_digest_init (md5_state *state)
+{
+ state->bitcount[0] = state->bitcount[1] = 0;
+ state->words[0] = 0x67452301;
+ state->words[1] = /*0xefcdab89*/ T_MASK ^ 0x10325476;
+ state->words[2] = /*0x98badcfe*/ T_MASK ^ 0x67452301;
+ state->words[3] = 0x10325476;
+ return state;
+}
+
+void md5_digest_add (md5_state *state, const void *input, size_t size)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ int nbytes = (int)size; // PJ
+ int left = nbytes;
+ int offset = (state->bitcount[0] >> 3) & 63;
+ uint32_t nbits = (uint32_t)(nbytes << 3);
+
+ if (nbytes <= 0)
+ return;
+
+ /* Update the message length. */
+ state->bitcount[1] += nbytes >> 29;
+ state->bitcount[0] += nbits;
+ if (state->bitcount[0] < nbits)
+ state->bitcount[1]++;
+
+ /* Process an initial partial block. */
+ if (offset) {
+ int copy = (offset + nbytes > 64 ? 64 - offset : nbytes);
+
+ memcpy(state->buffer + offset, p, copy);
+ if (offset + copy < 64)
+ return;
+ p += copy;
+ left -= copy;
+ md5_process(state, state->buffer);
+ }
+
+ /* Process full blocks. */
+ for (; left >= 64; p += 64, left -= 64)
+ md5_process(state, p);
+
+ /* Process a final partial block. */
+ if (left)
+ memcpy(state->buffer, p, left);
+}
+
+#define md5_digest_byte(state, i) (uint8_t)(state->words[i >> 2] >> ((i & 3) << 3))
+
+void md5_digest_get (md5_state *state, uint8_t digest[], int flags)
+{
+ static const uint8_t pad[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ uint8_t data[8];
+ int i;
+
+ /* Save the length before padding. */
+ for (i = 0; i < 8; ++i)
+ data[i] = (uint8_t)(state->bitcount[i >> 2] >> ((i & 3) << 3));
+ /* Pad to 56 bytes mod 64. */
+ md5_digest_add(state, pad, ((55 - (state->bitcount[0] >> 3)) & 63) + 1);
+ /* Append the length. */
+ md5_digest_add(state, data, 8);
+
+ /* Output */
+ if (flags & MD5_HEX)
+ { // expected digest buffer size MD5_STRING_LENGTH
+ uint8_t byte;
+ const char *alphabet;
+ alphabet = (flags & MD5_LCHEX) ? "0123456789abcdef" : "0123456789ABCDEF";
+ for (i = 0; i < MD5_DIGEST_LENGTH; ++i)
+ {
+ byte = md5_digest_byte(state, i);
+ *digest++ = (uint8_t)alphabet[byte >> 4];
+ *digest++ = (uint8_t)alphabet[byte & 15];
+ }
+ *digest = 0;
+ }
+ else
+ { // expected digest buffer size MD5_DIGEST_LENGTH
+ for (i = 0; i < MD5_DIGEST_LENGTH; ++i)
+ *digest++ = md5_digest_byte(state, i);
+ }
+}
+
+void md5_digest (const void *input, size_t length, uint8_t digest[], int flags)
+{
+ md5_state md5;
+ md5_digest_init(&md5);
+ md5_digest_add(&md5, input, length);
+ md5_digest_get(&md5, digest, flags);
+}
+
+/* file checksum */
+
+#define DIGEST_BUFFER_SIZE 4096
+
+int md5_digest_add_file (md5_state *state, const char *filename)
+{
+ FILE *fh;
+ uint8_t buffer[DIGEST_BUFFER_SIZE];
+ size_t read;
+
+ if ((fh = fopen(filename, "rb")) == NULL)
+ return 0;
+ do {
+ read = fread(buffer, 1, DIGEST_BUFFER_SIZE, fh);
+ md5_digest_add(state, buffer, read);
+ } while (read == DIGEST_BUFFER_SIZE);
+ fclose(fh);
+ return 1;
+}
+
+int md5_digest_file (const char *filename, uint8_t digest[], int flags)
+{
+ md5_state state;
+
+ md5_digest_init(&state);
+ if (md5_digest_add_file(&state, filename))
+ {
+ md5_digest_get(&state, digest, flags);
+ return 1;
+ }
+ return 0;
+} \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilmd5.h b/source/luametatex/source/libraries/pplib/util/utilmd5.h
new file mode 100644
index 000000000..3964d59df
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmd5.h
@@ -0,0 +1,49 @@
+
+/* md5 implementation excerpted from code by Peter Deutsch */
+
+#ifndef UTIL_MD5_H
+#define UTIL_MD5_H
+
+#include <stdint.h>
+#include <stddef.h> // for size_t
+#include "utildecl.h"
+
+//#define md5_state md5_state_t
+
+typedef struct {
+ uint32_t bitcount[2];
+ uint32_t words[4];
+ uint8_t buffer[64];
+} md5_state;
+
+#define MD5_DIGEST_LENGTH 16
+#define MD5_STRING_LENGTH (MD5_DIGEST_LENGTH * 2 + 1)
+
+enum {
+ MD5_BYTES = 0,
+ MD5_UCHEX = (1<<0),
+ MD5_LCHEX = (1<<1)
+};
+
+#define MD5_DEFAULT MD5_BYTES
+#define MD5_HEX (MD5_UCHEX|MD5_LCHEX)
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+UTILAPI md5_state * md5_digest_init (md5_state *state);
+UTILAPI void md5_digest_add (md5_state *state, const void *input, size_t size);
+UTILAPI void md5_digest_get (md5_state *state, uint8_t digest[], int flags);
+
+UTILAPI void md5_digest (const void *input, size_t length, uint8_t digest[], int flags);
+
+UTILAPI int md5_digest_add_file (md5_state *state, const char *filename);
+UTILAPI int md5_digest_file (const char *filename, uint8_t digest[], int flags);
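+
+/* Usage sketch (illustrative; the value is the RFC 1321 test vector):
+
+     uint8_t hex[MD5_STRING_LENGTH];
+     md5_digest("abc", 3, hex, MD5_LCHEX);
+     // hex now holds "900150983cd24fb0d6963f7d28e17f72"
+*/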
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilmem.c b/source/luametatex/source/libraries/pplib/util/utilmem.c
new file mode 100644
index 000000000..9a32247ab
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmem.c
@@ -0,0 +1,67 @@
+
+#include <string.h> // for memcpy
+
+#include "utilmem.h"
+#include "utillog.h"
+
+#ifndef util_memerr
+# if defined(_WIN64) || defined(__MINGW32__)
+# define util_memerr(size) { loggerf("ooops, not enough memory (%I64u)", ((unsigned long long)(size))); abort(); }
+# else
+# define util_memerr(size) { loggerf("ooops, not enough memory (%llu)", ((unsigned long long)(size))); abort(); }
+# endif
+#endif
+
+void * util_malloc (size_t size)
+{
+ void *m;
+ if ((m = malloc(size)) == NULL)
+ util_memerr(size);
+ return m;
+}
+
+void * util_calloc (size_t num, size_t size)
+{
+ void *m;
+ if ((m = calloc(num, size)) == NULL)
+ util_memerr(size);
+ return m;
+}
+
+void * util_realloc (void *m, size_t size)
+{
+ if ((m = realloc(m, size)) == NULL)
+ util_memerr(size);
+ return m;
+}
+
+/* common array resizer
+
+data -- the beginning of array
+unit -- sizeof array element
+size -- current array size
+extra -- requested extra size
+space -- pointer to available space
+allocated -- flag indicating if *data has been allocated (with malloc)
+
+*/
+
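+/* Usage sketch (illustrative; arr/used/space/alloced are not part of the API):
+
+     int *arr = NULL; size_t used = 0, space = 0; int alloced = 0;
+     if (used + 1 > space) {
+       util_resize((void **)&arr, sizeof(int), used, 1, &space, alloced);
+       alloced = 1; // later calls may realloc in place
+     }
+     arr[used++] = 42;
+*/
+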
+void util_resize (void **data, size_t unit, size_t size, size_t extra, size_t *space, int allocated)
+{
+ if (*space == 0)
+ *space = 4; // must be non-zero, otherwise the doubling below would never terminate
+ do { *space <<= 1; } while (size + extra > *space);
+
+ if (allocated)
+ {
+ *data = util_realloc(*data, *space * unit);
+ }
+ else
+ {
+ void *newdata = util_malloc(*space * unit);
+ if (*data != NULL)
+ memcpy(newdata, *data, size * unit);
+ *data = newdata;
+ }
+}
+
diff --git a/source/luametatex/source/libraries/pplib/util/utilmem.h b/source/luametatex/source/libraries/pplib/util/utilmem.h
new file mode 100644
index 000000000..4cfcfaba2
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmem.h
@@ -0,0 +1,16 @@
+
+#ifndef UTIL_MEM_H
+#define UTIL_MEM_H
+
+#include <stdlib.h> // for size_t and alloc functions
+#include "utildecl.h"
+
+UTILAPI void * util_malloc (size_t size);
+UTILAPI void * util_calloc (size_t num, size_t size);
+UTILAPI void * util_realloc (void *m, size_t size);
+
+void util_resize (void **data, size_t unit, size_t size, size_t extra, size_t *space, int allocated);
+
+#define util_free free // not a call, might be used as identifier
+
+#endif
diff --git a/source/luametatex/source/libraries/pplib/util/utilmemallc.h b/source/luametatex/source/libraries/pplib/util/utilmemallc.h
new file mode 100644
index 000000000..6d0ed2a06
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmemallc.h
@@ -0,0 +1,569 @@
+/*
+Allocators
+==========
+
+Using the allocators defined here makes sense if there is a need to take space for a rather large amount of rather small
+objects. The basic idea is to take memory in reasonably large blocks and to distribute small chunks from those blocks. Once
+chunks are no longer needed, one can free them all at once, or free taken chunks individually.
+
+We define 3 types of allocators:
+
+1. HEAP - the simplest one, provides variable length memory chunks from larger blocks and frees them all on explicit
+ request. There is no way to free individual objects, only the entire heap. The heap only grows, until freed.
+
+2. STOCK - provides variable length memory chunks from larger blocks, but allows freeing individual objects as well as the
+ entire stock. The stock grows and shrinks, by leaps and bounds, depending on parameters given during initialization.
+
+3. POOL - provides fixed size memory chunks from larger blocks. It allows freeing individual chunks as well as the entire pool.
+ In contrast to a stock, a pool also reuses the space reclaimed by freeing individual chunks; before allocating a new block it
+ first recycles freed chunks, if any.
+
+In general, memory chunks provided by allocators are never reallocated. Allocators do nothing with provided chunks until freed.
+
+Allocators are represented as small structures (several pointers and integers). We assume that such structures are either static
+variables, or members of larger structures. We don't bother to allocate memory for them. The usage scheme is pretty similar for
+all:
+
+ init() - just initialize, don't allocate anything yet
+ take() - take chunks
+ take()
+ take()
+ ...
+ free() - free them all at once
+
+For stocks and pools there is a possibility to give back individual chunks:
+
+ init() - like above
+ take() - take chunks
+ take()
+ take()
+ back() - give chunks back when no longer needed
+ take()
+ back()
+ ...
+ free() - needed only if not all chunks have been given back
+
+All calls take a shell structure pointer as an argument. take() returns a void pointer, aligned according to the variant used
+(8, 16, 32, 64). back() takes a void pointer as the second argument. It must be the pointer previously returned by take().
+
+back() can be called in any order and can obviously be interleaved with take(). By default, after back()-ing all taken chunks,
+the stock returns to its initial state (zero memory used). A special KEEP flag can be used during initialization to prevent
+freeing the last (sole) allocated block. If the KEEP option is used, the allocator always keeps a single segment for further
+allocations. This is necessary only when there is a risk that just several take() calls will be immediately followed by the
+same number of back() calls. The KEEP flag prevents allocating and freeing rather large blocks just to serve several chunks.
+And this is actually important only if there are no other blocks taken, that is, if there is only one, nearly empty block in
+use. In other cases the KEEP flag doesn't matter, but the allocator takes care to always have a block for fast allocs.
+
+There is also a clear() operation that frees all but the recent block. One can use it to free all chunks taken so far, but
+keep the allocator ready for further allocs. If either the KEEP flag is used or clear() is called, sooner or later the user
+has to call free() explicitly, to remove all the remaining memory kept by the allocator. There is no KEEP flag for heaps, as
+heaps don't allow freeing individual chunks. And so, the heap never needs to decide whether the last sole block should be
+removed or not. The user makes the decision by calling clear() vs free() respectively.
+
+Pop
+===
+
+The very last chunk taken can be quickly given back with
+
+ pop(heap, taken, size) // for heap or stock
+ pop(pool, taken) // for pool
+
+taken must be the chunk returned by the very last take(), size must be the size requested. If the chunk has been taken from
+the head block (more about blocks below), the block pointer returns to its previous position, as it was before the last take().
+If the chunk has been taken from the sole block beneath the head, the entire sole block (containing just that single chunk)
+is freed. The pop() operation is different from back(); popping a chunk doesn't cause the head block to be freed when its
+refcount reaches zero. So the pop() operation breaks the concept of a stock that frees all the memory once all taken chunks
+are given back. On the other hand, if for some reason the very last taken chunk is to be ignored, pop() is better, as it
+doesn't cause block scattering. The popped chunk pointer will probably be returned by the very next call to take(). In case
+of a heap, pop() is the only way to discard the chunk, as there is no back() operation.
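+
+A short sketch (heap8 variant as above; the size passed to pop() must match the size passed to the last take()):
+
+ void *p = heap8_take(&heap, 16);
+ ... // p turns out to be unneeded
+ heap8_pop(&heap, p, 16); // rewinds the head block, or frees the sole block made for p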
+
+Buffer interface
+================
+
+When a heap or stock is used by parsers, the caller often doesn't know how much space will be needed for the data (this doesn't
+apply to pools, which return constant-size memory chunks). Here is an interface for the allocator-as-buffer case (same for
+heap and stock):
+
+ some(heap, atleast, &space);
+ ...
+ atleast <<= 1;
+ more(heap, taken, written, atleast, &space);
+ ...
+ done(heap, taken, written);
+
+The some() operation provides a data pointer to at least the given number of bytes. The actual space provided for writing is
+set via the third argument. The caller may write space bytes. If more space is needed, the more() operation takes care to
+provide a chunk for the given amount of bytes and copies the already written bytes from the previous chunk to the new location.
+Same as with some(), the caller requests atleast bytes, and the actually provided chunk size is given as space (space >= atleast).
+
+The function takes the pointer to the chunk previously taken; the one returned by some() or more(). This argument must not be NULL.
+If you don't want to copy the data, set the written argument to zero. No matter if the more() operation was used zero, one or
+multiple times, the cycle must end with done(). The call triple - some(), more() and done() - must not be interrupted by any
+other api calls. In particular, using take() or back() smells like a segfault. However, if there is a need to discard the buffer
+being written (eg. input data error), instead of done() one may use
+
+ giveup(heap, taken)
+
+If done() has already been called, pop() is the only option to discard the chunk
+
+ pop(heap, taken, written)
+
+The some() operation usually doesn't change the state of the heap, unless the heap head block is NULL, or the atleast parameter
+is too large to fit in the remaining block. more() usually changes the state, either by allocating a new head block, or by
+allocating a sole block just beneath the head (blocks and the block tiling mechanism are described below). If a sole block has
+been taken for some large chunk, subsequent calls to more() reallocate this sole block in place. It is assumed that the size
+you request in subsequent calls generally grows. It is ok to request space bytes, then call done() with a written value less
+than requested. But the drawback is that if the chunk has already been allocated from a sole block, the space requested but
+not used is a waste.
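+
+A minimal sketch of the cycle (heap8 variant, continuing the heap declared above; have_input() and next_byte() are
+hypothetical parser stand-ins):
+
+ size_t space, written = 0, atleast = 32;
+ uint8_t *buf = (uint8_t *)heap8_some(&heap, atleast, &space);
+ while (have_input())
+ {
+   if (written == space)
+   {
+     atleast <<= 1;
+     buf = (uint8_t *)heap8_more(&heap, buf, written, atleast, &space);
+   }
+   buf[written++] = next_byte();
+ }
+ heap8_done(&heap, buf, written);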
+
+iof interface
+=============
+
+iof is an independent interface for buffers written/read byte-by-byte. When used together with allocators, it provides
+a convenient way to write byte data to the heap or stock, without a need for intermediate buffers. The buffer is set up with
+
+ iof output, *O
+ O = buffer_init(heap, &output); // doesn't allocate anything
+
+or
+
+ output = BUFFER_INIT(heap); // doesn't allocate anything
+ O = &output;
+
+iof keeps pointers to the beginning of the buffer, end of buffer, and current position. Once the position reaches the end,
+the iof internal handler updates the buffer providing more space to write. When used in conjunction with heap or stock,
+the space to write is the space provided by the heap or stock. To start the buffer session:
+
+ O = buffer_some(heap, O, atleast) // ensure that iof *O has atleast bytes available for writing
+
+Once you are done with writing some chunk
+
+ buffer_done(heap, O)
+
+instead of buffer_done(), one may also use
+
+ iof_flush(O) // calls buffer_done() and buffer_some() again
+
+which updates the underlying heap or stock, and makes the iof ready for a new chunk. iof itself does not allocate memory,
+so it doesn't need a finalizer. iof_close(output) does nothing. To drop the buffer use:
+
+ buffer_giveup(heap, O) // restore the state from before buffer_some()
+
+More often than not, we need to specify a minimal space for the buffer each time, eg. for memcpy() or so. The actual space left
+can be checked with iof_left(O). The entire space of the recent chunk is O->space (i.e. O->end - O->buf).
+
+Identical interface for heap and stock.
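+
+A sketch of the whole cycle (heap8 variant, names as used in the description above; the ->pos write pointer is an
+assumption based on that description, the real iof api lives in utiliof.h):
+
+ iof output, *O;
+ O = buffer_init(&heap, &output); // no allocation yet
+ O = buffer_some(&heap, O, 16); // at least 16 writable bytes
+ *O->pos++ = 'a';
+ buffer_done(&heap, O);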
+
+Blocks
+======
+
+Each allocator structure keeps a pointer to a head block, initially NULL. Most new chunks are taken from the head. Once the
+space left in the head block is too small to provide a chunk of the requested size, a new head is created and the previous one
+is linked to the head (blocks form a linked list). A stock block is named a ream, a heap block is named a pyre, a pool block is
+named a pile (we need to distinguish structure names in code, but in the description below they are all called blocks). Every
+block knows the number of chunks taken from that block (refcount). A stock also keeps the number of freed chunks [actually only
+for statistics; in most cases it doesn't need extra space in struct ream, as this structure member lies in the place
+of padding bytes.]
+
+We change the head block only when a new block is allocated, but we never change the head backward. Once some block becomes
+->prev, it will never become the head again. This ensures that the allocator has a head block that usually has a lot of space
+for new allocs. This needs special care when removing a block that is not the head block. We check if the block next to the one
+being removed is the head. If it is, and if its refcount is zero (and no KEEP flag is used), the head is removed as well.
+
+The basis of pools is similar to stocks and heaps, but there are some significant differences. A pool serves memory chunks of
+equal size, specified during initialization. This also means that the pool knows the boundaries of individual chunks (stock and
+heap don't). A pool provides iterators over chunks in use (taken but not given back yet). A pool shell structure keeps
+a pointer to a head block and a tail block (they may point to different blocks, the same block or NULL). This is necessary only
+for iterators, to let the user follow the chunks from the first or from the last taken. The extra cost of maintaining both
+->head and ->tail is negligible.
+
+Refcounting
+===========
+
+Heap refcounting: whenever a new chunk is taken, the block refcount is incremented. It is never decremented, but plays an
+important role in the block tiling algorithm (below). No KEEP flag is used here. All blocks are removed on free(); all but the
+recent one are removed on clear().
+
+Stock refcounting: whenever a new chunk is taken from the block, the block refcount is incremented. Whenever the chunk is given
+back, the refcount is decremented. When the refcount reaches zero, the block is removed and freed. To remove a block from the
+list (any block, not necessarily the head block), a stock needs a bidirectional list; every block has ->next and ->prev links.
+The head block of the stock is freed only if this is the last (sole) block and no KEEP flag was used during initialization.
+Otherwise the block is just reset, becoming ready for further allocations - the refcount goes to zero, the data space is reset
+to its initial state.
+
+Pool refcounting: pretty much like with stocks, except that any chunk given back can be recycled on further take().
+
+Ghosts
+======
+
+Every allocated block starts with a private structure for next/prev links, data pointer, refcount. We call it a block ghost.
+Except for the heap, individual chunks also need a ghost (chunk ghost) so that we are able to know which block the chunk
+comes from once the chunk is given back by the user (heaps have no back() operation, so their data chunks have no ghosts). We
+keep ghosts as small as possible. Chunk ghosts have the size natural for the alignment variant (1, 2, 4 or 8 bytes). Block
+ghosts are somewhat larger. Statistics show clearly that it is worth keeping them as small as possible:
+- chunk ghosts keep an offset to the block ghost, not a pointer to it (we use the pointer only if it makes no difference
+ to the chunk size; the 64-bit aligned variant on a 64-bit machine, the 32 and 64 variants on a 32-bit machine)
+- block ghosts use a data pointer (not an offset) so that we are able to record any requested chunk size (size_t) and to avoid
+ long array indexing on every chunk request
+
+At some point we considered storing a shell structure pointer in the block ghost; then the back() operation wouldn't need an
+extra argument. But stats showed that the size of the block ghost is the most significant factor in memory usage efficiency,
+so eliminating this extra pointer pays off. Besides, this would make it impossible to relocate the shell structure. We don't
+allocate memory for the shell, so we shouldn't make assumptions about the shell structure address.
+
+Tiling algorithm
+================
+
+Tiling the block size refers to stocks and heaps, which serve memory chunks of variable size. Both stock and heap perform best
+when the average size of requested chunks is way smaller than the configured block size. But both also put no limitations on
+chunk sizes, so they need to cope with the situation where the requested size is quite large, eg. half of the block size or
+even more than the block size. Here is the algorithm used for block tiling:
+
+1. When the requested chunk size fits in the recent block, just serve it from that block. This is the best and hopefully the
+ most common case.
+
+2. When the requested chunk size is larger than the space left in the recent block, a new block must be allocated. But there
+are two ways:
+
+ a) either replace the head block with the new block so that the list of blocks is
+
+ ... <- prev <- head so far <- new head
+
+ b) or insert the block just "below the head", keeping the head intact,
+
+ ... <- prev <- new single head <- head
+
+The first is the regular case. It is used when the space left in the head so far is small (can be neglected), and the requested
+size is relatively small (will fit the new block). If the space left in the head block is worth bothering about, or the
+requested chunk size is rather large, the new chunk is served from a single block, allocated just for that chunk. The block is
+of the size needed for that chunk. The block never becomes the head; no other chunks will be served from it (its refcount is
+permanently 1, until freed).
+
+Much depends on what is considered 'small, negligible block space' and 'rather large chunk size'. The latter is easier to
+imagine. When the requested size is larger than the block size used for a given allocator, the size is definitely
+considered large. When it is smaller than the block size, but still large enough to occupy most of the block size (grabbing
+quite some space meant for tiny chunks), it is also considered large. Like the block size, what is considered 'large' can be
+specified during initialization. A setup that works fine for me is (large = block_size / 2).
+
+Deciding what leftover block space we can neglect is not quite obvious. In the first approach we used a constant value,
+requested from the user during allocator initialization. But it is hard to select a good default. Now we compute this value
+from block params, by dividing the space occupied so far in the block by the number of chunks served from that block
+(the average size of chunks allocated from this block). We assume that the average chunk size (or smaller) is the space we can
+neglect. The logic behind it is the following: if the space left in the block is larger than the average, it makes sense not to
+waste this space and to keep it for further allocs. If the space left in the block is less than the average, there is only a
+little chance we will get a request of a suitable size, so we sacrifice that space and start allocating from a new block.
+
+Statistics showed a caveat in the average == threshold approach. Suppose we have a block with an average chunk size of 16,
+there are 18 bytes left in the block (not negligible), and the user request is 20 bytes. Allocating a single block for 20 bytes
+is bad, because the block ghost is 24 bytes (more internal memory than allocated memory). Allocating many such blocks gives bad
+results; many more allocs than necessary, large waste. To avoid that, we help to neglect the remaining block space by checking
+if the space left is smaller than the block ghost size, which is an inevitable cost anyway.
+
+The stats below show clearly that we should rather focus on "how to avoid producing sole-chunk blocks" instead of "how to fill
+the remaining space".
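+
+In pseudo-C, the whole decision reads as follows (a restatement of the rules above; it mirrors the take_new_block*()
+macros defined later in this file, where blockghost stands for the per-variant block structure and used/chunks is the
+average chunk size):
+
+ if (size <= block->left)
+   ... // 1. serve from the head block
+ else if (size < alloc->large &&
+          (block->left <= sizeof(blockghost) || (block->chunks > 0 && block->left <= used / block->chunks)))
+   ... // 2a. neglect the tail, allocate a new head block
+ else
+   ... // 2b. allocate a sole block just for this chunk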
+
+Recycling
+=========
+
+Recycling applies only to pools. When a chunk is given back, it is inserted into a list of items for recycling. Every pool
+block keeps the head of that list. Once a chunk is given back, it becomes the new recycling head and the previous head is
+attached to it. Since every chunk is associated with a ghost, we use ghosts to store a link (pointer or offset) to another item
+for recycling. Note that the ghost always keeps either a link to the block it belongs to, or a link to another recyclable ghost
+of the same block. This is used by iterators to distinguish the chunk currently in use from the chunk that has already been
+given back; if the link points to the block, the chunk is in use.
+
+A pool block that has at least one recyclable chunk is called a squot. A pool shell structure keeps a bidirectional list of
+squots. Once a pool block becomes a squot, it is inserted into that list. Once all its recyclable items have been used, it is
+removed from the squots list. At every moment, the pool has access to a list of all squots, and therefore, to a list of all
+recyclable items.
+
+Whenever there is a request for a new chunk, it is first served from the head block, as this is the easiest and cheapest way.
+Once the recent block has no more room for new items, the recycling list is used, starting from the head recyclable chunk of
+the head squot. In practice this is always the most recently reclaimed chunk ghost. During further allocs, a pool will first
+utilize all recyclables from all squots before allocating a new block.
+
+Stats
+=====
+
+Some numbers. The test was made on a stock8, block size 255 bytes, 10000 allocations, random chunk sizes from 1 to 32 bytes
+(average 16). These are rather tight constraints because of the 255-byte buffer limit. First approach:
+
+ blocks: 903 - this is the actual number of malloc() calls
+ singles: 214, 23.70% of all blocks
+ waste: 20.16% - total memory that was allocated but not requested by the user
+ block ghosts 10.04%, plus single block ghosts 3.12%
+ chunk ghosts 4.55%
+ neglected block tails 2.45%
+
+After adding a test for the leftover space that helps in the 'neglect remaining space or make a sole-chunk block' decision:
+
+ blocks: 723 - way better
+ singles 0
+ waste: 19.04% - slightly better
+ block ghosts 10.67%
+ chunk ghosts 4.61%
+ neglected block tails 3.76%
+
+The actual numbers vary depending on the buffer size, the average element size and, of course, the alignment variant used.
+After some parameter tuning, on various tests we get 5-19% total waste for stocks, 3-14% total waste for heaps. But the basic
+scheme of the statistics remains similar: we take relatively lots of space for block ghosts (5-10% of total memory taken), some
+inevitable space for chunk ghosts (varies, 4-13% on various tests), and a little waste of neglected block tails (2-4%). Quite
+surprisingly, block ghosts are, in sum, often more significant than individual chunk ghosts (for the test above over half of
+all the waste!). The number of block ghosts (equal to the number of blocks) mostly depends on the block size vs chunk size
+relation. But seemingly it is worth bothering about the size of the block ghost and the number of blocks taken - the fewer the
+better. The waste of ghosts of individual objects (stock and pool) is inevitable, and depends only on the number/size of
+objects taken. We can't use smaller ghosts, we can't do better. Anyway, the least significant is the waste of neglected block
+tails.
+
+Pool stats are pretty similar, but more predictable because of the known chunk size. A pool block ghost is a somewhat larger
+structure because it keeps ->nextsquot / ->prevsquot pointers alongside ->next / ->prev. On the other hand, it doesn't need an
+->unused counter, as for fixed-length chunks it can always be computed from the refcount and the used data. The somewhat larger
+block ghost structure is also compensated by the fact that there is no block tail waste and no 'neglect or not' problem.
+
+Alignment
+=========
+
+Each allocator has 4 variants for 1, 2, 4, 8 bytes alignment respectively. Eg. stock32_take() always returns a pointer aligned
+to 4 bytes, heap64_take() returns a pointer aligned to 8 bytes. You can ask for any data length, but in practice you'll always
+obtain 1N, 2N, 4N or 8N. Alignment implies data padding unless the user requests "aligned" sizes. In the statistics the padding
+is not considered a waste.
+
+Zeroing
+=======
+
+Heap, stock and pool may all return zeroed memory chunks, depending on the initial flags:
+
+ HEAP_ZERO
+ STOCK_ZERO
+ POOL_ZERO
+
+There are also take0() variants that simply return memset(take(), 0, size), regardless of the flag.
+*/
+
+#ifndef UTIL_MEM_ALLC_C
+#define UTIL_MEM_ALLC_C
+
+/*
+Common internals for the allocators suite. A selection of (or all) the following defines (from the api headers) should already be there:
+
+ UTIL_MEM_HEAP_H // utilmemheap.h
+ UTIL_MEM_STOCK_H // utilmemstock.h
+ UTIL_MEM_POOL_H // utilmempool.h
+
+*/
+
+#include <string.h> // memset()
+#include <stdio.h> // printf()
+
+#include "utilmem.h"
+
+//#if defined(DEBUG) && debug != 0
+#if 1
+# define ASSERT8(cond) ((void)((cond) || (printf("8bit allocator assertion, %s:%d: %s\n", __FILE__, __LINE__, #cond), 0)))
+# define ASSERT16(cond) ((void)((cond) || (printf("16bit allocator assertion, %s:%d: %s\n", __FILE__, __LINE__, #cond), 0)))
+# define ASSERT32(cond) ((void)((cond) || (printf("32bit allocator assertion, %s:%d: %s\n", __FILE__, __LINE__, #cond), 0)))
+# define ASSERT64(cond) ((void)((cond) || (printf("64bit allocator assertion, %s:%d: %s\n", __FILE__, __LINE__, #cond), 0)))
+#else
+# define ASSERT8(cond) (void)0
+# define ASSERT16(cond) (void)0
+# define ASSERT32(cond) (void)0
+# define ASSERT64(cond) (void)0
+#endif
+
+#if defined(UTIL_MEM_STOCK_H) || defined(UTIL_MEM_POOL_H)
+struct ghost8{
+ uint8_t offset;
+};
+
+struct ghost16 {
+ uint16_t offset;
+};
+
+#ifdef BIT32
+struct ghost32 {
+ union {
+#ifdef UTIL_MEM_STOCK_H
+ ream32 *ream;
+#endif
+#ifdef UTIL_MEM_POOL_H
+ pile32 *pile;
+ ghost32 *nextfree;
+#endif
+ void *block;
+ };
+};
+#else
+struct ghost32 {
+ uint32_t offset;
+};
+#endif
+
+struct ghost64 {
+ union {
+#ifdef UTIL_MEM_STOCK_H
+ ream64 *ream;
+#endif
+#ifdef UTIL_MEM_POOL_H
+ pile64 *pile;
+ ghost64 *nextfree;
+#endif
+ void *block;
+ };
+#ifdef BIT32
+ uint8_t dummy[4]; // force 8
+#endif
+};
+#endif
+
+/*
+All the offset-related macro horror is here. A block is 4/8-bytes aligned (32/64 pointer size), ream->data is adjusted to 1/2/4/8 bytes accordingly.
+Therefore all offsets we store and pointers we cast should be properly aligned. In all cases, sizes and offsets refer to bytes.
+We need data ghosts only to access the block. For 8 and 16 we use 8/16 bit offsets to keep the ghost smaller. For 32 and 64 we either use an offset,
+or a pointer to the ream.
+
+malloc() is obviously expected to return a pointer properly aligned for all standard C types. For 64-bit we can safely expect at least 8-byte alignment
+(at least, because long double may need 16 bytes on gcc64, or 8 bytes on msvc64, or something weird on some exotics). On 32-bit machines pointers are
+4-bytes aligned, even long long is 4-bytes aligned. But a double on a 32-bit machine is 8-bytes aligned on Windows, 4-bytes aligned on Linux (the
+compiler option -malign-double makes it 8-bytes aligned). Anyway, we cannot expect that on a 32-bit machine the result of malloc is always 8-bytes
+aligned. This requires a very special treatment of the 64-variant on a 32-bit machine: the first data ghost may need to be 4 bytes off. Should we
+ensure 4 bytes more from malloc just in case? Hmm, padding will be there anyway, as we adjust ream->data size to byte boundaries.
+
+In both 32/64-bit environments, the ghost keeps a pointer to the block. On a 32-bit machine, the first chunk ghost address may need to be +4,
+as this is not ensured by malloc(). See struct ream64 {}. We have an extra test; the final ghost pointer will be properly aligned iff
+
+ (((block & 7) == 0) && ((sizeof(block64) & 7) == 0)) || (((block & 7) == 4) && ((sizeof(block64) & 7) == 4))
+
+or in short
+
+ ((block + 1) & 7) == 0
+
+otherwise it needs 4 bytes offset.
+*/
+
+#define pointer_tointeger(p) ((size_t)(p)) // & not allowed on pointer
+
+#define pointer_aligned32(p) ((pointer_tointeger(p) & 3) == 0)
+#define pointer_aligned64(p) ((pointer_tointeger(p) & 7) == 0)
+
+#define void_data(data) ((void *)(data))
+#define byte_data(data) ((uint8_t *)(data))
+
+/* top of the block ghost */
+
+#define block_top(block) (byte_data(block + 1))
+
+/* where the data begins */
+
+#define block_edge8(block) block_top(block)
+#define block_edge16(block) block_top(block)
+#define block_edge32(block) block_top(block)
+
+#ifdef BIT32
+# define ALIGN64ON32(block) (pointer_aligned64(block + 1) ? 0 : 4)
+# define block_edge64(block) (block_top(block) + ALIGN64ON32(block))
+#else
+# define block_edge64(block) block_top(block)
+#endif
+
+#define block_left8(block, size) (size)
+#define block_left16(block, size) (size)
+#define block_left32(block, size) (size)
+#ifdef BIT32
+# define block_left64(block, size) (size - ALIGN64ON32(block))
+#else
+# define block_left64(block, size) (size)
+#endif
+
+/* consumed block space; it is important to use edge() macros that involves ALIGN64ON32() */
+
+#define block_used8(block) (block->data - block_edge8(block))
+#define block_used16(block) (block->data - block_edge16(block))
+#define block_used32(block) (block->data - block_edge32(block))
+#define block_used64(block) (block->data - block_edge64(block))
+
+/* align requested size to keep ream->data / pyre->data always aligned. size is always size_t, no insane overflow checks */
+
+#define align_size8(size) ((void)size)
+#define align_size16(size) (size = aligned_size16(size))
+#define align_size32(size) (size = aligned_size32(size))
+#define align_size64(size) (size = aligned_size64(size))
+
+/*
+The done() and pop() operations decrement the block->left space by an aligned size; block->left -= alignedwritten. Let's take an
+8-bytes aligned variant block. If we tell the user there are 15 bytes left (block->left == 15) and the user takes 12: aligned
+that is 16, and we cannot subtract it. We could eventually set block->left to 0, but then the pop() operation would not be
+possible. Hence, block->left must be aligned. The procedure is different than for size (size_t); we cannot cross the
+0xff/0xffff/... boundaries.
+*/
+
+#define align_space8(space) ((void)space)
+#define align_space16(space) (space = aligned_space16(space))
+#define align_space32(space) (space = aligned_space32(space))
+#define align_space64(space) (space = aligned_space64(space))
+
+/* handling ghost structure (stock and pool) */
+
+#if defined(UTIL_MEM_STOCK_H) || defined(UTIL_MEM_POOL_H)
+
+/* ghost offset from the block top; not from the bottom because we must not exceed the offset limit */
+
+#define ghost_offset(block, ghost) (byte_data(ghost) - block_top(block))
+
+/* ghost <-> data */
+
+#define ghost_data(ghost) ((void *)(ghost + 1))
+
+/* cast from data to ghost structure goes via (void *) to shut up warnings, alignment ok */
+
+#define data_ghost8(data) (((ghost8 *)void_data(data)) - 1)
+#define data_ghost16(data) (((ghost16 *)void_data(data)) - 1)
+#define data_ghost32(data) (((ghost32 *)void_data(data)) - 1)
+#define data_ghost64(data) (((ghost64 *)void_data(data)) - 1)
+
+/* ghost <-> block */
+
+#define ghost_block8(ghost, block8) ((block8 *)void_data(byte_data(ghost) - ghost->offset - sizeof(block8)))
+#define ghost_block16(ghost, block16) ((block16 *)void_data(byte_data(ghost) - ghost->offset - sizeof(block16)))
+#ifdef BIT32
+# define ghost_block32(ghost, block32) (ghost->block)
+#else
+# define ghost_block32(ghost, block32) ((block32 *)void_data(byte_data(ghost) - ghost->offset - sizeof(block32)))
+#endif
+#define ghost_block64(ghost, block64) (ghost->block)
+
+/* ghost init */
+
+#define ghost_next8(block, ghost) ((ghost = block->dataghost), (ghost->offset = (uint8_t)ghost_offset(block, ghost)))
+#define ghost_next16(block, ghost) ((ghost = block->dataghost), (ghost->offset = (uint16_t)ghost_offset(block, ghost)))
+#ifdef BIT32
+# define ghost_next32(bl0ck, ghost) ((ghost = bl0ck->dataghost), (ghost->block = bl0ck))
+#else
+# define ghost_next32(block, ghost) ((ghost = block->dataghost), (ghost->offset = (uint32_t)ghost_offset(block, ghost)))
+#endif
+#define ghost_next64(bl0ck, ghost) ((ghost = bl0ck->dataghost), (ghost->block = bl0ck))
+
+#endif
+
+/* average block chunk size */
+
+#define average_block_chunk8(ream) (block_used8(ream) / ream->chunks)
+#define average_block_chunk16(ream) (block_used16(ream) / ream->chunks)
+#define average_block_chunk32(ream) (block_used32(ream) / ream->chunks)
+#define average_block_chunk64(ream) (block_used64(ream) / ream->chunks)
+
+/*
+neglect the remaining block tail and start a new block, or create a single block; the test for (block->chunks > 0) is a sanity check;
+if block->chunks is zero (the block has its full space left), we shouldn't get there, except when alloc->large is larger than alloc->space
+*/
+
+#define take_new_block8(alloc, ghoststruct, block, size) \
+ ((size < alloc->large) && (block->left <= sizeof(ghoststruct) || (block->chunks > 0 && block->left <= average_block_chunk8(block))))
+#define take_new_block16(alloc, ghoststruct, block, size) \
+ ((size < alloc->large) && (block->left <= sizeof(ghoststruct) || (block->chunks > 0 && block->left <= average_block_chunk16(block))))
+#define take_new_block32(alloc, ghoststruct, block, size) \
+ ((size < alloc->large) && (block->left <= sizeof(ghoststruct) || (block->chunks > 0 && block->left <= average_block_chunk32(block))))
+#define take_new_block64(alloc, ghoststruct, block, size) \
+ ((size < alloc->large) && (block->left <= sizeof(ghoststruct) || (block->chunks > 0 && block->left <= average_block_chunk64(block))))
+
+/* empty */
+
+#define head_block_empty(alloc, block) (((block = alloc->head) == NULL) || (block->chunks == 0 && block->prev == NULL))
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilmemallh.h b/source/luametatex/source/libraries/pplib/util/utilmemallh.h
new file mode 100644
index 000000000..a543d1acb
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmemallh.h
@@ -0,0 +1,36 @@
+
+#ifndef UTIL_MEM_ALLH_H
+#define UTIL_MEM_ALLH_H
+
+#include <stddef.h> // size_t
+#include <stdint.h>
+
+#include "utildecl.h"
+
+typedef struct ghost8 ghost8;
+typedef struct ghost16 ghost16;
+typedef struct ghost32 ghost32;
+typedef struct ghost64 ghost64;
+
+#define aligned_size8(size) (size)
+#define aligned_size16(size) ((((size) + 1) >> 1) << 1)
+#define aligned_size32(size) ((((size) + 3) >> 2) << 2)
+#define aligned_size64(size) ((((size) + 7) >> 3) << 3)
+
+#define aligned_space8(size) (size)
+#define aligned_space16(size) (((size) & 1) ? ((size) < 0xFFFF ? ((size) + 1) : ((size) - 1)) : (size))
+#define aligned_space32(size) (((size) & 3) ? ((size) < 0xFFFFFFFD ? ((size) - ((size) & 3) + 4) : (size) - ((size) & 3)) : (size))
+#define aligned_space64(size) (((size) & 7) ? ((size) < 0xFFFFFFFFFFFFFFF8ULL ? ((size) - ((size) & 7) + 8) : (size) - ((size) & 7)) : (size))
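+
+/* worked examples (plain arithmetic): aligned_size32(13) == 16, aligned_size32(16) == 16; aligned_space16(0xFFFF) == 0xFFFE,
+ rounding down rather than up so that a uint16_t space value stays aligned without overflowing */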
+
+/* info stub */
+
+typedef struct {
+ size_t blocks, singles;
+ size_t chunks, unused;
+ size_t used, singleused, left;
+ size_t ghosts, blockghosts, singleghosts;
+} mem_info;
+
+#define MEM_INFO_INIT() = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilmemheap.c b/source/luametatex/source/libraries/pplib/util/utilmemheap.c
new file mode 100644
index 000000000..f4a6b8814
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmemheap.c
@@ -0,0 +1,1078 @@
+
+#include "utilmemheap.h"
+#include "utilmemallc.h"
+
+#define pyre_alloc8(heap, space) ((pyre8 *)((heap->flags & HEAP_ZERO) ? util_calloc(1, sizeof(pyre8) + space * sizeof(uint8_t)) : util_malloc(sizeof(pyre8) + space * sizeof(uint8_t))))
+#define pyre_alloc16(heap, space) ((pyre16 *)((heap->flags & HEAP_ZERO) ? util_calloc(1, sizeof(pyre16) + space * sizeof(uint8_t)) : util_malloc(sizeof(pyre16) + space * sizeof(uint8_t))))
+#define pyre_alloc32(heap, space) ((pyre32 *)((heap->flags & HEAP_ZERO) ? util_calloc(1, sizeof(pyre32) + space * sizeof(uint8_t)) : util_malloc(sizeof(pyre32) + space * sizeof(uint8_t))))
+#define pyre_alloc64(heap, space) ((pyre64 *)((heap->flags & HEAP_ZERO) ? util_calloc(1, sizeof(pyre64) + space * sizeof(uint8_t)) : util_malloc(sizeof(pyre64) + space * sizeof(uint8_t))))
+
+#define pyre_free(pyre) util_free(pyre)
+
+/* block reset */
+
+#define reset_heap_head8(heap, pyre, used) \
+ ((used = block_used8(pyre)), (pyre->data -= used), ((heap->flags & HEAP_ZERO) ? (memset(pyre->data, 0, used), 0) : 0), (pyre->left += (uint8_t)used))
+#define reset_heap_head16(heap, pyre, used) \
+ ((used = block_used16(pyre)), (pyre->data -= used), ((heap->flags & HEAP_ZERO) ? (memset(pyre->data, 0, used), 0) : 0), (pyre->left += (uint16_t)used))
+#define reset_heap_head32(heap, pyre, used) \
+ ((used = block_used32(pyre)), (pyre->data -= used), ((heap->flags & HEAP_ZERO) ? (memset(pyre->data, 0, used), 0) : 0), (pyre->left += (uint32_t)used))
+#define reset_heap_head64(heap, pyre, used) \
+ ((used = block_used64(pyre)), (pyre->data -= used), ((heap->flags & HEAP_ZERO) ? (memset(pyre->data, 0, used), 0) : 0), (pyre->left += (uint64_t)used))
+
+/* init heap */
+
+heap8 * heap8_init (heap8 *heap, uint8_t space, uint8_t large, uint8_t flags)
+{
+ align_space8(space);
+ if (large > space) large = space;
+ heap->head = NULL;
+ heap->space = space;
+ heap->large = large;
+ heap->flags = flags;
+ return heap;
+}
+
+heap16 * heap16_init (heap16 *heap, uint16_t space, uint16_t large, uint8_t flags)
+{
+ align_space16(space);
+ if (large > space) large = space;
+ heap->head = NULL;
+ heap->space = space;
+ heap->large = large;
+ heap->flags = flags;
+ return heap;
+}
+
+heap32 * heap32_init (heap32 *heap, uint32_t space, uint32_t large, uint8_t flags)
+{
+ align_space32(space);
+ if (large > space) large = space;
+ heap->head = NULL;
+ heap->space = space;
+ heap->large = large;
+ heap->flags = flags;
+ return heap;
+}
+
+heap64 * heap64_init (heap64 *heap, uint64_t space, uint64_t large, uint8_t flags)
+{
+ align_space64(space);
+ if (large > space) large = space;
+ heap->head = NULL;
+ heap->space = space;
+ heap->large = large;
+ heap->flags = flags;
+ return heap;
+}
+
+/* free heap */
+
+void heap8_free (heap8 *heap)
+{
+ pyre8 *pyre, *prev;
+ pyre = heap->head;
+ heap->head = NULL;
+ while (pyre != NULL)
+ {
+ prev = pyre->prev;
+ pyre_free(pyre);
+ pyre = prev;
+ }
+}
+
+void heap16_free (heap16 *heap)
+{
+ pyre16 *pyre, *prev;
+ pyre = heap->head;
+ heap->head = NULL;
+ while (pyre != NULL)
+ {
+ prev = pyre->prev;
+ pyre_free(pyre);
+ pyre = prev;
+ }
+}
+
+void heap32_free (heap32 *heap)
+{
+ pyre32 *pyre, *prev;
+ pyre = heap->head;
+ heap->head = NULL;
+ while (pyre != NULL)
+ {
+ prev = pyre->prev;
+ pyre_free(pyre);
+ pyre = prev;
+ }
+}
+
+void heap64_free (heap64 *heap)
+{
+ pyre64 *pyre, *prev;
+ pyre = heap->head;
+ heap->head = NULL;
+ while (pyre != NULL)
+ {
+ prev = pyre->prev;
+ pyre_free(pyre);
+ pyre = prev;
+ }
+}
+
+/* clear heap */
+
+void heap8_clear (heap8 *heap)
+{
+ pyre8 *pyre, *prev;
+ size_t used;
+ if ((pyre = heap->head) == NULL)
+ return;
+ prev = pyre->prev;
+ pyre->prev = NULL;
+ reset_heap_head8(heap, pyre, used);
+ for (; prev != NULL; prev = pyre)
+ {
+ pyre = prev->prev;
+ pyre_free(prev);
+ }
+}
+
+void heap16_clear (heap16 *heap)
+{
+ pyre16 *pyre, *prev;
+ size_t used;
+ if ((pyre = heap->head) == NULL)
+ return;
+ prev = pyre->prev;
+ pyre->prev = NULL;
+ reset_heap_head16(heap, pyre, used);
+ for (; prev != NULL; prev = pyre)
+ {
+ pyre = prev->prev;
+ pyre_free(prev);
+ }
+}
+
+void heap32_clear (heap32 *heap)
+{
+ pyre32 *pyre, *prev;
+ size_t used;
+ if ((pyre = heap->head) == NULL)
+ return;
+ prev = pyre->prev;
+ pyre->prev = NULL;
+ reset_heap_head32(heap, pyre, used);
+ for (; prev != NULL; prev = pyre)
+ {
+ pyre = prev->prev;
+ pyre_free(prev);
+ }
+}
+
+void heap64_clear (heap64 *heap)
+{
+ pyre64 *pyre, *prev;
+ size_t used;
+ if ((pyre = heap->head) == NULL)
+ return;
+ prev = pyre->prev;
+ pyre->prev = NULL;
+ reset_heap_head64(heap, pyre, used);
+ for (; prev != NULL; prev = pyre)
+ {
+ pyre = prev->prev;
+ pyre_free(prev);
+ }
+}
+
+/* heap head */
+
+void heap8_head (heap8 *heap)
+{
+ pyre8 *pyre;
+ heap->head = pyre = pyre_alloc8(heap, heap->space);
+ pyre->prev = NULL;
+ pyre->data = block_edge8(pyre);
+ pyre->left = block_left8(pyre, heap->space);
+ pyre->chunks = 0;
+}
+
+void heap16_head (heap16 *heap)
+{
+ pyre16 *pyre;
+ heap->head = pyre = pyre_alloc16(heap, heap->space);
+ pyre->prev = NULL;
+ pyre->data = block_edge16(pyre);
+ pyre->left = block_left16(pyre, heap->space);
+ pyre->chunks = 0;
+}
+
+void heap32_head (heap32 *heap)
+{
+ pyre32 *pyre;
+ heap->head = pyre = pyre_alloc32(heap, heap->space);
+ pyre->prev = NULL;
+ pyre->data = block_edge32(pyre);
+ pyre->left = block_left32(pyre, heap->space);
+ pyre->chunks = 0;
+}
+
+void heap64_head (heap64 *heap)
+{
+ pyre64 *pyre;
+ heap->head = pyre = pyre_alloc64(heap, heap->space);
+ pyre->prev = NULL;
+ pyre->data = block_edge64(pyre);
+ pyre->left = block_left64(pyre, heap->space);
+ pyre->chunks = 0;
+}
+
+/* next heap head */
+
+static pyre8 * heap8_new (heap8 *heap)
+{
+ pyre8 *pyre;
+ pyre = pyre_alloc8(heap, heap->space);
+ pyre->prev = heap->head;
+ heap->head = pyre;
+ pyre->data = block_edge8(pyre);
+ pyre->left = block_left8(pyre, heap->space);
+ pyre->chunks = 0;
+ return pyre;
+}
+
+static pyre16 * heap16_new (heap16 *heap)
+{
+ pyre16 *pyre;
+ pyre = pyre_alloc16(heap, heap->space);
+ pyre->prev = heap->head;
+ heap->head = pyre;
+ pyre->data = block_edge16(pyre);
+ pyre->left = block_left16(pyre, heap->space);
+ pyre->chunks = 0;
+ return pyre;
+}
+
+static pyre32 * heap32_new (heap32 *heap)
+{
+ pyre32 *pyre;
+ pyre = pyre_alloc32(heap, heap->space);
+ pyre->prev = heap->head;
+ heap->head = pyre;
+ pyre->data = block_edge32(pyre);
+ pyre->left = block_left32(pyre, heap->space);
+ pyre->chunks = 0;
+ return pyre;
+}
+
+static pyre64 * heap64_new (heap64 *heap)
+{
+ pyre64 *pyre;
+ pyre = pyre_alloc64(heap, heap->space);
+ pyre->prev = heap->head;
+ heap->head = pyre;
+ pyre->data = block_edge64(pyre);
+ pyre->left = block_left64(pyre, heap->space);
+ pyre->chunks = 0;
+ return pyre;
+}
+
+/* next heap sole */
+
+static pyre8 * heap8_sole (heap8 *heap, size_t size)
+{
+ pyre8 *pyre, *head, *prev;
+ pyre = pyre_alloc8(heap, size);
+ head = heap->head;
+ prev = head->prev;
+ pyre->prev = prev;
+ head->prev = pyre;
+ pyre->data = block_edge8(pyre);
+ pyre->left = 0; // not (uint8_t)size; that would make no sense, even with the buffer api it will finally become 0
+ return pyre;
+}
+
+static pyre16 * heap16_sole (heap16 *heap, size_t size)
+{
+ pyre16 *pyre, *head, *prev;
+ pyre = pyre_alloc16(heap, size);
+ head = heap->head;
+ prev = head->prev;
+ pyre->prev = prev;
+ head->prev = pyre;
+ pyre->data = block_edge16(pyre);
+ pyre->left = 0;
+ return pyre;
+}
+
+static pyre32 * heap32_sole (heap32 *heap, size_t size)
+{
+ pyre32 *pyre, *head, *prev;
+ pyre = pyre_alloc32(heap, size);
+ head = heap->head;
+ prev = head->prev;
+ pyre->prev = prev;
+ head->prev = pyre;
+ pyre->data = block_edge32(pyre);
+ pyre->left = 0;
+ return pyre;
+}
+
+static pyre64 * heap64_sole (heap64 *heap, size_t size)
+{
+ pyre64 *pyre, *head, *prev;
+ pyre = pyre_alloc64(heap, size);
+ head = heap->head;
+ prev = head->prev;
+ pyre->prev = prev;
+ head->prev = pyre;
+ pyre->data = block_edge64(pyre);
+ pyre->left = 0;
+ return pyre;
+}
+
+/* take from heap */
+
+#define pyre_next8(d, pyre, size) (d = pyre->data, pyre->data += size, pyre->left -= (uint8_t)size, ++pyre->chunks)
+#define pyre_next16(d, pyre, size) (d = pyre->data, pyre->data += size, pyre->left -= (uint16_t)size, ++pyre->chunks)
+#define pyre_next32(d, pyre, size) (d = pyre->data, pyre->data += size, pyre->left -= (uint32_t)size, ++pyre->chunks)
+#define pyre_next64(d, pyre, size) (d = pyre->data, pyre->data += size, pyre->left -= (uint64_t)size, ++pyre->chunks)
+
+// for sole blocks, block->left is permanently 0, we can't store size_t there
+#define pyre_last8(d, pyre, size) (d = pyre->data, pyre->data += size, pyre->chunks = 1)
+#define pyre_last16(d, pyre, size) (d = pyre->data, pyre->data += size, pyre->chunks = 1)
+#define pyre_last32(d, pyre, size) (d = pyre->data, pyre->data += size, pyre->chunks = 1)
+#define pyre_last64(d, pyre, size) (d = pyre->data, pyre->data += size, pyre->chunks = 1)
+
+void * _heap8_take (heap8 *heap, size_t size)
+{
+ pyre8 *pyre;
+ void *data;
+ pyre = heap->head;
+ align_size8(size);
+ if (size <= pyre->left)
+ {
+ pyre_next8(data, pyre, size);
+ }
+ else if (take_new_block8(heap, pyre8, pyre, size))
+ {
+ pyre = heap8_new(heap);
+ pyre_next8(data, pyre, size);
+ }
+ else
+ {
+ pyre = heap8_sole(heap, size);
+ pyre_last8(data, pyre, size);
+ }
+ return data;
+}
+
+void * _heap16_take (heap16 *heap, size_t size)
+{
+ pyre16 *pyre;
+ void *data;
+ pyre = heap->head;
+ align_size16(size);
+ if (size <= pyre->left)
+ {
+ pyre_next16(data, pyre, size);
+ }
+ else if (take_new_block16(heap, pyre16, pyre, size))
+ {
+ pyre = heap16_new(heap);
+ pyre_next16(data, pyre, size);
+ }
+ else
+ {
+ pyre = heap16_sole(heap, size);
+ pyre_last16(data, pyre, size);
+ }
+ return data;
+}
+
+void * _heap32_take (heap32 *heap, size_t size)
+{
+ pyre32 *pyre;
+ void *data;
+ pyre = heap->head;
+ align_size32(size);
+ if (size <= pyre->left)
+ {
+ pyre_next32(data, pyre, size);
+ }
+ else if (take_new_block32(heap, pyre32, pyre, size))
+ {
+ pyre = heap32_new(heap);
+ pyre_next32(data, pyre, size);
+ }
+ else
+ {
+ pyre = heap32_sole(heap, size);
+ pyre_last32(data, pyre, size);
+ }
+ return data;
+}
+
+void * _heap64_take (heap64 *heap, size_t size)
+{
+ pyre64 *pyre;
+ void *data;
+ pyre = heap->head;
+ align_size64(size);
+ if (size <= pyre->left)
+ {
+ pyre_next64(data, pyre, size);
+ }
+ else if (take_new_block64(heap, pyre64, pyre, size))
+ {
+ pyre = heap64_new(heap);
+ pyre_next64(data, pyre, size);
+ }
+ else
+ {
+ pyre = heap64_sole(heap, size);
+ pyre_last64(data, pyre, size);
+ }
+ return data;
+}
+
+void * _heap8_take0 (heap8 *heap, size_t size)
+{
+ return memset(_heap8_take(heap, size), 0, size);
+}
+
+void * _heap16_take0 (heap16 *heap, size_t size)
+{
+ return memset(_heap16_take(heap, size), 0, size);
+}
+
+void * _heap32_take0 (heap32 *heap, size_t size)
+{
+ return memset(_heap32_take(heap, size), 0, size);
+}
+
+void * _heap64_take0 (heap64 *heap, size_t size)
+{
+ return memset(_heap64_take(heap, size), 0, size);
+}
+
+/* pop last heap chunk */
+
+#define taken_from_head(taken, head) (byte_data(taken) == head->data)
+#define taken_from_sole(taken, head, sole) ((sole = head->prev) != NULL && byte_data(taken) == sole->data)
+
+#define taken_prev_head(taken, head, size) (byte_data(taken) == head->data - size)
+#define taken_prev_sole(taken, head, sole, size) ((sole = head->prev) != NULL && byte_data(taken) == sole->data - size)
+
+void heap8_pop (heap8 *heap, void *taken, size_t size)
+{
+ pyre8 *pyre, *head;
+ head = heap->head;
+ align_size8(size);
+ if (taken_prev_head(taken, head, size))
+ {
+ head->data -= size;
+ head->left += (uint8_t)size;
+ --head->chunks;
+ }
+ else if (taken_prev_sole(taken, head, pyre, size))
+ {
+ head->prev = pyre->prev;
+ pyre_free(pyre);
+ }
+ else
+ {
+ ASSERT8(0);
+ }
+}
+
+void heap16_pop (heap16 *heap, void *taken, size_t size)
+{
+ pyre16 *pyre, *head;
+ head = heap->head;
+ align_size16(size);
+ if (taken_prev_head(taken, head, size))
+ {
+ head->data -= size;
+ head->left += (uint16_t)size;
+ --head->chunks;
+ }
+ else if (taken_prev_sole(taken, head, pyre, size))
+ {
+ head->prev = pyre->prev;
+ pyre_free(pyre);
+ }
+ else
+ {
+ ASSERT16(0);
+ }
+}
+
+void heap32_pop (heap32 *heap, void *taken, size_t size)
+{
+ pyre32 *pyre, *head;
+ head = heap->head;
+ align_size32(size);
+ if (taken_prev_head(taken, head, size))
+ {
+ head->data -= size;
+ head->left += (uint32_t)size;
+ --head->chunks;
+ }
+ else if (taken_prev_sole(taken, head, pyre, size))
+ {
+ head->prev = pyre->prev;
+ pyre_free(pyre);
+ }
+ else
+ {
+ ASSERT32(0);
+ }
+}
+
+void heap64_pop (heap64 *heap, void *taken, size_t size)
+{
+ pyre64 *pyre, *head;
+ head = heap->head;
+ align_size64(size);
+ if (taken_prev_head(taken, head, size))
+ {
+ head->data -= size;
+ head->left += (uint64_t)size;
+ --head->chunks;
+ }
+ else if (taken_prev_sole(taken, head, pyre, size))
+ {
+ head->prev = pyre->prev;
+ pyre_free(pyre);
+ }
+ else
+ {
+ ASSERT64(0);
+ }
+}
+
+/* heap buffer */
+
+void * _heap8_some (heap8 *heap, size_t size, size_t *pspace)
+{
+ pyre8 *pyre;
+ pyre = heap->head;
+ align_size8(size);
+ if (size <= pyre->left)
+ {
+ *pspace = pyre->left;
+ }
+ else if (take_new_block8(heap, pyre8, pyre, size))
+ {
+ pyre = heap8_new(heap);
+ *pspace = pyre->left;
+ }
+ else
+ {
+ pyre = heap8_sole(heap, size);
+ *pspace = size;
+ }
+ return void_data(pyre->data);
+}
+
+void * _heap16_some (heap16 *heap, size_t size, size_t *pspace)
+{
+ pyre16 *pyre;
+ pyre = heap->head;
+ align_size16(size);
+ if (size <= pyre->left)
+ {
+ *pspace = pyre->left;
+ }
+ else if (take_new_block16(heap, pyre16, pyre, size))
+ {
+ pyre = heap16_new(heap);
+ *pspace = pyre->left;
+ }
+ else
+ {
+ pyre = heap16_sole(heap, size);
+ *pspace = size;
+ }
+ return void_data(pyre->data);
+}
+
+void * _heap32_some (heap32 *heap, size_t size, size_t *pspace)
+{
+ pyre32 *pyre;
+ pyre = heap->head;
+ align_size32(size);
+ if (size <= pyre->left)
+ {
+ *pspace = pyre->left;
+ }
+ else if (take_new_block32(heap, pyre32, pyre, size))
+ {
+ pyre = heap32_new(heap);
+ *pspace = pyre->left;
+ }
+ else
+ {
+ pyre = heap32_sole(heap, size);
+ *pspace = size;
+ }
+ return void_data(pyre->data);
+}
+
+void * _heap64_some (heap64 *heap, size_t size, size_t *pspace)
+{
+ pyre64 *pyre;
+ pyre = heap->head;
+ align_size64(size);
+ if (size <= pyre->left)
+ {
+ *pspace = pyre->left;
+ }
+ else if (take_new_block64(heap, pyre64, pyre, size))
+ {
+ pyre = heap64_new(heap);
+ *pspace = pyre->left;
+ }
+ else
+ {
+ pyre = heap64_sole(heap, size);
+ *pspace = size;
+ }
+ return void_data(pyre->data);
+}
+
+void * heap8_more (heap8 *heap, void *taken, size_t written, size_t size, size_t *pspace)
+{
+ pyre8 *pyre, *prev;
+ pyre = heap->head;
+ align_size8(size);
+ if (taken_from_head(taken, pyre))
+ {
+ if (size <= pyre->left)
+ {
+ *pspace = pyre->left;
+ }
+ else if (take_new_block8(heap, pyre8, pyre, size))
+ {
+ pyre = heap8_new(heap);
+ memcpy(pyre->data, taken, written);
+ *pspace = pyre->left;
+ }
+ else
+ {
+ pyre = heap8_sole(heap, size);
+ memcpy(pyre->data, taken, written);
+ *pspace = size;
+ }
+ }
+ else if (taken_from_sole(taken, pyre, prev))
+ {
+ pyre = heap8_sole(heap, size);
+ memcpy(pyre->data, taken, written);
+ *pspace = size;
+ pyre->prev = prev->prev;
+ pyre_free(prev);
+ }
+ else
+ {
+ ASSERT8(0);
+ *pspace = 0;
+ return NULL;
+ }
+ return void_data(pyre->data);
+}
+
+void * heap16_more (heap16 *heap, void *taken, size_t written, size_t size, size_t *pspace)
+{
+ pyre16 *pyre, *prev;
+ pyre = heap->head;
+ align_size16(size);
+ if (taken_from_head(taken, pyre))
+ {
+ if (size <= pyre->left)
+ {
+ *pspace = pyre->left;
+ }
+ else if (take_new_block16(heap, pyre16, pyre, size))
+ {
+ pyre = heap16_new(heap);
+ memcpy(pyre->data, taken, written);
+ *pspace = pyre->left;
+ }
+ else
+ {
+ pyre = heap16_sole(heap, size);
+ memcpy(pyre->data, taken, written);
+ *pspace = size;
+ }
+ }
+ else if (taken_from_sole(taken, pyre, prev))
+ {
+ pyre = heap16_sole(heap, size);
+ memcpy(pyre->data, taken, written);
+ *pspace = size;
+ pyre->prev = prev->prev;
+ pyre_free(prev);
+ }
+ else
+ {
+ ASSERT16(0);
+ *pspace = 0;
+ return NULL;
+ }
+ return void_data(pyre->data);
+}
+
+void * heap32_more (heap32 *heap, void *taken, size_t written, size_t size, size_t *pspace)
+{
+ pyre32 *pyre, *prev;
+ pyre = heap->head;
+ align_size32(size);
+ if (taken_from_head(taken, pyre))
+ {
+ if (size <= pyre->left)
+ {
+ *pspace = pyre->left;
+ }
+ else if (take_new_block32(heap, pyre32, pyre, size))
+ {
+ pyre = heap32_new(heap);
+ memcpy(pyre->data, taken, written);
+ *pspace = pyre->left;
+ }
+ else
+ {
+ pyre = heap32_sole(heap, size);
+ memcpy(pyre->data, taken, written);
+ *pspace = size;
+ }
+ }
+ else if (taken_from_sole(taken, pyre, prev))
+ {
+ pyre = heap32_sole(heap, size);
+ memcpy(pyre->data, taken, written);
+ *pspace = size;
+ pyre->prev = prev->prev;
+ pyre_free(prev);
+ }
+ else
+ {
+ ASSERT32(0);
+ *pspace = 0;
+ return NULL;
+ }
+ return void_data(pyre->data);
+}
+
+void * heap64_more (heap64 *heap, void *taken, size_t written, size_t size, size_t *pspace)
+{
+ pyre64 *pyre, *prev;
+ pyre = heap->head;
+ align_size64(size);
+ if (taken_from_head(taken, pyre))
+ {
+ if (size <= pyre->left)
+ {
+ *pspace = pyre->left;
+ }
+ else if (take_new_block64(heap, pyre64, pyre, size))
+ {
+ pyre = heap64_new(heap);
+ memcpy(pyre->data, taken, written);
+ *pspace = pyre->left;
+ }
+ else
+ {
+ pyre = heap64_sole(heap, size);
+ memcpy(pyre->data, taken, written);
+ *pspace = size;
+ }
+ }
+ else if (taken_from_sole(taken, pyre, prev))
+ {
+ pyre = heap64_sole(heap, size);
+ memcpy(pyre->data, taken, written);
+ *pspace = size;
+ pyre->prev = prev->prev;
+ pyre_free(prev);
+ }
+ else
+ {
+ ASSERT64(0);
+ *pspace = 0;
+ return NULL;
+ }
+ return void_data(pyre->data);
+}
+
+void heap8_done (heap8 *heap, void *taken, size_t written)
+{
+ pyre8 *pyre;
+ pyre = heap->head;
+ align_size8(written);
+ if (taken_from_head(taken, pyre))
+ {
+ pyre->data += written;
+ pyre->left -= (uint8_t)written;
+ ++pyre->chunks;
+ }
+ else if (taken_from_sole(taken, pyre, pyre))
+ {
+ pyre->data += written;
+ pyre->chunks = 1;
+ }
+ else
+ {
+ ASSERT8(0);
+ }
+}
+
+void heap16_done (heap16 *heap, void *taken, size_t written)
+{
+ pyre16 *pyre;
+ pyre = heap->head;
+ align_size16(written);
+ if (taken_from_head(taken, pyre))
+ {
+ pyre->data += written;
+ pyre->left -= (uint16_t)written;
+ ++pyre->chunks;
+ }
+ else if (taken_from_sole(taken, pyre, pyre))
+ {
+ pyre->data += written;
+ pyre->chunks = 1;
+ }
+ else
+ {
+ ASSERT16(0);
+ }
+}
+
+void heap32_done (heap32 *heap, void *taken, size_t written)
+{
+ pyre32 *pyre;
+ pyre = heap->head;
+ align_size32(written);
+ if (taken_from_head(taken, pyre))
+ {
+ pyre->data += written;
+ pyre->left -= (uint32_t)written;
+ ++pyre->chunks;
+ }
+ else if (taken_from_sole(taken, pyre, pyre))
+ {
+ pyre->data += written;
+ pyre->chunks = 1;
+ }
+ else
+ {
+ ASSERT32(0);
+ }
+}
+
+void heap64_done (heap64 *heap, void *taken, size_t written)
+{
+ pyre64 *pyre;
+ pyre = heap->head;
+ align_size64(written);
+ if (taken_from_head(taken, pyre))
+ {
+ pyre->data += written;
+ pyre->left -= (uint64_t)written;
+ ++pyre->chunks;
+ }
+ else if (taken_from_sole(taken, pyre, pyre))
+ {
+ pyre->data += written;
+ pyre->chunks = 1;
+ }
+ else
+ {
+ ASSERT64(0);
+ }
+}
+
+/* giveup */
+
+void heap8_giveup (heap8 *heap, void *taken)
+{
+ pyre8 *head, *pyre;
+ head = heap->head;
+ if (taken_from_sole(taken, head, pyre))
+ {
+ head->prev = pyre->prev;
+ pyre_free(pyre);
+ }
+}
+
+void heap16_giveup (heap16 *heap, void *taken)
+{
+ pyre16 *head, *pyre;
+ head = heap->head;
+ if (taken_from_sole(taken, head, pyre))
+ {
+ head->prev = pyre->prev;
+ pyre_free(pyre);
+ }
+}
+
+void heap32_giveup (heap32 *heap, void *taken)
+{
+ pyre32 *head, *pyre;
+ head = heap->head;
+ if (taken_from_sole(taken, head, pyre))
+ {
+ head->prev = pyre->prev;
+ pyre_free(pyre);
+ }
+}
+
+void heap64_giveup (heap64 *heap, void *taken)
+{
+ pyre64 *head, *pyre;
+ head = heap->head;
+ if (taken_from_sole(taken, head, pyre))
+ {
+ head->prev = pyre->prev;
+ pyre_free(pyre);
+ }
+}
+
+/* heap empty */
+
+int heap8_empty (heap8 *heap)
+{
+ pyre8 *pyre;
+ return head_block_empty(heap, pyre);
+}
+
+int heap16_empty (heap16 *heap)
+{
+ pyre16 *pyre;
+ return head_block_empty(heap, pyre);
+}
+
+int heap32_empty (heap32 *heap)
+{
+ pyre32 *pyre;
+ return head_block_empty(heap, pyre);
+}
+
+int heap64_empty (heap64 *heap)
+{
+ pyre64 *pyre;
+ return head_block_empty(heap, pyre);
+}
+
+/* heap stats */
+
+void heap8_stats (heap8 *heap, mem_info *info, int append)
+{
+ pyre8 *pyre;
+ size_t used, chunks = 0, blocks = 0, singles = 0;
+ if (!append)
+ memset(info, 0, sizeof(mem_info));
+ for (pyre = heap->head; pyre != NULL; pyre = pyre->prev)
+ {
+ ++blocks;
+ chunks += pyre->chunks;
+ used = block_used8(pyre);
+ info->used += used;
+ info->left += pyre->left;
+ if (pyre->chunks == 1 && pyre->left == 0)
+ {
+ ++singles;
+ info->singleused += used;
+ }
+ }
+ info->chunks += chunks;
+ info->blocks += blocks;
+ info->blockghosts += blocks * sizeof(pyre8);
+ info->singles += singles;
+ info->singleghosts += singles * sizeof(pyre8);
+}
+
+void heap16_stats (heap16 *heap, mem_info *info, int append)
+{
+ pyre16 *pyre;
+ size_t used, chunks = 0, blocks = 0, singles = 0;
+ if (!append)
+ memset(info, 0, sizeof(mem_info));
+ for (pyre = heap->head; pyre != NULL; pyre = pyre->prev)
+ {
+ ++blocks;
+ chunks += pyre->chunks;
+ used = block_used16(pyre);
+ info->used += used;
+ info->left += pyre->left;
+ if (pyre->chunks == 1 && pyre->left == 0)
+ {
+ ++singles;
+ info->singleused += used;
+ }
+ }
+ info->chunks += chunks;
+ info->blocks += blocks;
+ info->blockghosts += blocks * sizeof(pyre16);
+ info->singles += singles;
+ info->singleghosts += singles * sizeof(pyre16);
+}
+
+void heap32_stats (heap32 *heap, mem_info *info, int append)
+{
+ pyre32 *pyre;
+ size_t used, chunks = 0, blocks = 0, singles = 0;
+ if (!append)
+ memset(info, 0, sizeof(mem_info));
+ for (pyre = heap->head; pyre != NULL; pyre = pyre->prev)
+ {
+ ++blocks;
+ chunks += pyre->chunks;
+ used = block_used32(pyre);
+ info->used += used;
+ info->left += pyre->left;
+ if (pyre->chunks == 1 && pyre->left == 0)
+ {
+ ++singles;
+ info->singleused += used;
+ }
+ }
+ info->chunks += chunks;
+ info->blocks += blocks;
+ info->blockghosts += blocks * sizeof(pyre32);
+ info->singles += singles;
+ info->singleghosts += singles * sizeof(pyre32);
+}
+
+void heap64_stats (heap64 *heap, mem_info *info, int append)
+{
+ pyre64 *pyre;
+ size_t used, chunks = 0, blocks = 0, singles = 0;
+ if (!append)
+ memset(info, 0, sizeof(mem_info));
+ for (pyre = heap->head; pyre != NULL; pyre = pyre->prev)
+ {
+ ++blocks;
+ chunks += pyre->chunks;
+ used = block_used64(pyre);
+ info->used += used;
+ info->left += pyre->left;
+ if (pyre->chunks == 1 && pyre->left == 0)
+ {
+ ++singles;
+ info->singleused += used;
+ }
+ }
+ info->chunks += chunks;
+ info->blocks += blocks;
+ info->blockghosts += blocks * sizeof(pyre64);
+ info->singles += singles;
+ info->singleghosts += singles * sizeof(pyre64);
+}
diff --git a/source/luametatex/source/libraries/pplib/util/utilmemheap.h b/source/luametatex/source/libraries/pplib/util/utilmemheap.h
new file mode 100644
index 000000000..8776419c2
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmemheap.h
@@ -0,0 +1,188 @@
+
+#ifndef UTIL_MEM_HEAP_H
+#define UTIL_MEM_HEAP_H
+
+#include "utilmemallh.h"
+
+typedef struct pyre8 pyre8;
+typedef struct pyre16 pyre16;
+typedef struct pyre32 pyre32;
+typedef struct pyre64 pyre64;
+
+struct pyre8 {
+ pyre8 *prev;
+ uint8_t *data;
+ uint8_t left;
+ uint8_t chunks;
+#ifdef BIT32
+ uint8_t dummy[2]; // 10->12
+#else
+ uint8_t dummy[6]; // 18->24
+#endif
+};
+
+struct pyre16 {
+ pyre16 *prev;
+ uint8_t *data;
+ uint16_t left;
+ uint16_t chunks;
+#ifdef BIT32
+ //uint8_t dummy[0]; // 12->12
+#else
+ uint8_t dummy[4]; // 20->24
+#endif
+};
+
+struct pyre32 {
+ pyre32 *prev;
+ uint8_t *data;
+ uint32_t left;
+ uint32_t chunks;
+#ifdef BIT32
+ //uint8_t dummy[0]; // 16->16
+#else
+ //uint8_t dummy[0]; // 24->24
+#endif
+};
+
+struct pyre64 {
+ pyre64 *prev;
+ uint8_t *data;
+ uint64_t left;
+ uint64_t chunks;
+#ifdef BIT32
+ //uint8_t dummy[0]; // 24->24
+#else
+ //uint8_t dummy[0]; // 32->32
+#endif
+};
+
+/* heaps */
+
+typedef struct heap8 heap8;
+typedef struct heap16 heap16;
+typedef struct heap32 heap32;
+typedef struct heap64 heap64;
+
+struct heap8 {
+ pyre8 *head;
+ uint8_t space;
+ uint8_t large;
+ uint8_t flags;
+};
+
+struct heap16 {
+ pyre16 *head;
+ uint16_t space;
+ uint16_t large;
+ uint8_t flags;
+};
+
+struct heap32 {
+ pyre32 *head;
+ uint32_t space;
+ uint32_t large;
+ uint8_t flags;
+};
+
+struct heap64 {
+ pyre64 *head;
+ uint64_t space;
+ uint64_t large;
+ uint8_t flags;
+};
+
+#define HEAP_ZERO (1 << 0)
+#define HEAP_DEFAULTS 0
+
+#define HEAP8_INIT(space, large, flags) { NULL, aligned_space8(space), large, flags }
+#define HEAP16_INIT(space, large, flags) { NULL, aligned_space16(space), large, flags }
+#define HEAP32_INIT(space, large, flags) { NULL, aligned_space32(space), large, flags }
+#define HEAP64_INIT(space, large, flags) { NULL, aligned_space64(space), large, flags }
+
+UTILAPI heap8 * heap8_init (heap8 *heap, uint8_t space, uint8_t large, uint8_t flags);
+UTILAPI heap16 * heap16_init (heap16 *heap, uint16_t space, uint16_t large, uint8_t flags);
+UTILAPI heap32 * heap32_init (heap32 *heap, uint32_t space, uint32_t large, uint8_t flags);
+UTILAPI heap64 * heap64_init (heap64 *heap, uint64_t space, uint64_t large, uint8_t flags);
+
+UTILAPI void heap8_head (heap8 *heap);
+UTILAPI void heap16_head (heap16 *heap);
+UTILAPI void heap32_head (heap32 *heap);
+UTILAPI void heap64_head (heap64 *heap);
+
+#define heap8_ensure_head(heap) ((void)((heap)->head != NULL || (heap8_head(heap), 0)))
+#define heap16_ensure_head(heap) ((void)((heap)->head != NULL || (heap16_head(heap), 0)))
+#define heap32_ensure_head(heap) ((void)((heap)->head != NULL || (heap32_head(heap), 0)))
+#define heap64_ensure_head(heap) ((void)((heap)->head != NULL || (heap64_head(heap), 0)))
+
+UTILAPI void heap8_free (heap8 *heap);
+UTILAPI void heap16_free (heap16 *heap);
+UTILAPI void heap32_free (heap32 *heap);
+UTILAPI void heap64_free (heap64 *heap);
+
+UTILAPI void heap8_clear (heap8 *heap);
+UTILAPI void heap16_clear (heap16 *heap);
+UTILAPI void heap32_clear (heap32 *heap);
+UTILAPI void heap64_clear (heap64 *heap);
+
+UTILAPI void * _heap8_take (heap8 *heap, size_t size);
+UTILAPI void * _heap16_take (heap16 *heap, size_t size);
+UTILAPI void * _heap32_take (heap32 *heap, size_t size);
+UTILAPI void * _heap64_take (heap64 *heap, size_t size);
+
+UTILAPI void * _heap8_take0 (heap8 *heap, size_t size);
+UTILAPI void * _heap16_take0 (heap16 *heap, size_t size);
+UTILAPI void * _heap32_take0 (heap32 *heap, size_t size);
+UTILAPI void * _heap64_take0 (heap64 *heap, size_t size);
+
+#define heap8_take(heap, size) (heap8_ensure_head(heap), _heap8_take(heap, size))
+#define heap16_take(heap, size) (heap16_ensure_head(heap), _heap16_take(heap, size))
+#define heap32_take(heap, size) (heap32_ensure_head(heap), _heap32_take(heap, size))
+#define heap64_take(heap, size) (heap64_ensure_head(heap), _heap64_take(heap, size))
+
+#define heap8_take0(heap, size) (heap8_ensure_head(heap), _heap8_take0(heap, size))
+#define heap16_take0(heap, size) (heap16_ensure_head(heap), _heap16_take0(heap, size))
+#define heap32_take0(heap, size) (heap32_ensure_head(heap), _heap32_take0(heap, size))
+#define heap64_take0(heap, size) (heap64_ensure_head(heap), _heap64_take0(heap, size))
+
+UTILAPI void heap8_pop (heap8 *heap, void *taken, size_t size);
+UTILAPI void heap16_pop (heap16 *heap, void *taken, size_t size);
+UTILAPI void heap32_pop (heap32 *heap, void *taken, size_t size);
+UTILAPI void heap64_pop (heap64 *heap, void *taken, size_t size);
+
+UTILAPI void * _heap8_some (heap8 *heap, size_t size, size_t *pspace);
+UTILAPI void * _heap16_some (heap16 *heap, size_t size, size_t *pspace);
+UTILAPI void * _heap32_some (heap32 *heap, size_t size, size_t *pspace);
+UTILAPI void * _heap64_some (heap64 *heap, size_t size, size_t *pspace);
+
+#define heap8_some(heap, size, pspace) (heap8_ensure_head(heap), _heap8_some(heap, size, pspace))
+#define heap16_some(heap, size, pspace) (heap16_ensure_head(heap), _heap16_some(heap, size, pspace))
+#define heap32_some(heap, size, pspace) (heap32_ensure_head(heap), _heap32_some(heap, size, pspace))
+#define heap64_some(heap, size, pspace) (heap64_ensure_head(heap), _heap64_some(heap, size, pspace))
+
+UTILAPI void * heap8_more (heap8 *heap, void *taken, size_t written, size_t size, size_t *pspace);
+UTILAPI void * heap16_more (heap16 *heap, void *taken, size_t written, size_t size, size_t *pspace);
+UTILAPI void * heap32_more (heap32 *heap, void *taken, size_t written, size_t size, size_t *pspace);
+UTILAPI void * heap64_more (heap64 *heap, void *taken, size_t written, size_t size, size_t *pspace);
+
+UTILAPI void heap8_done (heap8 *heap, void *taken, size_t written);
+UTILAPI void heap16_done (heap16 *heap, void *taken, size_t written);
+UTILAPI void heap32_done (heap32 *heap, void *taken, size_t written);
+UTILAPI void heap64_done (heap64 *heap, void *taken, size_t written);
+
+UTILAPI void heap8_giveup (heap8 *heap, void *taken);
+UTILAPI void heap16_giveup (heap16 *heap, void *taken);
+UTILAPI void heap32_giveup (heap32 *heap, void *taken);
+UTILAPI void heap64_giveup (heap64 *heap, void *taken);
+
+UTILAPI int heap8_empty (heap8 *heap);
+UTILAPI int heap16_empty (heap16 *heap);
+UTILAPI int heap32_empty (heap32 *heap);
+UTILAPI int heap64_empty (heap64 *heap);
+
+UTILAPI void heap8_stats (heap8 *heap, mem_info *info, int append);
+UTILAPI void heap16_stats (heap16 *heap, mem_info *info, int append);
+UTILAPI void heap32_stats (heap32 *heap, mem_info *info, int append);
+UTILAPI void heap64_stats (heap64 *heap, mem_info *info, int append);
+
+#endif
\ No newline at end of file
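A minimal usage sketch of this API (illustrative, not part of the patch; the space/large values are arbitrary): chunks are taken one by one and the whole heap is released at once.

    heap8 heap = HEAP8_INIT(128, 64, HEAP_DEFAULTS);
    char *s = (char *)heap8_take(&heap, 6); /* head block is created on first take */
    memcpy(s, "hello", 6);                  /* assumes <string.h> */
    heap8_free(&heap);                      /* all taken chunks are released together */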
diff --git a/source/luametatex/source/libraries/pplib/util/utilmemheapiof.c b/source/luametatex/source/libraries/pplib/util/utilmemheapiof.c
new file mode 100644
index 000000000..cd9609da8
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmemheapiof.c
@@ -0,0 +1,142 @@
+
+#include "utilmemheapiof.h"
+
+// this is identical to the stock iof suite; keep it in sync
+
+size_t heap8_writer (iof *O, iof_mode mode)
+{
+ heap8 *heap;
+ size_t written;
+ heap = (heap8 *)O->link;
+ switch (mode)
+ {
+ case IOFFLUSH:
+ heap8_buffer_done(heap, O);
+ O->buf = _heap8_some(heap, 0, &O->space);
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ break;
+ case IOFWRITE:
+ written = (size_t)iof_size(O);
+ O->buf = heap8_more(heap, O->buf, written, written << 1, &O->space);
+ O->pos = O->buf + written;
+ O->end = O->buf + O->space;
+ return O->space - written;
+ case IOFCLOSE:
+ default:
+ break;
+ }
+ return 0;
+}
+
+size_t heap16_writer (iof *O, iof_mode mode)
+{
+ heap16 *heap;
+ size_t written;
+ heap = (heap16 *)O->link;
+ switch (mode)
+ {
+ case IOFFLUSH:
+ heap16_buffer_done(heap, O);
+ O->buf = _heap16_some(heap, 0, &O->space);
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ break;
+ case IOFWRITE:
+ written = (size_t)iof_size(O);
+ O->buf = heap16_more(heap, O->buf, written, written << 1, &O->space);
+ O->pos = O->buf + written;
+ O->end = O->buf + O->space;
+ return O->space - written;
+ case IOFCLOSE:
+ default:
+ break;
+ }
+ return 0;
+}
+
+size_t heap32_writer (iof *O, iof_mode mode)
+{
+ heap32 *heap;
+ size_t written;
+ heap = (heap32 *)O->link;
+ switch (mode)
+ {
+ case IOFFLUSH:
+ heap32_buffer_done(heap, O);
+ O->buf = _heap32_some(heap, 0, &O->space);
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ break;
+ case IOFWRITE:
+ written = (size_t)iof_size(O);
+ O->buf = heap32_more(heap, O->buf, written, written << 1, &O->space);
+ O->pos = O->buf + written;
+ O->end = O->buf + O->space;
+ return O->space - written;
+ case IOFCLOSE:
+ default:
+ break;
+ }
+ return 0;
+}
+
+size_t heap64_writer (iof *O, iof_mode mode)
+{
+ heap64 *heap;
+ size_t written;
+ heap = (heap64 *)O->link;
+ switch (mode)
+ {
+ case IOFFLUSH:
+ heap64_buffer_done(heap, O);
+ O->buf = _heap64_some(heap, 0, &O->space);
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ break;
+ case IOFWRITE:
+ written = (size_t)iof_size(O);
+ O->buf = heap64_more(heap, O->buf, written, written << 1, &O->space);
+ O->pos = O->buf + written;
+ O->end = O->buf + O->space;
+ return O->space - written;
+ case IOFCLOSE:
+ default:
+ break;
+ }
+ return 0;
+}
+
+/* buffer for some */
+
+iof * _heap8_buffer_some (heap8 *heap, iof *O, size_t atleast)
+{
+ O->buf = _heap8_some(heap, atleast, &O->space);
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ return O;
+}
+
+iof * _heap16_buffer_some (heap16 *heap, iof *O, size_t atleast)
+{
+ O->buf = _heap16_some(heap, atleast, &O->space);
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ return O;
+}
+
+iof * _heap32_buffer_some (heap32 *heap, iof *O, size_t atleast)
+{
+ O->buf = _heap32_some(heap, atleast, &O->space);
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ return O;
+}
+
+iof * _heap64_buffer_some (heap64 *heap, iof *O, size_t atleast)
+{
+ O->buf = _heap64_some(heap, atleast, &O->space);
+ O->pos = O->buf;
+ O->end = O->buf + O->space;
+ return O;
+}
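A sketch of driving these writers by hand (illustrative, not part of the patch; HEAP8_BUFFER_INIT and the buffer macros come from utilmemheapiof.h below, the iof buf/pos/end fields are used exactly as in the code above, and IOF_WRITER_INIT is assumed to yield a static initializer, as its name suggests):

    heap8 heap = HEAP8_INIT(128, 64, HEAP_DEFAULTS);
    iof output = HEAP8_BUFFER_INIT(&heap);
    heap8_buffer_some(&heap, &output, 32);   /* ensure at least 32 writable bytes */
    memcpy(output.pos, "hello", 5);          /* write directly into the chunk */
    output.pos += 5;
    heap8_buffer_done(&heap, &output);       /* commit exactly the written bytes */

Note the growth policy in the IOFWRITE branches: when a buffer holding written bytes overflows, heapN_more() is asked for written << 1 bytes, so chunk capacity roughly doubles on each overflow.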
diff --git a/source/luametatex/source/libraries/pplib/util/utilmemheapiof.h b/source/luametatex/source/libraries/pplib/util/utilmemheapiof.h
new file mode 100644
index 000000000..1f3da7efb
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmemheapiof.h
@@ -0,0 +1,43 @@
+
+#ifndef UTIL_MEM_HEAP_IOF_H
+#define UTIL_MEM_HEAP_IOF_H
+
+#include "utilmemheap.h"
+#include "utiliof.h"
+
+UTILAPI size_t heap8_writer (iof *O, iof_mode mode);
+UTILAPI size_t heap16_writer (iof *O, iof_mode mode);
+UTILAPI size_t heap32_writer (iof *O, iof_mode mode);
+UTILAPI size_t heap64_writer (iof *O, iof_mode mode);
+
+#define HEAP8_BUFFER_INIT(heap) IOF_WRITER_INIT(heap8_writer, (void *)(heap), NULL, 0, 0)
+#define HEAP16_BUFFER_INIT(heap) IOF_WRITER_INIT(heap16_writer, (void *)(heap), NULL, 0, 0)
+#define HEAP32_BUFFER_INIT(heap) IOF_WRITER_INIT(heap32_writer, (void *)(heap), NULL, 0, 0)
+#define HEAP64_BUFFER_INIT(heap) IOF_WRITER_INIT(heap64_writer, (void *)(heap), NULL, 0, 0)
+
+#define heap8_buffer_init(heap, O) iof_writer(O, (void *)(heap), heap8_writer, NULL, 0)
+#define heap16_buffer_init(heap, O) iof_writer(O, (void *)(heap), heap16_writer, NULL, 0)
+#define heap32_buffer_init(heap, O) iof_writer(O, (void *)(heap), heap32_writer, NULL, 0)
+#define heap64_buffer_init(heap, O) iof_writer(O, (void *)(heap), heap64_writer, NULL, 0)
+
+UTILAPI iof * _heap8_buffer_some (heap8 *heap, iof *O, size_t atleast);
+UTILAPI iof * _heap16_buffer_some (heap16 *heap, iof *O, size_t atleast);
+UTILAPI iof * _heap32_buffer_some (heap32 *heap, iof *O, size_t atleast);
+UTILAPI iof * _heap64_buffer_some (heap64 *heap, iof *O, size_t atleast);
+
+#define heap8_buffer_some(heap, O, atleast) (heap8_ensure_head(heap), _heap8_buffer_some(heap, O, atleast))
+#define heap16_buffer_some(heap, O, atleast) (heap16_ensure_head(heap), _heap16_buffer_some(heap, O, atleast))
+#define heap32_buffer_some(heap, O, atleast) (heap32_ensure_head(heap), _heap32_buffer_some(heap, O, atleast))
+#define heap64_buffer_some(heap, O, atleast) (heap64_ensure_head(heap), _heap64_buffer_some(heap, O, atleast))
+
+#define heap8_buffer_done(heap, O) heap8_done(heap, (O)->buf, (size_t)iof_size(O))
+#define heap16_buffer_done(heap, O) heap16_done(heap, (O)->buf, (size_t)iof_size(O))
+#define heap32_buffer_done(heap, O) heap32_done(heap, (O)->buf, (size_t)iof_size(O))
+#define heap64_buffer_done(heap, O) heap64_done(heap, (O)->buf, (size_t)iof_size(O))
+
+#define heap8_buffer_giveup(heap, O) heap8_giveup(heap, (O)->buf)
+#define heap16_buffer_giveup(heap, O) heap16_giveup(heap, (O)->buf)
+#define heap32_buffer_giveup(heap, O) heap32_giveup(heap, (O)->buf)
+#define heap64_buffer_giveup(heap, O) heap64_giveup(heap, (O)->buf)
+
+#endif
\ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilmeminfo.c b/source/luametatex/source/libraries/pplib/util/utilmeminfo.c
new file mode 100644
index 000000000..d3f61d5ca
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmeminfo.c
@@ -0,0 +1,38 @@
+/* print stats; common to heap, stock and pool */
+
+#include <stdio.h>
+
+#include "utilmeminfo.h"
+
+#define UINT(i) ((unsigned long)(i))
+
+void show_mem_info (mem_info *info)
+{
+ size_t totalwaste, totalmem, averagechunk, singlemem;
+ double ftotalwaste, fblockwaste, fghostwaste, ftailwaste, fsinglewaste;
+ double funused, fsingles, fsinglemem, fsingleeff;
+
+ totalwaste = info->ghosts + info->blockghosts + info->left;
+ totalmem = info->used + totalwaste;
+
+ ftotalwaste = totalmem > 0 ? totalwaste * 100.0 / totalmem : 0;
+ fblockwaste = totalmem > 0 ? (info->blockghosts - info->singleghosts) * 100.0 / totalmem : 0;
+ fsinglewaste = totalmem > 0 ? info->singleghosts * 100.0 / totalmem : 0;
+ fghostwaste = totalmem > 0 ? info->ghosts * 100.0 / totalmem : 0;
+ ftailwaste = totalmem > 0 ? info->left * 100.0 / totalmem : 0;
+
+ averagechunk = info->chunks > 0 ? info->used / info->chunks : 0;
+ funused = info->chunks > 0 ? info->unused * 100.0 / info->chunks : 0.0;
+
+ fsingles = info->blocks > 0 ? info->singles * 100.0 / info->blocks : 0;
+ fsinglemem = info->used > 0 ? info->singleused * 100.0 / info->used : 0;
+ singlemem = info->singleused + info->singleghosts;
+ fsingleeff = singlemem > 0 ? info->singleused * 100.0 / singlemem : 0;
+
+ printf("total: %lu + %lu = %lu\n", UINT(info->used), UINT(totalwaste), UINT(totalmem));
+ printf("chunks: %lu of average size %lu, unused %lu[%.2f%%]\n", UINT(info->chunks), UINT(averagechunk), UINT(info->unused), funused);
+ printf("blocks: %lu, singles %lu[%.2f%%], %.2f%% of allocs, efficiency %.2f%%\n",
+ UINT(info->blocks), UINT(info->singles), fsingles, fsinglemem, fsingleeff);
+ printf("waste: %lu[%0.2f%%], block ghosts %0.2f%%, single ghosts %.2f%%, chunk ghosts %0.2f%%, tails %0.2f%%\n\n",
+ UINT(totalwaste), ftotalwaste, fblockwaste, fsinglewaste, fghostwaste, ftailwaste);
+}
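A worked example with hypothetical numbers: for used = 900, ghosts = 20, blockghosts = 60 and left = 20, totalwaste = 20 + 60 + 20 = 100 and totalmem = 1000, so the first line reads "total: 900 + 100 = 1000" and ftotalwaste comes out as 10.00%.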
diff --git a/source/luametatex/source/libraries/pplib/util/utilmeminfo.h b/source/luametatex/source/libraries/pplib/util/utilmeminfo.h
new file mode 100644
index 000000000..cfa0fd670
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilmeminfo.h
@@ -0,0 +1,9 @@
+
+#ifndef UTIL_MEM_INFO_H
+#define UTIL_MEM_INFO_H
+
+#include "utilmemallh.h"
+
+UTILAPI void show_mem_info (mem_info *info);
+
+#endif
\ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilnumber.c b/source/luametatex/source/libraries/pplib/util/utilnumber.c
new file mode 100644
index 000000000..4352c26fb
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilnumber.c
@@ -0,0 +1,1177 @@
+
+#include <math.h> /* for log10() and floor() */
+#include <stdio.h> /* for printf() */
+
+#include "utilnumber.h"
+
+// todo: lookup values could be chars
+// todo: rename the lookup arrays to some __name to discourage direct access; they should always be accessed via macros such as base16_value() and base16_digit()
+
+const int base10_lookup[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+const int base16_lookup[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
+ -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+const int base26_lookup[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+ 16,17,18,19,20,21,22,23,24,25,26,-1,-1,-1,-1,-1,
+ -1, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+ 16,17,18,19,20,21,22,23,24,25,26,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+const int base36_lookup[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
+ -1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,
+ 25,26,27,28,29,30,31,32,33,34,35,-1,-1,-1,-1,-1,
+ -1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,
+ 25,26,27,28,29,30,31,32,33,34,35,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+/* common buffer for quick conversions (unsafe) */
+
+char util_number_buffer[NUMBER_BUFFER_SIZE] = { 0 };
+
+/* integer from string; return a pointer to the character just past the last digit */
+
+#define string_scan_sign(s, c, sign) _scan_sign(c, sign, *++s)
+#define string_scan_integer(s, c, number) _scan_integer(c, number, *++s)
+#define string_scan_radix(s, c, number, radix) _scan_radix(c, number, radix, *++s)
+#define string_read_integer(s, c, number) _read_integer(c, number, *++s)
+#define string_read_radix(s, c, number, radix) _read_radix(c, number, radix, *++s)
+
+const char * string_to_int32 (const char *s, int32_t *number)
+{
+ int sign, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_integer(s, c, *number);
+ if (sign) *number = -*number;
+ return s;
+}
+
+const char * string_to_slong (const char *s, long *number)
+{
+ int sign, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_integer(s, c, *number);
+ if (sign) *number = -*number;
+ return s;
+}
+
+const char * string_to_int64 (const char *s, int64_t *number)
+{
+ int sign, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_integer(s, c, *number);
+ if (sign) *number = -*number;
+ return s;
+}
+
+const char * string_to_uint32 (const char *s, uint32_t *number)
+{
+ int c = *s;
+ string_scan_integer(s, c, *number);
+ return s;
+}
+
+const char * string_to_ulong (const char *s, unsigned long *number)
+{
+ int c = *s;
+ string_scan_integer(s, c, *number);
+ return s;
+}
+
+const char * string_to_usize (const char *s, size_t *number)
+{
+ int c = *s;
+ string_scan_integer(s, c, *number);
+ return s;
+}
+
+const char * string_to_uint64 (const char *s, uint64_t *number)
+{
+ int c = *s;
+ string_scan_integer(s, c, *number);
+ return s;
+}
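Each scanner returns a pointer just past the last digit consumed, so parses can be chained; a minimal sketch (not part of the patch):

    int32_t n;
    const char *rest = string_to_int32("-123pt", &n); /* n == -123 */
    /* rest now points at "pt", ready for e.g. a unit parser */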
+
+const char * radix_to_int32 (const char *s, int32_t *number, int radix)
+{
+ int sign, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_radix(s, c, *number, radix);
+ if (sign) *number = -*number;
+ return s;
+}
+
+const char * radix_to_slong (const char *s, long *number, int radix)
+{
+ int sign, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_radix(s, c, *number, radix);
+ if (sign) *number = -*number;
+ return s;
+}
+
+const char * radix_to_int64 (const char *s, int64_t *number, int radix)
+{
+ int sign, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_radix(s, c, *number, radix);
+ if (sign) *number = -*number;
+ return s;
+}
+
+const char * radix_to_uint32 (const char *s, uint32_t *number, int radix)
+{
+ int c = *s;
+ string_scan_radix(s, c, *number, radix);
+ return s;
+}
+
+const char * radix_to_ulong (const char *s, unsigned long *number, int radix)
+{
+ int c = *s;
+ string_scan_radix(s, c, *number, radix);
+ return s;
+}
+
+const char * radix_to_usize (const char *s, size_t *number, int radix)
+{
+ int c = *s;
+ string_scan_radix(s, c, *number, radix);
+ return s;
+}
+
+const char * radix_to_uint64 (const char *s, uint64_t *number, int radix)
+{
+ int c = *s;
+ string_scan_radix(s, c, *number, radix);
+ return s;
+}
+
+/* roman to uint16_t */
+
+#define roman1000(c) (c == 'M' || c == 'm')
+#define roman500(c) (c == 'D' || c == 'd')
+#define roman100(c) (c == 'C' || c == 'c')
+#define roman50(c) (c == 'L' || c == 'l')
+#define roman10(c) (c == 'X' || c == 'x')
+#define roman5(c) (c == 'V' || c == 'v')
+#define roman1(c) (c == 'I' || c == 'i')
+
+#define roman100s(p) (roman100(*p) ? (100 + ((++p, roman100(*p)) ? (100 + ((++p, roman100(*p)) ? (++p, 100) : 0)) : 0)) : 0)
+#define roman10s(p) (roman10(*p) ? (10 + ((++p, roman10(*p)) ? (10 + ((++p, roman10(*p)) ? (++p, 10) : 0)) : 0)) : 0)
+#define roman1s(p) (roman1(*p) ? (1 + ((++p, roman1(*p)) ? (1 + ((++p, roman1(*p)) ? (++p, 1) : 0)) : 0)) : 0)
+
+const char * roman_to_uint16 (const char *s, uint16_t *number)
+{
+ const char *p;
+ /* M */
+ for (*number = 0, p = s; roman1000(*p); *number += 1000, ++p);
+ /* D C */
+ if (roman500(*p))
+ {
+ ++p;
+ *number += 500 + roman100s(p);
+ }
+ else if (roman100(*p))
+ {
+ ++p;
+ if (roman1000(*p))
+ {
+ ++p;
+ *number += 900;
+ }
+ else if (roman500(*p))
+ {
+ ++p;
+ *number += 400;
+ }
+ else
+ *number += 100 + roman100s(p);
+ }
+ /* L X */
+ if (roman50(*p))
+ {
+ ++p;
+ *number += 50 + roman10s(p);
+ }
+ else if (roman10(*p))
+ {
+ ++p;
+ if (roman100(*p))
+ {
+ ++p;
+ *number += 90;
+ }
+ else if (roman50(*p))
+ {
+ ++p;
+ *number += 40;
+ }
+ else
+ *number += 10 + roman10s(p);
+ }
+ /* V I */
+ if (roman5(*p))
+ {
+ ++p;
+ *number += 5 + roman1s(p);
+ }
+ else if (roman1(*p))
+ {
+ ++p;
+ if (roman10(*p))
+ {
+ ++p;
+ *number += 9;
+ }
+ else if (roman5(*p))
+ {
+ ++p;
+ *number += 4;
+ }
+ else
+ *number += 1 + roman1s(p);
+ }
+ return p;
+}
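An illustrative trace of the parser above (not part of the patch): for "MCMXCIV" the M loop adds 1000, the C-then-M branch adds 900, X-then-C adds 90 and I-then-V adds 4, giving 1994.

    uint16_t n;
    const char *end = roman_to_uint16("MCMXCIV", &n); /* n == 1994, end at the '\0' */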
+
+/* integer to string; return a pointer to the null-terminated string built at the end of the supplied buffer */
+
+#define end_of_integer_buffer(integer_buffer) (integer_buffer + MAX_INTEGER_DIGITS - 1)
+
+#define number_printrev_signed(p, number, quotient) \
+ do { \
+ quotient = number; number /= 10; \
+ *--p = base10_palindrome[9 + (quotient - number*10)]; \
+ } while (number); \
+ if (quotient < 0) *--p = '-'
+
+#define number_printrev_unsigned(p, number, quotient) \
+ do { \
+ quotient = number; number /= 10; \
+ *--p = (char)(quotient - integer_multiplied10(number)) + '0'; \
+ } while (number)
+
+#define SINTTYPE_AS_STRING(inttype, number, ibuf, psize) \
+ char *p, *e; \
+ inttype quotient; \
+ e = p = end_of_integer_buffer(ibuf); *p = '\0'; \
+ number_printrev_signed(p, number, quotient); \
+ *psize = (size_t)(e - p)
+
+#define UINTTYPE_AS_STRING(inttype, number, ibuf, psize) \
+ char *p, *e; \
+ inttype quotient; \
+ e = p = end_of_integer_buffer(ibuf); *p = '\0'; \
+ number_printrev_unsigned(p, number, quotient); \
+ *psize = (size_t)(e - p)
+
+char * int32_as_string (int32_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ SINTTYPE_AS_STRING(int32_t, number, ibuf, psize);
+ return p;
+}
+
+char * slong_as_string (long number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ SINTTYPE_AS_STRING(long, number, ibuf, psize);
+ return p;
+}
+
+char * int64_as_string (int64_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ SINTTYPE_AS_STRING(int64_t, number, ibuf, psize);
+ return p;
+}
+
+char * uint32_as_string (uint32_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_STRING(uint32_t, number, ibuf, psize);
+ return p;
+}
+
+char * ulong_as_string (unsigned long number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_STRING(unsigned long, number, ibuf, psize);
+ return p;
+}
+
+char * usize_as_string (size_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_STRING(size_t, number, ibuf, psize);
+ return p;
+}
+
+char * uint64_as_string (uint64_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_STRING(uint64_t, number, ibuf, psize);
+ return p;
+}
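These converters fill ibuf from the end, so the returned pointer lies inside the buffer rather than at its start; a minimal sketch (not part of the patch):

    char ibuf[MAX_INTEGER_DIGITS];
    size_t len;
    char *s = int32_as_string(-42, ibuf, &len); /* s -> "-42", len == 3 */
    /* s != ibuf: digits were written right to left, ending at the buffer's tail */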
+
+/* radix variant */
+
+#define number_printrev_signed_radix_uc(p, number, radix, quotient) \
+ do { \
+ quotient = number; number /= radix; \
+ *--p = base36_uc_palindrome[MAX_RADIX - 1 + (quotient - number*radix)]; \
+ } while (number)
+
+#define number_printrev_signed_radix_lc(p, number, radix, quotient) \
+ do { \
+ quotient = number; number /= radix; \
+ *--p = base36_lc_palindrome[MAX_RADIX - 1 + (quotient - number*radix)]; \
+ } while (number)
+
+#define number_printrev_signed_radix(p, number, radix, quotient, uc) \
+ do { \
+ if (uc) { number_printrev_signed_radix_uc(p, number, radix, quotient); } \
+ else { number_printrev_signed_radix_lc(p, number, radix, quotient); } \
+ if (quotient < 0) *--p = '-'; \
+ } while (0)
+
+#define number_printrev_unsigned_radix_uc(p, number, radix, quotient) \
+ do { \
+ quotient = number; number /= radix; \
+ *--p = base36_uc_alphabet[quotient % radix]; \
+ } while (number)
+
+#define number_printrev_unsigned_radix_lc(p, number, radix, quotient) \
+ do { \
+ quotient = number; number /= radix; \
+ *--p = base36_lc_alphabet[quotient % radix]; \
+ } while (number)
+
+#define number_printrev_unsigned_radix(p, number, radix, quotient, uc) \
+ do { \
+ if (uc) { number_printrev_unsigned_radix_uc(p, number, radix, quotient); } \
+ else { number_printrev_unsigned_radix_lc(p, number, radix, quotient); } \
+ } while (0)
+
+#define SINTTYPE_AS_RADIX(inttype, number, radix, uc, ibuf, psize) \
+ char *p, *e; \
+ inttype quotient; \
+ e = p = end_of_integer_buffer(ibuf); *p = '\0'; \
+ number_printrev_signed_radix(p, number, radix, quotient, uc); \
+ *psize = (size_t)(e - p)
+
+#define UINTTYPE_AS_RADIX(inttype, number, radix, uc, ibuf, psize) \
+ char *p, *e; \
+ inttype quotient; \
+ e = p = end_of_integer_buffer(ibuf); *p = '\0'; \
+ number_printrev_unsigned_radix(p, number, radix, quotient, uc); \
+ *psize = (size_t)(e - p)
+
+char * int32_as_radix (int32_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ SINTTYPE_AS_RADIX(int32_t, number, radix, uc, ibuf, psize);
+ return p;
+}
+
+char * slong_as_radix (long number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ SINTTYPE_AS_RADIX(long, number, radix, uc, ibuf, psize);
+ return p;
+}
+
+/*
+char * ssize_as_radix (ssize_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ SINTTYPE_AS_RADIX(ssize_t, number, radix, uc, ibuf, psize);
+ return p;
+}
+*/
+
+char * int64_as_radix (int64_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ SINTTYPE_AS_RADIX(int64_t, number, radix, uc, ibuf, psize);
+ return p;
+}
+
+char * uint32_as_radix (uint32_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_RADIX(uint32_t, number, radix, uc, ibuf, psize);
+ return p;
+}
+
+char * ulong_as_radix (unsigned long number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_RADIX(unsigned long, number, radix, uc, ibuf, psize);
+ return p;
+}
+
+char * usize_as_radix (size_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_RADIX(size_t, number, radix, uc, ibuf, psize);
+ return p;
+}
+
+char * uint64_as_radix (uint64_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_RADIX(uint64_t, number, radix, uc, ibuf, psize);
+ return p;
+}
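For example (illustrative, not part of the patch), 255 in base 16 with the uppercase alphabet:

    char ibuf[MAX_INTEGER_DIGITS];
    size_t len;
    char *s = uint32_as_radix(255, 16, 1, ibuf, &len); /* s -> "FF", len == 2 */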
+
+/* aaa, aab, aac, ...; unsigned only. 0 gives empty string */
+
+#define string_scan_alpha(s, c, number, radix) \
+ for (number = 0, c = *s; (c = base26_value(c)) > 0; number = number * radix + c, c = *++s)
+
+const char * alpha_to_uint32 (const char *s, uint32_t *number)
+{
+ int c;
+ string_scan_alpha(s, c, *number, 26);
+ return s;
+}
+
+const char * alpha_to_ulong (const char *s, unsigned long *number)
+{
+ int c;
+ string_scan_alpha(s, c, *number, 26);
+ return s;
+}
+
+const char * alpha_to_usize (const char *s, size_t *number)
+{
+ int c;
+ string_scan_alpha(s, c, *number, 26);
+ return s;
+}
+
+const char * alpha_to_uint64 (const char *s, uint64_t *number)
+{
+ int c;
+ string_scan_alpha(s, c, *number, 26);
+ return s;
+}
+
+#define number_printrev_unsigned_alpha_uc(p, number, radix, quotient) \
+ while (number > 0) { \
+ quotient = --number; number /= radix; \
+ *--p = base26_uc_alphabet[quotient % radix]; \
+ }
+
+#define number_printrev_unsigned_alpha_lc(p, number, radix, quotient) \
+ while (number > 0) { \
+ quotient = --number; number /= radix; \
+ *--p = base26_lc_alphabet[quotient % radix]; \
+ }
+
+#define UINTTYPE_AS_ALPHA(inttype, number, uc, ibuf, psize) \
+ char *p, *e; \
+ inttype quotient; \
+ e = p = end_of_integer_buffer(ibuf); *p = '\0'; \
+ if (uc) { number_printrev_unsigned_alpha_uc(p, number, 26, quotient); } \
+ else { number_printrev_unsigned_alpha_lc(p, number, 26, quotient); } \
+ *psize = (size_t)(e - p)
+
+char * uint32_as_alpha (uint32_t number, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_ALPHA(uint32_t, number, uc, ibuf, psize);
+ return p;
+}
+
+char * ulong_as_alpha (unsigned long number, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_ALPHA(unsigned long, number, uc, ibuf, psize);
+ return p;
+}
+
+char * usize_as_alpha (size_t number, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_ALPHA(size_t, number, uc, ibuf, psize);
+ return p;
+}
+
+char * uint64_as_alpha (uint64_t number, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize)
+{
+ UINTTYPE_AS_ALPHA(uint64_t, number, uc, ibuf, psize);
+ return p;
+}
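The --number in the loops above makes this bijective base 26 rather than plain base 26, matching the scanner: 1 maps to "a", 26 to "z", 27 to "aa", and 0 gives the empty string. A sketch (not part of the patch):

    char ibuf[MAX_INTEGER_DIGITS];
    size_t len;
    uint32_as_alpha(27, 0, ibuf, &len); /* lowercase: "aa", len == 2 */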
+
+/* a variant of alphabetic numbering: a, b, c, ..., z, aa, bb, cc, ..., zz (e.g. pdf page labelling)
+ watch out: unsafe for large numbers; a buffer of size N can handle at most N * 26. */
+
+#define string_scan_alphan(s, c, number, radix) \
+ do { \
+ number = 0; \
+ if ((c = (uint16_t)base26_value(*s)) > 0) { \
+ number = c; \
+ while (c == (uint16_t)base26_value(*++s)) number += radix; \
+ } \
+ } while (0)
+
+const char * alphan_to_uint16 (const char *s, uint16_t *number)
+{
+ uint16_t c;
+ string_scan_alphan(s, c, *number, 26);
+ return s;
+}
+
+#define number_print_alphan_uc(p, e, c, number, radix) \
+ for (c = (--number) % radix, number -= c; ; number -= radix) { \
+ *p++ = base26_uc_alphabet[c]; \
+ if (number == 0 || p >= e) break; \
+ }
+
+#define number_print_alphan_lc(p, e, c, number, radix) \
+ for (c = (--number) % radix, number -= c; ; number -= radix) { \
+ *p++ = base26_lc_alphabet[c]; \
+ if (number == 0 || p >= e) break; \
+ }
+
+#define UINTTYPE_AS_ALPHAN(inttype, number, uc, ibuf, size, psize) \
+ char *p, *e; \
+ uint8_t c; \
+ p = ibuf; \
+ e = p + size; \
+ if (number > 0) { \
+ if (uc) { number_print_alphan_uc(p, e, c, number, 26); } \
+ else { number_print_alphan_lc(p, e, c, number, 26); } \
+ } \
+ *p = '\0'; \
+ *psize = (size_t)(p - ibuf)
+
+char * uint16_as_alphan (uint16_t number, int uc, char ibuf[], size_t size, size_t *psize)
+{
+ UINTTYPE_AS_ALPHAN(uint16_t, number, uc, ibuf, size, psize);
+ return ibuf;
+}
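So the sequence runs a, b, ..., z, aa, bb, cc, ..., as in PDF page labels; a round-trip sketch (not part of the patch):

    char ibuf[8]; size_t len; uint16_t n;
    uint16_as_alphan(28, 0, ibuf, sizeof (ibuf), &len); /* "bb", len == 2 */
    alphan_to_uint16("bb", &n);                         /* n == 28 */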
+
+/* roman numeral */
+
+/* large roman numerals? http://mathforum.org/library/drmath/view/57569.html */
+
+#define base_roman_uc_alphabet "MDCLXVI"
+#define base_roman_lc_alphabet "mdclxvi"
+
+char * uint16_as_roman (uint16_t number, int uc, char ibuf[MAX_ROMAN_DIGITS], size_t *psize)
+{
+ static const uint32_t base_roman_values[] = { 1000, 500, 100, 50, 10, 5, 1 };
+ const char *alphabet;
+ char *p;
+ uint32_t k, j, v, u, n;
+
+ n = (uint32_t)number; // uint16_t input limits the number of leading 'M's
+ alphabet = uc ? base_roman_uc_alphabet : base_roman_lc_alphabet;
+ for (p = ibuf, j = 0, v = base_roman_values[0]; n > 0; )
+ {
+ if (n >= v)
+ {
+ *p++ = alphabet[j];
+ n -= v;
+ continue;
+ }
+ if (j & 1)
+ k = j + 1;
+ else
+ k = j + 2;
+ u = base_roman_values[k];
+ if (n + u >= v)
+ {
+ *p++ = alphabet[k];
+ n += u;
+ }
+ else
+ v = base_roman_values[++j];
+ }
+ *p = '\0';
+ *psize = (size_t)(p - ibuf);
+ return ibuf;
+}
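The subtractive forms fall out of the n += u branch above: the smaller numeral is emitted and its value added back to n, so the loop then pays it off with the larger one. An illustrative call (not part of the patch):

    char rbuf[MAX_ROMAN_DIGITS];
    size_t len;
    uint16_as_roman(1994, 1, rbuf, &len); /* rbuf == "MCMXCIV", len == 7 */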
+
+/* IEEE-754 */
+
+#define BINARY_MODF 1
+
+#define NOT_A_NUMBER_STRING "NaN"
+#define INFINITY_STRING "INF"
+#define SIGNED_INFINITY 1
+#define SIGNED_ZERO 0
+#define SIGNED_NOT_A_NUMBER 0
+#define RADIX_CHAR '.'
+
+/* double/float to decimal */
+
+typedef struct ieee_double {
+ union {
+ double number;
+ uint64_t bits;
+ };
+ uint64_t fraction;
+ int exponent, sign;
+} ieee_double;
+
+typedef struct ieee_float {
+ union {
+ float number;
+ uint32_t bits;
+ };
+ uint32_t fraction;
+ int exponent, sign;
+} ieee_float;
+
+#define IEEE_DOUBLE_BIAS 1023
+#define IEEE_DOUBLE_MIN_EXPONENT -1023
+#define IEEE_DOUBLE_MAX_EXPONENT (0x7ff - IEEE_DOUBLE_BIAS)
+
+#define IEEE_FLOAT_BIAS 127
+#define IEEE_FLOAT_MIN_EXPONENT -127
+#define IEEE_FLOAT_MAX_EXPONENT (0xff - IEEE_FLOAT_BIAS)
+
+#define ieee_double_fraction(i) (i & 0x000fffffffffffffull)
+#define ieee_double_exponent(i) ((0x7ff & (i >> 52)) - IEEE_DOUBLE_BIAS)
+#define ieee_double_init(ieee_number, number) \
+ ieee_number.number = number, \
+ ieee_number.fraction = ieee_double_fraction(ieee_number.bits), \
+ ieee_number.exponent = ieee_double_exponent(ieee_number.bits)
+
+#define ieee_float_fraction(i) (i & 0x007fffff)
+#define ieee_float_exponent(i) ((0xff & (i >> 23)) - IEEE_FLOAT_BIAS)
+#define ieee_float_init(ieee_number, number) \
+ ieee_number.number = number, \
+ ieee_number.fraction = ieee_float_fraction(ieee_number.bits), \
+ ieee_number.exponent = ieee_float_exponent(ieee_number.bits)
+
+/* special cases */
+
+#define ieee_double_is_zero(ieee_number) (ieee_number.number == 0) // || ieee_double_too_small(ieee_number) ?
+#define ieee_double_too_small(ieee_number) (ieee_number.exponent == 0 && ieee_number.fraction != 0) // denormalized, implicit fraction bit not set
+
+#define ieee_float_is_zero(ieee_number) (ieee_number.number == 0) // || ieee_float_too_small(ieee_number) ?
+#define ieee_float_too_small(ieee_number) (ieee_number.exponent == 0 && ieee_number.fraction != 0)
+
+#define ieee_double_zero_string(ieee_number) (SIGNED_ZERO && ieee_number.sign ? "-0" : "0")
+#define ieee_double_infinity_string(ieee_number) (SIGNED_INFINITY && ieee_number.sign ? "-" INFINITY_STRING : INFINITY_STRING)
+
+#define ieee_float_zero_string ieee_double_zero_string
+#define ieee_float_infinity_string ieee_double_infinity_string
+
+#define ieee_double_special_case(ieee_number) (ieee_number.exponent == IEEE_DOUBLE_MAX_EXPONENT)
+#define ieee_double_special_string(ieee_number) (ieee_number.fraction ? NOT_A_NUMBER_STRING : ieee_double_infinity_string(ieee_number))
+
+#define ieee_float_special_case(ieee_number) (ieee_number.exponent == IEEE_FLOAT_MAX_EXPONENT)
+#define ieee_float_special_string(ieee_number) (ieee_number.fraction ? NOT_A_NUMBER_STRING : ieee_float_infinity_string(ieee_number))
+
+#if 0
+
+const double double_binary_power10[] =
+{
+ 1.0e1, 1.0e2, 1.0e4, 1.0e8, 1.0e16, 1.0e32, 1.0e64, 1.0e128, 1.0e256
+};
+
+const float float_binary_power10[] =
+{
+ 1.0e1, 1.0e2, 1.0e4, 1.0e8, 1.0e16, 1.0e32
+};
+
+const double double_binary_negpower10[] =
+{
+ 1.0e-1, 1.0e-2, 1.0e-4, 1.0e-8, 1.0e-16, 1.0e-32
+};
+
+const float float_binary_negpower10[] =
+{
+ 1.0e-1, 1.0e-2, 1.0e-4, 1.0e-8, 1.0e-16, 1.0e-32
+};
+
+#else
+
+const double double_decimal_power10[] = {
+ 1.0e0, 1.0e1, 1.0e2, 1.0e3, 1.0e4, 1.0e5, 1.0e6, 1.0e7, 1.0e8, 1.0e9,
+ 1.0e10, 1.0e11, 1.0e12, 1.0e13, 1.0e14, 1.0e15, 1.0e16, 1.0e17, 1.0e18, 1.0e19,
+ 1.0e20, 1.0e21, 1.0e22, 1.0e23, 1.0e24, 1.0e25, 1.0e26, 1.0e27, 1.0e28, 1.0e29,
+ 1.0e30, 1.0e31, 1.0e32, 1.0e33, 1.0e34, 1.0e35, 1.0e36, 1.0e37, 1.0e38, 1.0e39,
+ 1.0e40, 1.0e41, 1.0e42, 1.0e43, 1.0e44, 1.0e45, 1.0e46, 1.0e47, 1.0e48, 1.0e49,
+ 1.0e50, 1.0e51, 1.0e52, 1.0e53, 1.0e54, 1.0e55, 1.0e56, 1.0e57, 1.0e58, 1.0e59,
+ 1.0e60, 1.0e61, 1.0e62, 1.0e63, 1.0e64, 1.0e65, 1.0e66, 1.0e67, 1.0e68, 1.0e69,
+ 1.0e70, 1.0e71, 1.0e72, 1.0e73, 1.0e74, 1.0e75, 1.0e76, 1.0e77, 1.0e78, 1.0e79,
+ 1.0e80, 1.0e81, 1.0e82, 1.0e83, 1.0e84, 1.0e85, 1.0e86, 1.0e87, 1.0e88, 1.0e89,
+ 1.0e90, 1.0e91, 1.0e92, 1.0e93, 1.0e94, 1.0e95, 1.0e96, 1.0e97, 1.0e98, 1.0e99,
+ 1.0e100, 1.0e101, 1.0e102, 1.0e103, 1.0e104, 1.0e105, 1.0e106, 1.0e107, 1.0e108, 1.0e109,
+ 1.0e110, 1.0e111, 1.0e112, 1.0e113, 1.0e114, 1.0e115, 1.0e116, 1.0e117, 1.0e118, 1.0e119,
+ 1.0e120, 1.0e121, 1.0e122, 1.0e123, 1.0e124, 1.0e125, 1.0e126, 1.0e127, 1.0e128, 1.0e129,
+ 1.0e130, 1.0e131, 1.0e132, 1.0e133, 1.0e134, 1.0e135, 1.0e136, 1.0e137, 1.0e138, 1.0e139,
+ 1.0e140, 1.0e141, 1.0e142, 1.0e143, 1.0e144, 1.0e145, 1.0e146, 1.0e147, 1.0e148, 1.0e149,
+ 1.0e150, 1.0e151, 1.0e152, 1.0e153, 1.0e154, 1.0e155, 1.0e156, 1.0e157, 1.0e158, 1.0e159,
+ 1.0e160, 1.0e161, 1.0e162, 1.0e163, 1.0e164, 1.0e165, 1.0e166, 1.0e167, 1.0e168, 1.0e169,
+ 1.0e170, 1.0e171, 1.0e172, 1.0e173, 1.0e174, 1.0e175, 1.0e176, 1.0e177, 1.0e178, 1.0e179,
+ 1.0e180, 1.0e181, 1.0e182, 1.0e183, 1.0e184, 1.0e185, 1.0e186, 1.0e187, 1.0e188, 1.0e189,
+ 1.0e190, 1.0e191, 1.0e192, 1.0e193, 1.0e194, 1.0e195, 1.0e196, 1.0e197, 1.0e198, 1.0e199,
+ 1.0e200, 1.0e201, 1.0e202, 1.0e203, 1.0e204, 1.0e205, 1.0e206, 1.0e207, 1.0e208, 1.0e209,
+ 1.0e210, 1.0e211, 1.0e212, 1.0e213, 1.0e214, 1.0e215, 1.0e216, 1.0e217, 1.0e218, 1.0e219,
+ 1.0e220, 1.0e221, 1.0e222, 1.0e223, 1.0e224, 1.0e225, 1.0e226, 1.0e227, 1.0e228, 1.0e229,
+ 1.0e230, 1.0e231, 1.0e232, 1.0e233, 1.0e234, 1.0e235, 1.0e236, 1.0e237, 1.0e238, 1.0e239,
+ 1.0e240, 1.0e241, 1.0e242, 1.0e243, 1.0e244, 1.0e245, 1.0e246, 1.0e247, 1.0e248, 1.0e249,
+ 1.0e250, 1.0e251, 1.0e252, 1.0e253, 1.0e254, 1.0e255, 1.0e256, 1.0e257, 1.0e258, 1.0e259,
+ 1.0e260, 1.0e261, 1.0e262, 1.0e263, 1.0e264, 1.0e265, 1.0e266, 1.0e267, 1.0e268, 1.0e269,
+ 1.0e270, 1.0e271, 1.0e272, 1.0e273, 1.0e274, 1.0e275, 1.0e276, 1.0e277, 1.0e278, 1.0e279,
+ 1.0e280, 1.0e281, 1.0e282, 1.0e283, 1.0e284, 1.0e285, 1.0e286, 1.0e287, 1.0e288, 1.0e289,
+ 1.0e290, 1.0e291, 1.0e292, 1.0e293, 1.0e294, 1.0e295, 1.0e296, 1.0e297, 1.0e298, 1.0e299,
+ 1.0e300, 1.0e301, 1.0e302, 1.0e303, 1.0e304, 1.0e305, 1.0e306, 1.0e307, 1.0e308
+};
+
+const float float_decimal_power10[] = {
+ 1.0e0f, 1.0e1f, 1.0e2f, 1.0e3f, 1.0e4f, 1.0e5f, 1.0e6f, 1.0e7f, 1.0e8f, 1.0e9f,
+ 1.0e10f, 1.0e11f, 1.0e12f, 1.0e13f, 1.0e14f, 1.0e15f, 1.0e16f, 1.0e17f, 1.0e18f, 1.0e19f,
+ 1.0e20f, 1.0e21f, 1.0e22f, 1.0e23f, 1.0e24f, 1.0e25f, 1.0e26f, 1.0e27f, 1.0e28f, 1.0e29f,
+ 1.0e30f, 1.0e31f, 1.0e32f, 1.0e33f, 1.0e34f, 1.0e35f, 1.0e36f, 1.0e37f, 1.0e38f
+};
+
+const double double_decimal_negpower10[] = {
+ 1.0e0, 1.0e-1, 1.0e-2, 1.0e-3, 1.0e-4, 1.0e-5, 1.0e-6, 1.0e-7, 1.0e-8, 1.0e-9,
+ 1.0e-10, 1.0e-11, 1.0e-12, 1.0e-13, 1.0e-14, 1.0e-15, 1.0e-16, 1.0e-17, 1.0e-18, 1.0e-19,
+ 1.0e-20, 1.0e-21, 1.0e-22, 1.0e-23, 1.0e-24, 1.0e-25, 1.0e-26, 1.0e-27, 1.0e-28, 1.0e-29,
+ 1.0e-30, 1.0e-31, 1.0e-32, 1.0e-33, 1.0e-34, 1.0e-35, 1.0e-36, 1.0e-37, 1.0e-38, 1.0e-39,
+ 1.0e-40, 1.0e-41, 1.0e-42, 1.0e-43, 1.0e-44, 1.0e-45, 1.0e-46, 1.0e-47, 1.0e-48, 1.0e-49,
+ 1.0e-50, 1.0e-51, 1.0e-52, 1.0e-53, 1.0e-54, 1.0e-55, 1.0e-56, 1.0e-57, 1.0e-58, 1.0e-59,
+ 1.0e-60, 1.0e-61, 1.0e-62, 1.0e-63, 1.0e-64, 1.0e-65, 1.0e-66, 1.0e-67, 1.0e-68, 1.0e-69,
+ 1.0e-70, 1.0e-71, 1.0e-72, 1.0e-73, 1.0e-74, 1.0e-75, 1.0e-76, 1.0e-77, 1.0e-78, 1.0e-79,
+ 1.0e-80, 1.0e-81, 1.0e-82, 1.0e-83, 1.0e-84, 1.0e-85, 1.0e-86, 1.0e-87, 1.0e-88, 1.0e-89,
+ 1.0e-90, 1.0e-91, 1.0e-92, 1.0e-93, 1.0e-94, 1.0e-95, 1.0e-96, 1.0e-97, 1.0e-98, 1.0e-99,
+ 1.0e-100, 1.0e-101, 1.0e-102, 1.0e-103, 1.0e-104, 1.0e-105, 1.0e-106, 1.0e-107, 1.0e-108, 1.0e-109,
+ 1.0e-110, 1.0e-111, 1.0e-112, 1.0e-113, 1.0e-114, 1.0e-115, 1.0e-116, 1.0e-117, 1.0e-118, 1.0e-119,
+ 1.0e-120, 1.0e-121, 1.0e-122, 1.0e-123, 1.0e-124, 1.0e-125, 1.0e-126, 1.0e-127, 1.0e-128, 1.0e-129,
+ 1.0e-130, 1.0e-131, 1.0e-132, 1.0e-133, 1.0e-134, 1.0e-135, 1.0e-136, 1.0e-137, 1.0e-138, 1.0e-139,
+ 1.0e-140, 1.0e-141, 1.0e-142, 1.0e-143, 1.0e-144, 1.0e-145, 1.0e-146, 1.0e-147, 1.0e-148, 1.0e-149,
+ 1.0e-150, 1.0e-151, 1.0e-152, 1.0e-153, 1.0e-154, 1.0e-155, 1.0e-156, 1.0e-157, 1.0e-158, 1.0e-159,
+ 1.0e-160, 1.0e-161, 1.0e-162, 1.0e-163, 1.0e-164, 1.0e-165, 1.0e-166, 1.0e-167, 1.0e-168, 1.0e-169,
+ 1.0e-170, 1.0e-171, 1.0e-172, 1.0e-173, 1.0e-174, 1.0e-175, 1.0e-176, 1.0e-177, 1.0e-178, 1.0e-179,
+ 1.0e-180, 1.0e-181, 1.0e-182, 1.0e-183, 1.0e-184, 1.0e-185, 1.0e-186, 1.0e-187, 1.0e-188, 1.0e-189,
+ 1.0e-190, 1.0e-191, 1.0e-192, 1.0e-193, 1.0e-194, 1.0e-195, 1.0e-196, 1.0e-197, 1.0e-198, 1.0e-199,
+ 1.0e-200, 1.0e-201, 1.0e-202, 1.0e-203, 1.0e-204, 1.0e-205, 1.0e-206, 1.0e-207, 1.0e-208, 1.0e-209,
+ 1.0e-210, 1.0e-211, 1.0e-212, 1.0e-213, 1.0e-214, 1.0e-215, 1.0e-216, 1.0e-217, 1.0e-218, 1.0e-219,
+ 1.0e-220, 1.0e-221, 1.0e-222, 1.0e-223, 1.0e-224, 1.0e-225, 1.0e-226, 1.0e-227, 1.0e-228, 1.0e-229,
+ 1.0e-230, 1.0e-231, 1.0e-232, 1.0e-233, 1.0e-234, 1.0e-235, 1.0e-236, 1.0e-237, 1.0e-238, 1.0e-239,
+ 1.0e-240, 1.0e-241, 1.0e-242, 1.0e-243, 1.0e-244, 1.0e-245, 1.0e-246, 1.0e-247, 1.0e-248, 1.0e-249,
+ 1.0e-250, 1.0e-251, 1.0e-252, 1.0e-253, 1.0e-254, 1.0e-255, 1.0e-256, 1.0e-257, 1.0e-258, 1.0e-259,
+ 1.0e-260, 1.0e-261, 1.0e-262, 1.0e-263, 1.0e-264, 1.0e-265, 1.0e-266, 1.0e-267, 1.0e-268, 1.0e-269,
+ 1.0e-270, 1.0e-271, 1.0e-272, 1.0e-273, 1.0e-274, 1.0e-275, 1.0e-276, 1.0e-277, 1.0e-278, 1.0e-279,
+ 1.0e-280, 1.0e-281, 1.0e-282, 1.0e-283, 1.0e-284, 1.0e-285, 1.0e-286, 1.0e-287, 1.0e-288, 1.0e-289,
+ 1.0e-290, 1.0e-291, 1.0e-292, 1.0e-293, 1.0e-294, 1.0e-295, 1.0e-296, 1.0e-297, 1.0e-298, 1.0e-299,
+ 1.0e-300, 1.0e-301, 1.0e-302, 1.0e-303, 1.0e-304, 1.0e-305, 1.0e-306, 1.0e-307, 1.0e-308
+};
+
+const float float_decimal_negpower10[] = {
+ 1.0e0f, 1.0e-1f, 1.0e-2f, 1.0e-3f, 1.0e-4f, 1.0e-5f, 1.0e-6f, 1.0e-7f, 1.0e-8f, 1.0e-9f,
+ 1.0e-10f, 1.0e-11f, 1.0e-12f, 1.0e-13f, 1.0e-14f, 1.0e-15f, 1.0e-16f, 1.0e-17f, 1.0e-18f, 1.0e-19f,
+ 1.0e-20f, 1.0e-21f, 1.0e-22f, 1.0e-23f, 1.0e-24f, 1.0e-25f, 1.0e-26f, 1.0e-27f, 1.0e-28f, 1.0e-29f,
+ 1.0e-30f, 1.0e-31f, 1.0e-32f, 1.0e-33f, 1.0e-34f, 1.0e-35f, 1.0e-36f, 1.0e-37f, 1.0e-38f
+};
+
+#endif
+
+/* scale the number by 10^(-(floor(log10(number)) + 1)) so that the result is in range [0.1, 1) */
+
+#define ieee_double_exponent10(ieee_number) ((int)floor(log10(ieee_number.number)) + 1)
+#define ieee_float_exponent10(ieee_number) ((int)floorf(log10f(ieee_number.number)) + 1) // floorf, log10f ?
+
+#define ieee_double_exp10(ieee_number, exponent10) \
+ exponent10 = ieee_double_exponent10(ieee_number); \
+ if (exponent10 > 0) { \
+ double_negative_exp10(ieee_number.number, -exponent10); \
+ ieee_number.fraction = ieee_double_fraction(ieee_number.bits); \
+ ieee_number.exponent = ieee_double_exponent(ieee_number.bits); \
+ } else if (exponent10 < 0) { \
+ double_positive_exp10(ieee_number.number, -exponent10); \
+ ieee_number.fraction = ieee_double_fraction(ieee_number.bits); \
+ ieee_number.exponent = ieee_double_exponent(ieee_number.bits); \
+ }
+
+#define ieee_float_exp10(ieee_number, exponent10) \
+ exponent10 = ieee_float_exponent10(ieee_number); \
+ if (exponent10 > 0) { \
+ float_negative_exp10(ieee_number.number, -exponent10); \
+ ieee_number.fraction = ieee_float_fraction(ieee_number.bits); \
+ ieee_number.exponent = ieee_float_exponent(ieee_number.bits); \
+ } else if (exponent10 < 0) { \
+ float_positive_exp10(ieee_number.number, -exponent10); \
+ ieee_number.fraction = ieee_float_fraction(ieee_number.bits); \
+ ieee_number.exponent = ieee_float_exponent(ieee_number.bits); \
+ }
+
+#if BINARY_MODF
+
+/* unhide implicit bit 53, produce 56-bit denormalized fraction (binary exponent already in range [-4, -1]) */
+
+#define ieee_double_denormalize(ieee_number) \
+ (ieee_number.exponent == IEEE_DOUBLE_MIN_EXPONENT ? (++ieee_number.exponent, 0) : (ieee_number.fraction |= (1ull<<52))), \
+ ieee_number.fraction <<= (ieee_number.exponent + 4)
+
+/* unhide implicit bit 24, produce 27-bit denormalized fraction (binary exponent already in range [-4, -1]) */
+
+#define ieee_float_denormalize(ieee_number) \
+ (ieee_number.exponent == IEEE_FLOAT_MIN_EXPONENT ? (++ieee_number.exponent, 0) : (ieee_number.fraction |= (1<<23))), \
+ ieee_number.fraction <<= (ieee_number.exponent + 4)
+
+/* turn off significant bits over 56 (integer part), multiply by 10, return new integer part (subsequent decimal digit) */
+
+#define ieee_double_binary_fraction(ieee_number) \
+ (ieee_number.fraction &= ((1ull<<56) - 1), \
+ ieee_number.fraction = (ieee_number.fraction << 1) + (ieee_number.fraction << 3), \
+ ieee_number.fraction >> 56)
+
+/* turn off significant bits over 27 (integer part), multiply by 10, return the integer part (subsequent decimal digit) */
+
+#define ieee_float_binary_fraction(ieee_number) \
+ (ieee_number.fraction &= ((1<<27) - 1), \
+ ieee_number.fraction = (ieee_number.fraction << 1) + (ieee_number.fraction << 3), \
+ ieee_number.fraction >> 27)
+
+#define ieee_double_decimal(ieee_number, exponent10, digits, p) \
+ ieee_number_decimal(ieee_double_denormalize, ieee_double_binary_fraction, ieee_number, exponent10, digits, p)
+#define ieee_float_decimal(ieee_number, exponent10, digits, p) \
+ ieee_number_decimal(ieee_float_denormalize, ieee_float_binary_fraction, ieee_number, exponent10, digits, p)
+
+#else
+
+/* generic method */
+
+#define ieee_double_decimal_fraction(ieee_number, i) (ieee_number.number = modf(10*ieee_number.number, &i), i)
+#define ieee_float_decimal_fraction(ieee_number, i) (ieee_number.number = (float)modf(10*ieee_number.number, &i), i) // ???
+
+#define ieee_double_decimal(ieee_number, exponent10, digits, p) \
+ ieee_number_decimal(ieee_double_denormalize, ieee_double_decimal_fraction, ieee_number, exponent10, digits, p)
+#define ieee_float_decimal(ieee_number, exponent10, digits, p) \
+ ieee_number_decimal(ieee_float_denormalize, ieee_float_decimal_fraction, ieee_number, exponent10, digits, p)
+
+#endif
+
+#define ieee_number_decimal(denormalize, method, ieee_number, exponent10, digits, p) \
+ denormalize(ieee_number); \
+ if (ieee_number.sign) *p++ = '-'; \
+ if (exponent10 <= 0) \
+ for (*p++ = '0', *p++ = RADIX_CHAR; exponent10 && digits; *p++ = '0', ++exponent10, --digits); \
+ else \
+ { \
+ do { *p++ = '0' + (char)method(ieee_number); } while (--exponent10); \
+ *p++ = RADIX_CHAR; \
+ } \
+ for ( ; digits && ieee_number.fraction; --digits) \
+ *p++ = '0' + (char)method(ieee_number)
+
+/* rounding to nearest integer */
+
+#if BINARY_MODF
+/* check if the mantissa has the most significant bit set, means >= 0.5 */
+# define ieee_double_half(ieee_number) (ieee_number.fraction & (1ull<<55))
+# define ieee_float_half(ieee_number) (ieee_number.fraction & (1<<26))
+#else
+# define ieee_double_half(ieee_number) (ieee_number.number >= 0.5)
+# define ieee_float_half(ieee_number) (ieee_number.number >= 0.5)
+#endif
+
+/* buffer rounding helpers */
+
+#define buffer_ceil(s, p, sign) \
+ { \
+ while (*--p == '9'); \
+ if (*p != RADIX_CHAR) ++*p++; \
+ else { \
+ char *q; \
+ for (q = p - 1; ; --q) { \
+ if (*q < '9') { ++*q; break; } \
+ *q = '0'; \
+ if (q == s) \
+ *--s = '1'; \
+ else if (sign && q - 1 == s) \
+ *s = '1', *--s = '-'; \
+ } \
+ } \
+ }
+
+#define buffer_remove_trailing_zeros(s, p, sign) \
+ { \
+ while (*--p == '0'); \
+ if (*p != RADIX_CHAR) \
+ ++p; \
+ else if (!SIGNED_ZERO && sign && p - 2 == s && *(p - 1) == '0') \
+ p -= 2, *p++ = '0'; \
+ }
+
+// if the digits parameter was initially less than exponent10, then exponent10 > 0 and ieee_double_half(ieee_number) is irrelevant
+#define ieee_double_round(ieee_number, exponent10, s, p) \
+ if (exponent10 == 0 && ieee_double_half(ieee_number)) \
+ { buffer_ceil(s, p, ieee_number.sign); } \
+ else \
+ { buffer_remove_trailing_zeros(s, p, ieee_number.sign); }
+
+#define ieee_float_round(ieee_number, exponent10, s, p) \
+ if (exponent10 == 0 && ieee_float_half(ieee_number)) \
+ { buffer_ceil(s, p, ieee_number.sign); } \
+ else \
+ { buffer_remove_trailing_zeros(s, p, ieee_number.sign); }
+
+/* double to decimal */
+
+#define ieee_copy_special_string(nbuf, special, p, _p) \
+ for (p = nbuf, _p = special; ; ++p, ++_p) { \
+ if ((*p = *_p) == '\0') break; \
+ }
+
+#define ieee_copy_special_string_re(nbuf, special, p, _p, r, e) \
+ for (p = nbuf, _p = special; ; ++p, ++_p) { \
+ if ((*p = *_p) == '\0') { \
+ if (r != NULL) *r = NULL; \
+ if (e != NULL) *e = p; \
+ break; \
+ } \
+ }
+
+char * double_as_string (double number, int digits, char nbuf[MAX_NUMBER_DIGITS], size_t *psize)
+{
+ ieee_double ieee_number;
+ int exponent10;
+ char *s, *p; const char *_p;
+ s = p = nbuf + 1; // for sign/rounding
+ ieee_double_init(ieee_number, number);
+ if ((ieee_number.sign = ieee_number.bits >> 63) != 0)
+ ieee_number.number = -ieee_number.number;
+ if (ieee_double_is_zero(ieee_number)) // to avoid crash on log10(number)
+ {
+ ieee_copy_special_string(nbuf, ieee_double_zero_string(ieee_number), p, _p);
+ *psize = (size_t)(p - nbuf);
+ return nbuf;
+ }
+ if (ieee_double_special_case(ieee_number))
+ {
+ ieee_copy_special_string(nbuf, ieee_double_special_string(ieee_number), p, _p);
+ *psize = (size_t)(p - nbuf);
+ return nbuf;
+ }
+ ieee_double_exp10(ieee_number, exponent10);
+ ieee_double_decimal(ieee_number, exponent10, digits, p);
+ ieee_double_round(ieee_number, exponent10, s, p);
+ *p = '\0';
+ *psize = (size_t)(p - s);
+ return s;
+}
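An illustrative call (not part of the patch; the input is exactly representable, so the result is exact): digits bounds the fractional digits, and trailing zeros are stripped afterwards.

    char nbuf[MAX_NUMBER_DIGITS];
    size_t len;
    char *s = double_as_string(2.5, 6, nbuf, &len); /* s -> "2.5", len == 3 */
    char *z = double_as_string(0.0, 6, nbuf, &len); /* z -> "0" (SIGNED_ZERO is 0) */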
+
+/* float to decimal */
+
+char * float_as_string (float number, int digits, char nbuf[MAX_NUMBER_DIGITS], size_t *psize)
+{
+ ieee_float ieee_number;
+ int exponent10;
+ char *s, *p; const char *_p;
+ s = p = nbuf + 1; // for sign/rounding
+ ieee_float_init(ieee_number, number);
+ if ((ieee_number.sign = ieee_number.bits >> 31) != 0)
+ ieee_number.number = -ieee_number.number;
+ if (ieee_float_is_zero(ieee_number))
+ {
+ ieee_copy_special_string(nbuf, ieee_float_zero_string(ieee_number), p, _p);
+ *psize = (size_t)(p - nbuf);
+ return nbuf;
+ }
+ if (ieee_float_special_case(ieee_number))
+ {
+ ieee_copy_special_string(nbuf, ieee_float_special_string(ieee_number), p, _p);
+ *psize = (size_t)(p - nbuf);
+ return nbuf;
+ }
+ ieee_float_exp10(ieee_number, exponent10);
+ ieee_float_decimal(ieee_number, exponent10, digits, p);
+ ieee_float_round(ieee_number, exponent10, s, p);
+ *p = '\0';
+ *psize = (size_t)(p - s);
+ return s;
+}
+
+/* decimal string to double/float */
+
+#define string_scan_decimal(s, c, number) _scan_decimal(c, number, *++s)
+#define string_scan_fraction(s, c, number, exponent10) _scan_fraction(c, number, exponent10, *++s)
+#define string_scan_exponent10(s, c, exponent10) _scan_exponent10(c, exponent10, *++s)
+
+const char * string_to_double (const char *s, double *number)
+{
+ int sign, exponent10, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_decimal(s, c, *number);
+ if (c == '.')
+ {
+ c = *++s;
+ string_scan_fraction(s, c, *number, exponent10);
+ }
+ else
+ exponent10 = 0;
+ if (c == 'e' || c == 'E')
+ {
+ c = *++s;
+ string_scan_exponent10(s, c, exponent10);
+ }
+ double_exp10(*number, exponent10);
+ if (sign) *number = -*number;
+ return s;
+}
+
+const char * string_to_float (const char *s, float *number)
+{
+ int sign, exponent10, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_decimal(s, c, *number);
+ if (c == '.')
+ {
+ c = *++s;
+ string_scan_fraction(s, c, *number, exponent10);
+ }
+ else
+ exponent10 = 0;
+ if (c == 'e' || c == 'E')
+ {
+ c = *++s;
+ string_scan_exponent10(s, c, exponent10);
+ }
+ float_exp10(*number, exponent10);
+ if (sign) *number = -*number;
+ return s;
+}
+
+/* conventional form */
+
+const char * convert_to_double (const char *s, double *number)
+{
+ int sign, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_decimal(s, c, *number);
+ if (c == '.' || c == ',')
+ {
+ int exponent10;
+ c = *++s;
+ string_scan_fraction(s, c, *number, exponent10);
+ if (exponent10 < 0)
+ double_negative_exp10(*number, exponent10);
+ }
+ if (sign) *number = -*number;
+ return s;
+}
+
+const char * convert_to_float (const char *s, float *number)
+{
+ int sign, c = *s;
+ string_scan_sign(s, c, sign);
+ string_scan_decimal(s, c, *number);
+ if (c == '.' || c == ',')
+ {
+ int exponent10;
+ c = *++s;
+ string_scan_fraction(s, c, *number, exponent10);
+ if (exponent10 < 0)
+ float_negative_exp10(*number, exponent10);
+ }
+ if (sign) *number = -*number;
+ return s;
+}
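Unlike string_to_double, the conventional-form readers accept a comma as well as a dot as the radix character and take no exponent part; a minimal sketch (not part of the patch):

    double d;
    const char *rest = convert_to_double("3,14pt", &d); /* d == 3.14 (approx.), rest -> "pt" */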
+
+/* pretty common stuff */
+
+size_t bytes_to_hex_lc (const void *input, size_t size, unsigned char *output)
+{
+ size_t i;
+ const unsigned char *p;
+ for (i = 0, p = (const unsigned char *)input; i < size; ++i, ++p)
+ {
+ *output++ = base16_lc_digit1(*p);
+ *output++ = base16_lc_digit2(*p);
+ }
+ *output = '\0';
+ return 2*size + 1;
+}
+
+size_t bytes_to_hex_uc (const void *input, size_t size, unsigned char *output)
+{
+ size_t i;
+ const unsigned char *p;
+ for (i = 0, p = (const unsigned char *)input; i < size; ++i, ++p)
+ {
+ *output++ = base16_uc_digit1(*p);
+ *output++ = base16_uc_digit2(*p);
+ }
+ *output = '\0';
+ return 2*size + 1;
+}
+
+size_t hex_to_bytes (const void *input, size_t size, unsigned char *output)
+{
+ size_t i;
+ int c1, c2;
+ const unsigned char *p;
+ for (i = 1, p = (const unsigned char *)input; i < size; i += 2)
+ {
+ c1 = base16_value(*p);
+ ++p;
+ c2 = base16_value(*p);
+ ++p;
+ if (c1 >= 0 && c2 >= 0)
+ *output++ = (unsigned char)((c1<<4)|c2);
+ else
+ break;
+ }
+ return i >> 1;
+}
+
+void print_as_hex (const void *input, size_t bytes)
+{
+ const unsigned char *p;
+ for (p = (const unsigned char *)input; bytes > 0; --bytes, ++p)
+ printf("%02x", *p);
+}
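A round-trip sketch (not part of the patch): the byte-to-hex converters NUL-terminate the output and return the terminated length, while hex_to_bytes returns the number of bytes decoded.

    const unsigned char bytes[2] = { 0xDE, 0xAD };
    unsigned char hex[5], back[2];
    bytes_to_hex_lc(bytes, 2, hex); /* hex == "dead", returns 5 == 2*size + 1 */
    hex_to_bytes(hex, 4, back);     /* back == { 0xDE, 0xAD }, returns 2 */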
diff --git a/source/luametatex/source/libraries/pplib/util/utilnumber.h b/source/luametatex/source/libraries/pplib/util/utilnumber.h
new file mode 100644
index 000000000..735432b8d
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilnumber.h
@@ -0,0 +1,428 @@
+#ifndef UTIL_NUMBER_H
+#define UTIL_NUMBER_H
+
+#include <stddef.h> // for size_t
+
+#include "utilplat.h"
+#include "utildecl.h"
+
+#if defined(__cplusplus) && defined(_MSC_VER)
+// int*_t types are in standard in msvc++
+#else
+# include <stdint.h>
+#endif
+
+/* 'long' isn't long for msvc64/mingw64, we need a type for machine word */
+
+#if defined(_WIN64) || defined(__MINGW32__)
+# define INT64F "%I64d"
+# define UINT64F "%I64u"
+#else
+# define INT64F "%lld"
+# define UINT64F "%llu"
+#endif
+
+#if defined(MSVC64)
+# define INTLW_IS_INT64
+# define intlw_t int64_t
+# define uintlw_t uint64_t
+# define INTLW(N) N##I64
+# define UINTLW(N) N##UI64
+# define INTLWF INT64F
+# define UINTLWF UINT64F
+#elif defined(__MINGW64__)
+# define INTLW_IS_INT64
+# define intlw_t int64_t
+# define uintlw_t uint64_t
+# define INTLW(N) N##LL
+# define UINTLW(N) N##ULL
+# define INTLWF INT64F
+# define UINTLWF UINT64F
+#else // 32bit or sane 64bit (LP64)
+# define INTLW_IS_LONG
+# define intlw_t long
+# define uintlw_t unsigned long
+# define INTLW(N) N##L
+# define UINTLW(N) N##UL
+# define INTLWF "%ld"
+# define UINTLWF "%lu"
+#endif
+
+// ssize_t is missing in MSVC, but defining it is risky; some environments (e.g. python) typedef ssize_t in their own way..
+// #if defined(MSVC64)
+// # define ssize_t int32_t
+// #else
+// # if defined(MSVC32)
+// # define ssize_t int64_t
+// # endif
+// #endif
+
+/* basic constants */
+
+#define MAX_RADIX 36
+#define MAX_INTEGER_DIGITS 65 /* 64-bit number in binary form plus '\0' */
+#define MAX_ROMAN_DIGITS 128 /* to handle romannumeral of short int (up to 65 leading 'M') */
+#define MAX_NUMBER_DIGITS 512
+#define NUMBER_BUFFER_SIZE MAX_NUMBER_DIGITS
+
+#define base36_uc_alphabet "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#define base36_lc_alphabet "0123456789abcdefghijklmnopqrstuvwxyz"
+
+#define base26_uc_alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#define base26_lc_alphabet "abcdefghijklmnopqrstuvwxyz"
+extern const int base26_lookup[];
+
+#define base36_lc_palindrome "zyxwvutsrqponmlkjihgfedcba9876543210123456789abcdefghijklmnopqrstuvwxyz"
+#define base36_uc_palindrome "ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+extern const int base36_lookup[];
+
+#define base10_palindrome "9876543210123456789"
+#define base10_alphabet "0123456789"
+extern const int base10_lookup[];
+
+#define base16_uc_alphabet "0123456789ABCDEF"
+#define base16_lc_alphabet "0123456789abcdef"
+extern const int base16_lookup[];
+
+#define base16_uc_digit1(c) base16_uc_alphabet[(c)>>4]
+#define base16_uc_digit2(c) base16_uc_alphabet[(c)&15]
+#define base16_lc_digit1(c) base16_lc_alphabet[(c)>>4]
+#define base16_lc_digit2(c) base16_lc_alphabet[(c)&15]
+
+#define base8_digit(c) ((unsigned)(c - '0') <= (unsigned)('7' - '0'))
+#define base8_value(c) (base8_digit(c) ? (c) - '0' : -1)
+
+#define base10_digit(c) ((unsigned)(c - '0') <= (unsigned)('9' - '0'))
+#define base10_value(c) (base10_lookup[(uint8_t)(c)])
+
+#define base16_digit(c) (base16_lookup[(uint8_t)(c)] >= 0)
+#define base16_value(c) (base16_lookup[(uint8_t)(c)])
+
+#define base26_digit(c) (base26_lookup[(uint8_t)(c)] >= 0)
+#define base26_value(c) (base26_lookup[(uint8_t)(c)])
+
+#define base36_digit(c) (base36_lookup[(uint8_t)(c)] >= 0)
+#define base36_value(c) (base36_lookup[(uint8_t)(c)])
+
+//#define base_digit(c, radix) ((unsigned)(base36_lookup[c]) < (unsigned)(radix))
+//#define base_value(c, radix) (base_digit(c, radix) ? base36_lookup[c] : -1)
+
+UTILDEF extern char util_number_buffer[NUMBER_BUFFER_SIZE];
+
+/* integer from string; return a pointer to the character just past the last digit */
+
+UTILAPI const char * string_to_int32 (const char *s, int32_t *number);
+UTILAPI const char * string_to_slong (const char *s, long *number);
+UTILAPI const char * string_to_int64 (const char *s, int64_t *number);
+
+UTILAPI const char * string_to_uint32 (const char *s, uint32_t *number);
+UTILAPI const char * string_to_ulong (const char *s, unsigned long *number);
+UTILAPI const char * string_to_usize (const char *s, size_t *number);
+UTILAPI const char * string_to_uint64 (const char *s, uint64_t *number);
+
+UTILAPI const char * radix_to_int32 (const char *s, int32_t *number, int radix);
+UTILAPI const char * radix_to_slong (const char *s, long *number, int radix);
+UTILAPI const char * radix_to_int64 (const char *s, int64_t *number, int radix);
+
+UTILAPI const char * radix_to_uint32 (const char *s, uint32_t *number, int radix);
+UTILAPI const char * radix_to_ulong (const char *s, unsigned long *number, int radix);
+UTILAPI const char * radix_to_usize (const char *s, size_t *number, int radix);
+UTILAPI const char * radix_to_uint64 (const char *s, uint64_t *number, int radix);
+
+UTILAPI const char * alpha_to_uint32 (const char *s, uint32_t *number);
+UTILAPI const char * alpha_to_ulong (const char *s, unsigned long *number);
+UTILAPI const char * alpha_to_usize (const char *s, size_t *number);
+UTILAPI const char * alpha_to_uint64 (const char *s, uint64_t *number);
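+
+/*
+   Usage sketch (illustrative): the returned pointer addresses the character
+   just past the last digit consumed, so callers can continue parsing there.
+
+     int32_t n;
+     const char *p = string_to_int32("123;", &n); // n == 123, *p == ';'
+     uint32_t h;
+     radix_to_uint32("ff", &h, 16);               // h == 255
+*/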
+
+/* integer to string */
+
+UTILAPI char * int32_as_string (int32_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * slong_as_string (long number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * int64_as_string (int64_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+
+#define int32_to_string(number, psize) int32_as_string(number, util_number_buffer, psize)
+#define slong_to_string(number, psize) slong_as_string(number, util_number_buffer, psize)
+#define int64_to_string(number, psize) int64_as_string(number, util_number_buffer, psize)
+
+UTILAPI char * uint32_as_string (uint32_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * ulong_as_string (unsigned long number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * usize_as_string (size_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * uint64_as_string (uint64_t number, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+
+#define uint32_to_string(number, psize) uint32_as_string(number, util_number_buffer, psize)
+#define ulong_to_string(number, psize) ulong_as_string(number, util_number_buffer, psize)
+#define usize_to_string(number, psize) usize_as_string(number, util_number_buffer, psize)
+#define uint64_to_string(number, psize) uint64_as_string(number, util_number_buffer, psize)
+
+UTILAPI char * int32_as_radix (int32_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * slong_as_radix (long number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * int64_as_radix (int64_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+
+#define int32_to_radix(number, radix, uc, psize) int32_as_radix(number, radix, uc, util_number_buffer, psize)
+#define slong_to_radix(number, radix, uc, psize) slong_as_radix(number, radix, uc, util_number_buffer, psize)
+#define int64_to_radix(number, radix, uc, psize) int64_as_radix(number, radix, uc, util_number_buffer, psize)
+
+UTILAPI char * uint32_as_radix (uint32_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * ulong_as_radix (unsigned long number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * usize_as_radix (size_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * uint64_as_radix (uint64_t number, int radix, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+
+#define uint32_to_radix(number, radix, uc, psize) uint32_as_radix(number, radix, uc, util_number_buffer, psize)
+#define ulong_to_radix(number, radix, uc, psize) ulong_as_radix(number, radix, uc, util_number_buffer, psize)
+#define usize_to_radix(number, radix, uc, psize) usize_as_radix(number, radix, uc, util_number_buffer, psize)
+#define uint64_to_radix(number, radix, uc, psize) uint64_as_radix(number, radix, uc, util_number_buffer, psize)
+
+UTILAPI char * uint32_as_alpha (uint32_t number, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * ulong_as_alpha (unsigned long number, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * usize_as_alpha (size_t number, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+UTILAPI char * uint64_as_alpha (uint64_t number, int uc, char ibuf[MAX_INTEGER_DIGITS], size_t *psize);
+
+#define uint32_to_alpha(number, uc, psize) uint32_as_alpha(number, uc, util_number_buffer, psize)
+#define ulong_to_alpha(number, uc, psize) ulong_as_alpha(number, uc, util_number_buffer, psize)
+#define usize_to_alpha(number, uc, psize) usize_as_alpha(number, uc, util_number_buffer, psize)
+#define uint64_to_alpha(number, uc, psize) uint64_as_alpha(number, uc, util_number_buffer, psize)
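+
+/*
+   Usage sketch (illustrative): the *_to_* variants write into the shared
+   util_number_buffer, so each call presumably overwrites the previous result.
+
+     size_t size;
+     char *dec = int32_to_string(-17, &size);        // "-17", size == 3
+     char *hex = uint32_to_radix(255, 16, 1, &size); // "FF" (uc != 0 selects uppercase)
+*/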
+
+#if defined(INTLW_IS_INT64)
+
+# define string_to_intlw(s, number) string_to_int64(s, number)
+# define string_to_uintlw(s, number) string_to_uint64(s, number)
+
+# define radix_to_intlw(s, number, radix) radix_to_int64(s, number, radix)
+# define radix_to_uintlw(s, number, radix) radix_to_uint64(s, number, radix)
+
+# define alpha_to_uintlw(s, number) alpha_to_uint64(s, number)
+
+# define intlw_as_string(number, ibuf, psize) int64_as_string(number, ibuf, psize)
+# define uintlw_as_string(number, ibuf, psize) uint64_as_string(number, ibuf, psize)
+
+# define intlw_to_string(number, psize) int64_to_string(number, psize)
+# define uintlw_to_string(number, psize) uint64_to_string(number, psize)
+
+# define intlw_as_radix(number, radix, uc, ibuf, psize) int64_as_radix(number, radix, uc, ibuf, psize)
+# define uintlw_as_radix(number, radix, uc, ibuf, psize) uint64_as_radix(number, radix, uc, ibuf, psize)
+
+# define intlw_to_radix(number, radix, uc, psize) int64_to_radix(number, radix, uc, psize)
+# define uintlw_to_radix(number, radix, uc, psize) uint64_to_radix(number, radix, uc, psize)
+
+# define uintlw_as_alpha(number, uc, ibuf, psize) uint64_as_alpha(number, uc, ibuf, psize)
+# define uintlw_to_alpha(number, uc, psize) uint64_to_alpha(number, uc, psize)
+
+#elif defined(INTLW_IS_LONG)
+
+# define string_to_intlw(s, number) string_to_slong(s, number)
+# define string_to_uintlw(s, number) string_to_ulong(s, number)
+
+# define radix_to_intlw(s, number, radix) radix_to_slong(s, number, radix)
+# define radix_to_uintlw(s, number, radix) radix_to_ulong(s, number, radix)
+
+# define alpha_to_uintlw(s, number) alpha_to_ulong(s, number)
+
+# define intlw_as_string(number, ibuf, psize) slong_as_string(number, ibuf, psize)
+# define uintlw_as_string(number, ibuf, psize) ulong_as_string(number, ibuf, psize)
+
+# define intlw_to_string(number, psize) slong_to_string(number, psize)
+# define uintlw_to_string(number, psize) ulong_to_string(number, psize)
+
+# define intlw_as_radix(number, radix, uc, ibuf, psize) slong_as_radix(number, radix, uc, ibuf, psize)
+# define uintlw_as_radix(number, radix, uc, ibuf, psize) ulong_as_radix(number, radix, uc, ibuf, psize)
+
+# define intlw_to_radix(number, radix, uc, psize) slong_to_radix(number, radix, uc, psize)
+# define uintlw_to_radix(number, radix, uc, psize) ulong_to_radix(number, radix, uc, psize)
+
+# define uintlw_as_alpha(number, uc, ibuf, psize) ulong_as_alpha(number, uc, ibuf, psize)
+# define uintlw_to_alpha(number, uc, psize) ulong_to_alpha(number, uc, psize)
+
+#endif
+
+/* a..z, aa..zz, aaa..zzz (limited to uint16_t, valid for N <= buffer_size * 26) */
+
+UTILAPI const char * alphan_to_uint16 (const char *s, uint16_t *number);
+UTILAPI char * uint16_as_alphan (uint16_t number, int uc, char ibuf[], size_t size, size_t *psize);
+#define uint16_to_alphan(number, uc, psize) uint16_as_alphan(number, uc, util_number_buffer, NUMBER_BUFFER_SIZE, psize)
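+
+/*
+   Sketch (illustrative; assuming the usual bijective a..z, aa..zz numbering,
+   the exact mapping being defined in the implementation):
+
+     size_t size;
+     char *s = uint16_to_alphan(27, 0, &size); // e.g. "aa"
+*/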
+
+/* roman numeral (limited to uint16_t) */
+
+UTILAPI const char * roman_to_uint16 (const char *s, uint16_t *number);
+UTILAPI char * uint16_as_roman (uint16_t number, int uc, char ibuf[MAX_ROMAN_DIGITS], size_t *psize);
+#define uint16_to_roman(number, uc, psize) uint16_as_roman(number, uc, util_number_buffer, psize)
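+
+/*
+   Sketch (illustrative; assuming uc != 0 selects uppercase):
+
+     size_t size;
+     char *s = uint16_to_roman(1984, 1, &size); // "MCMLXXXIV"
+*/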
+
+/* double/float to string */
+
+UTILAPI char * double_as_string (double number, int digits, char nbuf[MAX_NUMBER_DIGITS], size_t *psize);
+#define double_to_string(number, digits, psize) double_as_string(number, digits, util_number_buffer, psize)
+
+UTILAPI char * float_as_string (float number, int digits, char nbuf[MAX_NUMBER_DIGITS], size_t *psize);
+#define float_to_string(number, digits, psize) float_as_string(number, digits, util_number_buffer, psize)
+
+/* string to double/float */
+
+UTILAPI const char * string_to_double (const char *s, double *number);
+UTILAPI const char * string_to_float (const char *s, float *number);
+
+/* convenience forms accepting a comma as well as a dot as the decimal separator, without exponent notation (e.g. for PDF) */
+
+UTILAPI const char * convert_to_double (const char *s, double *number);
+UTILAPI const char * convert_to_float (const char *s, float *number);
+
+/* binary data parsers helpers */
+
+#if 0 // masking gives more overactive warnings
+#define get_number_byte1(n) ((n) & 0x000000ffu)
+#define get_number_byte2(n) (((n) & 0x0000ff00u) >> 8)
+#define get_number_byte3(n) (((n) & 0x00ff0000u) >> 16)
+#define get_number_byte4(n) (((n) & 0xff000000u) >> 24)
+#define get_number_byte5(n) (((n) & 0x000000ff00000000ull) >> 32)
+#define get_number_byte6(n) (((n) & 0x0000ff0000000000ull) >> 40)
+#define get_number_byte7(n) (((n) & 0x00ff000000000000ull) >> 48)
+#define get_number_byte8(n) (((n) & 0xff00000000000000ull) >> 56)
+#else
+#define get_number_byte1(n) ((n) & 0xff)
+#define get_number_byte2(n) (((n) >> 8) & 0xff)
+#define get_number_byte3(n) (((n) >> 16) & 0xff)
+#define get_number_byte4(n) (((n) >> 24) & 0xff)
+#define get_number_byte5(n) (((n) >> 32) & 0xff)
+#define get_number_byte6(n) (((n) >> 40) & 0xff)
+#define get_number_byte7(n) (((n) >> 48) & 0xff)
+#define get_number_byte8(n) (((n) >> 56) & 0xff)
+#endif
+
+#define get_number_bytes_be1(n, b) (b[0] = (uint8_t)get_number_byte1(n))
+#define get_number_bytes_be2(n, b) (b[0] = (uint8_t)get_number_byte2(n), b[1] = (uint8_t)get_number_byte1(n))
+#define get_number_bytes_be3(n, b) (b[0] = (uint8_t)get_number_byte3(n), b[1] = (uint8_t)get_number_byte2(n), b[2] = (uint8_t)get_number_byte1(n))
+#define get_number_bytes_be4(n, b) (b[0] = (uint8_t)get_number_byte4(n), b[1] = (uint8_t)get_number_byte3(n), b[2] = (uint8_t)get_number_byte2(n), b[3] = (uint8_t)get_number_byte1(n))
+
+#define get_number_bytes_be5(n, b) (b[0] = (uint8_t)get_number_byte5(n), b[1] = (uint8_t)get_number_byte4(n), b[2] = (uint8_t)get_number_byte3(n), b[3] = (uint8_t)get_number_byte2(n), \
+ b[4] = (uint8_t)get_number_byte1(n))
+#define get_number_bytes_be6(n, b) (b[0] = (uint8_t)get_number_byte6(n), b[1] = (uint8_t)get_number_byte5(n), b[2] = (uint8_t)get_number_byte4(n), b[3] = (uint8_t)get_number_byte3(n), \
+ b[4] = (uint8_t)get_number_byte2(n), b[5] = (uint8_t)get_number_byte1(n))
+#define get_number_bytes_be7(n, b) (b[0] = (uint8_t)get_number_byte7(n), b[1] = (uint8_t)get_number_byte6(n), b[2] = (uint8_t)get_number_byte5(n), b[3] = (uint8_t)get_number_byte4(n), \
+ b[4] = (uint8_t)get_number_byte3(n), b[5] = (uint8_t)get_number_byte2(n), b[6] = (uint8_t)get_number_byte1(n))
+#define get_number_bytes_be8(n, b) (b[0] = (uint8_t)get_number_byte8(n), b[1] = (uint8_t)get_number_byte7(n), b[2] = (uint8_t)get_number_byte6(n), b[3] = (uint8_t)get_number_byte5(n), \
+ b[4] = (uint8_t)get_number_byte4(n), b[5] = (uint8_t)get_number_byte3(n), b[6] = (uint8_t)get_number_byte2(n), b[7] = (uint8_t)get_number_byte1(n))
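+
+/*
+   Worked example (illustrative): serializing 0x12345678 in big-endian order.
+
+     uint8_t b[4];
+     get_number_bytes_be4(0x12345678u, b); // b = { 0x12, 0x34, 0x56, 0x78 }
+*/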
+
+#define read_uint16be_as(s, int_type) ((int_type)((s[0]<<8)|s[1]))
+#define read_uint32be_as(s, int_type) ((int_type)((s[0]<<24)|(s[1]<<16)|(s[2]<<8)|s[3]))
+
+#define read_uint16le_as(s, int_type) ((int_type)((s[1]<<8)|s[0]))
+#define read_uint32le_as(s, int_type) ((int_type)((s[3]<<24)|(s[2]<<16)|(s[1]<<8)|s[0]))
+
+#define read_uint16_native(s) (*((uint16_t *)(s)))
+#define read_uint32_native(s) (*((uint32_t *)(s)))
+#define read_int16_native(s) (*((int16_t *)(s)))
+#define read_int32_native(s) (*((int32_t *)(s)))
+
+#define scan_uint16be_as(s, int_type) (s += 2, (int_type)((s[-2]<<8)|s[-1]))
+#define scan_uint32be_as(s, int_type) (s += 4, (int_type)((s[-4]<<24)|(s[-3]<<16)|(s[-2]<<8)|s[-1]))
+
+#define scan_uint16le_as(s, int_type) (s += 2, (int_type)((s[-1]<<8)|s[-2]))
+#define scan_uint32le_as(s, int_type) (s += 4, (int_type)((s[-1]<<24)|(s[-2]<<16)|(s[-3]<<8)|s[-4]))
+
+#define scan_uint16_native(s) (s += 2, read_uint16_native(s-2))
+#define scan_uint32_native(s) (s += 4, read_uint32_native(s-4))
+#define scan_int16_native(s) (s += 2, read_int16_native(s-2))
+#define scan_int32_native(s) (s += 4, read_int32_native(s-4))
+
+#define read_fixed16_16_as(s, float_type) (((float_type)read_uint32be_as(s, signed int))/(1<<16))
+#define read_fixed2_14_as(s, float_type) (((float_type)read_uint16be_as(s, signed short))/(1<<14))
+
+#define scan_fixed16_16_as(s, float_type) (((float_type)scan_uint32be_as(s, signed int))/(1<<16))
+#define scan_fixed2_14_as(s, float_type) (((float_type)scan_uint16be_as(s, signed short))/(1<<14))
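+
+/*
+   Worked example (illustrative): the bytes 00 01 80 00 read as a big-endian
+   16.16 fixed-point value yield 0x00018000 / 65536 = 1.5.
+
+     const uint8_t s[4] = { 0x00, 0x01, 0x80, 0x00 };
+     double v = read_fixed16_16_as(s, double); // v == 1.5
+*/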
+
+/* internal procedures */
+
+#define _scan_sign(c, sign, next) \
+ do { if (c == '-') { sign = 1; c = next; } else if (c == '+') { sign = 0; c = next; } else sign = 0; } while (0)
+
+#define integer_multiplied10(number) (((number) << 1) + ((number) << 3))
+
+#define _scan_integer(c, number, next) \
+ for (number = 0; base10_digit(c); number = integer_multiplied10(number) + (c - '0'), c = next)
+#define _scan_radix(c, number, radix, next) \
+ for (number = 0; (c = base36_value(c)) >= 0 && c < radix; number = number * radix + c, c = next)
+
+#define _read_integer(c, number, next) \
+ for (number = c - '0', c = next; base10_digit(c); number = integer_multiplied10(number) + (c - '0'), c = next)
+#define _read_radix(c, number, radix, next) \
+ for (number = c - '0', c = next; (c = base36_value(c)) >= 0 && c < radix; number = number * radix + c, c = next)
+
+/* rationals */
+
+#define _scan_decimal(c, number, next) \
+ for (number = 0; base10_digit(c); number = number*10 + (c - '0'), c = next)
+#define _scan_fraction(c, number, exponent10, next) \
+ for (exponent10 = 0; base10_digit(c); --exponent10, number = number*10 + (c - '0'), c = next)
+
+#define _scan_exponent10(c, exponent10, next) \
+ do { \
+ int eexponent10, eexpsign; \
+ _scan_sign(c, eexpsign, next); \
+ _scan_integer(c, eexponent10, next); \
+ if (eexpsign) \
+ exponent10 -= eexponent10; \
+ else \
+ exponent10 += eexponent10; \
+ } while(0)
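+
+/*
+   Internal-macro sketch (illustrative): `next` is an expression that yields
+   the following character, typically *++s.
+
+     const char *s = "-42"; int c = *s, sign, n;
+     _scan_sign(c, sign, *++s);
+     _scan_integer(c, n, *++s); // sign == 1, n == 42
+*/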
+
+#if 0
+
+// kept just for sentiment ;)
+
+extern const double double_binary_power10[];
+extern const float float_binary_power10[];
+extern const double double_binary_negpower10[];
+extern const float float_binary_negpower10[];
+
+#define double_negative_exp10(number, exponent) \
+{ const double *bp10; int e = ((exponent) < 511 ? 511 : -(exponent)); \
+ for (bp10 = double_binary_negpower10; e > 0; e >>= 1, ++bp10) \
+ if (e & 1) number *= *bp10; }
+
+#define float_negative_exp10(number, exponent) \
+{ const float *bp10; int e = ((exponent) < 64 ? 64 : -(exponent)); \
+ for (bp10 = float_binary_negpower10; e > 0; e >>= 1, ++bp10) \
+ if (e & 1) number *= *bp10; }
+
+#define double_positive_exp10(number, exponent) \
+{ const double *bp10; int e = ((exponent) > 511 ? 511 : (exponent)); \
+ for (bp10 = double_binary_power10; e > 0; e >>= 1, ++bp10) \
+ if (e & 1) number *= *bp10; }
+
+#define float_positive_exp10(number, exponent) \
+{ const float *bp10; int e = ((exponent) > 64 ? 64 : (exponent)); \
+  for (bp10 = float_binary_power10; e > 0; e >>= 1, ++bp10) \
+ if (e & 1) number *= *bp10; }
+
+#define double_exp10(number, exponent) \
+ if ((exponent) < 0) double_negative_exp10(number, exponent) else if ((exponent) > 0) double_positive_exp10(number, exponent)
+
+#define float_exp10(number, exponent) \
+ if ((exponent) < 0) float_negative_exp10(number, exponent) else if ((exponent) > 0) float_positive_exp10(number, exponent)
+
+#else
+
+extern const double double_decimal_power10[];
+extern const float float_decimal_power10[];
+extern const double double_decimal_negpower10[];
+extern const float float_decimal_negpower10[];
+
+#define double_negative_exp10(number, exponent) ((number) *= double_decimal_negpower10[(exponent) < -308 ? 308 : -(exponent)])
+#define double_positive_exp10(number, exponent) ((number) *= double_decimal_power10[(exponent) > 308 ? 308 : (exponent)])
+
+#define float_negative_exp10(number, exponent) ((number) *= float_decimal_negpower10[(exponent) < -38 ? 38 : -(exponent)])
+#define float_positive_exp10(number, exponent) ((number) *= float_decimal_power10[(exponent) > 38 ? 38 : (exponent)])
+
+#define double_exp10(number, exponent) ((void)(((exponent) < 0 && double_negative_exp10(number, exponent)) || (((exponent) > 0 && double_positive_exp10(number, exponent)))))
+#define float_exp10(number, exponent) ((void)(((exponent) < 0 && float_negative_exp10(number, exponent)) || (((exponent) > 0 && float_positive_exp10(number, exponent)))))
+
+#endif
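+
+/*
+   Worked example (illustrative): double_exp10 applies a decimal exponent by
+   table lookup, clamped to the double range.
+
+     double d = 1.5; int e = 3;
+     double_exp10(d, e); // d == 1500.0
+*/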
+
+/* pretty common stuff */
+
+#define bytes_to_hex(input, size, output) bytes_to_hex_lc(input, size, output)
+UTILAPI size_t bytes_to_hex_lc (const void *input, size_t size, uint8_t *output);
+UTILAPI size_t bytes_to_hex_uc (const void *input, size_t size, uint8_t *output);
+UTILAPI size_t hex_to_bytes (const void *input, size_t size, uint8_t *output);
+UTILAPI void print_as_hex (const void *input, size_t bytes);
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilplat.h b/source/luametatex/source/libraries/pplib/util/utilplat.h
new file mode 100644
index 000000000..8838f702b
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilplat.h
@@ -0,0 +1,31 @@
+
+#ifndef UTIL_PLAT_H
+#define UTIL_PLAT_H
+
+#if defined(_WIN32) || defined(WIN32)
+# ifdef _MSC_VER
+# if defined(_M_64) || defined(_WIN64)
+# define MSVC64
+# else
+# define MSVC32
+# endif
+# else
+# if defined(__MINGW64__)
+# define MINGW64
+# else
+# if defined(__MINGW32__)
+# define MINGW32
+# endif
+# endif
+# endif
+#endif
+
+#ifdef __GNUC__
+//# define FALLTHRU [[fallthrough]] // c++17
+//# define FALLTHRU [[gnu:fallthrough]] // c++14
+# define FALLTHRU __attribute__((fallthrough)); // C and C++03
+#else
+# define FALLTHRU
+#endif
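+
+/* Usage sketch (illustrative; prepare()/run() are placeholder names): marks
+   an intentional switch fall-through so -Wimplicit-fallthrough stays quiet.
+
+     switch (mode) {
+         case 1: prepare(); FALLTHRU
+         case 2: run(); break;
+     }
+*/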
+
+#endif \ No newline at end of file
diff --git a/source/luametatex/source/libraries/pplib/util/utilsha.c b/source/luametatex/source/libraries/pplib/util/utilsha.c
new file mode 100644
index 000000000..596bf76f7
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilsha.c
@@ -0,0 +1,1065 @@
+/* sha2 implementation excerpted from code by Aaron D. Gifford */
+
+/*
+ * AUTHOR: Aaron D. Gifford - http://www.aarongifford.com/
+ *
+ * Copyright (c) 2000-2001, Aaron D. Gifford
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTOR(S) ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTOR(S) BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: sha2.c,v 1.1 2001/11/08 00:01:51 adg Exp adg $
+ */
+
+#include <stdio.h> /* FILE */
+#include <string.h> /* memcpy()/memset() or bcopy()/bzero() */
+//#include <assert.h> /* assert() */
+#include "utilsha.h"
+
+/*
+ * UNROLLED TRANSFORM LOOP NOTE:
+ * You can define SHA2_UNROLL_TRANSFORM to use the unrolled transform
+ * loop version for the hash transform rounds (defined using macros
+ * later in this file). Either define on the command line, for example:
+ *
+ * cc -DSHA2_UNROLL_TRANSFORM -o sha2 sha2.c sha2prog.c
+ *
+ * or define below:
+ *
+ * #define SHA2_UNROLL_TRANSFORM
+ *
+ */
+
+/*** SHA-256/384/512 Machine Architecture Definitions *****************/
+/*
+ * BYTE_ORDER NOTE:
+ *
+ * Please make sure that your system defines BYTE_ORDER. If your
+ * architecture is little-endian, make sure it also defines
+ * LITTLE_ENDIAN and that the two (BYTE_ORDER and LITTLE_ENDIAN) are
+ * equivalent.
+ *
+ * If your system does not define the above, then you can do so by
+ * hand like this:
+ *
+ * #define LITTLE_ENDIAN 1234
+ * #define BIG_ENDIAN 4321
+ *
+ * And for little-endian machines, add:
+ *
+ * #define BYTE_ORDER LITTLE_ENDIAN
+ *
+ * Or for big-endian machines:
+ *
+ * #define BYTE_ORDER BIG_ENDIAN
+ *
+ * The FreeBSD machine this was written on defines BYTE_ORDER
+ * appropriately by including <sys/types.h> (which in turn includes
+ * <machine/endian.h> where the appropriate definitions are actually
+ * made).
+ */
+
+#ifndef BYTE_ORDER
+#define BYTE_ORDER LITTLE_ENDIAN
+#endif
+
+//#if !defined(BYTE_ORDER) || (BYTE_ORDER != LITTLE_ENDIAN && BYTE_ORDER != BIG_ENDIAN)
+//#error Define BYTE_ORDER to be equal to either LITTLE_ENDIAN or BIG_ENDIAN
+//#endif
+
+/*
+ * Define the following sha2_* types to types of the correct length on
+ * the native architecture. Most BSD systems and Linux define u_intXX_t
+ * types. Machines with very recent ANSI C headers can use the
+ * uintXX_t definitions from inttypes.h by defining SHA2_USE_INTTYPES_H
+ * during compile or in the sha.h header file.
+ *
+ * Machines that support neither u_intXX_t nor inttypes.h's uintXX_t
+ * will need to define these three typedefs below (and the appropriate
+ * ones in sha.h too) by hand according to their system architecture.
+ *
+ * Thank you, Jun-ichiro itojun Hagino, for suggesting using u_intXX_t
+ * types and pointing out recent ANSI C support for uintXX_t in inttypes.h.
+ *
+ * PJ: replace by uintX_t
+ */
+
+//typedef uint8_t sha2_byte; /* Exactly 1 byte */
+//typedef uint32_t sha2_word32; /* Exactly 4 bytes */
+//typedef uint64_t sha2_word64; /* Exactly 8 bytes */
+
+/*** SHA-256/384/512 Various Length Definitions ***********************/
+/* NOTE: Most of these are in header */
+#define SHA256_SHORT_BLOCK_LENGTH (SHA256_BLOCK_LENGTH - 8)
+#define SHA384_SHORT_BLOCK_LENGTH (SHA384_BLOCK_LENGTH - 16)
+#define SHA512_SHORT_BLOCK_LENGTH (SHA512_BLOCK_LENGTH - 16)
+
+
+/*** ENDIAN REVERSAL MACROS *******************************************/
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define REVERSE32(w, x) { \
+ uint32_t tmp = (w); \
+ tmp = (tmp >> 16) | (tmp << 16); \
+ (x) = ((tmp & 0xff00ff00UL) >> 8) | ((tmp & 0x00ff00ffUL) << 8); \
+}
+#define REVERSE64(w, x) { \
+ uint64_t tmp = (w); \
+ tmp = (tmp >> 32) | (tmp << 32); \
+ tmp = ((tmp & 0xff00ff00ff00ff00ULL) >> 8) | \
+ ((tmp & 0x00ff00ff00ff00ffULL) << 8); \
+ (x) = ((tmp & 0xffff0000ffff0000ULL) >> 16) | \
+ ((tmp & 0x0000ffff0000ffffULL) << 16); \
+}
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
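+
+/* Worked example (illustrative): REVERSE32 byte-swaps a word, so 0x12345678
+   becomes 0x78563412; REVERSE64 does the same for 64-bit words. */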
+
+/*
+ * Macro for incrementally adding the unsigned 64-bit integer n to the
+ * unsigned 128-bit integer (represented using a two-element array of
+ * 64-bit words):
+ */
+#define ADDINC128(w,n) { \
+ (w)[0] += (uint64_t)(n); \
+ if ((w)[0] < (n)) { \
+ (w)[1]++; \
+ } \
+}
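+
+/*
+   Worked example (illustrative): the carry propagates into the high word.
+
+     uint64_t w[2] = { UINT64_MAX, 0 };
+     ADDINC128(w, 1); // w[0] == 0, w[1] == 1
+*/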
+
+#define MEMSET_BZERO(p,l) memset((p), 0, (l))
+#define MEMCPY_BCOPY(d,s,l) memcpy((d), (s), (l))
+
+/*** THE SIX LOGICAL FUNCTIONS ****************************************/
+/*
+ * Bit shifting and rotation (used by the six SHA-XYZ logical functions):
+ *
+ * NOTE: The naming of R and S appears backwards here (R is a SHIFT and
+ * S is a ROTATION) because the SHA-256/384/512 description document
+ * (see http://csrc.nist.gov/cryptval/shs/sha256-384-512.pdf) uses this
+ * same "backwards" definition.
+ */
+/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
+#define R(b,x) ((x) >> (b))
+/* 32-bit Rotate-right (used in SHA-256): */
+#define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))
+/* 64-bit Rotate-right (used in SHA-384 and SHA-512): */
+#define S64(b,x) (((x) >> (b)) | ((x) << (64 - (b))))
+
+/* Two of six logical functions used in SHA-256, SHA-384, and SHA-512: */
+#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
+#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+/* Four of six logical functions used in SHA-256: */
+#define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
+#define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
+#define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
+#define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
+
+/* Four of six logical functions used in SHA-384 and SHA-512: */
+#define Sigma0_512(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
+#define Sigma1_512(x) (S64(14, (x)) ^ S64(18, (x)) ^ S64(41, (x)))
+#define sigma0_512(x) (S64( 1, (x)) ^ S64( 8, (x)) ^ R( 7, (x)))
+#define sigma1_512(x) (S64(19, (x)) ^ S64(61, (x)) ^ R( 6, (x)))
+
+static void sha512_last (sha512_state *state);
+static void sha256_transform (sha256_state *state, const uint32_t idata[16]);
+static void sha512_transform (sha512_state *state, const uint64_t idata[16]);
+
+/*** SHA-XYZ INITIAL HASH VALUES AND CONSTANTS ************************/
+/* Hash constant words K for SHA-256: */
+static const uint32_t K256[64] = {
+ 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
+ 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
+ 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
+ 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
+ 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
+ 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
+ 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
+ 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
+ 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
+ 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
+ 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
+ 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
+ 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
+ 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
+ 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
+ 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
+};
+
+/* Initial hash value H for SHA-256: */
+static const uint32_t sha256_initial_hash_value[8] = {
+ 0x6a09e667UL,
+ 0xbb67ae85UL,
+ 0x3c6ef372UL,
+ 0xa54ff53aUL,
+ 0x510e527fUL,
+ 0x9b05688cUL,
+ 0x1f83d9abUL,
+ 0x5be0cd19UL
+};
+
+/* Hash constant words K for SHA-384 and SHA-512: */
+static const uint64_t K512[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+ 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+ 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+ 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+ 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+ 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+ 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+ 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+ 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+ 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+ 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+ 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+ 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+ 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+ 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+ 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+ 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+ 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+ 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+ 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+ 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+ 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+ 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+ 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+ 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+ 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+ 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+/* Initial hash value H for SHA-384 */
+static const uint64_t sha384_initial_hash_value[8] = {
+ 0xcbbb9d5dc1059ed8ULL,
+ 0x629a292a367cd507ULL,
+ 0x9159015a3070dd17ULL,
+ 0x152fecd8f70e5939ULL,
+ 0x67332667ffc00b31ULL,
+ 0x8eb44a8768581511ULL,
+ 0xdb0c2e0d64f98fa7ULL,
+ 0x47b5481dbefa4fa4ULL
+};
+
+/* Initial hash value H for SHA-512 */
+static const uint64_t sha512_initial_hash_value[8] = {
+ 0x6a09e667f3bcc908ULL,
+ 0xbb67ae8584caa73bULL,
+ 0x3c6ef372fe94f82bULL,
+ 0xa54ff53a5f1d36f1ULL,
+ 0x510e527fade682d1ULL,
+ 0x9b05688c2b3e6c1fULL,
+ 0x1f83d9abfb41bd6bULL,
+ 0x5be0cd19137e2179ULL
+};
+
+/*** SHA-256: *********************************************************/
+sha256_state * sha256_digest_init (sha256_state *state)
+{
+ MEMCPY_BCOPY(state->words, sha256_initial_hash_value, SHA256_DIGEST_LENGTH);
+ MEMSET_BZERO(state->buffer, SHA256_BLOCK_LENGTH);
+ state->bitcount = 0;
+ return state;
+}
+
+#ifdef SHA2_UNROLL_TRANSFORM
+
+/* Unrolled SHA-256 round macros: */
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+
+#define ROUND256_0_TO_15(v, a, b, c, d, e, f, g, h) \
+ REVERSE32(v, W256[j]); \
+ T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + K256[j] + W256[j]; \
+ (d) += T1; \
+ (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c))
+
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+
+#define ROUND256_0_TO_15(v, a, b, c, d, e, f, g, h) \
+ T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + K256[j] + (W256[j] = v); \
+ (d) += T1; \
+ (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c))
+
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+
+#define ROUND256(a, b, c, d, e, f, g, h) \
+ s0 = W256[(j+1)&0x0f]; \
+ s0 = sigma0_256(s0); \
+ s1 = W256[(j+14)&0x0f]; \
+ s1 = sigma1_256(s1); \
+ T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + K256[j] + (W256[j&0x0f] += s1 + W256[(j+9)&0x0f] + s0); \
+ (d) += T1; \
+ (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c))
+
+static void sha256_transform (sha256_state *state, const uint32_t idata[16]) {
+ uint32_t a, b, c, d, e, f, g, h, s0, s1;
+ uint32_t T1, *W256, v;
+ int j;
+
+ W256 = state->buffer32;
+
+ /* Initialize registers with the prev. intermediate value */
+ a = state->words[0];
+ b = state->words[1];
+ c = state->words[2];
+ d = state->words[3];
+ e = state->words[4];
+ f = state->words[5];
+ g = state->words[6];
+ h = state->words[7];
+
+ j = 0;
+ do {
+ /* Rounds 0 to 15 (unrolled): */
+ v = idata[j]; ROUND256_0_TO_15(v, a, b, c, d, e, f, g, h); ++j;
+ v = idata[j]; ROUND256_0_TO_15(v, h, a, b, c, d, e, f, g); ++j;
+ v = idata[j]; ROUND256_0_TO_15(v, g, h, a, b, c, d, e, f); ++j;
+ v = idata[j]; ROUND256_0_TO_15(v, f, g, h, a, b, c, d, e); ++j;
+ v = idata[j]; ROUND256_0_TO_15(v, e, f, g, h, a, b, c, d); ++j;
+ v = idata[j]; ROUND256_0_TO_15(v, d, e, f, g, h, a, b, c); ++j;
+ v = idata[j]; ROUND256_0_TO_15(v, c, d, e, f, g, h, a, b); ++j;
+ v = idata[j]; ROUND256_0_TO_15(v, b, c, d, e, f, g, h, a); ++j;
+ } while (j < 16);
+
+ /* Now for the remaining rounds to 64: */
+ do {
+ ROUND256(a, b, c, d, e, f, g, h); ++j;
+ ROUND256(h, a, b, c, d, e, f, g); ++j;
+ ROUND256(g, h, a, b, c, d, e, f); ++j;
+ ROUND256(f, g, h, a, b, c, d, e); ++j;
+ ROUND256(e, f, g, h, a, b, c, d); ++j;
+ ROUND256(d, e, f, g, h, a, b, c); ++j;
+ ROUND256(c, d, e, f, g, h, a, b); ++j;
+ ROUND256(b, c, d, e, f, g, h, a); ++j;
+ } while (j < 64);
+
+ /* Compute the current intermediate hash value */
+ state->words[0] += a;
+ state->words[1] += b;
+ state->words[2] += c;
+ state->words[3] += d;
+ state->words[4] += e;
+ state->words[5] += f;
+ state->words[6] += g;
+ state->words[7] += h;
+}
+
+#else /* SHA2_UNROLL_TRANSFORM */
+
+static void sha256_transform (sha256_state *state, const uint32_t idata[16]) {
+ uint32_t a, b, c, d, e, f, g, h, s0, s1;
+ uint32_t T1, T2, *W256, v;
+ int j;
+
+ W256 = state->buffer32;
+
+ /* Initialize registers with the prev. intermediate value */
+ a = state->words[0];
+ b = state->words[1];
+ c = state->words[2];
+ d = state->words[3];
+ e = state->words[4];
+ f = state->words[5];
+ g = state->words[6];
+ h = state->words[7];
+
+ j = 0;
+ do {
+ v = idata[j];
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* Copy data while converting to host byte order */
+ REVERSE32(v, W256[j]);
+ /* Apply the SHA-256 compression function to update a..h */
+ T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + W256[j];
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ /* Apply the SHA-256 compression function to update a..h with copy */
+ T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + (W256[j] = v);
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+ T2 = Sigma0_256(a) + Maj(a, b, c);
+ h = g;
+ g = f;
+ f = e;
+ e = d + T1;
+ d = c;
+ c = b;
+ b = a;
+ a = T1 + T2;
+
+ j++;
+ } while (j < 16);
+
+ do {
+ /* Part of the message block expansion: */
+ s0 = W256[(j+1)&0x0f];
+ s0 = sigma0_256(s0);
+ s1 = W256[(j+14)&0x0f];
+ s1 = sigma1_256(s1);
+
+ /* Apply the SHA-256 compression function to update a..h */
+ T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + (W256[j&0x0f] += s1 + W256[(j+9)&0x0f] + s0);
+ T2 = Sigma0_256(a) + Maj(a, b, c);
+ h = g;
+ g = f;
+ f = e;
+ e = d + T1;
+ d = c;
+ c = b;
+ b = a;
+ a = T1 + T2;
+
+ j++;
+ } while (j < 64);
+
+ /* Compute the current intermediate hash value */
+ state->words[0] += a;
+ state->words[1] += b;
+ state->words[2] += c;
+ state->words[3] += d;
+ state->words[4] += e;
+ state->words[5] += f;
+ state->words[6] += g;
+ state->words[7] += h;
+}
+
+#endif /* SHA2_UNROLL_TRANSFORM */
+
+/* PJ: alignment-safe version */
+
+#define data_aligned4(data) (((data - (const uint8_t *)(0UL)) & 3) == 0)
+#define data_aligned8(data) (((data - (const uint8_t *)(0ULL)) & 7) == 0)
+
+static void sha256_transform_aligned (sha256_state *state, const uint8_t *data) {
+ if (data_aligned4(data))
+ {
+ sha256_transform(state, (const uint32_t *)((const void *)data)); // alignment ok
+ }
+ else
+ {
+ uint32_t idata[16];
+ memcpy(&idata[0], data, 16 * sizeof(uint32_t));
+ sha256_transform(state, idata);
+ }
+}
+
+void sha256_digest_add (sha256_state *state, const void *vdata, size_t len)
+{
+ unsigned int freespace, usedspace;
+ const uint8_t *data;
+
+ if (len == 0) /* Calling with no data is valid - we do nothing */
+ return;
+
+ data = (const uint8_t *)vdata;
+
+ usedspace = (state->bitcount >> 3) % SHA256_BLOCK_LENGTH;
+ if (usedspace > 0)
+ {
+ /* Calculate how much free space is available in the buffer */
+ freespace = SHA256_BLOCK_LENGTH - usedspace;
+
+ if (len >= freespace)
+ {
+ /* Fill the buffer completely and process it */
+ MEMCPY_BCOPY(&state->buffer[usedspace], data, freespace);
+ state->bitcount += freespace << 3;
+ len -= freespace;
+ data += freespace;
+ sha256_transform(state, state->buffer32);
+ }
+ else
+ {
+ /* The buffer is not yet full */
+ MEMCPY_BCOPY(&state->buffer[usedspace], data, len);
+ state->bitcount += len << 3;
+ return;
+ }
+ }
+ while (len >= SHA256_BLOCK_LENGTH)
+ {
+ /* Process as many complete blocks as we can */
+ sha256_transform_aligned(state, data);
+
+ state->bitcount += SHA256_BLOCK_LENGTH << 3;
+ len -= SHA256_BLOCK_LENGTH;
+ data += SHA256_BLOCK_LENGTH;
+ }
+ if (len > 0)
+ {
+ /* There's left-overs, so save 'em */
+ MEMCPY_BCOPY(state->buffer, data, len);
+ state->bitcount += len << 3;
+ }
+}
+
+static void digest_hex (uint8_t digest[], const void *data, size_t size, int flags);
+
+void sha256_digest_get (sha256_state *state, uint8_t digest[], int flags) {
+ unsigned int usedspace;
+
+ usedspace = (state->bitcount >> 3) % SHA256_BLOCK_LENGTH;
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* Convert FROM host byte order */
+ REVERSE64(state->bitcount,state->bitcount);
+#endif
+ if (usedspace > 0)
+ {
+ /* Begin padding with a 1 bit: */
+ state->buffer[usedspace++] = 0x80;
+
+ if (usedspace <= SHA256_SHORT_BLOCK_LENGTH) {
+ /* Set-up for the last transform: */
+ MEMSET_BZERO(&state->buffer[usedspace], SHA256_SHORT_BLOCK_LENGTH - usedspace);
+ } else {
+ if (usedspace < SHA256_BLOCK_LENGTH) {
+ MEMSET_BZERO(&state->buffer[usedspace], SHA256_BLOCK_LENGTH - usedspace);
+ }
+ /* Do second-to-last transform: */
+ sha256_transform(state, state->buffer32);
+
+ /* And set-up for the last transform: */
+ MEMSET_BZERO(state->buffer, SHA256_SHORT_BLOCK_LENGTH);
+ }
+ }
+ else
+ {
+ /* Set-up for the last transform: */
+ MEMSET_BZERO(state->buffer, SHA256_SHORT_BLOCK_LENGTH);
+
+ /* Begin padding with a 1 bit: */
+ *state->buffer = 0x80;
+ }
+ /* Set the bit count: */
+ //*(uint64_t*)&state->buffer[SHA256_SHORT_BLOCK_LENGTH] = state->bitcount; // aliasing violation warning
+ state->buffer64[SHA256_SHORT_BLOCK_LENGTH / sizeof(uint64_t)] = state->bitcount;
+
+ /* Final transform: */
+ sha256_transform(state, state->buffer32);
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ {
+ /* Convert TO host byte order */
+ int j;
+ for (j = 0; j < 8; j++)
+ {
+ REVERSE32(state->words[j], state->words[j]);
+ }
+ }
+#endif
+ if (flags & SHA_HEX)
+ digest_hex(digest, state->words, SHA256_DIGEST_LENGTH, flags);
+ else
+ memcpy(digest, state->words, SHA256_DIGEST_LENGTH);
+}
+
+/*** SHA-512: *********************************************************/
+sha512_state * sha512_digest_init (sha512_state *state)
+{
+ MEMCPY_BCOPY(state->words, sha512_initial_hash_value, SHA512_DIGEST_LENGTH);
+ MEMSET_BZERO(state->buffer, SHA512_BLOCK_LENGTH);
+ state->bitcount[0] = 0;
+ state->bitcount[1] = 0;
+ return state;
+}
+
+#ifdef SHA2_UNROLL_TRANSFORM
+
+/* PJ: ++ operations moved out of macros! */
+
+/* Unrolled SHA-512 round macros: */
+#if BYTE_ORDER == LITTLE_ENDIAN
+
+#define ROUND512_0_TO_15(v, a, b, c, d, e, f, g, h) \
+ REVERSE64(v, W512[j]); \
+ T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + K512[j] + W512[j]; \
+ (d) += T1; \
+ (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c))
+
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+
+#define ROUND512_0_TO_15(v, a, b, c, d, e, f, g, h) \
+ T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + K512[j] + (W512[j] = v); \
+ (d) += T1; \
+ (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c))
+
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+
+#define ROUND512(a, b, c, d, e, f, g, h) \
+ s0 = W512[(j+1)&0x0f]; \
+ s0 = sigma0_512(s0); \
+ s1 = W512[(j+14)&0x0f]; \
+ s1 = sigma1_512(s1); \
+ T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + K512[j] + (W512[j&0x0f] += s1 + W512[(j+9)&0x0f] + s0); \
+ (d) += T1; \
+ (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c))
+
+static void sha512_transform (sha512_state *state, const uint64_t idata[16])
+{
+ uint64_t a, b, c, d, e, f, g, h, s0, s1;
+ uint64_t T1, *W512, v;
+ int j;
+
+ W512 = state->buffer64;
+
+ /* Initialize registers with the prev. intermediate value */
+ a = state->words[0];
+ b = state->words[1];
+ c = state->words[2];
+ d = state->words[3];
+ e = state->words[4];
+ f = state->words[5];
+ g = state->words[6];
+ h = state->words[7];
+
+ j = 0;
+ do {
+ v = idata[j]; ROUND512_0_TO_15(v, a, b, c, d, e, f, g, h); ++j;
+ v = idata[j]; ROUND512_0_TO_15(v, h, a, b, c, d, e, f, g); ++j;
+ v = idata[j]; ROUND512_0_TO_15(v, g, h, a, b, c, d, e, f); ++j;
+ v = idata[j]; ROUND512_0_TO_15(v, f, g, h, a, b, c, d, e); ++j;
+ v = idata[j]; ROUND512_0_TO_15(v, e, f, g, h, a, b, c, d); ++j;
+ v = idata[j]; ROUND512_0_TO_15(v, d, e, f, g, h, a, b, c); ++j;
+ v = idata[j]; ROUND512_0_TO_15(v, c, d, e, f, g, h, a, b); ++j;
+ v = idata[j]; ROUND512_0_TO_15(v, b, c, d, e, f, g, h, a); ++j;
+ } while (j < 16);
+
+ /* Now for the remaining rounds up to 79: */
+ do {
+ ROUND512(a, b, c, d, e, f, g, h); ++j;
+ ROUND512(h, a, b, c, d, e, f, g); ++j;
+ ROUND512(g, h, a, b, c, d, e, f); ++j;
+ ROUND512(f, g, h, a, b, c, d, e); ++j;
+ ROUND512(e, f, g, h, a, b, c, d); ++j;
+ ROUND512(d, e, f, g, h, a, b, c); ++j;
+ ROUND512(c, d, e, f, g, h, a, b); ++j;
+ ROUND512(b, c, d, e, f, g, h, a); ++j;
+ } while (j < 80);
+
+ /* Compute the current intermediate hash value */
+ state->words[0] += a;
+ state->words[1] += b;
+ state->words[2] += c;
+ state->words[3] += d;
+ state->words[4] += e;
+ state->words[5] += f;
+ state->words[6] += g;
+ state->words[7] += h;
+}
+
+#else /* SHA2_UNROLL_TRANSFORM */
+
+static void sha512_transform (sha512_state *state, const uint64_t idata[16])
+{
+ uint64_t a, b, c, d, e, f, g, h, s0, s1;
+ uint64_t T1, T2, *W512, v;
+ int j;
+
+ W512 = state->buffer64;
+
+ /* Initialize registers with the prev. intermediate value */
+ a = state->words[0];
+ b = state->words[1];
+ c = state->words[2];
+ d = state->words[3];
+ e = state->words[4];
+ f = state->words[5];
+ g = state->words[6];
+ h = state->words[7];
+
+ j = 0;
+ do {
+ v = idata[j];
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* Convert TO host byte order */
+ REVERSE64(v, W512[j]);
+ /* Apply the SHA-512 compression function to update a..h */
+ T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + W512[j];
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ /* Apply the SHA-512 compression function to update a..h with copy */
+ T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + (W512[j] = v);
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+ T2 = Sigma0_512(a) + Maj(a, b, c);
+ h = g;
+ g = f;
+ f = e;
+ e = d + T1;
+ d = c;
+ c = b;
+ b = a;
+ a = T1 + T2;
+
+ j++;
+ } while (j < 16);
+
+ do {
+ /* Part of the message block expansion: */
+ s0 = W512[(j+1)&0x0f];
+ s0 = sigma0_512(s0);
+ s1 = W512[(j+14)&0x0f];
+ s1 = sigma1_512(s1);
+
+ /* Apply the SHA-512 compression function to update a..h */
+ T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + (W512[j&0x0f] += s1 + W512[(j+9)&0x0f] + s0);
+ T2 = Sigma0_512(a) + Maj(a, b, c);
+ h = g;
+ g = f;
+ f = e;
+ e = d + T1;
+ d = c;
+ c = b;
+ b = a;
+ a = T1 + T2;
+
+ j++;
+ } while (j < 80);
+
+ /* Compute the current intermediate hash value */
+ state->words[0] += a;
+ state->words[1] += b;
+ state->words[2] += c;
+ state->words[3] += d;
+ state->words[4] += e;
+ state->words[5] += f;
+ state->words[6] += g;
+ state->words[7] += h;
+}
+
+#endif /* SHA2_UNROLL_TRANSFORM */
+
+static void sha512_transform_aligned (sha512_state *state, const uint8_t *data)
+{
+ if (data_aligned8(data))
+ {
+ sha512_transform(state, (const uint64_t *)((const void *)data)); // alignment ok
+ }
+ else
+ {
+ uint64_t idata[16];
+ memcpy(&idata[0], data, 16 * sizeof(uint64_t));
+ sha512_transform(state, idata);
+ }
+}
+
+void sha512_digest_add (sha512_state *state, const void *vdata, size_t len)
+{
+ unsigned int freespace, usedspace;
+ const uint8_t *data;
+
+ if (len == 0) /* Calling with no data is valid - we do nothing */
+ return;
+
+  data = (const uint8_t *)vdata;
+
+ usedspace = (state->bitcount[0] >> 3) % SHA512_BLOCK_LENGTH;
+ if (usedspace > 0)
+ {
+ /* Calculate how much free space is available in the buffer */
+ freespace = SHA512_BLOCK_LENGTH - usedspace;
+
+ if (len >= freespace)
+ {
+ /* Fill the buffer completely and process it */
+ MEMCPY_BCOPY(&state->buffer[usedspace], data, freespace);
+ ADDINC128(state->bitcount, freespace << 3);
+ len -= freespace;
+ data += freespace;
+ sha512_transform(state, state->buffer64);
+ }
+ else
+ {
+ /* The buffer is not yet full */
+ MEMCPY_BCOPY(&state->buffer[usedspace], data, len);
+ ADDINC128(state->bitcount, len << 3);
+ return;
+ }
+ }
+ while (len >= SHA512_BLOCK_LENGTH)
+ {
+ /* Process as many complete blocks as we can */
+ sha512_transform_aligned(state, data);
+
+ ADDINC128(state->bitcount, SHA512_BLOCK_LENGTH << 3);
+ len -= SHA512_BLOCK_LENGTH;
+ data += SHA512_BLOCK_LENGTH;
+ }
+ if (len > 0)
+ {
+ /* There's left-overs, so save 'em */
+ MEMCPY_BCOPY(state->buffer, data, len);
+ ADDINC128(state->bitcount, len << 3);
+ }
+}
+
+static void sha512_last (sha512_state *state)
+{
+ unsigned int usedspace;
+
+ usedspace = (state->bitcount[0] >> 3) % SHA512_BLOCK_LENGTH;
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* Convert FROM host byte order */
+ REVERSE64(state->bitcount[0],state->bitcount[0]);
+ REVERSE64(state->bitcount[1],state->bitcount[1]);
+#endif
+ if (usedspace > 0)
+ {
+ /* Begin padding with a 1 bit: */
+ state->buffer[usedspace++] = 0x80;
+
+ if (usedspace <= SHA512_SHORT_BLOCK_LENGTH) {
+ /* Set-up for the last transform: */
+ MEMSET_BZERO(&state->buffer[usedspace], SHA512_SHORT_BLOCK_LENGTH - usedspace);
+ } else {
+ if (usedspace < SHA512_BLOCK_LENGTH) {
+ MEMSET_BZERO(&state->buffer[usedspace], SHA512_BLOCK_LENGTH - usedspace);
+ }
+ /* Do second-to-last transform: */
+ sha512_transform(state, state->buffer64);
+
+ /* And set-up for the last transform: */
+ //MEMSET_BZERO(state->buffer, SHA512_BLOCK_LENGTH - 2); // seems a typo, we overwrite last 16 bytes below
+ MEMSET_BZERO(state->buffer, SHA512_SHORT_BLOCK_LENGTH);
+ }
+ }
+ else
+ {
+ /* Prepare for final transform: */
+ MEMSET_BZERO(state->buffer, SHA512_SHORT_BLOCK_LENGTH);
+
+ /* Begin padding with a 1 bit: */
+ *state->buffer = 0x80;
+ }
+ /* Store the length of input data (in bits): */
+ //*(uint64_t*)&state->buffer[SHA512_SHORT_BLOCK_LENGTH] = state->bitcount[1]; // aliasing violation warning
+ //*(uint64_t*)&state->buffer[SHA512_SHORT_BLOCK_LENGTH+8] = state->bitcount[0];
+ state->buffer64[SHA512_SHORT_BLOCK_LENGTH / sizeof(uint64_t)] = state->bitcount[1];
+ state->buffer64[SHA512_SHORT_BLOCK_LENGTH / sizeof(uint64_t) + 1] = state->bitcount[0];
+
+ /* Final transform: */
+ sha512_transform(state, state->buffer64);
+}
+
+void sha512_digest_get (sha512_state *state, uint8_t digest[], int flags)
+{
+  /* Pad, append the message length, and run the final transform: */
+ sha512_last(state);
+
+ /* Save the hash data for output: */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ {
+ /* Convert TO host byte order */
+ int j;
+ for (j = 0; j < 8; j++)
+ {
+ REVERSE64(state->words[j], state->words[j]);
+ }
+ }
+#endif
+ if (flags & SHA_HEX)
+ digest_hex(digest, state->words, SHA512_DIGEST_LENGTH, flags);
+ else
+ memcpy(digest, state->words, SHA512_DIGEST_LENGTH);
+}
+
+/*** SHA-384: *********************************************************/
+sha384_state * sha384_digest_init (sha384_state *state)
+{
+ MEMCPY_BCOPY(state->words, sha384_initial_hash_value, SHA512_DIGEST_LENGTH);
+ MEMSET_BZERO(state->buffer, SHA384_BLOCK_LENGTH);
+ state->bitcount[0] = state->bitcount[1] = 0;
+ return state;
+}
+
+void sha384_digest_add (sha384_state *state, const void *data, size_t len)
+{
+ sha512_digest_add((sha512_state *)state, data, len);
+}
+
+void sha384_digest_get (sha384_state *state, uint8_t digest[], int flags)
+{
+ sha512_last((sha512_state *)state);
+
+ /* Save the hash data for output: */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ {
+ /* Convert TO host byte order */
+ int j;
+ for (j = 0; j < 6; j++)
+ {
+ REVERSE64(state->words[j], state->words[j]);
+ }
+ }
+#endif
+ if (flags & SHA_HEX)
+ digest_hex(digest, state->words, SHA384_DIGEST_LENGTH, flags);
+ else
+ memcpy(digest, state->words, SHA384_DIGEST_LENGTH);
+}
+
+/* hex output */
+
+static void digest_hex (uint8_t digest[], const void *data, size_t size, int flags)
+{
+ const char *alphabet;
+ const uint8_t *bytes;
+ size_t i;
+
+ bytes = (const uint8_t *)data;
+ alphabet = (flags & SHA_LCHEX) ? "0123456789abcdef" : "0123456789ABCDEF";
+ for (i = 0; i < size; ++i, ++bytes)
+ {
+ *digest++ = (uint8_t)alphabet[(*bytes) >> 4];
+ *digest++ = (uint8_t)alphabet[(*bytes) & 15];
+ }
+ *digest = 0;
+}
+
+/* string checksum */
+
+void sha256_digest (const void *data, size_t size, uint8_t digest[], int flags)
+{
+ sha256_state state;
+ sha256_digest_init(&state);
+ sha256_digest_add(&state, data, size);
+ sha256_digest_get(&state, digest, flags);
+}
+
+void sha384_digest (const void *data, size_t size, uint8_t digest[], int flags)
+{
+ sha384_state state;
+ sha384_digest_init(&state);
+ sha384_digest_add(&state, data, size);
+ sha384_digest_get(&state, digest, flags);
+}
+
+void sha512_digest (const void *data, size_t size, uint8_t digest[], int flags)
+{
+ sha512_state state;
+ sha512_digest_init(&state);
+ sha512_digest_add(&state, data, size);
+ sha512_digest_get(&state, digest, flags);
+}
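+
+/*
+   Usage sketch (illustrative), using the standard "abc" test vector:
+
+     uint8_t hex[SHA256_STRING_LENGTH];
+     sha256_digest("abc", 3, hex, SHA_LCHEX);
+     // "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
+*/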
+
+/* file checksum */
+
+#define DIGEST_BUFFER_SIZE 4096
+
+int sha256_digest_add_file (sha256_state *state, const char *filename)
+{
+ FILE *fh;
+ uint8_t buffer[DIGEST_BUFFER_SIZE];
+ size_t read;
+
+ if ((fh = fopen(filename, "rb")) == NULL)
+ return 0;
+ do {
+ read = fread(buffer, 1, DIGEST_BUFFER_SIZE, fh);
+ sha256_digest_add(state, buffer, read);
+ } while (read == DIGEST_BUFFER_SIZE);
+ fclose(fh);
+ return 1;
+}
+
+int sha256_digest_file (const char *filename, uint8_t digest[], int flags)
+{
+ sha256_state state;
+
+ sha256_digest_init(&state);
+ if (sha256_digest_add_file(&state, filename))
+ {
+ sha256_digest_get(&state, digest, flags);
+ return 1;
+ }
+ return 0;
+}
+
+int sha384_digest_add_file (sha384_state *state, const char *filename)
+{
+ FILE *fh;
+ uint8_t buffer[DIGEST_BUFFER_SIZE];
+ size_t read;
+
+ if ((fh = fopen(filename, "rb")) == NULL)
+ return 0;
+ do {
+ read = fread(buffer, 1, DIGEST_BUFFER_SIZE, fh);
+ sha384_digest_add(state, buffer, read);
+ } while (read == DIGEST_BUFFER_SIZE);
+ fclose(fh);
+ return 1;
+}
+
+int sha384_digest_file (const char *filename, uint8_t digest[], int flags)
+{
+ sha384_state state;
+
+ sha384_digest_init(&state);
+ if (sha384_digest_add_file(&state, filename))
+ {
+ sha384_digest_get(&state, digest, flags);
+ return 1;
+ }
+ return 0;
+}
+
+int sha512_digest_add_file (sha512_state *state, const char *filename)
+{
+ FILE *fh;
+ uint8_t buffer[DIGEST_BUFFER_SIZE];
+ size_t read;
+
+ if ((fh = fopen(filename, "rb")) == NULL)
+ return 0;
+ do {
+ read = fread(buffer, 1, DIGEST_BUFFER_SIZE, fh);
+ sha512_digest_add(state, buffer, read);
+ } while (read == DIGEST_BUFFER_SIZE);
+ fclose(fh);
+ return 1;
+}
+
+int sha512_digest_file (const char *filename, uint8_t digest[], int flags)
+{
+ sha512_state state;
+
+ sha512_digest_init(&state);
+ if (sha512_digest_add_file(&state, filename))
+ {
+ sha512_digest_get(&state, digest, flags);
+ return 1;
+ }
+ return 0;
+}
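+
+/*
+   Usage sketch (illustrative; "input.bin" is a placeholder file name):
+
+     uint8_t hex[SHA512_STRING_LENGTH];
+     if (sha512_digest_file("input.bin", hex, SHA_LCHEX))
+         printf("%s\n", hex);
+*/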
diff --git a/source/luametatex/source/libraries/pplib/util/utilsha.h b/source/luametatex/source/libraries/pplib/util/utilsha.h
new file mode 100644
index 000000000..6c9b1bdc9
--- /dev/null
+++ b/source/luametatex/source/libraries/pplib/util/utilsha.h
@@ -0,0 +1,79 @@
+/* sha2 implementation excerpted from code by Aaron D. Gifford */
+
+#ifndef UTIL_SHA_H
+#define UTIL_SHA_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "utildecl.h"
+
+#define SHA256_BLOCK_LENGTH 64
+#define SHA256_DIGEST_LENGTH 32
+#define SHA256_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1)
+#define SHA384_BLOCK_LENGTH 128
+#define SHA384_DIGEST_LENGTH 48
+#define SHA384_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1)
+#define SHA512_BLOCK_LENGTH 128
+#define SHA512_DIGEST_LENGTH 64
+#define SHA512_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1)
+
+//#define sha256_state sha256_state_t
+//#define sha384_state sha384_state_t
+//#define sha512_state sha512_state_t
+
+typedef struct {
+ uint32_t words[8];
+ uint64_t bitcount;
+ union {
+ uint8_t buffer[SHA256_BLOCK_LENGTH];
+ uint32_t buffer32[SHA256_BLOCK_LENGTH / sizeof(uint32_t)];
+ uint64_t buffer64[SHA256_BLOCK_LENGTH / sizeof(uint64_t)];
+ };
+} sha256_state;
+
+typedef struct {
+ uint64_t words[8];
+ uint64_t bitcount[2];
+ union {
+ uint8_t buffer[SHA512_BLOCK_LENGTH];
+ uint64_t buffer64[SHA512_BLOCK_LENGTH / sizeof(uint64_t)];
+ };
+} sha512_state;
+
+typedef sha512_state sha384_state;
+
+enum {
+ SHA_BYTES = 0,
+ SHA_UCHEX = (1<<0),
+ SHA_LCHEX = (1<<1)
+};
+
+#define SHA_DEFAULT SHA_BYTES
+#define SHA_HEX (SHA_UCHEX|SHA_LCHEX)
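+
+/* With SHA_BYTES the digest buffer receives the raw SHAxxx_DIGEST_LENGTH
+   bytes; with SHA_UCHEX or SHA_LCHEX it receives a NUL-terminated hex string
+   of SHAxxx_STRING_LENGTH characters. */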
+
+UTILAPI sha256_state * sha256_digest_init (sha256_state *state);
+UTILAPI sha384_state * sha384_digest_init (sha384_state *state);
+UTILAPI sha512_state * sha512_digest_init (sha512_state *state);
+
+UTILAPI void sha256_digest_add (sha256_state *state, const void *data, size_t size);
+UTILAPI void sha384_digest_add (sha384_state *state, const void *data, size_t size);
+UTILAPI void sha512_digest_add (sha512_state *state, const void *data, size_t size);
+
+UTILAPI void sha256_digest_get (sha256_state *state, uint8_t digest[], int flags);
+UTILAPI void sha384_digest_get (sha384_state *state, uint8_t digest[], int flags);
+UTILAPI void sha512_digest_get (sha512_state *state, uint8_t digest[], int flags);
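+
+/*
+   Streaming usage sketch (illustrative):
+
+     sha256_state state;
+     uint8_t digest[SHA256_DIGEST_LENGTH];
+     sha256_digest_init(&state);
+     sha256_digest_add(&state, "ab", 2);
+     sha256_digest_add(&state, "c", 1);
+     sha256_digest_get(&state, digest, SHA_BYTES);
+*/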
+
+UTILAPI void sha256_digest (const void *data, size_t size, uint8_t digest[], int flags);
+UTILAPI void sha384_digest (const void *data, size_t size, uint8_t digest[], int flags);
+UTILAPI void sha512_digest (const void *data, size_t size, uint8_t digest[], int flags);
+
+UTILAPI int sha256_digest_add_file (sha256_state *state, const char *filename);
+UTILAPI int sha256_digest_file (const char *filename, uint8_t digest[], int flags);
+
+UTILAPI int sha384_digest_add_file (sha384_state *state, const char *filename);
+UTILAPI int sha384_digest_file (const char *filename, uint8_t digest[], int flags);
+
+UTILAPI int sha512_digest_add_file (sha512_state *state, const char *filename);
+UTILAPI int sha512_digest_file (const char *filename, uint8_t digest[], int flags);
+
+#endif
diff --git a/source/luametatex/source/libraries/readme.txt b/source/luametatex/source/libraries/readme.txt
new file mode 100644
index 000000000..8af76f93a
--- /dev/null
+++ b/source/luametatex/source/libraries/readme.txt
@@ -0,0 +1,25 @@
+Nota bene,
+
+The currently embedded libcerf library might become an optional one as soon as we decide to provide
+it as such. It doesn't put a dent in the file size, but as it's used rarely (and mostly as a
+complement to the complex math support) that makes sense. The library was added because some users
+wanted it as a companion to the other math libraries, and because TeX is often about math it sort
+of feels okay. But it looks like there will never be support for the MSVC compiler. Mojca and I
+(Hans) adapted the sources included here to compile out of the box, but that didn't make it back
+into the original.
+
+The pplib library has a few patches with respect to memory allocation and zip compression so that
+we can hook in the minizip and mimalloc alternatives.
+
+The avl and hnj libraries are adapted to Lua(Meta)TeX and might get some more adaptations depending
+on our needs. The decnumber library that is also used in mplib is unchanged.
+
+In mimalloc we need to patch init.c: #if defined(_M_X64) || defined(_M_ARM64) to get rid of a link
+error.
+
+In decNumber.c this got added:
+
+# include "../../utilities/auxmemory.h"
+# define malloc lmt_memory_malloc
+# define free lmt_memory_free
+
+Hans \ No newline at end of file