From 62d980c99a617ff260f29ac2d3bdb084049f25b0 Mon Sep 17 00:00:00 2001
From: Hans Hagen
Date: Thu, 29 Dec 2022 15:11:22 +0100
Subject: 2022-12-29 14:31:00

---
 .../source/libraries/mimalloc/src/alloc-aligned.c  |  70 +++-
 .../libraries/mimalloc/src/alloc-override-osx.c    |  24 +-
 .../source/libraries/mimalloc/src/alloc-override.c |  54 +--
 .../source/libraries/mimalloc/src/alloc-posix.c    |  18 +-
 .../source/libraries/mimalloc/src/alloc.c          | 307 ++++++++++-------
 .../source/libraries/mimalloc/src/arena.c          |  85 ++---
 .../source/libraries/mimalloc/src/bitmap.c         |  22 +-
 .../source/libraries/mimalloc/src/heap.c           |  33 +-
 .../source/libraries/mimalloc/src/init.c           |  73 ++--
 .../source/libraries/mimalloc/src/options.c        |  45 ++-
 .../luametatex/source/libraries/mimalloc/src/os.c  | 122 ++++---
 .../source/libraries/mimalloc/src/page-queue.c     |   5 +-
 .../source/libraries/mimalloc/src/page.c           |  84 +++--
 .../source/libraries/mimalloc/src/random.c         |  57 +++-
 .../source/libraries/mimalloc/src/region.c         |  96 +++---
 .../source/libraries/mimalloc/src/segment-cache.c  |  63 +++-
 .../source/libraries/mimalloc/src/segment.c        | 374 ++++++++++++---------
 .../source/libraries/mimalloc/src/stats.c          |  92 +++--
 18 files changed, 1023 insertions(+), 601 deletions(-)
(limited to 'source/luametatex/source/libraries/mimalloc/src')

diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc-aligned.c b/source/luametatex/source/libraries/mimalloc/src/alloc-aligned.c
index 9614aa092..9fe82890f 100644
--- a/source/luametatex/source/libraries/mimalloc/src/alloc-aligned.c
+++ b/source/luametatex/source/libraries/mimalloc/src/alloc-aligned.c
@@ -18,9 +18,9 @@ terms of the MIT license. A copy of the license can be found in the file
 static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
 {
   mi_assert_internal(size <= PTRDIFF_MAX);
-  mi_assert_internal(alignment!=0 && _mi_is_power_of_two(alignment) && alignment <= MI_ALIGNMENT_MAX);
+  mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment));
 
-  const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
+  const uintptr_t align_mask = alignment - 1; // for any x, `(x & align_mask) == (x % alignment)`
   const size_t padsize = size + MI_PADDING_SIZE;
 
   // use regular allocation if it is guaranteed to fit the alignment constraints
@@ -30,26 +30,59 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t*
     return p;
   }
 
-  // otherwise over-allocate
-  const size_t oversize = size + alignment - 1;
-  void* p = _mi_heap_malloc_zero(heap, oversize, zero);
-  if (p == NULL) return NULL;
+  void* p;
+  size_t oversize;
+  if mi_unlikely(alignment > MI_ALIGNMENT_MAX) {
+    // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page)
+    // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the
+    // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down)
+    if mi_unlikely(offset != 0) {
+      // todo: cannot support offset alignment for very large alignments yet
+      #if MI_DEBUG > 0
+      _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
+      #endif
+      return NULL;
+    }
+    oversize = (size <= MI_SMALL_SIZE_MAX ?
MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); + p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block + // zero afterwards as only the area from the aligned_p may be committed! + if (p == NULL) return NULL; + } + else { + // otherwise over-allocate + oversize = size + alignment - 1; + p = _mi_heap_malloc_zero(heap, oversize, zero); + if (p == NULL) return NULL; + } // .. and align within the allocation - uintptr_t adjust = alignment - (((uintptr_t)p + offset) & align_mask); - mi_assert_internal(adjust <= alignment); - void* aligned_p = (adjust == alignment ? p : (void*)((uintptr_t)p + adjust)); - if (aligned_p != p) mi_page_set_has_aligned(_mi_ptr_page(p), true); - mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); + const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask; + const uintptr_t adjust = (poffset == 0 ? 0 : alignment - poffset); + mi_assert_internal(adjust < alignment); + void* aligned_p = (void*)((uintptr_t)p + adjust); + if (aligned_p != p) { + mi_page_set_has_aligned(_mi_ptr_page(p), true); + } + + mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size); mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p)); + mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); + mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size); + + // now zero the block if needed + if (zero && alignment > MI_ALIGNMENT_MAX) { + const ptrdiff_t diff = (uint8_t*)aligned_p - (uint8_t*)p; + const ptrdiff_t zsize = mi_page_usable_block_size(_mi_ptr_page(p)) - diff - MI_PADDING_SIZE; + if (zsize > 0) { _mi_memzero(aligned_p, zsize); } + } #if MI_TRACK_ENABLED if (p != aligned_p) { - mi_track_free(p); - mi_track_malloc(aligned_p,size,zero); + mi_track_free_size(p, oversize); + mi_track_malloc(aligned_p, size, zero); } else { - mi_track_resize(aligned_p,oversize,size); + mi_track_resize(aligned_p, oversize, size); } #endif return aligned_p; @@ -60,19 +93,21 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t { // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size. 
mi_assert(alignment > 0); - if mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see ) + if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see ) #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } + /* if mi_unlikely(alignment > MI_ALIGNMENT_MAX) { // we cannot align at a boundary larger than this (or otherwise we cannot find segment headers) #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation has a maximum alignment of %zu (size %zu, alignment %zu)\n", MI_ALIGNMENT_MAX, size, alignment); #endif return NULL; } - if mi_unlikely(size > PTRDIFF_MAX) { // we don't allocate more than PTRDIFF_MAX (see ) + */ + if mi_unlikely(size > PTRDIFF_MAX) { // we don't allocate more than PTRDIFF_MAX (see ) #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); #endif @@ -82,7 +117,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t const size_t padsize = size + MI_PADDING_SIZE; // note: cannot overflow due to earlier size > PTRDIFF_MAX check // try first if there happens to be a small block available with just the right alignment - if mi_likely(padsize <= MI_SMALL_SIZE_MAX) { + if mi_likely(padsize <= MI_SMALL_SIZE_MAX && alignment <= padsize) { mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize); const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0; if mi_likely(page->free != NULL && is_aligned) @@ -269,4 +304,3 @@ mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment); } - diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc-override-osx.c b/source/luametatex/source/libraries/mimalloc/src/alloc-override-osx.c index ba2313a2a..a2819a8bf 100644 --- a/source/luametatex/source/libraries/mimalloc/src/alloc-override-osx.c +++ b/source/luametatex/source/libraries/mimalloc/src/alloc-override-osx.c @@ -19,8 +19,8 @@ terms of the MIT license. A copy of the license can be found in the file This is done through the malloc zone interface. It seems to be most robust in combination with interposing though or otherwise we may get zone errors as there are could - be allocations done by the time we take over the - zone. + be allocations done by the time we take over the + zone. ------------------------------------------------------ */ #include @@ -215,7 +215,7 @@ static malloc_zone_t mi_malloc_zone = { .zone_name = "mimalloc", .batch_malloc = &zone_batch_malloc, .batch_free = &zone_batch_free, - .introspect = &mi_introspect, + .introspect = &mi_introspect, #if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14) .version = 10, @@ -242,7 +242,7 @@ static malloc_zone_t mi_malloc_zone = { #if defined(MI_OSX_INTERPOSE) && defined(MI_SHARED_LIB_EXPORT) // ------------------------------------------------------ -// Override malloc_xxx and malloc_zone_xxx api's to use only +// Override malloc_xxx and malloc_zone_xxx api's to use only // our mimalloc zone. 
Since even the loader uses malloc // on macOS, this ensures that all allocations go through // mimalloc (as all calls are interposed). @@ -254,7 +254,7 @@ static malloc_zone_t mi_malloc_zone = { static inline malloc_zone_t* mi_get_default_zone(void) { static bool init; - if mi_unlikely(!init) { + if mi_unlikely(!init) { init = true; malloc_zone_register(&mi_malloc_zone); // by calling register we avoid a zone error on free (see ) } @@ -272,7 +272,7 @@ static malloc_zone_t* mi_malloc_create_zone(vm_size_t size, unsigned flags) { return mi_get_default_zone(); } -static malloc_zone_t* mi_malloc_default_zone (void) { +static malloc_zone_t* mi_malloc_default_zone (void) { return mi_get_default_zone(); } @@ -292,11 +292,11 @@ static kern_return_t mi_malloc_get_all_zones (task_t task, memory_reader_t mr, v return KERN_SUCCESS; } -static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) { +static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) { return (zone == NULL ? mi_malloc_zone.zone_name : zone->zone_name); } -static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) { +static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) { MI_UNUSED(zone); MI_UNUSED(name); } @@ -306,7 +306,7 @@ static int mi_malloc_jumpstart(uintptr_t cookie) { } static void mi__malloc_fork_prepare(void) { - // nothing + // nothing } static void mi__malloc_fork_parent(void) { // nothing @@ -367,13 +367,13 @@ __attribute__((used)) static const struct mi_interpose_s _mi_zone_interposes[] MI_INTERPOSE_MI(malloc_destroy_zone), MI_INTERPOSE_MI(malloc_get_all_zones), MI_INTERPOSE_MI(malloc_get_zone_name), - MI_INTERPOSE_MI(malloc_jumpstart), + MI_INTERPOSE_MI(malloc_jumpstart), MI_INTERPOSE_MI(malloc_printf), MI_INTERPOSE_MI(malloc_set_zone_name), MI_INTERPOSE_MI(_malloc_fork_child), MI_INTERPOSE_MI(_malloc_fork_parent), MI_INTERPOSE_MI(_malloc_fork_prepare), - + MI_INTERPOSE_ZONE(zone_batch_free), MI_INTERPOSE_ZONE(zone_batch_malloc), MI_INTERPOSE_ZONE(zone_calloc), @@ -416,7 +416,7 @@ static inline malloc_zone_t* mi_get_default_zone(void) } #if defined(__clang__) -__attribute__((constructor(0))) +__attribute__((constructor(0))) #else __attribute__((constructor)) // seems not supported by g++-11 on the M1 #endif diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc-override.c b/source/luametatex/source/libraries/mimalloc/src/alloc-override.c index 6b9845d39..84a0d19df 100644 --- a/source/luametatex/source/libraries/mimalloc/src/alloc-override.c +++ b/source/luametatex/source/libraries/mimalloc/src/alloc-override.c @@ -13,7 +13,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif -#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) +#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) #if defined(__APPLE__) #include @@ -43,7 +43,7 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; #define MI_FORWARD0(fun,x) MI_FORWARD(fun) #define MI_FORWARD02(fun,x,y) MI_FORWARD(fun) #else - // otherwise use forwarding by calling our `mi_` function + // otherwise use forwarding by calling our `mi_` function #define MI_FORWARD1(fun,x) { return fun(x); } #define MI_FORWARD2(fun,x,y) { return fun(x,y); } #define MI_FORWARD3(fun,x,y,z) { return fun(x,y,z); } @@ -51,11 +51,17 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; #define MI_FORWARD02(fun,x,y) { fun(x,y); } #endif -#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE) - // define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for + +#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE) + // define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for // functions that are interposed (or the interposing does not work) #define MI_OSX_IS_INTERPOSED + mi_decl_externc static size_t mi_malloc_size_checked(void *p) { + if (!mi_is_in_heap_region(p)) return 0; + return mi_usable_size(p); + } + // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` // See: struct mi_interpose_s { @@ -64,7 +70,7 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; }; #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) - + __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = { MI_INTERPOSE_MI(malloc), @@ -76,9 +82,9 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; MI_INTERPOSE_MI(posix_memalign), MI_INTERPOSE_MI(reallocf), MI_INTERPOSE_MI(valloc), - MI_INTERPOSE_MI(malloc_size), + MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked), MI_INTERPOSE_MI(malloc_good_size), - #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 + #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 MI_INTERPOSE_MI(aligned_alloc), #endif #ifdef MI_OSX_ZONE @@ -122,11 +128,11 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; // cannot override malloc unless using a dll. // we just override new/delete which does work in a static library. 
#else - // On all other systems forward to our API + // On all other systems forward to our API mi_decl_export void* malloc(size_t size) MI_FORWARD1(mi_malloc, size) mi_decl_export void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n) mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) - mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) + mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) #endif #if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) @@ -168,20 +174,20 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; void operator delete (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast(al)); } void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast(al)); } - + void* operator new( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } void* operator new (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } #endif -#elif (defined(__GNUC__) || defined(__clang__)) +#elif (defined(__GNUC__) || defined(__clang__)) // ------------------------------------------------------ // Override by defining the mangled C++ names of the operators (as // used by GCC and CLang). 
// See // ------------------------------------------------------ - + void _ZdlPv(void* p) MI_FORWARD0(mi_free,p) // delete void _ZdaPv(void* p) MI_FORWARD0(mi_free,p) // delete[] void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) @@ -190,12 +196,12 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; void _ZdaPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } - + #if (MI_INTPTR_SIZE==8) void* _Znwm(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit void* _Znam(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } - void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } + void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } void* _ZnwmSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al) void* _ZnamSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al) void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); } @@ -204,7 +210,7 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; void* _Znwj(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit void* _Znaj(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } - void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } + void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } void* _ZnwjSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al) void* _ZnajSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al) void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); } @@ -234,29 +240,37 @@ extern "C" { // No forwarding here due to aliasing/name mangling issues void* valloc(size_t size) { return mi_valloc(size); } - void vfree(void* p) { mi_free(p); } + void vfree(void* p) { mi_free(p); } size_t malloc_good_size(size_t size) { return mi_malloc_good_size(size); } int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); } - + // `aligned_alloc` is only available when __USE_ISOC11 is defined. // Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot // override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9. // Fortunately, in the case where `aligned_alloc` is declared as `static inline` it // uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves. 
- #if __USE_ISOC11 + #if __USE_ISOC11 void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } #endif #endif // no forwarding here due to aliasing/name mangling issues -void cfree(void* p) { mi_free(p); } +void cfree(void* p) { mi_free(p); } void* pvalloc(size_t size) { return mi_pvalloc(size); } void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } int reallocarr(void* p, size_t count, size_t size) { return mi_reallocarr(p, count, size); } void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } -#if defined(__GLIBC__) && defined(__linux__) +#if defined(__wasi__) + // forward __libc interface (see PR #667) + void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc, size) + void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc, count, size) + void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc, p, size) + void __libc_free(void* p) MI_FORWARD0(mi_free, p) + void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } + +#elif defined(__GLIBC__) && defined(__linux__) // forward __libc interface (needed for glibc-based Linux distributions) void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size) void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size) diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc-posix.c b/source/luametatex/source/libraries/mimalloc/src/alloc-posix.c index 214a97345..e6505f290 100644 --- a/source/luametatex/source/libraries/mimalloc/src/alloc-posix.c +++ b/source/luametatex/source/libraries/mimalloc/src/alloc-posix.c @@ -33,12 +33,12 @@ terms of the MIT license. A copy of the license can be found in the file mi_decl_nodiscard size_t mi_malloc_size(const void* p) mi_attr_noexcept { - //if (!mi_is_in_heap_region(p)) return 0; + // if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } mi_decl_nodiscard size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { - //if (!mi_is_in_heap_region(p)) return 0; + // if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } @@ -86,12 +86,12 @@ mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size // C11 requires the size to be an integral multiple of the alignment, see . // unfortunately, it turns out quite some programs pass a size that is not an integral multiple so skip this check.. 
/* if mi_unlikely((size & (alignment - 1)) != 0) { // C11 requires alignment>0 && integral multiple, see - #if MI_DEBUG > 0 - _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment); - #endif - return NULL; - } - */ + #if MI_DEBUG > 0 + _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment); + #endif + return NULL; + } + */ // C11 also requires alignment to be a power-of-two (and > 0) which is checked in mi_malloc_aligned void* p = mi_malloc_aligned(size, alignment); mi_assert_internal(((uintptr_t)p % alignment) == 0); @@ -110,7 +110,7 @@ mi_decl_nodiscard int mi_reallocarr( void* p, size_t count, size_t size ) mi_att errno = EINVAL; return EINVAL; } - void** op = (void**)p; + void** op = (void**)p; void* newp = mi_reallocarray(*op, count, size); if mi_unlikely(newp == NULL) { return errno; } *op = newp; diff --git a/source/luametatex/source/libraries/mimalloc/src/alloc.c b/source/luametatex/source/libraries/mimalloc/src/alloc.c index 348218246..86453f152 100644 --- a/source/luametatex/source/libraries/mimalloc/src/alloc.c +++ b/source/luametatex/source/libraries/mimalloc/src/alloc.c @@ -30,7 +30,7 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { - return _mi_malloc_generic(heap, size, zero); + return _mi_malloc_generic(heap, size, zero, 0); } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); // pop from the free list @@ -38,20 +38,22 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz page->free = mi_block_next(page, block); mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); - // allow use of the block internally + // allow use of the block internally // note: when tracking we need to avoid ever touching the MI_PADDING since // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc-track.h`) mi_track_mem_undefined(block, mi_page_usable_block_size(page)); - + // zero the block? note: we need to zero the full block size (issue #63) if mi_unlikely(zero) { mi_assert_internal(page->xblock_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) const size_t zsize = (page->is_zero ? sizeof(block->next) + MI_PADDING_SIZE : page->xblock_size); - _mi_memzero_aligned(block, zsize - MI_PADDING_SIZE); + _mi_memzero_aligned(block, zsize - MI_PADDING_SIZE); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED - if (!page->is_zero && !zero) { memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page)); } + if (!page->is_zero && !zero && !mi_page_is_huge(page)) { + memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page)); + } #elif (MI_SECURE!=0) if (!zero) { block->next = 0; } // don't leak internal data #endif @@ -77,32 +79,37 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz #endif padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); padding->delta = (uint32_t)(delta); - uint8_t* fill = (uint8_t*)padding - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes - for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)padding - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes + for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } + } #endif return block; } static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { - mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + mi_assert(heap != NULL); + #if MI_DEBUG + const uintptr_t tid = _mi_thread_id(); + mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local + #endif mi_assert(size <= MI_SMALL_SIZE_MAX); - #if (MI_PADDING) +#if (MI_PADDING) if (size == 0) { size = sizeof(void*); } - #endif - mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE); +#endif + mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero); - mi_assert_internal(p==NULL || mi_usable_size(p) >= size); - #if MI_STAT>1 + mi_assert_internal(p == NULL || mi_usable_size(p) >= size); +#if MI_STAT>1 if (p != NULL) { if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); } - #endif +#endif mi_track_malloc(p,size,zero); return p; } @@ -117,14 +124,15 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t si } // The main allocation function -extern inline void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { +extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { if mi_likely(size <= MI_SMALL_SIZE_MAX) { + mi_assert_internal(huge_alignment == 0); return mi_heap_malloc_small_zero(heap, size, zero); } else { mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local - void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero); // note: size can overflow but it is detected in malloc_generic + mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic mi_assert_internal(p == NULL || mi_usable_size(p) >= size); #if MI_STAT>1 if (p != NULL) { @@ -137,6 +145,10 @@ extern inline void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero } } +extern inline void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { + return _mi_heap_malloc_zero_ex(heap, size, zero, 0); +} + mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { return _mi_heap_malloc_zero(heap, size, false); } @@ -220,9 +232,9 @@ static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* bloc mi_track_mem_defined(padding,sizeof(mi_padding_t)); *delta = padding->delta; uint32_t canary = padding->canary; - uintptr_t keys[2]; + uintptr_t keys[2]; keys[0] = page->keys[0]; - keys[1] = page->keys[1]; + keys[1] = page->keys[1]; bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); 
mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); return ok; @@ -245,17 +257,19 @@ static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, si if (!ok) return false; mi_assert_internal(bsize >= delta); *size = bsize - delta; - uint8_t* fill = (uint8_t*)block + bsize - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes - mi_track_mem_defined(fill,maxpad); - for (size_t i = 0; i < maxpad; i++) { - if (fill[i] != MI_DEBUG_PADDING) { - *wrong = bsize - delta + i; - ok = false; - break; + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)block + bsize - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes + mi_track_mem_defined(fill, maxpad); + for (size_t i = 0; i < maxpad; i++) { + if (fill[i] != MI_DEBUG_PADDING) { + *wrong = bsize - delta + i; + ok = false; + break; + } } + mi_track_mem_noaccess(fill, maxpad); } - mi_track_mem_noaccess(fill,maxpad); return ok; } @@ -325,7 +339,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { } else { mi_heap_stat_decrease(heap, huge, bsize); - } + } } #else static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { @@ -333,6 +347,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { } #endif +#if MI_HUGE_PAGE_ABANDON #if (MI_STAT>0) // maintain stats for huge objects static void mi_stat_huge_free(const mi_page_t* page) { @@ -350,29 +365,41 @@ static void mi_stat_huge_free(const mi_page_t* page) { MI_UNUSED(page); } #endif +#endif // ------------------------------------------------------ // Free // ------------------------------------------------------ -// multi-threaded free (or free in huge block) +// multi-threaded free (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) { // The padding check may access the non-thread-owned page for the key values. // that is safe as these are constant and the page won't be freed (as the block is not freed yet). mi_check_padding(page, block); - mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif - + mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection + // huge page segments are always abandoned and can be freed immediately mi_segment_t* segment = _mi_page_segment(page); - if (segment->kind==MI_SEGMENT_HUGE) { + if (segment->kind == MI_SEGMENT_HUGE) { + #if MI_HUGE_PAGE_ABANDON + // huge page segments are always abandoned and can be freed immediately mi_stat_huge_free(page); _mi_segment_huge_page_free(segment, page, block); return; + #else + // huge pages are special as they occupy the entire segment + // as these are large we reset the memory occupied by the page so it is available to other threads + // (as the owning thread needs to actually free the memory later). 
+ _mi_segment_huge_page_reset(segment, page, block); + #endif + } + + #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED // note: when tracking, cannot use mi_usable_size with multi-threading + if (segment->kind != MI_SEGMENT_HUGE) { // not for huge segments as we just reset the content + memset(block, MI_DEBUG_FREED, mi_usable_size(block)); } + #endif // Try to put the block on either the page-local thread free list, or the heap delayed free list. mi_thread_free_t tfreex; @@ -423,7 +450,9 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + if (!mi_page_is_huge(page)) { // huge page content may be already decommitted + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + } #endif mi_block_set_next(page, block, page->local_free); page->local_free = block; @@ -450,20 +479,21 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p } -static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) mi_attr_noexcept { - mi_page_t* const page = _mi_segment_page_of(segment, p); +void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); mi_stat_free(page, block); // stat_free may access the padding mi_track_free(p); - _mi_free_block(page, local, block); + _mi_free_block(page, is_local, block); } // Get the segment data belonging to a pointer // This is just a single `and` in assembly but does further checks in debug mode // (and secure mode) if this was a valid pointer. 
-static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) +static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) { MI_UNUSED(msg); + mi_assert(p != NULL); + #if (MI_DEBUG>0) if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); @@ -472,14 +502,20 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms #endif mi_segment_t* const segment = _mi_ptr_segment(p); - if mi_unlikely(segment == NULL) return NULL; // checks also for (p==NULL) + mi_assert_internal(segment != NULL); #if (MI_DEBUG>0) if mi_unlikely(!mi_is_in_heap_region(p)) { - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); + #if (MI_INTPTR_SIZE == 8 && defined(__linux__)) + if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640) + #else + { + #endif + _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" + "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); + if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { + _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); + } } } #endif @@ -489,38 +525,44 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms return NULL; } #endif + return segment; } -// Free a block +// Free a block +// fast path written carefully to prevent spilling on the stack void mi_free(void* p) mi_attr_noexcept { + if mi_unlikely(p == NULL) return; mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - if mi_unlikely(segment == NULL) return; - - mi_threadid_t tid = _mi_thread_id(); - mi_page_t* const page = _mi_segment_page_of(segment, p); - - if mi_likely(tid == mi_atomic_load_relaxed(&segment->thread_id) && page->flags.full_aligned == 0) { // the thread id matches and it is not a full page, nor has aligned blocks - // local, and not full or aligned - mi_block_t* block = (mi_block_t*)(p); - if mi_unlikely(mi_check_is_double_free(page,block)) return; - mi_check_padding(page, block); - mi_stat_free(page, block); - #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_track_free(p); - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) - _mi_page_retire(page); + const bool is_local= (_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_page_t* const page = _mi_segment_page_of(segment, p); + + if mi_likely(is_local) { // thread-local free? 
+ if mi_likely(page->flags.full_aligned == 0) // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + { + mi_block_t* const block = (mi_block_t*)p; + if mi_unlikely(mi_check_is_double_free(page, block)) return; + mi_check_padding(page, block); + mi_stat_free(page, block); + #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + #endif + mi_track_free(p); + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) + _mi_page_retire(page); + } + } + else { + // page is full or contains (inner) aligned blocks; use generic path + _mi_free_generic(segment, page, true, p); } } else { - // non-local, aligned blocks, or a full page; use the more generic path - // note: recalc page in generic to improve code generation - mi_free_generic(segment, tid == segment->thread_id, p); + // not thread-local; use generic path + _mi_free_generic(segment, page, false, p); } } @@ -559,9 +601,9 @@ mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { + if (p == NULL) return 0; const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); - if (segment==NULL) return 0; // also returns 0 if `p == NULL` - const mi_page_t* const page = _mi_segment_page_of(segment, p); + const mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(!mi_page_has_aligned(page)) { const mi_block_t* block = (const mi_block_t*)p; return mi_page_usable_size_of(page, block); @@ -577,24 +619,6 @@ mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { } -// ------------------------------------------------------ -// ensure explicit external inline definitions are emitted! -// ------------------------------------------------------ - -#ifdef __cplusplus -void* _mi_externs[] = { - (void*)&_mi_page_malloc, - (void*)&_mi_heap_malloc_zero, - (void*)&mi_malloc, - (void*)&mi_malloc_small, - (void*)&mi_zalloc_small, - (void*)&mi_heap_malloc, - (void*)&mi_heap_zalloc, - (void*)&mi_heap_malloc_small -}; -#endif - - // ------------------------------------------------------ // Allocation extensions // ------------------------------------------------------ @@ -641,7 +665,7 @@ mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) m // Expand (or shrink) in place (or fail) void* mi_expand(void* p, size_t newsize) mi_attr_noexcept { #if MI_PADDING - // we do not shrink/expand with padding enabled + // we do not shrink/expand with padding enabled MI_UNUSED(p); MI_UNUSED(newsize); return NULL; #else @@ -659,7 +683,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0) if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) { // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0) // todo: adjust potential padding to reflect the new size? 
- mi_track_free(p); + mi_track_free_size(p, size); mi_track_malloc(p,newsize,true); return p; // reallocation still fits and not more than 50% waste } @@ -683,7 +707,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) } mi_decl_nodiscard void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { - return _mi_heap_realloc_zero(heap, p, newsize, false); + return _mi_heap_realloc_zero(heap, p, newsize, false); } mi_decl_nodiscard void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { @@ -743,7 +767,9 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const c if (s == NULL) return NULL; size_t n = strlen(s); char* t = (char*)mi_heap_malloc(heap,n+1); - if (t != NULL) _mi_memcpy(t, s, n + 1); + if (t == NULL) return NULL; + _mi_memcpy(t, s, n); + t[n] = 0; return t; } @@ -793,6 +819,7 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const } } #else +/* #include // pathconf static size_t mi_path_max(void) { static size_t path_max = 0; @@ -804,20 +831,31 @@ static size_t mi_path_max(void) { } return path_max; } - +*/ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { if (resolved_name != NULL) { return realpath(fname,resolved_name); } else { - size_t n = mi_path_max(); + char* rname = realpath(fname, NULL); + if (rname == NULL) return NULL; + char* result = mi_heap_strdup(heap, rname); + free(rname); // use regular free! (which may be redirected to our free but that's ok) + return result; + } + /* + const size_t n = mi_path_max(); char* buf = (char*)mi_malloc(n+1); - if (buf==NULL) return NULL; + if (buf == NULL) { + errno = ENOMEM; + return NULL; + } char* rname = realpath(fname,buf); char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL` mi_free(buf); return result; } + */ } #endif @@ -843,9 +881,9 @@ static bool mi_try_new_handler(bool nothrow) { #else std::new_handler h = std::set_new_handler(); std::set_new_handler(h); - #endif + #endif if (h==NULL) { - _mi_error_message(ENOMEM, "out of memory in 'new'"); + _mi_error_message(ENOMEM, "out of memory in 'new'"); if (!nothrow) { throw std::bad_alloc(); } @@ -876,7 +914,7 @@ static std_new_handler_t mi_get_new_handler() { static bool mi_try_new_handler(bool nothrow) { std_new_handler_t h = mi_get_new_handler(); if (h==NULL) { - _mi_error_message(ENOMEM, "out of memory in 'new'"); + _mi_error_message(ENOMEM, "out of memory in 'new'"); if (!nothrow) { abort(); // cannot throw in plain C, use abort } @@ -889,20 +927,46 @@ static bool mi_try_new_handler(bool nothrow) { } #endif -static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow ) { +static mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow ) { void* p = NULL; while(p == NULL && mi_try_new_handler(nothrow)) { - p = mi_malloc(size); + p = mi_heap_malloc(heap,size); } return p; } -mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) { - void* p = mi_malloc(size); - if mi_unlikely(p == NULL) return mi_try_new(size,false); +static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) { + return mi_heap_try_new(mi_get_default_heap(), size, nothrow); +} + + +mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) { + void* p = mi_heap_malloc(heap,size); + if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false); return p; } +mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t 
size) { + return mi_heap_alloc_new(mi_get_default_heap(), size); +} + + +mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) { + size_t total; + if mi_unlikely(mi_count_size_overflow(count, size, &total)) { + mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc + return NULL; + } + else { + return mi_heap_alloc_new(heap,total); + } +} + +mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) { + return mi_heap_alloc_new_n(mi_get_default_heap(), size, count); +} + + mi_decl_nodiscard mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept { void* p = mi_malloc(size); if mi_unlikely(p == NULL) return mi_try_new(size, true); @@ -927,17 +991,6 @@ mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, siz return p; } -mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) { - size_t total; - if mi_unlikely(mi_count_size_overflow(count, size, &total)) { - mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc - return NULL; - } - else { - return mi_new(total); - } -} - mi_decl_nodiscard void* mi_new_realloc(void* p, size_t newsize) { void* q; do { @@ -956,3 +1009,23 @@ mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) { return mi_new_realloc(p, total); } } + +// ------------------------------------------------------ +// ensure explicit external inline definitions are emitted! +// ------------------------------------------------------ + +#ifdef __cplusplus +void* _mi_externs[] = { + (void*)&_mi_page_malloc, + (void*)&_mi_heap_malloc_zero, + (void*)&_mi_heap_malloc_zero_ex, + (void*)&mi_malloc, + (void*)&mi_malloc_small, + (void*)&mi_zalloc_small, + (void*)&mi_heap_malloc, + (void*)&mi_heap_zalloc, + (void*)&mi_heap_malloc_small, + (void*)&mi_heap_alloc_new, + (void*)&mi_heap_alloc_new_n +}; +#endif diff --git a/source/luametatex/source/libraries/mimalloc/src/arena.c b/source/luametatex/source/libraries/mimalloc/src/arena.c index 56b87d083..80dd47869 100644 --- a/source/luametatex/source/libraries/mimalloc/src/arena.c +++ b/source/luametatex/source/libraries/mimalloc/src/arena.c @@ -48,8 +48,8 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); // Block info: bit 0 contains the `in_use` bit, the upper bits the // size in count of arena blocks. 
typedef uintptr_t mi_block_info_t; -#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 8MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 4MiB +#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB #define MI_MAX_ARENAS (64) // not more than 126 (since we use 7 bits in the memid and an arena index + 1) // A memory arena descriptor @@ -190,22 +190,23 @@ static mi_decl_noinline void* mi_arena_alloc_from(mi_arena_t* arena, size_t aren return p; } -static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size_t alignment, bool* commit, bool* large, - bool* is_pinned, bool* is_zero, - mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld) -{ +// allocate from an arena with fallback to the OS +static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size_t alignment, bool* commit, bool* large, + bool* is_pinned, bool* is_zero, + mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld ) +{ MI_UNUSED_RELEASE(alignment); mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); const size_t bcount = mi_block_count_of_size(size); if mi_likely(max_arena == 0) return NULL; - mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); + mi_assert_internal(size <= bcount * MI_ARENA_BLOCK_SIZE); size_t arena_index = mi_arena_id_index(req_arena_id); if (arena_index < MI_MAX_ARENAS) { // try a specific arena if requested mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[arena_index]); - if (arena != NULL && + if ((arena != NULL) && (arena->numa_node < 0 || arena->numa_node == numa_node) && // numa local? (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages { @@ -215,28 +216,28 @@ static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size } } else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena==NULL) break; // end reached - if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local? - (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages - { + // try numa affine allocation + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; // end reached + if ((arena->numa_node < 0 || arena->numa_node == numa_node) && // numa local? + (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages + { void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld); - mi_assert_internal((uintptr_t)p % alignment == 0); + mi_assert_internal((uintptr_t)p % alignment == 0); if (p != NULL) return p; + } } - } - // try from another numa node instead.. - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena==NULL) break; // end reached - if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local! - (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages - { + // try from another numa node instead.. 
+ for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; // end reached + if ((arena->numa_node >= 0 && arena->numa_node != numa_node) && // not numa local! + (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages + { void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld); - mi_assert_internal((uintptr_t)p % alignment == 0); + mi_assert_internal((uintptr_t)p % alignment == 0); if (p != NULL) return p; } } @@ -244,8 +245,7 @@ static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size return NULL; } - -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld) { mi_assert_internal(commit != NULL && is_pinned != NULL && is_zero != NULL && memid != NULL && tld != NULL); @@ -255,11 +255,11 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* *is_pinned = false; bool default_large = false; - if (large==NULL) large = &default_large; // ensure `large != NULL` + if (large == NULL) large = &default_large; // ensure `large != NULL` const int numa_node = _mi_os_numa_node(tld); // current numa node // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN) { + if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { void* p = mi_arena_allocate(numa_node, size, alignment, commit, large, is_pinned, is_zero, req_arena_id, memid, tld); if (p != NULL) return p; } @@ -270,15 +270,15 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* return NULL; } *is_zero = true; - *memid = MI_MEMID_OS; - void* p = _mi_os_alloc_aligned(size, alignment, *commit, large, tld->stats); - if (p != NULL) *is_pinned = *large; + *memid = MI_MEMID_OS; + void* p = _mi_os_alloc_aligned_offset(size, alignment, align_offset, *commit, large, tld->stats); + if (p != NULL) { *is_pinned = *large; } return p; } void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld) { - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, commit, large, is_pinned, is_zero, req_arena_id, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, large, is_pinned, is_zero, req_arena_id, memid, tld); } void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { @@ -295,17 +295,18 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { Arena free ----------------------------------------------------------- */ -void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_os_tld_t* tld) { - mi_assert_internal(size > 0 && tld->stats != NULL); +void _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); if (p==NULL) return; if (size==0) return; if (memid == MI_MEMID_OS) { // was a direct OS allocation, pass through - _mi_os_free_ex(p, size, all_committed, tld->stats); + _mi_os_free_aligned(p, size, alignment, 
align_offset, all_committed, stats); } else { // allocated in an arena + mi_assert_internal(align_offset == 0); size_t arena_idx; size_t bitmap_idx; mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); @@ -329,10 +330,10 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_o } else { mi_assert_internal(arena->blocks_committed != NULL); - _mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, tld->stats); // ok if this fails + _mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, stats); // ok if this fails _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); } - // and make it available to others again + // and make it available to others again bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size); @@ -371,8 +372,8 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is mi_assert_internal(is_committed); is_committed = true; } - - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; + + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); const size_t bitmaps = (is_committed ? 2 : 3); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); @@ -410,7 +411,7 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is } // Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block diff --git a/source/luametatex/source/libraries/mimalloc/src/bitmap.c b/source/luametatex/source/libraries/mimalloc/src/bitmap.c index 4fc7a1f3d..4ea9f4afa 100644 --- a/source/luametatex/source/libraries/mimalloc/src/bitmap.c +++ b/source/luametatex/source/libraries/mimalloc/src/bitmap.c @@ -188,15 +188,15 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // between the fields. This is used in arena allocation //-------------------------------------------------------------------------- -// Try to atomically claim a sequence of `count` bits starting from the field +// Try to atomically claim a sequence of `count` bits starting from the field // at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. 
static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(bitmap_idx != NULL); - + // check initial trailing zeros mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); + size_t map = mi_atomic_load_relaxed(field); const size_t initial = mi_clz(map); // count of initial zeros starting at idx mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); if (initial == 0) return false; @@ -231,14 +231,14 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit newmap = map | initial_mask; if ((map & initial_mask) != 0) { goto rollback; }; } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - + // intermediate fields while (++field < final_field) { newmap = MI_BITMAP_FIELD_FULL; map = 0; if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } } - + // final field mi_assert_internal(field == final_field); map = mi_atomic_load_relaxed(field); @@ -251,7 +251,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit *bitmap_idx = mi_bitmap_index_create(idx, MI_BITMAP_FIELD_BITS - initial); return true; -rollback: +rollback: // roll back intermediate fields while (--field > initial_field) { newmap = 0; @@ -265,7 +265,7 @@ rollback: mi_assert_internal((map & initial_mask) == initial_mask); newmap = map & ~initial_mask; } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - } + } // retry? (we make a recursive call instead of goto to be able to use const declarations) if (retries < 4) { return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx); @@ -330,7 +330,7 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t size_t pre_mask; size_t mid_mask; size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); + size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); bool all_one = true; mi_bitmap_field_t* field = &bitmap[idx]; size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); @@ -343,7 +343,7 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t prev = mi_atomic_and_acq_rel(field, ~post_mask); if ((prev & post_mask) != post_mask) all_one = false; } - return all_one; + return all_one; } // Set `count` bits at `bitmap_idx` to 1 atomically @@ -375,7 +375,7 @@ bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t co } -// Returns `true` if all `count` bits were 1. +// Returns `true` if all `count` bits were 1. // `any_ones` is `true` if there was at least one bit set to one. 
static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { size_t idx = mi_bitmap_index_field(bitmap_idx); @@ -398,7 +398,7 @@ static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_field prev = mi_atomic_load_relaxed(field); if ((prev & post_mask) != post_mask) all_ones = false; if ((prev & post_mask) != 0) any_ones = true; - } + } if (pany_ones != NULL) *pany_ones = any_ones; return all_ones; } diff --git a/source/luametatex/source/libraries/mimalloc/src/heap.c b/source/luametatex/source/libraries/mimalloc/src/heap.c index 15ca36031..ac2d042bf 100644 --- a/source/luametatex/source/libraries/mimalloc/src/heap.c +++ b/source/luametatex/source/libraries/mimalloc/src/heap.c @@ -92,7 +92,7 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t mi_collect_t collect = *((mi_collect_t*)arg_collect); _mi_page_free_collect(page, collect >= MI_FORCE); if (mi_page_all_free(page)) { - // no more used blocks, free the page. + // no more used blocks, free the page. // note: this will free retired pages as well. _mi_page_free(page, pq, collect >= MI_FORCE); } @@ -133,7 +133,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // if all memory is freed by now, all segments should be freed. _mi_abandoned_reclaim_all(heap, &heap->tld->segments); } - + // if abandoning, mark all pages to no longer add to delayed_free if (collect == MI_ABANDON) { mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL); @@ -260,7 +260,7 @@ static void mi_heap_free(mi_heap_t* heap) { // remove ourselves from the thread local heaps list // linear search but we expect the number of heaps to be relatively small mi_heap_t* prev = NULL; - mi_heap_t* curr = heap->tld->heaps; + mi_heap_t* curr = heap->tld->heaps; while (curr != heap && curr != NULL) { prev = curr; curr = curr->next; @@ -347,7 +347,20 @@ void mi_heap_destroy(mi_heap_t* heap) { } } - +void _mi_heap_destroy_all(void) { + mi_heap_t* bheap = mi_heap_get_backing(); + mi_heap_t* curr = bheap->tld->heaps; + while (curr != NULL) { + mi_heap_t* next = curr->next; + if (curr->no_reclaim) { + mi_heap_destroy(curr); + } + else { + _mi_heap_destroy_pages(curr); + } + curr = next; + } +} /* ----------------------------------------------------------- Safe Heap delete @@ -360,8 +373,8 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { // reduce the size of the delayed frees _mi_heap_delayed_free_partial(from); - - // transfer all pages by appending the queues; this will set a new heap field + + // transfer all pages by appending the queues; this will set a new heap field // so threads may do delayed frees in either heap for a while. // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state // so after this only the new heap will get delayed frees @@ -374,17 +387,17 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { } mi_assert_internal(from->page_count == 0); - // and do outstanding delayed frees in the `from` heap + // and do outstanding delayed frees in the `from` heap // note: be careful here as the `heap` field in all those pages no longer point to `from`, - // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a + // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a // the regular `_mi_free_delayed_block` which is safe. 
- _mi_heap_delayed_free_all(from); + _mi_heap_delayed_free_all(from); #if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353 mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL); #endif // and reset the `from` heap - mi_heap_reset_pages(from); + mi_heap_reset_pages(from); } // Safe delete a heap without freeing any still allocated blocks in that heap. diff --git a/source/luametatex/source/libraries/mimalloc/src/init.c b/source/luametatex/source/libraries/mimalloc/src/init.c index 4f37b7176..c416208cf 100644 --- a/source/luametatex/source/libraries/mimalloc/src/init.c +++ b/source/luametatex/source/libraries/mimalloc/src/init.c @@ -19,12 +19,12 @@ const mi_page_t _mi_page_empty = { false, // is_zero 0, // retire_expire NULL, // free - #if MI_ENCODE_FREELIST - { 0, 0 }, - #endif 0, // used 0, // xblock_size NULL, // local_free + #if MI_ENCODE_FREELIST + { 0, 0 }, + #endif MI_ATOMIC_VAR_INIT(0), // xthread_free MI_ATOMIC_VAR_INIT(0), // xheap NULL, NULL @@ -111,7 +111,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { 0, // cookie 0, // arena id { 0, 0 }, // keys - { {0}, {0}, 0 }, + { {0}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next @@ -152,7 +152,7 @@ mi_heap_t _mi_heap_main = { 0, // initial cookie 0, // arena id { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) - { {0x846ca68b}, {0}, 0 }, // random + { {0x846ca68b}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap @@ -167,8 +167,13 @@ mi_stats_t _mi_stats_main = { MI_STATS_NULL }; static void mi_heap_main_init(void) { if (_mi_heap_main.cookie == 0) { _mi_heap_main.thread_id = _mi_thread_id(); - _mi_heap_main.cookie = _mi_os_random_weak((uintptr_t)&mi_heap_main_init); - _mi_random_init(&_mi_heap_main.random); + _mi_heap_main.cookie = 1; + #if defined(_WIN32) && !defined(MI_SHARED_LIB) + _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking + #else + _mi_random_init(&_mi_heap_main.random); + #endif + _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); } @@ -193,7 +198,7 @@ typedef struct mi_thread_data_s { // Thread meta-data is allocated directly from the OS. For // some programs that do not use thread pools and allocate and -// destroy many OS threads, this may causes too much overhead +// destroy many OS threads, this may causes too much overhead // per thread so we maintain a small cache of recently freed metadata. 
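The thread-metadata cache described above keeps a handful of recently freed blocks in an array of atomic slots: allocation scans the slots and atomically exchanges a non-empty one with NULL, falling back to a fresh allocation when the cache is empty, while freeing tries to park the block in an empty slot first. A condensed standalone sketch with C11 atomics; the slot count, `malloc`/`free` stand-ins, and function names are illustrative, not the mimalloc internals.

#include <stdatomic.h>
#include <stddef.h>
#include <stdlib.h>

#define CACHE_SLOTS 8

static _Atomic(void*) cache[CACHE_SLOTS];

static void* cache_pop_or_alloc(size_t size) {
  for (int i = 0; i < CACHE_SLOTS; i++) {
    void* p = atomic_load_explicit(&cache[i], memory_order_relaxed);
    if (p != NULL) {
      /* claim the slot; another thread may have emptied it in the meantime */
      p = atomic_exchange_explicit(&cache[i], NULL, memory_order_acq_rel);
      if (p != NULL) return p;
    }
  }
  return malloc(size);                       /* stand-in for the raw OS allocation */
}

static void cache_push_or_free(void* p) {
  for (int i = 0; i < CACHE_SLOTS; i++) {
    void* expected = NULL;
    if (atomic_compare_exchange_strong_explicit(&cache[i], &expected, p,
                                                memory_order_acq_rel,
                                                memory_order_relaxed)) {
      return;                                /* parked in the cache for reuse */
    }
  }
  free(p);                                   /* cache full: really release it */
}

int main(void) {
  void* p = cache_pop_or_alloc(64);
  cache_push_or_free(p);                     /* parks the block for later reuse */
  free(cache_pop_or_alloc(64));              /* retrieves the cached block, then frees it */
  return 0;
}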
#define TD_CACHE_SIZE (8) @@ -205,7 +210,7 @@ static mi_thread_data_t* mi_thread_data_alloc(void) { for (int i = 0; i < TD_CACHE_SIZE; i++) { td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); if (td != NULL) { - td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); + td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); if (td != NULL) { return td; } @@ -283,7 +288,7 @@ static bool _mi_heap_init(void) { tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; tld->os.stats = &tld->stats; - _mi_heap_set_default_direct(heap); + _mi_heap_set_default_direct(heap); } return false; } @@ -316,9 +321,9 @@ static bool _mi_heap_done(mi_heap_t* heap) { if (heap != &_mi_heap_main) { _mi_heap_collect_abandon(heap); } - + // merge stats - _mi_stats_done(&heap->tld->stats); + _mi_stats_done(&heap->tld->stats); // free if not the main thread if (heap != &_mi_heap_main) { @@ -329,8 +334,8 @@ static bool _mi_heap_done(mi_heap_t* heap) { mi_thread_data_free((mi_thread_data_t*)heap); } else { - mi_thread_data_collect(); // free cached thread metadata - #if 0 + mi_thread_data_collect(); // free cached thread metadata + #if 0 // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, // there may still be delete/free calls after the mi_fls_done is called. Issue #207 _mi_heap_destroy_pages(heap); @@ -366,7 +371,7 @@ static void _mi_thread_done(mi_heap_t* default_heap); // use thread local storage keys to detect thread ending #include #include - #if (_WIN32_WINNT < 0x600) // before Windows Vista + #if (_WIN32_WINNT < 0x600) // before Windows Vista WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); @@ -374,7 +379,11 @@ static void _mi_thread_done(mi_heap_t* default_heap); #endif static DWORD mi_fls_key = (DWORD)(-1); static void NTAPI mi_fls_done(PVOID value) { - if (value!=NULL) _mi_thread_done((mi_heap_t*)value); + mi_heap_t* heap = (mi_heap_t*)value; + if (heap != NULL) { + _mi_thread_done(heap); + FlsSetValue(mi_fls_key, NULL); // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672 + } } #elif defined(MI_USE_PTHREADS) // use pthread local storage keys to detect thread ending @@ -421,7 +430,7 @@ void mi_thread_init(void) mi_attr_noexcept { // ensure our process has started already mi_process_init(); - + // initialize the thread local default heap // (this will call `_mi_heap_set_default_direct` and thus set the // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) @@ -442,7 +451,7 @@ static void _mi_thread_done(mi_heap_t* heap) { // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... 
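Thread exit is detected in this file by arming a destructor on a thread-local slot; the new FLS callback above additionally clears the slot before `_mi_thread_done` can install a default heap again, so the callback cannot re-trigger itself (issue #672). A minimal pthread sketch of the same detection idea, purely illustrative and not the Windows FLS path itself:

#include <pthread.h>
#include <stdio.h>

static pthread_key_t tls_key;

static void thread_done(void* value) {
  /* invoked by the pthreads runtime when a thread exits with a non-NULL slot value */
  fprintf(stderr, "cleaning up thread-local state %p\n", value);
}

static void* worker(void* arg) {
  pthread_setspecific(tls_key, arg);   /* a non-NULL value arms the destructor */
  return NULL;
}

int main(void) {
  static int state = 42;
  pthread_key_create(&tls_key, &thread_done);
  pthread_t t;
  pthread_create(&t, NULL, &worker, &state);
  pthread_join(t, NULL);
  return 0;
}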
if (heap->thread_id != _mi_thread_id()) return; - + // abandon the thread local heap if (_mi_heap_done(heap)) return; // returns true if already ran } @@ -531,12 +540,13 @@ static void mi_process_load(void) { MI_UNUSED(dummy); #endif os_preloading = false; + mi_assert_internal(_mi_is_main_thread()); #if !(defined(_WIN32) && defined(MI_SHARED_LIB)) // use Dll process detach (see below) instead of atexit (issue #521) - atexit(&mi_process_done); + atexit(&mi_process_done); #endif _mi_options_init(); + mi_process_setup_auto_thread_done(); mi_process_init(); - //mi_stats_reset();- if (mi_redirected) _mi_verbose_message("malloc is redirected.\n"); // show message from the redirector (if present) @@ -545,6 +555,9 @@ static void mi_process_load(void) { if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) { _mi_fputs(NULL,NULL,NULL,msg); } + + // reseed random + _mi_random_reinit_if_weak(&_mi_heap_main.random); } #if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) @@ -571,7 +584,6 @@ void mi_process_init(void) mi_attr_noexcept { _mi_process_is_initialized = true; mi_process_setup_auto_thread_done(); - mi_detect_cpu_features(); _mi_os_init(); mi_heap_main_init(); @@ -579,6 +591,7 @@ void mi_process_init(void) mi_attr_noexcept { _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif _mi_verbose_message("secure level: %d\n", MI_SECURE); + _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL); mi_thread_init(); #if defined(_WIN32) && !defined(MI_SHARED_LIB) @@ -598,7 +611,7 @@ void mi_process_init(void) mi_attr_noexcept { } else { mi_reserve_huge_os_pages_interleave(pages, 0, pages*500); } - } + } if (mi_option_is_enabled(mi_option_reserve_os_memory)) { long ksize = mi_option_get(mi_option_reserve_os_memory); if (ksize > 0) { @@ -619,9 +632,9 @@ static void mi_cdecl mi_process_done(void) { #if defined(_WIN32) && !defined(MI_SHARED_LIB) FlsFree(mi_fls_key); // call thread-done on all threads (except the main thread) to prevent dangling callback pointer if statically linked with a DLL; Issue #208 #endif - + #ifndef MI_SKIP_COLLECT_ON_EXIT - #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB) + #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB) // free all memory if possible on process exit. This is not needed for a stand-alone process // but should be done if mimalloc is statically linked into another shared library which // is repeatedly loaded/unloaded, see issue #281. @@ -629,10 +642,18 @@ static void mi_cdecl mi_process_done(void) { #endif #endif + // Forcefully release all retained memory; this can be dangerous in general if overriding regular malloc/free + // since after process_done there might still be other code running that calls `free` (like at_exit routines, + // or C-runtime termination code. + if (mi_option_is_enabled(mi_option_destroy_on_exit)) { + _mi_heap_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) 
+ _mi_segment_cache_free_all(&_mi_heap_main_get()->tld->os); // release all cached segments + } + if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { mi_stats_print(NULL); } - mi_allocator_done(); + mi_allocator_done(); _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); os_preloading = true; // don't call the C runtime anymore } @@ -654,7 +675,7 @@ static void mi_cdecl mi_process_done(void) { if (!mi_is_redirected()) { mi_thread_done(); } - } + } return TRUE; } diff --git a/source/luametatex/source/libraries/mimalloc/src/options.c b/source/luametatex/source/libraries/mimalloc/src/options.c index 0182671ce..e53538f5f 100644 --- a/source/luametatex/source/libraries/mimalloc/src/options.c +++ b/source/luametatex/source/libraries/mimalloc/src/options.c @@ -76,7 +76,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(reserve_os_memory) }, { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread { 0, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free - { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_decommit, abandoned_page_reset) },// decommit free page memory when a thread terminates + { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_decommit, abandoned_page_reset) },// decommit free page memory when a thread terminates { 0, UNINIT, MI_OPTION(deprecated_segment_reset) }, #if defined(__NetBSD__) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed @@ -86,15 +86,16 @@ static mi_option_desc_t options[_mi_option_last] = { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif { 25, UNINIT, MI_OPTION_LEGACY(decommit_delay, reset_delay) }, // page decommit delay in milli-seconds - { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. + { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION(limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 16, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output { 16, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output - { 8, UNINIT, MI_OPTION(max_segment_reclaim)},// max. number of segment reclaims from the abandoned segments per try. + { 8, UNINIT, MI_OPTION(max_segment_reclaim)},// max. number of segment reclaims from the abandoned segments per try. { 1, UNINIT, MI_OPTION(allow_decommit) }, // decommit slices when no longer used (after decommit_delay milli-seconds) { 500, UNINIT, MI_OPTION(segment_decommit_delay) }, // decommit delay in milli-seconds for freed segments - { 2, UNINIT, MI_OPTION(decommit_extend_delay) } + { 1, UNINIT, MI_OPTION(decommit_extend_delay) }, + { 0, UNINIT, MI_OPTION(destroy_on_exit)} // release all OS memory on process exit; careful with dangling pointer or after-exit frees! 
}; static void mi_option_init(mi_option_desc_t* desc); @@ -106,7 +107,8 @@ void _mi_options_init(void) { for(int i = 0; i < _mi_option_last; i++ ) { mi_option_t option = (mi_option_t)i; long l = mi_option_get(option); MI_UNUSED(l); // initialize - if (option != mi_option_verbose) { + // if (option != mi_option_verbose) + { mi_option_desc_t* desc = &options[option]; _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value); } @@ -179,13 +181,26 @@ static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { if (!_mi_preloading()) { // _cputs(msg); // _cputs cannot be used at is aborts if it fails to lock the console static HANDLE hcon = INVALID_HANDLE_VALUE; + static bool hconIsConsole; if (hcon == INVALID_HANDLE_VALUE) { + CONSOLE_SCREEN_BUFFER_INFO sbi; hcon = GetStdHandle(STD_ERROR_HANDLE); + hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi)); } const size_t len = strlen(msg); - if (hcon != INVALID_HANDLE_VALUE && len > 0 && len < UINT32_MAX) { + if (len > 0 && len < UINT32_MAX) { DWORD written = 0; - WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL); + if (hconIsConsole) { + WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL); + } + else if (hcon != INVALID_HANDLE_VALUE) { + // use direct write if stderr was redirected + WriteFile(hcon, msg, (DWORD)len, &written, NULL); + } + else { + // finally fall back to fputs after all + fputs(msg, stderr); + } } } #else @@ -480,13 +495,6 @@ static bool mi_getenv(const char* name, char* result, size_t result_size) { return false; } #else -static inline int mi_strnicmp(const char* s, const char* t, size_t n) { - if (n==0) return 0; - for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { - if (toupper(*s) != toupper(*t)) break; - } - return (n==0 ? 0 : *s - *t); -} #if defined _WIN32 // On Windows use GetEnvironmentVariable instead of getenv to work // reliably even when this is invoked before the C runtime is initialized. @@ -512,6 +520,13 @@ static char** mi_get_environ(void) { return environ; } #endif +static int mi_strnicmp(const char* s, const char* t, size_t n) { + if (n == 0) return 0; + for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { + if (toupper(*s) != toupper(*t)) break; + } + return (n == 0 ? 0 : *s - *t); +} static bool mi_getenv(const char* name, char* result, size_t result_size) { if (name==NULL) return false; const size_t len = strlen(name); @@ -570,7 +585,7 @@ static void mi_option_init(mi_option_desc_t* desc) { found = mi_getenv(buf,s,sizeof(s)); if (found) { _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name ); - } + } } if (found) { diff --git a/source/luametatex/source/libraries/mimalloc/src/os.c b/source/luametatex/source/libraries/mimalloc/src/os.c index 6d7249873..0f9847417 100644 --- a/source/luametatex/source/libraries/mimalloc/src/os.c +++ b/source/luametatex/source/libraries/mimalloc/src/os.c @@ -88,7 +88,7 @@ static size_t os_alloc_granularity = 4096; // if non-zero, use large page allocation static size_t large_os_page_size = 0; -// is memory overcommit allowed? +// is memory overcommit allowed? 
// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE) static bool os_overcommit = true; @@ -139,7 +139,7 @@ typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E { MiMemExtendedParameterUserPhysicalHandle, MiMemExtendedParameterAttributeFlags, MiMemExtendedParameterMax -} MI_MEM_EXTENDED_PARAMETER_TYPE; +} MI_MEM_EXTENDED_PARAMETER_TYPE; typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S { struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type; @@ -166,9 +166,11 @@ typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber); typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber); typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask); +typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber); static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL; +static PGetNumaProcessorNode pGetNumaProcessorNode = NULL; static bool mi_win_enable_large_os_pages(void) { @@ -205,7 +207,7 @@ static bool mi_win_enable_large_os_pages(void) return (ok!=0); } -void _mi_os_init(void) +void _mi_os_init(void) { os_overcommit = false; // get the page size @@ -234,6 +236,7 @@ void _mi_os_init(void) pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx"); pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx"); pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx"); + pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode"); FreeLibrary(hDll); } if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { @@ -266,9 +269,9 @@ static void os_detect_overcommit(void) { size_t olen = sizeof(val); if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) { os_overcommit = (val != 0); - } + } #else - // default: overcommit is true + // default: overcommit is true #endif } @@ -306,10 +309,10 @@ static int mi_madvise(void* addr, size_t length, int advice) { static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; // Return a MI_SEGMENT_SIZE aligned address that is probably available. -// If this returns NULL, the OS will determine the address but on some OS's that may not be +// If this returns NULL, the OS will determine the address but on some OS's that may not be // properly aligned which can be more costly as it needs to be adjusted afterwards. 
-// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; -// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses +// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; +// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses // in the middle of the 2TiB - 6TiB address range (see issue #372)) #define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start @@ -383,12 +386,12 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats #endif if (was_committed) { _mi_stat_decrease(&stats->committed, size); } _mi_stat_decrease(&stats->reserved, size); - return !err; + return !err; } /* ----------------------------------------------------------- - Raw allocation on Windows (VirtualAlloc) + Raw allocation on Windows (VirtualAlloc) -------------------------------------------------------------- */ #ifdef _WIN32 @@ -406,7 +409,7 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags); // fall through on error } - } + } #endif // on modern Windows try use VirtualAlloc2 for aligned allocation if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { @@ -464,12 +467,12 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, -------------------------------------------------------------- */ #elif defined(MI_USE_SBRK) || defined(__wasi__) -#if defined(MI_USE_SBRK) +#if defined(MI_USE_SBRK) static void* mi_memory_grow( size_t size ) { void* p = sbrk(size); if (p == (void*)(-1)) return NULL; #if !defined(__wasi__) // on wasi this is always zero initialized already (?) - memset(p,0,size); + memset(p,0,size); #endif return p; } @@ -477,8 +480,8 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, static void* mi_memory_grow( size_t size ) { size_t base = (size > 0 ? 
__builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size())) : __builtin_wasm_memory_size(0)); - if (base == SIZE_MAX) return NULL; - return (void*)(base * _mi_os_page_size()); + if (base == SIZE_MAX) return NULL; + return (void*)(base * _mi_os_page_size()); } #endif @@ -490,7 +493,7 @@ static void* mi_heap_grow(size_t size, size_t try_alignment) { void* p = NULL; if (try_alignment <= 1) { // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now) - #if defined(MI_USE_PTHREADS) + #if defined(MI_USE_PTHREADS) pthread_mutex_lock(&mi_heap_grow_mutex); #endif p = mi_memory_grow(size); @@ -512,7 +515,7 @@ static void* mi_heap_grow(size_t size, size_t try_alignment) { if (current != NULL) { void* aligned_current = mi_align_up_ptr(current, try_alignment); // and align from there to minimize wasted space alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size()); - base = mi_memory_grow(alloc_size); + base = mi_memory_grow(alloc_size); } } #if defined(MI_USE_PTHREADS) @@ -529,7 +532,7 @@ static void* mi_heap_grow(size_t size, size_t try_alignment) { } } if (p == NULL) { - _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment); + _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment); errno = ENOMEM; return NULL; } @@ -540,10 +543,10 @@ static void* mi_heap_grow(size_t size, size_t try_alignment) { /* ----------------------------------------------------------- Raw allocation on Unix's (mmap) -------------------------------------------------------------- */ -#else +#else #define MI_OS_USE_MMAP static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { - MI_UNUSED(try_alignment); + MI_UNUSED(try_alignment); #if defined(MAP_ALIGNED) // BSD if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { size_t n = mi_bsr(try_alignment); @@ -574,7 +577,7 @@ static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int pr #endif // regular mmap void* p = mmap(addr, size, protect_flags, flags, fd, 0); - if (p!=MAP_FAILED) return p; + if (p!=MAP_FAILED) return p; // failed to allocate return NULL; } @@ -602,7 +605,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro int flags = MAP_PRIVATE | MAP_ANONYMOUS; if (_mi_os_has_overcommit()) { flags |= MAP_NORESERVE; - } + } #if defined(PROT_MAX) protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD #endif @@ -685,7 +688,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { *is_large = true; } - } + } #endif } } @@ -753,7 +756,7 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, // try first with a hint (this will be aligned directly on Win 10+ or BSD) void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats); if (p == NULL) return NULL; - + // if not aligned, free it, overallocate, and unmap around it if (((uintptr_t)p % alignment != 0)) { mi_os_mem_free(p, size, commit, stats); @@ -765,7 +768,7 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, // over-allocate uncommitted (virtual) memory p = mi_os_mem_alloc(over_size, 0 /*alignment*/, false /* commit? 
*/, false /* allow_large */, is_large, stats); if (p == NULL) return NULL; - + // set p to the aligned part in the full region // note: this is dangerous on Windows as VirtualFree needs the actual region pointer // but in mi_os_mem_free we handle this (hopefully exceptional) situation. @@ -837,7 +840,45 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* lar return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), &_mi_stats_main /*tld->stats*/ ); } +/* ----------------------------------------------------------- + OS aligned allocation with an offset. This is used + for large alignments > MI_ALIGNMENT_MAX. We use a large mimalloc + page where the object can be aligned at an offset from the start of the segment. + As we may need to overallocate, we need to free such pointers using `mi_free_aligned` + to use the actual start of the memory region. +----------------------------------------------------------- */ +void* _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t offset, bool commit, bool* large, mi_stats_t* tld_stats) { + mi_assert(offset <= MI_SEGMENT_SIZE); + mi_assert(offset <= size); + mi_assert((alignment % _mi_os_page_size()) == 0); + if (offset > MI_SEGMENT_SIZE) return NULL; + if (offset == 0) { + // regular aligned allocation + return _mi_os_alloc_aligned(size, alignment, commit, large, tld_stats); + } + else { + // overallocate to align at an offset + const size_t extra = _mi_align_up(offset, alignment) - offset; + const size_t oversize = size + extra; + void* start = _mi_os_alloc_aligned(oversize, alignment, commit, large, tld_stats); + if (start == NULL) return NULL; + void* p = (uint8_t*)start + extra; + mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); + // decommit the overallocation at the start + if (commit && extra > _mi_os_page_size()) { + _mi_os_decommit(start, extra, tld_stats); + } + return p; + } +} + +void _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_offset, bool was_committed, mi_stats_t* tld_stats) { + mi_assert(align_offset <= MI_SEGMENT_SIZE); + const size_t extra = _mi_align_up(align_offset, alignment) - align_offset; + void* start = (uint8_t*)p - extra; + _mi_os_free_ex(start, size + extra, was_committed, tld_stats); +} /* ----------------------------------------------------------- OS memory API: reset, commit, decommit, protect, unprotect. @@ -916,7 +957,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ // commit: just change the protection err = mprotect(start, csize, (PROT_READ | PROT_WRITE)); if (err != 0) { err = errno; } - } + } else { // decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss) const int fd = mi_unix_mmap_fd(); @@ -926,10 +967,10 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ #else // Linux, macOSX and others. 
if (commit) { - // commit: ensure we can access the area + // commit: ensure we can access the area err = mprotect(start, csize, (PROT_READ | PROT_WRITE)); if (err != 0) { err = errno; } - } + } else { #if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0 // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) @@ -1008,7 +1049,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) int oadvice = (int)mi_atomic_load_relaxed(&advice); int err; while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; - if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { + if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); err = mi_madvise(start, csize, MADV_DONTNEED); @@ -1041,13 +1082,8 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) { bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { MI_UNUSED(tld_stats); mi_stats_t* stats = &_mi_stats_main; - if (mi_option_is_enabled(mi_option_reset_decommits)) { - return mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!) - } - else { - *is_zero = false; - return mi_os_resetx(addr, size, false, stats); - } + *is_zero = false; + return mi_os_resetx(addr, size, false, stats); } */ @@ -1156,7 +1192,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) params[0].Arg.ULong = (unsigned)numa_node; return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); } - + // otherwise use regular virtual alloc on older windows return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } @@ -1305,7 +1341,7 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { /* ---------------------------------------------------------------------------- Support NUMA aware allocation -----------------------------------------------------------------------------*/ -#ifdef _WIN32 +#ifdef _WIN32 static size_t mi_os_numa_nodex(void) { USHORT numa_node = 0; if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) { @@ -1314,14 +1350,14 @@ static size_t mi_os_numa_nodex(void) { (*pGetCurrentProcessorNumberEx)(&pnum); USHORT nnode = 0; BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode); - if (ok) numa_node = nnode; + if (ok) { numa_node = nnode; } } - else { + else if (pGetNumaProcessorNode != NULL) { // Vista or earlier, use older API that is limited to 64 processors. 
Issue #277 DWORD pnum = GetCurrentProcessorNumber(); UCHAR nnode = 0; - BOOL ok = GetNumaProcessorNode((UCHAR)pnum, &nnode); - if (ok) numa_node = nnode; + BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode); + if (ok) { numa_node = nnode; } } return numa_node; } @@ -1425,7 +1461,7 @@ size_t _mi_os_numa_node_count_get(void) { else { count = mi_os_numa_node_countx(); // or detect dynamically if (count == 0) count = 1; - } + } mi_atomic_store_release(&_mi_numa_node_count, count); // save it _mi_verbose_message("using %zd numa regions\n", count); } diff --git a/source/luametatex/source/libraries/mimalloc/src/page-queue.c b/source/luametatex/source/libraries/mimalloc/src/page-queue.c index 92f933c2a..cb54b3740 100644 --- a/source/luametatex/source/libraries/mimalloc/src/page-queue.c +++ b/source/luametatex/source/libraries/mimalloc/src/page-queue.c @@ -229,8 +229,9 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(!mi_page_queue_contains(queue, page)); - + #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); + #endif mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); @@ -304,7 +305,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue for (mi_page_t* page = append->first; page != NULL; page = page->next) { // inline `mi_page_set_heap` to avoid wrong assertion during absorption; // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive. - mi_atomic_store_release(&page->xheap, (uintptr_t)heap); + mi_atomic_store_release(&page->xheap, (uintptr_t)heap); // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a // side effect that it spins until any DELAYED_FREEING is finished. This ensures // that after appending only the new heap will be used for delayed free operations. 
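The inlined `mi_page_set_heap` in the append path above publishes the new owning heap with a release store; a thread that later performs a delayed free reads the field with an acquire load and is then guaranteed to also see everything written to that heap before it was published. A tiny standalone illustration of that release/acquire pairing, with made-up names:

#include <stdatomic.h>
#include <stdint.h>

typedef struct heap_s { int ready; } heap_t;

static _Atomic(uintptr_t) xheap;       /* stores a heap pointer as an integer, like page->xheap */

static void publish(heap_t* h) {
  h->ready = 1;                                                        /* ordinary write ...      */
  atomic_store_explicit(&xheap, (uintptr_t)h, memory_order_release);   /* ... made visible by the release */
}

static heap_t* observe(void) {
  uintptr_t v = atomic_load_explicit(&xheap, memory_order_acquire);
  return (heap_t*)v;                   /* if non-NULL, h->ready == 1 is guaranteed to be visible */
}

int main(void) {
  static heap_t h = { 0 };
  publish(&h);
  return observe()->ready ? 0 : 1;
}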
diff --git a/source/luametatex/source/libraries/mimalloc/src/page.c b/source/luametatex/source/libraries/mimalloc/src/page.c index 4b321156c..4250ff358 100644 --- a/source/luametatex/source/libraries/mimalloc/src/page.c +++ b/source/luametatex/source/libraries/mimalloc/src/page.c @@ -112,7 +112,10 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_segment_t* segment = _mi_page_segment(page); mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id); - if (segment->kind != MI_SEGMENT_HUGE) { + #if MI_HUGE_PAGE_ABANDON + if (segment->kind != MI_SEGMENT_HUGE) + #endif + { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page)); @@ -132,7 +135,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { mi_thread_free_t tfreex; mi_delayed_t old_delay; - mi_thread_free_t tfree; + mi_thread_free_t tfree; size_t yield_count = 0; do { tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS; @@ -245,7 +248,9 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); + #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); + #endif mi_assert_internal(!page->is_reset); // TODO: push on full queue immediately if it is full? mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); @@ -254,17 +259,26 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { } // allocate a fresh page from a segment -static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size) { - mi_assert_internal(pq==NULL||mi_heap_contains_queue(heap, pq)); - mi_page_t* page = _mi_segment_page_alloc(heap, block_size, &heap->tld->segments, &heap->tld->os); +static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) { + #if !MI_HUGE_PAGE_ABANDON + mi_assert_internal(pq != NULL); + mi_assert_internal(mi_heap_contains_queue(heap, pq)); + mi_assert_internal(page_alignment > 0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || block_size == pq->block_size); + #endif + mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os); if (page == NULL) { // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; } - mi_assert_internal(pq==NULL || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE); - mi_page_init(heap, page, block_size, heap->tld); + mi_assert_internal(page_alignment >0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE); + mi_assert_internal(pq!=NULL || page->xblock_size != 0); + mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); + // a fresh page was found, initialize it + const size_t full_block_size = ((pq == NULL || mi_page_queue_is_huge(pq)) ? 
mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc + mi_assert_internal(full_block_size >= block_size); + mi_page_init(heap, page, full_block_size, heap->tld); mi_heap_stat_increase(heap, pages, 1); - if (pq!=NULL) mi_page_queue_push(heap, pq, page); // huge pages use pq==NULL + if (pq != NULL) { mi_page_queue_push(heap, pq, page); } mi_assert_expensive(_mi_page_is_valid(page)); return page; } @@ -272,7 +286,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size // Get a fresh page to use static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { mi_assert_internal(mi_heap_contains_queue(heap, pq)); - mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size); + mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page))); @@ -402,7 +416,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { } // Retire parameters -#define MI_MAX_RETIRE_SIZE MI_MEDIUM_OBJ_SIZE_MAX +#define MI_MAX_RETIRE_SIZE (MI_MEDIUM_OBJ_SIZE_MAX) #define MI_RETIRE_CYCLES (8) // Retire a page with no more used blocks @@ -425,7 +439,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { // how to check this efficiently though... // for now, we don't retire if it is the only page left of this size class. mi_page_queue_t* pq = mi_page_queue_of(page); - if mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_is_in_full(page)) { + if mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_queue_is_special(pq)) { // not too large && not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); page->retire_expire = 1 + (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); @@ -573,7 +587,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co #if (MI_SECURE>0) #define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many #else -#define MI_MIN_EXTEND (1) +#define MI_MIN_EXTEND (4) #endif // Extend the capacity (up to reserved) by initializing a free list @@ -603,7 +617,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize); if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; } mi_assert_internal(max_extend > 0); - + if (extend > max_extend) { // ensure we don't touch memory beyond the page to reduce page commit. // the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%. 
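The extend logic around `MI_MIN_EXTEND` and `MI_MAX_EXTEND_SIZE` grows a page's free list in bounded steps: always at least a small minimum number of blocks, but never more than a fixed byte budget divided by the block size, so pages with large blocks do not touch memory far beyond what the caller needs. A small sketch of that clamping; the constants here are illustrative, not the tuned values from the patch.

#include <stddef.h>
#include <stdio.h>

#define MIN_EXTEND        4u       /* always initialize at least this many blocks */
#define MAX_EXTEND_BYTES  4096u    /* rough budget of memory to touch per extension */

static size_t bounded_extend(size_t blocks_wanted, size_t block_size) {
  size_t max_extend = (block_size >= MAX_EXTEND_BYTES) ? MIN_EXTEND
                                                       : MAX_EXTEND_BYTES / block_size;
  if (max_extend < MIN_EXTEND) { max_extend = MIN_EXTEND; }
  return (blocks_wanted > max_extend) ? max_extend : blocks_wanted;
}

int main(void) {
  printf("%zu %zu\n", bounded_extend(1000, 16), bounded_extend(1000, 8192));
  return 0;   /* 256 blocks for a small block size, only 4 for a very large one */
}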
@@ -648,6 +662,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); + mi_assert_internal(page->reserved > 0); #ifdef MI_ENCODE_FREELIST page->keys[0] = _mi_heap_random_next(heap); page->keys[1] = _mi_heap_random_next(heap); @@ -725,7 +740,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p page = mi_page_fresh(heap, pq); if (page == NULL && first_try) { // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again - page = mi_page_queue_find_free_ex(heap, pq, false); + page = mi_page_queue_find_free_ex(heap, pq, false); } } else { @@ -743,17 +758,17 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { mi_page_queue_t* pq = mi_page_queue(heap,size); mi_page_t* page = pq->first; if (page != NULL) { - #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness + #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { mi_page_extend_free(heap, page, heap->tld); mi_assert_internal(mi_page_immediate_available(page)); } - else + else #endif { _mi_page_free_collect(page,false); } - + if (mi_page_immediate_available(page)) { page->retire_expire = 0; return page; // fast path @@ -797,21 +812,28 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex // Because huge pages contain just one block, and the segment contains // just that page, we always treat them as abandoned and any thread // that frees the block can free the whole page and segment directly. -static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) { +// Huge pages are also use if the requested alignment is very large (> MI_ALIGNMENT_MAX). +static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); - mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE); - bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX); + mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); + bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX || page_alignment > 0); + #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size)); - mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size); + #else + mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? 
MI_HUGE_BLOCK_SIZE : block_size); // not block_size as that can be low if the page_alignment > 0 + mi_assert_internal(!is_huge || mi_page_queue_is_huge(pq)); + #endif + mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); if (page != NULL) { mi_assert_internal(mi_page_immediate_available(page)); - if (pq == NULL) { - // huge pages are directly abandoned + if (is_huge) { mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE); mi_assert_internal(_mi_page_segment(page)->used==1); + #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue mi_page_set_heap(page, NULL); + #endif } else { mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); @@ -833,16 +855,16 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) { // Allocate a page // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. -static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size) mi_attr_noexcept { +static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { // huge allocation? const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE)) { + if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { if mi_unlikely(req_size > PTRDIFF_MAX) { // we don't allocate more than PTRDIFF_MAX (see ) _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; } else { - return mi_large_huge_page_alloc(heap,size); + return mi_large_huge_page_alloc(heap,size,huge_alignment); } } else { @@ -854,7 +876,9 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size) mi_attr_noexcept { // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. -void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept +// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for +// very large requested alignments in which case we use a huge segment. 
+void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { mi_assert_internal(heap != NULL); @@ -873,14 +897,14 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexce _mi_heap_delayed_free_partial(heap); // find (or allocate) a page of the right size - mi_page_t* page = mi_find_page(heap, size); + mi_page_t* page = mi_find_page(heap, size, huge_alignment); if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more mi_heap_collect(heap, true /* force */); - page = mi_find_page(heap, size); + page = mi_find_page(heap, size, huge_alignment); } if mi_unlikely(page == NULL) { // out of memory - const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` _mi_error_message(ENOMEM, "unable to allocate memory (%zu bytes)\n", req_size); return NULL; } diff --git a/source/luametatex/source/libraries/mimalloc/src/random.c b/source/luametatex/source/libraries/mimalloc/src/random.c index a5f5e6b82..06d4ba4ad 100644 --- a/source/luametatex/source/libraries/mimalloc/src/random.c +++ b/source/luametatex/source/libraries/mimalloc/src/random.c @@ -168,9 +168,9 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim #if defined(_WIN32) -#if defined(MI_USE_RTLGENRANDOM) || defined(__cplusplus) -// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using -// dynamic overriding, we observed it can raise an exception when compiled with C++, and +#if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus) +// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using +// dynamic overriding, we observed it can raise an exception when compiled with C++, and // sometimes deadlocks when also running under the VS debugger. // In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom. // To be continued.. @@ -187,10 +187,27 @@ static bool os_random_buf(void* buf, size_t buf_len) { return (RtlGenRandom(buf, (ULONG)buf_len) != 0); } #else -#pragma comment (lib,"bcrypt.lib") -#include + +#ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG +#define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002 +#endif + +typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG); +static PBCryptGenRandom pBCryptGenRandom = NULL; + static bool os_random_buf(void* buf, size_t buf_len) { - return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); + if (pBCryptGenRandom == NULL) { + HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll")); + if (hDll != NULL) { + pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom"); + } + } + if (pBCryptGenRandom == NULL) { + return false; + } + else { + return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); + } } #endif @@ -203,7 +220,7 @@ static bool os_random_buf(void* buf, size_t buf_len) { static bool os_random_buf(void* buf, size_t buf_len) { #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf - // may fail silently on macOS. See PR #390, and + // may fail silently on macOS. 
See PR #390, and return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); #else // fall back on older macOS @@ -281,7 +298,7 @@ static bool os_random_buf(void* buf, size_t buf_len) { uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random - + #if defined(_WIN32) LARGE_INTEGER pcount; QueryPerformanceCounter(&pcount); @@ -303,23 +320,41 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { return x; } -void _mi_random_init(mi_random_ctx_t* ctx) { +static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) { uint8_t key[32]; - if (!os_random_buf(key, sizeof(key))) { + if (use_weak || !os_random_buf(key, sizeof(key))) { // if we fail to get random data from the OS, we fall back to a // weak random source based on the current time #if !defined(__wasi__) - _mi_warning_message("unable to use secure randomness\n"); + if (!use_weak) { _mi_warning_message("unable to use secure randomness\n"); } #endif uintptr_t x = _mi_os_random_weak(0); for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words. x = _mi_random_shuffle(x); ((uint32_t*)key)[i] = (uint32_t)x; } + ctx->weak = true; + } + else { + ctx->weak = false; } chacha_init(ctx, key, (uintptr_t)ctx /*nonce*/ ); } +void _mi_random_init(mi_random_ctx_t* ctx) { + mi_random_init_ex(ctx, false); +} + +void _mi_random_init_weak(mi_random_ctx_t * ctx) { + mi_random_init_ex(ctx, true); +} + +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx) { + if (ctx->weak) { + _mi_random_init(ctx); + } +} + /* -------------------------------------------------------- test vectors from ----------------------------------------------------------- */ diff --git a/source/luametatex/source/libraries/mimalloc/src/region.c b/source/luametatex/source/libraries/mimalloc/src/region.c index 57d11fe8d..3571abb60 100644 --- a/source/luametatex/source/libraries/mimalloc/src/region.c +++ b/source/luametatex/source/libraries/mimalloc/src/region.c @@ -16,8 +16,8 @@ We need this memory layer between the raw OS calls because of: 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order to reuse memory effectively. 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of - an OS allocation/free is still (much) too expensive relative to the accesses - in that object :-( (`malloc-large` tests this). This means we need a cheaper + an OS allocation/free is still (much) too expensive relative to the accesses + in that object :-( (`malloc-large` tests this). This means we need a cheaper way to reuse memory. 3. This layer allows for NUMA aware allocation. 
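The random.c changes above split initialization into a weak path (usable very early, e.g. to avoid touching bcrypt during static DLL initialization on Windows) and a strong path, remember which one was taken, and reseed later from `mi_process_load` once the strong source is safe to use. A simplified standalone sketch of that weak-then-reseed pattern; the names, the stubbed entropy source, and the fallback mixing are illustrative only.

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

typedef struct rng_s {
  uint64_t state;
  bool     weak;     /* true when seeded from a low-quality source */
} rng_t;

/* Stand-in for an OS entropy source (BCryptGenRandom, getrandom, ...). */
static bool strong_seed(uint64_t* out) {
  (void)out;
  return false;      /* pretend it is not usable this early in startup */
}

static void rng_init(rng_t* r, bool force_weak) {
  uint64_t seed;
  if (force_weak || !strong_seed(&seed)) {
    seed = (uint64_t)time(NULL) ^ (uint64_t)(uintptr_t)r;   /* time plus an ASLR'd address */
    r->weak = true;
  }
  else {
    r->weak = false;
  }
  r->state = seed;
}

static void rng_reinit_if_weak(rng_t* r) {
  if (r->weak) { rng_init(r, false); }   /* try the strong source again later */
}

int main(void) {
  rng_t r;
  rng_init(&r, true);        /* early, deliberately weak */
  rng_reinit_if_weak(&r);    /* later: upgrade if the strong source now works */
  return (int)(r.state & 1);
}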
@@ -47,35 +47,34 @@ bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats); bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats); bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats); +bool _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats); // arena.c mi_arena_id_t _mi_arena_id_none(void); -void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_stats_t* stats); +void _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats); void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld); // Constants #if (MI_INTPTR_SIZE==8) -#define MI_HEAP_REGION_MAX_SIZE (256 * MI_GiB) // 64KiB for the region map +#define MI_HEAP_REGION_MAX_SIZE (256 * MI_GiB) // 64KiB for the region map #elif (MI_INTPTR_SIZE==4) #define MI_HEAP_REGION_MAX_SIZE (3 * MI_GiB) // ~ KiB for the region map #else #error "define the maximum heap space allowed for regions on this platform" #endif -#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE - #define MI_REGION_MAX_BLOCKS MI_BITMAP_FIELD_BITS #define MI_REGION_SIZE (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS) // 256MiB (64MiB on 32 bits) #define MI_REGION_MAX (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE) // 1024 (48 on 32 bits) #define MI_REGION_MAX_OBJ_BLOCKS (MI_REGION_MAX_BLOCKS/4) // 64MiB -#define MI_REGION_MAX_OBJ_SIZE (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE) +#define MI_REGION_MAX_OBJ_SIZE (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE) -// Region info +// Region info typedef union mi_region_info_u { - size_t value; + size_t value; struct { bool valid; // initialized? bool is_large:1; // allocated in fixed large/huge OS pages @@ -89,7 +88,7 @@ typedef union mi_region_info_u { // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. 
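The region code that follows hands back an opaque `memid` so that the free path can recover the owning region and block index, with a tag distinguishing pass-through arena allocations; conceptually it is just an index pair and a flag packed into one word. A simplified sketch of such an encoding, which is not claimed to be the exact scheme used in region.c:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

#define FIELD_BITS 64

/* pack: lowest bit tags an arena allocation, the rest holds region*FIELD_BITS + bit */
static size_t memid_create(size_t region_idx, size_t bit_idx) {
  return ((region_idx * FIELD_BITS + bit_idx) << 1);
}
static size_t memid_create_arena(size_t arena_memid) {
  return (arena_memid << 1) | 1;
}
static bool memid_is_arena(size_t id, size_t* region_idx, size_t* bit_idx, size_t* arena_memid) {
  if (id & 1) { *arena_memid = id >> 1; return true; }
  *region_idx = (id >> 1) / FIELD_BITS;
  *bit_idx    = (id >> 1) % FIELD_BITS;
  return false;
}

int main(void) {
  size_t r, b, a;
  assert(!memid_is_arena(memid_create(3, 17), &r, &b, &a) && r == 3 && b == 17);
  assert(memid_is_arena(memid_create_arena(42), &r, &b, &a) && a == 42);
  return 0;
}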
typedef struct mem_region_s { _Atomic(size_t) info; // mi_region_info_t.value - _Atomic(void*) start; // start of the memory area + _Atomic(void*) start; // start of the memory area mi_bitmap_field_t in_use; // bit per in-use block mi_bitmap_field_t dirty; // track if non-zero per block mi_bitmap_field_t commit; // track if committed per block @@ -102,7 +101,7 @@ typedef struct mem_region_s { static mem_region_t regions[MI_REGION_MAX]; // Allocated regions -static _Atomic(size_t) regions_count; // = 0; +static _Atomic(size_t) regions_count; // = 0; /* ---------------------------------------------------------------------------- @@ -137,7 +136,7 @@ mi_decl_nodiscard bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) { uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start); mi_assert_internal(start != NULL); - return (start + (bit_idx * MI_SEGMENT_SIZE)); + return (start + (bit_idx * MI_SEGMENT_SIZE)); } static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) { @@ -181,7 +180,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, bool is_zero = false; bool is_pinned = false; size_t arena_memid = 0; - void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, ®ion_commit, ®ion_large, &is_pinned, &is_zero, _mi_arena_id_none(), & arena_memid, tld); + void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, ®ion_commit, ®ion_large, &is_pinned, &is_zero, _mi_arena_id_none(), & arena_memid, tld); if (start == NULL) return false; mi_assert_internal(!(region_large && !allow_large)); mi_assert_internal(!region_large || region_commit); @@ -190,7 +189,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, const size_t idx = mi_atomic_increment_acq_rel(®ions_count); if (idx >= MI_REGION_MAX) { mi_atomic_decrement_acq_rel(®ions_count); - _mi_arena_free(start, MI_REGION_SIZE, arena_memid, region_commit, tld->stats); + _mi_arena_free(start, MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, arena_memid, region_commit, tld->stats); _mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, MI_GiB)); return false; } @@ -206,7 +205,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, _mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL); mi_atomic_store_ptr_release(void,&r->start, start); - // and share it + // and share it mi_region_info_t info; info.value = 0; // initialize the full union to zero info.x.valid = true; @@ -243,7 +242,7 @@ static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, boo static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { - // try all regions for a free slot + // try all regions for a free slot const size_t count = mi_atomic_load_relaxed(®ions_count); // monotonic, so ok to be relaxed size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? 
Starting at 0 seems to increase latency though for (size_t visited = 0; visited < count; visited++, idx++) { @@ -277,7 +276,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* return NULL; } } - + // ------------------------------------------------ // found a region and claimed `blocks` at `bit_idx`, initialize them now mi_assert_internal(region != NULL); @@ -289,7 +288,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* mi_assert_internal(!(info.x.is_large && !*large)); mi_assert_internal(start != NULL); - *is_zero = _mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL); + *is_zero = _mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL); *large = info.x.is_large; *is_pinned = info.x.is_pinned; *memid = mi_memid_create(region, bit_idx); @@ -308,20 +307,20 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx); return NULL; } - if (commit_zero) *is_zero = true; + if (commit_zero) *is_zero = true; } } else { // no need to commit, but check if already fully committed *commit = _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx); - } + } mi_assert_internal(!*commit || _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx)); // unreset reset blocks if (_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) { // some blocks are still reset mi_assert_internal(!info.x.is_large && !info.x.is_pinned); - mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0); + mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0); mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx); if (*commit || !mi_option_is_enabled(mi_option_reset_decommits)) { // only if needed bool reset_zero = false; @@ -330,13 +329,13 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* } } mi_assert_internal(!_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)); - + #if (MI_DEBUG>=2) && !MI_TRACK_ENABLED if (*commit) { ((uint8_t*)p)[0] = 0; } #endif - - // and return the allocation - mi_assert_internal(p != NULL); + + // and return the allocation + mi_assert_internal(p != NULL); return p; } @@ -347,7 +346,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* // Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`) -void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld) +void* _mi_mem_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld) { mi_assert_internal(memid != NULL && tld != NULL); mi_assert_internal(size > 0); @@ -355,7 +354,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l *is_zero = false; *is_pinned = false; bool default_large = false; - if (large==NULL) large = &default_large; // ensure `large != NULL` + if (large==NULL) large = &default_large; // ensure `large != NULL` if (size == 0) return NULL; size = _mi_align_up(size, _mi_os_page_size()); @@ -363,23 +362,23 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l void* p = NULL; size_t arena_memid; const size_t blocks = mi_region_block_count(size); - if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) { - p = mi_region_try_alloc(blocks, commit, large, is_pinned, is_zero, memid, tld); + if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { + p = mi_region_try_alloc(blocks, commit, large, is_pinned, is_zero, memid, tld); if (p == NULL) { _mi_warning_message("unable to allocate from region: size %zu\n", size); } } if (p == NULL) { // and otherwise fall back to the OS - p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_pinned, is_zero, _mi_arena_id_none(), & arena_memid, tld); + p = _mi_arena_alloc_aligned(size, alignment, align_offset, commit, large, is_pinned, is_zero, _mi_arena_id_none(), & arena_memid, tld); *memid = mi_memid_create_from_arena(arena_memid); } if (p != NULL) { - mi_assert_internal((uintptr_t)p % alignment == 0); + mi_assert_internal(((uintptr_t)p + align_offset) % alignment == 0); #if (MI_DEBUG>=2) && !MI_TRACK_ENABLED if (*commit) { ((uint8_t*)p)[0] = 0; } // ensure the memory is committed -#endif + #endif } return p; } @@ -391,21 +390,22 @@ Free -----------------------------------------------------------------------------*/ // Free previously allocated memory with a given id. -void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) { +void _mi_mem_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) { mi_assert_internal(size > 0 && tld != NULL); if (p==NULL) return; if (size==0) return; size = _mi_align_up(size, _mi_os_page_size()); - + size_t arena_memid = 0; mi_bitmap_index_t bit_idx; mem_region_t* region; if (mi_memid_is_arena(id,&region,&bit_idx,&arena_memid)) { // was a direct arena allocation, pass through - _mi_arena_free(p, size, arena_memid, full_commit, tld->stats); + _mi_arena_free(p, size, alignment, align_offset, arena_memid, full_commit, tld->stats); } else { // allocated in a region + mi_assert_internal(align_offset == 0); mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return; const size_t blocks = mi_region_block_count(size); mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS); @@ -428,9 +428,9 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re } // reset the blocks to reduce the working set.
- if (!info.x.is_large && !info.x.is_pinned && mi_option_is_enabled(mi_option_segment_reset) + if (!info.x.is_large && !info.x.is_pinned && mi_option_is_enabled(mi_option_segment_reset) && (mi_option_is_enabled(mi_option_eager_commit) || - mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead + mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead { bool any_unreset; _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset); @@ -438,7 +438,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re _mi_abandoned_await_readers(); // ensure no more pending write (in case reset = decommit) _mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld); } - } + } // and unclaim bool all_unclaimed = mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx); @@ -467,9 +467,9 @@ void _mi_mem_collect(mi_os_tld_t* tld) { memset((void*)&regions[i], 0, sizeof(mem_region_t)); // cast to void* to avoid atomic warning // and release the whole region mi_atomic_store_release(&region->info, (size_t)0); - if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { + if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { _mi_abandoned_await_readers(); // ensure no pending reads - _mi_arena_free(start, MI_REGION_SIZE, arena_memid, (~commit == 0), tld->stats); + _mi_arena_free(start, MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, arena_memid, (~commit == 0), tld->stats); } } } @@ -482,11 +482,21 @@ void _mi_mem_collect(mi_os_tld_t* tld) { -----------------------------------------------------------------------------*/ bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) { - return _mi_os_reset(p, size, tld->stats); + if (mi_option_is_enabled(mi_option_reset_decommits)) { + return _mi_os_decommit(p, size, tld->stats); + } + else { + return _mi_os_reset(p, size, tld->stats); + } } bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { - return _mi_os_unreset(p, size, is_zero, tld->stats); + if (mi_option_is_enabled(mi_option_reset_decommits)) { + return _mi_os_commit(p, size, is_zero, tld->stats); + } + else { + return _mi_os_unreset(p, size, is_zero, tld->stats); + } } bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) { diff --git a/source/luametatex/source/libraries/mimalloc/src/segment-cache.c b/source/luametatex/source/libraries/mimalloc/src/segment-cache.c index da726716a..d93fd6441 100644 --- a/source/luametatex/source/libraries/mimalloc/src/segment-cache.c +++ b/source/luametatex/source/libraries/mimalloc/src/segment-cache.c @@ -45,7 +45,11 @@ static bool mi_cdecl mi_segment_cache_is_suitable(mi_bitmap_index_t bitidx, void return _mi_arena_memid_is_suitable(slot->memid, req_arena_id); } -mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld) +mi_decl_noinline static void* mi_segment_cache_pop_ex( + bool all_suitable, + size_t size, mi_commit_mask_t* commit_mask, + mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, + mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld) { #ifdef MI_CACHE_DISABLE return NULL; @@ -66,8 +70,8 @@ mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* comm mi_bitmap_index_t bitidx = 0; bool claimed = false; mi_arena_id_t req_arena_id =
_req_arena_id; - mi_bitmap_pred_fun_t pred_fun = &mi_segment_cache_is_suitable; // cannot pass NULL as the arena may be exclusive itself; todo: do not put exclusive arenas in the cache? - + mi_bitmap_pred_fun_t pred_fun = (all_suitable ? NULL : &mi_segment_cache_is_suitable); // cannot pass NULL as the arena may be exclusive itself; todo: do not put exclusive arenas in the cache? + if (*large) { // large allowed? claimed = _mi_bitmap_try_find_from_claim_pred(cache_available_large, MI_CACHE_FIELDS, start_field, 1, pred_fun, &req_arena_id, &bitidx); if (claimed) *large = true; @@ -97,6 +101,12 @@ mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* comm #endif } + +mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld) +{ + return mi_segment_cache_pop_ex(false, size, commit_mask, decommit_mask, large, is_pinned, is_zero, _req_arena_id, memid, tld); +} + static mi_decl_noinline void mi_commit_mask_decommit(mi_commit_mask_t* cmask, void* p, size_t total, mi_stats_t* stats) { if (mi_commit_mask_is_empty(cmask)) { @@ -123,14 +133,14 @@ static mi_decl_noinline void mi_commit_mask_decommit(mi_commit_mask_t* cmask, vo #define MI_MAX_PURGE_PER_PUSH (4) -static mi_decl_noinline void mi_segment_cache_purge(bool force, mi_os_tld_t* tld) +static mi_decl_noinline void mi_segment_cache_purge(bool visit_all, bool force, mi_os_tld_t* tld) { MI_UNUSED(tld); if (!mi_option_is_enabled(mi_option_allow_decommit)) return; mi_msecs_t now = _mi_clock_now(); size_t purged = 0; - const size_t max_visits = (force ? MI_CACHE_MAX /* visit all */ : MI_CACHE_FIELDS /* probe at most N (=16) slots */); - size_t idx = (force ? 0 : _mi_random_shuffle((uintptr_t)now) % MI_CACHE_MAX /* random start */ ); + const size_t max_visits = (visit_all ? MI_CACHE_MAX /* visit all */ : MI_CACHE_FIELDS /* probe at most N (=16) slots */); + size_t idx = (visit_all ? 
0 : _mi_random_shuffle((uintptr_t)now) % MI_CACHE_MAX /* random start */ ); for (size_t visited = 0; visited < max_visits; visited++,idx++) { // visit N slots if (idx >= MI_CACHE_MAX) idx = 0; // wrap mi_cache_slot_t* slot = &cache[idx]; @@ -154,13 +164,43 @@ static mi_decl_noinline void mi_segment_cache_purge(bool force, mi_os_tld_t* tld } _mi_bitmap_unclaim(cache_available, MI_CACHE_FIELDS, 1, bitidx); // make it available again for a pop } - if (!force && purged > MI_MAX_PURGE_PER_PUSH) break; // bound to no more than N purge tries per push + if (!visit_all && purged > MI_MAX_PURGE_PER_PUSH) break; // bound to no more than N purge tries per push } } } void _mi_segment_cache_collect(bool force, mi_os_tld_t* tld) { - mi_segment_cache_purge(force, tld ); + if (force) { + // called on `mi_collect(true)` but not on thread termination + _mi_segment_cache_free_all(tld); + } + else { + mi_segment_cache_purge(true /* visit all */, false /* don't force unexpired */, tld); + } +} + +void _mi_segment_cache_free_all(mi_os_tld_t* tld) { + mi_commit_mask_t commit_mask; + mi_commit_mask_t decommit_mask; + bool is_pinned; + bool is_zero; + size_t memid; + const size_t size = MI_SEGMENT_SIZE; + // iterate twice: first large pages, then regular memory + for (int i = 0; i < 2; i++) { + void* p; + do { + // keep popping and freeing the memory + bool large = (i == 0); + p = mi_segment_cache_pop_ex(true /* all */, size, &commit_mask, &decommit_mask, + &large, &is_pinned, &is_zero, _mi_arena_id_none(), &memid, tld); + if (p != NULL) { + size_t csize = _mi_commit_mask_committed_size(&commit_mask, size); + if (csize > 0 && !is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize); + _mi_arena_free(p, size, MI_SEGMENT_ALIGN, 0, memid, is_pinned /* pretend not committed to not double count decommits */, tld->stats); + } + } while (p != NULL); + } } mi_decl_noinline bool _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld) @@ -181,7 +221,7 @@ mi_decl_noinline bool _mi_segment_cache_push(void* start, size_t size, size_t me } // purge expired entries - mi_segment_cache_purge(false /* force? */, tld); + mi_segment_cache_purge(false /* limit purges to a constant N */, false /* don't force unexpired */, tld); // find an available slot mi_bitmap_index_t bitidx; @@ -245,7 +285,7 @@ mi_decl_noinline bool _mi_segment_cache_push(void* start, size_t size, size_t me static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) { - mi_assert_internal(_mi_ptr_segment(segment) == segment); // is it aligned on MI_SEGMENT_SIZE? + mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? if ((uintptr_t)segment >= MI_MAX_ADDRESS) { *bitidx = 0; return MI_SEGMENT_MAP_WSIZE; @@ -285,8 +325,9 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment) { // Determine the segment belonging to a pointer or NULL if it is not in a valid segment. 
static mi_segment_t* _mi_segment_of(const void* p) { + if (p == NULL) return NULL; mi_segment_t* segment = _mi_ptr_segment(p); - if (segment == NULL) return NULL; + mi_assert_internal(segment != NULL); size_t bitidx; size_t index = mi_segment_map_index_of(segment, &bitidx); // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge diff --git a/source/luametatex/source/libraries/mimalloc/src/segment.c b/source/luametatex/source/libraries/mimalloc/src/segment.c index c76c2259e..dc98e3e7b 100644 --- a/source/luametatex/source/libraries/mimalloc/src/segment.c +++ b/source/luametatex/source/libraries/mimalloc/src/segment.c @@ -316,7 +316,7 @@ static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, c ptrdiff_t idx = slice - segment->slices; size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE; // make the start not OS page aligned for smaller blocks to avoid page/cache effects - size_t start_offset = (xblock_size >= MI_INTPTR_SIZE && xblock_size <= 1024 ? MI_MAX_ALIGN_GUARANTEE : 0); + size_t start_offset = (xblock_size >= MI_INTPTR_SIZE && xblock_size <= 1024 ? 3*MI_MAX_ALIGN_GUARANTEE : 0); if (page_size != NULL) { *page_size = psize - start_offset; } return (uint8_t*)segment + ((idx*MI_SEGMENT_SLICE_SIZE) + start_offset); } @@ -336,12 +336,14 @@ static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, siz size_t page_size = _mi_os_page_size(); size_t isize = _mi_align_up(sizeof(mi_segment_t), page_size); size_t guardsize = 0; - + if (MI_SECURE>0) { // in secure mode, we set up a protected page in between the segment info // and the page data (and one at the end of the segment) - guardsize = page_size; - required = _mi_align_up(required, page_size); + guardsize = page_size; + if (required > 0) { + required = _mi_align_up(required, MI_SEGMENT_SLICE_SIZE) + page_size; + } } if (pre_size != NULL) *pre_size = isize; @@ -386,11 +388,13 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) { // _mi_os_free(segment, mi_segment_size(segment), /*segment->memid,*/ tld->stats); const size_t size = mi_segment_size(segment); - if (size != MI_SEGMENT_SIZE || !_mi_segment_cache_push(segment, size, segment->memid, &segment->commit_mask, &segment->decommit_mask, segment->mem_is_large, segment->mem_is_pinned, tld->os)) { + if (size != MI_SEGMENT_SIZE || segment->mem_align_offset != 0 || segment->kind == MI_SEGMENT_HUGE || // only push regular segments on the cache + !_mi_segment_cache_push(segment, size, segment->memid, &segment->commit_mask, &segment->decommit_mask, segment->mem_is_large, segment->mem_is_pinned, tld->os)) + { const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size); if (csize > 0 && !segment->mem_is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize); _mi_abandoned_await_readers(); // wait until safe to free - _mi_arena_free(segment, mi_segment_size(segment), segment->memid, segment->mem_is_pinned /* pretend not committed to not double count decommits */, tld->os); + _mi_arena_free(segment, mi_segment_size(segment), segment->mem_alignment, segment->mem_align_offset, segment->memid, segment->mem_is_pinned /* pretend not committed to not double count decommits */, tld->stats); } } @@ -402,11 +406,11 @@ void _mi_segment_thread_collect(mi_segments_tld_t* tld) { /* ----------------------------------------------------------- - Span management + Commit/Decommit ranges ----------------------------------------------------------- */ static void 
mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uint8_t* p, size_t size, uint8_t** start_p, size_t* full_size, mi_commit_mask_t* cm) { - mi_assert_internal(_mi_ptr_segment(p) == segment); + mi_assert_internal(_mi_ptr_segment(p + 1) == segment); mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); mi_commit_mask_create_empty(cm); if (size == 0 || size > MI_SEGMENT_SIZE || segment->kind == MI_SEGMENT_HUGE) return; @@ -459,15 +463,6 @@ static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uin static bool mi_segment_commitx(mi_segment_t* segment, bool commit, uint8_t* p, size_t size, mi_stats_t* stats) { mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask)); - // try to commit in at least MI_MINIMAL_COMMIT_SIZE sizes. - /* - if (commit && size > 0) { - const size_t csize = _mi_align_up(size, MI_MINIMAL_COMMIT_SIZE); - if (p + csize <= mi_segment_end(segment)) { - size = csize; - } - } - */ // commit liberal, but decommit conservative uint8_t* start = NULL; size_t full_size = 0; @@ -536,8 +531,12 @@ static void mi_segment_perhaps_decommit(mi_segment_t* segment, uint8_t* p, size_ } else if (segment->decommit_expire <= now) { // previous decommit mask already expired - // mi_segment_delayed_decommit(segment, true, stats); - segment->decommit_expire = now + mi_option_get(mi_option_decommit_extend_delay); // (mi_option_get(mi_option_decommit_delay) / 8); // wait a tiny bit longer in case there is a series of free's + if (segment->decommit_expire + mi_option_get(mi_option_decommit_extend_delay) <= now) { + mi_segment_delayed_decommit(segment, true, stats); + } + else { + segment->decommit_expire = now + mi_option_get(mi_option_decommit_extend_delay); // (mi_option_get(mi_option_decommit_delay) / 8); // wait a tiny bit longer in case there is a series of free's + } } else { // previous decommit mask is not yet expired, increase the expiration by a bit. @@ -570,12 +569,16 @@ static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_st } +/* ----------------------------------------------------------- + Span free +----------------------------------------------------------- */ + static bool mi_segment_is_abandoned(mi_segment_t* segment) { return (segment->thread_id == 0); } // note: can be called on abandoned segments -static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) { +static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_decommit, mi_segments_tld_t* tld) { mi_assert_internal(slice_index < segment->slice_entries); mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) ? 
NULL : mi_span_queue_for(slice_count,tld)); @@ -595,7 +598,9 @@ static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size } // perhaps decommit - mi_segment_perhaps_decommit(segment,mi_slice_start(slice),slice_count*MI_SEGMENT_SLICE_SIZE,tld->stats); + if (allow_decommit) { + mi_segment_perhaps_decommit(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats); + } // and push it on the free page queue (if it was not a huge page) if (sq != NULL) mi_span_queue_push( sq, slice ); @@ -657,27 +662,20 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_ } // and add the new free page - mi_segment_span_free(segment, mi_slice_index(slice), slice_count, tld); + mi_segment_span_free(segment, mi_slice_index(slice), slice_count, true, tld); return slice; } -static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) { - mi_assert_internal(_mi_ptr_segment(slice)==segment); - mi_assert_internal(slice->slice_count >= slice_count); - mi_assert_internal(slice->xblock_size > 0); // no more in free queue - if (slice->slice_count <= slice_count) return; - mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); - size_t next_index = mi_slice_index(slice) + slice_count; - size_t next_count = slice->slice_count - slice_count; - mi_segment_span_free(segment, next_index, next_count, tld); - slice->slice_count = (uint32_t)slice_count; -} + +/* ----------------------------------------------------------- + Page allocation +----------------------------------------------------------- */ // Note: may still return NULL if committing the memory failed static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) { mi_assert_internal(slice_index < segment->slice_entries); - mi_slice_t* slice = &segment->slices[slice_index]; + mi_slice_t* const slice = &segment->slices[slice_index]; mi_assert_internal(slice->xblock_size==0 || slice->xblock_size==1); // commit before changing the slice data @@ -698,18 +696,21 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i size_t extra = slice_count-1; if (extra > MI_MAX_SLICE_OFFSET) extra = MI_MAX_SLICE_OFFSET; if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1; // huge objects may have more slices than avaiable entries in the segment->slices - slice++; - for (size_t i = 1; i <= extra; i++, slice++) { - slice->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i); - slice->slice_count = 0; - slice->xblock_size = 1; + + mi_slice_t* slice_next = slice + 1; + for (size_t i = 1; i <= extra; i++, slice_next++) { + slice_next->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i); + slice_next->slice_count = 0; + slice_next->xblock_size = 1; } - // and also for the last one (if not set already) (the last one is needed for coalescing) + // and also for the last one (if not set already) (the last one is needed for coalescing and for large alignments) // note: the cast is needed for ubsan since the index can be larger than MI_SLICES_PER_SEGMENT for huge allocations (see #543) - mi_slice_t* last = &((mi_slice_t*)segment->slices)[slice_index + slice_count - 1]; - if (last < mi_segment_slices_end(segment) && last >= slice) { - last->slice_offset = (uint32_t)(sizeof(mi_slice_t)*(slice_count-1)); + mi_slice_t* last = slice + slice_count - 1; + mi_slice_t* end = (mi_slice_t*)mi_segment_slices_end(segment); + if (last > end) last = 
end; + if (last > slice) { + last->slice_offset = (uint32_t)(sizeof(mi_slice_t) * (last - slice)); last->slice_count = 0; last->xblock_size = 1; } @@ -721,6 +722,18 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i return page; } +static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) { + mi_assert_internal(_mi_ptr_segment(slice) == segment); + mi_assert_internal(slice->slice_count >= slice_count); + mi_assert_internal(slice->xblock_size > 0); // no more in free queue + if (slice->slice_count <= slice_count) return; + mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); + size_t next_index = mi_slice_index(slice) + slice_count; + size_t next_count = slice->slice_count - slice_count; + mi_segment_span_free(segment, next_index, next_count, false /* don't decommit left-over part */, tld); + slice->slice_count = (uint32_t)slice_count; +} + static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) { mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_LARGE_OBJ_SIZE_MAX); // search from best fit up @@ -735,20 +748,20 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren // found a suitable page span mi_span_queue_delete(sq, slice); - if (slice->slice_count > slice_count) { - mi_segment_slice_split(segment, slice, slice_count, tld); - } - mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->xblock_size > 0); - mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld); - if (page == NULL) { - // commit failed; return NULL but first restore the slice - mi_segment_span_free_coalesce(slice, tld); - return NULL; + if (slice->slice_count > slice_count) { + mi_segment_slice_split(segment, slice, slice_count, tld); + } + mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->xblock_size > 0); + mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld); + if (page == NULL) { + // commit failed; return NULL but first restore the slice + mi_segment_span_free_coalesce(slice, tld); + return NULL; + } + return page; } - return page; } } - } sq++; } // could not find a page.. 
@@ -760,108 +773,129 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren Segment allocation ----------------------------------------------------------- */ +static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delay, mi_arena_id_t req_arena_id, + size_t* psegment_slices, size_t* ppre_size, size_t* pinfo_slices, + mi_commit_mask_t* pcommit_mask, mi_commit_mask_t* pdecommit_mask, + bool* is_zero, bool* pcommit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) + +{ + // Allocate the segment from the OS + bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy + bool is_pinned = false; + size_t memid = 0; + size_t align_offset = 0; + size_t alignment = MI_SEGMENT_ALIGN; + + if (page_alignment > 0) { + // mi_assert_internal(huge_page != NULL); + mi_assert_internal(page_alignment >= MI_SEGMENT_ALIGN); + alignment = page_alignment; + const size_t info_size = (*pinfo_slices) * MI_SEGMENT_SLICE_SIZE; + align_offset = _mi_align_up( info_size, MI_SEGMENT_ALIGN ); + const size_t extra = align_offset - info_size; + // recalculate due to potential guard pages + *psegment_slices = mi_segment_calculate_slices(required + extra, ppre_size, pinfo_slices); + //segment_size += _mi_align_up(align_offset - info_size, MI_SEGMENT_SLICE_SIZE); + //segment_slices = segment_size / MI_SEGMENT_SLICE_SIZE; + } + const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE; + mi_segment_t* segment = NULL; + + // get from cache? + if (page_alignment == 0) { + segment = (mi_segment_t*)_mi_segment_cache_pop(segment_size, pcommit_mask, pdecommit_mask, &mem_large, &is_pinned, is_zero, req_arena_id, &memid, os_tld); + } + + // get from OS + if (segment==NULL) { + segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, pcommit, &mem_large, &is_pinned, is_zero, req_arena_id, &memid, os_tld); + if (segment == NULL) return NULL; // failed to allocate + if (*pcommit) { + mi_commit_mask_create_full(pcommit_mask); + } + else { + mi_commit_mask_create_empty(pcommit_mask); + } + } + mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); + + const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE); + mi_assert_internal(commit_needed>0); + mi_commit_mask_t commit_needed_mask; + mi_commit_mask_create(0, commit_needed, &commit_needed_mask); + if (!mi_commit_mask_all_set(pcommit_mask, &commit_needed_mask)) { + // at least commit the info slices + mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE); + bool ok = _mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, is_zero, tld->stats); + if (!ok) return NULL; // failed to commit + mi_commit_mask_set(pcommit_mask, &commit_needed_mask); + } + mi_track_mem_undefined(segment,commit_needed); + segment->memid = memid; + segment->mem_is_pinned = is_pinned; + segment->mem_is_large = mem_large; + segment->mem_is_committed = mi_commit_mask_is_full(pcommit_mask); + segment->mem_alignment = alignment; + segment->mem_align_offset = align_offset; + mi_segments_track_size((long)(segment_size), tld); + _mi_segment_map_allocated_at(segment); + return segment; +} + + // Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . 
-static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page) +static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page) { mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL)); - mi_assert_internal((segment==NULL) || (segment!=NULL && required==0)); + // calculate needed sizes first size_t info_slices; size_t pre_size; - const size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices); - const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices); - const size_t segment_size = segment_slices * MI_SEGMENT_SLICE_SIZE; - + size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices); + // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little) const bool eager_delay = (// !_mi_os_has_overcommit() && // never delay on overcommit systems _mi_current_thread_count() > 1 && // do not delay for the first N threads tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit); - bool commit = eager || (required > 0); - - // Try to get from our cache first - bool is_zero = false; - const bool commit_info_still_good = (segment != NULL); + bool commit = eager || (required > 0); + bool is_zero = false; + mi_commit_mask_t commit_mask; mi_commit_mask_t decommit_mask; - if (segment != NULL) { - commit_mask = segment->commit_mask; - decommit_mask = segment->decommit_mask; - } - else { - mi_commit_mask_create_empty(&commit_mask); - mi_commit_mask_create_empty(&decommit_mask); - } - if (segment==NULL) { - // Allocate the segment from the OS - bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy - bool is_pinned = false; - size_t memid = 0; - segment = (mi_segment_t*)_mi_segment_cache_pop(segment_size, &commit_mask, &decommit_mask, &mem_large, &is_pinned, &is_zero, req_arena_id, &memid, os_tld); - if (segment==NULL) { - segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_pinned, &is_zero, req_arena_id, &memid, os_tld); - if (segment == NULL) return NULL; // failed to allocate - if (commit) { - mi_commit_mask_create_full(&commit_mask); - } - else { - mi_commit_mask_create_empty(&commit_mask); - } - } - mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); - - const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE); - mi_assert_internal(commit_needed>0); - mi_commit_mask_t commit_needed_mask; - mi_commit_mask_create(0, commit_needed, &commit_needed_mask); - if (!mi_commit_mask_all_set(&commit_mask, &commit_needed_mask)) { - // at least commit the info slices - mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= info_slices*MI_SEGMENT_SLICE_SIZE); - bool ok = _mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, &is_zero, tld->stats); - if (!ok) return NULL; // failed to commit - mi_commit_mask_set(&commit_mask, &commit_needed_mask); - } - mi_track_mem_undefined(segment,commit_needed); - segment->memid = memid; - segment->mem_is_pinned = is_pinned; - segment->mem_is_large = mem_large; - segment->mem_is_committed = 
mi_commit_mask_is_full(&commit_mask); - mi_segments_track_size((long)(segment_size), tld); - _mi_segment_map_allocated_at(segment); - } - - // zero the segment info? -- not always needed as it is zero initialized from the OS + mi_commit_mask_create_empty(&commit_mask); + mi_commit_mask_create_empty(&decommit_mask); + + // Allocate the segment from the OS + mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, + &segment_slices, &pre_size, &info_slices, &commit_mask, &decommit_mask, + &is_zero, &commit, tld, os_tld); + if (segment == NULL) return NULL; + + // zero the segment info? -- not always needed as it may be zero initialized from the OS mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); // tsan if (!is_zero) { ptrdiff_t ofs = offsetof(mi_segment_t, next); size_t prefix = offsetof(mi_segment_t, slices) - ofs; - memset((uint8_t*)segment+ofs, 0, prefix + sizeof(mi_slice_t)*segment_slices); - } - - if (!commit_info_still_good) { - segment->commit_mask = commit_mask; // on lazy commit, the initial part is always committed - segment->allow_decommit = (mi_option_is_enabled(mi_option_allow_decommit) && !segment->mem_is_pinned && !segment->mem_is_large); - if (segment->allow_decommit) { - segment->decommit_expire = _mi_clock_now() + mi_option_get(mi_option_decommit_delay); - segment->decommit_mask = decommit_mask; - mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask)); - #if MI_DEBUG>2 - const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE); - mi_commit_mask_t commit_needed_mask; - mi_commit_mask_create(0, commit_needed, &commit_needed_mask); - mi_assert_internal(!mi_commit_mask_any_set(&segment->decommit_mask, &commit_needed_mask)); - #endif - } - else { - mi_assert_internal(mi_commit_mask_is_empty(&decommit_mask)); - segment->decommit_expire = 0; - mi_commit_mask_create_empty( &segment->decommit_mask ); - mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask)); - } + memset((uint8_t*)segment+ofs, 0, prefix + sizeof(mi_slice_t)*(segment_slices+1)); // one more } - + segment->commit_mask = commit_mask; // on lazy commit, the initial part is always committed + segment->allow_decommit = (mi_option_is_enabled(mi_option_allow_decommit) && !segment->mem_is_pinned && !segment->mem_is_large); + if (segment->allow_decommit) { + segment->decommit_expire = 0; // don't decommit just committed memory // _mi_clock_now() + mi_option_get(mi_option_decommit_delay); + segment->decommit_mask = decommit_mask; + mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask)); + #if MI_DEBUG>2 + const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE); + mi_commit_mask_t commit_needed_mask; + mi_commit_mask_create(0, commit_needed, &commit_needed_mask); + mi_assert_internal(!mi_commit_mask_any_set(&segment->decommit_mask, &commit_needed_mask)); + #endif + } + // initialize segment info + const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? 
MI_SLICES_PER_SEGMENT : segment_slices); segment->segment_slices = segment_slices; segment->segment_info_slices = info_slices; segment->thread_id = _mi_thread_id(); @@ -896,7 +930,7 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_ // initialize initial free pages if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page mi_assert_internal(huge_page==NULL); - mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, tld); + mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, false /* don't decommit */, tld); } else { mi_assert_internal(huge_page!=NULL); @@ -911,12 +945,6 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_ } -// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . -static mi_segment_t* mi_segment_alloc(size_t required, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page) { - return mi_segment_init(NULL, required, req_arena_id, tld, os_tld, huge_page); -} - - static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { MI_UNUSED(force); mi_assert_internal(segment != NULL); @@ -1058,7 +1086,7 @@ static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL // Maintain these for debug purposes (these counts may be a bit off) -static mi_decl_cache_align _Atomic(size_t) abandoned_count; +static mi_decl_cache_align _Atomic(size_t) abandoned_count; static mi_decl_cache_align _Atomic(size_t) abandoned_visited_count; // We also maintain a count of current readers of the abandoned list @@ -1369,7 +1397,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice { *reclaimed = false; mi_segment_t* segment; - long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024); // limit the work to bound allocation times + long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024); // limit the work to bound allocation times while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) { segment->abandoned_visits++; // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments @@ -1432,7 +1460,7 @@ void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) Reclaim or allocate ----------------------------------------------------------- */ -static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); @@ -1450,7 +1478,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_ return segment; } // 2. 
otherwise allocate a fresh segment - return mi_segment_alloc(0, heap->arena_id, tld, os_tld, NULL); + return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL); } @@ -1490,17 +1518,37 @@ static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_ki Huge page allocation ----------------------------------------------------------- */ -static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page = NULL; - mi_segment_t* segment = mi_segment_alloc(size,req_arena_id,tld,os_tld,&page); + mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,os_tld,&page); if (segment == NULL || page==NULL) return NULL; mi_assert_internal(segment->used==1); mi_assert_internal(mi_page_block_size(page) >= size); + #if MI_HUGE_PAGE_ABANDON segment->thread_id = 0; // huge segments are immediately abandoned + #endif + + // for huge pages we initialize the xblock_size as we may + // overallocate to accommodate large alignments. + size_t psize; + uint8_t* start = _mi_segment_page_start(segment, page, &psize); + page->xblock_size = (psize > MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : (uint32_t)psize); + + // decommit the part of the prefix of a page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) + if (page_alignment > 0 && segment->allow_decommit) { + uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); + mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment)); + mi_assert_internal(psize - (aligned_p - start) >= size); + uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list + ptrdiff_t decommit_size = aligned_p - decommit_start; + _mi_os_decommit(decommit_start, decommit_size, &_mi_stats_main); // note: cannot use segment_decommit on huge segments + } + return page; } +#if MI_HUGE_PAGE_ABANDON // free huge block from another thread void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { // huge page segments are always abandoned and can be freed immediately by any thread @@ -1528,12 +1576,34 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block #endif } +#else +// reset memory of a huge block from another thread +void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { + MI_UNUSED(page); + mi_assert_internal(segment->kind == MI_SEGMENT_HUGE); + mi_assert_internal(segment == _mi_page_segment(page)); + mi_assert_internal(page->used == 1); // this is called just before the free + mi_assert_internal(page->free == NULL); + if (segment->allow_decommit) { + const size_t csize = mi_usable_size(block) - sizeof(mi_block_t); + uint8_t* p = (uint8_t*)block + sizeof(mi_block_t); + _mi_os_decommit(p, csize, &_mi_stats_main); // note: cannot use segment_decommit on huge segments + } +} +#endif + /* ----------------------------------------------------------- Page allocation and free ----------------------------------------------------------- */ -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { +mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page; - if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { + if 
mi_unlikely(page_alignment > MI_ALIGNMENT_MAX) { + mi_assert_internal(_mi_is_power_of_two(page_alignment)); + mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); + if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; } + page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld); + } + else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld,os_tld); } else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) { @@ -1543,7 +1613,7 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segment page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld); } else { - page = mi_segment_huge_page_alloc(block_size,heap->arena_id,tld,os_tld); + page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld); } mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid)); mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); diff --git a/source/luametatex/source/libraries/mimalloc/src/stats.c b/source/luametatex/source/libraries/mimalloc/src/stats.c index f82c7c67f..2a8b9404f 100644 --- a/source/luametatex/source/libraries/mimalloc/src/stats.c +++ b/source/luametatex/source/libraries/mimalloc/src/stats.c @@ -21,7 +21,7 @@ terms of the MIT license. A copy of the license can be found in the file static bool mi_is_in_main(void* stat) { return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main - && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); + && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); } static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { @@ -51,7 +51,7 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { } } -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { +void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { if (mi_is_in_main(stat)) { mi_atomic_addi64_relaxed( &stat->count, 1 ); mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount ); @@ -77,7 +77,7 @@ static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64 mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit); mi_atomic_addi64_relaxed( &stat->current, src->current * unit); mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit); - // peak scores do not work across threads.. + // peak scores do not work across threads.. mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit); } @@ -129,11 +129,11 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { Display statistics ----------------------------------------------------------- */ -// unit > 0 : size in binary bytes +// unit > 0 : size in binary bytes // unit == 0: count as decimal // unit < 0 : count in binary static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) { - char buf[32]; buf[0] = 0; + char buf[32]; buf[0] = 0; int len = 32; const char* suffix = (unit <= 0 ? " " : "B"); const int64_t base = (unit == 0 ? 
1000 : 1024); @@ -146,7 +146,7 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* } } else { - int64_t divider = base; + int64_t divider = base; const char* magnitude = "K"; if (pos >= divider*base) { divider *= base; magnitude = "M"; } if (pos >= divider*base) { divider *= base; magnitude = "G"; } @@ -170,19 +170,23 @@ static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* ar else mi_print_amount(n,0,out,arg); } -static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg ) { +static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) { _mi_fprintf(out, arg,"%10s:", msg); - if (unit>0) { + if (unit > 0) { mi_print_amount(stat->peak, unit, out, arg); mi_print_amount(stat->allocated, unit, out, arg); mi_print_amount(stat->freed, unit, out, arg); mi_print_amount(stat->current, unit, out, arg); mi_print_amount(unit, 1, out, arg); mi_print_count(stat->allocated, unit, out, arg); - if (stat->allocated > stat->freed) - _mi_fprintf(out, arg, " not all freed!\n"); - else + if (stat->allocated > stat->freed) { + _mi_fprintf(out, arg, " "); + _mi_fprintf(out, arg, (notok == NULL ? "not all freed!" : notok)); + _mi_fprintf(out, arg, "\n"); + } + else { _mi_fprintf(out, arg, " ok\n"); + } } else if (unit<0) { mi_print_amount(stat->peak, -1, out, arg); @@ -204,12 +208,16 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t else { mi_print_amount(stat->peak, 1, out, arg); mi_print_amount(stat->allocated, 1, out, arg); - _mi_fprintf(out, arg, "%11s", " "); // no freed + _mi_fprintf(out, arg, "%11s", " "); // no freed mi_print_amount(stat->current, 1, out, arg); _mi_fprintf(out, arg, "\n"); } } +static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) { + mi_stat_print_ex(stat, msg, unit, out, arg, NULL); +} + static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) { _mi_fprintf(out, arg, "%10s:", msg); mi_print_amount(stat->total, -1, out, arg); @@ -217,7 +225,7 @@ static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg } static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) { - const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count)); + const int64_t avg_tens = (stat->count == 0 ? 
0 : (stat->total*10 / stat->count)); const long avg_whole = (long)(avg_tens/10); const long avg_frac1 = (long)(avg_tens%10); _mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1); @@ -257,7 +265,7 @@ typedef struct buffered_s { mi_output_fun* out; // original output function void* arg; // and state char* buf; // local buffer of at least size `count+1` - size_t used; // currently used chars `used <= count` + size_t used; // currently used chars `used <= count` size_t count; // total chars available for output } buffered_t; @@ -312,8 +320,8 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print(&stats->malloc, "malloc req", 1, out, arg); _mi_fprintf(out, arg, "\n"); #endif - mi_stat_print(&stats->reserved, "reserved", 1, out, arg); - mi_stat_print(&stats->committed, "committed", 1, out, arg); + mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, ""); + mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, ""); mi_stat_print(&stats->reset, "reset", 1, out, arg); mi_stat_print(&stats->page_committed, "touched", 1, out, arg); mi_stat_print(&stats->segments, "segments", -1, out, arg); @@ -328,7 +336,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print(&stats->threads, "threads", -1, out, arg); mi_stat_counter_print_avg(&stats->searches, "searches", out, arg); _mi_fprintf(out, arg, "%10s: %7zu\n", "numa nodes", _mi_os_numa_node_count()); - + mi_msecs_t elapsed; mi_msecs_t user_time; mi_msecs_t sys_time; @@ -346,7 +354,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) _mi_fprintf(out, arg, ", commit: "); mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s"); } - _mi_fprintf(out, arg, "\n"); + _mi_fprintf(out, arg, "\n"); } static mi_msecs_t mi_process_start; // = 0 @@ -406,7 +414,7 @@ static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) { mfreq.QuadPart = f.QuadPart/1000LL; if (mfreq.QuadPart == 0) mfreq.QuadPart = 1; } - return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart); + return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart); } mi_msecs_t _mi_clock_now(void) { @@ -421,7 +429,7 @@ mi_msecs_t _mi_clock_now(void) { struct timespec t; #ifdef CLOCK_MONOTONIC clock_gettime(CLOCK_MONOTONIC, &t); - #else + #else clock_gettime(CLOCK_REALTIME, &t); #endif return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); @@ -457,8 +465,6 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) { #if defined(_WIN32) #include <windows.h> -#include <psapi.h> -#pragma comment(lib,"psapi.lib") static mi_msecs_t filetime_msecs(const FILETIME* ftime) { ULARGE_INTEGER i; @@ -468,7 +474,23 @@ static mi_msecs_t filetime_msecs(const FILETIME* ftime) { return msecs; } -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) +typedef struct _PROCESS_MEMORY_COUNTERS { + DWORD cb; + DWORD PageFaultCount; + SIZE_T PeakWorkingSetSize; + SIZE_T WorkingSetSize; + SIZE_T QuotaPeakPagedPoolUsage; + SIZE_T QuotaPagedPoolUsage; + SIZE_T QuotaPeakNonPagedPoolUsage; + SIZE_T QuotaNonPagedPoolUsage; + SIZE_T PagefileUsage; + SIZE_T PeakPagefileUsage; +} PROCESS_MEMORY_COUNTERS; +typedef PROCESS_MEMORY_COUNTERS* PPROCESS_MEMORY_COUNTERS; +typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD); +static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL; + +static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t*
stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) { *elapsed = _mi_clock_end(mi_process_start); FILETIME ct; @@ -478,13 +500,26 @@ static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msec GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); *utime = filetime_msecs(&ut); *stime = filetime_msecs(&st); + + // load psapi on demand + if (pGetProcessMemoryInfo == NULL) { + HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll")); + if (hDll != NULL) { + pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo"); + } + } + + // get process info PROCESS_MEMORY_COUNTERS info; - GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); + memset(&info, 0, sizeof(info)); + if (pGetProcessMemoryInfo != NULL) { + pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); + } *current_rss = (size_t)info.WorkingSetSize; *peak_rss = (size_t)info.PeakWorkingSetSize; *current_commit = (size_t)info.PagefileUsage; *peak_commit = (size_t)info.PeakPagefileUsage; - *page_faults = (size_t)info.PageFaultCount; + *page_faults = (size_t)info.PageFaultCount; } #elif !defined(__wasi__) && (defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__)) @@ -517,7 +552,7 @@ static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msec // estimate commit using our stats *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - *current_rss = *current_commit; // estimate + *current_rss = *current_commit; // estimate #if defined(__HAIKU__) // Haiku does not have (yet?) a way to // get these stats per process @@ -538,7 +573,7 @@ static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msec } #else *peak_rss = rusage.ru_maxrss * 1024; // Linux reports in KiB -#endif +#endif } #else @@ -570,7 +605,7 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s size_t peak_rss0 = 0; size_t current_commit0 = 0; size_t peak_commit0 = 0; - size_t page_faults0 = 0; + size_t page_faults0 = 0; mi_stat_process_info(&elapsed,&utime, &stime, ¤t_rss0, &peak_rss0, ¤t_commit0, &peak_commit0, &page_faults0); if (elapsed_msecs!=NULL) *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX)); if (user_msecs!=NULL) *user_msecs = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX)); @@ -581,4 +616,3 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s if (peak_commit!=NULL) *peak_commit = peak_commit0; if (page_faults!=NULL) *page_faults = page_faults0; } - -- cgit v1.2.3
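
For reference, a minimal caller-side sketch of the large-alignment path touched by this patch: _mi_segment_page_alloc now routes page_alignment above MI_ALIGNMENT_MAX to mi_segment_huge_page_alloc, which over-allocates a dedicated huge segment and decommits the unused prefix. The public entry point mi_malloc_aligned and the 64 MiB alignment below are illustrative assumptions, not part of the commit.

/* Hypothetical demo: request an alignment assumed to exceed MI_ALIGNMENT_MAX
   so the allocation is served by the huge-page path (dedicated segment,
   unused prefix decommitted). */
#include <stdio.h>
#include <stdint.h>
#include <mimalloc.h>

int main(void) {
  const size_t alignment = (size_t)64 * 1024 * 1024;  /* assumed > MI_ALIGNMENT_MAX */
  void* p = mi_malloc_aligned(1024, alignment);        /* public mimalloc API */
  if (p != NULL) {
    printf("aligned: %s\n", ((uintptr_t)p % alignment) == 0 ? "yes" : "no");
    mi_free(p);
  }
  return 0;
}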