1 files changed, 627 insertions, 0 deletions
diff --git a/source/luametatex/source/libraries/hnj/hnjhyphen.c b/source/luametatex/source/libraries/hnj/hnjhyphen.c
new file mode 100644
index 000000000..ad9d87683
--- /dev/null
+++ b/source/luametatex/source/libraries/hnj/hnjhyphen.c
@@ -0,0 +1,627 @@
+/*
+    See license.txt in the root of this project.
+*/
+
+/*
+
+    This file is derived from libhnj which is is dual licensed under LGPL and MPL. Boilerplate
+    for both licenses follows.
+
+    LibHnj - a library for high quality hyphenation and justification
+
+    (C) 1998 Raph Levien,
+    (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
+    (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
+
+    This library is free software; you can redistribute it and/or modify it under the terms of the
+    GNU Library General Public License as published by the Free Software Foundation; either version
+    2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+    without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+    the GNU Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public License along with this
+    library; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+    Boston, MA 02111-1307 USA.
+
+    The contents of this file are subject to the Mozilla Public License Version 1.0 (the "MPL");
+    you may not use this file except in compliance with the MPL. You may obtain a copy of the MPL
+    at http://www.mozilla.org/MPL/
+
+    Software distributed under the MPL is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+    KIND, either express or implied. See the MPL for the specific language governing rights and
+    limitations under the MPL.
+
+    Remark: I'm not sure if something fundamental was adapted in the perspective of using this
+    library in LuaTeX. However, for instance error reporting has been hooked into the Lua(Meta)TeX
+    error reporting mechanisms. Also a bit of reformatting was done. This module won't change.
+    Also, the code has been adapted a little in order to fit in the rest (function names etc)
+    because it is more exposed. We use the alternative memory allocator.
+
+*/
+
+/*tex We need the warning subsystem, so: */
+
+# include "luametatex.h"
+
+/*tex A few helpers (from |hnjalloc|): */
+
+static void *hnj_malloc(int size)
+{
+    void *p = lmt_memory_malloc((size_t) size);
+    if (! p) {
+        tex_formatted_error("hyphenation", "allocating %d bytes failed\n", size);
+    }
+    return p;
+}
+
+static void *hnj_realloc(void *p, int size)
+{
+    void *n = lmt_memory_realloc(p, (size_t) size);
+    if (! n) {
+        tex_formatted_error("hyphenation", "reallocating %d bytes failed\n", size);
+    }
+    return n;
+}
+
+static void hnj_free(void *p)
+{
+    lmt_memory_free(p);
+}
+
+static unsigned char *hnj_strdup(const unsigned char *s)
+{
+    size_t l = strlen((const char *) s);
+    unsigned char *n = hnj_malloc((int) l + 1);
+    memcpy(n, s, l);
+    n[l] = 0;
+    return n;
+}
+
+/*tex
+
+    Combine two right-aligned number patterns, 04000 + 020 becomes 04020. This works also for utf8
+    sequences because the substring is identical to the last |substring - length| bytes of expr
+    except for the (single byte) hyphenation encoders
+
+*/
+
+static char *combine_patterns(char *expr, const char *subexpr)
+{
+    size_t l1 = strlen(expr);
+    size_t l2 = strlen(subexpr);
+    size_t off = l1 - l2;
+    for (unsigned j = 0; j < l2; j++) {
+        if (expr[off + j] < subexpr[j]) {
+            expr[off + j] = subexpr[j];
+        }
+    }
+    return expr;
+}
+
+/*tex Some original code: */
+
+static hjn_hashiterator *new_hashiterator(hjn_hashtable *h)
+{
+    hjn_hashiterator *i = hnj_malloc(sizeof(hjn_hashiterator));
+    i->e = h->entries;
+    i->cur = NULL;
+    i->ndx = -1;
+    return i;
+}
+
+static int nexthashstealpattern(hjn_hashiterator *i, unsigned char **word, char **pattern)
+{
+    while (i->cur == NULL) {
+        if (i->ndx >= HNJ_HASH_SIZE - 1) {
+            return 0;
+        } else {
+            i->cur = i->e[++i->ndx];
+        }
+    }
+    *word = i->cur->key;
+    *pattern = i->cur->u.hyppat;
+    i->cur->u.hyppat = NULL;
+    i->cur = i->cur->next;
+    return 1;
+}
+
+static int nexthash(hjn_hashiterator *i, unsigned char **word)
+{
+    while (! i->cur) {
+        if (i->ndx >= HNJ_HASH_SIZE - 1) {
+            return 0;
+        } else {
+            i->cur = i->e[++i->ndx];
+        }
+    }
+    *word = i->cur->key;
+    i->cur = i->cur->next;
+    return 1;
+}
+
+static int eachhash(hjn_hashiterator *i, unsigned char **word, char **pattern)
+{
+    while (! i->cur) {
+        if (i->ndx >= HNJ_HASH_SIZE - 1) {
+            return 0;
+        } else {
+            i->cur = i->e[++i->ndx];
+        }
+    }
+    *word = i->cur->key;
+    *pattern = i->cur->u.hyppat;
+    i->cur = i->cur->next;
+    return 1;
+}
+
+static void delete_hashiterator(hjn_hashiterator *i)
+{
+    hnj_free(i);
+}
+
+/*tex A |char*| hash function from ASU, adapted from |Gtk+|: */
+
+static unsigned int string_hash(const unsigned char *s)
+{
+    const unsigned char *p = s;
+    unsigned int h = 0, g;
+    for (; *p != '\0'; p += 1) {
+        h = (h << 4) + *p;
+        g = h & 0xf0000000;
+        if (g) {
+            h = h ^ (g >> 24);
+            h = h ^ g;
+        }
+    }
+    return h;
+}
+
+/*tex This assumes that key is not already present! */
+
+static void state_insert(hjn_hashtable *hashtab, unsigned char *key, int state)
+{
+    int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
+    hjn_hashentry* e = hnj_malloc(sizeof(hjn_hashentry));
+    e->next = hashtab->entries[i];
+    e->key = key;
+    e->u.state = state;
+    hashtab->entries[i] = e;
+}
+
+/*tex This also assumes that key is not already present! */
+
+static void insert_pattern(hjn_hashtable *hashtab, unsigned char *key, char *hyppat, int trace)
+{
+    hjn_hashentry *e;
+    int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
+    for (e = hashtab->entries[i]; e; e = e->next) {
+        if (strcmp((char *) e->key, (char *) key) == 0) {
+            if (e->u.hyppat) {
+                if (trace && hyppat && strcmp((char *) e->u.hyppat, (char *) hyppat) != 0) {
+                    tex_formatted_warning("hyphenation", "a conflicting pattern '%s' has been ignored", hyppat);
+                }
+                hnj_free(e->u.hyppat);
+            }
+            e->u.hyppat = hyppat;
+            hnj_free(key);
+            return;
+        }
+    }
+    e = hnj_malloc(sizeof(hjn_hashentry));
+    e->next = hashtab->entries[i];
+    e->key = key;
+    e->u.hyppat = hyppat;
+    hashtab->entries[i] = e;
+}
+
+/*tex We return |state| if found, otherwise |-1|. */
+
+static int state_lookup(hjn_hashtable *hashtab, const unsigned char *key)
+{
+    int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
+    for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) {
+        if (! strcmp((const char *) key, (const char *) e->key)) {
+            return e->u.state;
+        }
+    }
+    return -1;
+}
+
+/*tex We return |state| if found, otherwise |-1|. The 256 should be enough. */
+
+static char *lookup_pattern(hjn_hashtable * hashtab, const unsigned char *chars, int l)
+{
+    int i;
+    unsigned char key[256];
+    strncpy((char *) key, (const char *) chars, (size_t) l);
+    key[l] = 0;
+    i = (int) (string_hash(key) % HNJ_HASH_SIZE);
+    for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) {
+        if (! strcmp((char *) key, (char *) e->key)) {
+            return e->u.hyppat;
+        }
+    }
+    return NULL;
+}
+
+/*tex Get the state number, allocating a new state if necessary. */
+
+static int hnj_get_state(hjn_dictionary *dict, const unsigned char *str, int *state_num)
+{
+    *state_num = state_lookup(dict->state_num, str);
+    if (*state_num >= 0) {
+        return *state_num;
+    } else {
+        state_insert(dict->state_num, hnj_strdup(str), dict->num_states);
+        /*tex The predicate is true if |dict->num_states| is a power of two: */
+        if (! (dict->num_states & (dict->num_states - 1))) {
+            dict->states = hnj_realloc(dict->states, (int) ((dict->num_states << 1) * (int) sizeof(hjn_state)));
+        }
+        dict->states[dict->num_states] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL };
+        return dict->num_states++;
+    }
+}
+
+/*tex
+
+    Add a transition from state1 to state2 through ch - assumes that the transition does not
+    already exist.
+
+*/
+
+static void hnj_add_trans(hjn_dictionary *dict, int state1, int state2, int chr)
+{
+    /*tex
+
+        This test was a bit too strict, it is quite normal for old patterns to have chars in the
+        range 0-31 or 127-159 (inclusive). To ease the transition, let's only disallow |nul| for
+        now, which probably is a requirement of the code anyway.
+
+    */
+    if (chr) {
+        int num_trans = dict->states[state1].num_trans;
+        if (num_trans == 0) {
+            dict->states[state1].trans = hnj_malloc(sizeof(hjn_transition));
+        } else {
+            /*tex
+
+                The old version did:
+
+                \starttyping
+                } else if (!(num_trans & (num_trans - 1))) {
+                    ... = hnj_realloc(dict->states[state1].trans,
+                            (int) ((num_trans << 1) * sizeof(HyphenTrans)));
+                \stoptyping
+
+                but that is incredibly nasty when adding patters one-at-a-time. Controlled growth
+                would be nicer than the current +1, but if no one complains, and no one did in a
+                decade, this is good enough.
+
+            */
+            dict->states[state1].trans = hnj_realloc(dict->states[state1].trans, (int) ((num_trans + 1) * sizeof(hjn_transition)));
+        }
+        dict->states[state1].trans[num_trans].uni_ch = chr;
+        dict->states[state1].trans[num_trans].new_state = state2;
+        dict->states[state1].num_trans++;
+    } else {
+        tex_normal_error("hyphenation","a nul character is not permited");
+    }
+}
+
+/*tex
+
+    We did change the semantics a bit here: |hnj_hyphen_load| used to operate on a file, but now
+    the argument is a string buffer.
+
+*/
+
+/* define tex_isspace(c) (c == ' ' || c == '\t') */
+#  define tex_isspace(c) (c == ' ')
+
+static const unsigned char *next_pattern(size_t* length, const unsigned char** buf)
+{
+    const unsigned char *here, *rover = *buf;
+    while (*rover && tex_isspace(*rover)) {
+        rover++;
+    }
+    here = rover;
+    while (*rover) {
+        if (tex_isspace(*rover)) {
+            *length = (size_t) (rover - here);
+            *buf = rover;
+            return here;
+        } else {
+            rover++;
+        }
+    }
+    *length = (size_t) (rover - here);
+    *buf = rover;
+    return *length ? here : NULL;
+}
+
+static void init_hash(hjn_hashtable **h)
+{
+    if (! *h) {
+        *h = hnj_malloc(sizeof(hjn_hashtable));
+        for (int i = 0; i < HNJ_HASH_SIZE; i++) {
+            (*h)->entries[i] = NULL;
+        }
+    }
+}
+
+static void clear_state_hash(hjn_hashtable **h)
+{
+    if (*h) {
+        for (int i = 0; i < HNJ_HASH_SIZE; i++) {
+            hjn_hashentry *e, *next;
+            for (e = (*h)->entries[i]; e; e = next) {
+                next = e->next;
+                hnj_free(e->key);
+                hnj_free(e);
+            }
+        }
+        hnj_free(*h);
+        *h = NULL;
+    }
+}
+
+static void clear_pattern_hash(hjn_hashtable **h)
+{
+    if (*h) {
+        for (int i = 0; i < HNJ_HASH_SIZE; i++) {
+            hjn_hashentry *e, *next;
+            for (e = (*h)->entries[i]; e; e = next) {
+                next = e->next;
+                hnj_free(e->key);
+                if (e->u.hyppat) {
+                    hnj_free(e->u.hyppat);
+                }
+                hnj_free(e);
+            }
+        }
+        hnj_free(*h);
+        *h = NULL;
+    }
+}
+
+static void init_dictionary(hjn_dictionary *dict)
+{
+    dict->num_states = 1;
+    dict->pat_length = 0;
+    dict->states = hnj_malloc(sizeof(hjn_state));
+    dict->states[0] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL };
+    dict->patterns = NULL;
+    dict->merged = NULL;
+    dict->state_num = NULL;
+    init_hash(&dict->patterns);
+}
+
+static void clear_dictionary(hjn_dictionary *dict)
+{
+    for (int state_num = 0; state_num < dict->num_states; state_num++) {
+        hjn_state *hstate = &dict->states[state_num];
+        if (hstate->match) {
+            hnj_free(hstate->match);
+        }
+        if (hstate->trans) {
+            hnj_free(hstate->trans);
+        }
+    }
+    hnj_free(dict->states);
+    clear_pattern_hash(&dict->patterns);
+    clear_pattern_hash(&dict->merged);
+    clear_state_hash(&dict->state_num);
+}
+
+hjn_dictionary *hnj_dictionary_new(void)
+{
+    hjn_dictionary *dict = hnj_malloc(sizeof(hjn_dictionary));
+    init_dictionary(dict);
+    return dict;
+}
+
+void hnj_dictionary_clear(hjn_dictionary *dict)
+{
+    clear_dictionary(dict);
+    init_dictionary(dict);
+}
+
+void hnj_dictionary_free(hjn_dictionary *dict)
+{
+    clear_dictionary(dict);
+    hnj_free(dict);
+}
+
+unsigned char *hnj_dictionary_tostring(hjn_dictionary *dict)
+{
+    unsigned char *word;
+    char *pattern;
+    unsigned char *buf = hnj_malloc(dict->pat_length);
+    unsigned char *cur = buf;
+    hjn_hashiterator *v = new_hashiterator(dict->patterns);
+    while (eachhash(v, &word, &pattern)) {
+        int i = 0;
+        int e = 0;
+        while (word[e + i]) {
+            if (pattern[i] != '0') {
+                *cur++ = (unsigned char) pattern[i];
+            }
+            *cur++ = word[e + i++];
+            while (is_utf8_follow(word[e + i])) {
+                *cur++ = word[i + e++];
+            }
+        }
+        if (pattern[i] != '0') {
+            *cur++ = (unsigned char) pattern[i];
+        }
+        *cur++ = ' ';
+    }
+    delete_hashiterator(v);
+    *cur = 0;
+    return buf;
+}
+
+/*tex
+
+    In hyphenation patterns we use signed bytes where |0|, or actually any negative number,
+    indicates end:
+
+    \starttyping
+    prio(1+),startpos,length,len1,[replace],len2,[replace]
+    \starttyping
+
+    A basic example is:
+
+    \starttyping
+    p n 0 0 0
+    \starttyping
+
+    for a hyphenation point between characters.
+
+*/
+
+void hnj_dictionary_load(hjn_dictionary *dict, const unsigned char *f, int trace)
+{
+    int state_num, last_state;
+    int ch;
+    int found;
+    hjn_hashiterator *v;
+    unsigned char *word;
+    char *pattern;
+    size_t l = 0;
+    const unsigned char *format;
+    const unsigned char *begin = f;
+    while ((format = next_pattern(&l, &f)) != NULL) {
+        if (l > 0 && l < 255) {
+            int i, j, e1;
+            for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
+                if (format[i] >= '0' && format[i] <= '9') {
+                    j++;
+                }
+                if (is_utf8_follow(format[i])) {
+                    e1++;
+                }
+            }
+            /*tex
+                Here |l-e1| is the number of {\em characters} not {\em bytes}, |l-j| the number of
+                pattern bytes and |l-e1-j| the number of pattern characters.
+            */
+            {
+                unsigned char *pat = (unsigned char *) hnj_malloc((1 + (int) l - j));
+                char *org = (char *) hnj_malloc(2 + (int) l - e1 - j);
+                /*tex Remove hyphenation encoders (digits) from pat. */
+                org[0] = '0';
+                for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
+                    unsigned char c = format[i];
+                    if (is_utf8_follow(c)) {
+                        pat[j + e1++] = c;
+                    } else if (c < '0' || c > '9') {
+                        pat[e1 + j++] = c;
+                        org[j] = '0';
+                    } else {
+                        org[j] = (char) c;
+                    }
+                }
+                pat[e1 + j] = 0;
+                org[j + 1] = 0;
+                insert_pattern(dict->patterns, pat, org, trace);
+            }
+        } else {
+           tex_normal_warning("hyphenation", "a pattern of more than 254 bytes is ignored");
+        }
+    }
+    /*tex We add 2 bytes for spurious spaces. */
+    dict->pat_length += (int) ((f - begin) + 2);
+    init_hash(&dict->merged);
+    v = new_hashiterator(dict->patterns);
+    while (nexthash(v, &word)) {
+        int wordsize = (int) strlen((char *) word);
+        for (int l1 = 1; l1 <= wordsize; l1++) {
+            if (is_utf8_follow(word[l1])) {
+                /*tex Do not clip an utf8 sequence. */
+            } else {
+                for (int j1 = 1; j1 <= l1; j1++) {
+                    int i1 = l1 - j1;
+                    if (is_utf8_follow(word[i1])) {
+                        /*tex Do not start halfway an utf8 sequence. */
+                    } else {
+                        char *subpat_pat = lookup_pattern(dict->patterns, word + i1, j1);
+                        if (subpat_pat) {
+                            char *newpat_pat = lookup_pattern(dict->merged, word, l1);
+                            if (! newpat_pat) {
+                                char *neworg;
+                                unsigned char *newword = (unsigned char *) hnj_malloc((size_t) (l1 + 1));
+                                int e1 = 0;
+                                strncpy((char *) newword, (char *) word, (size_t) l1);
+                                newword[l1] = 0;
+                                for (i1 = 0; i1 < l1; i1++) {
+                                    if (is_utf8_follow(newword[i1])) {
+                                        e1++;
+                                    }
+                                }
+                                neworg = hnj_malloc((size_t) (l1 + 2 - e1));
+                                /*tex Fill with right amount of zeros: */
+                                sprintf(neworg, "%0*d", l1 + 1 - e1, 0);
+                                insert_pattern(dict->merged, newword, combine_patterns(neworg, subpat_pat), trace);
+                            } else {
+                                combine_patterns(newpat_pat, subpat_pat);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    delete_hashiterator(v);
+    init_hash(&dict->state_num);
+    state_insert(dict->state_num, hnj_strdup((const unsigned char *) ""), 0);
+    v = new_hashiterator(dict->merged);
+    while (nexthashstealpattern(v, &word, &pattern)) {
+        static unsigned char mask[] = { 0x3F, 0x1F, 0xF, 0x7 };
+        int j1 = (int) strlen((char *) word);
+        state_num = hnj_get_state(dict, word, &found);
+        dict->states[state_num].match = pattern;
+        /*tex Now, put in the prefix transitions. */
+        while (found < 0) {
+            j1--;
+            last_state = state_num;
+            ch = word[j1];
+            if (ch >= 0x80) { /* why not is_utf8_follow(ch) here */
+                int m;
+                int i1 = 1;
+                while (is_utf8_follow(word[j1 - i1])) {
+                    i1++;
+                }
+                ch = word[j1 - i1] & mask[i1];
+                m = j1 - i1;
+                while (i1--) {
+                    ch = (ch << 6) + (0x3F & word[j1 - i1]);
+                }
+                j1 = m;
+            }
+            word[j1] = '\0';
+            state_num = hnj_get_state(dict, word, &found);
+            hnj_add_trans(dict, state_num, last_state, ch);
+        }
+    }
+    delete_hashiterator(v);
+    clear_pattern_hash(&dict->merged);
+    /*tex Put in the fallback states. */
+    for (int i = 0; i < HNJ_HASH_SIZE; i++) {
+        for (hjn_hashentry *e = dict->state_num->entries[i]; e; e = e->next) {
+            /*tex Do not do |state == 0| otherwise things get confused. */
+            if (e->u.state) {
+                for (int j = 1; 1; j++) {
+                    state_num = state_lookup(dict->state_num, e->key + j);
+                    if (state_num >= 0) {
+                        break;
+                    }
+                }
+                dict->states[e->u.state].fallback_state = state_num;
+            }
+        }
+    }
+    clear_state_hash(&dict->state_num);
+}