1 files changed, 1774 insertions, 0 deletions
diff --git a/source/luametatex/source/tex/texlanguage.c b/source/luametatex/source/tex/texlanguage.c
new file mode 100644
index 000000000..6f3460c22
--- /dev/null
+++ b/source/luametatex/source/tex/texlanguage.c
@@ -0,0 +1,1774 @@
+/*
+    See license.txt in the root of this project.
+*/
+
+# include "luametatex.h"
+
+/*tex
+
+    We no longer dump the patterns and exceptions as they are supposed to be loaded at runtime.
+    There is no gain in getting them from the format. But we do dump some of the properties.
+
+    There were all kind of checks for simple characters i.e. not ligatures but there is no need for
+    that in \LUAMETATEX. We have separated stages and the hyphenator sees just glyphs. And when a
+    traditional font has glyphs we can assume that the old school font encoding matches the patterns
+    i.e. that ligatures are not in the normal character slots.
+
+    Exceptions are stored at the \LUA\ end. We cannot easily go dynamic because fonts are stored
+    in the eqtb so we would have to use some more indirect mechanism (doable as we do it for other
+    items) too.
+
+*/
+
+/*tex
+    The one and only language state. The |languages| array is indexed by language id and
+    |language_data| does the bookkeeping for that array: in this file |top| is the highest index
+    that we address and |ptr| is the id that was handed out last when ids are assigned
+    sequentially.
+*/
+
+language_state_info lmt_language_state = {
+    .languages        = NULL,
+    .language_data    = {
+        .minimum      = min_language_size,
+        .maximum      = max_language_size,
+        .size         = memory_data_unset,
+        .step         = stp_language_size,
+        .allocated    = 0,
+        .itemsize     = 1,
+        .top          = 0,
+        .ptr          = 0,
+        .initial      = memory_data_unset,
+        .offset       = 0,
+    },
+    .handler_table_id = 0,
+    .handler_count    = 0,
+};
+
+/*tex
+    We can enforce a language id but we want to be sequential so we accept holes! So one
+    has to define bottom-up. As with fonts, we have a zero language but that one normally
+    is not set.
+*/
+
+/*tex
+    Puts the (already allocated) language in slot |id| in its initial state: nothing is attached
+    yet and only the pre hyphen character has a value, a plain hyphen.
+*/
+
+static void tex_aux_reset_language(halfword id)
+{
+    tex_language *target = lmt_language_state.languages[id];
+    /*tex The identity: */
+    target->id = id;
+    /*tex The resources that can get attached later on: */
+    target->patterns = NULL;
+    target->hjcode_head = NULL;
+    target->exceptions = 0;
+    target->wordhandler = 0;
+    /*tex The hyphenation characters; the minimum starts out at |-1|. */
+    target->pre_hyphen_char = '-';
+    target->post_hyphen_char = 0;
+    target->pre_exhyphen_char = 0;
+    target->post_exhyphen_char = 0;
+    target->hyphenation_min = -1;
+}
+
+/*tex
+    This reserves a slot in the language array and returns its index. A value below zero will bump
+    the language id: we hand out the first free slot after |ptr| and grow the array by |step| when
+    we run out of slots. A value of zero or more asks for that very slot: it is an error when the
+    slot is already taken and the array grows when the slot lies beyond |top|. Because we have a
+    rather limited number of languages there is no configuration, size is just maximum. Going
+    beyond |maximum| or running out of memory results in an overflow error.
+*/
+
+static halfword tex_aux_new_language_id(halfword id)
+{
+    int top;
+    if (id >= 0) {
+        if (id <= lmt_language_state.language_data.top) {
+            if (lmt_language_state.languages[id]) {
+                return tex_formatted_error("languages", "the language with id %d is already created", id);
+            } else {
+                return id;
+            }
+        } else if (id > lmt_language_state.language_data.maximum) {
+            goto OVERFLOWERROR;
+        } else {
+            top = id;
+        }
+    } else {
+        /*tex
+            An explicitly requested id doesn't bump |ptr|, so we have to step over the slots that
+            are already in use, otherwise the caller would overwrite (and leak) an existing
+            language.
+        */
+        while (lmt_language_state.language_data.ptr < lmt_language_state.language_data.top) {
+            ++lmt_language_state.language_data.ptr;
+            if (! lmt_language_state.languages[lmt_language_state.language_data.ptr]) {
+                return lmt_language_state.language_data.ptr;
+            }
+        }
+        if (lmt_language_state.language_data.top >= lmt_language_state.language_data.maximum) {
+            goto OVERFLOWERROR;
+        } else if (lmt_language_state.language_data.top + lmt_language_state.language_data.step > lmt_language_state.language_data.maximum) {
+            top = lmt_language_state.language_data.maximum;
+        } else {
+            top = lmt_language_state.language_data.top + lmt_language_state.language_data.step;
+        }
+    }
+    /*tex Finally we can bump memory. */
+    {
+        tex_language **tmp = aux_reallocate_array(lmt_language_state.languages, sizeof(tex_language *), top, 0);
+        if (tmp) {
+            for (int i = lmt_language_state.language_data.top + 1; i <= top; i++) {
+                tmp[i] = NULL;
+            }
+            lmt_language_state.languages = tmp;
+            lmt_language_state.language_data.allocated += ((size_t) top - lmt_language_state.language_data.top) * sizeof(tex_language *);
+            lmt_language_state.language_data.top = top;
+            if (id >= 0) {
+                /*tex The caller asked for this specific slot, so that is what it gets. */
+                return id;
+            } else {
+                /*tex Here |ptr| equals the old |top| so the next slot is a fresh one. */
+                lmt_language_state.language_data.ptr += 1;
+                return lmt_language_state.language_data.ptr;
+            }
+        }
+    }
+  OVERFLOWERROR:
+    tex_overflow_error("languages", lmt_language_state.language_data.maximum);
+    return 0;
+}
+
+/*tex
+    Allocates the initial language array of |minimum| slots, all of them empty, and registers
+    its size. An overflow error is reported when the allocation fails.
+*/
+
+void tex_initialize_languages(void)
+{
+    tex_language **slots = aux_allocate_clear_array(sizeof(tex_language *), lmt_language_state.language_data.minimum, 0);
+    int slot = 0;
+    if (! slots) {
+        tex_overflow_error("languages", lmt_language_state.language_data.minimum);
+        return;
+    }
+    while (slot < lmt_language_state.language_data.minimum) {
+        slots[slot++] = NULL;
+    }
+    lmt_language_state.languages = slots;
+    lmt_language_state.language_data.top = lmt_language_state.language_data.minimum;
+    lmt_language_state.language_data.allocated += lmt_language_state.language_data.minimum * sizeof(tex_language *);
+}
+
+/*
+halfword tex_aux_maximum_language_id(void)
+{
+    return language_state.language_data.maximum;
+}
+*/
+
+/*tex
+    Language zero is always valid, any other id is valid when it lies within the array and its
+    slot is filled. The result is 1 or 0.
+*/
+
+int tex_is_valid_language(halfword n)
+{
+    if (n == 0) {
+        return 1;
+    }
+    if (n < 0 || n > lmt_language_state.language_data.top) {
+        return 0;
+    }
+    return lmt_language_state.languages[n] != NULL;
+}
+
+/*tex
+    Creates a language: a slot is reserved (see |tex_aux_new_language_id| for the meaning of |n|),
+    the record is allocated and reset and, when |saving_hyph_codes_par| is set, the hj codes are
+    initialized from the lc codes. We return |NULL| when no slot or no memory is available.
+*/
+
+tex_language *tex_new_language(halfword n)
+{
+    tex_language *created = NULL;
+    halfword id = tex_aux_new_language_id(n);
+    if (id < 0) {
+        return NULL;
+    }
+    created = lmt_memory_malloc(sizeof(struct tex_language));
+    if (! created) {
+        tex_overflow_error("language", sizeof(struct tex_language));
+        return NULL;
+    }
+    lmt_language_state.languages[id] = created;
+    lmt_language_state.language_data.allocated += sizeof(struct tex_language);
+    tex_aux_reset_language(id);
+    /*tex
+        For now, we might just use specific value for whatever task. This will become
+        obsolete.
+    */
+    if (saving_hyph_codes_par) {
+        tex_hj_codes_from_lc_codes(id);
+    }
+    return created;
+}
+
+/*tex
+    Returns the language with id |n|. A language that doesn't exist yet is created on the fly as
+    long as |n| doesn't exceed |maximum|; a negative or too large |n| gives |NULL|.
+*/
+
+tex_language *tex_get_language(halfword n)
+{
+    if (n < 0) {
+        return NULL;
+    }
+    if (n <= lmt_language_state.language_data.top) {
+        tex_language *found = lmt_language_state.languages[n];
+        if (found) {
+            return found;
+        }
+    }
+    return (n <= lmt_language_state.language_data.maximum) ? tex_new_language(n) : NULL;
+}
+
+/*tex
+    Freeing, dumping, undumping languages:
+*/
+
+/*
+void free_languages(void)
+{
+    for (int i = 0; i < language_state.language_data.top; i++) {
+        if (language_state.languages[i]) {
+            lmt_memory_free(language_state.languages[i]);
+            language_state.languages[i] = NULL;
+        }
+    }
+}
+*/
+
+/*tex
+    Writes |top| and |ptr| and then, for every slot below |top|, either a zero (empty slot) or a
+    one followed by the id, the hyphen characters, the hyphenation minimum and the hj codes.
+    Patterns and exceptions are not dumped.
+*/
+
+void tex_dump_language_data(dumpstream f)
+{
+    dump_int(f, lmt_language_state.language_data.top);
+    dump_int(f, lmt_language_state.language_data.ptr);
+    /*tex With a |top| of zero (or less) the loop body is simply never entered. */
+    for (int id = 0; id < lmt_language_state.language_data.top; id++) {
+        tex_language *lang = lmt_language_state.languages[id];
+        if (! lang) {
+            dump_via_int(f, 0);
+            continue;
+        }
+        dump_via_int(f, 1);
+        dump_int(f, lang->id);
+        dump_int(f, lang->pre_hyphen_char);
+        dump_int(f, lang->post_hyphen_char);
+        dump_int(f, lang->pre_exhyphen_char);
+        dump_int(f, lang->post_exhyphen_char);
+        dump_int(f, lang->hyphenation_min);
+        tex_dump_language_hj_codes(f, id);
+    }
+}
+
+/*tex
+    The counterpart of |tex_dump_language_data|: it reads |top| and |ptr|, allocates an array of
+    |top| slots and fills the slots that were dumped. Patterns, exceptions, word handler and the
+    hj code head start out empty. When no languages were dumped we fall back on a regular
+    initialization.
+*/
+
+void tex_undump_language_data(dumpstream f)
+{
+    int top, ptr;
+    tex_language **slots = NULL;
+    undump_int(f, top);
+    undump_int(f, ptr);
+    if (top <= 0) {
+        /*tex Indeed we can have no languages stored. */
+        tex_initialize_languages();
+        return;
+    }
+    slots = aux_allocate_clear_array(sizeof(tex_language *), top, 0);
+    if (! slots) {
+        tex_overflow_error("languages", top);
+        lmt_language_state.language_data.initial = 0;
+        return;
+    }
+    lmt_language_state.language_data.top = top;
+    lmt_language_state.language_data.ptr = ptr;
+    lmt_language_state.languages = slots;
+    for (int id = 0; id < top; id++) {
+        int present;
+        tex_language *lang = NULL;
+        undump_int(f, present);
+        if (present != 1) {
+            slots[id] = NULL;
+            continue;
+        }
+        lang = lmt_memory_malloc(sizeof(struct tex_language));
+        /*tex The slot is filled before we read the fields; a failure leaves |NULL| in it. */
+        slots[id] = lang;
+        if (! lang) {
+            tex_overflow_error("languages", id);
+            continue;
+        }
+        lmt_language_state.language_data.allocated += sizeof(struct tex_language);
+        lang->exceptions = 0;
+        lang->patterns = NULL;
+        lang->wordhandler = 0;
+        lang->hjcode_head = NULL;
+        undump_int(f, lang->id);
+        undump_int(f, lang->pre_hyphen_char);
+        undump_int(f, lang->post_hyphen_char);
+        undump_int(f, lang->pre_exhyphen_char);
+        undump_int(f, lang->post_exhyphen_char);
+        undump_int(f, lang->hyphenation_min);
+        tex_undump_language_hj_codes(f, id);
+        /*tex The slot number wins when the stored id disagrees with it. */
+        if (lang->id != id) {
+            tex_formatted_warning("languages", "undumped language id mismatch: %d <> %d", lang->id, id);
+            lang->id = id;
+        }
+    }
+    lmt_language_state.language_data.initial = lmt_language_state.language_data.ptr;
+}
+
+/*tex All kinds of accessors. */
+
+/*tex
+    Sets the pre hyphen character of language |n|, which gets created when needed. Nothing
+    happens when there is no such language.
+*/
+
+void tex_set_pre_hyphen_char(halfword n, halfword v)
+{
+    tex_language *lang = tex_get_language(n);
+    if (! lang) {
+        return;
+    }
+    lang->pre_hyphen_char = v;
+}
+
+/*tex
+    Sets the post hyphen character of language |n|, which gets created when needed. Nothing
+    happens when there is no such language.
+*/
+
+void tex_set_post_hyphen_char(halfword n, halfword v)
+{
+    tex_language *lang = tex_get_language(n);
+    if (! lang) {
+        return;
+    }
+    lang->post_hyphen_char = v;
+}
+
+/*tex
+    Sets the pre character for explicit hyphens of language |n|, which gets created when needed.
+    Nothing happens when there is no such language.
+*/
+
+void tex_set_pre_exhyphen_char(halfword n, halfword v)
+{
+    tex_language *lang = tex_get_language(n);
+    if (! lang) {
+        return;
+    }
+    lang->pre_exhyphen_char = v;
+}
+
+/*tex
+    Sets the post character for explicit hyphens of language |n|, which gets created when needed.
+    Nothing happens when there is no such language.
+*/
+
+void tex_set_post_exhyphen_char(halfword n, halfword v)
+{
+    tex_language *lang = tex_get_language(n);
+    if (! lang) {
+        return;
+    }
+    lang->post_exhyphen_char = v;
+}
+
+/*tex
+    Gives the pre hyphen character of language |n| or |-1| when there is no such language. Watch
+    out: the lookup creates the language when it doesn't exist yet.
+*/
+
+halfword tex_get_pre_hyphen_char(halfword n)
+{
+    tex_language *lang = tex_get_language(n);
+    if (lang) {
+        return lang->pre_hyphen_char;
+    }
+    return -1;
+}
+
+/*tex
+    Gives the post hyphen character of language |n| or |-1| when there is no such language. Watch
+    out: the lookup creates the language when it doesn't exist yet.
+*/
+
+halfword tex_get_post_hyphen_char(halfword n)
+{
+    tex_language *lang = tex_get_language(n);
+    if (lang) {
+        return lang->post_hyphen_char;
+    }
+    return -1;
+}
+
+/*tex
+    Gives the pre character for explicit hyphens of language |n| or |-1| when there is no such
+    language. Watch out: the lookup creates the language when it doesn't exist yet.
+*/
+
+halfword tex_get_pre_exhyphen_char(halfword n)
+{
+    tex_language *lang = tex_get_language(n);
+    if (lang) {
+        return lang->pre_exhyphen_char;
+    }
+    return -1;
+}
+
+/*tex
+    Gives the post character for explicit hyphens of language |n| or |-1| when there is no such
+    language. Watch out: the lookup creates the language when it doesn't exist yet.
+*/
+
+halfword tex_get_post_exhyphen_char(halfword n)
+{
+    tex_language *lang = tex_get_language(n);
+    if (lang) {
+        return (int) lang->post_exhyphen_char;
+    }
+    return -1;
+}
+
+/*tex
+    Sets the hyphenation minimum of language |n|, which gets created when needed. Nothing happens
+    when there is no such language.
+*/
+
+void tex_set_hyphenation_min(halfword n, halfword v)
+{
+    tex_language *lang = tex_get_language(n);
+    if (! lang) {
+        return;
+    }
+    lang->hyphenation_min = v;
+}
+
+/*tex
+    Gives the hyphenation minimum of language |n| or |-1| when there is no such language. Watch
+    out: the lookup creates the language when it doesn't exist yet.
+*/
+
+halfword tex_get_hyphenation_min(halfword n)
+{
+    tex_language *lang = tex_get_language(n);
+    if (lang) {
+        return lang->hyphenation_min;
+    }
+    return -1;
+}
+
+/*tex
+    Adds the patterns in |buff| to the dictionary of |lang|; the dictionary is created on first
+    use. A missing language or a missing or empty buffer is silently ignored.
+*/
+
+void tex_load_patterns(struct tex_language *lang, const unsigned char *buff)
+{
+    if (lang && buff && *buff != '\0') {
+        if (! lang->patterns) {
+            lang->patterns = hnj_dictionary_new();
+        }
+        hnj_dictionary_load(lang->patterns, buff, tracing_hyphenation_par > 0);
+    }
+}
+
+/*tex Clears the pattern dictionary of |lang|, if there is one. */
+
+void tex_clear_patterns(struct tex_language *lang)
+{
+    if (! lang || ! lang->patterns) {
+        return;
+    }
+    hnj_dictionary_clear(lang->patterns);
+}
+
+/*tex
+    The \TEX\ end: the token list |head| is turned into a string and fed as patterns to language
+    |curlang|. Nothing happens when there is no string.
+*/
+
+void tex_load_tex_patterns(halfword curlang, halfword head)
+{
+    char *patterns = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0);
+    if (! patterns) {
+        return;
+    }
+    tex_load_patterns(tex_get_language(curlang), (unsigned char *) patterns);
+}
+
+/*
+    This cleans one word which is returned in |cleaned|, returns the new offset into |buffer|.
+*/
+
+/* define tex_isspace(c) (c == ' ' || c == '\t') */
+/*tex Only a plain space separates words; the argument is parenthesized so that expressions are safe. */
+#  define tex_isspace(c) ((c) == ' ')
+
+/*tex
+    This cleans the single word that |buff| starts with; the word runs up to the first space or
+    the end of the string. Hyphens are dropped, a |=| becomes the (hj coded) hyphen and of a
+    |{pre}{post}{replace}| only the replace text ends up in the result; an |(alternative)| and a
+    |[digit]| that follow it are skipped. Characters are mapped onto their hj code when that code
+    is larger than 32.
+
+    The result is a freshly allocated string in |*cleaned| that the caller has to free. It is
+    always |NULL| when the word is invalid. The return value points just after the last byte that
+    we consumed from |buff|.
+*/
+
+const char *tex_clean_hyphenation(halfword id, const char *buff, char **cleaned)
+{
+    int items = 0;
+    /*tex Work buffer for the input bytes: */
+    unsigned char word[max_size_of_word + 1];
+    /*tex Work buffer for \UNICODE: */
+    unsigned uword[max_size_of_word + 1] = { 0 };
+    /*tex
+        Work buffer for the result. A hj code can take more \UTF-8\ bytes than the character that
+        it replaces, so we don't write the result over the input bytes but reserve four bytes per
+        character (this assumes that |aux_uni2string| never emits more than that).
+    */
+    char result[4 * (max_size_of_word) + 1];
+    int i = 0;
+    char *uindex = result;
+    const char *s = buff;
+    /*tex This way the caller never sees a stale pointer, whatever error path we take. */
+    *cleaned = NULL;
+    while (*s && ! tex_isspace((unsigned char)*s)) {
+        word[i++] = (unsigned char) *s;
+        s++;
+        if ((s-buff) > max_size_of_word) {
+            /*tex Todo: this is too strict, should count \UNICODE, not bytes. */
+            tex_handle_error(
+                normal_error_type,
+                "Exception too long",
+                NULL
+            );
+            return s;
+        }
+    }
+    /*tex Now convert the input to \UNICODE. */
+    word[i] = '\0';
+    aux_splitutf2uni(uword, (const char *)word);
+    /*tex
+        Build the new word string. The hjcode values < 32 indicate a length, so that for instance
+        a hjcode of 2 for a ligature makes that ligature count okay.
+    */
+    i = 0;
+    while (uword[i] > 0) {
+        int u = uword[i++];
+        if (u == '-') {
+            /*tex Skip. */
+        } else if (u == '=') {
+            unsigned c = tex_get_hj_code(id, '-');
+            uindex = aux_uni2string(uindex, (! c || c <= 32) ? '-' : c);
+        } else if (u == '{') {
+            /*tex We skip the pre and post part and only keep the replace part. */
+            u = uword[i++];
+            items = 0;
+            while (u && u != '}') {
+                u = uword[i++];
+            }
+            if (u == '}') {
+                items++;
+                u = uword[i++];
+            }
+            while (u && u != '}') {
+                u = uword[i++];
+            }
+            if (u == '}') {
+                items++;
+                u = uword[i++];
+            }
+            if (u == '{') {
+                u = uword[i++];
+            }
+            while (u && u != '}') {
+                unsigned c = tex_get_hj_code(id, u);
+                uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
+                u = uword[i++];
+            }
+            if (u == '}') {
+                items++;
+            }
+            if (items != 3) {
+                /* hm, we intercept that elsewhere in a better way so why here? Best remove the test here or move the other one here. */
+                tex_handle_error(
+                    normal_error_type,
+                    "Exception syntax error, a discretionary has three components: {}{}{}.",
+                    NULL
+                );
+                return s;
+            } else {
+                /* skip replacement (chars) */
+                if (uword[i] == '(') {
+                    while (uword[++i] && uword[i] != ')') { };
+                    if (uword[i] != ')') {
+                        tex_handle_error(
+                            normal_error_type,
+                            "Exception syntax error, an alternative replacement is defined as (text).",
+                            NULL
+                        );
+                        return s;
+                    } else if (uword[i]) {
+                        i++;
+                    }
+                }
+                /* skip penalty: [digit] but we intercept multiple digits */
+                if (uword[i] == '[') {
+                    if (uword[i+1] && uword[i+1] >= '0' && uword[i+1] <= '9' && uword[i+2] && uword[i+2] == ']') {
+                        i += 3;
+                    } else {
+                        tex_handle_error(
+                            normal_error_type,
+                            "Exception syntax error, a penalty is defined as [digit].",
+                            NULL
+                        );
+                        return s;
+                    }
+                }
+            }
+        } else {
+            unsigned c = tex_get_hj_code(id, u);
+            uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
+        }
+    }
+    *uindex = '\0';
+    *cleaned = lmt_memory_strdup(result);
+    return s;
+}
+
+/*tex
+    Adds the space separated exceptions in |buff| to language |lang|. The exceptions live in a
+    \LUA\ table in the registry (created on first use) that maps the cleaned word onto the
+    exception as it was given. Words that can't be cleaned are skipped. A missing language or
+    buffer is ignored.
+*/
+
+void tex_load_hyphenation(struct tex_language *lang, const unsigned char *buff)
+{
+    if (lang && buff) {
+        lua_State *L = lmt_lua_state.lua_instance;
+        const char *s = (const char *) buff;
+        int id = lang->id;
+        if (lang->exceptions == 0) {
+            lua_newtable(L);
+            lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
+        }
+        lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
+        while (*s) {
+            while (tex_isspace((unsigned char) *s)) {
+                s++;
+            }
+            if (*s) {
+                const char *value = s;
+                /*tex We start afresh for each word so that a freed pointer is never seen again. */
+                char *cleaned = NULL;
+                s = tex_clean_hyphenation(id, s, &cleaned);
+                if (cleaned) {
+                    size_t len = s - value;
+                    if (len > 0) {
+                        lua_pushstring(L, cleaned);
+                        lua_pushlstring(L, value, len);
+                        lua_rawset(L, -3);
+                    }
+                    lmt_memory_free(cleaned);
+                } else {
+                    /* tex_formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value); */
+                }
+            }
+        }
+        /*tex Drop the exceptions table that we pushed, otherwise the stack grows with each call. */
+        lua_pop(L, 1);
+    }
+}
+
+/*tex Releases the registry reference to the exceptions table of |lang|, if there is one. */
+
+void tex_clear_hyphenation(struct tex_language *lang)
+{
+    if (! lang || lang->exceptions == 0) {
+        return;
+    }
+    luaL_unref(lmt_lua_state.lua_instance, LUA_REGISTRYINDEX, lang->exceptions);
+    lang->exceptions = 0;
+}
+
+/*tex
+    The \TEX\ end: the token list |head| is turned into a string and fed as exceptions to
+    language |curlang|. Nothing happens when there is no string.
+*/
+
+void tex_load_tex_hyphenation(halfword curlang, halfword head)
+{
+    char *exceptions = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0);
+    if (! exceptions) {
+        return;
+    }
+    tex_load_hyphenation(tex_get_language(curlang), (unsigned char *) exceptions);
+}
+
+/*tex
+    Creates a discretionary with the given |subtype| and |penalty|, hooks it into the list at |t|
+    and stores |pre|, |post| and |replace| in it. When |t| itself is the replacement, the disc
+    takes the place of |t| in the list, otherwise it goes right after |t|. The attribute list of
+    |t| is attached when there is one. We return the new disc.
+*/
+
+static halfword tex_aux_insert_discretionary(halfword t, halfword pre, halfword post, halfword replace, quarterword subtype, int penalty)
+{
+    /*tex For compound words following explicit hyphens we take the current font. */
+    halfword disc = tex_new_disc_node(subtype);
+    halfword attr = node_attr(t);
+    disc_penalty(disc) = penalty;
+    /*tex In both cases the disc takes over whatever followed |t|. */
+    tex_try_couple_nodes(disc, node_next(t));
+    if (t != replace) {
+        /*tex We have |prev disc next|. */
+        tex_couple_nodes(t, disc);
+    } else {
+        /*tex We have |prev disc next-next|: |t| is taken out of the list. */
+        tex_try_couple_nodes(node_prev(t), disc);
+        node_prev(t) = null;
+        node_next(t) = null;
+    }
+    if (attr) {
+        tex_attach_attribute_list_attribute(disc, attr);
+    }
+    tex_set_disc_field(disc, pre_break_code, pre);
+    tex_set_disc_field(disc, post_break_code, post);
+    tex_set_disc_field(disc, no_break_code, replace);
+    return disc;
+}
+
+/*tex
+    Inserts a syllable discretionary with |hyphen_penalty_par| as penalty right after glyph |t|.
+    The pre and post part get a glyph in the font of |t| when the language has a positive pre or
+    post hyphen character. We return the new disc.
+*/
+
+static halfword tex_aux_insert_syllable_discretionary(halfword t, lang_variables *lan)
+{
+    halfword disc = tex_new_disc_node(syllable_discretionary_code);
+    disc_penalty(disc) = hyphen_penalty_par;
+    /*tex First we hook the disc in between |t| and its successor. */
+    tex_couple_nodes(disc, node_next(t));
+    tex_couple_nodes(t, disc);
+    tex_attach_attribute_list_attribute(disc, get_attribute_list(t));
+    /*tex Then we add the optional hyphen glyphs. */
+    if (lan->pre_hyphen_char > 0) {
+        tex_set_disc_field(disc, pre_break_code, tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->pre_hyphen_char, t));
+    }
+    if (lan->post_hyphen_char > 0) {
+        tex_set_disc_field(disc, post_break_code, tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->post_hyphen_char, t));
+    }
+    return disc;
+}
+
+/*tex
+    Turns glyph |t| (with character |chr|) into an automatic discretionary that has |t| as its
+    replacement. For the explicit hyphen character the pre and post characters come from language
+    |clang| (with the explicit hyphen itself as fallback for pre); any other character ends up as
+    pre only.
+*/
+
+static halfword tex_aux_compound_word_break(halfword t, halfword clang, halfword chr)
+{
+    /* we need a flag : use pre/post cf language spec */
+    halfword prechar = chr;
+    halfword postchar = null;
+    halfword pre, post;
+    if (chr == ex_hyphen_char_par) {
+        halfword lang_pre = tex_get_pre_exhyphen_char(clang);
+        halfword lang_post = tex_get_post_exhyphen_char(clang);
+        prechar = lang_pre > 0 ? lang_pre : ex_hyphen_char_par;
+        if (lang_post > 0) {
+            postchar = lang_post;
+        }
+    }
+    pre = null;
+    if (prechar > 0) {
+        pre = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), prechar, t);
+    }
+    post = null;
+    if (postchar > 0) {
+        post = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), postchar, t);
+    }
+    return tex_aux_insert_discretionary(t, pre, post, t, automatic_discretionary_code, tex_automatic_disc_penalty(glyph_hyphenate(t)));
+}
+
+/*tex
+    Looks up word |w| in the exceptions table that registry reference |exceptions| points to. We
+    return a fresh copy of the exception string, to be freed by the caller, or |NULL| when there
+    is no table or no string for this word. The \LUA\ stack is left as we found it.
+*/
+
+static char *tex_aux_hyphenation_exception(int exceptions, char *w)
+{
+    lua_State *L = lmt_lua_state.lua_instance;
+    char *found = NULL;
+    if (lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions) != LUA_TTABLE) {
+        lua_pop(L, 1);
+        return NULL;
+    }
+    /*tex Word table: */
+    lua_pushstring(L, w);
+    lua_rawget(L, -2);
+    if (lua_type(L, -1) == LUA_TSTRING) {
+        found = lmt_memory_strdup(lua_tostring(L, -1));
+    }
+    /*tex Both the value and the table go. */
+    lua_pop(L, 2);
+    return found;
+}
+
+/*tex Kept as reference: */
+
+/*
+char *get_exception_strings(struct tex_language *lang)
+{
+    char *ret = NULL;
+    if (lang && lang->exceptions) {
+        lua_State *L = lua_state.lua_instance;
+        if (lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions) == LUA_TTABLE) {
+            size_t size = 0;
+            size_t current = 0;
+            lua_pushnil(L);
+            while (lua_next(L, -2)) {
+                size_t l = 0;
+                const char *value = lua_tolstring(L, -1, &l);
+                if (current + l + 2 > size) {
+                    size_t new = (size + size/5) + current + l + 1024;
+                    char *tmp = lmt_memory_realloc(ret, new);
+                    if (tmp) {
+                        ret = tmp;
+                        size = new;
+                    } else {
+                        overflow_error("exceptions", (int) size);
+                    }
+                }
+                if (ret) {
+                    ret[current] = ' ';
+                    strcpy(&ret[current + 1], value);
+                    current += l + 1;
+                }
+                lua_pop(L, 1);
+            }
+        }
+    }
+    return ret;
+}
+*/
+
+/*tex
+
+    The sequence from |wordstart| to |r| can contain only normal characters. It could be faster to
+    modify a halfword pointer and return an integer.
+
+*/
+
+/*tex The zero width characters; in exception parts |zwj| and |zwnj| control ligatures and kerns. */
+# define zws  0x200B /* zero width space makes no sense */
+# define zwnj 0x200C
+# define zwj  0x200D
+
+/*tex
+    Builds a glyph list (in the font of |parent|) from the delimited part that starts at
+    |uword[*j + 1]| and runs up to |final|. On return |*j| is the index of the closing delimiter.
+    Once the list has a first glyph, a |zwj| blocks the ligature between its neighbours and a
+    |zwnj| blocks both the ligature and the kern. We return the head of the list or |null| when
+    the part is empty.
+*/
+
+static halfword tex_aux_find_exception_part(unsigned int *j, unsigned int *uword, int len, halfword parent, char final)
+{
+    halfword head = null;
+    halfword tail = null;
+    int noligature = 0;
+    int nokerning = 0;
+    /*tex This puts |uword[pos]| on the opening delimiter. */
+    unsigned pos = *j + 1;
+    while (pos < (unsigned) len && uword[pos + 1] != (unsigned int) final) {
+        unsigned chr = uword[pos + 1];
+        if (! tail) {
+            /*tex The first character always becomes a glyph. */
+            head = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) chr, parent); /* todo: data */
+            tail = head;
+        } else if (chr == zwj) {
+            noligature = 1;
+            nokerning = 0;
+        } else if (chr == zwnj) {
+            noligature = 1;
+            nokerning = 1;
+        } else {
+            halfword glyph = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) chr, parent); /* todo: data */
+            tex_couple_nodes(tail, glyph);
+            if (noligature) {
+                tex_add_glyph_option(tail, glyph_option_no_right_ligature);
+                tex_add_glyph_option(glyph, glyph_option_no_left_ligature);
+                noligature = 0;
+            }
+            if (nokerning) {
+                tex_add_glyph_option(tail, glyph_option_no_right_kern);
+                tex_add_glyph_option(glyph, glyph_option_no_left_kern);
+                nokerning = 0;
+            }
+            tail = node_next(tail);
+        }
+        pos++;
+    }
+    *j = pos + 1;
+    return head;
+}
+
+/*tex
+    Counts the characters in the braced part that starts at |uword[*j + 1]|. On return |*j| is the
+    index of the closing brace (or lies beyond |len| when there is none).
+*/
+
+static int tex_aux_count_exception_part(unsigned int *j, unsigned int *uword, int len)
+{
+    int count = 0;
+    /*tex This puts |uword[pos]| on the |{|. */
+    unsigned pos = *j + 1;
+    for (; pos < (unsigned) len && uword[pos + 1] != '}'; pos++) {
+        count++;
+    }
+    *j = pos + 1;
+    return count;
+}
+
+/*tex Reports that the |part| (pre, post or replace) of an exception discretionary is invalid. */
+
+static void tex_aux_show_exception_error(const char *part)
+{
+    const char *help =
+        "Exception discretionaries should contain three pairs of braced items.\n"
+        "No intervening spaces are allowed.";
+    tex_handle_error(normal_error_type, "Invalid %s part in exception", part, help);
+}
+
+/*tex
+
+    The exceptions are taken as-is: no min values are taken into account. One can add normal
+    patterns on-the-fly if needed.
+
+*/
+
+/*tex
+    This applies the exception |replacement| to the run of nodes that starts at |wordstart| and
+    ends before |r|. We walk over the pattern and the node list in parallel: a |-| inserts a
+    syllable discretionary after the current node, a |=| and any other character just move on to
+    the next node, and a |{pre}{post}{replace}|, optionally followed by |(alternative)| and
+    |[digit]|, inserts a discretionary that takes as many nodes from the list as |replace| has
+    characters. The pattern index |i| is one behind: we look at |uword[i + 1]|.
+*/
+
+static void tex_aux_do_exception(halfword wordstart, halfword r, char *replacement)
+{
+    halfword t = wordstart;
+    lang_variables langdata;
+    unsigned uword[max_size_of_word + 1] = { 0 };
+    unsigned len = aux_splitutf2uni(uword, replacement);
+    int clang = get_glyph_language(wordstart);
+    langdata.pre_hyphen_char = tex_get_pre_hyphen_char(clang);
+    langdata.post_hyphen_char = tex_get_post_hyphen_char(clang);
+    for (unsigned i = 0; i < len; i++) {
+        if (uword[i + 1] == 0 ) {
+            /*tex We ran out of the exception pattern. */
+            break;
+        } else if (uword[i + 1] == '-') {
+            /*tex A hyphen follows. */
+            if (node_next(t) == r) {
+                break;
+            } else {
+                tex_aux_insert_syllable_discretionary(t, &langdata);
+                /*tex Skip the new disc */
+                t = node_next(t);
+            }
+        } else if (uword[i + 1] == '=') {
+            /*tex We skip a disc. */
+            t = node_next(t);
+        } else if (uword[i + 1] == '{') {
+            /*tex We ran into an exception |{}{}{}| or |{}{}{}[]|. */
+            halfword pre = null;
+            halfword post = null;
+            halfword replace = null;
+            int count = 0;
+            int alternative = null;
+            halfword penalty;
+            /*tex |pre| */
+            pre = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
+            if (i == len || uword[i + 1] != '{') {
+                tex_aux_show_exception_error("pre");
+            }
+            /*tex |post| */
+            post = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
+            if (i == len || uword[i + 1] != '{') {
+                tex_aux_show_exception_error("post");
+            }
+            /*tex |replace| */
+            count = tex_aux_count_exception_part(&i, uword, (int) len);
+            if (i == len) {
+                tex_aux_show_exception_error("replace");
+            } else if (uword[i] && uword[i + 1] == '(') {
+                alternative = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, ')');;
+            }
+            /*tex Play safe. */
+            if (node_next(t) == r) {
+                /*tex
+                    The parts that we just built will not end up in a discretionary, so we have
+                    to get rid of them here, otherwise these nodes are lost.
+                */
+                if (pre) {
+                    tex_flush_node_list(pre);
+                }
+                if (post) {
+                    tex_flush_node_list(post);
+                }
+                if (alternative) {
+                    tex_flush_node_list(alternative);
+                }
+                break;
+            } else {
+                /*tex Let's deal with an (optional) replacement. */
+                if (count > 0) {
+                    /*tex Assemble the replace stream. */
+                    halfword q = t;
+                    replace = node_next(q);
+                    while (count > 0 && q) {
+                        halfword type = node_type(q);
+                        q = node_next(q);
+                        if (type == glyph_node || type == disc_node) {
+                            count--;
+                        } else {
+                            break ;
+                        }
+                    }
+                    /*tex Remove it from the main stream */
+                    tex_try_couple_nodes(t, node_next(q));
+                    /*tex and finish it in the replace. */
+                    node_next(q) = null;
+                    if (alternative) {
+                        tex_flush_node_list(replace);
+                        replace = alternative;
+                    } else {
+                        /*tex Sanitize the replace stream (we could use the flattener instead). */
+                        q = replace ;
+                        while (q) {
+                            halfword n = node_next(q);
+                            if (node_type(q) == disc_node) {
+                                /*tex Beware: the replacement starts after the no_break pointer. */
+                                halfword nb = disc_no_break_head(q);
+                                disc_no_break_head(q) = null;
+                                node_prev(nb) = null ; /* used at all? */
+                                /*tex Insert the replacement glyph. */
+                                if (q == replace) {
+                                    replace = nb;
+                                } else {
+                                    tex_try_couple_nodes(node_prev(q), nb);
+                                }
+                                /*tex Append the glyph (one). */
+                                tex_try_couple_nodes(nb, n);
+                                /*tex Flush the disc. */
+                                tex_flush_node(q);
+                            }
+                            q = n ;
+                        }
+                    }
+                } else if (alternative) {
+                    /*tex There is nothing to replace, so the alternative is not used either. */
+                    tex_flush_node_list(alternative);
+                }
+                /*tex Let's check if we have a penalty spec. If we have more then we're toast, we just ignore them. */
+                if (uword[i] && uword[i + 1] == '[') {
+                    i += 2;
+                    if (uword[i] && uword[i] >= '0' && uword[i] <= '9') {
+                        if (exception_penalty_par > 0) {
+                            if (exception_penalty_par > infinite_penalty) {
+                                penalty = exception_penalty_par;
+                            } else {
+                                penalty = (uword[i] - '0') * exception_penalty_par ;
+                            }
+                        } else {
+                            penalty = hyphen_penalty_par;
+                        }
+                        ++i;
+                        while (uword[i] && uword[i] != ']') {
+                            ++i;
+                        }
+                    } else {
+                        penalty = hyphen_penalty_par;
+                    }
+                } else {
+                    penalty = hyphen_penalty_par;
+                }
+                /*tex And now we insert a disc node (this was |syllable_discretionary_code|). */
+                t = tex_aux_insert_discretionary(t, pre, post, replace, normal_discretionary_code, penalty);
+                /*tex We skip the new disc node. */
+                t = node_next(t);
+                /*tex
+                    We need to check if we have two discretionaries in a row, test case: |\hyphenation
+                    {a{>}{<}{b}{>}{<}{c}de} \hsize 1pt abcde \par| which gives |a> <> <de|.
+                */
+                if (uword[i] && uword[i + 1] == '{') {
+                    i--;
+                    t = node_prev(t); /*tex Tricky! */
+                }
+            }
+        } else {
+            t = node_next(t);
+        }
+        /*tex Again we play safe. */
+        if (! t || node_next(t) == r) {
+            break;
+        }
+    }
+}
+
+/*tex
+
+    The following description is no longer valid for \LUATEX. Although we use the same algorithm
+    for hyphenation, it is not integrated in the par builder. Instead it is a separate run over
+    the node list, preceding the line-breaking routine, possibly replaced by a callback. We keep
+    the description here because the principles remain.
+
+    \startnarrower
+
+    When the line-breaking routine is unable to find a feasible sequence of breakpoints, it makes
+    a second pass over the paragraph, attempting to hyphenate the hyphenatable words. The goal of
+    hyphenation is to insert discretionary material into the paragraph so that there are more
+    potential places to break.
+
+    The general rules for hyphenation are somewhat complex and technical, because we want to be
+    able to hyphenate words that are preceded or followed by punctuation marks, and because we
+    want the rules to work for languages other than English. We also must contend with the fact
+    that hyphens might radically alter the ligature and kerning structure of a word.
+
+    A sequence of characters will be considered for hyphenation only if it belongs to a \quotation
+    {potentially hyphenatable part} of the current paragraph. This is a sequence of nodes $p_0p_1
+    \ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are either character or ligature
+    or whatsit or implicit kern nodes, and $p_m$ is a glue or penalty or insertion or adjust or
+    mark or whatsit or explicit kern node. (Therefore hyphenation is disabled by boxes, math
+    formulas, and discretionary nodes already inserted by the user.) The ligature nodes among $p_1
+    \ldots p_{m-1}$ are effectively expanded into the original non-ligature characters; the kern
+    nodes and whatsits are ignored. Each character |c| is now classified as either a nonletter (if
+    |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an uppercase letter (otherwise); an
+    uppercase letter is treated as if it were |lc_code(c)| for purposes of hyphenation. The
+    characters generated by $p_1\ldots p_{m-1}$ may begin with nonletters; let $c_1$ be the first
+    letter that is not in the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a
+    whatsit found after $c_1$ will be the terminating node $p_m$. All characters that do not have
+    the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for that font must be
+    between 0 and 255, otherwise hyphenation will not be attempted. \TeX\ looks ahead for as many
+    consecutive letters $c_1\ldots c_n$ as possible; however, |n| must be less than 64, so a
+    character that would otherwise be $c_{64}$ is effectively not a letter. Furthermore $c_n$ must
+    not be in the middle of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$
+    that are generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|, this
+    string qualifies for hyphenation; however, |uc_hyph| must be positive, if $c_1$ is uppercase.
+
+    The hyphenation process takes place in three stages. First, the candidate sequence $c_1 \ldots
+    c_n$ is found; then potential positions for hyphens are determined by referring to hyphenation
+    tables; and finally, the nodes $p_a\ldots p_b$ are replaced by a new sequence of nodes that
+    includes the discretionary breaks found.
+
+    Fortunately, we do not have to do all this calculation very often, because of the way it has
+    been taken out of \TEX's inner loop. For example, when the second edition of the author's
+    700-page book {\sl Seminumerical Algorithms} was typeset by \TEX, only about 1.2 hyphenations
+    needed to be tried per paragraph, since the line breaking algorithm needed to use two passes on
+    only about 5 per cent of the paragraphs. (This is not true in \LUATEX: we always hyphenate the
+    whole list.)
+
+    When a word has been set up to contain a candidate for hyphenation, \TEX\ first looks to see if it
+    is in the user's exception dictionary. If not, hyphens are inserted based on patterns that
+    appear within the given word, using an algorithm due to Frank~M. Liang.
+
+    \stopnarrower
+
+    This is incompatible with \TEX\ because the first word of a paragraph can be hyphenated, but
+    most European users seem to agree that prohibiting hyphenation there was not the best idea ever.
+
+    To be documented: |\hyphenationmode| (a bit set).
+
+    \startbuffer
+    \parindent0pt \hsize=1.1cm
+    12-34-56 \par
+    12-34-\hbox{56} \par
+    12-34-\vrule width 1em height 1.5ex \par
+    12-\hbox{34}-56 \par
+    12-\vrule width 1em height 1.5ex-56 \par
+    \hjcode`\1=`\1 \hjcode`\2=`\2 \hjcode`\3=`\3 \hjcode`\4=`\4 \vskip.5cm
+    12-34-56 \par
+    12-34-\hbox{56} \par
+    12-34-\vrule width 1em height 1.5ex \par
+    12-\hbox{34}-56 \par
+    12-\vrule width 1em height 1.5ex-56 \par
+    \stopbuffer
+
+    \typebuffer
+
+    \startpacked \getbuffer \stoppacked
+
+    We only accept an explicit hyphen when there is a preceding glyph and we skip a sequence of
+    explicit hyphens as that normally indicates a \type {--} or \type {---} ligature, in which case
+    we can, in the worst case, get bad node lists later on due to messed up ligature building as
+    these dashes are ligatures in base fonts. This is a side effect of separating the hyphenation,
+    ligaturing and kerning steps. A test is cmr with \type {------}.
+
+    A font handler can collapse successive hyphens but it's not nice to put the burden there. A
+    somewhat messy border case is \type {----} but in \LUATEX\ we don't treat \type {--} and \type
+    {---} special. Also, traditional \TEX\ will break a line at \type {-foo} but this can be
+    disabled by setting the automatic mode to \type {1}.
+
+*/
+
+// # define is_hyphen_char(chr)  (get_hc_code(chr) || chr == ex_hyphen_char_par)
+
+/*tex
+    Tells if |chr| acts as a hyphen. The result is the |hc| code when a non zero one is found for
+    the character, else the character itself when it equals |ex_hyphen_char_par|, and |null| in
+    all other cases.
+*/
+
+inline static halfword tex_aux_is_hyphen_char(halfword chr)
+{
+    halfword code = tex_get_hc_code(chr);
+    if (code) {
+        return code;
+    }
+    return (chr == ex_hyphen_char_par) ? chr : null;
+}
+
+/*tex
+    Starting at |r| we look for the glyph where the next word that is a candidate for hyphenation
+    begins. The |start_ok| flag tracks if the nodes seen so far permit a word to start at the next
+    suitable glyph. A glyph only qualifies when its language is at least |first_language|, when
+    hyphenation is enabled for it and when it has a positive |hj| code. We return that glyph or
+    |null| when the end of the list is reached first.
+*/
+
+static halfword tex_aux_find_next_wordstart(halfword r, halfword first_language)
+{
+    int start_ok = 1;
+    halfword lastglyph = r;
+    while (r) {
+        switch (node_type(r)) {
+            case boundary_node:
+                if (node_subtype(r) == word_boundary) {
+                    start_ok = 1;
+                }
+                break;
+            case disc_node:
+                start_ok = has_disc_option(r, disc_option_post_word);
+                break;
+            case hlist_node:
+            case vlist_node:
+            case rule_node:
+            case dir_node:
+            case whatsit_node:
+                if (hyphenation_permitted(glyph_hyphenate(lastglyph), strict_start_hyphenation_mode)) {
+                    start_ok = 0;
+                }
+                break;
+            case glue_node:
+                start_ok = 1;
+                break;
+            case math_node:
+                {
+                    /*tex
+                        We skip the formula up to its matching end node. The nesting level has to
+                        start at one for every formula that we run into: a counter that is shared
+                        by the whole scan is zero after the first formula, so later formulas in
+                        the same scan would not be skipped.
+                    */
+                    int mathlevel = 1;
+                    while (mathlevel > 0) {
+                        r = node_next(r);
+                        if (! r) {
+                            return r;
+                        } else if (node_type(r) == math_node) {
+                            if (node_subtype(r) == begin_inline_math) {
+                                mathlevel++;
+                            } else {
+                                mathlevel--;
+                            }
+                        }
+                    }
+                }
+                break;
+            case glyph_node:
+                {
+                    /*tex
+                        When we have no word yet and meet a hyphen (equivalent) we should just
+                        keep going. This is not compatible but it does make sense.
+                    */
+                    int chr = glyph_character(r);
+                    int hyp = tex_aux_is_hyphen_char(chr);
+                    lastglyph = r;
+                    if (hyp) {
+                        if (hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
+                            /* maybe some tracing */
+                        } else {
+                            /* todo: already check if we have hj chars left/right i.e. no digits and minus mess */
+                            halfword t = node_next(r) ;
+                            /*tex Kind of weird that we have the opposite flag test here. */
+                            if (t && (node_type(t) == glyph_node) && (! tex_aux_is_hyphen_char(glyph_character(t))) && ! hyphenation_permitted(glyph_hyphenate(r), automatic_hyphenation_mode)) {
+                                /*tex We have no word yet and the next character is a non hyphen. */
+                                r = tex_aux_compound_word_break(r, get_glyph_language(r), hyp);
+                                // test case: \automatichyphenmode0 10\high{-6-1-2-4}
+                                start_ok = 1; // todo: also in luatex
+                            } else {
+                                /*tex We jump over the sequence of hyphens. */
+                                while (t && (node_type(t) == glyph_node) && tex_aux_is_hyphen_char(glyph_character(t))) {
+                                    r = t ;
+                                    t = node_next(r) ;
+                                }
+                                if (t) {
+                                    /*tex We need a restart. */
+                                    start_ok = 0;
+                                } else {
+                                    /*tex We reached the end of the list so we have no word start. */
+                                    return null;
+                                }
+                            }
+                        }
+                    } else if (start_ok && (get_glyph_language(r) >= first_language) && get_glyph_dohyph(r)) {
+                        int l = tex_get_hj_code(get_glyph_language(r), chr);
+                        if (l > 0) {
+                            /*tex
+                                A code equal to the character, a code of at most 32, or a glyph
+                                with the |uchyph| property set makes this glyph the word start.
+                            */
+                            if (l == chr || l <= 32 || get_glyph_uchyph(r)) {
+                                return r;
+                            } else {
+                                start_ok = 0;
+                            }
+                        } else {
+                            /*tex We go on. */
+                        }
+                    } else {
+                        /*tex We go on. */
+                    }
+                }
+                break;
+            default:
+                start_ok = 0;
+                break;
+        }
+        r = node_next(r);
+    }
+    return r; /* null */
+}
+
+/*tex
+
+    This is the original test, extended with bounds, but still the complex expression turned into
+    a function.  However, it actually is part of the old mechanism where hyphenation was mixed
+    with ligature building and kerning, so there was this skipping over a font kern which is no
+    longer needed as we have separate steps.
+
+    We keep this as reference:
+
+    \starttyping
+    static int valid_wordend(halfword s, halfword strict_bound)
+    {
+        if (s) {
+            halfword r = s;
+            int clang = get_glyph_language(s);
+            while ( (r) &&
+                   (    (type(r) == glyph_node && clang == get_glyph_language(r))
+                     || (type(r) == kern_node && (subtype(r) == font_kern))
+                    )
+                   ) {
+                r = node_next(r);
+            }
+            return (! r || (type(r) == glyph_node && clang != get_glyph_language(r))
+                        ||  type(r) == glue_node
+                        ||  type(r) == penalty_node
+                        || (type(r) == kern_node && (subtype(r) == explicit_kern ||
+                                                     subtype(r) == italic_kern   ||
+                                                     subtype(r) == accent_kern   ))
+                        ||  ((type(r) == hlist_node   ||
+                              type(r) == vlist_node   ||
+                              type(r) == rule_node    ||
+                              type(r) == dir_node     ||
+                              type(r) == whatsit_node ||
+                              type(r) == insert_node  ||
+                              type(r) == adjust_node
+                             ) && ! (strict_bound == 2 || strict_bound == 3))
+                        ||  type(r) == boundary_node
+                );
+        } else {
+            return 1;
+        }
+    }
+    \stopttyping
+
+*/
+
+/*tex
+    Decides if the word that ends at |end_word| may be hyphenated given the node |r| that follows
+    it. The end of the list and every node type that is not listed below (so also glyphs, glue,
+    penalties and kerns) give a valid word end. A discretionary needs the |pre_word| option and
+    box like nodes are only acceptable when the strict end mode is not set on |end_word|.
+*/
+
+static int tex_aux_valid_wordend(halfword end_word, halfword r)
+{
+    if (! r) {
+        return 1;
+    }
+    switch (node_type(r)) {
+        case disc_node:
+            return has_disc_option(r, disc_option_pre_word);
+        case hlist_node:
+        case vlist_node:
+        case rule_node:
+        case dir_node:
+        case whatsit_node:
+        case insert_node:
+        case adjust_node:
+            return ! hyphenation_permitted(glyph_hyphenate(end_word), strict_end_hyphenation_mode);
+        default:
+            return 1;
+    }
+}
+
+/*tex
+    The entry point for hyphenating the list |head| .. |tail|. A list without a second node is
+    left alone. When the |hyphenate| callback is defined it gets the head and tail passed (no
+    return values are expected), when the callback state is zero we run the built in hyphenator
+    and a negative state means that hyphenation is disabled.
+*/
+
+void tex_handle_hyphenation(halfword head, halfword tail)
+{
+    int callback_id;
+    if (! head || ! node_next(head)) {
+        return;
+    }
+    callback_id = lmt_callback_defined(hyphenate_callback);
+    if (callback_id == 0) {
+        tex_hyphenate_list(head, tail);
+    } else if (callback_id > 0) {
+        lua_State *L = lmt_lua_state.lua_instance;
+        int top = 0;
+        if (lmt_callback_okay(L, callback_id, &top)) {
+            int status;
+            lmt_node_list_to_lua(L, head);
+            lmt_node_list_to_lua(L, tail);
+            status = lmt_callback_call(L, 2, 0, top);
+            if (status) {
+                lmt_callback_error(L, top, status);
+            } else {
+                lmt_callback_wrapup(L, top);
+            }
+        }
+    } else {
+        /* -1 : disabled */
+    }
+}
+
+/*tex
+    We run the pattern state machine of |dict| over the word |first| .. |last| that has |length|
+    characters, with a period put in front and after it. For every position the highest value
+    found in the matching patterns is kept and after that a syllable discretionary is inserted
+    at each position from |left| up to |right| that ended up with an odd value. The number of
+    inserted discretionaries is returned; when the scratch array cannot be allocated an overflow
+    error is reported and zero is returned.
+*/
+
+static int tex_aux_hnj_hyphen_hyphenate(
+    hjn_dictionary *dict,
+    halfword        first,
+    halfword        last,
+    int             length,
+    halfword        left,
+    halfword        right,
+    lang_variables *lan
+)
+{
+    /*tex We need two slots for the dots at each end and one for the points outside characters. */
+    int hyphen_len = length + 3;
+    /*tex Because we have a limit of 64 characters we could just use a static array here: */
+    char *hyphens = lmt_memory_calloc(hyphen_len, sizeof(unsigned char));
+    if (! hyphens) {
+        tex_overflow_error("patterns", hyphen_len);
+        return 0;
+    } else {
+        int inserted = 0;
+        int state = 0;
+        int position = 0;
+        halfword current = begin_period;
+        /*tex The word gets wrapped in the two period nodes to facilitate matching. */
+        node_next(begin_period) = first;
+        node_next(end_period) = node_next(last);
+        node_next(last) = end_period;
+        /*tex Now we feed the characters, the periods included, to the finite state machine. */
+        while (current != node_next(end_period)) {
+            int ch = '.';
+            if (current != begin_period && current != end_period) {
+                ch = tex_get_hj_code(get_glyph_language(current), glyph_character(current));
+                if (ch <= 32) {
+                    ch = glyph_character(current);
+                }
+            }
+            for (;;) {
+                if (state == -1) {
+                    /*tex Nothing worked, so we start afresh with the next character. */
+                    state = 0;
+                    break;
+                } else {
+                    hjn_state *hstate = &dict->states[state];
+                    int k = 0;
+                    while (k < hstate->num_trans && hstate->trans[k].uni_ch != ch) {
+                        k++;
+                    }
+                    if (k < hstate->num_trans) {
+                        char *match;
+                        state = hstate->trans[k].new_state;
+                        match = dict->states[state].match;
+                        if (match) {
+                            /*tex
+                                We add two because the string length is one bigger than the offset
+                                and because hyphenation starts before the first character.
+                            */
+                            int offset = (int) (position + 2 - (int) strlen(match));
+                            for (int m = 0; match[m]; m++) {
+                                if (hyphens[offset + m] < match[m]) {
+                                    hyphens[offset + m] = match[m];
+                                }
+                            }
+                        }
+                        break;
+                    } else {
+                        /*tex No transition here, so we try the fallback state. */
+                        state = hstate->fallback_state;
+                    }
+                }
+            }
+            position++;
+            current = node_next(current);
+        }
+        /*tex We take the end period out of the list again. */
+        node_next(last) = node_next(end_period);
+        /*tex
+            For |.word.| the first two entries of |hyphens| are dropped; we first move on to
+            |left| and then handle the positions up to |right|.
+        */
+        current = first;
+        position = 2;
+        while (current != left) {
+            position++;
+            current = node_next(current);
+        }
+        while (current != right) {
+            if (hyphens[position] & 1) {
+                current = tex_aux_insert_syllable_discretionary(current, lan);
+                inserted += 1;
+            }
+            position++;
+            current = node_next(current);
+        }
+        lmt_memory_free(hyphens);
+        return inserted;
+    }
+}
+
+/* we can also check the original */
+
+/*tex
+    Checks if the word |f| .. |l|, with |r| expected right after |l|, still consists of exactly
+    |n| glyphs whose characters agree with the \UTF-8 string |utf8original|. We return 1 when that
+    is the case; otherwise a warning that tells what went wrong is issued and 0 is returned.
+*/
+
+static int tex_aux_still_okay(halfword f, halfword l, halfword r, int n, const char *utf8original)
+{
+    const char *problem = "the hyphenation list is messed up, skipping";
+    if (_valid_node_(f) && _valid_node_(l) && node_next(l) == r) {
+        const unsigned char *expected = (const unsigned char *) utf8original;
+        for (int count = 1; f; f = node_next(f), count++) {
+            halfword c;
+            if (node_type(f) != glyph_node) {
+                problem = "the hyphenated word contains non-glyphs, skipping";
+                break;
+            }
+            c = (halfword) aux_str2uni(expected);
+            expected += utf8_size(c);
+            if (! c || c != glyph_character(f)) {
+                problem = "the hyphenated word contains different characters, skipping";
+                break;
+            }
+            if (f == l) {
+                /*tex We are at the last glyph so the count has to match. */
+                if (count == n) {
+                    return 1;
+                }
+                problem = "the hyphenated word changed length, skipping";
+                break;
+            }
+        }
+    }
+    tex_normal_warning("language", problem);
+    return 0;
+}
+
+/*tex
+    Shows the nodes |beg| .. |end|. The list is temporarily cut off after |end| so that only that
+    part is shown; the link is restored afterwards. Nothing happens when one of the two is not a
+    valid node.
+*/
+
+static void tex_aux_hyphenate_show(halfword beg, halfword end)
+{
+    halfword saved;
+    if (! (_valid_node_(beg) && _valid_node_(end))) {
+        return;
+    }
+    saved = node_next(end);
+    node_next(end) = null;
+    tex_show_node_list(beg, 100, 10000);
+    node_next(end) = saved;
+}
+
+/* maybe split: first a processing run */
+
+/*tex
+    A glyph is a traditional hyphen when its character is |ex_hyphen_char_par| (parameter), its
+    font has the collapse hyphens text control set (font driven) and its hyphenation mode permits
+    collapsing (language driven). The tests are done in that order and we quit at the first one
+    that fails.
+*/
+
+inline static int is_traditional_hyphen(halfword n)
+{
+    if (glyph_character(n) != ex_hyphen_char_par) {
+        return 0;
+    } else if (! (has_font_text_control(glyph_font(n),text_control_collapse_hyphens))) {
+        return 0;
+    } else {
+        return (hyphenation_permitted(glyph_hyphenate(n), collapse_hyphenation_mode)) ? 1 : 0;
+    }
+}
+
+/*tex
+    Walks over the list that starts at |head| and rewrites traditional hyphens: a single one gets
+    character |c1|, two in a row (same font) become one glyph with |c2| and three in a row become
+    one glyph with |c3|; the glyphs that are no longer needed are flushed. The three lists of a
+    discretionary are handled recursively and the discretionary is checked afterwards when one of
+    them reported a change. The return value is 1 when a run of two or three hyphens was collapsed
+    in this list itself and 0 otherwise.
+*/
+
+int tex_collapse_list(halfword head, halfword c1, halfword c2, halfword c3) /* ex_hyphen_char_par 0x2013 0x2014 */
+{
+    halfword found = 0;
+    halfword current = head;
+    /*tex Let's play safe: */
+    if (! (head && c1 && c2 && c3)) {
+        return found;
+    }
+    while (current) {
+        halfword second = node_next(current);
+        switch (node_type(current)) {
+            case glyph_node:
+                if (! is_traditional_hyphen(current)) {
+                    break;
+                }
+                set_glyph_discpart(current, glyph_discpart_always);
+                if (second && node_type(second) == glyph_node && is_traditional_hyphen(second) && glyph_font(current) == glyph_font(second)) {
+                    halfword third = node_next(second);
+                    halfword rest;
+                    if (third && node_type(third) == glyph_node && is_traditional_hyphen(third) && glyph_font(current) == glyph_font(third)) {
+                        /*tex Three in a row. */
+                        rest = node_next(third);
+                        glyph_character(current) = c3;
+                        tex_try_couple_nodes(current, rest);
+                        tex_flush_node(second);
+                        tex_flush_node(third);
+                    } else {
+                        /*tex Two in a row. */
+                        rest = third;
+                        glyph_character(current) = c2;
+                        tex_try_couple_nodes(current, rest);
+                        tex_flush_node(second);
+                    }
+                    found = 1;
+                    /*tex We go on with what comes after the collapsed run. */
+                    current = rest;
+                    continue;
+                }
+                glyph_character(current) = c1; /* can become language dependent */
+                break;
+            case disc_node:
+                {
+                    int changed = 0;
+                    if (disc_pre_break_head(current) && tex_collapse_list(disc_pre_break_head(current), c1, c2, c3)) {
+                        changed = 1;
+                    }
+                    if (disc_post_break_head(current) && tex_collapse_list(disc_post_break_head(current), c1, c2, c3)) {
+                        changed = 1;
+                    }
+                    if (disc_no_break_head(current) && tex_collapse_list(disc_no_break_head(current), c1, c2, c3)) {
+                        changed = 1;
+                    }
+                    if (changed) {
+                        tex_check_disc_field(current);
+                    }
+                }
+                break;
+            default:
+                break;
+        }
+        current = second;
+    }
+    return found;
+}
+
+void tex_hyphenate_list(halfword head, halfword tail)
+{
+    /*tex Let's play safe: */
+    if (tail) {
+        halfword first_language = first_valid_language_par; /* combine with check below */
+        halfword trace = tracing_hyphenation_par;
+        halfword r = head;
+        /*tex
+            This first movement assures two things:
+
+            \startitemize
+                \startitem
+                    That we won't waste lots of time on something that has been handled already (in
+                    that case, none of the glyphs match |simple_character|).
+                \stopitem
+                \startitem
+                    That the first word can be hyphenated. If the movement was not explicit, then
+                    the indentation at the start of a paragraph list would make |find_next_wordstart()|
+                    look too far ahead.
+                \stopitem
+            \stopitemize
+        */
+        while (r && node_type(r) != glyph_node) {
+            r = node_next(r);
+        }
+        if (r) {
+            r = tex_aux_find_next_wordstart(r, first_language);
+            if (r) {
+                lang_variables langdata;
+                char utf8word[(4 * max_size_of_word) + 1] = { 0 };
+                char utf8original[(4 * max_size_of_word) + 1] = { 0 };
+                char *utf8ptr = utf8word;
+                char *utf8ori = utf8original;
+                int word_length = 0;
+                int explicit_hyphen = 0;
+                int last_char = 0;
+                int valid = 0;
+                halfword explicit_start = null;
+                halfword saved_tail = node_next(tail);
+                halfword penalty = tex_new_penalty_node(0, word_penalty_subtype);
+                /* kind of curious hack, this addition that we later remove */
+                tex_attach_attribute_list_copy(penalty, r);
+                tex_couple_nodes(tail, penalty); /* todo: attrobute */
+                while (r) {
+                    halfword word_start = r;
+                    int word_language = get_glyph_language(word_start);
+                    if (tex_is_valid_language(word_language)) {
+                        halfword word_end = r;
+                        int lhmin = get_glyph_lhmin(word_start);
+                        int rhmin = get_glyph_rhmin(word_start);
+                        int hmin = tex_get_hyphenation_min(word_language);
+                        halfword word_font = glyph_font(word_start);
+                        if (! tex_is_valid_font(word_font) || font_hyphen_char(word_font) < 0) {
+                            /*tex For backward compatibility we set: */
+                            word_font = 0;
+                        }
+                        langdata.pre_hyphen_char = tex_get_pre_hyphen_char(word_language);
+                        langdata.post_hyphen_char = tex_get_post_hyphen_char(word_language);
+                        while (r && node_type(r) == glyph_node && word_language == get_glyph_language(r)) {
+                            halfword chr = glyph_character(r);
+                            halfword hyp = tex_aux_is_hyphen_char(chr);
+                            if (word_language >= first_language) {
+                                last_char = tex_get_hj_code(word_language, chr);
+                                if (last_char > 0) {
+                                    goto GOFORWARD;
+                                }
+                            }
+                            if (hyp) {
+                                last_char = hyp;
+                             // if (last_char) {
+                             //     goto GOFORWARD;
+                             // }
+                            } else {
+                                break;
+                            }
+                          GOFORWARD:
+                         // explicit_hyphen = is_hyphen_char(chr);
+                            explicit_hyphen = hyp;
+                            if (explicit_hyphen && node_next(r) && node_type(node_next(r)) != glyph_node && hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
+                                /* maybe some tracing */
+                                explicit_hyphen = 0;
+                            }
+                            if (explicit_hyphen) {
+                                break;
+                            } else {
+                                word_length++;
+                                if (word_length >= max_size_of_word) {
+                                    /* tex_normal_warning("language", "ignoring long word"); */
+                                    while (r && node_type(r) == glyph_node) {
+                                        r = node_next(r);
+                                    }
+                                    goto PICKUP;
+                                } else {
+                                    if (last_char <= 32) {
+                                        if (last_char == 32) {
+                                            last_char = 0 ;
+                                        }
+                                        if (word_length <= lhmin) {
+                                            lhmin = lhmin - last_char + 1 ;
+                                            if (lhmin < 0) {
+                                                lhmin = 1;
+                                            }
+                                        }
+                                        if (word_length >= rhmin) {
+                                            rhmin = rhmin - last_char + 1 ;
+                                            if (rhmin < 0) {
+                                                rhmin = 1;
+                                            }
+                                        }
+                                        hmin = hmin - last_char + 1 ;
+                                        if (hmin < 0) {
+                                            rhmin = 1;
+                                        }
+                                        last_char = chr ;
+                                    }
+                                    utf8ori = aux_uni2string(utf8ori, (unsigned) chr);
+                                    utf8ptr = aux_uni2string(utf8ptr, (unsigned) last_char);
+                                    word_end = r;
+                                    r = node_next(r);
+                                }
+                            }
+                        }
+                        if (explicit_hyphen) {
+                            /*tex We are not at the start, so we only need to look ahead. */
+                            if ((get_glyph_discpart(r) == glyph_discpart_replace && ! hyphenation_permitted(glyph_hyphenate(r), syllable_hyphenation_mode))) {
+                                /*tex
+                                    This can be the consequence of inhibition too, see |finish_discretionary|
+                                    in which case the replace got injected which can have a hyphen. And we want
+                                    to run the callback if set in order to replace.
+                                */
+                                valid = 1;
+                                goto MESSYCODE;
+                            } else {
+                                /*tex Maybe we should get rid of this ----- stuff. */
+                                halfword t = node_next(r);
+                                if (t && node_type(t) == glyph_node && ! tex_aux_is_hyphen_char(glyph_character(t)) && hyphenation_permitted(glyph_hyphenate(t), automatic_hyphenation_mode)) {
+                                    /*tex we have a word already but the next character may not be a hyphen too */
+                                    halfword g = r;
+                                    r = tex_aux_compound_word_break(r, get_glyph_language(g), explicit_hyphen);
+                                    if (trace > 1) {
+                                        *utf8ori = 0;
+                                        tex_begin_diagnostic();
+                                        tex_print_format("[language: compound word break after %s]", utf8original);
+                                        tex_end_diagnostic();
+                                    }
+                                    if (hyphenation_permitted(glyph_hyphenate(g), compound_hyphenation_mode)) {
+                                        explicit_hyphen = 0;
+                                        if (hyphenation_permitted(glyph_hyphenate(g), force_handler_hyphenation_mode) || hyphenation_permitted(glyph_hyphenate(g), feedback_compound_hyphenation_mode)) {
+                                            set_disc_option(r, disc_option_pre_word | disc_option_post_word);
+                                            explicit_start = null;
+                                            valid = 1;
+                                            goto MESSYCODE;
+                                        } else {
+                                            if (! explicit_start) {
+                                                explicit_start = word_start;
+                                            }
+                                            /*tex For exceptions. */
+                                            utf8ptr = aux_uni2string(utf8ptr, '-');
+                                            r = t;
+                                            continue;
+                                        }
+                                    }
+                                } else {
+                                    /*tex We jump over the sequence of hyphens ... traditional. */
+                                    while (t && node_type(t) == glyph_node && tex_aux_is_hyphen_char(glyph_character(t))) {
+                                        r = t;
+                                        t = node_next(r);
+                                    }
+                                    if (! t) {
+                                        /*tex we reached the end of the list and will quit the loop later */
+                                        r = null;
+                                    }
+                                }
+                            }
+                        } else {
+                            valid = tex_aux_valid_wordend(word_end, r);
+                          MESSYCODE:
+                            /*tex We have a word, r is at the next node. */
+                            if (word_font && word_language >= first_language) {
+                                /*tex We have a language, actually we already tested that. */
+                                struct tex_language *lang = lmt_language_state.languages[word_language];
+                                if (lang) {
+                                    char *replacement = NULL;
+                                    halfword start = explicit_start ? explicit_start : word_start;
+                                    int okay = word_length >= lhmin + rhmin && (hmin <= 0 || word_length >= hmin) && hyphenation_permitted(glyph_hyphenate(start), syllable_hyphenation_mode);
+                                    *utf8ptr = 0;
+                                    *utf8ori = 0;
+                                    if (lang->wordhandler && hyphenation_permitted(glyph_hyphenate(start), force_handler_hyphenation_mode)) {
+                                        halfword restart = node_prev(start); /*tex before the word. */
+                                        int done = lmt_handle_word(lang, utf8original, utf8word, word_length, start, word_end, &replacement);
+                                        if (replacement) {
+                                            if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
+                                                goto EXCEPTIONS2;
+                                            } else {
+                                                goto PICKUP;
+                                            }
+                                        } else {
+                                            /* 1: restart 2: exceptions+patterns 3: patterns *: next word */
+                                            switch (done) {
+                                                case 1:
+                                                    if (_valid_node_(restart)) {
+                                                        r = restart;
+                                                    } else if (_valid_node_(start)) {
+                                                        r = node_prev(start);
+                                                    }
+                                                    if (! r) {
+                                                        if (_valid_node_(head)) {
+                                                            tex_normal_warning("language", "the hyphenation list is messed up, recovering");
+                                                            r = head;
+                                                        } else {
+                                                            tex_normal_error("language", "the hyphenated head is messed up, aborting");
+                                                            return;
+                                                        }
+                                                    }
+                                                    goto PICKUP;
+                                                case 2:
+                                                    if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
+                                                        goto EXCEPTIONS1;
+                                                    } else {
+                                                        goto PICKUP;
+                                                    }
+                                                case 3:
+                                                    if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
+                                                        goto PATTERNS;
+                                                    } else {
+                                                        goto PICKUP;
+                                                    }
+                                                default:
+                                                    if (_valid_node_(r)) { /* or word_end */
+                                                        goto PICKUP;
+                                                    } else if (_valid_node_(tail)) {
+                                                        tex_normal_warning("language", "the hyphenation list is messed up, quitting");
+                                                        goto ABORT;
+                                                    } else {
+                                                        // tex_normal_error("language","the hyphenated tail is messed up, aborting");
+                                                        return;
+                                                    }
+                                            }
+                                        }
+                                    }
+                                    if (! okay || ! valid) {
+                                        goto PICKUP;
+                                    }
+                                    /*tex
+                                        This is messy and nasty: we can have a word with a - in it which is why
+                                        we have two branches. Also, every word that suits the length criteria
+                                        is checked via \LUA. There is no point in optimizing this because tests
+                                        have demonstrated that checking against the min and max lengths of
+                                        exception strings has no gain.
+                                    */
+                                  EXCEPTIONS1:
+                                    if (lang->exceptions) {
+                                        replacement = tex_aux_hyphenation_exception(lang->exceptions, utf8word);
+                                    }
+                                  EXCEPTIONS2:
+                                    if (replacement) {
+                                        /*tex handle the exception and go on to the next word */
+                                        halfword start = explicit_start ? explicit_start : word_start;
+                                        halfword beg = node_prev(start);
+                                        tex_aux_do_exception(start, r, replacement); // r == next_node(word_end)
+                                        if (trace > 1) {
+                                            tex_begin_diagnostic();
+                                            tex_print_format("[language: exception %s to %s]", utf8original, replacement);
+                                            if (trace > 2) {
+                                                tex_aux_hyphenate_show(node_next(beg), node_prev(r));
+                                            }
+                                            tex_end_diagnostic();
+                                        }
+                                        lmt_memory_free(replacement);
+                                        goto PICKUP;
+                                    }
+                                    PATTERNS:
+                                    if (lang->patterns) {
+                                        if (explicit_start) {
+                                            /*tex We're done already */
+                                        } else if (hyphenation_permitted(glyph_hyphenate(word_start), syllable_hyphenation_mode)) {
+                                            halfword left = word_start;
+                                            halfword right = r; /*tex We're one after |word_end|. */
+                                            for (int i = lhmin; i > 1; i--) {
+                                                left = node_next(left);
+                                                if (! left || left == right) {
+                                                    goto PICKUP;
+                                                }
+                                            }
+                                            if (right != left) {
+                                                int done = 0;
+                                                for (int i = rhmin; i > 0; i--) {
+                                                    right = node_prev(right);
+                                                    if (! right || right == left) {
+                                                        goto PICKUP;
+                                                    }
+                                                }
+                                                done = tex_aux_hnj_hyphen_hyphenate(lang->patterns, word_start, word_end, word_length, left, right, &langdata);
+                                                if (trace > 1) {
+                                                    tex_begin_diagnostic();
+                                                    if (done) {
+                                                        tex_print_format("[language: hyphenated %s at %i positions]", utf8original, done);
+                                                        if (trace > 2) {
+                                                            tex_aux_hyphenate_show(node_next(left), node_prev(right));
+                                                        }
+                                                    } else {
+                                                        tex_print_format("[language: not hyphenated %s]", utf8original);
+                                                    }
+                                                    tex_end_diagnostic();
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                  PICKUP:
+                    explicit_start = null ;
+                    explicit_hyphen = 0;
+                    word_length = 0;
+                    utf8ptr = utf8word;
+                    utf8ori = utf8original;
+                    if (r) {
+                        r = tex_aux_find_next_wordstart(r, first_language);
+                    } else {
+                        break;
+                    }
+                }
+              ABORT:
+                tex_flush_node(node_next(tail));
+                node_next(tail) = saved_tail;
+            }
+        }
+    }
+}
+
+/*tex
+
+    Replace |glyph| in its node list by a new discretionary with subtype |code| and return that
+    discretionary. The pre and post break fields each get a copy of the list that starts at the
+    glyph, while the no break (replace) field takes over the original glyph. The discretionary
+    gets its attributes from the glyph by means of |tex_attach_attribute_list_copy|.
+
+    When |keepkern| is set and the glyph is directly followed by an italic correction kern, that
+    kern stays attached to the glyph and thereby travels along into all three fields. The kern is
+    the node |next| itself, so that is where the list has to be cut. Looking one node further
+    would drag an unrelated node into the discretionary and would access |null| when the kern
+    happens to be the last node of the list.
+
+*/
+
+halfword tex_glyph_to_discretionary(halfword glyph, quarterword code, int keepkern)
+{
+    halfword prev = node_prev(glyph);
+    halfword next = node_next(glyph);
+    halfword disc = tex_new_disc_node(code);
+    if (keepkern && next && node_type(next) == kern_node && node_subtype(next) == italic_kern_subtype) {
+        /*tex The italic kern is |next|: keep it linked to the glyph and continue after it. */
+        halfword kern = next;
+        next = node_next(kern);
+        node_next(kern) = null;
+    } else {
+        /*tex Only the glyph goes into the discretionary. */
+        node_next(glyph) = null;
+    }
+    /*tex Isolate the glyph (plus optional kern) so that the copies below stop at its end. */
+    node_prev(glyph) = null;
+    tex_attach_attribute_list_copy(disc, glyph);
+    /*tex The copies have to be made before the original is handed over to the replace field. */
+    tex_set_disc_field(disc, pre_break_code, tex_copy_node_list(glyph, null));
+    tex_set_disc_field(disc, post_break_code, tex_copy_node_list(glyph, null));
+    tex_set_disc_field(disc, no_break_code, glyph);
+    /*tex Put the discretionary at the position that the glyph had in the list. */
+    tex_try_couple_nodes(prev, disc);
+    tex_try_couple_nodes(disc, next);
+    return disc;
+}
+\ No newline at end of file