summaryrefslogtreecommitdiff
path: root/source/luametatex/source/tex/texlanguage.c
diff options
context:
space:
mode:
Diffstat (limited to 'source/luametatex/source/tex/texlanguage.c')
-rw-r--r--source/luametatex/source/tex/texlanguage.c1774
1 files changed, 1774 insertions, 0 deletions
diff --git a/source/luametatex/source/tex/texlanguage.c b/source/luametatex/source/tex/texlanguage.c
new file mode 100644
index 000000000..6f3460c22
--- /dev/null
+++ b/source/luametatex/source/tex/texlanguage.c
@@ -0,0 +1,1774 @@
+/*
+ See license.txt in the root of this project.
+*/
+
+# include "luametatex.h"
+
+/*tex
+
    We no longer dump the patterns and exceptions as they are supposed to be loaded at
    runtime. There is no gain in getting them from the format. But we do dump some of the
    properties.
+
+ There were all kind of checks for simple characters i.e. not ligatures but there is no need for
+ that in \LUAMETATEX. We have separated stages and the hyphenator sees just glyphs. And when a
+ traditional font has glyphs we can assume that the old school font encoding matches the patterns
+ i.e. that ligatures are not in the normal character slots.
+
    Exceptions are stored at the \LUA\ end. We cannot easily go dynamic because fonts are stored
    in the eqtb so we would have to use some more indirect mechanism (doable as we do it for other
    items) too.
+
+*/
+
/*tex
    The global language state. The |language_data| record mirrors the other managed memory
    arrays in \LUAMETATEX: the pointer array grows from |minimum| in increments of |step|
    up to |maximum|, while |allocated| tracks the byte count for statistics.
*/

language_state_info lmt_language_state = {
    .languages = NULL,
    .language_data = {
        .minimum = min_language_size,
        .maximum = max_language_size,
        .size = memory_data_unset,
        .step = stp_language_size,
        .allocated = 0,
        .itemsize = 1,
        .top = 0,
        .ptr = 0,
        .initial = memory_data_unset,
        .offset = 0,
    },
    .handler_table_id = 0,
    .handler_count = 0,
};
+
+/*tex
+ We can enforce a language id but we want to be sequential so we accept holes! So one
+ has to define bottom-up. As with fonts, we have a zero language but that one normally
+ is not set.
+*/
+
+static void tex_aux_reset_language(halfword id)
+{
+ tex_language *lang = lmt_language_state.languages[id];
+ lang->id = id;
+ lang->exceptions = 0;
+ lang->patterns = NULL;
+ lang->wordhandler = 0;
+ lang->pre_hyphen_char = '-';
+ lang->post_hyphen_char = 0;
+ lang->pre_exhyphen_char = 0;
+ lang->post_exhyphen_char = 0;
+ lang->hyphenation_min = -1;
+ lang->hjcode_head = NULL;
+}
+
+/*tex
+ A value below zero will bump the language id. Because we have a rather limited number of
+ languages there is no configuration, size is just maximum.
+*/
+
/*tex
    Return a usable language id. A negative |id| requests the next sequential one (growing
    the array when needed); a non-negative |id| requests that specific slot, which must be
    within |maximum| and not yet occupied.
*/

static halfword tex_aux_new_language_id(halfword id)
{
    int top;
    if (id >= 0) {
        if (id <= lmt_language_state.language_data.top) {
            /*tex The slot exists already; it must still be free. */
            if (lmt_language_state.languages[id]) {
                return tex_formatted_error("languages", "the language with id %d is already created", id);
            } else {
                return id;
            }
        } else if (id > lmt_language_state.language_data.maximum) {
            goto OVERFLOWERROR;
        } else {
            /*tex Grow the array far enough to hold the requested id. */
            top = id;
        }
    } else if (lmt_language_state.language_data.ptr < lmt_language_state.language_data.top) {
        /*tex There still is room: hand out the next sequential id. */
        ++lmt_language_state.language_data.ptr;
        return lmt_language_state.language_data.ptr;
    } else if (lmt_language_state.language_data.top >= lmt_language_state.language_data.maximum) {
        goto OVERFLOWERROR;
    } else if (lmt_language_state.language_data.top + lmt_language_state.language_data.step > lmt_language_state.language_data.maximum) {
        /*tex Clip the final growth step to the configured maximum. */
        top = lmt_language_state.language_data.maximum;
    } else {
        top = lmt_language_state.language_data.top + lmt_language_state.language_data.step;
    }
    /*tex Finally we can bump memory. */
    {
        tex_language **tmp = aux_reallocate_array(lmt_language_state.languages, sizeof(tex_language *), top, 0);
        if (tmp) {
            /*tex Only the freshly added slots need clearing. */
            for (int i = lmt_language_state.language_data.top + 1; i <= top; i++) {
                tmp[i] = NULL;
            }
            lmt_language_state.languages = tmp;
            lmt_language_state.language_data.allocated += ((size_t) top - lmt_language_state.language_data.top) * sizeof(tex_language *);
            lmt_language_state.language_data.top = top;
            lmt_language_state.language_data.ptr += 1;
            /*tex
                NOTE(review): when an explicit |id| beyond the old |top| triggered this growth
                we return |ptr| and not |id|; these only coincide when ids are assigned
                strictly bottom-up as the heading comment assumes -- confirm intent.
            */
            return lmt_language_state.language_data.ptr;
        }
    }
  OVERFLOWERROR:
    tex_overflow_error("languages", lmt_language_state.language_data.maximum);
    return 0;
}
+
+void tex_initialize_languages(void)
+{
+ tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), lmt_language_state.language_data.minimum, 0);
+ if (tmp) {
+ for (int i = 0; i < lmt_language_state.language_data.minimum; i++) {
+ tmp[i] = NULL;
+ }
+ lmt_language_state.languages = tmp;
+ lmt_language_state.language_data.allocated += lmt_language_state.language_data.minimum * sizeof(tex_language *);
+ lmt_language_state.language_data.top = lmt_language_state.language_data.minimum;
+ } else {
+ tex_overflow_error("languages", lmt_language_state.language_data.minimum);
+ }
+}
+
+/*
+halfword tex_aux_maximum_language_id(void)
+{
+ return language_state.language_data.maximum;
+}
+*/
+
+int tex_is_valid_language(halfword n)
+{
+ if (n == 0) {
+ return 1;
+ } else if (n > 0 && n <= lmt_language_state.language_data.top) {
+ return lmt_language_state.languages[n] ? 1 : 0;
+ } else {
+ return 0;
+ }
+}
+
+tex_language *tex_new_language(halfword n)
+{
+ halfword id = tex_aux_new_language_id(n);
+ if (id >= 0) {
+ tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
+ if (lang) {
+ lmt_language_state.languages[id] = lang;
+ lmt_language_state.language_data.allocated += sizeof(struct tex_language);
+ tex_aux_reset_language(id);
+ if (saving_hyph_codes_par) {
+ /*tex
+ For now, we might just use specific value for whatever task. This will become
+ obsolete.
+ */
+ tex_hj_codes_from_lc_codes(id);
+ }
+ } else {
+ tex_overflow_error("language", sizeof(struct tex_language));
+ }
+ return lang;
+ } else {
+ return NULL;
+ }
+}
+
+tex_language *tex_get_language(halfword n)
+{
+ if (n >= 0) {
+ if (n <= lmt_language_state.language_data.top && lmt_language_state.languages[n]) {
+ return lmt_language_state.languages[n];
+ }
+ if (n <= lmt_language_state.language_data.maximum) {
+ return tex_new_language(n);
+ }
+ }
+ return NULL;
+}
+
+/*tex
+ Freeing, dumping, undumping languages:
+*/
+
+/*
+void free_languages(void)
+{
+ for (int i = 0; i < language_state.language_data.top; i++) {
+ if (language_state.languages[i]) {
+ lmt_memory_free(language_state.languages[i]);
+ language_state.languages[i] = NULL;
+ }
+ }
+}
+*/
+
/*tex
    Dump the language properties into the format file. Patterns and exceptions are not
    dumped (they are loaded at runtime); slots are written sparsely with a 1/0 tag telling
    whether a slot holds a language.
*/

void tex_dump_language_data(dumpstream f)
{
    dump_int(f, lmt_language_state.language_data.top);
    dump_int(f, lmt_language_state.language_data.ptr);
    if (lmt_language_state.language_data.top > 0) {
        for (int i = 0; i < lmt_language_state.language_data.top; i++) {
            tex_language *lang = lmt_language_state.languages[i];
            if (lang) {
                /*tex Occupied slot: tag, scalar properties, then the hj codes. */
                dump_via_int(f, 1);
                dump_int(f, lang->id);
                dump_int(f, lang->pre_hyphen_char);
                dump_int(f, lang->post_hyphen_char);
                dump_int(f, lang->pre_exhyphen_char);
                dump_int(f, lang->post_exhyphen_char);
                dump_int(f, lang->hyphenation_min);
                tex_dump_language_hj_codes(f, i);
            } else {
                /*tex A hole in the id range. */
                dump_via_int(f, 0);
            }
        }
    }
}
+
/*tex
    The inverse of dumping: rebuild the sparse language array from the format file. The
    runtime-only members (patterns, exceptions, word handler, hj code head) start out empty;
    only the scalar properties and hj codes come from the format.
*/

void tex_undump_language_data(dumpstream f)
{
    int top, ptr;
    undump_int(f, top);
    undump_int(f, ptr);
    if (top > 0) {
        tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), top, 0);
        if (tmp) {
            lmt_language_state.language_data.top = top;
            lmt_language_state.language_data.ptr = ptr;
            lmt_language_state.languages = tmp;
            for (int i = 0; i < top; i++) {
                int x;
                undump_int(f, x);
                if (x == 1) {
                    tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
                    if (lang) {
                        lmt_language_state.languages[i] = lang;
                        lmt_language_state.language_data.allocated += sizeof(struct tex_language);
                        /*tex These members are rebuilt at runtime, not undumped: */
                        lang->exceptions = 0;
                        lang->patterns = NULL;
                        lang->wordhandler = 0;
                        lang->hjcode_head = NULL;
                        undump_int(f, lang->id);
                        undump_int(f, lang->pre_hyphen_char);
                        undump_int(f, lang->post_hyphen_char);
                        undump_int(f, lang->pre_exhyphen_char);
                        undump_int(f, lang->post_exhyphen_char);
                        undump_int(f, lang->hyphenation_min);
                        tex_undump_language_hj_codes(f, i);
                        if (lang->id != i) {
                            /*tex An id mismatch signals a bad format; we heal and warn. */
                            tex_formatted_warning("languages", "undumped language id mismatch: %d <> %d", lang->id, i);
                            lang->id = i;
                        }
                    } else {
                        /*tex
                            NOTE(review): on a failed allocation the dumped fields of this
                            slot are not consumed, so subsequent undumps would read out of
                            sync -- confirm whether the overflow error aborts the run.
                        */
                        tex_overflow_error("languages", i);
                    }
                    tmp[i] = lang;
                } else {
                    tmp[i] = NULL;
                }
            }
            lmt_language_state.language_data.initial = lmt_language_state.language_data.ptr;
        } else {
            tex_overflow_error("languages", top);
            lmt_language_state.language_data.initial = 0;
        }
    } else {
        /*tex Indeed we can have no languages stored. */
        tex_initialize_languages();
    }
}
+
+/*tex All kind of accessors. */
+
+void tex_set_pre_hyphen_char(halfword n, halfword v)
+{
+ struct tex_language *l = tex_get_language(n);
+ if (l) {
+ l->pre_hyphen_char = v;
+ }
+}
+
+void tex_set_post_hyphen_char(halfword n, halfword v)
+{
+ struct tex_language *l = tex_get_language(n);
+ if (l) {
+ l->post_hyphen_char = v;
+ }
+}
+
+void tex_set_pre_exhyphen_char(halfword n, halfword v)
+{
+ struct tex_language *l = tex_get_language(n);
+ if (l) {
+ l->pre_exhyphen_char = v;
+ }
+}
+
+void tex_set_post_exhyphen_char(halfword n, halfword v)
+{
+ struct tex_language *l = tex_get_language(n);
+ if (l) {
+ l->post_exhyphen_char = v;
+ }
+}
+
+halfword tex_get_pre_hyphen_char(halfword n)
+{
+ struct tex_language *l = tex_get_language(n);
+ return l ? l->pre_hyphen_char : -1;
+}
+
+halfword tex_get_post_hyphen_char(halfword n)
+{
+ struct tex_language *l = tex_get_language(n);
+ return l ? l->post_hyphen_char : -1;
+}
+
+halfword tex_get_pre_exhyphen_char(halfword n)
+{
+ struct tex_language *l = tex_get_language(n);
+ return l ? l->pre_exhyphen_char : -1;
+}
+
+halfword tex_get_post_exhyphen_char(halfword n)
+{
+ struct tex_language *l = tex_get_language(n);
+ return (l) ? (int) l->post_exhyphen_char : -1;
+}
+
+void tex_set_hyphenation_min(halfword n, halfword v)
+{
+ struct tex_language *l = tex_get_language(n);
+ if (l) {
+ l->hyphenation_min = v;
+ }
+}
+
+halfword tex_get_hyphenation_min(halfword n)
+{
+ struct tex_language *l = tex_get_language((int) n);
+ return l ? l->hyphenation_min : -1;
+}
+
+void tex_load_patterns(struct tex_language *lang, const unsigned char *buff)
+{
+ if ((! lang) || (! buff) || strlen((const char *) buff) == 0) {
+ return;
+ } else {
+ if (! lang->patterns) {
+ lang->patterns = hnj_dictionary_new();
+ }
+ hnj_dictionary_load(lang->patterns, buff, tracing_hyphenation_par > 0);
+ }
+}
+
+void tex_clear_patterns(struct tex_language *lang)
+{
+ if (lang && lang->patterns) {
+ hnj_dictionary_clear(lang->patterns);
+ }
+}
+
+void tex_load_tex_patterns(halfword curlang, halfword head)
+{
+ char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0);
+ if (s) {
+ tex_load_patterns(tex_get_language(curlang), (unsigned char *) s);
+ }
+}
+
+/*
+ This cleans one word which is returned in |cleaned|, returns the new offset into |buffer|.
+*/
+
/* define tex_isspace(c) ((c) == ' ' || (c) == '\t') */
/*tex Only a real space separates exception words; the argument is parenthesized for macro hygiene. */
# define tex_isspace(c) ((c) == ' ')
+
const char *tex_clean_hyphenation(halfword id, const char *buff, char **cleaned)
{
    /*tex Number of braced discretionary components seen (a valid disc has three). */
    int items = 0;
    /*tex Work buffer for bytes: */
    unsigned char word[max_size_of_word + 1];
    /*tex Work buffer for \UNICODE: */
    unsigned uword[max_size_of_word + 1] = { 0 };
    /*tex The \UNICODE\ buffer value: */
    int i = 0;
    char *uindex = (char *) word;
    const char *s = buff;
    /*tex Copy one whitespace delimited word into the byte buffer. */
    while (*s && ! tex_isspace((unsigned char)*s)) {
        word[i++] = (unsigned char) *s;
        s++;
        if ((s-buff) > max_size_of_word) {
            /*tex Todo: this is too strict, should count \UNICODE, not bytes. */
            *cleaned = NULL;
            tex_handle_error(
                normal_error_type,
                "Exception too long",
                NULL
            );
            return s;
        }
    }
    /*tex Now convert the input to \UNICODE. */
    word[i] = '\0';
    aux_splitutf2uni(uword, (const char *)word);
    /*tex
        Build the new word string. The hjcode values < 32 indicate a length, so that for
        instance giving a ligature glyph an |\hjcode| of 2 makes that ligature count as
        two characters.
    */
    i = 0;
    while (uword[i] > 0) {
        int u = uword[i++];
        if (u == '-') {
            /*tex Skip: a plain hyphen just marks a break point. */
        } else if (u == '=') {
            /*tex An |=| stands for the (possibly hj coded) hyphen character itself. */
            unsigned c = tex_get_hj_code(id, '-');
            uindex = aux_uni2string(uindex, (! c || c <= 32) ? '-' : c);
        } else if (u == '{') {
            /*tex
                An embedded discretionary |{pre}{post}{replace}|: only the replace text
                ends up in the cleaned word; pre and post are merely counted and skipped.
            */
            u = uword[i++];
            items = 0;
            while (u && u != '}') {
                u = uword[i++];
            }
            if (u == '}') {
                items++;
                u = uword[i++];
            }
            while (u && u != '}') {
                u = uword[i++];
            }
            if (u == '}') {
                items++;
                u = uword[i++];
            }
            if (u == '{') {
                u = uword[i++];
            }
            while (u && u != '}') {
                unsigned c = tex_get_hj_code(id, u);
                uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
                u = uword[i++];
            }
            if (u == '}') {
                items++;
            }
            if (items != 3) {
                /* hm, we intercept that elsewhere in a better way so why here? Best remove the test here or move the other one here. */
                *cleaned = NULL;
                tex_handle_error(
                    normal_error_type,
                    "Exception syntax error, a discretionary has three components: {}{}{}.",
                    NULL
                );
                return s;
            } else {
                /* skip replacement (chars) */
                if (uword[i] == '(') {
                    while (uword[++i] && uword[i] != ')') { };
                    if (uword[i] != ')') {
                        tex_handle_error(
                            normal_error_type,
                            "Exception syntax error, an alternative replacement is defined as (text).",
                            NULL
                        );
                        return s;
                    } else if (uword[i]) {
                        i++;
                    }
                }
                /* skip penalty: [digit] but we intercept multiple digits */
                if (uword[i] == '[') {
                    if (uword[i+1] && uword[i+1] >= '0' && uword[i+1] <= '9' && uword[i+2] && uword[i+2] == ']') {
                        i += 3;
                    } else {
                        tex_handle_error(
                            normal_error_type,
                            "Exception syntax error, a penalty is defined as [digit].",
                            NULL
                        );
                        return s;
                    }
                }
            }
        } else {
            /*tex A normal character: store it, mapped through the hj codes. */
            unsigned c = tex_get_hj_code(id, u);
            uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
        }
    }
    *uindex = '\0';
    /*tex The caller owns (and frees) the duplicated cleaned word. */
    *cleaned = lmt_memory_strdup((char *) word);
    return s;
}
+
/*tex
    Load a whitespace separated list of exception words into the per-language \LUA\ table,
    keyed by the cleaned word; the value is the original pattern including hyphen markup.
*/

void tex_load_hyphenation(struct tex_language *lang, const unsigned char *buff)
{
    if (lang) {
        lua_State *L = lmt_lua_state.lua_instance;
        const char *s = (const char *) buff;
        char *cleaned = NULL;
        int id = lang->id;
        if (lang->exceptions == 0) {
            /*tex First load: create the exception table and anchor it in the registry. */
            lua_newtable(L);
            lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
        }
        lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
        while (*s) {
            while (tex_isspace((unsigned char) *s)) {
                s++;
            }
            if (*s) {
                const char *value = s;
                s = tex_clean_hyphenation(id, s, &cleaned);
                if (cleaned) {
                    size_t len = s - value;
                    if (len > 0) {
                        /*tex In effect: |exceptions[cleaned] = value|. */
                        lua_pushstring(L, cleaned);
                        lua_pushlstring(L, value, len);
                        lua_rawset(L, -3);
                    }
                    lmt_memory_free(cleaned);
                } else {
                    /* tex_formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value); */
                }
            }
        }
        /*tex
            NOTE(review): the exceptions table pushed above is not popped here, so the
            \LUA\ stack ends one deeper than it started -- confirm that callers rebalance.
        */
    }
}
+
+void tex_clear_hyphenation(struct tex_language *lang)
+{
+ if (lang && lang->exceptions != 0) {
+ lua_State *L = lmt_lua_state.lua_instance;
+ luaL_unref(L, LUA_REGISTRYINDEX, lang->exceptions);
+ lang->exceptions = 0;
+ }
+}
+
+void tex_load_tex_hyphenation(halfword curlang, halfword head)
+{
+ char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0);
+ if (s) {
+ tex_load_hyphenation(tex_get_language(curlang), (unsigned char *) s);
+ }
+}
+
/*tex
    Insert a discretionary after |t| with the given |pre|, |post| and |replace| node lists
    and |penalty|. When |t| itself is passed as |replace|, the disc node takes |t|'s place
    in the main list and |t| moves into the replacement. The new disc inherits |t|'s
    attribute list. Returns the disc node.
*/

static halfword tex_aux_insert_discretionary(halfword t, halfword pre, halfword post, halfword replace, quarterword subtype, int penalty)
{
    /*tex For compound words following explicit hyphens we take the current font. */
    halfword d = tex_new_disc_node(subtype);
    halfword a = node_attr(t) ;
    disc_penalty(d) = penalty;
    if (t == replace) {
        /*tex We have |prev disc next-next|. */
        tex_try_couple_nodes(d, node_next(t));
        tex_try_couple_nodes(node_prev(t), d);
        node_prev(t) = null;
        node_next(t) = null;
        replace = t;
    } else {
        /*tex We have |prev disc next|. */
        tex_try_couple_nodes(d, node_next(t));
        tex_couple_nodes(t, d);
    }
    if (a) {
        tex_attach_attribute_list_attribute(d, a);
    }
    tex_set_disc_field(d, pre_break_code, pre);
    tex_set_disc_field(d, post_break_code, post);
    tex_set_disc_field(d, no_break_code, replace);
    return d;
}
+
/*tex
    Insert a syllable discretionary after glyph |t|, carrying |\hyphenpenalty| and using the
    language specific pre and post hyphen characters (when set), rendered in |t|'s font and
    inheriting |t|'s attributes. Returns the new disc node.
*/

static halfword tex_aux_insert_syllable_discretionary(halfword t, lang_variables *lan)
{
    halfword n = tex_new_disc_node(syllable_discretionary_code);
    disc_penalty(n) = hyphen_penalty_par;
    tex_couple_nodes(n, node_next(t));
    tex_couple_nodes(t, n);
    tex_attach_attribute_list_attribute(n, get_attribute_list(t));
    if (lan->pre_hyphen_char > 0) {
        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->pre_hyphen_char, t);
        tex_set_disc_field(n, pre_break_code, g);
    }
    if (lan->post_hyphen_char > 0) {
        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->post_hyphen_char, t);
        tex_set_disc_field(n, post_break_code, g);
    }
    return n;
}
+
/*tex
    Turn an explicit (compound word) hyphen |chr| at glyph |t| into an automatic
    discretionary. For the |\exhyphenchar| the language specific pre/post explicit hyphen
    characters are honoured; any other hyphen character becomes its own pre text. Returns
    the new disc node.
*/

static halfword tex_aux_compound_word_break(halfword t, halfword clang, halfword chr)
{
    halfword prechar, postchar, pre, post, disc;
    if (chr == ex_hyphen_char_par) {
        halfword pre_exhyphen_char = tex_get_pre_exhyphen_char(clang);
        halfword post_exhyphen_char = tex_get_post_exhyphen_char(clang);
        prechar = pre_exhyphen_char > 0 ? pre_exhyphen_char : ex_hyphen_char_par;
        postchar = post_exhyphen_char > 0 ? post_exhyphen_char : null;
    } else {
        /* we need a flag : use pre/post cf language spec */
        prechar = chr;
        postchar = null;
    }
    pre = prechar > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), prechar, t) : null;
    post = postchar > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), postchar, t) : null;
    disc = tex_aux_insert_discretionary(t, pre, post, t, automatic_discretionary_code, tex_automatic_disc_penalty(glyph_hyphenate(t)));
    return disc;
}
+
/*tex
    Look up word |w| in the exception table referenced by registry index |exceptions|.
    When found, a freshly allocated copy of the stored pattern is returned and the caller
    owns (and frees) it; otherwise |NULL|. The \LUA\ stack is left balanced.
*/

static char *tex_aux_hyphenation_exception(int exceptions, char *w)
{
    lua_State *L = lmt_lua_state.lua_instance;
    char *ret = NULL;
    if (lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions) == LUA_TTABLE) {
        /*tex Word table: */
        lua_pushstring(L, w);
        lua_rawget(L, -2);
        if (lua_type(L, -1) == LUA_TSTRING) {
            ret = lmt_memory_strdup(lua_tostring(L, -1));
        }
        /*tex Pop the looked up value and the table. */
        lua_pop(L, 2);
    } else {
        /*tex Pop whatever non-table the registry slot held. */
        lua_pop(L, 1);
    }
    return ret;
}
+
+/*tex Kept as reference: */
+
+/*
+char *get_exception_strings(struct tex_language *lang)
+{
+ char *ret = NULL;
+ if (lang && lang->exceptions) {
+ lua_State *L = lua_state.lua_instance;
+ if (lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions) == LUA_TTABLE) {
+ size_t size = 0;
+ size_t current = 0;
+ lua_pushnil(L);
+ while (lua_next(L, -2)) {
+ size_t l = 0;
+ const char *value = lua_tolstring(L, -1, &l);
+ if (current + l + 2 > size) {
+ size_t new = (size + size/5) + current + l + 1024;
+ char *tmp = lmt_memory_realloc(ret, new);
+ if (tmp) {
+ ret = tmp;
+ size = new;
+ } else {
+ overflow_error("exceptions", (int) size);
+ }
+ }
+ if (ret) {
+ ret[current] = ' ';
+ strcpy(&ret[current + 1], value);
+ current += l + 1;
+ }
+ lua_pop(L, 1);
+ }
+ }
+ }
+ return ret;
+}
+*/
+
+/*tex
+
+ The sequence from |wordstart| to |r| can contain only normal characters it could be faster to
+ modify a halfword pointer and return an integer
+
+*/
+
+# define zws 0x200B /* zero width space makes no sense */
+# define zwnj 0x200C
+# define zwj 0x200D
+
/*tex
    Convert one delimited exception part starting at offset |*j| in |uword| into a list of
    glyph nodes that inherit font and attributes from |parent|. Zero width joiners do not
    produce glyphs but set no-ligature (|zwj|) respectively no-ligature plus no-kern
    (|zwnj|) options on the surrounding glyphs. On return |*j| sits just past the |final|
    delimiter (|}| for braced parts, |)| for alternative replacements). Returns the head of
    the list (or |null| for an empty part).
*/

static halfword tex_aux_find_exception_part(unsigned int *j, unsigned int *uword, int len, halfword parent, char final)
{
    halfword head = null;
    halfword tail = null;
    unsigned i = *j;
    int noligature = 0;
    int nokerning = 0;
    /*tex This puts uword[i] on the |{|. */
    i++;
    while (i < (unsigned) len && uword[i + 1] != (unsigned int) final) {
        if (tail) {
            switch (uword[i + 1]) {
                case zwj:
                    /*tex Suppress the ligature between the neighbouring glyphs. */
                    noligature = 1;
                    nokerning = 0;
                    break;
                case zwnj:
                    /*tex Suppress both ligature and kern between the neighbouring glyphs. */
                    noligature = 1;
                    nokerning = 1;
                    break;
                default:
                    {
                        halfword s = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent); /* todo: data */
                        tex_couple_nodes(tail, s);
                        if (noligature) {
                            tex_add_glyph_option(tail, glyph_option_no_right_ligature);
                            tex_add_glyph_option(s, glyph_option_no_left_ligature);
                            noligature = 0;
                        }
                        if (nokerning) {
                            tex_add_glyph_option(tail, glyph_option_no_right_kern);
                            tex_add_glyph_option(s, glyph_option_no_left_kern);
                            nokerning = 0;
                        }
                        tail = node_next(tail);
                        break;
                    }
            }
        } else {
            /*tex The first character starts the list. */
            head = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent); /* todo: data */
            tail = head;
        }
        i++;
    }
    *j = ++i;
    return head;
}
+
+static int tex_aux_count_exception_part(unsigned int *j, unsigned int *uword, int len)
+{
+ int n = 0;
+ unsigned i = *j;
+ /*tex This puts uword[i] on the |{|. */
+ i++;
+ while (i < (unsigned) len && uword[i + 1] != '}') {
+ n++;
+ i++;
+ }
+ *j = ++i;
+ return n;
+}
+
/*tex
    Report a malformed |{}{}{}| exception; |part| names the offending component and is
    interpolated into the format string by the error handler, followed by the help text.
*/

static void tex_aux_show_exception_error(const char *part)
{
    tex_handle_error(
        normal_error_type,
        "Invalid %s part in exception",
        part,
        "Exception discretionaries should contain three pairs of braced items.\n"
        "No intervening spaces are allowed."
    );
}
+
+/*tex
+
+ The exceptions are taken as-is: no min values are taken into account. One can add normal
+ patterns on-the-fly if needed.
+
+*/
+
+static void tex_aux_do_exception(halfword wordstart, halfword r, char *replacement)
+{
+ halfword t = wordstart;
+ lang_variables langdata;
+ unsigned uword[max_size_of_word + 1] = { 0 };
+ unsigned len = aux_splitutf2uni(uword, replacement);
+ int clang = get_glyph_language(wordstart);
+ langdata.pre_hyphen_char = tex_get_pre_hyphen_char(clang);
+ langdata.post_hyphen_char = tex_get_post_hyphen_char(clang);
+ for (unsigned i = 0; i < len; i++) {
+ if (uword[i + 1] == 0 ) {
+ /*tex We ran out of the exception pattern. */
+ break;
+ } else if (uword[i + 1] == '-') {
+ /*tex A hyphen follows. */
+ if (node_next(t) == r) {
+ break;
+ } else {
+ tex_aux_insert_syllable_discretionary(t, &langdata);
+ /*tex Skip the new disc */
+ t = node_next(t);
+ }
+ } else if (uword[i + 1] == '=') {
+ /*tex We skip a disc. */
+ t = node_next(t);
+ } else if (uword[i + 1] == '{') {
+ /*tex We ran into an exception |{}{}{}| or |{}{}{}[]|. */
+ halfword pre = null;
+ halfword post = null;
+ halfword replace = null;
+ int count = 0;
+ int alternative = null;
+ halfword penalty;
+ /*tex |pre| */
+ pre = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
+ if (i == len || uword[i + 1] != '{') {
+ tex_aux_show_exception_error("pre");
+ }
+ /*tex |post| */
+ post = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
+ if (i == len || uword[i + 1] != '{') {
+ tex_aux_show_exception_error("post");
+ }
+ /*tex |replace| */
+ count = tex_aux_count_exception_part(&i, uword, (int) len);
+ if (i == len) {
+ tex_aux_show_exception_error("replace");
+ } else if (uword[i] && uword[i + 1] == '(') {
+ alternative = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, ')');;
+ }
+ /*tex Play safe. */
+ if (node_next(t) == r) {
+ break;
+ } else {
+ /*tex Let's deal with an (optional) replacement. */
+ if (count > 0) {
+ /*tex Assemble the replace stream. */
+ halfword q = t;
+ replace = node_next(q);
+ while (count > 0 && q) {
+ halfword t = node_type(q);
+ q = node_next(q);
+ if (t == glyph_node || t == disc_node) {
+ count--;
+ } else {
+ break ;
+ }
+ }
+ /*tex Remove it from the main stream */
+ tex_try_couple_nodes(t, node_next(q));
+ /*tex and finish it in the replace. */
+ node_next(q) = null;
+ if (alternative) {
+ tex_flush_node_list(replace);
+ replace = alternative;
+ } else {
+ /*tex Sanitize the replace stream (we could use the flattener instead). */
+ q = replace ;
+ while (q) {
+ halfword n = node_next(q);
+ if (node_type(q) == disc_node) {
+ /*tex Beware: the replacement starts after the no_break pointer. */
+ halfword nb = disc_no_break_head(q);
+ disc_no_break_head(q) = null;
+ node_prev(nb) = null ; /* used at all? */
+ /*tex Insert the replacement glyph. */
+ if (q == replace) {
+ replace = nb;
+ } else {
+ tex_try_couple_nodes(node_prev(q), nb);
+ }
+ /*tex Append the glyph (one). */
+ tex_try_couple_nodes(nb, n);
+ /*tex Flush the disc. */
+ tex_flush_node(q);
+ }
+ q = n ;
+ }
+ }
+ }
+ /*tex Let's check if we have a penalty spec. If we have more then we're toast, we just ignore them. */
+ if (uword[i] && uword[i + 1] == '[') {
+ i += 2;
+ if (uword[i] && uword[i] >= '0' && uword[i] <= '9') {
+ if (exception_penalty_par > 0) {
+ if (exception_penalty_par > infinite_penalty) {
+ penalty = exception_penalty_par;
+ } else {
+ penalty = (uword[i] - '0') * exception_penalty_par ;
+ }
+ } else {
+ penalty = hyphen_penalty_par;
+ }
+ ++i;
+ while (uword[i] && uword[i] != ']') {
+ ++i;
+ }
+ } else {
+ penalty = hyphen_penalty_par;
+ }
+ } else {
+ penalty = hyphen_penalty_par;
+ }
+ /*tex And now we insert a disc node (this was |syllable_discretionary_code|). */
+ t = tex_aux_insert_discretionary(t, pre, post, replace, normal_discretionary_code, penalty);
+ /*tex We skip the new disc node. */
+ t = node_next(t);
+ /*tex
+ We need to check if we have two discretionaries in a row, test case: |\hyphenation
+ {a{>}{<}{b}{>}{<}{c}de} \hsize 1pt abcde \par| which gives |a> <> <de|.
+ */
+ if (uword[i] && uword[i + 1] == '{') {
+ i--;
+ t = node_prev(t); /*tex Tricky! */
+ }
+ }
+ } else {
+ t = node_next(t);
+ }
+ /*tex Again we play safe. */
+ if (! t || node_next(t) == r) {
+ break;
+ }
+ }
+}
+
+/*tex
+
+ The following description is no longer valid for \LUATEX. Although we use the same algorithm
+ for hyphenation, it is not integrated in the par builder. Instead it is a separate run over
+ the node list, preceding the line-breaking routine, possibly replaced by a callback. We keep
+ the description here because the principles remain.
+
+ \startnarrower
+
+ When the line-breaking routine is unable to find a feasible sequence of breakpoints, it makes
+ a second pass over the paragraph, attempting to hyphenate the hyphenatable words. The goal of
+ hyphenation is to insert discretionary material into the paragraph so that there are more
+ potential places to break.
+
+ The general rules for hyphenation are somewhat complex and technical, because we want to be
+ able to hyphenate words that are preceded or followed by punctuation marks, and because we
+ want the rules to work for languages other than English. We also must contend with the fact
+ that hyphens might radically alter the ligature and kerning structure of a word.
+
+ A sequence of characters will be considered for hyphenation only if it belongs to a \quotation
+ {potentially hyphenatable part} of the current paragraph. This is a sequence of nodes $p_0p_1
+ \ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are either character or ligature
+ or whatsit or implicit kern nodes, and $p_m$ is a glue or penalty or insertion or adjust or
+ mark or whatsit or explicit kern node. (Therefore hyphenation is disabled by boxes, math
+ formulas, and discretionary nodes already inserted by the user.) The ligature nodes among $p_1
+ \ldots p_{m-1}$ are effectively expanded into the original non-ligature characters; the kern
+ nodes and whatsits are ignored. Each character |c| is now classified as either a nonletter (if
+ |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an uppercase letter (otherwise); an
+ uppercase letter is treated as if it were |lc_code(c)| for purposes of hyphenation. The
+ characters generated by $p_1\ldots p_{m-1}$ may begin with nonletters; let $c_1$ be the first
+ letter that is not in the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a
+ whatsit found after $c_1$ will be the terminating node $p_m$. All characters that do not have
+ the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for that font must be
+ between 0 and 255, otherwise hyphenation will not be attempted. \TeX\ looks ahead for as many
+ consecutive letters $c_1\ldots c_n$ as possible; however, |n| must be less than 64, so a
+ character that would otherwise be $c_{64}$ is effectively not a letter. Furthermore $c_n$ must
+ not be in the middle of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$
+ that are generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|, this
+ string qualifies for hyphenation; however, |uc_hyph| must be positive, if $c_1$ is uppercase.
+
+ The hyphenation process takes place in three stages. First, the candidate sequence $c_1 \ldots
+ c_n$ is found; then potential positions for hyphens are determined by referring to hyphenation
+ tables; and finally, the nodes $p_a\ldots p_b$ are replaced by a new sequence of nodes that
+ includes the discretionary breaks found.
+
+ Fortunately, we do not have to do all this calculation very often, because of the way it has
+ been taken out of \TEX's inner loop. For example, when the second edition of the author's
+ 700-page book {\sl Seminumerical Algorithms} was typeset by \TEX, only about 1.2 hyphenations
+ needed to be tried per paragraph, since the line breaking algorithm needed to use two passes on
+ only about 5 per cent of the paragraphs. (This is not true in \LUATEX: we always hyphenate the
+ whole list.)
+
    When a word has been set up to contain a candidate for hyphenation, \TEX\ first looks to see if it
+ is in the user's exception dictionary. If not, hyphens are inserted based on patterns that
+ appear within the given word, using an algorithm due to Frank~M. Liang.
+
+ \stopnarrower
+
+ This is incompatible with \TEX\ because the first word of a paragraph can be hyphenated, but
+ most European users seem to agree that prohibiting hyphenation there was not the best idea ever.
+
+ To be documented: |\hyphenationmode| (a bit set).
+
+ \startbuffer
+ \parindent0pt \hsize=1.1cm
+ 12-34-56 \par
+ 12-34-\hbox{56} \par
+ 12-34-\vrule width 1em height 1.5ex \par
+ 12-\hbox{34}-56 \par
+ 12-\vrule width 1em height 1.5ex-56 \par
+ \hjcode`\1=`\1 \hjcode`\2=`\2 \hjcode`\3=`\3 \hjcode`\4=`\4 \vskip.5cm
+ 12-34-56 \par
+ 12-34-\hbox{56} \par
+ 12-34-\vrule width 1em height 1.5ex \par
+ 12-\hbox{34}-56 \par
+ 12-\vrule width 1em height 1.5ex-56 \par
+ \stopbuffer
+
+ \typebuffer
+
    \startpacked \getbuffer \stoppacked
+
+ We only accept an explicit hyphen when there is a preceding glyph and we skip a sequence of
+ explicit hyphens as that normally indicates a \type {--} or \type {---} ligature in which case
+ we can in a worse case usage get bad node lists later on due to messed up ligature building as
+ these dashes are ligatures in base fonts. This is a side effect of the separating the
+ hyphenation, ligaturing and kerning steps. A test is cmr with \type {------}.
+
+ A font handler can collapse successive hyphens but it's not nice to put the burden there. A
+ somewhat messy border case is \type {----} but in \LUATEX\ we don't treat \type {--} and \type
+ {---} special. Also, traditional \TEX\ will break a line at \type {-foo} but this can be
+ disabled by setting the automatic mode to \type {1}.
+
+*/
+
+// # define is_hyphen_char(chr) (get_hc_code(chr) || chr == ex_hyphen_char_par)
+
+inline static halfword tex_aux_is_hyphen_char(halfword chr)
+{
+ if (tex_get_hc_code(chr)) {
+ return tex_get_hc_code(chr);
+ } else if (chr == ex_hyphen_char_par) {
+ return ex_hyphen_char_par;
+ } else {
+ return null;
+ }
+}
+
/*tex
    Skip ahead from |r| to the first glyph that can start a word to be hyphenated: its
    language is at least |first_language|, hyphenation is enabled for it and its hj code
    permits a word start. Glue and word boundaries (re)enable a start, most other nodes
    disable one, and inline math is skipped as a whole. Explicit hyphens followed by a
    normal glyph get compound word treatment along the way. Returns |null| when the end of
    the list is reached without a start.
*/

static halfword tex_aux_find_next_wordstart(halfword r, halfword first_language)
{
    int start_ok = 1;
    int mathlevel = 1;
    halfword lastglyph = r;
    while (r) {
        switch (node_type(r)) {
            case boundary_node:
                if (node_subtype(r) == word_boundary) {
                    start_ok = 1;
                }
                break;
            case disc_node:
                start_ok = has_disc_option(r, disc_option_post_word);
                break;
            case hlist_node:
            case vlist_node:
            case rule_node:
            case dir_node:
            case whatsit_node:
                /*tex In strict mode such nodes block a word start. */
                if (hyphenation_permitted(glyph_hyphenate(lastglyph), strict_start_hyphenation_mode)) {
                    start_ok = 0;
                }
                break;
            case glue_node:
                start_ok = 1;
                break;
            case math_node:
                /*tex
                    Skip to the end of the inline formula. NOTE(review): |mathlevel| is not
                    reset to 1 afterwards, so a second formula encountered in the same call
                    appears not to be skipped -- confirm whether that is intended.
                */
                while (mathlevel > 0) {
                    r = node_next(r);
                    if (! r) {
                        return r;
                    } else if (node_type(r) == math_node) {
                        if (node_subtype(r) == begin_inline_math) {
                            mathlevel++;
                        } else {
                            mathlevel--;
                        }
                    }
                }
                break;
            case glyph_node:
                {
                    /*tex
                        When we have no word yet and meet a hyphen (equivalent) we should just
                        keep going. This is not compatible but it does make sense.
                    */
                    int chr = glyph_character(r);
                    int hyp = tex_aux_is_hyphen_char(chr);
                    lastglyph = r;
                    if (hyp) {
                        if (hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
                            /* maybe some tracing */
                        } else {
                            /* todo: already check if we have hj chars left/right i.e. no digits and minus mess */
                            halfword t = node_next(r) ;
                            /*tex Kind of weird that we have the opposite flag test here. */
                            if (t && (node_type(t) == glyph_node) && (! tex_aux_is_hyphen_char(glyph_character(t))) && ! hyphenation_permitted(glyph_hyphenate(r), automatic_hyphenation_mode)) {
                                /*tex We have no word yet and the next character is a non hyphen. */
                                r = tex_aux_compound_word_break(r, get_glyph_language(r), hyp);
                                // test case: \automatichyphenmode0 10\high{-6-1-2-4}
                                start_ok = 1; // todo: also in luatex
                            } else {
                                /*tex We jump over the sequence of hyphens. */
                                while (t && (node_type(t) == glyph_node) && tex_aux_is_hyphen_char(glyph_character(t))) {
                                    r = t ;
                                    t = node_next(r) ;
                                }
                                if (t) {
                                    /*tex We need a restart. */
                                    start_ok = 0;
                                } else {
                                    /*tex We reached the end of the list so we have no word start. */
                                    return null;
                                }
                            }
                        }
                    } else if (start_ok && (get_glyph_language(r) >= first_language) && get_glyph_dohyph(r)) {
                        /*tex A candidate: its hj code decides if it really starts a word. */
                        int l = tex_get_hj_code(get_glyph_language(r), chr);
                        if (l > 0) {
                            if (l == chr || l <= 32 || get_glyph_uchyph(r)) {
                                return r;
                            } else {
                                start_ok = 0;
                            }
                        } else {
                            /*tex We go on. */
                        }
                    } else {
                        /*tex We go on. */
                    }
                }
                break;
            default:
                start_ok = 0;
                break;
        }
        r = node_next(r);
    }
    return r; /* null */
}
+
+/*tex
+
+ This is the original test, extended with bounds, but still the complex expression turned into
 a function. However, it actually is part of the old mechanism where hyphenation was mixed
 with ligature building and kerning, so there was this skipping over a font kern which is no
 longer needed as we have separate steps.
+
+ We keep this as reference:
+
+ \starttyping
+ static int valid_wordend(halfword s, halfword strict_bound)
+ {
+ if (s) {
+ halfword r = s;
+ int clang = get_glyph_language(s);
+ while ( (r) &&
+ ( (type(r) == glyph_node && clang == get_glyph_language(r))
+ || (type(r) == kern_node && (subtype(r) == font_kern))
+ )
+ ) {
+ r = node_next(r);
+ }
+ return (! r || (type(r) == glyph_node && clang != get_glyph_language(r))
+ || type(r) == glue_node
+ || type(r) == penalty_node
+ || (type(r) == kern_node && (subtype(r) == explicit_kern ||
+ subtype(r) == italic_kern ||
+ subtype(r) == accent_kern ))
+ || ((type(r) == hlist_node ||
+ type(r) == vlist_node ||
+ type(r) == rule_node ||
+ type(r) == dir_node ||
+ type(r) == whatsit_node ||
+ type(r) == insert_node ||
+ type(r) == adjust_node
+ ) && ! (strict_bound == 2 || strict_bound == 3))
+ || type(r) == boundary_node
+ );
+ } else {
+ return 1;
+ }
+ }
+ \stopttyping
+
+*/
+
+static int tex_aux_valid_wordend(halfword end_word, halfword r)
+{
+ if (r) {
+ switch (node_type(r)) {
+ // case glyph_node:
+ // case glue_node:
+ // case penalty_node:
+ // case kern_node:
+ // return 1;
+ case disc_node:
+ return has_disc_option(r, disc_option_pre_word);
+ case hlist_node:
+ case vlist_node:
+ case rule_node:
+ case dir_node:
+ case whatsit_node:
+ case insert_node:
+ case adjust_node:
+ return ! hyphenation_permitted(glyph_hyphenate(end_word), strict_end_hyphenation_mode);
+ }
+ }
+ return 1;
+}
+
+void tex_handle_hyphenation(halfword head, halfword tail)
+{
+ if (head && node_next(head)) {
+ int callback_id = lmt_callback_defined(hyphenate_callback);
+ if (callback_id > 0) {
+ lua_State *L = lmt_lua_state.lua_instance;
+ int top = 0;
+ if (lmt_callback_okay(L, callback_id, &top)) {
+ int i;
+ lmt_node_list_to_lua(L, head);
+ lmt_node_list_to_lua(L, tail);
+ i = lmt_callback_call(L, 2, 0, top);
+ if (i) {
+ lmt_callback_error(L, top, i);
+ } else {
+ lmt_callback_wrapup(L, top);
+ }
+ }
+ } else if (callback_id == 0) {
+ tex_hyphenate_list(head, tail);
+ } else {
+ /* -1 : disabled */
+ }
+ }
+}
+
/*tex
    Run Liang-style pattern matching over the glyph range |first..last| (|length| characters)
    using the packed trie in |dict|, and insert syllable discretionaries between |left| and
    |right| (the range trimmed by lefthyphenmin/righthyphenmin). Returns the number of
    discretionaries inserted. The |begin_period| and |end_period| sentinel nodes are spliced
    in temporarily so patterns anchored with |.| can match.
*/

static int tex_aux_hnj_hyphen_hyphenate(
    hjn_dictionary *dict,
    halfword first,
    halfword last,
    int length,
    halfword left,
    halfword right,
    lang_variables *lan
)
{
    /*tex +2 for dots at each end, +1 for points outside characters. */
    int ext_word_len = length + 2;
    int hyphen_len = ext_word_len + 1;
    /*tex Because we have a limit of 64 characters we could just use a static array here: */
    char *hyphens = lmt_memory_calloc(hyphen_len, sizeof(unsigned char));
    if (hyphens) {
        halfword here;
        int state = 0;
        int char_num = 0;
        int done = 0;
        /*tex Add a '.' to beginning and end to facilitate matching. */
        node_next(begin_period) = first;
        node_next(end_period) = node_next(last);
        node_next(last) = end_period;

        // for (int i = 0; i < hyphen_len; i++) {
        //     hyphens[i] = '0';
        // }
        // hyphens[hyphen_len] = 0;

        /*tex Now, run the finite state machine. */
        for (char_num = 0, here = begin_period; here != node_next(end_period); here = node_next(here)) {
            int ch;
            if (here == begin_period || here == end_period) {
                ch = '.';
            } else {
                /*tex Match on the hj code; codes |<= 32| mean: use the character itself. */
                ch = tex_get_hj_code(get_glyph_language(here), glyph_character(here));
                if (ch <= 32) {
                    ch = glyph_character(here);
                }
            }
            while (state != -1) {
                hjn_state *hstate = &dict->states[state];
                for (int k = 0; k < hstate->num_trans; k++) {
                    if (hstate->trans[k].uni_ch == ch) {
                        char *match;
                        state = hstate->trans[k].new_state;
                        match = dict->states[state].match;
                        if (match) {
                            /*tex
                                We add +2 because 1 string length is one bigger than offset and 1
                                hyphenation starts before first character.

                                Why not store the length in states[state] instead of calculating
                                it each time? Okay, performance is okay but still ...
                            */
                            int offset = (int) (char_num + 2 - (int) strlen(match));
                            /*tex Keep the highest priority digit at each position. */
                            for (int m = 0; match[m]; m++) {
                                if (hyphens[offset + m] < match[m]) {
                                    hyphens[offset + m] = match[m];
                                }
                            }
                        }
                        goto NEXTLETTER;
                    }
                }
                /*tex No transition for |ch|: fall back to a shorter context. */
                state = hstate->fallback_state;
            }
            /*tex Nothing worked, let's go to the next character. */
            state = 0;
          NEXTLETTER:;
            char_num++;
        }
        /*tex Restore the correct pointers. */
        node_next(last) = node_next(end_period);
        /*tex
            Pattern is |.word.| and |word_len| is 4, |ext_word_len| is 6 and |hyphens| is 7; drop first
            two and stop after |word_len-1|.
        */
        for (here = first, char_num = 2; here != left; here = node_next(here)) {
            char_num++;
        }
        for (; here != right; here = node_next(here)) {
            /*tex Odd values mark valid hyphenation points. */
            if (hyphens[char_num] & 1) {
                here = tex_aux_insert_syllable_discretionary(here, lan);
                done += 1;
            }
            char_num++;
        }
        lmt_memory_free(hyphens);
        return done;
    } else {
        tex_overflow_error("patterns", hyphen_len);
        return 0;
    }
}
+
+/* we can also check the original */
+
/*tex
    Sanity check before a word handler replacement is applied: verify that the node range
    |f..l| (followed by |r|) still consists of exactly |n| glyph nodes whose characters match
    the recorded |utf8original| word. Returns 1 when everything matches, 0 (with a warning)
    otherwise.
*/

static int tex_aux_still_okay(halfword f, halfword l, halfword r, int n, const char *utf8original) {
    if (_valid_node_(f) && _valid_node_(l) && node_next(l) == r) {
        int i = 0;
        while (f) {
            ++i;
            if (node_type(f) != glyph_node) {
                tex_normal_warning("language", "the hyphenated word contains non-glyphs, skipping");
                return 0;
            } else {
                /*tex Decode the next character from the recorded original word. */
                halfword c = (halfword) aux_str2uni((const unsigned char *) utf8original);
                utf8original += utf8_size(c);
                if (! (c && c == glyph_character(f))) {
                    tex_normal_warning("language", "the hyphenated word contains different characters, skipping");
                    return 0;
                } else if (f != l) {
                    f = node_next(f);
                } else if (i == n) {
                    /*tex We hit the last node with the expected character count: all is well. */
                    return 1;
                } else {
                    tex_normal_warning("language", "the hyphenated word changed length, skipping");
                    return 0;
                }
            }
        }
    }
    /*tex Either the range was invalid or the list ended before we reached |l|. */
    tex_normal_warning("language", "the hyphenation list is messed up, skipping");
    return 0;
}
+
+static void tex_aux_hyphenate_show(halfword beg, halfword end)
+{
+ if (_valid_node_(beg) && _valid_node_(end)) {
+ halfword nxt = node_next(end);
+ node_next(end) = null;
+ tex_show_node_list(beg, 100, 10000);
+ node_next(end) = nxt;
+ }
+}
+
+/* maybe split: first a processing run */
+
+inline static int is_traditional_hyphen(halfword n)
+{
+ return (
+ (glyph_character(n) == ex_hyphen_char_par) /*tex parameter */
+ && (has_font_text_control(glyph_font(n),text_control_collapse_hyphens)) /*tex font driven */
+ && (hyphenation_permitted(glyph_hyphenate(n), collapse_hyphenation_mode)) /*tex language driven */
+ );
+}
+
/*tex
    Collapse runs of explicit hyphens in |head|: |---| becomes the character |c3| (em dash),
    |--| becomes |c2| (en dash) and a single |-| becomes |c1|. Discretionary sub-lists are
    processed recursively. Returns nonzero when at least one run in the top-level list was
    collapsed.

    NOTE(review): collapses inside disc sub-lists refresh the disc field but do not set
    |found| — confirm whether callers rely on the return value only for the top level.
*/

int tex_collapse_list(halfword head, halfword c1, halfword c2, halfword c3) /* ex_hyphen_char_par 0x2013 0x2014 */
{
    /*tex Let's play safe: */
    halfword found = 0;
    if (head && c1 && c2 && c3) {
        halfword n1 = head;
        while (n1) {
            halfword n2 = node_next(n1);
            switch (node_type(n1)) {
                case glyph_node:
                    if (is_traditional_hyphen(n1)) {
                        set_glyph_discpart(n1, glyph_discpart_always);
                        /*tex Two (or three) successive hyphens in the same font collapse. */
                        if (n2 && node_type(n2) == glyph_node && is_traditional_hyphen(n2) && glyph_font(n1) == glyph_font(n2)) {
                            halfword n3 = node_next(n2);
                            if (n3 && node_type(n3) == glyph_node && is_traditional_hyphen(n3) && glyph_font(n1) == glyph_font(n3)) {
                                /*tex Three hyphens: keep |n1| as |c3|, drop |n2| and |n3|. */
                                halfword n4 = node_next(n3);
                                glyph_character(n1) = c3;
                                tex_try_couple_nodes(n1, n4);
                                tex_flush_node(n2);
                                tex_flush_node(n3);
                                n1 = n4;
                            } else {
                                /*tex Two hyphens: keep |n1| as |c2|, drop |n2|. */
                                glyph_character(n1) = c2;
                                tex_try_couple_nodes(n1, n3);
                                tex_flush_node(n2);
                                n1 = n3;
                            }
                            found = 1;
                            /*tex |n1| already points past the run; skip the normal advance. */
                            goto AGAIN;
                        } else {
                            glyph_character(n1) = c1; /* can become language dependent */
                        }
                    }
                    break;
                case disc_node:
                    {
                        /*tex Recurse into the three disc sub-lists. */
                        halfword done = 0;
                        if (disc_pre_break_head(n1) && tex_collapse_list(disc_pre_break_head(n1), c1, c2, c3)) {
                            ++done;
                        }
                        if (disc_post_break_head(n1) && tex_collapse_list(disc_post_break_head(n1), c1, c2, c3)) {
                            ++done;
                        }
                        if (disc_no_break_head(n1) && tex_collapse_list(disc_no_break_head(n1), c1, c2, c3)) {
                            ++done;
                        }
                        if (done) {
                            tex_check_disc_field(n1);
                        }
                        break;
                    }
                default:
                    break;
            }
            n1 = n2;
          AGAIN:;
        }
    }
    return found;
}
+
+void tex_hyphenate_list(halfword head, halfword tail)
+{
+ /*tex Let's play safe: */
+ if (tail) {
+ halfword first_language = first_valid_language_par; /* combine with check below */
+ halfword trace = tracing_hyphenation_par;
+ halfword r = head;
+ /*tex
+ This first movement assures two things:
+
+ \startitemize
+ \startitem
+ That we won't waste lots of time on something that has been handled already (in
+ that case, none of the glyphs match |simple_character|).
+ \stopitem
+ \startitem
+ That the first word can be hyphenated. If the movement was not explicit, then
+ the indentation at the start of a paragraph list would make |find_next_wordstart()|
+ look too far ahead.
+ \stopitem
+ \stopitemize
+ */
+ while (r && node_type(r) != glyph_node) {
+ r = node_next(r);
+ }
+ if (r) {
+ r = tex_aux_find_next_wordstart(r, first_language);
+ if (r) {
+ lang_variables langdata;
+ char utf8word[(4 * max_size_of_word) + 1] = { 0 };
+ char utf8original[(4 * max_size_of_word) + 1] = { 0 };
+ char *utf8ptr = utf8word;
+ char *utf8ori = utf8original;
+ int word_length = 0;
+ int explicit_hyphen = 0;
+ int last_char = 0;
+ int valid = 0;
+ halfword explicit_start = null;
+ halfword saved_tail = node_next(tail);
+ halfword penalty = tex_new_penalty_node(0, word_penalty_subtype);
+ /* kind of curious hack, this addition that we later remove */
+ tex_attach_attribute_list_copy(penalty, r);
+ tex_couple_nodes(tail, penalty); /* todo: attrobute */
+ while (r) {
+ halfword word_start = r;
+ int word_language = get_glyph_language(word_start);
+ if (tex_is_valid_language(word_language)) {
+ halfword word_end = r;
+ int lhmin = get_glyph_lhmin(word_start);
+ int rhmin = get_glyph_rhmin(word_start);
+ int hmin = tex_get_hyphenation_min(word_language);
+ halfword word_font = glyph_font(word_start);
+ if (! tex_is_valid_font(word_font) || font_hyphen_char(word_font) < 0) {
+ /*tex For backward compatibility we set: */
+ word_font = 0;
+ }
+ langdata.pre_hyphen_char = tex_get_pre_hyphen_char(word_language);
+ langdata.post_hyphen_char = tex_get_post_hyphen_char(word_language);
+ while (r && node_type(r) == glyph_node && word_language == get_glyph_language(r)) {
+ halfword chr = glyph_character(r);
+ halfword hyp = tex_aux_is_hyphen_char(chr);
+ if (word_language >= first_language) {
+ last_char = tex_get_hj_code(word_language, chr);
+ if (last_char > 0) {
+ goto GOFORWARD;
+ }
+ }
+ if (hyp) {
+ last_char = hyp;
+ // if (last_char) {
+ // goto GOFORWARD;
+ // }
+ } else {
+ break;
+ }
+ GOFORWARD:
+ // explicit_hyphen = is_hyphen_char(chr);
+ explicit_hyphen = hyp;
+ if (explicit_hyphen && node_next(r) && node_type(node_next(r)) != glyph_node && hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
+ /* maybe some tracing */
+ explicit_hyphen = 0;
+ }
+ if (explicit_hyphen) {
+ break;
+ } else {
+ word_length++;
+ if (word_length >= max_size_of_word) {
+ /* tex_normal_warning("language", "ignoring long word"); */
+ while (r && node_type(r) == glyph_node) {
+ r = node_next(r);
+ }
+ goto PICKUP;
+ } else {
+ if (last_char <= 32) {
+ if (last_char == 32) {
+ last_char = 0 ;
+ }
+ if (word_length <= lhmin) {
+ lhmin = lhmin - last_char + 1 ;
+ if (lhmin < 0) {
+ lhmin = 1;
+ }
+ }
+ if (word_length >= rhmin) {
+ rhmin = rhmin - last_char + 1 ;
+ if (rhmin < 0) {
+ rhmin = 1;
+ }
+ }
+ hmin = hmin - last_char + 1 ;
+ if (hmin < 0) {
+ rhmin = 1;
+ }
+ last_char = chr ;
+ }
+ utf8ori = aux_uni2string(utf8ori, (unsigned) chr);
+ utf8ptr = aux_uni2string(utf8ptr, (unsigned) last_char);
+ word_end = r;
+ r = node_next(r);
+ }
+ }
+ }
+ if (explicit_hyphen) {
+ /*tex We are not at the start, so we only need to look ahead. */
+ if ((get_glyph_discpart(r) == glyph_discpart_replace && ! hyphenation_permitted(glyph_hyphenate(r), syllable_hyphenation_mode))) {
+ /*tex
+ This can be the consequence of inhibition too, see |finish_discretionary|
+ in which case the replace got injected which can have a hyphen. And we want
+ to run the callback if set in order to replace.
+ */
+ valid = 1;
+ goto MESSYCODE;
+ } else {
+ /*tex Maybe we should get rid of this ----- stuff. */
+ halfword t = node_next(r);
+ if (t && node_type(t) == glyph_node && ! tex_aux_is_hyphen_char(glyph_character(t)) && hyphenation_permitted(glyph_hyphenate(t), automatic_hyphenation_mode)) {
+ /*tex we have a word already but the next character may not be a hyphen too */
+ halfword g = r;
+ r = tex_aux_compound_word_break(r, get_glyph_language(g), explicit_hyphen);
+ if (trace > 1) {
+ *utf8ori = 0;
+ tex_begin_diagnostic();
+ tex_print_format("[language: compound word break after %s]", utf8original);
+ tex_end_diagnostic();
+ }
+ if (hyphenation_permitted(glyph_hyphenate(g), compound_hyphenation_mode)) {
+ explicit_hyphen = 0;
+ if (hyphenation_permitted(glyph_hyphenate(g), force_handler_hyphenation_mode) || hyphenation_permitted(glyph_hyphenate(g), feedback_compound_hyphenation_mode)) {
+ set_disc_option(r, disc_option_pre_word | disc_option_post_word);
+ explicit_start = null;
+ valid = 1;
+ goto MESSYCODE;
+ } else {
+ if (! explicit_start) {
+ explicit_start = word_start;
+ }
+ /*tex For exceptions. */
+ utf8ptr = aux_uni2string(utf8ptr, '-');
+ r = t;
+ continue;
+ }
+ }
+ } else {
+ /*tex We jump over the sequence of hyphens ... traditional. */
+ while (t && node_type(t) == glyph_node && tex_aux_is_hyphen_char(glyph_character(t))) {
+ r = t;
+ t = node_next(r);
+ }
+ if (! t) {
+ /*tex we reached the end of the list and will quit the loop later */
+ r = null;
+ }
+ }
+ }
+ } else {
+ valid = tex_aux_valid_wordend(word_end, r);
+ MESSYCODE:
+ /*tex We have a word, r is at the next node. */
+ if (word_font && word_language >= first_language) {
+ /*tex We have a language, actually we already tested that. */
+ struct tex_language *lang = lmt_language_state.languages[word_language];
+ if (lang) {
+ char *replacement = NULL;
+ halfword start = explicit_start ? explicit_start : word_start;
+ int okay = word_length >= lhmin + rhmin && (hmin <= 0 || word_length >= hmin) && hyphenation_permitted(glyph_hyphenate(start), syllable_hyphenation_mode);
+ *utf8ptr = 0;
+ *utf8ori = 0;
+ if (lang->wordhandler && hyphenation_permitted(glyph_hyphenate(start), force_handler_hyphenation_mode)) {
+ halfword restart = node_prev(start); /*tex before the word. */
+ int done = lmt_handle_word(lang, utf8original, utf8word, word_length, start, word_end, &replacement);
+ if (replacement) {
+ if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
+ goto EXCEPTIONS2;
+ } else {
+ goto PICKUP;
+ }
+ } else {
+ /* 1: restart 2: exceptions+patterns 3: patterns *: next word */
+ switch (done) {
+ case 1:
+ if (_valid_node_(restart)) {
+ r = restart;
+ } else if (_valid_node_(start)) {
+ r = node_prev(start);
+ }
+ if (! r) {
+ if (_valid_node_(head)) {
+ tex_normal_warning("language", "the hyphenation list is messed up, recovering");
+ r = head;
+ } else {
+ tex_normal_error("language", "the hyphenated head is messed up, aborting");
+ return;
+ }
+ }
+ goto PICKUP;
+ case 2:
+ if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
+ goto EXCEPTIONS1;
+ } else {
+ goto PICKUP;
+ }
+ case 3:
+ if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
+ goto PATTERNS;
+ } else {
+ goto PICKUP;
+ }
+ default:
+ if (_valid_node_(r)) { /* or word_end */
+ goto PICKUP;
+ } else if (_valid_node_(tail)) {
+ tex_normal_warning("language", "the hyphenation list is messed up, quitting");
+ goto ABORT;
+ } else {
+ // tex_normal_error("language","the hyphenated tail is messed up, aborting");
+ return;
+ }
+ }
+ }
+ }
+ if (! okay || ! valid) {
+ goto PICKUP;
+ }
+ /*tex
+ This is messy and nasty: we can have a word with a - in it which is why
+ we have two branches. Also, every word that suits the length criteria
+ is checked via \LUA. Optimizing this because tests have demonstrated
+ that checking against the min and max lengths of exception strings has
+ no gain.
+ */
+ EXCEPTIONS1:
+ if (lang->exceptions) {
+ replacement = tex_aux_hyphenation_exception(lang->exceptions, utf8word);
+ }
+ EXCEPTIONS2:
+ if (replacement) {
+ /*tex handle the exception and go on to the next word */
+ halfword start = explicit_start ? explicit_start : word_start;
+ halfword beg = node_prev(start);
+ tex_aux_do_exception(start, r, replacement); // r == next_node(word_end)
+ if (trace > 1) {
+ tex_begin_diagnostic();
+ tex_print_format("[language: exception %s to %s]", utf8original, replacement);
+ if (trace > 2) {
+ tex_aux_hyphenate_show(node_next(beg), node_prev(r));
+ }
+ tex_end_diagnostic();
+ }
+ lmt_memory_free(replacement);
+ goto PICKUP;
+ }
+ PATTERNS:
+ if (lang->patterns) {
+ if (explicit_start) {
+ /*tex We're done already */
+ } else if (hyphenation_permitted(glyph_hyphenate(word_start), syllable_hyphenation_mode)) {
+ halfword left = word_start;
+ halfword right = r; /*tex We're one after |word_end|. */
+ for (int i = lhmin; i > 1; i--) {
+ left = node_next(left);
+ if (! left || left == right) {
+ goto PICKUP;
+ }
+ }
+ if (right != left) {
+ int done = 0;
+ for (int i = rhmin; i > 0; i--) {
+ right = node_prev(right);
+ if (! right || right == left) {
+ goto PICKUP;
+ }
+ }
+ done = tex_aux_hnj_hyphen_hyphenate(lang->patterns, word_start, word_end, word_length, left, right, &langdata);
+ if (trace > 1) {
+ tex_begin_diagnostic();
+ if (done) {
+ tex_print_format("[language: hyphenated %s at %i positions]", utf8original, done);
+ if (trace > 2) {
+ tex_aux_hyphenate_show(node_next(left), node_prev(right));
+ }
+ } else {
+ tex_print_format("[language: not hyphenated %s]", utf8original);
+ }
+ tex_end_diagnostic();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ PICKUP:
+ explicit_start = null ;
+ explicit_hyphen = 0;
+ word_length = 0;
+ utf8ptr = utf8word;
+ utf8ori = utf8original;
+ if (r) {
+ r = tex_aux_find_next_wordstart(r, first_language);
+ } else {
+ break;
+ }
+ }
+ ABORT:
+ tex_flush_node(node_next(tail));
+ node_next(tail) = saved_tail;
+ }
+ }
+ }
+}
+
/*tex
    Replace |glyph| in its list by a new discretionary of subtype |code| whose pre- and
    post-break texts are copies of the glyph and whose no-break text is the glyph itself.
    When |keepkern| is set and an italic kern follows the glyph, that kern stays inside the
    no-break text. Returns the new disc node, linked in place of the glyph.
*/

halfword tex_glyph_to_discretionary(halfword glyph, quarterword code, int keepkern)
{
    halfword prev = node_prev(glyph);
    halfword next = node_next(glyph);
    halfword disc = tex_new_disc_node(code);
    halfword kern = null;
    if (keepkern && next && node_type(next) == kern_node && node_subtype(next) == italic_kern_subtype) {
        /*tex Keep |glyph + kern| together as the no-break text. */
        kern = node_next(next);
        next = node_next(kern);
        node_next(kern) = null;
    } else {
        node_next(glyph) = null;
    }
    node_prev(glyph) = null;
    tex_attach_attribute_list_copy(disc, glyph);
    tex_set_disc_field(disc, pre_break_code, tex_copy_node_list(glyph, null));
    tex_set_disc_field(disc, post_break_code, tex_copy_node_list(glyph, null));
    tex_set_disc_field(disc, no_break_code, glyph);
    /*tex Splice the disc where the glyph used to be. */
    tex_try_couple_nodes(prev, disc);
    tex_try_couple_nodes(disc, next);
    return disc;
}