/* See license.txt in the root of this project. */

# include "luametatex.h"

/*tex

    We no longer dump the patterns and exceptions as they are supposed to be loaded at runtime.
    There is no gain getting them from the format. But we do dump some of the properties. There
    were all kind of checks for simple characters i.e. not ligatures but there is no need for
    that in \LUAMETATEX. We have separated stages and the hyphenator sees just glyphs. And when
    a traditional font has glyphs we can assume that the old school font encoding matches the
    patterns i.e. that ligatures are not in the normal character slots.

    Exceptions are stored at the \LUA\ end. We cannot easily go dynamic because fonts are stored
    in the eqtb so we would have to use some more indirect mechanism (doable as we do it for
    other items) too.

*/

/*tex The global language state: the array of language records plus its memory bookkeeping. */

language_state_info lmt_language_state = {
    .languages = NULL,
    .language_data = {
        .minimum   = min_language_size,
        .maximum   = max_language_size,
        .size      = memory_data_unset,
        .step      = stp_language_size,
        .allocated = 0,
        .itemsize  = 1,
        .top       = 0,
        .ptr       = 0,
        .initial   = memory_data_unset,
        .offset    = 0,
    },
    .handler_table_id = 0,
    .handler_count    = 0,
};

/*tex
    We can enforce a language id but we want to be sequential so we accept holes! So one has to
    define bottom-up. As with fonts, we have a zero language but that one normally is not set.
*/

/*tex
    Initialize the record with id |id| to its defaults. The record must already have been
    allocated in |lmt_language_state.languages[id]|. The default pre-hyphen character is |-|;
    a |hyphenation_min| of |-1| means "not set".
*/

static void tex_aux_reset_language(halfword id)
{
    tex_language *lang = lmt_language_state.languages[id];
    lang->id = id;
    lang->exceptions = 0;
    lang->patterns = NULL;
    lang->wordhandler = 0;
    lang->pre_hyphen_char = '-';
    lang->post_hyphen_char = 0;
    lang->pre_exhyphen_char = 0;
    lang->post_exhyphen_char = 0;
    lang->hyphenation_min = -1;
    lang->hjcode_head = NULL;
}

/*tex
    A value below zero will bump the language id. Because we have a rather limited number of
    languages there is no configuration, size is just maximum.
*/

/*tex
    Return a free language id. A non-negative |id| requests that specific slot (an error if it
    is already taken); a negative |id| hands out the next sequential one, growing the array in
    |step| increments up to |maximum| when needed. Returns 0 after reporting overflow.
*/

static halfword tex_aux_new_language_id(halfword id)
{
    int top;
    if (id >= 0) {
        if (id <= lmt_language_state.language_data.top) {
            if (lmt_language_state.languages[id]) {
                return tex_formatted_error("languages", "the language with id %d is already created", id);
            } else {
                /*tex The requested slot is inside the allocated range and still free. */
                return id;
            }
        } else if (id > lmt_language_state.language_data.maximum) {
            goto OVERFLOWERROR;
        } else {
            top = id;
        }
    } else if (lmt_language_state.language_data.ptr < lmt_language_state.language_data.top) {
        /*tex There is still room; just bump the sequential pointer. */
        ++lmt_language_state.language_data.ptr;
        return lmt_language_state.language_data.ptr;
    } else if (lmt_language_state.language_data.top >= lmt_language_state.language_data.maximum) {
        goto OVERFLOWERROR;
    } else if (lmt_language_state.language_data.top + lmt_language_state.language_data.step > lmt_language_state.language_data.maximum) {
        /*tex Clip the growth step to the configured maximum. */
        top = lmt_language_state.language_data.maximum;
    } else {
        top = lmt_language_state.language_data.top + lmt_language_state.language_data.step;
    }
    /*tex Finally we can bump memory. */
    {
        tex_language **tmp = aux_reallocate_array(lmt_language_state.languages, sizeof(tex_language *), top, 0);
        if (tmp) {
            /*tex Null the newly acquired slots so that they register as unused. */
            for (int i = lmt_language_state.language_data.top + 1; i <= top; i++) {
                tmp[i] = NULL;
            }
            lmt_language_state.languages = tmp;
            lmt_language_state.language_data.allocated += ((size_t) top - lmt_language_state.language_data.top) * sizeof(tex_language *);
            lmt_language_state.language_data.top = top;
            lmt_language_state.language_data.ptr += 1;
            return lmt_language_state.language_data.ptr;
        }
    }
  OVERFLOWERROR:
    tex_overflow_error("languages", lmt_language_state.language_data.maximum);
    return 0;
}

/*tex Allocate the initial (cleared) language array of |minimum| slots. */

void tex_initialize_languages(void)
{
    tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), lmt_language_state.language_data.minimum, 0);
    if (tmp) {
        for (int i = 0; i < lmt_language_state.language_data.minimum; i++) {
            tmp[i] = NULL;
        }
        lmt_language_state.languages = tmp;
        lmt_language_state.language_data.allocated += lmt_language_state.language_data.minimum * sizeof(tex_language *);
        lmt_language_state.language_data.top = lmt_language_state.language_data.minimum;
    } else {
        tex_overflow_error("languages", lmt_language_state.language_data.minimum);
    }
}

/*
halfword tex_aux_maximum_language_id(void)
{
    return language_state.language_data.maximum;
}
*/

/*tex Language 0 always counts as valid; other ids must be in range and actually created. */

int tex_is_valid_language(halfword n)
{
    if (n == 0) {
        return 1;
    } else if (n > 0 && n <= lmt_language_state.language_data.top) {
        return lmt_language_state.languages[n] ? 1 : 0;
    } else {
        return 0;
    }
}

/*tex
    Create (and reset) a new language record; |n < 0| means "next free id". Returns |NULL| when
    no id or no memory is available.
*/

tex_language *tex_new_language(halfword n)
{
    halfword id = tex_aux_new_language_id(n);
    if (id >= 0) {
        tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
        if (lang) {
            lmt_language_state.languages[id] = lang;
            lmt_language_state.language_data.allocated += sizeof(struct tex_language);
            tex_aux_reset_language(id);
            if (saving_hyph_codes_par) {
                /*tex
                    For now, we might just use specific value for whatever task. This will become
                    obsolete.
                */
                tex_hj_codes_from_lc_codes(id);
            }
        } else {
            tex_overflow_error("language", sizeof(struct tex_language));
        }
        return lang;
    } else {
        return NULL;
    }
}

/*tex Fetch language |n|, creating it on demand when it is in the permitted range. */

tex_language *tex_get_language(halfword n)
{
    if (n >= 0) {
        if (n <= lmt_language_state.language_data.top && lmt_language_state.languages[n]) {
            return lmt_language_state.languages[n];
        }
        if (n <= lmt_language_state.language_data.maximum) {
            return tex_new_language(n);
        }
    }
    return NULL;
}

/*tex Freeing, dumping, undumping languages: */

/*
void free_languages(void)
{
    for (int i = 0; i < language_state.language_data.top; i++) {
        if (language_state.languages[i]) {
            lmt_memory_free(language_state.languages[i]);
            language_state.languages[i] = NULL;
        }
    }
}
*/

/*tex
    Dump the per-language properties (not the patterns or exceptions, see the comment at the top
    of this file) to the format file. Each slot is preceded by a 1/0 tag marking presence.
*/

void tex_dump_language_data(dumpstream f)
{
    dump_int(f, lmt_language_state.language_data.top);
    dump_int(f, lmt_language_state.language_data.ptr);
    if (lmt_language_state.language_data.top > 0) {
        for (int i = 0; i < lmt_language_state.language_data.top; i++) {
            tex_language *lang = lmt_language_state.languages[i];
            if (lang) {
                dump_via_int(f, 1);
                dump_int(f, lang->id);
                dump_int(f, lang->pre_hyphen_char);
                dump_int(f, lang->post_hyphen_char);
                dump_int(f, lang->pre_exhyphen_char);
                dump_int(f, lang->post_exhyphen_char);
                dump_int(f, lang->hyphenation_min);
                tex_dump_language_hj_codes(f, i);
            } else {
                dump_via_int(f, 0);
            }
        }
    }
}

/*tex
    The inverse of |tex_dump_language_data|: rebuild the language array from the format file.
    Patterns and exceptions are not restored here; they are reloaded at runtime.
*/

void tex_undump_language_data(dumpstream f)
{
    int top, ptr;
    undump_int(f, top);
    undump_int(f, ptr);
    if (top > 0) {
        tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), top, 0);
        if (tmp) {
            lmt_language_state.language_data.top = top;
            lmt_language_state.language_data.ptr = ptr;
            lmt_language_state.languages = tmp;
            for (int i = 0; i < top; i++) {
                int x;
                undump_int(f, x);
                if (x == 1) {
                    tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
                    if (lang) {
                        lmt_language_state.languages[i] = lang;
                        lmt_language_state.language_data.allocated += sizeof(struct tex_language);
                        /*tex These are not dumped, so they start out empty: */
                        lang->exceptions = 0;
                        lang->patterns = NULL;
                        lang->wordhandler = 0;
                        lang->hjcode_head = NULL;
                        undump_int(f, lang->id);
                        undump_int(f, lang->pre_hyphen_char);
                        undump_int(f, lang->post_hyphen_char);
                        undump_int(f, lang->pre_exhyphen_char);
                        undump_int(f, lang->post_exhyphen_char);
                        undump_int(f, lang->hyphenation_min);
                        tex_undump_language_hj_codes(f, i);
                        if (lang->id != i) {
                            /*tex A mismatch is repaired but reported; the slot index wins. */
                            tex_formatted_warning("languages", "undumped language id mismatch: %d <> %d", lang->id, i);
                            lang->id = i;
                        }
                    } else {
                        tex_overflow_error("languages", i);
                    }
                    tmp[i] = lang;
                } else {
                    tmp[i] = NULL;
                }
            }
            lmt_language_state.language_data.initial = lmt_language_state.language_data.ptr;
        } else {
            tex_overflow_error("languages", top);
            lmt_language_state.language_data.initial = 0;
        }
    } else {
        /*tex Indeed we can have no languages stored. */
        tex_initialize_languages();
    }
}

/*tex
    All kind of accessors.
*/ void tex_set_pre_hyphen_char(halfword n, halfword v) { struct tex_language *l = tex_get_language(n); if (l) { l->pre_hyphen_char = v; } } void tex_set_post_hyphen_char(halfword n, halfword v) { struct tex_language *l = tex_get_language(n); if (l) { l->post_hyphen_char = v; } } void tex_set_pre_exhyphen_char(halfword n, halfword v) { struct tex_language *l = tex_get_language(n); if (l) { l->pre_exhyphen_char = v; } } void tex_set_post_exhyphen_char(halfword n, halfword v) { struct tex_language *l = tex_get_language(n); if (l) { l->post_exhyphen_char = v; } } halfword tex_get_pre_hyphen_char(halfword n) { struct tex_language *l = tex_get_language(n); return l ? l->pre_hyphen_char : -1; } halfword tex_get_post_hyphen_char(halfword n) { struct tex_language *l = tex_get_language(n); return l ? l->post_hyphen_char : -1; } halfword tex_get_pre_exhyphen_char(halfword n) { struct tex_language *l = tex_get_language(n); return l ? l->pre_exhyphen_char : -1; } halfword tex_get_post_exhyphen_char(halfword n) { struct tex_language *l = tex_get_language(n); return (l) ? (int) l->post_exhyphen_char : -1; } void tex_set_hyphenation_min(halfword n, halfword v) { struct tex_language *l = tex_get_language(n); if (l) { l->hyphenation_min = v; } } halfword tex_get_hyphenation_min(halfword n) { struct tex_language *l = tex_get_language((int) n); return l ? l->hyphenation_min : -1; } void tex_load_patterns(struct tex_language *lang, const unsigned char *buff) { if ((! lang) || (! buff) || strlen((const char *) buff) == 0) { return; } else { if (! 
lang->patterns) { lang->patterns = hnj_dictionary_new(); } hnj_dictionary_load(lang->patterns, buff, tracing_hyphenation_par > 0); } } void tex_clear_patterns(struct tex_language *lang) { if (lang && lang->patterns) { hnj_dictionary_clear(lang->patterns); } } void tex_load_tex_patterns(halfword curlang, halfword head) { char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0, 0); if (s) { tex_load_patterns(tex_get_language(curlang), (unsigned char *) s); } } /* This cleans one word which is returned in |cleaned|, returns the new offset into |buffer|. */ /* define tex_isspace(c) (c == ' ' || c == '\t') */ # define tex_isspace(c) (c == ' ') const char *tex_clean_hyphenation(halfword id, const char *buff, char **cleaned) { int items = 0; /*tex Work buffer for bytes: */ unsigned char word[max_size_of_word + 1]; /*tex Work buffer for \UNICODE: */ unsigned uword[max_size_of_word + 1] = { 0 }; /*tex The \UNICODE\ buffer value: */ int i = 0; char *uindex = (char *) word; const char *s = buff; while (*s && ! tex_isspace((unsigned char)*s)) { word[i++] = (unsigned char) *s; s++; if ((s-buff) > max_size_of_word) { /*tex Todo: this is too strict, should count \UNICODE, not bytes. */ *cleaned = NULL; tex_handle_error( normal_error_type, "Exception too long", NULL ); return s; } } /*tex Now convert the input to \UNICODE. */ word[i] = '\0'; aux_splitutf2uni(uword, (const char *)word); /*tex Build the new word string. The hjcode values < 32 indicate a length, so that for instance \|hjcode`ܽ2| makes that ligature count okay. */ i = 0; while (uword[i] > 0) { int u = uword[i++]; if (u == '-') { /*tex Skip. */ } else if (u == '=') { unsigned c = tex_get_hj_code(id, '-'); uindex = aux_uni2string(uindex, (! c || c <= 32) ? 
'-' : c); } else if (u == '{') { u = uword[i++]; items = 0; while (u && u != '}') { u = uword[i++]; } if (u == '}') { items++; u = uword[i++]; } while (u && u != '}') { u = uword[i++]; } if (u == '}') { items++; u = uword[i++]; } if (u == '{') { u = uword[i++]; } while (u && u != '}') { unsigned c = tex_get_hj_code(id, u); uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c); u = uword[i++]; } if (u == '}') { items++; } if (items != 3) { /* hm, we intercept that elsewhere in a better way so why here? Best remove the test here or move the other one here. */ *cleaned = NULL; tex_handle_error( normal_error_type, "Exception syntax error, a discretionary has three components: {}{}{}.", NULL ); return s; } else { /* skip replacement (chars) */ if (uword[i] == '(') { while (uword[++i] && uword[i] != ')') { }; if (uword[i] != ')') { tex_handle_error( normal_error_type, "Exception syntax error, an alternative replacement is defined as (text).", NULL ); return s; } else if (uword[i]) { i++; } } /* skip penalty: [digit] but we intercept multiple digits */ if (uword[i] == '[') { if (uword[i+1] && uword[i+1] >= '0' && uword[i+1] <= '9' && uword[i+2] && uword[i+2] == ']') { i += 3; } else { tex_handle_error( normal_error_type, "Exception syntax error, a penalty is defined as [digit].", NULL ); return s; } } } } else { unsigned c = tex_get_hj_code(id, u); uindex = aux_uni2string(uindex, (! c || c <= 32) ? 
u : c); } } *uindex = '\0'; *cleaned = lmt_memory_strdup((char *) word); return s; } void tex_load_hyphenation(struct tex_language *lang, const unsigned char *buff) { if (lang) { lua_State *L = lmt_lua_state.lua_instance; const char *s = (const char *) buff; char *cleaned = NULL; int id = lang->id; if (lang->exceptions == 0) { lua_newtable(L); lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX); } lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions); while (*s) { while (tex_isspace((unsigned char) *s)) { s++; } if (*s) { const char *value = s; s = tex_clean_hyphenation(id, s, &cleaned); if (cleaned) { size_t len = s - value; if (len > 0) { lua_pushstring(L, cleaned); lua_pushlstring(L, value, len); lua_rawset(L, -3); } lmt_memory_free(cleaned); } else { /* tex_formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value); */ } } } } } void tex_clear_hyphenation(struct tex_language *lang) { if (lang && lang->exceptions != 0) { lua_State *L = lmt_lua_state.lua_instance; luaL_unref(L, LUA_REGISTRYINDEX, lang->exceptions); lang->exceptions = 0; } } void tex_load_tex_hyphenation(halfword curlang, halfword head) { char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0, 0); if (s) { tex_load_hyphenation(tex_get_language(curlang), (unsigned char *) s); } } static halfword tex_aux_insert_discretionary(halfword t, halfword pre, halfword post, halfword replace, quarterword subtype, int penalty) { /*tex For compound words following explicit hyphens we take the current font. */ halfword d = tex_new_disc_node(subtype); halfword a = node_attr(t) ; disc_penalty(d) = penalty; if (t == replace) { /*tex We have |prev disc next-next|. */ tex_try_couple_nodes(d, node_next(t)); tex_try_couple_nodes(node_prev(t), d); node_prev(t) = null; node_next(t) = null; replace = t; } else { /*tex We have |prev disc next|. 
        */
        tex_try_couple_nodes(d, node_next(t));
        tex_couple_nodes(t, d);
    }
    if (a) {
        tex_attach_attribute_list_attribute(d, a);
    }
    tex_set_disc_field(d, pre_break_code, pre);
    tex_set_disc_field(d, post_break_code, post);
    tex_set_disc_field(d, no_break_code, replace);
    return d;
}

/*tex
    Insert a syllable discretionary after |t| using the language's pre/post hyphen characters
    (when set) as the pre- and post-break glyphs. Used by the pattern hyphenator.
*/

static halfword tex_aux_insert_syllable_discretionary(halfword t, lang_variables *lan)
{
    halfword n = tex_new_disc_node(syllable_discretionary_code);
    disc_penalty(n) = hyphen_penalty_par;
    tex_couple_nodes(n, node_next(t));
    tex_couple_nodes(t, n);
    tex_attach_attribute_list_attribute(n, get_attribute_list(t));
    if (lan->pre_hyphen_char > 0) {
        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->pre_hyphen_char, t);
        tex_set_disc_field(n, pre_break_code, g);
    }
    if (lan->post_hyphen_char > 0) {
        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->post_hyphen_char, t);
        tex_set_disc_field(n, post_break_code, g);
    }
    return n;
}

/*tex
    Turn an explicit hyphen glyph |t| into an automatic discretionary. For the regular
    |\exhyphenchar| the language's ex-hyphen characters (when set) provide the break texts;
    for other hyphen-like characters the character itself becomes the pre-break text.
*/

static halfword tex_aux_compound_word_break(halfword t, halfword clang, halfword chr)
{
    halfword prechar, postchar, pre, post, disc;
    if (chr == ex_hyphen_char_par) {
        halfword pre_exhyphen_char = tex_get_pre_exhyphen_char(clang);
        halfword post_exhyphen_char = tex_get_post_exhyphen_char(clang);
        prechar = pre_exhyphen_char > 0 ? pre_exhyphen_char : ex_hyphen_char_par;
        postchar = post_exhyphen_char > 0 ? post_exhyphen_char : null;
    } else {
        /* we need a flag : use pre/post cf language spec */
        prechar = chr;
        postchar = null;
    }
    pre = prechar > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), prechar, t) : null;
    post = postchar > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), postchar, t) : null;
    disc = tex_aux_insert_discretionary(t, pre, post, t, automatic_discretionary_code, tex_automatic_disc_penalty(glyph_hyphenate(t)));
    return disc;
}

/*tex
    Look up word |w| in the exception table referenced by |exceptions|; return a freshly
    allocated copy of the stored (marked-up) spelling, or |NULL| when not present. The caller
    owns and must free the returned string.
*/

static char *tex_aux_hyphenation_exception(int exceptions, char *w)
{
    lua_State *L = lmt_lua_state.lua_instance;
    char *ret = NULL;
    if (lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions) == LUA_TTABLE) {
        /*tex Word table: */
        lua_pushstring(L, w);
        lua_rawget(L, -2);
        if (lua_type(L, -1) == LUA_TSTRING) {
            ret = lmt_memory_strdup(lua_tostring(L, -1));
        }
        lua_pop(L, 2);
    } else {
        lua_pop(L, 1);
    }
    return ret;
}

/*tex Kept as reference: */

/*
char *get_exception_strings(struct tex_language *lang)
{
    char *ret = NULL;
    if (lang && lang->exceptions) {
        lua_State *L = lua_state.lua_instance;
        if (lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions) == LUA_TTABLE) {
            size_t size = 0;
            size_t current = 0;
            lua_pushnil(L);
            while (lua_next(L, -2)) {
                size_t l = 0;
                const char *value = lua_tolstring(L, -1, &l);
                if (current + l + 2 > size) {
                    size_t new = (size + size/5) + current + l + 1024;
                    char *tmp = lmt_memory_realloc(ret, new);
                    if (tmp) {
                        ret = tmp;
                        size = new;
                    } else {
                        overflow_error("exceptions", (int) size);
                    }
                }
                if (ret) {
                    ret[current] = ' ';
                    strcpy(&ret[current + 1], value);
                    current += l + 1;
                }
                lua_pop(L, 1);
            }
        }
    }
    return ret;
}
*/

/*tex
    The sequence from |wordstart| to |r| can contain only normal characters; it could be faster
    to modify a halfword pointer and return an integer.
*/

# define zws  0x200B /* zero width space makes no sense */
# define zwnj 0x200C
# define zwj  0x200D

/*tex
    Collect the glyphs of one braced part of an exception (up to |final|) into a fresh node
    list, honouring zero-width joiner/non-joiner markers by setting the corresponding no-kern /
    no-ligature glyph options. On return |*j| points past the closing delimiter.
*/

static halfword tex_aux_find_exception_part(unsigned int *j, unsigned int *uword, int len, halfword parent, char final)
{
    halfword head = null;
    halfword tail = null;
    unsigned i = *j;
    int noligature = 0;
    int nokerning = 0;
    /*tex This puts uword[i] on the |{|.
    */
    i++;
    while (i < (unsigned) len && uword[i + 1] != (unsigned int) final) {
        if (tail) {
            switch (uword[i + 1]) {
                case zwj:
                    /*tex Suppress the ligature between the surrounding glyphs. */
                    noligature = 1;
                    nokerning = 0;
                    break;
                case zwnj:
                    /*tex Suppress both ligature and kern between the surrounding glyphs. */
                    noligature = 1;
                    nokerning = 1;
                    break;
                default:
                    {
                        halfword s = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent); /* todo: data */
                        tex_couple_nodes(tail, s);
                        if (noligature) {
                            tex_add_glyph_option(tail, glyph_option_no_right_ligature);
                            tex_add_glyph_option(s, glyph_option_no_left_ligature);
                            noligature = 0;
                        }
                        if (nokerning) {
                            tex_add_glyph_option(tail, glyph_option_no_right_kern);
                            tex_add_glyph_option(s, glyph_option_no_left_kern);
                            nokerning = 0;
                        }
                        tail = node_next(tail);
                        break;
                    }
            }
        } else {
            head = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent); /* todo: data */
            tail = head;
        }
        i++;
    }
    *j = ++i;
    return head;
}

/*tex Count the characters in one braced part without building nodes; advances |*j| likewise. */

static int tex_aux_count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    int n = 0;
    unsigned i = *j;
    /*tex This puts uword[i] on the |{|. */
    i++;
    while (i < (unsigned) len && uword[i + 1] != '}') {
        n++;
        i++;
    }
    *j = ++i;
    return n;
}

static void tex_aux_show_exception_error(const char *part)
{
    tex_handle_error(
        normal_error_type,
        "Invalid %s part in exception",
        part,
        "Exception discretionaries should contain three pairs of braced items.\n"
        "No intervening spaces are allowed."
    );
}

/*tex
    The exceptions are taken as-is: no min values are taken into account. One can add normal
    patterns on-the-fly if needed.
*/

/*tex
    Apply an exception |replacement| (the marked-up spelling from the exception table) to the
    glyph run starting at |wordstart| and ending before |r|: |-| inserts a syllable
    discretionary, |=| skips an existing disc, and |{pre}{post}{replace}| (optionally followed
    by |(alternative)| and |[digit]| penalty) splices in a full discretionary.
*/

static void tex_aux_do_exception(halfword wordstart, halfword r, char *replacement)
{
    halfword t = wordstart;
    lang_variables langdata;
    unsigned uword[max_size_of_word + 1] = { 0 };
    unsigned len = aux_splitutf2uni(uword, replacement);
    int clang = get_glyph_language(wordstart);
    langdata.pre_hyphen_char = tex_get_pre_hyphen_char(clang);
    langdata.post_hyphen_char = tex_get_post_hyphen_char(clang);
    for (unsigned i = 0; i < len; i++) {
        if (uword[i + 1] == 0 ) {
            /*tex We ran out of the exception pattern.
            */
            break;
        } else if (uword[i + 1] == '-') {
            /*tex A hyphen follows. */
            if (node_next(t) == r) {
                break;
            } else {
                tex_aux_insert_syllable_discretionary(t, &langdata);
                /*tex Skip the new disc */
                t = node_next(t);
            }
        } else if (uword[i + 1] == '=') {
            /*tex We skip a disc. */
            t = node_next(t);
        } else if (uword[i + 1] == '{') {
            /*tex We ran into an exception |{}{}{}| or |{}{}{}[]|. */
            halfword pre = null;
            halfword post = null;
            halfword replace = null;
            int count = 0;
            int alternative = null;
            halfword penalty;
            /*tex |pre| */
            pre = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
            if (i == len || uword[i + 1] != '{') {
                tex_aux_show_exception_error("pre");
            }
            /*tex |post| */
            post = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
            if (i == len || uword[i + 1] != '{') {
                tex_aux_show_exception_error("post");
            }
            /*tex |replace| */
            count = tex_aux_count_exception_part(&i, uword, (int) len);
            if (i == len) {
                tex_aux_show_exception_error("replace");
            } else if (uword[i] && uword[i + 1] == '(') {
                alternative = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, ')');;
            }
            /*tex Play safe. */
            if (node_next(t) == r) {
                break;
            } else {
                /*tex Let's deal with an (optional) replacement. */
                if (count > 0) {
                    /*tex Assemble the replace stream. */
                    halfword q = t;
                    replace = node_next(q);
                    while (count > 0 && q) {
                        halfword t = node_type(q);
                        q = node_next(q);
                        if (t == glyph_node || t == disc_node) {
                            count--;
                        } else {
                            break ;
                        }
                    }
                    /*tex Remove it from the main stream */
                    tex_try_couple_nodes(t, node_next(q));
                    /*tex and finish it in the replace. */
                    node_next(q) = null;
                    if (alternative) {
                        tex_flush_node_list(replace);
                        replace = alternative;
                    } else {
                        /*tex Sanitize the replace stream (we could use the flattener instead). */
                        q = replace ;
                        while (q) {
                            halfword n = node_next(q);
                            if (node_type(q) == disc_node) {
                                /*tex Beware: the replacement starts after the no_break pointer.
                                */
                                halfword nb = disc_no_break_head(q);
                                disc_no_break_head(q) = null;
                                node_prev(nb) = null ; /* used at all? */
                                /*tex Insert the replacement glyph. */
                                if (q == replace) {
                                    replace = nb;
                                } else {
                                    tex_try_couple_nodes(node_prev(q), nb);
                                }
                                /*tex Append the glyph (one). */
                                tex_try_couple_nodes(nb, n);
                                /*tex Flush the disc. */
                                tex_flush_node(q);
                            }
                            q = n ;
                        }
                    }
                }
                /*tex
                    Let's check if we have a penalty spec. If we have more then we're toast, we
                    just ignore them.
                */
                if (uword[i] && uword[i + 1] == '[') {
                    i += 2;
                    if (uword[i] && uword[i] >= '0' && uword[i] <= '9') {
                        if (exception_penalty_par > 0) {
                            if (exception_penalty_par > infinite_penalty) {
                                penalty = exception_penalty_par;
                            } else {
                                penalty = (uword[i] - '0') * exception_penalty_par ;
                            }
                        } else {
                            penalty = hyphen_penalty_par;
                        }
                        ++i;
                        while (uword[i] && uword[i] != ']') {
                            ++i;
                        }
                    } else {
                        penalty = hyphen_penalty_par;
                    }
                } else {
                    penalty = hyphen_penalty_par;
                }
                /*tex And now we insert a disc node (this was |syllable_discretionary_code|). */
                t = tex_aux_insert_discretionary(t, pre, post, replace, normal_discretionary_code, penalty);
                /*tex We skip the new disc node. */
                t = node_next(t);
                /*tex
                    We need to check if we have two discretionaries in a row, test case:
                    |\hyphenation {a{>}{<}{b}{>}{<}{c}de} \hsize 1pt abcde \par|.

                    NOTE(review): the remainder of this function appears to have been lost in
                    this extraction; the text below reads like the separate prose documentation
                    on hyphenation that normally follows it. Verify against the upstream
                    \LUAMETATEX\ sources.

                    ... which gives a sequence that, for length |>= l_hyf+r_hyf|, qualifies for
                    hyphenation; however, |uc_hyph| must be positive, if $c_1$ is uppercase. The
                    hyphenation process takes place in three stages. First, the candidate
                    sequence $c_1 \ldots c_n$ is found; then potential positions for hyphens are
                    determined by referring to hyphenation tables; and finally, the nodes
                    $p_a\ldots p_b$ are replaced by a new sequence of nodes that includes the
                    discretionary breaks found.

                    Fortunately, we do not have to do all this calculation very often, because
                    of the way it has been taken out of \TEX's inner loop.
    For example, when the second edition of the author's 700-page book {\sl Seminumerical
    Algorithms} was typeset by \TEX, only about 1.2 hyphenations needed to be tried per
    paragraph, since the line breaking algorithm needed to use two passes on only about 5 per
    cent of the paragraphs. (This is not true in \LUATEX: we always hyphenate the whole list.)

    When a word has been set up to contain a candidate for hyphenation, \TEX\ first looks to
    see if it is in the user's exception dictionary. If not, hyphens are inserted based on
    patterns that appear within the given word, using an algorithm due to Frank~M. Liang.

    \stopnarrower

    This is incompatible with \TEX\ because the first word of a paragraph can be hyphenated,
    but most European users seem to agree that prohibiting hyphenation there was not the best
    idea ever.

    To be documented: |\hyphenationmode| (a bit set).

    \startbuffer
    \parindent0pt \hsize=1.1cm
    12-34-56 \par
    12-34-\hbox{56} \par
    12-34-\vrule width 1em height 1.5ex \par
    12-\hbox{34}-56 \par
    12-\vrule width 1em height 1.5ex-56 \par
    \hjcode`\1=`\1 \hjcode`\2=`\2 \hjcode`\3=`\3 \hjcode`\4=`\4 \vskip.5cm
    12-34-56 \par
    12-34-\hbox{56} \par
    12-34-\vrule width 1em height 1.5ex \par
    12-\hbox{34}-56 \par
    12-\vrule width 1em height 1.5ex-56 \par
    \stopbuffer

    \typebuffer \startpacked \getbuffer \stoppacked

    We only accept an explicit hyphen when there is a preceding glyph and we skip a sequence of
    explicit hyphens as that normally indicates a \type {--} or \type {---} ligature in which
    case we can in a worse case usage get bad node lists later on due to messed up ligature
    building as these dashes are ligatures in base fonts. This is a side effect of the
    separating the hyphenation, ligaturing and kerning steps.

    A test is cmr with \type {------}. A font handler can collapse successive hyphens but it's
    not nice to put the burden there. A somewhat messy border case is \type {----} but in
    \LUATEX\ we don't treat \type {--} and \type {---} special. Also, traditional \TEX\ will
    break a line at \type {-foo} but this can be disabled by setting the automatic mode to
    \type {1}.

*/

// # define is_hyphen_char(chr) (get_hc_code(chr) || chr == ex_hyphen_char_par)

/*tex
    Return the effective hyphen character for |chr|: its hc code when set, else
    |\exhyphenchar| when |chr| equals it, else |null| (not a hyphen).
*/

inline static halfword tex_aux_is_hyphen_char(halfword chr)
{
    if (tex_get_hc_code(chr)) {
        return tex_get_hc_code(chr);
    } else if (chr == ex_hyphen_char_par) {
        return ex_hyphen_char_par;
    } else {
        return null;
    }
}

/*tex
    Scan forward from |r| for the first glyph that can start a hyphenatable word, honouring
    boundaries, discs, boxes, glue and math skips, as well as explicit hyphen handling (see the
    long comment above).
*/

static halfword tex_aux_find_next_wordstart(halfword r, halfword first_language)
{
    int start_ok = 1;
    int mathlevel = 1;
    halfword lastglyph = r;
    while (r) {
        switch (node_type(r)) {
            case boundary_node:
                if (node_subtype(r) == word_boundary) {
                    start_ok = 1;
                }
                break;
            case disc_node:
                start_ok = has_disc_option(r, disc_option_post_word);
                break;
            case hlist_node:
            case vlist_node:
            case rule_node:
            case dir_node:
            case whatsit_node:
                if (hyphenation_permitted(glyph_hyphenate(lastglyph), strict_start_hyphenation_mode)) {
                    start_ok = 0;
                }
                break;
            case glue_node:
                start_ok = 1;
                break;
            case math_node:
                /*tex Skip over an entire (possibly nested) inline math formula. */
                while (mathlevel > 0) {
                    r = node_next(r);
                    if (! r) {
                        return r;
                    } else if (node_type(r) == math_node) {
                        if (node_subtype(r) == begin_inline_math) {
                            mathlevel++;
                        } else {
                            mathlevel--;
                        }
                    }
                }
                break;
            case glyph_node:
                {
                    /*tex
                        When we have no word yet and meet a hyphen (equivalent) we should just
                        keep going. This is not compatible but it does make sense.
                    */
                    int chr = glyph_character(r);
                    int hyp = tex_aux_is_hyphen_char(chr);
                    lastglyph = r;
                    if (hyp) {
                        if (hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
                            /* maybe some tracing */
                        } else {
                            /* todo: already check if we have hj chars left/right i.e. no digits and minus mess */
                            halfword t = node_next(r) ;
                            /*tex Kind of weird that we have the opposite flag test here. */
                            if (t && (node_type(t) == glyph_node) && (! tex_aux_is_hyphen_char(glyph_character(t))) && ! hyphenation_permitted(glyph_hyphenate(r), automatic_hyphenation_mode)) {
                                /*tex We have no word yet and the next character is a non hyphen.
                                */
                                r = tex_aux_compound_word_break(r, get_glyph_language(r), hyp); // test case: \automatichyphenmode0 10\high{-6-1-2-4}
                                start_ok = 1; // todo: also in luatex
                            } else {
                                /*tex We jump over the sequence of hyphens. */
                                while (t && (node_type(t) == glyph_node) && tex_aux_is_hyphen_char(glyph_character(t))) {
                                    r = t ;
                                    t = node_next(r) ;
                                }
                                if (t) {
                                    /*tex We need a restart. */
                                    start_ok = 0;
                                } else {
                                    /*tex We reached the end of the list so we have no word start. */
                                    return null;
                                }
                            }
                        }
                    } else if (start_ok && (get_glyph_language(r) >= first_language) && get_glyph_dohyph(r)) {
                        int l = tex_get_hj_code(get_glyph_language(r), chr);
                        if (l > 0) {
                            if (l == chr || l <= 32 || get_glyph_uchyph(r)) {
                                return r;
                            } else {
                                start_ok = 0;
                            }
                        } else {
                            /*tex We go on. */
                        }
                    } else {
                        /*tex We go on. */
                    }
                }
                break;
            default:
                start_ok = 0;
                break;
        }
        r = node_next(r);
    }
    return r; /* null */
}

/*tex
    This is the original test, extended with bounds, but still the complex expression turned
    into a function. However, it actually is part of the old mechanism where hyphenation was
    mixed with ligature building and kerning, so there was this skipping over a font kern which
    is no longer needed as we have separate steps. We keep this as reference:

    \starttyping
    static int valid_wordend(halfword s, halfword strict_bound)
    {
        if (s) {
            halfword r = s;
            int clang = get_glyph_language(s);
            while ( (r) &&
                    (    (type(r) == glyph_node && clang == get_glyph_language(r))
                      || (type(r) == kern_node && (subtype(r) == font_kern))
                    ) ) {
                r = node_next(r);
            }
            return (! r
                || (type(r) == glyph_node && clang != get_glyph_language(r))
                ||  type(r) == glue_node
                ||  type(r) == penalty_node
                || (type(r) == kern_node && (subtype(r) == explicit_kern ||
                                             subtype(r) == italic_kern   ||
                                             subtype(r) == accent_kern   ))
                || ((type(r) == hlist_node   ||
                     type(r) == vlist_node   ||
                     type(r) == rule_node    ||
                     type(r) == dir_node     ||
                     type(r) == whatsit_node ||
                     type(r) == insert_node  ||
                     type(r) == adjust_node
                    ) && ! (strict_bound == 2 || strict_bound == 3))
                ||  type(r) == boundary_node
            );
        } else {
            return 1;
        }
    }
    \stopttyping
*/

/*tex
    Decide whether the node |r| that follows the word ending at |end_word| still permits the
    word to be hyphenated (pre-word discs and, unless strict mode forbids it, boxes and such).
*/

static int tex_aux_valid_wordend(halfword end_word, halfword r)
{
    if (r) {
        switch (node_type(r)) {
            // case glyph_node:
            // case glue_node:
            // case penalty_node:
            // case kern_node:
            //     return 1;
            case disc_node:
                return has_disc_option(r, disc_option_pre_word);
            case hlist_node:
            case vlist_node:
            case rule_node:
            case dir_node:
            case whatsit_node:
            case insert_node:
            case adjust_node:
                return ! hyphenation_permitted(glyph_hyphenate(end_word), strict_end_hyphenation_mode);
        }
    }
    return 1;
}

/*tex
    Entry point for hyphenating a node list: hand the list to the |hyphenate| callback when one
    is set (positive id), run the built-in hyphenator when the callback id is zero, and do
    nothing when callbacks are disabled (negative id).
*/

void tex_handle_hyphenation(halfword head, halfword tail)
{
    if (head && node_next(head)) {
        int callback_id = lmt_callback_defined(hyphenate_callback);
        if (callback_id > 0) {
            lua_State *L = lmt_lua_state.lua_instance;
            int top = 0;
            if (lmt_callback_okay(L, callback_id, &top)) {
                int i;
                lmt_node_list_to_lua(L, head);
                lmt_node_list_to_lua(L, tail);
                i = lmt_callback_call(L, 2, 0, top);
                if (i) {
                    lmt_callback_error(L, top, i);
                } else {
                    lmt_callback_wrapup(L, top);
                }
            }
        } else if (callback_id == 0) {
            tex_hyphenate_list(head, tail);
        } else {
            /* -1 : disabled */
        }
    }
}

/*tex
    Run Liang's pattern matcher over the glyph run |first..last| (of |length| characters) and
    insert syllable discretionaries between |left| and |right|. Returns the number of
    discretionaries inserted, or 0 when the work buffer cannot be allocated.
*/

static int tex_aux_hnj_hyphen_hyphenate(
    hjn_dictionary *dict,
    halfword first,
    halfword last,
    int length,
    halfword left,
    halfword right,
    lang_variables *lan
)
{
    /*tex +2 for dots at each end, +1 for points outside characters. */
    int ext_word_len = length + 2;
    int hyphen_len = ext_word_len + 1;
    /*tex Because we have a limit of 64 characters we could just use a static array here: */
    char *hyphens = lmt_memory_calloc(hyphen_len, sizeof(unsigned char));
    if (hyphens) {
        halfword here;
        int state = 0;
        int char_num = 0;
        int done = 0;
        /*tex Add a '.' to beginning and end to facilitate matching. */
        node_next(begin_period) = first;
        node_next(end_period) = node_next(last);
        node_next(last) = end_period;
        // for (int i = 0; i < hyphen_len; i++) {
        //     hyphens[i] = '0';
        // }
        // hyphens[hyphen_len] = 0;
        /*tex Now, run the finite state machine.
        */
        for (char_num = 0, here = begin_period; here != node_next(end_period); here = node_next(here)) {
            int ch;
            if (here == begin_period || here == end_period) {
                ch = '.';
            } else {
                ch = tex_get_hj_code(get_glyph_language(here), glyph_character(here));
                if (ch <= 32) {
                    /*tex Small hj codes encode lengths; match on the raw character then. */
                    ch = glyph_character(here);
                }
            }
            while (state != -1) {
                hjn_state *hstate = &dict->states[state];
                for (int k = 0; k < hstate->num_trans; k++) {
                    if (hstate->trans[k].uni_ch == ch) {
                        char *match;
                        state = hstate->trans[k].new_state;
                        match = dict->states[state].match;
                        if (match) {
                            /*tex
                                We add +2 because 1 string length is one bigger than offset and
                                1 hyphenation starts before first character. Why not store the
                                length in states[state] instead of calculating it each time?
                                Okay, performance is okay but still ...
                            */
                            int offset = (int) (char_num + 2 - (int) strlen(match));
                            /*tex Keep the highest odd/even priority per position. */
                            for (int m = 0; match[m]; m++) {
                                if (hyphens[offset + m] < match[m]) {
                                    hyphens[offset + m] = match[m];
                                }
                            }
                        }
                        goto NEXTLETTER;
                    }
                }
                state = hstate->fallback_state;
            }
            /*tex Nothing worked, let's go to the next character. */
            state = 0;
          NEXTLETTER:;
            char_num++;
        }
        /*tex Restore the correct pointers. */
        node_next(last) = node_next(end_period);
        /*tex
            Pattern is |.word.| and |word_len| is 4, |ext_word_len| is 6 and |hyphens| is 7;
            drop first two and stop after |word_len-1|.
*/ for (here = first, char_num = 2; here != left; here = node_next(here)) { char_num++; } for (; here != right; here = node_next(here)) { if (hyphens[char_num] & 1) { here = tex_aux_insert_syllable_discretionary(here, lan); done += 1; } char_num++; } lmt_memory_free(hyphens); return done; } else { tex_overflow_error("patterns", hyphen_len); return 0; } } /* we can also check the original */ static int tex_aux_still_okay(halfword f, halfword l, halfword r, int n, const char *utf8original) { if (_valid_node_(f) && _valid_node_(l) && node_next(l) == r) { int i = 0; while (f) { ++i; if (node_type(f) != glyph_node) { tex_normal_warning("language", "the hyphenated word contains non-glyphs, skipping"); return 0; } else { halfword c = (halfword) aux_str2uni((const unsigned char *) utf8original); utf8original += utf8_size(c); if (! (c && c == glyph_character(f))) { tex_normal_warning("language", "the hyphenated word contains different characters, skipping"); return 0; } else if (f != l) { f = node_next(f); } else if (i == n) { return 1; } else { tex_normal_warning("language", "the hyphenated word changed length, skipping"); return 0; } } } } tex_normal_warning("language", "the hyphenation list is messed up, skipping"); return 0; } static void tex_aux_hyphenate_show(halfword beg, halfword end) { if (_valid_node_(beg) && _valid_node_(end)) { halfword nxt = node_next(end); node_next(end) = null; tex_show_node_list(beg, 100, 10000); node_next(end) = nxt; } } /* maybe split: first a processing run */ inline static int is_traditional_hyphen(halfword n) { return ( (glyph_character(n) == ex_hyphen_char_par) /*tex parameter */ && (has_font_text_control(glyph_font(n),text_control_collapse_hyphens)) /*tex font driven */ && (hyphenation_permitted(glyph_hyphenate(n), collapse_hyphenation_mode)) /*tex language driven */ ); } int tex_collapse_list(halfword head, halfword c1, halfword c2, halfword c3) /* ex_hyphen_char_par 0x2013 0x2014 */ { /*tex Let's play safe: */ halfword found = 0; if 
(head && c1 && c2 && c3) { halfword n1 = head; while (n1) { halfword n2 = node_next(n1); switch (node_type(n1)) { case glyph_node: if (is_traditional_hyphen(n1)) { set_glyph_discpart(n1, glyph_discpart_always); if (n2 && node_type(n2) == glyph_node && is_traditional_hyphen(n2) && glyph_font(n1) == glyph_font(n2)) { halfword n3 = node_next(n2); if (n3 && node_type(n3) == glyph_node && is_traditional_hyphen(n3) && glyph_font(n1) == glyph_font(n3)) { halfword n4 = node_next(n3); glyph_character(n1) = c3; tex_try_couple_nodes(n1, n4); tex_flush_node(n2); tex_flush_node(n3); n1 = n4; } else { glyph_character(n1) = c2; tex_try_couple_nodes(n1, n3); tex_flush_node(n2); n1 = n3; } found = 1; goto AGAIN; } else { glyph_character(n1) = c1; /* can become language dependent */ } } break; case disc_node: { halfword done = 0; if (disc_pre_break_head(n1) && tex_collapse_list(disc_pre_break_head(n1), c1, c2, c3)) { ++done; } if (disc_post_break_head(n1) && tex_collapse_list(disc_post_break_head(n1), c1, c2, c3)) { ++done; } if (disc_no_break_head(n1) && tex_collapse_list(disc_no_break_head(n1), c1, c2, c3)) { ++done; } if (done) { tex_check_disc_field(n1); } break; } default: break; } n1 = n2; AGAIN:; } } return found; } void tex_hyphenate_list(halfword head, halfword tail) { /*tex Let's play safe: */ if (tail) { halfword first_language = first_valid_language_par; /* combine with check below */ halfword trace = tracing_hyphenation_par; halfword r = head; /*tex This first movement assures two things: \startitemize \startitem That we won't waste lots of time on something that has been handled already (in that case, none of the glyphs match |simple_character|). \stopitem \startitem That the first word can be hyphenated. If the movement was not explicit, then the indentation at the start of a paragraph list would make |find_next_wordstart()| look too far ahead. 
\stopitem \stopitemize */ while (r && node_type(r) != glyph_node) { r = node_next(r); } if (r) { r = tex_aux_find_next_wordstart(r, first_language); if (r) { lang_variables langdata; char utf8word[(4 * max_size_of_word) + 1] = { 0 }; char utf8original[(4 * max_size_of_word) + 1] = { 0 }; char *utf8ptr = utf8word; char *utf8ori = utf8original; int word_length = 0; int explicit_hyphen = 0; int last_char = 0; int valid = 0; halfword explicit_start = null; halfword saved_tail = node_next(tail); halfword penalty = tex_new_penalty_node(0, word_penalty_subtype); /* kind of curious hack, this addition that we later remove */ tex_attach_attribute_list_copy(penalty, r); tex_couple_nodes(tail, penalty); /* todo: attrobute */ while (r) { halfword word_start = r; int word_language = get_glyph_language(word_start); if (tex_is_valid_language(word_language)) { halfword word_end = r; int lhmin = get_glyph_lhmin(word_start); int rhmin = get_glyph_rhmin(word_start); int hmin = tex_get_hyphenation_min(word_language); halfword word_font = glyph_font(word_start); if (! 
tex_is_valid_font(word_font) || font_hyphen_char(word_font) < 0) { /*tex For backward compatibility we set: */ word_font = 0; } langdata.pre_hyphen_char = tex_get_pre_hyphen_char(word_language); langdata.post_hyphen_char = tex_get_post_hyphen_char(word_language); while (r && node_type(r) == glyph_node && word_language == get_glyph_language(r)) { halfword chr = glyph_character(r); halfword hyp = tex_aux_is_hyphen_char(chr); if (word_language >= first_language) { last_char = tex_get_hj_code(word_language, chr); if (last_char > 0) { goto GOFORWARD; } } if (hyp) { last_char = hyp; // if (last_char) { // goto GOFORWARD; // } } else { break; } GOFORWARD: // explicit_hyphen = is_hyphen_char(chr); explicit_hyphen = hyp; if (explicit_hyphen && node_next(r) && node_type(node_next(r)) != glyph_node && hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) { /* maybe some tracing */ explicit_hyphen = 0; } if (explicit_hyphen) { break; } else { word_length++; if (word_length >= max_size_of_word) { /* tex_normal_warning("language", "ignoring long word"); */ while (r && node_type(r) == glyph_node) { r = node_next(r); } goto PICKUP; } else { if (last_char <= 32) { if (last_char == 32) { last_char = 0 ; } if (word_length <= lhmin) { lhmin = lhmin - last_char + 1 ; if (lhmin < 0) { lhmin = 1; } } if (word_length >= rhmin) { rhmin = rhmin - last_char + 1 ; if (rhmin < 0) { rhmin = 1; } } hmin = hmin - last_char + 1 ; if (hmin < 0) { rhmin = 1; } last_char = chr ; } utf8ori = aux_uni2string(utf8ori, (unsigned) chr); utf8ptr = aux_uni2string(utf8ptr, (unsigned) last_char); word_end = r; r = node_next(r); } } } if (explicit_hyphen) { /*tex We are not at the start, so we only need to look ahead. */ if ((get_glyph_discpart(r) == glyph_discpart_replace && ! hyphenation_permitted(glyph_hyphenate(r), syllable_hyphenation_mode))) { /*tex This can be the consequence of inhibition too, see |finish_discretionary| in which case the replace got injected which can have a hyphen. 
And we want to run the callback if set in order to replace. */ valid = 1; goto MESSYCODE; } else { /*tex Maybe we should get rid of this ----- stuff. */ halfword t = node_next(r); if (t && node_type(t) == glyph_node && ! tex_aux_is_hyphen_char(glyph_character(t)) && hyphenation_permitted(glyph_hyphenate(t), automatic_hyphenation_mode)) { /*tex we have a word already but the next character may not be a hyphen too */ halfword g = r; r = tex_aux_compound_word_break(r, get_glyph_language(g), explicit_hyphen); if (trace > 1) { *utf8ori = 0; tex_begin_diagnostic(); tex_print_format("[language: compound word break after %s]", utf8original); tex_end_diagnostic(); } if (hyphenation_permitted(glyph_hyphenate(g), compound_hyphenation_mode)) { explicit_hyphen = 0; if (hyphenation_permitted(glyph_hyphenate(g), force_handler_hyphenation_mode) || hyphenation_permitted(glyph_hyphenate(g), feedback_compound_hyphenation_mode)) { set_disc_option(r, disc_option_pre_word | disc_option_post_word); explicit_start = null; valid = 1; goto MESSYCODE; } else { if (! explicit_start) { explicit_start = word_start; } /*tex For exceptions. */ utf8ptr = aux_uni2string(utf8ptr, '-'); r = t; continue; } } } else { /*tex We jump over the sequence of hyphens ... traditional. */ while (t && node_type(t) == glyph_node && tex_aux_is_hyphen_char(glyph_character(t))) { r = t; t = node_next(r); } if (! t) { /*tex we reached the end of the list and will quit the loop later */ r = null; } } } } else { valid = tex_aux_valid_wordend(word_end, r); MESSYCODE: /*tex We have a word, r is at the next node. */ if (word_font && word_language >= first_language) { /*tex We have a language, actually we already tested that. */ struct tex_language *lang = lmt_language_state.languages[word_language]; if (lang) { char *replacement = NULL; halfword start = explicit_start ? 
explicit_start : word_start; int okay = word_length >= lhmin + rhmin && (hmin <= 0 || word_length >= hmin) && hyphenation_permitted(glyph_hyphenate(start), syllable_hyphenation_mode); *utf8ptr = 0; *utf8ori = 0; if (lang->wordhandler && hyphenation_permitted(glyph_hyphenate(start), force_handler_hyphenation_mode)) { halfword restart = node_prev(start); /*tex before the word. */ int done = lmt_handle_word(lang, utf8original, utf8word, word_length, start, word_end, &replacement); if (replacement) { if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) { goto EXCEPTIONS2; } else { goto PICKUP; } } else { /* 1: restart 2: exceptions+patterns 3: patterns *: next word */ switch (done) { case 1: if (_valid_node_(restart)) { r = restart; } else if (_valid_node_(start)) { r = node_prev(start); } if (! r) { if (_valid_node_(head)) { tex_normal_warning("language", "the hyphenation list is messed up, recovering"); r = head; } else { tex_normal_error("language", "the hyphenated head is messed up, aborting"); return; } } goto PICKUP; case 2: if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) { goto EXCEPTIONS1; } else { goto PICKUP; } case 3: if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) { goto PATTERNS; } else { goto PICKUP; } default: if (_valid_node_(r)) { /* or word_end */ goto PICKUP; } else if (_valid_node_(tail)) { tex_normal_warning("language", "the hyphenation list is messed up, quitting"); goto ABORT; } else { // tex_normal_error("language","the hyphenated tail is messed up, aborting"); return; } } } } if (! okay || ! valid) { goto PICKUP; } /*tex This is messy and nasty: we can have a word with a - in it which is why we have two branches. Also, every word that suits the length criteria is checked via \LUA. Optimizing this because tests have demonstrated that checking against the min and max lengths of exception strings has no gain. 
*/ EXCEPTIONS1: if (lang->exceptions) { replacement = tex_aux_hyphenation_exception(lang->exceptions, utf8word); } EXCEPTIONS2: if (replacement) { /*tex handle the exception and go on to the next word */ halfword start = explicit_start ? explicit_start : word_start; halfword beg = node_prev(start); tex_aux_do_exception(start, r, replacement); // r == next_node(word_end) if (trace > 1) { tex_begin_diagnostic(); tex_print_format("[language: exception %s to %s]", utf8original, replacement); if (trace > 2) { tex_aux_hyphenate_show(node_next(beg), node_prev(r)); } tex_end_diagnostic(); } lmt_memory_free(replacement); goto PICKUP; } PATTERNS: if (lang->patterns) { if (explicit_start) { /*tex We're done already */ } else if (hyphenation_permitted(glyph_hyphenate(word_start), syllable_hyphenation_mode)) { halfword left = word_start; halfword right = r; /*tex We're one after |word_end|. */ for (int i = lhmin; i > 1; i--) { left = node_next(left); if (! left || left == right) { goto PICKUP; } } if (right != left) { int done = 0; for (int i = rhmin; i > 0; i--) { right = node_prev(right); if (! 
right || right == left) { goto PICKUP; } } done = tex_aux_hnj_hyphen_hyphenate(lang->patterns, word_start, word_end, word_length, left, right, &langdata); if (trace > 1) { tex_begin_diagnostic(); if (done) { tex_print_format("[language: hyphenated %s at %i positions]", utf8original, done); if (trace > 2) { tex_aux_hyphenate_show(node_next(left), node_prev(right)); } } else { tex_print_format("[language: not hyphenated %s]", utf8original); } tex_end_diagnostic(); } } } } } } } } PICKUP: explicit_start = null ; explicit_hyphen = 0; word_length = 0; utf8ptr = utf8word; utf8ori = utf8original; if (r) { r = tex_aux_find_next_wordstart(r, first_language); } else { break; } } ABORT: tex_flush_node(node_next(tail)); node_next(tail) = saved_tail; } } } } halfword tex_glyph_to_discretionary(halfword glyph, quarterword code, int keepkern) { halfword prev = node_prev(glyph); halfword next = node_next(glyph); halfword disc = tex_new_disc_node(code); halfword kern = null; if (keepkern && next && node_type(next) == kern_node && node_subtype(next) == italic_kern_subtype) { kern = node_next(next); next = node_next(kern); node_next(kern) = null; } else { node_next(glyph) = null; } node_prev(glyph) = null; tex_attach_attribute_list_copy(disc, glyph); tex_set_disc_field(disc, pre_break_code, tex_copy_node_list(glyph, null)); tex_set_disc_field(disc, post_break_code, tex_copy_node_list(glyph, null)); tex_set_disc_field(disc, no_break_code, glyph); tex_try_couple_nodes(prev, disc); tex_try_couple_nodes(disc, next); return disc; }