diff options
Diffstat (limited to 'source/luametatex/source/libraries/hnj/hnjhyphen.c')
-rw-r--r-- | source/luametatex/source/libraries/hnj/hnjhyphen.c | 627 |
1 files changed, 627 insertions, 0 deletions
diff --git a/source/luametatex/source/libraries/hnj/hnjhyphen.c b/source/luametatex/source/libraries/hnj/hnjhyphen.c new file mode 100644 index 000000000..ad9d87683 --- /dev/null +++ b/source/luametatex/source/libraries/hnj/hnjhyphen.c @@ -0,0 +1,627 @@ +/* + See license.txt in the root of this project. +*/ + +/* + + This file is derived from libhnj which is is dual licensed under LGPL and MPL. Boilerplate + for both licenses follows. + + LibHnj - a library for high quality hyphenation and justification + + (C) 1998 Raph Levien, + (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), + (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) + + This library is free software; you can redistribute it and/or modify it under the terms of the + GNU Library General Public License as published by the Free Software Foundation; either version + 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + the GNU Library General Public License for more details. + + You should have received a copy of the GNU Library General Public License along with this + library; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307 USA. + + The contents of this file are subject to the Mozilla Public License Version 1.0 (the "MPL"); + you may not use this file except in compliance with the MPL. You may obtain a copy of the MPL + at http://www.mozilla.org/MPL/ + + Software distributed under the MPL is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + KIND, either express or implied. See the MPL for the specific language governing rights and + limitations under the MPL. + + Remark: I'm not sure if something fundamental was adapted in the perspective of using this + library in LuaTeX. However, for instance error reporting has been hooked into the Lua(Meta)TeX + error reporting mechanisms. Also a bit of reformatting was done. This module won't change. + Also, the code has been adapted a little in order to fit in the rest (function names etc) + because it is more exposed. We use the alternative memory allocator. + +*/ + +/*tex We need the warning subsystem, so: */ + +# include "luametatex.h" + +/*tex A few helpers (from |hnjalloc|): */ + +static void *hnj_malloc(int size) +{ + void *p = lmt_memory_malloc((size_t) size); + if (! p) { + tex_formatted_error("hyphenation", "allocating %d bytes failed\n", size); + } + return p; +} + +static void *hnj_realloc(void *p, int size) +{ + void *n = lmt_memory_realloc(p, (size_t) size); + if (! n) { + tex_formatted_error("hyphenation", "reallocating %d bytes failed\n", size); + } + return n; +} + +static void hnj_free(void *p) +{ + lmt_memory_free(p); +} + +static unsigned char *hnj_strdup(const unsigned char *s) +{ + size_t l = strlen((const char *) s); + unsigned char *n = hnj_malloc((int) l + 1); + memcpy(n, s, l); + n[l] = 0; + return n; +} + +/*tex + + Combine two right-aligned number patterns, 04000 + 020 becomes 04020. This works also for utf8 + sequences because the substring is identical to the last |substring - length| bytes of expr + except for the (single byte) hyphenation encoders + +*/ + +static char *combine_patterns(char *expr, const char *subexpr) +{ + size_t l1 = strlen(expr); + size_t l2 = strlen(subexpr); + size_t off = l1 - l2; + for (unsigned j = 0; j < l2; j++) { + if (expr[off + j] < subexpr[j]) { + expr[off + j] = subexpr[j]; + } + } + return expr; +} + +/*tex Some original code: */ + +static hjn_hashiterator *new_hashiterator(hjn_hashtable *h) +{ + hjn_hashiterator *i = hnj_malloc(sizeof(hjn_hashiterator)); + i->e = h->entries; + i->cur = NULL; + i->ndx = -1; + return i; +} + +static int nexthashstealpattern(hjn_hashiterator *i, unsigned char **word, char **pattern) +{ + while (i->cur == NULL) { + if (i->ndx >= HNJ_HASH_SIZE - 1) { + return 0; + } else { + i->cur = i->e[++i->ndx]; + } + } + *word = i->cur->key; + *pattern = i->cur->u.hyppat; + i->cur->u.hyppat = NULL; + i->cur = i->cur->next; + return 1; +} + +static int nexthash(hjn_hashiterator *i, unsigned char **word) +{ + while (! i->cur) { + if (i->ndx >= HNJ_HASH_SIZE - 1) { + return 0; + } else { + i->cur = i->e[++i->ndx]; + } + } + *word = i->cur->key; + i->cur = i->cur->next; + return 1; +} + +static int eachhash(hjn_hashiterator *i, unsigned char **word, char **pattern) +{ + while (! i->cur) { + if (i->ndx >= HNJ_HASH_SIZE - 1) { + return 0; + } else { + i->cur = i->e[++i->ndx]; + } + } + *word = i->cur->key; + *pattern = i->cur->u.hyppat; + i->cur = i->cur->next; + return 1; +} + +static void delete_hashiterator(hjn_hashiterator *i) +{ + hnj_free(i); +} + +/*tex A |char*| hash function from ASU, adapted from |Gtk+|: */ + +static unsigned int string_hash(const unsigned char *s) +{ + const unsigned char *p = s; + unsigned int h = 0, g; + for (; *p != '\0'; p += 1) { + h = (h << 4) + *p; + g = h & 0xf0000000; + if (g) { + h = h ^ (g >> 24); + h = h ^ g; + } + } + return h; +} + +/*tex This assumes that key is not already present! */ + +static void state_insert(hjn_hashtable *hashtab, unsigned char *key, int state) +{ + int i = (int) (string_hash(key) % HNJ_HASH_SIZE); + hjn_hashentry* e = hnj_malloc(sizeof(hjn_hashentry)); + e->next = hashtab->entries[i]; + e->key = key; + e->u.state = state; + hashtab->entries[i] = e; +} + +/*tex This also assumes that key is not already present! */ + +static void insert_pattern(hjn_hashtable *hashtab, unsigned char *key, char *hyppat, int trace) +{ + hjn_hashentry *e; + int i = (int) (string_hash(key) % HNJ_HASH_SIZE); + for (e = hashtab->entries[i]; e; e = e->next) { + if (strcmp((char *) e->key, (char *) key) == 0) { + if (e->u.hyppat) { + if (trace && hyppat && strcmp((char *) e->u.hyppat, (char *) hyppat) != 0) { + tex_formatted_warning("hyphenation", "a conflicting pattern '%s' has been ignored", hyppat); + } + hnj_free(e->u.hyppat); + } + e->u.hyppat = hyppat; + hnj_free(key); + return; + } + } + e = hnj_malloc(sizeof(hjn_hashentry)); + e->next = hashtab->entries[i]; + e->key = key; + e->u.hyppat = hyppat; + hashtab->entries[i] = e; +} + +/*tex We return |state| if found, otherwise |-1|. */ + +static int state_lookup(hjn_hashtable *hashtab, const unsigned char *key) +{ + int i = (int) (string_hash(key) % HNJ_HASH_SIZE); + for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) { + if (! strcmp((const char *) key, (const char *) e->key)) { + return e->u.state; + } + } + return -1; +} + +/*tex We return |state| if found, otherwise |-1|. The 256 should be enough. */ + +static char *lookup_pattern(hjn_hashtable * hashtab, const unsigned char *chars, int l) +{ + int i; + unsigned char key[256]; + strncpy((char *) key, (const char *) chars, (size_t) l); + key[l] = 0; + i = (int) (string_hash(key) % HNJ_HASH_SIZE); + for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) { + if (! strcmp((char *) key, (char *) e->key)) { + return e->u.hyppat; + } + } + return NULL; +} + +/*tex Get the state number, allocating a new state if necessary. */ + +static int hnj_get_state(hjn_dictionary *dict, const unsigned char *str, int *state_num) +{ + *state_num = state_lookup(dict->state_num, str); + if (*state_num >= 0) { + return *state_num; + } else { + state_insert(dict->state_num, hnj_strdup(str), dict->num_states); + /*tex The predicate is true if |dict->num_states| is a power of two: */ + if (! (dict->num_states & (dict->num_states - 1))) { + dict->states = hnj_realloc(dict->states, (int) ((dict->num_states << 1) * (int) sizeof(hjn_state))); + } + dict->states[dict->num_states] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL }; + return dict->num_states++; + } +} + +/*tex + + Add a transition from state1 to state2 through ch - assumes that the transition does not + already exist. + +*/ + +static void hnj_add_trans(hjn_dictionary *dict, int state1, int state2, int chr) +{ + /*tex + + This test was a bit too strict, it is quite normal for old patterns to have chars in the + range 0-31 or 127-159 (inclusive). To ease the transition, let's only disallow |nul| for + now, which probably is a requirement of the code anyway. + + */ + if (chr) { + int num_trans = dict->states[state1].num_trans; + if (num_trans == 0) { + dict->states[state1].trans = hnj_malloc(sizeof(hjn_transition)); + } else { + /*tex + + The old version did: + + \starttyping + } else if (!(num_trans & (num_trans - 1))) { + ... = hnj_realloc(dict->states[state1].trans, + (int) ((num_trans << 1) * sizeof(HyphenTrans))); + \stoptyping + + but that is incredibly nasty when adding patters one-at-a-time. Controlled growth + would be nicer than the current +1, but if no one complains, and no one did in a + decade, this is good enough. + + */ + dict->states[state1].trans = hnj_realloc(dict->states[state1].trans, (int) ((num_trans + 1) * sizeof(hjn_transition))); + } + dict->states[state1].trans[num_trans].uni_ch = chr; + dict->states[state1].trans[num_trans].new_state = state2; + dict->states[state1].num_trans++; + } else { + tex_normal_error("hyphenation","a nul character is not permited"); + } +} + +/*tex + + We did change the semantics a bit here: |hnj_hyphen_load| used to operate on a file, but now + the argument is a string buffer. + +*/ + +/* define tex_isspace(c) (c == ' ' || c == '\t') */ +# define tex_isspace(c) (c == ' ') + +static const unsigned char *next_pattern(size_t* length, const unsigned char** buf) +{ + const unsigned char *here, *rover = *buf; + while (*rover && tex_isspace(*rover)) { + rover++; + } + here = rover; + while (*rover) { + if (tex_isspace(*rover)) { + *length = (size_t) (rover - here); + *buf = rover; + return here; + } else { + rover++; + } + } + *length = (size_t) (rover - here); + *buf = rover; + return *length ? here : NULL; +} + +static void init_hash(hjn_hashtable **h) +{ + if (! *h) { + *h = hnj_malloc(sizeof(hjn_hashtable)); + for (int i = 0; i < HNJ_HASH_SIZE; i++) { + (*h)->entries[i] = NULL; + } + } +} + +static void clear_state_hash(hjn_hashtable **h) +{ + if (*h) { + for (int i = 0; i < HNJ_HASH_SIZE; i++) { + hjn_hashentry *e, *next; + for (e = (*h)->entries[i]; e; e = next) { + next = e->next; + hnj_free(e->key); + hnj_free(e); + } + } + hnj_free(*h); + *h = NULL; + } +} + +static void clear_pattern_hash(hjn_hashtable **h) +{ + if (*h) { + for (int i = 0; i < HNJ_HASH_SIZE; i++) { + hjn_hashentry *e, *next; + for (e = (*h)->entries[i]; e; e = next) { + next = e->next; + hnj_free(e->key); + if (e->u.hyppat) { + hnj_free(e->u.hyppat); + } + hnj_free(e); + } + } + hnj_free(*h); + *h = NULL; + } +} + +static void init_dictionary(hjn_dictionary *dict) +{ + dict->num_states = 1; + dict->pat_length = 0; + dict->states = hnj_malloc(sizeof(hjn_state)); + dict->states[0] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL }; + dict->patterns = NULL; + dict->merged = NULL; + dict->state_num = NULL; + init_hash(&dict->patterns); +} + +static void clear_dictionary(hjn_dictionary *dict) +{ + for (int state_num = 0; state_num < dict->num_states; state_num++) { + hjn_state *hstate = &dict->states[state_num]; + if (hstate->match) { + hnj_free(hstate->match); + } + if (hstate->trans) { + hnj_free(hstate->trans); + } + } + hnj_free(dict->states); + clear_pattern_hash(&dict->patterns); + clear_pattern_hash(&dict->merged); + clear_state_hash(&dict->state_num); +} + +hjn_dictionary *hnj_dictionary_new(void) +{ + hjn_dictionary *dict = hnj_malloc(sizeof(hjn_dictionary)); + init_dictionary(dict); + return dict; +} + +void hnj_dictionary_clear(hjn_dictionary *dict) +{ + clear_dictionary(dict); + init_dictionary(dict); +} + +void hnj_dictionary_free(hjn_dictionary *dict) +{ + clear_dictionary(dict); + hnj_free(dict); +} + +unsigned char *hnj_dictionary_tostring(hjn_dictionary *dict) +{ + unsigned char *word; + char *pattern; + unsigned char *buf = hnj_malloc(dict->pat_length); + unsigned char *cur = buf; + hjn_hashiterator *v = new_hashiterator(dict->patterns); + while (eachhash(v, &word, &pattern)) { + int i = 0; + int e = 0; + while (word[e + i]) { + if (pattern[i] != '0') { + *cur++ = (unsigned char) pattern[i]; + } + *cur++ = word[e + i++]; + while (is_utf8_follow(word[e + i])) { + *cur++ = word[i + e++]; + } + } + if (pattern[i] != '0') { + *cur++ = (unsigned char) pattern[i]; + } + *cur++ = ' '; + } + delete_hashiterator(v); + *cur = 0; + return buf; +} + +/*tex + + In hyphenation patterns we use signed bytes where |0|, or actually any negative number, + indicates end: + + \starttyping + prio(1+),startpos,length,len1,[replace],len2,[replace] + \starttyping + + A basic example is: + + \starttyping + p n 0 0 0 + \starttyping + + for a hyphenation point between characters. + +*/ + +void hnj_dictionary_load(hjn_dictionary *dict, const unsigned char *f, int trace) +{ + int state_num, last_state; + int ch; + int found; + hjn_hashiterator *v; + unsigned char *word; + char *pattern; + size_t l = 0; + const unsigned char *format; + const unsigned char *begin = f; + while ((format = next_pattern(&l, &f)) != NULL) { + if (l > 0 && l < 255) { + int i, j, e1; + for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) { + if (format[i] >= '0' && format[i] <= '9') { + j++; + } + if (is_utf8_follow(format[i])) { + e1++; + } + } + /*tex + Here |l-e1| is the number of {\em characters} not {\em bytes}, |l-j| the number of + pattern bytes and |l-e1-j| the number of pattern characters. + */ + { + unsigned char *pat = (unsigned char *) hnj_malloc((1 + (int) l - j)); + char *org = (char *) hnj_malloc(2 + (int) l - e1 - j); + /*tex Remove hyphenation encoders (digits) from pat. */ + org[0] = '0'; + for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) { + unsigned char c = format[i]; + if (is_utf8_follow(c)) { + pat[j + e1++] = c; + } else if (c < '0' || c > '9') { + pat[e1 + j++] = c; + org[j] = '0'; + } else { + org[j] = (char) c; + } + } + pat[e1 + j] = 0; + org[j + 1] = 0; + insert_pattern(dict->patterns, pat, org, trace); + } + } else { + tex_normal_warning("hyphenation", "a pattern of more than 254 bytes is ignored"); + } + } + /*tex We add 2 bytes for spurious spaces. */ + dict->pat_length += (int) ((f - begin) + 2); + init_hash(&dict->merged); + v = new_hashiterator(dict->patterns); + while (nexthash(v, &word)) { + int wordsize = (int) strlen((char *) word); + for (int l1 = 1; l1 <= wordsize; l1++) { + if (is_utf8_follow(word[l1])) { + /*tex Do not clip an utf8 sequence. */ + } else { + for (int j1 = 1; j1 <= l1; j1++) { + int i1 = l1 - j1; + if (is_utf8_follow(word[i1])) { + /*tex Do not start halfway an utf8 sequence. */ + } else { + char *subpat_pat = lookup_pattern(dict->patterns, word + i1, j1); + if (subpat_pat) { + char *newpat_pat = lookup_pattern(dict->merged, word, l1); + if (! newpat_pat) { + char *neworg; + unsigned char *newword = (unsigned char *) hnj_malloc((size_t) (l1 + 1)); + int e1 = 0; + strncpy((char *) newword, (char *) word, (size_t) l1); + newword[l1] = 0; + for (i1 = 0; i1 < l1; i1++) { + if (is_utf8_follow(newword[i1])) { + e1++; + } + } + neworg = hnj_malloc((size_t) (l1 + 2 - e1)); + /*tex Fill with right amount of zeros: */ + sprintf(neworg, "%0*d", l1 + 1 - e1, 0); + insert_pattern(dict->merged, newword, combine_patterns(neworg, subpat_pat), trace); + } else { + combine_patterns(newpat_pat, subpat_pat); + } + } + } + } + } + } + } + delete_hashiterator(v); + init_hash(&dict->state_num); + state_insert(dict->state_num, hnj_strdup((const unsigned char *) ""), 0); + v = new_hashiterator(dict->merged); + while (nexthashstealpattern(v, &word, &pattern)) { + static unsigned char mask[] = { 0x3F, 0x1F, 0xF, 0x7 }; + int j1 = (int) strlen((char *) word); + state_num = hnj_get_state(dict, word, &found); + dict->states[state_num].match = pattern; + /*tex Now, put in the prefix transitions. */ + while (found < 0) { + j1--; + last_state = state_num; + ch = word[j1]; + if (ch >= 0x80) { /* why not is_utf8_follow(ch) here */ + int m; + int i1 = 1; + while (is_utf8_follow(word[j1 - i1])) { + i1++; + } + ch = word[j1 - i1] & mask[i1]; + m = j1 - i1; + while (i1--) { + ch = (ch << 6) + (0x3F & word[j1 - i1]); + } + j1 = m; + } + word[j1] = '\0'; + state_num = hnj_get_state(dict, word, &found); + hnj_add_trans(dict, state_num, last_state, ch); + } + } + delete_hashiterator(v); + clear_pattern_hash(&dict->merged); + /*tex Put in the fallback states. */ + for (int i = 0; i < HNJ_HASH_SIZE; i++) { + for (hjn_hashentry *e = dict->state_num->entries[i]; e; e = e->next) { + /*tex Do not do |state == 0| otherwise things get confused. */ + if (e->u.state) { + for (int j = 1; 1; j++) { + state_num = state_lookup(dict->state_num, e->key + j); + if (state_num >= 0) { + break; + } + } + dict->states[e->u.state].fallback_state = state_num; + } + } + } + clear_state_hash(&dict->state_num); +} |