summaryrefslogtreecommitdiff
path: root/source/luametatex/source/tex/texstringpool.c
diff options
context:
space:
mode:
Diffstat (limited to 'source/luametatex/source/tex/texstringpool.c')
-rw-r--r--source/luametatex/source/tex/texstringpool.c607
1 files changed, 607 insertions, 0 deletions
diff --git a/source/luametatex/source/tex/texstringpool.c b/source/luametatex/source/tex/texstringpool.c
new file mode 100644
index 000000000..8367447da
--- /dev/null
+++ b/source/luametatex/source/tex/texstringpool.c
@@ -0,0 +1,607 @@
+/*
+ See license.txt in the root of this project.
+*/
+
+# include "luametatex.h"
+
+/*tex
+
+ Control sequence names and diagnostic messages are variable length strings of eight bit
+ characters. Since \PASCAL\ did not have a well-developed string mechanism, \TEX\ did all of its
+ string processing by homegrown methods.
+
+ Elaborate facilities for dynamic strings are not needed, so all of the necessary operations can
+ be handled with a simple data structure. The array |str_pool| contains all of the (eight-bit)
+ bytes off all of the strings, and the array |str_start| contains indices of the starting points
+ of each string. Strings are referred to by integer numbers, so that string number |s| comprises
+ the characters |str_pool[j]| for |str_start_macro(s) <= j < str_start_macro (s + 1)|. Additional
+ integer variables |pool_ptr| and |str_ptr| indicate the number of entries used so far in
+ |str_pool| and |str_start|, respectively; locations |str_pool[pool_ptr]| and |str_start_macro
+ (str_ptr)| are ready for the next string to be allocated.
+
+ String numbers 0 to |biggest_char| are reserved for strings that correspond to single \UNICODE\
+ characters. This is in accordance with the conventions of \WEB\ which converts single-character
+ strings into the ASCII code number of the single character involved.
+
+ The stringpool variables are collected in:
+
+*/
+
+string_pool_info lmt_string_pool_state = {
+ .string_pool = NULL,
+ .string_pool_data = {
+ .minimum = min_pool_size,
+ .maximum = max_pool_size,
+ .size = siz_pool_size,
+ .step = stp_pool_size,
+ .allocated = 0,
+ .itemsize = sizeof(lstring),
+ .top = 0,
+ .ptr = 0,
+ .initial = 0,
+ .offset = cs_offset_value,
+ },
+ .string_body_data = {
+ .minimum = min_body_size,
+ .maximum = max_body_size,
+ .size = siz_body_size,
+ .step = stp_body_size,
+ .allocated = 0,
+ .itemsize = sizeof(unsigned char),
+ .top = memory_data_unset,
+ .ptr = memory_data_unset,
+ .initial = 0,
+ .offset = 0,
+ },
+ .reserved = 0,
+ .string_max_length = 0,
+ .string_temp = NULL,
+ .string_temp_allocated = 0,
+ .string_temp_top = 0,
+};
+
+/*tex
+
+ The array of strings is |string_pool|, the number of the current string being created is
+ |str_ptr|, the starting value of |str_ptr| is |init_str_ptr|, and the current string buffer,
+ the current index in that buffer, the mallocedsize of |cur_string| and the occupied byte count
+ are kept in |cur_string|, |cur_length|, |cur_string_size| and |pool_size|.
+
+ Once a sequence of characters has been appended to |cur_string|, it officially becomes a string
+ when the function |make_string| is called. This function returns the identification number of
+ the new string as its value.
+
+ Strings end with a zero character which makes \TEX\ string also valid \CCODE\ strings. The
+ |string_temp*| fields deal with a temporary string (building).
+
+ The |ptr| is always one ahead. This is kind of a safeguard: an overflow happens already when we
+ still assemble a new string.
+
+*/
+
+# define initial_temp_string_slots 256
+# define reserved_temp_string_slots 2
+
+static inline void tex_aux_increment_pool_string(int n)
+{
+ lmt_string_pool_state.string_body_data.allocated += n;
+ if (lmt_string_pool_state.string_body_data.allocated > lmt_string_pool_state.string_body_data.size) {
+ tex_overflow_error("poolbody", lmt_string_pool_state.string_body_data.allocated);
+ }
+}
+
+static inline void tex_aux_decrement_pool_string(int n)
+{
+ lmt_string_pool_state.string_body_data.allocated -= n;
+}
+
+static void tex_aux_flush_cur_string(void)
+{
+ if (lmt_string_pool_state.string_temp) {
+ aux_deallocate_array(lmt_string_pool_state.string_temp);
+ }
+ lmt_string_pool_state.string_temp = NULL;
+ lmt_string_pool_state.string_temp_top = 0;
+ lmt_string_pool_state.string_temp_allocated = 0;
+}
+
+void tex_reset_cur_string(void)
+{
+ unsigned char *tmp = aux_allocate_clear_array(sizeof(unsigned char), initial_temp_string_slots, reserved_temp_string_slots);
+ if (tmp) {
+ lmt_string_pool_state.string_temp = tmp;
+ lmt_string_pool_state.string_temp_top = 0;
+ lmt_string_pool_state.string_temp_allocated = initial_temp_string_slots;
+ } else {
+ tex_overflow_error("pool", initial_temp_string_slots);
+ }
+}
+
+static int tex_aux_room_in_string(int wsize)
+{
+ /* no callback here */
+ if (! lmt_string_pool_state.string_temp) {
+ tex_reset_cur_string();
+ }
+ if ((lmt_string_pool_state.string_temp_top + wsize) > lmt_string_pool_state.string_temp_allocated) {
+ unsigned char *tmp = NULL;
+ int size = lmt_string_pool_state.string_temp_allocated + lmt_string_pool_state.string_temp_allocated / 5 + STRING_EXTRA_AMOUNT;
+ if (size < wsize) {
+ size = wsize + STRING_EXTRA_AMOUNT;
+ }
+ tmp = aux_reallocate_array(lmt_string_pool_state.string_temp, sizeof(unsigned char), size, reserved_temp_string_slots);
+ if (tmp) {
+ lmt_string_pool_state.string_temp = tmp;
+ memset(tmp + lmt_string_pool_state.string_temp_top, 0, (size_t) size - lmt_string_pool_state.string_temp_top);
+ } else {
+ tex_overflow_error("pool", size);
+ }
+ lmt_string_pool_state.string_temp_allocated = size;
+ }
+ return 1;
+}
+
+# define reserved_string_slots 1
+
+/*tex Messy: ptr and top have cs_offset_value included */
+
+void tex_initialize_string_mem(void)
+{
+ int size = lmt_string_pool_state.string_pool_data.minimum;
+ if (lmt_main_state.run_state == initializing_state) {
+ size = lmt_string_pool_state.string_pool_data.minimum;
+ lmt_string_pool_state.string_pool_data.ptr = cs_offset_value;
+ } else {
+ size = lmt_string_pool_state.string_pool_data.allocated;
+ lmt_string_pool_state.string_pool_data.initial = lmt_string_pool_state.string_pool_data.ptr;
+ }
+ if (size > 0) {
+ lstring *pool = aux_allocate_clear_array(sizeof(lstring), size, reserved_string_slots);
+ if (pool) {
+ lmt_string_pool_state.string_pool = pool;
+ lmt_string_pool_state.string_pool_data.allocated = size;
+ } else {
+ tex_overflow_error("pool", size);
+ }
+ }
+}
+
+void tex_initialize_string_pool(void)
+{
+ unsigned char *nullstring = lmt_memory_malloc(1);
+ int size = lmt_string_pool_state.string_pool_data.allocated;
+ if (size && nullstring) {
+ lmt_string_pool_state.string_pool[0].s = nullstring;
+ nullstring[0] = '\0';
+ lmt_string_pool_state.string_pool_data.ptr += 1;
+ tex_reset_cur_string();
+ } else {
+ tex_overflow_error("pool", size);
+ }
+}
+
+static int tex_aux_room_in_string_pool(int n)
+{
+ int top = lmt_string_pool_state.string_pool_data.ptr + n;
+ if (top > lmt_string_pool_state.string_pool_data.top) {
+ lmt_string_pool_state.string_pool_data.top = top;
+ top -= cs_offset_value;
+ if (top > lmt_string_pool_state.string_pool_data.allocated) {
+ lstring *tmp = NULL;
+ top = lmt_string_pool_state.string_pool_data.allocated;
+ do {
+ top += lmt_string_pool_state.string_pool_data.step;
+ n -= lmt_string_pool_state.string_pool_data.step;
+ } while (n > 0);
+ if (top > lmt_string_pool_state.string_pool_data.size) {
+ top = lmt_string_pool_state.string_pool_data.size;
+ }
+ if (top > lmt_string_pool_state.string_pool_data.allocated) {
+ lmt_string_pool_state.string_pool_data.allocated = top;
+ tmp = aux_reallocate_array(lmt_string_pool_state.string_pool, sizeof(lstring), top, reserved_string_slots);
+ lmt_string_pool_state.string_pool = tmp;
+ }
+ lmt_run_memory_callback("pool", tmp ? 1 : 0);
+ if (! tmp) {
+ tex_overflow_error("pool", top);
+ return 0;
+ }
+ }
+ }
+ return 1;
+}
+
+/*tex
+
+ Checking for the last one to be the same as the previous one doesn't save much some 10K on a
+ \CONTEXT\ format.
+
+*/
+
+strnumber tex_make_string(void)
+{
+ if (tex_aux_room_in_string(1)) {
+ int ptr = lmt_string_pool_state.string_pool_data.ptr;
+ lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top] = '\0';
+ str_string(ptr) = lmt_string_pool_state.string_temp;
+ str_length(ptr) = lmt_string_pool_state.string_temp_top;
+ tex_aux_increment_pool_string(lmt_string_pool_state.string_temp_top);
+ tex_reset_cur_string();
+ if (tex_aux_room_in_string_pool(1)) {
+ lmt_string_pool_state.string_pool_data.ptr++;
+ }
+ return ptr;
+ } else {
+ return get_nullstr();
+ }
+}
+
+strnumber tex_push_string(const unsigned char *s, int l)
+{
+ if (tex_aux_room_in_string_pool(1)) {
+ unsigned char *t = lmt_memory_malloc(sizeof(char) * ((size_t) l + 1));
+ if (t) {
+ int ptr = lmt_string_pool_state.string_pool_data.ptr;
+ memcpy(t, s, l);
+ t[l] = '\0';
+ str_string(ptr) = t;
+ str_length(ptr) = l;
+ lmt_string_pool_state.string_pool_data.ptr++;
+ tex_aux_increment_pool_string(l);
+ return ptr;
+ }
+ }
+ return get_nullstr();
+}
+
+char *tex_take_string(int *len)
+{
+ char* ptr = NULL;
+ if (tex_aux_room_in_string(1)) {
+ lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top] = '\0';
+ if (len) {
+ *len = lmt_string_pool_state.string_temp_top;
+ }
+ ptr = (char *) lmt_string_pool_state.string_temp;
+ tex_reset_cur_string();
+ }
+ return ptr;
+}
+
+/*tex
+
+ The following subroutine compares string |s| with another string of the same length that appears
+ in |buffer| starting at position |k|; the result is |true| if and only if the strings are equal.
+ Empirical tests indicate that |str_eq_buf| is used in such a way that it tends to return |true|
+ about 80 percent of the time.
+
+ \startyping
+ unsigned char *j = str_string(s);
+ unsigned char *l = j + str_length(s);
+ while (j < l) {
+ if (*j++ != buffer[k++])
+ return 0;
+ }
+ \stoptyping
+
+*/
+
+int tex_str_eq_buf(strnumber s, int k, int n)
+{
+ if (s < cs_offset_value) {
+ return buffer_to_unichar(k) == (unsigned int) s;
+ } else {
+ return memcmp(str_string(s), &lmt_fileio_state.io_buffer[k], n) == 0;
+ }
+}
+
+/*tex
+
+ Here is a similar routine, but it compares two strings in the string pool, and it does not
+ assume that they have the same length.
+
+ \starttyping
+ k = str_string(t);
+ j = str_string(s);
+ l = j + str_length(s);
+ while (j < l) {
+ if (*j++ != *k++)
+ return 0;
+ }
+ \stoptyping
+*/
+
+int tex_str_eq_str(strnumber s, strnumber t)
+{
+ if (s >= cs_offset_value) {
+ if (t >= cs_offset_value) {
+ /* s and t are strings, this is the most likely test */
+ return (str_length(s) == str_length(t)) && ! memcmp(str_string(s), str_string(t), str_length(s));
+ } else {
+ /* s is a string and t an unicode character, happens seldom */
+ return (strnumber) aux_str2uni(str_string(s)) == t;
+ }
+ } else if (t >= cs_offset_value) {
+ /* s is an unicode character and t is a string, happens seldom */
+ return (strnumber) aux_str2uni(str_string(t)) == s;
+ } else {
+ /* s and t are unicode characters */
+ return s == t;
+ }
+}
+
+/*tex A string compare helper: */
+
+int tex_str_eq_cstr(strnumber r, const char *s, size_t l)
+{
+ return (l == str_length(r)) && ! strncmp((const char *) (str_string(r)), s, l);
+}
+
+/*tex
+
+ The initial values of |str_pool|, |str_start|, |pool_ptr|, and |str_ptr| are computed set in
+ \INITEX\ mode. The first |string_offset| strings are single characters strings matching Unicode.
+ There is no point in generating all of these. But |str_ptr| has initialized properly, otherwise
+ |print_char| cannot see the difference between characters and strings.
+
+*/
+
+int tex_get_strings_started(void)
+{
+ tex_reset_cur_string();
+ return 1;
+}
+
+/*tex
+
+ The string recycling routines. \TEX\ uses 2 upto 4 {\em new} strings when scanning a filename
+ in an |\input|, |\openin|, or |\openout| operation. These strings are normally lost because the
+ reference to them are not saved after finishing the operation. |search_string| searches through
+ the string pool for the given string and returns either 0 or the found string number. However,
+ in \LUAMETATEX\ filenames (and fontnames) are implemented more efficiently so that code is gone.
+
+*/
+
+strnumber tex_maketexstring(const char *s)
+{
+ if (s && *s) {
+ return tex_maketexlstring(s, strlen(s));
+ } else {
+ return get_nullstr();
+ }
+}
+
+strnumber tex_maketexlstring(const char *s, size_t l)
+{
+ if (s && l > 0) {
+ int ptr = lmt_string_pool_state.string_pool_data.ptr;
+ size_t len = l + 1;
+ unsigned char *tmp = lmt_memory_malloc(len);
+ if (tmp) {
+ str_length(ptr) = l;
+ str_string(ptr) = tmp;
+ tex_aux_increment_pool_string((int) l);
+ memcpy(tmp, s, len);
+ if (tex_aux_room_in_string_pool(1)) {
+ lmt_string_pool_state.string_pool_data.ptr += 1;
+ }
+ return ptr;
+ } else {
+ tex_overflow_error("string pool", (int) len);
+ }
+ }
+ return get_nullstr();
+}
+
+/*tex
+ These two functions appends bytes to the current \TEX\ string. There is no checking on what
+ gets appended nd as in \LUA\ zero bytes are okay. Unlike the other engines we don't provide
+ |^^| escaping, which is already optional in \LUATEX.
+*/
+
+void tex_append_string(const unsigned char *s, unsigned l)
+{
+ if (s && l > 0 && tex_aux_room_in_string(l)) {
+ memcpy(lmt_string_pool_state.string_temp + lmt_string_pool_state.string_temp_top, s, l);
+ lmt_string_pool_state.string_temp_top += l;
+ }
+}
+
+void tex_append_char(unsigned char c)
+{
+ if (tex_aux_room_in_string(1)) {
+ lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top++] = (unsigned char) c;
+ }
+}
+
+char *tex_makeclstring(int s, size_t *len)
+{
+ if (s < cs_offset_value) {
+ *len = (size_t) utf8_size(s);
+ return (char *) aux_uni2str((unsigned) s);
+ } else {
+ size_t l = (size_t) str_length(s);
+ char *tmp = lmt_memory_malloc(l + 1);
+ if (tmp) {
+ memcpy(tmp, str_string(s), l);
+ tmp[l] = '\0';
+ *len = l;
+ return tmp;
+ } else {
+ tex_overflow_error("string pool", (int) l);
+ *len = 0;
+ return NULL;
+ }
+ }
+}
+
+char *tex_makecstring(int s)
+{
+ if (s < cs_offset_value) {
+ return (char *) aux_uni2str((unsigned) s);
+ } else {
+ return lmt_memory_strdup((str_length(s) > 0) ? (const char *) str_string(s) : "");
+ }
+}
+
+/*tex
+
+ We can save some 150 K on the format file size by using a signed char as length (after checking)
+ because the max size of a string in \CONTEXT\ is around 70. A flag could indicate if we use 1 or
+ 4 bytes for the length. But not yet (preroll needed). Dumping and undumping all strings in a
+ block (where we need to zero terminate them) doesn't really work out any better. Okay, in the end
+ it was done.
+
+*/
+
+/*tex We use the real accessors here, not the macros that use |cs_offset_value|. */
+
+void tex_compact_string_pool(void)
+{
+ int n_of_strings = lmt_string_pool_state.string_pool_data.ptr - cs_offset_value;
+ int max_length = 0;
+ for (int j = 1; j < n_of_strings; j++) {
+ if (lmt_string_pool_state.string_pool[j].l > (unsigned int) max_length) {
+ max_length = (int) lmt_string_pool_state.string_pool[j].l;
+ }
+ }
+ lmt_string_pool_state.string_max_length = max_length;
+ tex_print_format("max string length %i, ", max_length);
+}
+
+void tex_dump_string_pool(dumpstream f)
+{
+ int n_of_strings = lmt_string_pool_state.string_pool_data.ptr - cs_offset_value;
+ int total_length = lmt_string_pool_state.string_body_data.allocated;
+ int max_length = lmt_string_pool_state.string_max_length;
+ dump_via_int(f, lmt_string_pool_state.string_pool_data.allocated);
+ dump_via_int(f, lmt_string_pool_state.string_pool_data.top); /* includes cs_offset_value */
+ dump_via_int(f, lmt_string_pool_state.string_pool_data.ptr); /* includes cs_offset_value */
+ dump_via_int(f, n_of_strings);
+ dump_via_int(f, max_length);
+ dump_via_int(f, total_length);
+ if (max_length > 0 && max_length < 126) {
+ /*tex We only have short strings. */
+ for (int j = 0; j < n_of_strings; j++) {
+ int l = (int) lmt_string_pool_state.string_pool[j].l;
+ char c;
+ if (! lmt_string_pool_state.string_pool[j].s) {
+ l = -1;
+ }
+ c = (char) l;
+ dump_things(f, c, 1);
+ if (l > 0) {
+ dump_things(f, *lmt_string_pool_state.string_pool[j].s, l);
+ }
+ }
+ } else {
+ /*tex We also have long strings. */
+ for (int j = 0; j < n_of_strings; j++) {
+ int l = (int) lmt_string_pool_state.string_pool[j].l;
+ if (! lmt_string_pool_state.string_pool[j].s) {
+ l = -1;
+ }
+ dump_int(f, l);
+ if (l > 0) {
+ dump_things(f, *lmt_string_pool_state.string_pool[j].s, l);
+ }
+ }
+ }
+}
+
+void tex_undump_string_pool(dumpstream f)
+{
+ int n_of_strings;
+ int max_length;
+ int total_length;
+ undump_int(f, lmt_string_pool_state.string_pool_data.allocated);
+ undump_int(f, lmt_string_pool_state.string_pool_data.top); /* includes cs_offset_value */
+ undump_int(f, lmt_string_pool_state.string_pool_data.ptr); /* includes cs_offset_value */
+ undump_int(f, n_of_strings);
+ undump_int(f, max_length);
+ undump_int(f, total_length);
+ lmt_string_pool_state.string_max_length = max_length;
+ tex_initialize_string_mem();
+ {
+ int a = 0;
+ int compact = max_length > 0 && max_length < 126;
+ for (int j = 0; j < n_of_strings; j++) {
+ int x;
+ if (compact) {
+ /*tex We only have short strings. */
+ char c;
+ undump_things(f, c, 1);
+ x = c;
+ } else {
+ /*tex We also have long strings. */
+ undump_int(f, x);
+ }
+ if (x >= 0) {
+ /* we can overflow reserved_string_slots */
+ int n = x + 1;
+ unsigned char *s = aux_allocate_clear_array(sizeof(unsigned char), n, reserved_string_slots);
+ if (s) {
+ lmt_string_pool_state.string_pool[j].s = s;
+ undump_things(f, s[0], x);
+ s[x] = '\0';
+ a += n;
+ } else {
+ tex_overflow_error("string pool", n);
+ x = 0;
+ }
+ } else {
+ x = 0;
+ }
+ lmt_string_pool_state.string_pool[j].l = x;
+ }
+ lmt_string_pool_state.string_body_data.allocated = a;
+ lmt_string_pool_state.string_body_data.initial = a;
+ }
+}
+
+/*tex To destroy an already made string, we say |flush_str|. */
+
+void tex_flush_str(strnumber s)
+{
+ if (s > cs_offset_value) {
+ /*tex Don't ever delete the null string! */
+ tex_aux_decrement_pool_string((int) str_length(s));
+ str_length(s) = 0;
+ lmt_memory_free(str_string(s));
+ str_string(s) = NULL;
+ // string_pool_state.string_pool_data.ptr--;
+ }
+ /* why a loop and not in previous branch */
+ while (! str_string((lmt_string_pool_state.string_pool_data.ptr - 1))) {
+ lmt_string_pool_state.string_pool_data.ptr--;
+ }
+}
+
+/*
+ In the old filename code we had the following, but I suspect some mem issue there (as we ran
+ into GB leaks for thousands of names):
+
+ u = save_cur_string();
+ get_x_token();
+ restore_cur_string(u);
+*/
+
+strnumber tex_save_cur_string(void)
+{
+ return (lmt_string_pool_state.string_temp_top > 0 ? tex_make_string() : 0);
+}
+
+void tex_restore_cur_string(strnumber u)
+{
+ if (u) {
+ /*tex Beware, we have no 0 termination here! */
+ int ul = (int) str_length(u);
+ tex_aux_flush_cur_string();
+ if (tex_aux_room_in_string(u)) {
+ memcpy(lmt_string_pool_state.string_temp, str_string(u), ul);
+ lmt_string_pool_state.string_temp_allocated = ul;
+ lmt_string_pool_state.string_temp_top = ul;
+ tex_flush_str(u);
+ }
+ }
+}