summaryrefslogtreecommitdiff
path: root/source/luametatex/source/tex/textoken.c
diff options
context:
space:
mode:
Diffstat (limited to 'source/luametatex/source/tex/textoken.c')
-rw-r--r--source/luametatex/source/tex/textoken.c3511
1 files changed, 3511 insertions, 0 deletions
diff --git a/source/luametatex/source/tex/textoken.c b/source/luametatex/source/tex/textoken.c
new file mode 100644
index 000000000..0d2415233
--- /dev/null
+++ b/source/luametatex/source/tex/textoken.c
@@ -0,0 +1,3511 @@
+/*
+ See license.txt in the root of this project.
+*/
+
+# include "luametatex.h"
+
+/*tex Todo: move some helpers to other places. */
+
+/*tex
+    Determine the category code of character |b| for the current input level. When the level's
+    |cattable| equals the preset value the table in |cat_code_table_par| is consulted; when it is
+    larger than |-0xFF| it is used as the table itself; otherwise no table is consulted and the
+    result is computed as |- cattable - 0xFF|.
+*/
+
+inline static int tex_aux_the_cat_code(halfword b)
+{
+    int regime = lmt_input_state.cur_input.cattable;
+    if (regime == default_catcode_table_preset) {
+        return tex_get_cat_code(cat_code_table_par, b);
+    }
+    if (regime > -0xFF) {
+        return tex_get_cat_code(regime, b);
+    }
+    return - regime - 0xFF;
+}
+
+/*tex
+
+ The \TEX\ system does nearly all of its own memory allocation, so that it can readily be
+ transported into environments that do not have automatic facilities for strings, garbage
+ collection, etc., and so that it can be in control of what error messages the user receives.
+ The dynamic storage requirements of \TEX\ are handled by providing two large arrays called
+ |fixmem| and |varmem| in which consecutive blocks of words are used as nodes by the \TEX\
+ routines.
+
+ Pointer variables are indices into this array, or into another array called |eqtb| that
+ will be explained later. A pointer variable might also be a special flag that lies outside
+ the bounds of |mem|, so we allow pointers to assume any |halfword| value. The minimum
+ halfword value represents a null pointer. \TEX\ does not assume that |mem[null]| exists.
+
+ Locations in |fixmem| are used for storing one-word records; a conventional |AVAIL| stack is
+ used for allocation in this array.
+
+ One can make an argument to switch to standard \CCODE\ allocation but the current approach is
+ very efficient in memory usage and performance so we stay with it. On the average memory
+ consumption of \TEX\ is not that large, definitely not compared to other programs that deal
+ with text.
+
+ The big dynamic storage area is named |fixmem| where the smallest location of one|-|word
+ memory in use is |fix_mem_min| and the largest location of one|-|word memory in use is
+ |fix_mem_max|.
+
+ The |dyn_used| variable keeps track of how much memory is in use. The head of the list of
+ available one|-|word nodes is registered in |avail|. The last one|-|word node used in |mem|
+ is |fix_mem_end|.
+
+ All these variables are packed in the structure |token_memory_state|.
+
+*/
+
+/*tex
+    The one and only token memory state. The |tokens| array is allocated later on by
+    |tex_initialize_token_mem|, so everything starts out empty here.
+*/
+
+token_memory_state_info lmt_token_memory_state = {
+ .tokens = NULL, /* set by |tex_initialize_token_mem|, replaced when the array grows */
+ .tokens_data = {
+ .minimum = min_token_size, /* size used when we start in initializing state */
+ .maximum = max_token_size,
+ .size = siz_token_size, /* growing beyond this triggers an overflow error */
+ .step = stp_token_size, /* number of slots added per bump */
+ .allocated = 0, /* current number of slots (excluding the reserved ones) */
+ .itemsize = sizeof(memoryword),
+ .top = 0, /* highest slot index handed out so far */
+ .ptr = 0, /* used to register usage */
+ .initial = 0, /* copy of |ptr| taken at startup when not initializing */
+ .offset = 0,
+ },
+ .available = 0, /* head of the linked list of freed tokens */
+ .padding = 0,
+};
+
+/*tex
+
+ Token data has its own memory space. Again we have some state variables: |temp_token_head| is
+ the head of a (temporary) list of some kind as are |hold_token_head| and |omit_template|. A
+ permanently empty list is available in |null_list| and the head of the token list built by
+ |scan_keyword| is registered in |backup_head|. All these variables are packed in the structure
+ |token_data| but some have been moved to a more relevant state (so omit and hold are now in the
+ alignment state).
+
+*/
+
+/*tex
+    The one and only token state. Only the fields that are touched in the code nearby are
+    annotated here.
+*/
+
+token_state_info lmt_token_state = {
+ .null_list = null, /* gets a freshly allocated token in |tex_initialize_tokens| */
+ .in_lua_escape = 0, /* reset in |tex_initialize_tokens|, checked when parameter characters are shown */
+ .force_eof = 0,
+ .luacstrings = 0,
+ .par_loc = null,
+ .par_token = null,
+ /* .line_par_loc = null, */ /* removed because not really used and useful */
+ /* .line_par_token = null, */ /* idem */
+ .buffer = NULL,
+ .bufloc = 0,
+ .bufmax = 0,
+ .padding = 0,
+};
+
+/*tex Some properties are dumped in the format so these are set already! */
+
+# define reserved_token_mem_slots 2 // play safe for slight overruns
+
+/*tex
+    Allocate the (cleared) token array. In initializing state we start with the configured
+    minimum, otherwise we honour the |allocated| value that is already present and remember the
+    current usage in |initial|. A failed (or zero sized) allocation is reported as overflow.
+*/
+
+void tex_initialize_token_mem(void)
+{
+    int wanted = 0;
+    memoryword *fresh = NULL;
+    if (lmt_main_state.run_state != initializing_state) {
+        wanted = lmt_token_memory_state.tokens_data.allocated;
+        lmt_token_memory_state.tokens_data.initial = lmt_token_memory_state.tokens_data.ptr;
+    } else {
+        wanted = lmt_token_memory_state.tokens_data.minimum;
+    }
+    if (wanted > 0) {
+        fresh = aux_allocate_clear_array(sizeof(memoryword), wanted, reserved_token_mem_slots);
+    }
+    if (! fresh) {
+        tex_overflow_error("tokens", wanted);
+    } else {
+        lmt_token_memory_state.tokens = fresh;
+        lmt_token_memory_state.tokens_data.allocated = wanted;
+    }
+}
+
+/*tex
+    Grow the token array by one |step|. When the new size would exceed the configured |size| or
+    when reallocation fails, the memory callback is told about the failure (second argument
+    zero), the runaway text is shown and an overflow error is raised. On success the callback is
+    called with one, the new array is installed, the added region is wiped and |allocated| is
+    updated.
+*/
+
+static void tex_aux_bump_token_memory(void)
+{
+ /*tex We need to manage the big dynamic storage area. */
+ int size = lmt_token_memory_state.tokens_data.allocated + lmt_token_memory_state.tokens_data.step;
+ if (size > lmt_token_memory_state.tokens_data.size) {
+ /*tex We hit the configured upper limit, so we don't even try to reallocate. */
+ lmt_run_memory_callback("token", 0);
+ tex_show_runaway();
+ tex_overflow_error("token memory size", lmt_token_memory_state.tokens_data.allocated);
+ } else {
+ memoryword *tokens = aux_reallocate_array(lmt_token_memory_state.tokens, sizeof(memoryword), size, reserved_token_mem_slots);
+ lmt_run_memory_callback("token", tokens ? 1 : 0);
+ if (tokens) {
+ lmt_token_memory_state.tokens = tokens;
+ } else {
+ /*tex If memory is exhausted, display possible runaway text. */
+ tex_show_runaway();
+ tex_overflow_error("token memory size", lmt_token_memory_state.tokens_data.allocated);
+ }
+ }
+ /*tex
+ Wipe |step + reserved_token_mem_slots| words starting at slot |allocated + 1|. NOTE(review):
+ this statement is also reached after both error branches above, so it presumably relies on
+ |tex_overflow_error| not returning; it also relies on how the allocation helpers size the
+ array for the given count plus reserve. Both need confirmation.
+ */
+ memset((void *) (lmt_token_memory_state.tokens + lmt_token_memory_state.tokens_data.allocated + 1), 0, ((size_t) lmt_token_memory_state.tokens_data.step + reserved_token_mem_slots) * sizeof(memoryword));
+ lmt_token_memory_state.tokens_data.allocated = size;
+}
+
+/*tex
+    Start with a clean slate: no freed tokens, nothing handed out yet, not in a \LUA\ escape. The
+    very first token that gets allocated serves as the permanently empty |null_list|.
+*/
+
+void tex_initialize_tokens(void)
+{
+    lmt_token_state.in_lua_escape = 0;
+    lmt_token_memory_state.tokens_data.top = 0;
+    lmt_token_memory_state.available = null;
+    /*tex This has to come after the resets above because it allocates from the pool. */
+    lmt_token_state.null_list = tex_get_available_token(null);
+}
+
+/*tex
+ Experiment. It saves some 512K on the \CONTEXT\ format of October 2020. It makes me wonder if I
+ should spend some time on optimizing token lists (kind of cisc commands as we're currently kind
+ of risc).
+*/
+
+/*tex
+    Copy all token lists that are reachable from the equivalents table (macros and token list
+    references) into a fresh array, in sequence, and make the equivalents point to the new
+    locations. The |mapper| array maps an old reference count slot onto its new slot (or is
+    negative when the list has not been copied yet) so that shared lists are copied only once.
+    Afterwards the old array is released, the fresh one takes its place, |top| and |ptr| are set
+    to the number of copied tokens and the list of freed tokens is emptied. The number of
+    \LUA\ call and value entries is only counted in order to be reported.
+
+    The |mapper| array is scratch memory: it is released before we return, and when one of the
+    two allocations fails the other one is released before the overflow is reported.
+*/
+
+void tex_compact_tokens(void)
+{
+    int nc = 0;
+    memoryword *target = aux_allocate_clear_array(sizeof(memoryword), (size_t) lmt_token_memory_state.tokens_data.allocated, 0);
+    halfword *mapper = aux_allocate_array(sizeof(halfword), (size_t) lmt_token_memory_state.tokens_data.allocated, 0);
+    int nofluacmds = 0;
+    if (target && mapper) {
+        /*tex All bytes set means that every entry is |-1|, so: not yet copied. */
+        memset((void *) mapper, -1, ((size_t) lmt_token_memory_state.tokens_data.allocated) * sizeof(halfword));
+        memoryword *tokens = lmt_token_memory_state.tokens;
+        /* also reset available */
+        for (int cs = 0; cs < (eqtb_size + lmt_hash_state.hash_data.ptr); cs++) {
+            switch (eq_type(cs)) {
+                case call_cmd:
+                case protected_call_cmd:
+                case semi_protected_call_cmd:
+                case tolerant_call_cmd:
+                case tolerant_protected_call_cmd:
+                case tolerant_semi_protected_call_cmd:
+                case internal_toks_reference_cmd:
+                case register_toks_reference_cmd:
+                    {
+                        halfword v = eq_value(cs); /* ref count token*/
+                        if (v) {
+                            if (mapper[v] < 0) {
+                                halfword t = v;
+                                nc++;
+                                mapper[v] = nc; /* new ref count token index */
+                                while (1) {
+                                    target[nc].half1 = tokens[t].half1; /* info cq. ref count */
+                                    t = tokens[t].half0;
+                                    if (t) {
+                                        nc++;
+                                        target[nc-1].half0 = nc; /* link to next */
+                                    } else {
+                                        target[nc].half0 = null; /* link to next */
+                                        break;
+                                    }
+                                }
+                            }
+                            eq_value(cs) = mapper[v];
+                        }
+                        break;
+                    }
+                case lua_value_cmd:
+                case lua_call_cmd:
+                case lua_local_call_cmd:
+                    {
+                        ++nofluacmds;
+                        break;
+                    }
+            }
+        }
+        tex_print_format("tokenlist compacted from %i to %i entries, ", lmt_token_memory_state.tokens_data.top, nc);
+        if (nofluacmds) {
+            /*tex
+                We just mention them because when these are aliased the macro package needs to make
+                sure that after loading that happens again because registered function references
+                can have changed between format generation and run!
+            */
+            tex_print_format("%i potentially aliased lua call/value entries, ", nofluacmds);
+        }
+        lmt_token_memory_state.tokens_data.top = nc;
+        lmt_token_memory_state.tokens_data.ptr = nc;
+        aux_deallocate_array(lmt_token_memory_state.tokens);
+        lmt_token_memory_state.tokens = target;
+        lmt_token_memory_state.available = null;
+        /*tex The mapping is no longer needed; not releasing it would leak it. */
+        aux_deallocate_array(mapper);
+    } else {
+        /*tex Don't leak the allocation that did succeed. */
+        if (target) {
+            aux_deallocate_array(target);
+        }
+        if (mapper) {
+            aux_deallocate_array(mapper);
+        }
+        tex_overflow_error("token compaction size", lmt_token_memory_state.tokens_data.allocated);
+    }
+}
+
+
+/*tex
+
+ The function |get_avail| returns a pointer (index) to a new one word node whose |link| field is
+ |null| (which is just 0). However, \TEX\ will halt if there is no more room left.
+
+ If the available space list is empty, i.e., if |avail = null|, we try first to increase
+ |fix_mem_end|. If that cannot be done, i.e., if |fix_mem_end = fix_mem_max|, we try to reallocate
+ array |fixmem|. If, that doesn't work, we have to quit. Users can configure \TEX\ to use a lot of
+ memory but in some scenarios limitations make sense.
+
+ Remark: we can have a pool of chunks where we get from or just allocate per token (as we have lots
+ of them that is slow). But then format loading becomes much slower as we need to recreate the
+ linked list. A no go. In today's terms \TEX\ memory usage is low anyway.
+
+ The freed tokens are kept in a linked list. First we check if we can quickly get one of these. If
+ that fails, we try to get one from the available pool. If that fails too, we enlarge the pool and
+ try again. We keep track of the used number of tokens. We also make sure that the tokens links to
+ nothing.
+
+ One problem is of course that tokens can be scattered over memory. We could have some sorter that
+ occasionally kicks in but it doesn't pay off. Normally definitions (in the format) are in sequence
+ but a normal run \unknown\ it would be interesting to know if this impacts the cache.
+
+*/
+
+/*tex
+    Return a token slot with |t| as info and a |null| link. A previously freed slot is preferred;
+    otherwise the next unused slot is taken, after enlarging the array when it is exhausted. The
+    usage counter |ptr| is incremented.
+*/
+
+halfword tex_get_available_token(halfword t)
+{
+    halfword slot = lmt_token_memory_state.available;
+    if (slot) {
+        /*tex Pop the head of the list of freed tokens. */
+        lmt_token_memory_state.available = token_link(slot);
+    } else {
+        if (lmt_token_memory_state.tokens_data.top >= lmt_token_memory_state.tokens_data.allocated) {
+            tex_aux_bump_token_memory();
+        }
+        slot = ++lmt_token_memory_state.tokens_data.top;
+    }
+    ++lmt_token_memory_state.tokens_data.ptr;
+    token_info(slot) = t;
+    token_link(slot) = null;
+    return slot;
+}
+
+/*tex
+
+ Because we only have forward links, a freed token ends up at the head of the list of available
+ tokens.
+
+*/
+
+/*tex Push token |p| onto the list of freed tokens and lower the usage counter. */
+
+void tex_put_available_token(halfword p)
+{
+    --lmt_token_memory_state.tokens_data.ptr;
+    token_link(p) = lmt_token_memory_state.available;
+    lmt_token_memory_state.available = p;
+}
+
+/*tex
+    Allocate a token with info |t|, hook it up as the successor of |p| and return the new token,
+    so that the caller can use it as the new tail.
+*/
+
+halfword tex_store_new_token(halfword p, halfword t)
+{
+    halfword fresh = tex_get_available_token(t);
+    token_link(p) = fresh;
+    return fresh;
+}
+
+/*tex
+
+ The procedure |flush_list (p)| frees an entire linked list of oneword nodes that starts at
+ position |p|. It makes list of single word nodes available. The second variant in principle
+ is faster but in practice this goes unnoticed. Of course there is a little price to pay for
+ keeping track of memory usage.
+
+*/
+
+/*tex
+    Release the whole list that starts at |head|: we walk to the last token while counting, lower
+    the usage counter by that count and prepend the list as a whole to the list of freed tokens.
+*/
+
+void tex_flush_token_list(halfword head)
+{
+    if (! head) {
+        return;
+    } else {
+        halfword last = head;
+        int count = 1;
+        while (token_link(last)) {
+            last = token_link(last);
+            ++count;
+        }
+        lmt_token_memory_state.tokens_data.ptr -= count;
+        token_link(last) = lmt_token_memory_state.available;
+        lmt_token_memory_state.available = head;
+    }
+}
+
+/*tex
+    A variant of flushing for when the caller already knows the |tail| and the number of tokens
+    |n|, so that no traversal is needed. Nothing happens when |head| is |null|.
+*/
+
+void tex_flush_token_list_head_tail(halfword head, halfword tail, int n)
+{
+    if (! head) {
+        return;
+    }
+    token_link(tail) = lmt_token_memory_state.available;
+    lmt_token_memory_state.available = head;
+    lmt_token_memory_state.tokens_data.ptr -= n;
+}
+
+/*tex
+    Register one more reference to the token list with reference count node |p|. Reaching
+    |max_token_reference| is reported as overflow.
+*/
+
+void tex_add_token_reference(halfword p)
+{
+    if (get_token_reference(p) >= max_token_reference) {
+        tex_overflow_error("reference count", max_token_reference);
+    } else {
+        add_token_reference(p);
+    }
+}
+
+/*tex
+    Register |n| more references to the token list with reference count node |p|, unless the
+    resulting count would reach |max_token_reference|, which is reported as overflow.
+*/
+
+void tex_increment_token_reference(halfword p, int n)
+{
+    if ((get_token_reference(p) + n) >= max_token_reference) {
+        tex_overflow_error("reference count", max_token_reference);
+    } else {
+        inc_token_reference(p,n);
+    }
+}
+
+/*tex
+    Drop one reference to the token list with reference count node |p| (which may be |null|).
+    A list whose count is already zero is flushed, otherwise the count is lowered.
+*/
+
+void tex_delete_token_reference(halfword p)
+{
+    if (! p) {
+        return;
+    }
+    if (! get_token_reference(p)) {
+        tex_flush_token_list(p);
+    } else {
+        sub_token_reference(p);
+    }
+}
+
+/*tex
+
+ A \TEX\ token is either a character or a control sequence, and it is represented internally in
+ one of two ways:
+
+ \startitemize[n]
+ \startitem
+ A character whose ASCII code number is |c| and whose command code is |m| is represented
+ as the number $2^{21}m+c$; the command code is in the range |1 <= m <= 14|.
+ \stopitem
+ \startitem
+ A control sequence whose |eqtb| address is |p| is represented as the number
+ |cs_token_flag+p|. Here |cs_token_flag = t =| $2^{25}-1$ is larger than $2^{21}m+c$, yet
+ it is small enough that |cs_token_flag + p < max_halfword|; thus, a token fits
+ comfortably in a halfword.
+ \stopitem
+ \stopitemize
+
+ A token |t| represents a |left_brace| command if and only if |t < left_brace_limit|; it
+ represents a |right_brace| command if and only if we have |left_brace_limit <= t <
+ right_brace_limit|; and it represents a |match| or |end_match| command if and only if
+ |match_token <= t <= end_match_token|. The following definitions take care of these
+ token-oriented constants and a few others.
+
+ A token list is a singly linked list of one-word nodes in |mem|, where each word contains a token
+ and a link. Macro definitions, output routine definitions, marks, |\write| texts, and a few other
+ things are remembered by \TEX\ in the form of token lists, usually preceded by a node with a
+ reference count in its |token_ref_count| field. The token stored in location |p| is called
+ |info(p)|.
+
+ Three special commands appear in the token lists of macro definitions. When |m = match|, it means
+ that \TEX\ should scan a parameter for the current macro; when |m = end_match|, it means that
+ parameter matching should end and \TEX\ should start reading the macro text; and when |m =
+ out_param|, it means that \TEX\ should insert parameter number |c| into the text at this point.
+
+ The enclosing |\char'173| and |\char'175| characters of a macro definition are omitted, but the
+ final right brace of an output routine is included at the end of its token list.
+
+ Here is an example macro definition that illustrates these conventions. After \TEX\ processes
+ the text:
+
+ \starttyping
+ \def\mac a#1#2 \b {#1\-a ##1#2 \#2\}
+ \stoptyping
+
+ The definition of |\mac| is represented as a token list containing:
+
+ \starttyping
+ (reference count) letter a match # match # spacer \b end_match
+ out_param1 \- letter a spacer, mac_param # other_char 1
+ out_param2 spacer out_param 2
+ \stoptyping
+
+ The procedure |scan_toks| builds such token lists, and |macro_call| does the parameter matching.
+
+ Examples such as |\def \m {\def \m {a} b}| explain why reference counts would be needed even if
+ \TEX\ had no |\let| operation: When the token list for |\m| is being read, the redefinition of
+ |\m| changes the |eqtb| entry before the token list has been fully consumed, so we dare not
+ simply destroy a token list when its control sequence is being redefined.
+
+ If the parameter-matching part of a definition ends with |#{}|, the corresponding token list
+ will have |{| just before the |end_match| and also at the very end. The first |{| is used to
+ delimit the parameter; the second one keeps the first from disappearing.
+
+ The |print_meaning| subroutine displays |cur_cmd| and |cur_chr| in symbolic form, including the
+ expansion of a macro or mark.
+
+*/
+
+/*tex
+    Show the meaning of the current token (|cur_cs|, |cur_cmd|, |cur_chr|). The |code| selects
+    the variant: for the three meaning codes the command flags are printed first (unless the
+    control sequence is flagged as untraced). Macros are shown as |macro:| followed by the token
+    list, or, for |meaning_asis_code|, as a definition with the body between braces. Marks show
+    their token list, untraced macros and \LUA\ calls only show their name, and everything else
+    is handled by |tex_print_cmd_chr|, followed by the token list when the command is not below
+    |call_cmd|.
+*/
+
+void tex_print_meaning(halfword code)
+{
+ /*tex
+
+ This would make sense but some macro packages don't like it:
+
+ \starttyping
+ if (cur_cmd == math_given_cmd) {
+ cur_cmd = math_xgiven_cmd ;
+ }
+ \stoptyping
+
+ Eventually we might just do it that way. We also can have |\meaningonly| that omits the
+ |macro:| and arguments.
+ */
+ int untraced = is_untraced(eq_flag(cur_cs));
+ if (! untraced) {
+ /*tex The full and asis variants ask for more detail, asis sets the last flag too. */
+ switch (code) {
+ case meaning_code:
+ case meaning_full_code:
+ case meaning_asis_code:
+ tex_print_cmd_flags(cur_cs, cur_cmd, (code == meaning_full_code || code == meaning_asis_code), code == meaning_asis_code);
+ break;
+ }
+ }
+ switch (cur_cmd) {
+ case call_cmd:
+ case protected_call_cmd:
+ case semi_protected_call_cmd:
+ case tolerant_call_cmd:
+ case tolerant_protected_call_cmd:
+ case tolerant_semi_protected_call_cmd:
+ if (untraced) {
+ /*tex An untraced macro only reveals its name. */
+ tex_print_cs(cur_cs);
+ return;
+ } else {
+ switch (code) {
+ case meaning_code:
+ case meaning_full_code:
+ tex_print_str("macro");
+ goto FOLLOWUP;
+ case meaning_asis_code:
+ // tex_print_format("%e%C %S ", def_cmd, def_code, cur_cs);
+ tex_print_cmd_chr(def_cmd, def_code);
+ tex_print_char(' ');
+ tex_print_cs(cur_cs);
+ tex_print_char(' ');
+ if (cur_chr && token_link(cur_chr)) {
+ /*tex
+ With parameters the preamble is shown first (in asis mode) and we get back where
+ the body starts; without parameters the body starts right after the reference
+ count.
+ */
+ halfword body = get_token_parameters(cur_chr) ? tex_show_token_list(token_link(cur_chr), null, default_token_show_max, 1) : token_link(cur_chr);
+ tex_print_char('{');
+ if (body) {
+ tex_show_token_list(body, null, default_token_show_max, 0);
+ }
+ tex_print_char('}');
+ }
+ return;
+ }
+ /*tex Any other |code| ends up here: the token list is shown without a prefix. */
+ goto DETAILS;
+ }
+ case get_mark_cmd:
+ tex_print_cmd_chr((singleword) cur_cmd, cur_chr);
+ tex_print_char(':');
+ tex_print_nlp();
+ tex_token_show(tex_get_some_mark(cur_chr, 0), default_token_show_max);
+ return;
+ case lua_value_cmd:
+ case lua_call_cmd:
+ case lua_local_call_cmd:
+ case lua_protected_call_cmd:
+ if (untraced) {
+ tex_print_cs(cur_cs);
+ return;
+ } else {
+ goto DEFAULT;
+ }
+ case if_test_cmd:
+ /*tex Tests beyond |last_if_test_code| are shown by name. */
+ if (cur_chr > last_if_test_code) {
+ tex_print_cs(cur_cs);
+ return;
+ } else {
+ goto DEFAULT;
+ }
+ default:
+ DEFAULT:
+ tex_print_cmd_chr((singleword) cur_cmd, cur_chr);
+ if (cur_cmd < call_cmd) {
+ return;
+ } else {
+ /* all kind of reference cmds */
+ break;
+ }
+ }
+ FOLLOWUP:
+ tex_print_char(':');
+ DETAILS:
+ tex_print_nlp();
+ tex_token_show(cur_chr, default_token_show_max);
+}
+
+/*tex
+
+ The procedure |show_token_list|, which prints a symbolic form of the token list that starts at
+ a given node |p|, illustrates these conventions. The token list being displayed should not begin
+ with a reference count. However, the procedure is intended to be robust, so that if the memory
+ links are awry or if |p| is not really a pointer to a token list, nothing catastrophic will
+ happen.
+
+ An additional parameter |q| is also given; this parameter is either null or it points to a node
+ in the token list where a certain magic computation takes place that will be explained later.
+ Basically, |q| is non-null when we are printing the two-line context information at the time of
+ an error message; |q| marks the place corresponding to where the second line should begin.
+
+ For example, if |p| points to the node containing the first |a| in the token list above, then
+ |show_token_list| will print the string
+
+ \starttyping
+ a#1#2 \b ->#1-a ##1#2 #2
+ \stoptyping
+
+ and if |q| points to the node containing the second |a|, the magic computation will be performed
+ just before the second |a| is printed.
+
+ The generation will stop, and |\ETC.| will be printed, if the length of printing exceeds a given
+ limit~|l|. Anomalous entries are printed in the form of control sequences that are not followed
+ by a blank space, e.g., |\BAD.|; this cannot be confused with actual control sequences because a
+ real control sequence named |BAD| would come out |\BAD |.
+
+ In \LUAMETATEX\ we have some more node types and token types so we also have additional tracing.
+ Because there is some more granularity in for instance nodes (subtypes) more detail is reported.
+
+*/
+
+/*tex
+    Map a command that has no character like representation onto a fixed string for display
+    purposes. For commands that are not listed a diagnostic with |cmd| and |chr| goes to the
+    standard output and the string |unknown| that the caller passed is returned. The diagnostic
+    now closes both brackets, like all the other markers here.
+*/
+
+static const char *tex_aux_special_cmd_string(halfword cmd, halfword chr, const char *unknown)
+{
+    switch (cmd) {
+        case node_cmd               : return "[[special cmd: node pointer]]";
+        case lua_protected_call_cmd : return "[[special cmd: lua protected call]]";
+        case lua_value_cmd          : return "[[special cmd: lua value call]]";
+        case iterator_value_cmd     : return "[[special cmd: iterator value]]";
+        case lua_call_cmd           : return "[[special cmd: lua call]]";
+        case lua_local_call_cmd     : return "[[special cmd: lua local call]]";
+        case begin_local_cmd        : return "[[special cmd: begin local call]]";
+        case end_local_cmd          : return "[[special cmd: end local call]]";
+     // case prefix_cmd             : return "[[special cmd: enforced]]";
+        case prefix_cmd             : return "\\always";
+        default                     : printf("[[unknown cmd: (%i,%i)]]\n", cmd, chr); return unknown;
+    }
+}
+
+/* NOTE(review): no use of this global is visible nearby; presumably a debugging leftover, verify before removing. */
+halfword nn = 0;
+
+/*tex
+    Print a symbolic form of the token list that starts at |p| (not at a reference count). When
+    we arrive at node |q| the trick count is set, which is used for showing context. The limit
+    |l| is currently not honoured: it is always replaced by |extreme_token_show_max|. In |asis|
+    mode we stop at the |end_match| token and return the node that follows it, so that the caller
+    can show the body separately.
+
+    The return value is the node where we stopped: |null| when the list is exhausted, when a
+    pointer is out of range, or when a parameter number is invalid; a non null value when the
+    limit was reached, in which case |\ETC.| has been printed.
+
+    A font token is shown by means of its own |chr|, not by the global |cur_val|, which is
+    whatever the last scan left behind and has nothing to do with the token at hand.
+*/
+
+halfword tex_show_token_list(halfword p, halfword q, int l, int asis)
+{
+    if (p) {
+        /*tex the highest parameter number, as an \ASCII\ digit */
+        unsigned char n = '0';
+        int min = 0;
+        int max = lmt_token_memory_state.tokens_data.top;
+        lmt_print_state.tally = 0;
+//      if (l <= 0) {
+            l = extreme_token_show_max;
+//      }
+        while (p && (lmt_print_state.tally < l)) {
+            if (p == q) {
+                /*tex Do magic computation. We only end up here in context showing. */
+                tex_set_trick_count();
+            }
+            /*tex Display token |p|, and |return| if there are problems. */
+            if (p < min || p > max) {
+                tex_print_str(error_string_clobbered(41));
+                return null;
+            } else if (token_info(p) >= cs_token_flag) {
+             // if (! ((print_state.inhibit_par_tokens) && (token_info(p) == token_state.par_token))) {
+                    tex_print_cs_checked(token_info(p) - cs_token_flag);
+             // }
+            } else if (token_info(p) < 0) {
+                tex_print_str(error_string_bad(42));
+            } else if (token_info(p) == 0) {
+                tex_print_str(error_string_bad(44));
+            } else {
+                int cmd = token_cmd(token_info(p));
+                int chr = token_chr(token_info(p));
+                /*
+                    Display the token (|cmd|,|chr|). The procedure usually \quote {learns} the character
+                    code used for macro parameters by seeing one in a |match| command before it runs
+                    into any |out_param| commands.
+                */
+                switch (cmd) {
+                    case left_brace_cmd:
+                    case right_brace_cmd:
+                    case math_shift_cmd:
+                    case alignment_tab_cmd:
+                    case superscript_cmd:
+                    case subscript_cmd:
+                    case spacer_cmd:
+                    case letter_cmd:
+                    case other_char_cmd:
+                    case ignore_cmd: /* new */
+                        tex_print_tex_str(chr);
+                        break;
+                    case parameter_cmd:
+                        /*tex The character is doubled, except in a \LUA\ escape or inside a csname. */
+                        if (! lmt_token_state.in_lua_escape && (lmt_expand_state.cs_name_level == 0)) {
+                            tex_print_tex_str(chr);
+                        }
+                        tex_print_tex_str(chr);
+                        break;
+                    case parameter_reference_cmd:
+                        tex_print_tex_str(match_visualizer);
+                        if (chr <= 9) {
+                            tex_print_char(chr + '0');
+                        } else {
+                            tex_print_char('!');
+                            return null;
+                        }
+                        break;
+                    case match_cmd:
+                        tex_print_char(match_visualizer);
+                        if (is_valid_match_ref(chr)) {
+                            ++n;
+                        }
+                        tex_print_char(chr ? chr : '0');
+                        if (n > '9') {
+                            /*tex Can this happen at all? */
+                            return null;
+                        } else {
+                            break;
+                        }
+                    case end_match_cmd:
+                        if (asis) {
+                            return token_link(p);
+                        } else if (chr == 0) {
+                            tex_print_str("->");
+                        }
+                        break;
+                    case ignore_something_cmd:
+                        break;
+                    case set_font_cmd:
+                        /*tex The font is identified by the token itself. */
+                        tex_print_format("[font->%s]", font_original(chr));
+                        break;
+                    case end_paragraph_cmd:
+                        tex_print_format("%e%s", "par ");
+                        break;
+                    default:
+                        tex_print_str(tex_aux_special_cmd_string(cmd, chr, error_string_bad(43)));
+                        break;
+                }
+            }
+            p = token_link(p);
+        }
+        if (p) {
+            tex_print_str_esc("ETC.");
+        }
+    }
+    return p;
+}
+
+/*
+# define do_buffer_to_unichar(a,b) do { \
+ a = (halfword)str2uni(fileio_state.io_buffer+b); \
+ b += utf8_size(a); \
+} while (0)
+*/
+
+/*tex
+    Fetch the character at position |*b| in the io buffer and advance |*b| past it. A first byte
+    up to and including |0x80| is returned as is and counts for one position; anything above is
+    decoded by |aux_str2uni| and the position advances by the \UTF-8 size of the result.
+*/
+
+inline halfword get_unichar_from_buffer(int *b)
+{
+    halfword first = (halfword) ((const unsigned char) *(lmt_fileio_state.io_buffer + *b));
+    if (first > 0x80) {
+        halfword decoded = (halfword) aux_str2uni(lmt_fileio_state.io_buffer + *b);
+        *b += utf8_size(decoded);
+        return decoded;
+    }
+    *b += 1;
+    return first;
+}
+
+/*tex
+
+ Here's the way we sometimes want to display a token list, given a pointer to its reference count;
+ the pointer may be null.
+
+*/
+
+/*tex
+    Show the token list whose reference count node is |p|. Nothing is shown when |p| is |null| or
+    when the list is empty.
+*/
+
+void tex_token_show(halfword p, int max)
+{
+    if (p) {
+        halfword first = token_link(p);
+        if (first) {
+            tex_show_token_list(first, null, max, 0);
+        }
+    }
+}
+
+/*tex
+
+ The next function, |delete_token_ref|, is called when a pointer to a token list's reference
+ count is being removed. This means that the token list should disappear if the reference count
+ was |null|, otherwise the count should be decreased by one. Variable |p| points to the reference
+ count of a token list that is losing one reference.
+
+*/
+
+/*tex The public way to get the category code of character |c| at the current input level. */
+
+int tex_get_char_cat_code(int c)
+{
+    int catcode = tex_aux_the_cat_code(c);
+    return catcode;
+}
+
+static void tex_aux_invalid_character_error(void)
+{
+ tex_handle_error(
+ normal_error_type,
+ "Text line contains an invalid character",
+ "A funny symbol that I can't read has just been input. Continue, and I'll forget\n"
+ "that it ever happened."
+ );
+}
+
+/*tex Forward declarations of helpers that are defined further on in this file. */
+
+static int tex_aux_process_sup_mark(void);
+
+static int tex_aux_scan_control_sequence(void);
+
+/*tex The possible outcomes of |tex_aux_next_line|. */
+
+typedef enum next_line_retval {
+ next_line_ok,
+ next_line_return,
+ next_line_restart
+} next_line_retval;
+
+static next_line_retval tex_aux_next_line(void);
+
+/*tex
+
+ In case you are getting bored, here is a slightly less trivial routine: Given a string of
+ lowercase letters, like |pt| or |plus| or |width|, the |scan_keyword| routine checks to see
+ whether the next tokens of input match this string. The match must be exact, except that
+ uppercase letters will match their lowercase counterparts; uppercase equivalents are determined
+ by subtracting |"a" - "A"|, rather than using the |uc_code| table, since \TEX\ uses this
+ routine only for its own limited set of keywords.
+
+ If a match is found, the characters are effectively removed from the input and |true| is
+ returned. Otherwise |false| is returned, and the input is left essentially unchanged (except
+ for the fact that some macros may have been expanded, etc.).
+
+ In \LUATEX\ and its follow up we have more keywords and for instance when scanning a box
+ specification that is noticeable because the |scan_keyword| function is a little inefficient
+ in the sense that when there is no match, it will push back what got read so far. So there is
+ token allocation, pushing a level etc involved. Keep in mind that expansion happens here so what
+ gets pushing back is not always literally pushing back what we started with.
+
+ In \LUAMETATEX\ we now have a bit different approach. The |scan_mandate_keyword| follows up on
+ |scan_character| so we have a two step approach. We could actually pass a list of valid keywords
+ but that would make for a complex function with no real benefits.
+
+*/
+
+/*tex
+    Get the next (expanded) token and check if it is one of the characters in |s|. Spaces are
+    skipped when |skip_space| is set and |\relax| is skipped when |skip_relax| is set. When
+    |left_brace| is set a left brace is accepted too, in which case |'{'| is returned.
+
+    On a match the character is returned and the token is consumed. Otherwise the token is
+    pushed back and zero is returned. In both cases |cur_cs| is restored.
+
+    A null character never matches: |strchr| also finds the terminating zero of |s|, which would
+    make us return zero (no match) without pushing the token back, so we exclude that case.
+*/
+
+halfword tex_scan_character(const char *s, int left_brace, int skip_space, int skip_relax)
+{
+    halfword save_cur_cs = cur_cs;
+    while (1) {
+        tex_get_x_token();
+        switch (cur_cmd) {
+            case spacer_cmd:
+                if (skip_space) {
+                    break;
+                } else {
+                    goto DONE;
+                }
+            case relax_cmd:
+                if (skip_relax) {
+                    break;
+                } else {
+                    goto DONE;
+                }
+            case letter_cmd:
+            case other_char_cmd:
+                if (cur_chr > 0 && cur_chr <= 'z' && strchr(s, cur_chr)) {
+                    cur_cs = save_cur_cs;
+                    return cur_chr;
+                } else {
+                    goto DONE;
+                }
+            case left_brace_cmd:
+                if (left_brace) {
+                    cur_cs = save_cur_cs;
+                    return '{';
+                } else {
+                    goto DONE;
+                }
+            default:
+                goto DONE;
+        }
+    }
+  DONE:
+    tex_back_input(cur_tok);
+    cur_cs = save_cur_cs;
+    return 0;
+}
+
+/*tex Report that a keyword was started but not completed; |s| is the keyword we expected. */
+
+void tex_aux_show_keyword_error(const char *s)
+{
+    const char *help =
+        "You started a keyword but it seems to be an invalid one. The first character(s)\n"
+        "might give you a clue. You might want to quit unwanted lookahead with \\relax.";
+    tex_handle_error(normal_error_type, "Valid keyword expected, likely '%s'", s, help);
+}
+
+/*tex
+ Scanning an optional keyword starts at the beginning. This means that we can also (for instance)
+ have a minus or plus sign which means that we have a different loop than with the alternative
+ that already checked the first character.
+*/
+
+/*tex
+    Check if the keyword |s| comes next, where we start at its first character. Leading spaces
+    are skipped. The match is case insensitive in the sense that the uppercase variant of a
+    character in |s| is accepted too.
+
+    The result is one when the complete keyword has been seen. When the first token that matters
+    is not the first character of the keyword, that token is pushed back and the result is zero,
+    no matter if it is a character (like a sign or digit) or something else. When the keyword
+    has been started but is not completed an error is reported and the result is zero as well.
+    In all cases |cur_cs| is restored.
+*/
+
+int tex_scan_optional_keyword(const char *s)
+{
+    halfword save_cur_cs = cur_cs;
+    int done = 0;
+    const char *p = s;
+    while (*p) {
+        tex_get_x_token();
+        switch (cur_cmd) {
+            case letter_cmd:
+            case other_char_cmd:
+                if ((cur_chr == *p) || (cur_chr == *p - 'a' + 'A')) {
+                    if (*(++p)) {
+                        done = 1;
+                    } else {
+                        cur_cs = save_cur_cs;
+                        return 1;
+                    }
+                } else if (done) {
+                    goto BAD_NEWS;
+                } else {
+                    /*tex
+                        This can be a minus or so, as in |\advance\foo -10|: there is no keyword,
+                        which is fine, but then we should not claim that we found one.
+                    */
+                    tex_back_input(cur_tok);
+                    cur_cs = save_cur_cs;
+                    return 0;
+                }
+                break;
+            case spacer_cmd: /* normally spaces are not pushed back */
+                if (done) {
+                    goto BAD_NEWS;
+                } else {
+                    break;
+                }
+            default:
+                tex_back_input(cur_tok);
+                if (done) {
+                    /* unless we accept partial keywords */
+                    goto BAD_NEWS;
+                } else {
+                    cur_cs = save_cur_cs;
+                    return 0;
+                }
+        }
+    }
+  BAD_NEWS:
+    tex_aux_show_keyword_error(s);
+    cur_cs = save_cur_cs;
+    return 0;
+}
+
+/*tex
+ Here we know that the first character(s) matched so we are in the middle of a keyword already
+ which means a different loop than the previous one.
+*/
+
+/*tex
+    Check if the remainder of keyword |s|, starting at |offset|, comes next. One is returned when
+    all remaining characters have been seen (the uppercase variant of a character is accepted
+    too). In all other cases an error is reported and zero is returned. A space is skipped as
+    long as no character has been matched here; a token that is neither a character nor a space
+    is pushed back before the error is reported. In all cases |cur_cs| is restored.
+*/
+
+int tex_scan_mandate_keyword(const char *s, int offset)
+{
+    halfword saved_cs = cur_cs;
+    int partial = 0;
+    /*tex The offset is always positive so there is no issue with signs. */
+    const char *cursor = s + offset;
+    while (*cursor) {
+        tex_get_x_token();
+        if (cur_cmd == letter_cmd || cur_cmd == other_char_cmd) {
+            if ((cur_chr != *cursor) && (cur_chr != *cursor - 'a' + 'A')) {
+                break;
+            }
+            ++cursor;
+            if (! *cursor) {
+                cur_cs = saved_cs;
+                return 1;
+            }
+            partial = 1;
+        } else if (cur_cmd == spacer_cmd) {
+            /*tex Normally spaces are not pushed back. */
+            if (partial) {
+                break;
+            }
+        } else {
+            /*tex We don't accept partial keywords. */
+            tex_back_input(cur_tok);
+            break;
+        }
+    }
+    tex_aux_show_keyword_error(s);
+    cur_cs = saved_cs;
+    return 0;
+}
+
+/*
+ This is the original scanner with push|-|back. It's a matter of choice: we are more restricted
+ on the one hand and more loose on the other.
+*/
+
+/*tex
+    Check if the keyword |s| comes next, where the uppercase variant of a character in |s| is
+    accepted too. The matched tokens are collected in a list (|h| is its head, |p| its tail and
+    |n| its length). On success that list is flushed and one is returned. On failure the
+    offending token and the collected tokens are pushed back and zero is returned. An empty
+    keyword never matches. In all cases |cur_cs| is restored.
+
+    Spaces are only skipped before the keyword, so as long as nothing has been collected. We
+    test for that with |h|: comparing |p| with |h| doesn't work because these are also equal
+    when exactly one token has been collected, which would make a space after the first
+    character of a keyword go unnoticed. The first token is allocated directly so that we don't
+    set the link of the |null| token.
+*/
+
+int tex_scan_keyword(const char *s)
+{
+    if (*s) {
+        halfword h = null;
+        halfword p = null;
+        halfword save_cur_cs = cur_cs;
+        int n = 0;
+        while (*s) {
+            /*tex Recursion is possible here! */
+            tex_get_x_token();
+            if ((cur_cmd == letter_cmd || cur_cmd == other_char_cmd) && ((cur_chr == *s) || (cur_chr == *s - 'a' + 'A'))) {
+                halfword q = tex_get_available_token(cur_tok);
+                if (h) {
+                    token_link(p) = q;
+                } else {
+                    h = q;
+                }
+                p = q;
+                n++;
+                s++;
+            } else if (h || (cur_cmd != spacer_cmd)) {
+                tex_back_input(cur_tok);
+                if (h) {
+                    tex_begin_backed_up_list(h);
+                }
+                cur_cs = save_cur_cs;
+                return 0;
+            }
+        }
+        if (h) {
+            tex_flush_token_list_head_tail(h, p, n);
+        }
+        cur_cs = save_cur_cs;
+        return 1;
+    } else {
+        /*tex but not with newtokenlib zero keyword simply doesn't match */
+        return 0 ;
+    }
+}
+
+/*tex
+    Check if the keyword |s| comes next, where characters have to match exactly. The matched
+    tokens are collected in a list (|h| is its head, |p| its tail and |n| its length). On success
+    that list is flushed and one is returned. On failure the offending token and the collected
+    tokens are pushed back and zero is returned. An empty keyword never matches. In all cases
+    |cur_cs| is restored.
+
+    Spaces are only skipped before the keyword, so as long as nothing has been collected. We
+    test for that with |h|: comparing |p| with |h| doesn't work because these are also equal
+    when exactly one token has been collected, which would make a space after the first
+    character of a keyword go unnoticed. The first token is allocated directly so that we don't
+    set the link of the |null| token.
+*/
+
+int tex_scan_keyword_case_sensitive(const char *s)
+{
+    if (*s) {
+        halfword h = null;
+        halfword p = null;
+        halfword save_cur_cs = cur_cs;
+        int n = 0;
+        while (*s) {
+            tex_get_x_token();
+            if ((cur_cmd == letter_cmd || cur_cmd == other_char_cmd) && (cur_chr == *s)) {
+                halfword q = tex_get_available_token(cur_tok);
+                if (h) {
+                    token_link(p) = q;
+                } else {
+                    h = q;
+                }
+                p = q;
+                n++;
+                s++;
+            } else if (h || (cur_cmd != spacer_cmd)) {
+                tex_back_input(cur_tok);
+                if (h) {
+                    tex_begin_backed_up_list(h);
+                }
+                cur_cs = save_cur_cs;
+                return 0;
+            }
+        }
+        if (h) {
+            tex_flush_token_list_head_tail(h, p, n);
+        }
+        cur_cs = save_cur_cs;
+        return 1;
+    } else {
+        return 0 ;
+    }
+}
+
+/*tex
+
+ We can not return |undefined_control_sequence| under some conditions (inside |shift_case|,
+ for example). This needs thinking.
+
+*/
+
+/*tex
+
+    Find the control sequence that belongs to the active character |c|. The name that we look up
+    starts with the three bytes |EF BF BF| and is followed by whatever |aux_uni2string| writes
+    for |c|. When |c| is not positive, or when the lookup gives a negative result, we look up the
+    bare prefix (with its terminating zero byte) instead. The |force| argument is just passed on
+    to |tex_string_locate|.
+
+*/
+
+halfword tex_active_to_cs(int c, int force)
+{
+    halfword found = -1;
+    if (c > 0) {
+        /*tex Three prefix bytes, at most four bytes for |c| and a terminator. */
+        char name[8] = { '\xEF', '\xBF', '\xBF', 0 };
+        aux_uni2string(name + 3, c);
+        found = tex_string_locate(name, (size_t) utf8_size(c) + 3, force);
+    }
+    /*tex The length 4 includes the zero sentinel. */
+    return (found < 0) ? tex_string_locate("\xEF\xBF\xBF", 4, force) : found;
+}
+
+/*tex
+
+ The heart of \TEX's input mechanism is the |get_next| procedure, which we shall develop in the
+ next few sections of the program. Perhaps we shouldn't actually call it the \quote {heart},
+ however, because it really acts as \TEX's eyes and mouth, reading the source files and
+ gobbling them up. And it also helps \TEX\ to regurgitate stored token lists that are to be
+ processed again.
+
+ The main duty of |get_next| is to input one token and to set |cur_cmd| and |cur_chr| to that
+ token's command code and modifier. Furthermore, if the input token is a control sequence, the
+ |eqtb| location of that control sequence is stored in |cur_cs|; otherwise |cur_cs| is set to
+ zero.
+
+ Underlying this simple description is a certain amount of complexity because of all the cases
+ that need to be handled. However, the inner loop of |get_next| is reasonably short and fast.
+
+ When |get_next| is asked to get the next token of a |\read| line, it sets |cur_cmd = cur_chr
+ = cur_cs = 0| in the case that no more tokens appear on that line. (There might not be any
+ tokens at all, if the |end_line_char| has |ignore| as its catcode.)
+
+ The value of |par_loc| is the |eqtb| address of |\par|. This quantity is needed because a
+ blank line of input is supposed to be exactly equivalent to the appearance of |\par|; we must
+ set |cur_cs := par_loc| when detecting a blank line.
+
+ Parts of |get_next| are executed more often than any other instructions of \TEX. The global
+ variable |force_eof| is normally |false|; it is set |true| by an |\endinput| command.
+ |luacstrings| is the number of lua print statements waiting to be input, it is changed by
+ |lmt_token_call|.
+
+ If the user has set the |pausing| parameter to some positive value, and if nonstop mode has
+ not been selected, each line of input is displayed on the terminal and the transcript file,
+ followed by |=>|. \TEX\ waits for a response. If the response is simply |carriage_return|,
+ the line is accepted as it stands, otherwise the line typed is used instead of the line in the
+ file.
+
+ We no longer need the following:
+
+*/
+
+// void firm_up_the_line(void)
+// {
+// ilimit = fileio_state.io_last;
+// }
+
+/*tex
+
+ The other variant gives less clutter in tracing cache usage when profiling and for some files
+ (like the manual) also a bit of a speedup. Splitting the switch gives 10 times less Bim
+ in valgrind! See the \LUATEX\ source for that code.
+
+ The big switch changes the state if necessary, and |goto switch| if the current character
+ should be ignored, or |goto reswitch| if the current character changes to another.
+
+ The n-way switch accomplishes the scanning quickly, assuming that a decent \CCODE\ compiler
+ has translated the code. Note that the numeric values for |mid_line|, |skip_blanks|, and
+ |new_line| are spaced apart from each other by |max_char_code+1|, so we can add a character's
+ command code to the state to get a single number that characterizes both.
+
+ Remark: checking performance indicated that this switch was the cause of many branch prediction
+ errors but changing it to:
+
+ \starttyping
+ c = istate + cur_cmd;
+ if (c == (mid_line_state + letter_cmd) || c == (mid_line_state + other_char_cmd)) {
+ return 1;
+ } else if (c >= new_line_state) {
+ switch (c) {
+ }
+ } else if (c >= skip_blanks_state) {
+ switch (c) {
+ }
+ } else if (c >= mid_line_state) {
+ switch (c) {
+ }
+ } else {
+ istate = mid_line_state;
+ return 1;
+ }
+ \stoptyping
+
+ This gives as many prediction errors. So, we can indeed assume that the compiler does the right
+ job, or that there is simply no other way.
+
+ When a line is finished a space is emitted. When a character of type |spacer| gets through, its
+ character code is changed to |\ =040|. This means that the \ASCII\ codes for tab and space, and
+ for the space inserted at the end of a line, will be treated alike when macro parameters are
+ being matched. We do this since such characters are indistinguishable on most computer terminal
+ displays.
+
+*/
+
+/*
+
+ c = istate + cur_cmd;
+ if (c == (mid_line_state + letter_cmd) || c == (mid_line_state + other_char_cmd)) {
+ return 1;
+ } else if (c >= new_line_state) {
+ ....
+ }
+
+*/
+
+/*tex
+
+ This trick has been dropped when the wrapup mechanism had proven to be useful. The idea was
+ to backport this to \LUATEX\ but some other \PDFTEX\ compatible parstuff made it there and
+ backporting par related features becomes too messy.
+
+ \starttyping
+ lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1;
+ cur_cs = lmt_token_state.line_par_loc;
+ cur_cmd = eq_type(cur_cs);
+ if (cur_cmd == undefined_cs_cmd) {
+ cur_cs = lmt_token_state.par_loc;
+ cur_cmd = eq_type(cur_cs);
+ }
+ cur_chr = eq_value(cur_cs);
+ \stoptyping
+
+*/
+
+static int tex_aux_get_next_file(void)
+{ /*tex
+ Get the next token from the line that sits in the buffer for the current input level. The
+ result is 1 when |cur_cmd| and |cur_chr| (and for control sequences and active characters
+ also |cur_cs|) have been set. It is 0 when the caller has to start all over, which is what
+ |tex_get_next| does. The category code of the character is added to the state of the input
+ level and that sum selects the action in the switch.
+ */
+ SWITCH:
+ if (lmt_input_state.cur_input.loc <= lmt_input_state.cur_input.limit) {
+ /*tex current line not yet finished */
+ cur_chr = get_unichar_from_buffer(&lmt_input_state.cur_input.loc);
+ RESWITCH: /*tex We come back here when |cur_chr| has been replaced by an expanded |^^| code. */
+ if (lmt_input_state.cur_input.cattable == no_catcode_table_preset) {
+ /* happens seldom: detokenized line */
+ cur_cmd = cur_chr == ' ' ? 10 : 12; /*tex presumably |spacer_cmd| and |other_char_cmd|, to be confirmed against the command codes */
+ } else {
+ cur_cmd = tex_aux_the_cat_code(cur_chr);
+ }
+ switch (lmt_input_state.cur_input.state + cur_cmd) {
+ case mid_line_state + ignore_cmd:
+ case skip_blanks_state + ignore_cmd:
+ case new_line_state + ignore_cmd:
+ case skip_blanks_state + spacer_cmd:
+ case new_line_state + spacer_cmd:
+ /*tex Cases where character is ignored. */
+ goto SWITCH;
+ case mid_line_state + escape_cmd:
+ case new_line_state + escape_cmd:
+ case skip_blanks_state + escape_cmd:
+ /*tex Scan a control sequence; the scanner tells us what the next state is. */
+ lmt_input_state.cur_input.state = (unsigned char) tex_aux_scan_control_sequence();
+ break;
+ case mid_line_state + active_char_cmd:
+ case new_line_state + active_char_cmd:
+ case skip_blanks_state + active_char_cmd:
+ /*tex Process an active-character. */
+ cur_cs = tex_active_to_cs(cur_chr, ! lmt_hash_state.no_new_cs);
+ cur_cmd = eq_type(cur_cs);
+ cur_chr = eq_value(cur_cs);
+ lmt_input_state.cur_input.state = mid_line_state;
+ break;
+ case mid_line_state + superscript_cmd:
+ case new_line_state + superscript_cmd:
+ case skip_blanks_state + superscript_cmd:
+ /*tex We need to check for multiple ^:
+ (0) always check for ^^ ^^^^ ^^^^^^^
+ (1) only check in text mode
+ (*) never
+ */
+ if (sup_mark_mode_par) {
+ if (sup_mark_mode_par == 1 && cur_mode != mmode && tex_aux_process_sup_mark()) {
+ goto RESWITCH;
+ }
+ } else if (tex_aux_process_sup_mark()) {
+ goto RESWITCH;
+ } else {
+ /*tex
+ We provide prescripts and shifted script in math mode and avoid fancy |^|
+ processing in text mode (which is what we do in \CONTEXT).
+ */
+ }
+ /*tex No expanded code was found so the superscript character is passed on as it is. */
+ lmt_input_state.cur_input.state = mid_line_state;
+ break;
+ case mid_line_state + invalid_char_cmd:
+ case new_line_state + invalid_char_cmd:
+ case skip_blanks_state + invalid_char_cmd:
+ /*tex Decry the invalid character and |goto restart|. */
+ tex_aux_invalid_character_error();
+ /*tex Because state may be |token_list| now: */
+ return 0;
+ case mid_line_state + spacer_cmd:
+ /*tex Enter |skip_blanks| state, emit a space. */
+ lmt_input_state.cur_input.state = skip_blanks_state;
+ cur_chr = ' ';
+ break;
+ case mid_line_state + end_line_cmd:
+ /*tex Finish the line. See note above about dropped |\linepar|. The rest of the line is skipped and a space is passed on. */
+ lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1;
+ cur_cmd = spacer_cmd;
+ cur_chr = ' ';
+ break;
+ case skip_blanks_state + end_line_cmd:
+ case mid_line_state + comment_cmd:
+ case new_line_state + comment_cmd:
+ case skip_blanks_state + comment_cmd:
+ /*tex Finish line, |goto switch|; */
+ lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1;
+ goto SWITCH;
+ case new_line_state + end_line_cmd:
+ if (! auto_paragraph_mode(auto_paragraph_go_on)) { /*tex Unless this option is set the rest of the line is skipped. */
+ lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1;
+ }
+ /*tex Finish line, emit a |\par|; */
+ if (auto_paragraph_mode(auto_paragraph_text)) {
+ cur_cs = null;
+ cur_cmd = end_paragraph_cmd;
+ cur_chr = new_line_end_paragraph_code;
+ // cur_chr = normal_end_paragraph_code;
+ } else {
+ cur_cs = lmt_token_state.par_loc;
+ cur_cmd = eq_type(cur_cs);
+ cur_chr = eq_value(cur_cs);
+ }
+ break;
+ case skip_blanks_state + left_brace_cmd:
+ case new_line_state + left_brace_cmd:
+ lmt_input_state.cur_input.state = mid_line_state;
+ lmt_input_state.align_state++;
+ break;
+ case mid_line_state + left_brace_cmd:
+ lmt_input_state.align_state++;
+ break;
+ case skip_blanks_state + right_brace_cmd:
+ case new_line_state + right_brace_cmd:
+ lmt_input_state.cur_input.state = mid_line_state;
+ lmt_input_state.align_state--;
+ break;
+ case mid_line_state + right_brace_cmd:
+ lmt_input_state.align_state--;
+ break;
+ case mid_line_state + math_shift_cmd:
+ case mid_line_state + alignment_tab_cmd:
+ case mid_line_state + parameter_cmd:
+ case mid_line_state + subscript_cmd:
+ case mid_line_state + letter_cmd:
+ case mid_line_state + other_char_cmd:
+ /*tex Nothing changes: we are and stay in the middle of a line. */
+ break;
+ /*
+ case skip_blanks_state + math_shift_cmd:
+ case skip_blanks_state + tab_mark_cmd:
+ case skip_blanks_state + mac_param_cmd:
+ case skip_blanks_state + sub_mark_cmd:
+ case skip_blanks_state + letter_cmd:
+ case skip_blanks_state + other_char_cmd:
+ case new_line_state + math_shift_cmd:
+ case new_line_state + tab_mark_cmd:
+ case new_line_state + mac_param_cmd:
+ case new_line_state + sub_mark_cmd:
+ case new_line_state + letter_cmd:
+ case new_line_state + other_char_cmd:
+ */
+ default:
+ lmt_input_state.cur_input.state = mid_line_state;
+ break;
+ }
+ } else {
+ /*tex The line is exhausted; for input that is not token input the next line starts in |new_line_state|. */
+ if (! io_token_input(lmt_input_state.cur_input.name)) {
+ lmt_input_state.cur_input.state = new_line_state;
+ }
+ /*tex
+
+ Move to next line of file, or |goto restart| if there is no next line, or |return| if a
+ |\read| line has finished.
+
+ */
+ do { /*tex This body runs exactly once. */
+ next_line_retval r = tex_aux_next_line();
+ if (r == next_line_restart) {
+ /*tex This happens more often. */
+ return 0;
+ } else if (r == next_line_return) {
+ return 1;
+ }
+ } while (0);
+ /* check_interrupt(); */
+ goto SWITCH;
+ }
+ return 1;
+}
+
+/*tex
+
+ Notice that a code like |^^8| becomes |x| if not followed by a hex digit. We only support a
+ limited set:
+
+ \starttyping
+ ^^^^^^XXXXXX
+ ^^^^XXXX
+ ^^XX ^^<char>
+ \stoptyping
+
+*/
+
+/*tex
+
+    Only the digits and the lowercase letters |a| upto |f| count as hex digits here. The argument
+    is evaluated more than once, so it should have no side effects.
+
+*/
+
+# define is_hex(a) (((a) >= '0' && (a) <= '9') || ((a) >= 'a' && (a) <= 'f'))
+
+/*tex The value of one hex digit. The caller should have checked |c| with |is_hex|. */
+
+inline static int tex_aux_hex_digit_to_value(int c)
+{
+    return c <= '9' ? c - '0' : c - 'a' + 10;
+}
+
+/*tex These combine two, four or six checked hex digits, most significant digit first. */
+
+inline static halfword tex_aux_two_hex_to_cur_chr(int c1, int c2)
+{
+    return
+        0x10 * tex_aux_hex_digit_to_value(c1)
+      + 0x01 * tex_aux_hex_digit_to_value(c2);
+}
+
+inline static halfword tex_aux_four_hex_to_cur_chr(int c1, int c2,int c3, int c4)
+{
+    return
+        0x1000 * tex_aux_hex_digit_to_value(c1)
+      + 0x0100 * tex_aux_hex_digit_to_value(c2)
+      + 0x0010 * tex_aux_hex_digit_to_value(c3)
+      + 0x0001 * tex_aux_hex_digit_to_value(c4);
+}
+
+inline static halfword tex_aux_six_hex_to_cur_chr(int c1, int c2, int c3, int c4, int c5, int c6)
+{
+    return
+        0x100000 * tex_aux_hex_digit_to_value(c1)
+      + 0x010000 * tex_aux_hex_digit_to_value(c2)
+      + 0x001000 * tex_aux_hex_digit_to_value(c3)
+      + 0x000100 * tex_aux_hex_digit_to_value(c4)
+      + 0x000010 * tex_aux_hex_digit_to_value(c5)
+      + 0x000001 * tex_aux_hex_digit_to_value(c6);
+}
+
+/*tex
+
+    This one is called when a superscript character has just been read from the buffer: |cur_chr|
+    is that character and |loc| is the position right after it. When the character is doubled
+    and something follows, these forms are tried in this order:
+
+    \starttyping
+    ^^^^^^XXXXXX
+    ^^^^XXXX
+    ^^XX
+    ^^<char>
+    \stoptyping
+
+    On success |cur_chr| gets the resulting character, |loc| is moved past the whole sequence and
+    1 is returned. Otherwise 0 is returned and |loc| is not moved here. A four or six fold prefix
+    that lacks its (lowercase) hex digits is reported, after which we fall through to the single
+    character case.
+
+*/
+
+static int tex_aux_process_sup_mark(void)
+{
+    /*tex We need a second superscript character and at least one character after it. */
+    if (cur_chr != lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc]) {
+        return 0;
+    }
+    if (lmt_input_state.cur_input.loc >= lmt_input_state.cur_input.limit) {
+        return 0;
+    }
+    if ((cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1]) && (cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 2])) {
+        if ((cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 3]) && (cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 4])) {
+            if ((lmt_input_state.cur_input.loc + 10) > lmt_input_state.cur_input.limit) {
+                tex_handle_error(normal_error_type, "^^^^^^ needs six hex digits, end of input", NULL);
+            } else {
+                /*tex |^^^^^^XXXXXX| */
+                int h1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc +  5];
+                int h2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc +  6];
+                int h3 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc +  7];
+                int h4 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc +  8];
+                int h5 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc +  9];
+                int h6 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 10];
+                if (is_hex(h1) && is_hex(h2) && is_hex(h3) && is_hex(h4) && is_hex(h5) && is_hex(h6)) {
+                    lmt_input_state.cur_input.loc += 11;
+                    cur_chr = tex_aux_six_hex_to_cur_chr(h1, h2, h3, h4, h5, h6);
+                    return 1;
+                }
+                tex_handle_error(normal_error_type, "^^^^^^ needs six hex digits", NULL);
+            }
+        } else if ((lmt_input_state.cur_input.loc + 6) > lmt_input_state.cur_input.limit) {
+            tex_handle_error(normal_error_type, "^^^^ needs four hex digits, end of input", NULL);
+        } else {
+            /*tex |^^^^XXXX| */
+            int h1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 3];
+            int h2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 4];
+            int h3 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 5];
+            int h4 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 6];
+            if (is_hex(h1) && is_hex(h2) && is_hex(h3) && is_hex(h4)) {
+                lmt_input_state.cur_input.loc += 7;
+                cur_chr = tex_aux_four_hex_to_cur_chr(h1, h2, h3, h4);
+                return 1;
+            }
+            tex_handle_error(normal_error_type, "^^^^ needs four hex digits", NULL);
+        }
+    } else if ((lmt_input_state.cur_input.loc + 2) <= lmt_input_state.cur_input.limit) {
+        /*tex |^^XX| */
+        int h1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1];
+        int h2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 2];
+        if (is_hex(h1) && is_hex(h2)) {
+            lmt_input_state.cur_input.loc += 3;
+            cur_chr = tex_aux_two_hex_to_cur_chr(h1, h2);
+            return 1;
+        }
+    }
+    /*tex
+        The single character case, which is also where we end up after one of the errors above.
+        These are the somewhat odd cases, often special control characters.
+    */
+    {
+        int c = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1];
+        if (c < 0200) {
+            lmt_input_state.cur_input.loc += 2;
+            cur_chr = (c < 0100) ? c + 0100 : c - 0100;
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/*tex
+
+ Control sequence names are scanned only when they appear in some line of a file. Once they have
+ been scanned the first time, their |eqtb| location serves as a unique identification, so \TEX\
+ doesn't need to refer to the original name any more except when it prints the equivalent in
+ symbolic form.
+
+ The program that scans a control sequence has been written carefully in order to avoid the
+ blowups that might otherwise occur if a malicious user tried something like |\catcode'15 = 0|.
+ The algorithm might look at |buffer[ilimit + 1]|, but it never looks at |buffer[ilimit + 2]|.
+
+ If expanded characters like |^^A| or |^^df| appear in or just following a control sequence name,
+ they are converted to single characters in the buffer and the process is repeated, slowly but
+ surely.
+
+*/
+
+/*tex
+
+ Whenever we reach the following piece of code, we will have |cur_chr = buffer[k - 1]| and |k <=
+ ilimit + 1| and |cat = get_cat_code(cat_code_table, cur_chr)|. If an expanded code like |^^A| or
+ |^^df| appears in |buffer[(k - 1) .. (k + 1)]| or |buffer[(k - 1) .. (k + 2)]|, we will store
+ the corresponding code in |buffer[k - 1]| and shift the rest of the buffer left two or three
+ places.
+
+*/
+
+static int tex_aux_check_expanded_code(int *kk, halfword *chr)
+{ /*tex
+ Here |*kk| is the buffer position right after the superscript character |*chr| that the
+ control sequence scanner ran into. When that character is doubled and something follows, the
+ sequence is replaced in the buffer by the character that it stands for, the remainder of the
+ line is moved down and the limit is lowered. In that case |*chr| and |cur_chr| get the new
+ character, |*kk| becomes the position after the bytes that were written and the result is 1.
+ The result is 0 when nothing has been done, which is also the case when |sup_mark_mode_par|
+ disables this.
+ */
+ if (sup_mark_mode_par > 1 || (sup_mark_mode_par == 1 && cur_mode == mmode)) {
+ return 0;
+ } else {
+ int k = *kk;
+ /* chr is the ^ character or an equivalent one */
+ if (lmt_fileio_state.io_buffer[k] == *chr && k < lmt_input_state.cur_input.limit) {
+ int d = 1; /*tex first the number of hex digits seen, later the number of bytes to remove */
+ int l;
+ if ((*chr == lmt_fileio_state.io_buffer[k + 1]) && (*chr == lmt_fileio_state.io_buffer[k + 2])) {
+ if ((*chr == lmt_fileio_state.io_buffer[k + 3]) && (*chr == lmt_fileio_state.io_buffer[k + 4])) {
+ if ((k + 10) <= lmt_input_state.cur_input.limit) { /*tex |^^^^^^XXXXXX|, the digits sit at |k + 5| upto |k + 10| */
+ int c1 = lmt_fileio_state.io_buffer[k + 6 - 1];
+ int c2 = lmt_fileio_state.io_buffer[k + 6 ];
+ int c3 = lmt_fileio_state.io_buffer[k + 6 + 1];
+ int c4 = lmt_fileio_state.io_buffer[k + 6 + 2];
+ int c5 = lmt_fileio_state.io_buffer[k + 6 + 3];
+ int c6 = lmt_fileio_state.io_buffer[k + 6 + 4];
+ if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4) && is_hex(c5) && is_hex(c6)) {
+ d = 6;
+ *chr = tex_aux_six_hex_to_cur_chr(c1, c2, c3, c4, c5, c6);
+ } else {
+ tex_handle_error(
+ normal_error_type,
+ "^^^^^^ needs six hex digits",
+ NULL
+ );
+ }
+ } else {
+ tex_handle_error(
+ normal_error_type,
+ "^^^^^^ needs six hex digits, end of input",
+ NULL
+ );
+ }
+ } else if ((k + 6) <= lmt_input_state.cur_input.limit) { /*tex |^^^^XXXX|, the digits sit at |k + 3| upto |k + 6| */
+ int c1 = lmt_fileio_state.io_buffer[k + 4 - 1];
+ int c2 = lmt_fileio_state.io_buffer[k + 4 ];
+ int c3 = lmt_fileio_state.io_buffer[k + 4 + 1];
+ int c4 = lmt_fileio_state.io_buffer[k + 4 + 2];
+ if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4)) {
+ d = 4;
+ *chr = tex_aux_four_hex_to_cur_chr(c1, c2, c3, c4);
+ } else {
+ tex_handle_error(
+ normal_error_type,
+ "^^^^ needs four hex digits",
+ NULL
+ );
+ }
+ } else {
+ tex_handle_error(
+ normal_error_type,
+ "^^^^ needs four hex digits, end of input",
+ NULL
+ );
+ }
+ } else { /*tex |^^XX| or |^^<char>| */
+ int c1 = lmt_fileio_state.io_buffer[k + 1];
+ if (c1 < 0200) { /* really ? */
+ d = 1;
+ if (is_hex(c1) && (k + 2) <= lmt_input_state.cur_input.limit) {
+ int c2 = lmt_fileio_state.io_buffer[k + 2];
+ if (is_hex(c2)) {
+ d = 2;
+ *chr = tex_aux_two_hex_to_cur_chr(c1, c2);
+ } else {
+ *chr = (c1 < 0100 ? c1 + 0100 : c1 - 0100);
+ }
+ } else {
+ *chr = (c1 < 0100 ? c1 + 0100 : c1 - 0100);
+ }
+ } /*tex NOTE(review): when |c1 >= 0200|, and also after the errors above, |*chr| is unchanged but two bytes are still removed below and 1 is returned; to be confirmed that this is intended. */
+ }
+ /*tex
+ From digit count to the number of bytes that follow position |k - 1| in the sequence:
+ 11 for six digits, 7 for four digits, 3 for two digits and 2 for a single character.
+ */
+ if (d > 2) {
+ d = 2 * d - 1;
+ } else {
+ d++;
+ }
+ /*tex
+ The character is written at |k - 1| and takes one to four bytes, depending on its
+ value. Every extra byte moves |k| one up and leaves one byte less to remove.
+ */
+ if (*chr <= 0x7F) {
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) *chr;
+ } else if (*chr <= 0x7FF) {
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xC0 + *chr / 0x40);
+ k++;
+ d--;
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + *chr % 0x40);
+ } else if (*chr <= 0xFFFF) {
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xE0 + *chr / 0x1000);
+ k++;
+ d--;
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x1000) / 0x40);
+ k++;
+ d--;
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x1000) % 0x40);
+ } else {
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xF0 + *chr / 0x40000);
+ k++;
+ d--;
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x40000) / 0x1000);
+ k++;
+ d--;
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + ((*chr % 0x40000) % 0x1000) / 0x40);
+ k++;
+ d--;
+ lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + ((*chr % 0x40000) % 0x1000) % 0x40);
+ }
+ l = k;
+ lmt_input_state.cur_input.limit -= d; /*tex The line gets |d| bytes shorter and what follows moves down. */
+ while (l <= lmt_input_state.cur_input.limit) {
+ lmt_fileio_state.io_buffer[l] = lmt_fileio_state.io_buffer[l + d];
+ l++;
+ }
+ *kk = k;
+ cur_chr = *chr; /* hm */
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+}
+
+/*tex
+
+    Scan the name of a control sequence that starts at |loc| in the buffer, look it up, and set
+    |cur_cs|, |cur_cmd| and |cur_chr| accordingly. A name is either a run of letters or a single
+    character. Each time that an expanded code (like |^^A|) in or right after the name has been
+    replaced in the buffer we start scanning again. At the end of the line we get |null_cs|. The
+    return value is the state that the caller should continue in: |skip_blanks_state| after
+    letters or a space and |mid_line_state| otherwise.
+
+*/
+
+static int tex_aux_scan_control_sequence(void)
+{
+    int state = mid_line_state;
+    if (lmt_input_state.cur_input.loc <= lmt_input_state.cur_input.limit) {
+        for (;;) {
+            int next = lmt_input_state.cur_input.loc;
+            halfword chr = get_unichar_from_buffer(&next);
+            /*tex |cat_code(cur_chr)|, usually: */
+            halfword cat = tex_aux_the_cat_code(chr);
+            if (cat == letter_cmd && next <= lmt_input_state.cur_input.limit) {
+                /*tex A name that starts with a letter: collect the letters that follow. */
+                state = skip_blanks_state;
+                do {
+                    chr = get_unichar_from_buffer(&next);
+                    cat = tex_aux_the_cat_code(chr);
+                } while (cat == letter_cmd && next <= lmt_input_state.cur_input.limit);
+                /*tex If an expanded \unknown */
+                if (cat == superscript_cmd && tex_aux_check_expanded_code(&next, &chr)) {
+                    continue;
+                }
+                if (cat != letter_cmd) {
+                    /*tex Backtrack one character, which can take up to four bytes in \UTF. */
+                    next -= (chr <= 0x7F) ? 1 : (chr <= 0x7FF) ? 2 : (chr <= 0xFFFF) ? 3 : 4;
+                    /*tex Now |next| points to the first nonletter. */
+                }
+            } else if (cat == spacer_cmd) {
+                state = skip_blanks_state;
+            } else {
+                /*tex A single character, or a single letter at the end of the line. */
+                state = mid_line_state;
+                if (cat == superscript_cmd && tex_aux_check_expanded_code(&next, &chr)) {
+                    continue;
+                }
+            }
+            cur_cs = tex_id_locate(lmt_input_state.cur_input.loc, next - lmt_input_state.cur_input.loc, ! lmt_hash_state.no_new_cs);
+            lmt_input_state.cur_input.loc = next;
+            break;
+        }
+    } else {
+        /*tex |state| is irrelevant in this case. */
+        cur_cs = null_cs;
+    }
+    cur_cmd = eq_type(cur_cs);
+    cur_chr = eq_value(cur_cs);
+    return state;
+}
+
+/*tex
+
+ All of the easy branches of |get_next| have now been taken care of. There is one more branch.
+ Conversely, the |file_warning| procedure is invoked when a file ends and some groups entered or
+ conditionals started while reading from that file are still incomplete.
+
+*/
+
+/*tex
+
+    Report the groups and the conditionals that are still open compared to what was recorded in
+    the |in_stack| entry of the current file. We walk over the save stack and the condition
+    stack using the global state itself, so in both cases that state is saved first and put back
+    afterwards. When no messages were issued so far the history becomes |warning_issued|.
+
+*/
+
+static void tex_aux_file_warning(void)
+{
+    /*tex The unfinished groups: */
+    halfword saved_save_ptr = lmt_save_state.save_stack_data.ptr;
+    int saved_group = cur_group;
+    int saved_level = cur_level;
+    lmt_save_state.save_stack_data.ptr = cur_boundary;
+    while (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].group != lmt_save_state.save_stack_data.ptr) {
+        --cur_level;
+        tex_print_nlp();
+        tex_print_format("Warning: end of file when %G is incomplete", 1);
+        cur_group = save_level(lmt_save_state.save_stack_data.ptr);
+        lmt_save_state.save_stack_data.ptr = save_value(lmt_save_state.save_stack_data.ptr);
+    }
+    lmt_save_state.save_stack_data.ptr = saved_save_ptr;
+    cur_level = (quarterword) saved_level;
+    cur_group = (quarterword) saved_group;
+    /*tex The unfinished conditionals: */
+    {
+        halfword saved_cond_ptr = lmt_condition_state.cond_ptr;
+        int saved_cur_if = lmt_condition_state.cur_if;
+        int saved_cur_unless = lmt_condition_state.cur_unless;
+        int saved_if_step = lmt_condition_state.if_step;
+        int saved_if_unless = lmt_condition_state.if_unless;
+        int saved_if_limit = lmt_condition_state.if_limit;
+        int saved_if_line = lmt_condition_state.if_line;
+        while (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].if_ptr != lmt_condition_state.cond_ptr) {
+            /* todo, more info */
+            tex_print_nlp();
+            tex_print_format("Warning: end of file when %C", if_test_cmd, lmt_condition_state.cur_if);
+            if (lmt_condition_state.if_limit == fi_code) {
+                tex_print_str_esc("else");
+            }
+            if (lmt_condition_state.if_line) {
+                tex_print_format(" entered on line %i", lmt_condition_state.if_line);
+            }
+            tex_print_str(" is incomplete");
+            /*tex Pop one level, the pointer goes last because the fields are read from it. */
+            lmt_condition_state.cur_if = if_limit_subtype(lmt_condition_state.cond_ptr);
+            lmt_condition_state.cur_unless = if_limit_unless(lmt_condition_state.cond_ptr);
+            lmt_condition_state.if_step = if_limit_step(lmt_condition_state.cond_ptr);
+            lmt_condition_state.if_unless = if_limit_stepunless(lmt_condition_state.cond_ptr);
+            lmt_condition_state.if_limit = if_limit_type(lmt_condition_state.cond_ptr);
+            lmt_condition_state.if_line = if_limit_line(lmt_condition_state.cond_ptr);
+            lmt_condition_state.cond_ptr = node_next(lmt_condition_state.cond_ptr);
+        }
+        lmt_condition_state.cond_ptr = saved_cond_ptr;
+        lmt_condition_state.cur_if = saved_cur_if;
+        lmt_condition_state.cur_unless = saved_cur_unless;
+        lmt_condition_state.if_step = saved_if_step;
+        lmt_condition_state.if_unless = saved_if_unless;
+        lmt_condition_state.if_limit = saved_if_limit;
+        lmt_condition_state.if_line = saved_if_line;
+    }
+    tex_print_nlp();
+    if (tracing_nesting_par > 1) {
+        tex_show_context();
+    }
+    if (lmt_error_state.history == spotless) {
+        lmt_error_state.history = warning_issued;
+    }
+}
+
+/*tex
+
+    This is called when a file has ended. Depending on what the scanner was doing at that moment
+    (|scanner_status|) an error is reported; nothing is reported when the scanner is in its
+    normal or tolerant state.
+
+*/
+
+static void tex_aux_check_validity(void)
+{
+    switch (lmt_input_state.scanner_status) {
+        case scanner_is_normal:
+            break;
+        case scanner_is_skipping:
+            /*tex The closing quote of |\orunless| has to come before the newline. */
+            tex_handle_error(
+                condition_error_type,
+                "The file ended while I was skipping conditional text.",
+                "This kind of error happens when you say '\\if...' and forget the\n"
+                "matching '\\fi'. It can also be that you use '\\orelse' or '\\orunless'\n"
+                "in the wrong way. Or maybe a forbidden control sequence was encountered."
+            );
+            break;
+        case scanner_is_defining:
+            tex_handle_error(runaway_error_type, "The file ended when scanning a definition.", NULL);
+            break;
+        case scanner_is_matching:
+            tex_handle_error(runaway_error_type, "The file ended when scanning an argument.", NULL);
+            break;
+        case scanner_is_tolerant:
+            break;
+        case scanner_is_aligning:
+            tex_handle_error(runaway_error_type, "The file ended when scanning an alignment preamble.", NULL);
+            break;
+        case scanner_is_absorbing:
+            tex_handle_error(runaway_error_type, "The file ended when absorbing something.", NULL);
+            break;
+    }
+}
+
+static next_line_retval tex_aux_next_line(void)
+{ /*tex
+ Move on to the next line of the current input level. The result tells the caller what to do:
+ with |next_line_ok| it continues scanning in the buffer, with |next_line_restart| it starts
+ all over (tokens were pushed back, the |every_eof| list was started or the input level has
+ been closed) and with |next_line_return| it returns with |cur_cmd| and |cur_chr| set to zero.
+ */
+ if (lmt_input_state.cur_input.name > io_initial_input_code) {
+ /*tex Read next line of file into |buffer|, or |goto restart| if the file has ended. */
+ unsigned inhibit_eol = 0; /*tex when set no end of line character is put after the line */
+ ++lmt_input_state.input_line;
+ lmt_fileio_state.io_first = lmt_input_state.cur_input.start;
+ if (! lmt_token_state.force_eof) {
+ unsigned force_eol = 0; /*tex when set the |every_eof| token list gets inserted */
+ switch (lmt_input_state.cur_input.name) {
+ case io_lua_input_code:
+ { /*tex Here |lmt_cstring_input| tells what kind of input we got. */
+ halfword n = null;
+ int cattable = 0;
+ int partial = 0;
+ int finalline = 0;
+ int t = lmt_cstring_input(&n, &cattable, &partial, &finalline);
+ switch (t) {
+ case eof_tex_input:
+ lmt_token_state.force_eof = 1;
+ break;
+ case string_tex_input:
+ /*tex string */
+ lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */
+ lmt_input_state.cur_input.cattable = (short) cattable;
+ lmt_input_state.cur_input.partial = (signed char) partial;
+ if (finalline || partial || cattable == no_catcode_table_preset) {
+ inhibit_eol = 1;
+ }
+ if (! partial) {
+ lmt_input_state.cur_input.state = new_line_state;
+ }
+ break;
+ case token_tex_input:
+ /*tex token; when it is the end of input command (and we're nested) this level is closed first */
+ if (n >= cs_token_flag && eq_type(n - cs_token_flag) == input_cmd && eq_value(n - cs_token_flag) == end_of_input_code && lmt_input_state.cur_input.index > 0) {
+ tex_end_file_reading();
+ }
+ tex_back_input(n);
+ return next_line_restart;
+ case token_list_tex_input:
+ /*tex token list */
+ tex_begin_backed_up_list(n);
+ return next_line_restart;
+ case node_tex_input:
+ /*tex node; when |n| overflows it is passed on in two tokens */
+ if (node_token_overflow(n)) {
+ tex_back_input(token_val(ignore_cmd, node_token_lsb(n)));
+ tex_reinsert_token(token_val(node_cmd, node_token_msb(n)));
+ return next_line_restart;
+ } else {
+ /*tex |0x10FFFF == 1114111| */
+ tex_back_input(token_val(node_cmd, n));
+ return next_line_restart;
+ }
+ default:
+ lmt_token_state.force_eof = 1;
+ break;
+ }
+ break;
+ }
+ case io_token_input_code:
+ case io_token_eof_input_code:
+ {
+ /* can be simplified but room for extensions now */
+ halfword n = null;
+ int cattable = 0;
+ int partial = 0;
+ int finalline = 0;
+ int t = lmt_cstring_input(&n, &cattable, &partial, &finalline);
+ switch (t) {
+ case eof_tex_input:
+ lmt_token_state.force_eof = 1;
+ if (lmt_input_state.cur_input.name == io_token_eof_input_code && every_eof_par) {
+ force_eol = 1;
+ }
+ break;
+ case string_tex_input:
+ /*tex string */
+ lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */
+ lmt_input_state.cur_input.cattable = (short) cattable;
+ lmt_input_state.cur_input.partial = (signed char) partial;
+ inhibit_eol = lmt_input_state.cur_input.name != io_token_eof_input_code;
+ if (! partial) {
+ lmt_input_state.cur_input.state = new_line_state;
+ }
+ break;
+ default:
+ lmt_token_state.force_eof = 1;
+ break;
+ }
+ break;
+ }
+ case io_tex_macro_code:
+ /* what */
+ default:
+ if (tex_lua_input_ln()) {
+ /*tex Not end of file, set |ilimit|. */
+ lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */
+ lmt_input_state.cur_input.cattable = default_catcode_table_preset;
+ } else if (every_eof_par && (! lmt_input_state.in_stack[lmt_input_state.cur_input.index].end_of_file_seen)) {
+ force_eol = 1;
+ } else {
+ tex_aux_check_validity();
+ lmt_token_state.force_eof = 1;
+ }
+ break;
+ }
+ if (force_eol) {
+ lmt_input_state.cur_input.limit = lmt_fileio_state.io_first - 1;
+ /*tex Fake one empty line. The flag that is set next is tested above so that this happens only once. */
+ lmt_input_state.in_stack[lmt_input_state.cur_input.index].end_of_file_seen = 1;
+ tex_begin_token_list(every_eof_par, every_eof_text);
+ return next_line_restart;
+ }
+ }
+ if (lmt_token_state.force_eof) {
+ /*tex We're done with this input level: maybe warn, then close it. */
+ if (tracing_nesting_par > 0) {
+ if ((lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].group != cur_boundary) || (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].if_ptr != lmt_condition_state.cond_ptr)) {
+ if (! io_token_input(lmt_input_state.cur_input.name)) {
+ /*tex Give warning for some unfinished groups and/or conditionals. */
+ tex_aux_file_warning();
+ }
+ }
+ }
+ if (io_file_input(lmt_input_state.cur_input.name)) {
+ tex_report_stop_file();
+ --lmt_input_state.open_files;
+ }
+ lmt_token_state.force_eof = 0;
+ tex_end_file_reading();
+ return next_line_restart;
+ } else {
+ /*tex The last position of the line is either dropped or it gets the end of line character. */
+ if (inhibit_eol || end_line_char_inactive) {
+ lmt_input_state.cur_input.limit--;
+ } else {
+ lmt_fileio_state.io_buffer[lmt_input_state.cur_input.limit] = (unsigned char) end_line_char_par;
+ }
+ lmt_fileio_state.io_first = lmt_input_state.cur_input.limit + 1;
+ lmt_input_state.cur_input.loc = lmt_input_state.cur_input.start;
+ /*tex We're ready to read. */
+ }
+ } else if (lmt_input_state.input_stack_data.ptr > 0) {
+ /*tex There is no next line at this level, so we signal that with zeros. */
+ cur_cmd = 0;
+ cur_chr = 0;
+ return next_line_return;
+ } else {
+ /*tex A somewhat weird check: */
+ switch (lmt_print_state.selector) {
+ case no_print_selector_code:
+ case terminal_selector_code:
+ tex_open_log_file();
+ break;
+ }
+ tex_handle_error(eof_error_type, "end of file encountered", NULL);
+ /*tex Just in case it is not handled in a callback: */
+ if (lmt_error_state.interaction > nonstop_mode) {
+ tex_fatal_error("aborting job");
+ }
+ }
+ return next_line_ok;
+}
+
+/*tex
+ Let's consider now what happens when |get_next| is looking at a token list.
+*/
+
+/*tex
+
+    Take the next token from the token list that we're reading from and move |loc| to the token
+    after it. The result is 1 when |cur_cmd| and |cur_chr| (and for a control sequence |cur_cs|)
+    have been set. It is 0 when the token was a parameter reference: then the parameter has been
+    pushed as a new input level and the caller has to start over.
+
+*/
+
+static int tex_aux_get_next_tokenlist(void)
+{
+    halfword tok = token_info(lmt_input_state.cur_input.loc);
+    /*tex Move to next. */
+    lmt_input_state.cur_input.loc = token_link(lmt_input_state.cur_input.loc);
+    if (tok < cs_token_flag) {
+        /*tex A character like token: the braces are counted. */
+        cur_cmd = token_cmd(tok);
+        cur_chr = token_chr(tok);
+        if (cur_cmd == left_brace_cmd) {
+            lmt_input_state.align_state++;
+        } else if (cur_cmd == right_brace_cmd) {
+            lmt_input_state.align_state--;
+        } else if (cur_cmd == parameter_reference_cmd) {
+            /*tex Insert macro parameter and |goto restart|. */
+            tex_begin_parameter_list(lmt_input_state.parameter_stack[lmt_input_state.cur_input.parameter_start + cur_chr - 1]);
+            return 0;
+        }
+        return 1;
+    }
+    /*tex A control sequence token */
+    cur_cs = tok - cs_token_flag;
+    cur_cmd = eq_type(cur_cs);
+    if (cur_cmd == deep_frozen_dont_expand_cmd) {
+        /*tex
+
+            Get the next token, suppressing expansion. The present point in the program is
+            reached only when the |expand| routine has inserted a special marker into the
+            input. In this special case, |token_info(iloc)| is known to be a control sequence
+            token, and |token_link(iloc) = null|.
+
+        */
+        cur_cs = token_info(lmt_input_state.cur_input.loc) - cs_token_flag;
+        lmt_input_state.cur_input.loc = null;
+        cur_cmd = eq_type(cur_cs);
+        if (cur_cmd > max_command_cmd) {
+            /*tex Commands above |max_command_cmd| are turned into a special relax. */
+            cur_cmd = relax_cmd;
+            cur_chr = no_expand_relax_code;
+            return 1;
+        }
+    }
+    cur_chr = eq_value(cur_cs);
+    return 1;
+}
+
+/*tex
+
+ Now we're ready to take the plunge into |get_next| itself. Parts of this routine are executed
+ more often than any other instructions of \TEX. This sets |cur_cmd|, |cur_chr|, |cur_cs| to
+ next token.
+
+ Handling alignments is interwoven because there we switch between constructing cells and rows
+ (node lists) based on templates that are token lists. This is why in several places we find
+ checks for |align_state|.
+
+*/
+
+void tex_get_next(void)
+{
+    /*tex
+        We keep trying until a token has been fetched that is not swallowed by the
+        alignment machinery. Every attempt starts with a reset |cur_cs|.
+    */
+    for (;;) {
+        cur_cs = 0;
+        if (lmt_input_state.cur_input.state == token_list_state) {
+            if (! lmt_input_state.cur_input.loc) {
+                /*tex List exhausted, resume previous level. */
+                tex_end_token_list();
+                continue;
+            }
+            if (! tex_aux_get_next_tokenlist()) {
+                /*tex Parameter needs to be expanded. */
+                continue;
+            }
+        } else if (! tex_aux_get_next_file()) {
+            /*tex Input from external file, restart if no input found. */
+            continue;
+        }
+        if ((lmt_input_state.align_state != 0) || (cur_cmd != alignment_tab_cmd && cur_cmd != alignment_cmd)) {
+            /*tex The common case: we have our token. */
+            return;
+        }
+        /*tex If an alignment entry has just ended, take appropriate action. */
+        tex_insert_alignment_template();
+    }
+}
+
+/*tex
+
+ Since |get_next| is used so frequently in \TEX, it is convenient to define three related
+ procedures that do a little more:
+
+ \startitemize
+ \startitem
+ |get_token| not only sets |cur_cmd| and |cur_chr|, it also sets |cur_tok|, a packed
+ halfword version of the current token.
+ \stopitem
+ \startitem
+ |get_x_token|, meaning \quote {get an expanded token}, is like |get_token|, but if the
+ current token turns out to be a user-defined control sequence (i.e., a macro call), or
+ a conditional, or something like |\topmark| or |\expandafter| or |\csname|, it is
+ eliminated from the input by beginning the expansion of the macro or the evaluation of
+ the conditional.
+ \stopitem
+ \startitem
+ |x_token| is like |get_x_token| except that it assumes that |get_next| has already been
+ called.
+ \stopitem
+ \stopitemize
+
+ In fact, these three procedures account for almost every use of |get_next|. No new control
+ sequences will be defined except during a call of |get_token|, or when |\csname| compresses a
+ token list, because |no_new_control_sequence| is always |true| at other times.
+
+ This sets |cur_cmd|, |cur_chr|, |cur_tok|. For convenience we also return the token because in
+ some places we store it and then some direct assignment looks a bit nicer.
+
+*/
+
+halfword tex_get_token(void)
+{
+    /*tex Only while we fetch this token the |no_new_cs| flag is lowered. */
+    lmt_hash_state.no_new_cs = 0;
+    tex_get_next();
+    lmt_hash_state.no_new_cs = 1;
+    /*tex Pack what we got into |cur_tok|, which is also what we return. */
+    if (cur_cs) {
+        cur_tok = cs_token_flag + cur_cs;
+    } else {
+        cur_tok = token_val(cur_cmd, cur_chr);
+    }
+    return cur_tok;
+}
+
+/*tex This changes the string |s| to a token list. */
+
+halfword tex_string_to_toks(const char *ss)
+{
+    /*tex
+        We walk over the zero terminated string |ss| one \UTF8\ character at a time. A space
+        becomes |space_token|, anything else gets |other_token| added. The head of the new
+        list is returned, which is |null| for an empty string.
+    */
+    const unsigned char *cursor = (const unsigned char *) ss;
+    const unsigned char *stop = cursor + strlen(ss);
+    halfword head = null;
+    halfword tail = null;
+    while (cursor < stop) {
+        halfword chr = (halfword) aux_str2uni(cursor);
+        halfword tok = chr + other_token;
+        cursor += utf8_size(chr);
+        if (chr == ' ') {
+            tok = space_token;
+        }
+        /*tex Append, and remember the first token as head. */
+        tail = tex_store_new_token(tail, tok);
+        if (! head) {
+            head = tail;
+        }
+    }
+    return head;
+}
+
+/*tex
+
+ The token lists for macros and for other things like |\mark| and |\output| and |\write| are
+ produced by a procedure called |scan_toks|.
+
+ Before we get into the details of |scan_toks|, let's consider a much simpler task, that of
+ converting the current string into a token list. The |str_toks| function does this; it
+ classifies spaces as type |spacer| and everything else as type |other_char|.
+
+ The token list created by |str_toks| begins at |link(temp_token_head)| and ends at the value
+ |p| that is returned. If |p = temp_token_head|, the list is empty.
+
+ |lua_str_toks| is almost identical, but it also escapes the three symbols that \LUA\ considers
+ special while scanning a literal string.
+
+ This changes the string |str_pool[b .. pool_ptr]| to a token list:
+
+*/
+
+static halfword lmt_str_toks(lstring b) /* returns head */
+{
+    /*tex
+        Convert the |b.l| bytes at |b.s| into a token list. A space becomes |space_token|,
+        other characters get |other_token| added. A backslash, double quote, single quote,
+        linefeed (10) or carriage return (13) is preceded by |escape_token|, where the
+        linefeed and return themselves are replaced by the letters |n| and |r|.
+
+        The head of the list is returned (|null| when there is no input). The head has to be
+        registered right after the escape token is stored too, otherwise a string that
+        starts with an escaped character loses (and leaks) its leading escape token.
+    */
+    /*tex index into string */
+    unsigned char *k = (unsigned char *) b.s;
+    unsigned char *l = k + b.l;
+    /*tex head and tail of the token list */
+    halfword h = null;
+    halfword p = null;
+    while (k < l) {
+        /*tex token being appended */
+        halfword t = aux_str2uni(k);
+        k += utf8_size(t);
+        if (t == ' ') {
+            t = space_token;
+        } else {
+            if ((t == '\\') || (t == '"') || (t == '\'') || (t == 10) || (t == 13)) {
+                p = tex_store_new_token(p, escape_token);
+                if (! h) {
+                    /*tex The escape is the very first token, so it is the head. */
+                    h = p;
+                }
+                if (t == 10) {
+                    t = 'n';
+                } else if (t == 13) {
+                    t = 'r';
+                }
+            }
+            t += other_token;
+        }
+        p = tex_store_new_token(p, t);
+        if (! h) {
+            h = p;
+        }
+    }
+    return h;
+}
+
+/*tex
+
+ Incidentally, the main reason for wanting |str_toks| is the function |the_toks|, which has
+ similar input/output characteristics. This changes the string |str_pool[b .. pool_ptr]| to a
+ token list:
+
+*/
+
+halfword tex_str_toks(lstring s, halfword *tail)
+{
+    /*tex
+        Convert the |s.l| bytes at |s.s| into a token list: a space becomes |space_token| and
+        every other \UTF8\ character gets |other_token| added. The head of the list is
+        returned; it is |null| when |s.s| is not set or when the string is empty.
+
+        When |tail| is given it receives the last token of the list (|null| for an empty
+        list), just like |tex_cur_str_toks| does. Before, |null| was stored unconditionally,
+        which made the parameter useless.
+    */
+    halfword h = null;
+    halfword p = null;
+    if (s.s) {
+        unsigned char *k = s.s;
+        unsigned char *l = k + s.l;
+        while (k < l) {
+            /*tex token being appended */
+            halfword t = aux_str2uni(k);
+            if (t == ' ') {
+                k += 1;
+                t = space_token;
+            } else {
+                k += utf8_size(t);
+                t += other_token;
+            }
+            p = tex_store_new_token(p, t);
+            if (! h) {
+                h = p;
+            }
+        }
+    }
+    if (tail) {
+        *tail = p;
+    }
+    return h;
+}
+
+halfword tex_cur_str_toks(halfword *tail)
+{
+    /*tex
+        Turn the temporary string (|string_temp| with |string_temp_top| bytes) into a token
+        list: spaces become |space_token|, other characters get |other_token| added. After
+        that the current string is reset. The head is returned and the last token goes into
+        |tail| when that pointer is given.
+    */
+    halfword head = null;
+    halfword last = null;
+    unsigned char *cursor = (unsigned char *) lmt_string_pool_state.string_temp;
+    if (cursor) {
+        unsigned char *stop = cursor + lmt_string_pool_state.string_temp_top;
+        while (cursor < stop) {
+            /*tex token being appended */
+            halfword tok = aux_str2uni(cursor);
+            if (tok != ' ') {
+                cursor += utf8_size(tok);
+                tok += other_token;
+            } else {
+                cursor += 1;
+                tok = space_token;
+            }
+            last = tex_store_new_token(last, tok);
+            if (! head) {
+                head = last;
+            }
+        }
+    }
+    tex_reset_cur_string();
+    if (tail) {
+        *tail = last;
+    }
+    return head;
+}
+
+/*tex
+
+ Most of the converter is similar to the one I made for macro so at some point I can make a
+ helper; also todo: there is no need to go through the pool.
+
+*/
+
+/*tex Change the string |str_pool[b..pool_ptr]| to a token list. */
+
+halfword tex_str_scan_toks(int ct, lstring ls)
+{
+    /*tex
+        Convert the |ls.l| bytes at |ls.s| into a token list, using the category codes from
+        catcode table |ct|. A character with category code 0 followed by a run of characters
+        with category code 11 is looked up as control sequence (one trailing character with
+        category code 10 is skipped). When the lookup succeeds we get a control sequence
+        token, otherwise we backtrack and the characters are stored one by one.
+
+        The name scanner uses its own variable for the characters that it inspects. In the
+        past it reused |t|, so that after backtracking the stored token had the code of the
+        last inspected character instead of the code of the escape character.
+
+        The head of the list is returned, |null| when there is no input.
+    */
+    /*tex index into string */
+    unsigned char *k = ls.s;
+    unsigned char *l = k + ls.l;
+    /*tex head and tail of the token list */
+    halfword h = null;
+    halfword p = null;
+    while (k < l) {
+        int cc;
+        /*tex token being appended */
+        halfword t = aux_str2uni(k);
+        k += utf8_size(t);
+        cc = tex_get_cat_code(ct, t);
+        if (cc == 0) {
+            /*tex We have a potential control sequence so we check for it. */
+            int lname = 0;
+            int s = 0;
+            unsigned char *name = k;
+            while (k < l) {
+                /*tex We leave |t| alone because we might need the escape later on. */
+                halfword n = (halfword) aux_str2uni((const unsigned char *) k);
+                int c = tex_get_cat_code(ct, n);
+                s = utf8_size(n);
+                if (c == 11) {
+                    k += s;
+                    lname += s;
+                } else if (c == 10) {
+                    /*tex We ignore a trailing space like normal scanning does. */
+                    k += s;
+                    break;
+                } else {
+                    break;
+                }
+            }
+            if (s > 0) {
+                /*tex We have a potential |\cs|. */
+                halfword cs = tex_string_locate((const char *) name, lname, 0);
+                if (cs == undefined_control_sequence) {
+                    /*tex Let's play safe and backtrack: store the escape and rescan the name. */
+                    t += cc * (1<<21);
+                    k = name;
+                } else {
+                    t = cs_token_flag + cs;
+                }
+            } else {
+                /*tex
+                    Nothing follows the escape, so it is just a character with some meaning.
+                */
+                t += cc * (1<<21);
+                k = name;
+            }
+        } else {
+            /*tex
+                Whatever token, so for instance $x^2$ just works given a \TEX\ catcode regime.
+            */
+            t += cc * (1<<21);
+        }
+        p = tex_store_new_token(p, t);
+        if (! h) {
+            h = p;
+        }
+    }
+    return h;
+}
+
+/* these two can be combined, then we can avoid the h check */
+
+static void tex_aux_set_toks_register(halfword loc, singleword cmd, halfword t, int g)
+{
+    /*tex
+        The list |t| is linked behind a fresh reference token and that reference is then
+        defined at |loc|. A positive |g| adds the global flag. The command |cmd| determines
+        if we define an internal or a register token list reference.
+    */
+    int flags = (g > 0) ? global_flag_bit : 0;
+    int reference = (cmd == internal_toks_cmd) ? internal_toks_reference_cmd : register_toks_reference_cmd;
+    halfword ref = get_reference_token();
+    set_token_link(ref, t);
+    tex_define(flags, loc, reference, ref);
+}
+
+static void tex_aux_append_copied_toks_list(halfword loc, singleword cmd, int g, halfword s, halfword t)
+{
+    /*tex
+        A new list is built behind a fresh reference token: first copies of the tokens in
+        list |s|, then copies of the tokens in list |t|. The originals are not touched. The
+        result is defined at |loc|, globally when |g| is positive.
+    */
+    halfword ref = get_reference_token();
+    halfword tail = ref;
+    for (halfword cur = s; cur; cur = token_link(cur)) {
+        tail = tex_store_new_token(tail, token_info(cur));
+    }
+    for (halfword cur = t; cur; cur = token_link(cur)) {
+        tail = tex_store_new_token(tail, token_info(cur));
+    }
+    tex_define((g > 0) ? global_flag_bit : 0, loc, cmd == internal_toks_cmd ? internal_toks_reference_cmd : register_toks_reference_cmd, ref);
+}
+
+/*tex Public helper: */
+
+halfword tex_copy_token_list(halfword h1, halfword *t)
+{
+    /*tex
+        The token at |h1| is always copied, so the caller has to pass a valid token. The
+        rest of the list is copied token by token. The head of the copy is returned and
+        its last token is stored in |t| when that pointer is given.
+    */
+    halfword head = tex_store_new_token(null, token_info(h1));
+    halfword tail = head;
+    for (halfword cur = token_link(h1); cur; cur = token_link(cur)) {
+        tail = tex_store_new_token(tail, token_info(cur));
+    }
+    if (t) {
+        *t = tail;
+    }
+    return head;
+}
+
+/*tex
+
+ At some point I decided to implement the following primitives:
+
+ \starttabulate[|T||T||]
+ \NC 0 \NC \type {toksapp} \NC 1 \NC \type {etoksapp} \NC \NR
+ \NC 2 \NC \type {tokspre} \NC 3 \NC \type {etokspre} \NC \NR
+ \NC 4 \NC \type {gtoksapp} \NC 5 \NC \type {xtoksapp} \NC \NR
+ \NC 6 \NC \type {gtokspre} \NC 7 \NC \type {xtokspre} \NC \NR
+ \stoptabulate
+
+ These append and prepend tokens to token lists. In \CONTEXT\ we always had macros doing something
+ like that. It was only a few years later that I ran again into an article that Taco and I wrote
+ in 1999 in the NTG Maps about an extension to \ETEX\ (called eetex). The first revelation was
+ that I had completely forgotten about it, which can be explained by the two-decade time lapse. The
+ second was that Taco actually added that to the program at that time, so I could have used (parts
+ of) that code. Anyway, among the other proposed (and implemented) features were manipulating
+ lists and ways to output packed data to the \DVI\ files (numbers packed into 1 upto 4 bytes).
+ Maybe some day I'll have a go at lists, although with today's computers there is not that much to
+ gain. Also, \CONTEXT\ progressed to different internals so the urge is no longer there. The
+ \SGML\ mode that was also discussed is no longer that relevant either, given that we have \LUA.
+
+ If we want to handle macros too we really need to distinguish between toks and macros with
+ |cur_chr| above, but not now. We can't expand, and have to use |get_r_token| or so. I don't need
+ it anyway.
+
+ \starttyping
+ get_r_token();
+ if (cur_cmd == call_cmd) {
+ nt = cur_cs;
+ target = equiv(nt);
+ } else {
+ // some error message
+ }
+ \stoptyping
+*/
+
+/*tex
+ This test is used below to decide if a token list can be changed in place. It is true when
+ |eq_level(loc)| equals |cur_level| and |get_token_reference(target)| is zero.
+*/
+# define immediate_permitted(loc,target) ((eq_level(loc) == cur_level) && (get_token_reference(target) == 0))
+
+/*tex
+ This runner handles the assign, append and prepend variants. First |cur_chr| is decoded into
+ three flags: |append| (0: assign, 1: append, 2: prepend), |global| and |expand|. Then the
+ target is scanned: either a token that is a |register_toks_cmd| or |internal_toks_cmd| or
+ otherwise a register number. After skipping spaces the source follows: a braced list (scanned
+ with or without expansion) or again a token register. When |immediate_permitted| is true the
+ target list is changed in place, otherwise a combined copy is defined.
+*/
+void tex_run_combine_the_toks(void)
+{
+ halfword source = null;
+ halfword target = null;
+ halfword append, expand, global;
+ halfword nt, ns;
+ singleword cmd;
+ /* */
+ switch (cur_chr) {
+ case expanded_toks_code: append = 0; global = 0; expand = 1; break;
+ case append_toks_code: append = 1; global = 0; expand = 0; break;
+ case append_expanded_toks_code: append = 1; global = 0; expand = 1; break;
+ case prepend_toks_code: append = 2; global = 0; expand = 0; break;
+ case prepend_expanded_toks_code: append = 2; global = 0; expand = 1; break;
+ case global_expanded_toks_code: append = 0; global = 1; expand = 1; break;
+ case global_append_toks_code: append = 1; global = 1; expand = 0; break;
+ case global_append_expanded_toks_code: append = 1; global = 1; expand = 1; break;
+ case global_prepend_toks_code: append = 2; global = 1; expand = 0; break;
+ case global_prepend_expanded_toks_code: append = 2; global = 1; expand = 1; break;
+ default: append = 0; global = 0; expand = 0; break;
+ }
+ /*tex The target. */
+ tex_get_x_token();
+ if (cur_cmd == register_toks_cmd || cur_cmd == internal_toks_cmd) {
+ nt = eq_value(cur_cs);
+ cmd = (singleword) cur_cmd;
+ } else {
+ /*tex Maybe a number. */
+ tex_back_input(cur_tok);
+ nt = register_toks_location(tex_scan_toks_register_number());
+ cmd = register_toks_cmd;
+ }
+ target = eq_value(nt);
+ /*tex The source. */
+ do {
+ tex_get_x_token();
+ } while (cur_cmd == spacer_cmd);
+ if (cur_cmd == left_brace_cmd) {
+ /*tex The source is a braced list that we own, so its tokens can be moved instead of copied. */
+ source = expand ? tex_scan_toks_expand(1, NULL, 0) : tex_scan_toks_normal(1, NULL);
+ /*tex The action. */
+ if (source) {
+ if (target) {
+ halfword s = token_link(source);
+ /*tex
+ NOTE(review): when |s| is null (an empty source) nothing happens here, also not when
+ |append| is zero, so an assignment of an empty list leaves the target as it is. To
+ be confirmed that this is intended.
+ */
+ if (s) {
+ halfword t = token_link(target);
+ if (! t) {
+ /*tex Can this happen? */
+ set_token_link(target, s);
+ token_link(source) = null;
+ } else {
+ switch (append) {
+ case 0:
+ goto ASSIGN_1;
+ case 1:
+ /*append */
+ if (immediate_permitted(nt,target)) {
+ /*tex Find the last token of the target and hook the source list behind it. */
+ halfword p = t;
+ while (token_link(p)) {
+ p = token_link(p);
+ }
+ token_link(p) = s;
+ token_link(source) = null;
+ } else {
+ tex_aux_append_copied_toks_list(nt, cmd, global, t, s);
+ }
+ break;
+ case 2:
+ /* prepend */
+ if (immediate_permitted(nt,target)) {
+ /*tex Find the last token of the source and hook the target list behind it. */
+ halfword p = s;
+ while (token_link(p)) {
+ p = token_link(p);
+ }
+ token_link(source) = null;
+ set_token_link(p, t);
+ set_token_link(target, s);
+ } else {
+ tex_aux_append_copied_toks_list(nt, cmd, global, s, t);
+ }
+ break;
+ }
+ }
+ }
+ } else {
+ ASSIGN_1:
+ tex_aux_set_toks_register(nt, cmd, token_link(source), global);
+ token_link(source) = null;
+ }
+ /*tex Where tokens were moved the link of |source| has been reset, so this frees what is left. */
+ tex_flush_token_list(source);
+ }
+ } else {
+ /*tex The source is a register, so here its tokens are copied and never moved. */
+ if (cur_cmd == register_toks_cmd) {
+ ns = register_toks_number(eq_value(cur_cs));
+ } else if (cur_cmd == internal_toks_cmd) {
+ ns = internal_toks_number(eq_value(cur_cs));
+ } else {
+ ns = tex_scan_toks_register_number();
+ }
+ /*tex The action. */
+ source = toks_register(ns);
+ if (source) {
+ if (target) {
+ halfword s = token_link(source);
+ halfword t = token_link(target);
+ switch (append) {
+ case 0:
+ /*assign */
+ goto ASSIGN_2;
+ case 1:
+ /*append */
+ if (immediate_permitted(nt, target)) {
+ /*tex NOTE(review): this assumes that |t| is not null, otherwise |token_link(p)| is applied to null. */
+ halfword p = t;
+ while (token_link(p)) {
+ p = token_link(p);
+ }
+ while (s) {
+ p = tex_store_new_token(p, token_info(s));
+ s = token_link(s);
+ }
+ } else {
+ tex_aux_append_copied_toks_list(nt, cmd, global, t, s);
+ }
+ break;
+ case 2:
+ if (immediate_permitted(nt, target)) {
+ halfword h = null;
+ halfword p = null;
+ while (s) {
+ p = tex_store_new_token(p, token_info(s));
+ if (! h) {
+ h = p;
+ }
+ s = token_link(s);
+ }
+ /*tex
+ NOTE(review): when the source list is empty |p| and |h| are still null here, so
+ the link of null gets set and the target ends up with an empty list. To be
+ confirmed that an empty source list cannot occur.
+ */
+ set_token_link(p, t);
+ set_token_link(target, h);
+ } else {
+ tex_aux_append_copied_toks_list(nt, cmd, global, s, t);
+ }
+ break;
+ }
+ } else {
+ ASSIGN_2:
+ // set_toks_register(nt, source, global);
+ /*tex
+ The source list is shared: its reference count is bumped and the equivalent is
+ set directly. NOTE(review): |global| is not used on this path and when we jump
+ here a previous |target| is not dereferenced in this function. To be confirmed.
+ */
+ tex_add_token_reference(source);
+ eq_value(nt) = source;
+ }
+ }
+ }
+}
+
+/*tex
+
+ This routine, used in the next one, prints the job name, possibly modified by the
+ |process_jobname| callback.
+
+*/
+
+static void tex_aux_print_job_name(void)
+{
+    /*tex \CCODE\ string for the jobname; a callback can replace it. */
+    char *name = lmt_fileio_state.job_name;
+    int callback_id;
+    if (! name) {
+        /*tex No jobname, nothing to print. */
+        return;
+    }
+    callback_id = lmt_callback_defined(process_jobname_callback);
+    if (callback_id > 0) {
+        char *processed;
+        int lua_retval = lmt_run_callback(lmt_lua_state.lua_instance, callback_id, "S->S", name, &processed);
+        /*tex We only use the result when the call succeeded and gave us a string. */
+        if (lua_retval && processed) {
+            name = processed;
+        }
+    }
+    tex_print_str(name);
+}
+
+/*tex
+
+ The procedure |run_convert_tokens| uses |str_toks| to insert the token list for |convert|
+ functions into the scanner; |\outer| control sequences are allowed to follow |\string| and
+ |\meaning|.
+
+*/
+
+/*tex Codes not really needed but cleaner when testing */
+
+/*tex
+ Both macros expect a local variable |saved_selector| to be in scope. The first one saves the
+ current selector and switches to |new_string_selector_code|, the second one restores the saved
+ selector. Each expands into a braced block.
+*/
+# define push_selector { \
+ saved_selector = lmt_print_state.selector; \
+ lmt_print_state.selector = new_string_selector_code; \
+}
+
+# define pop_selector { \
+ lmt_print_state.selector = saved_selector; \
+}
+
+/*tex
+ The |code| selects the conversion. Most cases scan their argument, print the result with the
+ selector set to |new_string_selector_code| (between |push_selector| and |pop_selector|) and
+ then |break|, after which the current string is turned into a token list by |tex_cur_str_toks|
+ and pushed as inserted list. The \LUA\ related cases, |expanded_code|, |semi_expanded_code|
+ and |lua_escape_string_code| deal with their result themselves and |return| directly. An
+ unknown code triggers |tex_confusion|.
+*/
+void tex_run_convert_tokens(halfword code)
+{
+ /*tex Scan the argument for command |c|. */
+ switch (code) {
+ /*tex
+ The |number_code| is quite popular. Beware, when used with a lua none function, a zero
+ is injected. We could intercept it at the cost of messy code, but on the other hand,
+ nothing guarantees that the call returns a number so this side effect can be defended
+ as a recovery measure.
+ */
+ case number_code:
+ {
+ int saved_selector;
+ halfword v = tex_scan_int(0, NULL);
+ push_selector;
+ tex_print_int(v);
+ pop_selector;
+ break;
+ }
+ case to_integer_code:
+ case to_hexadecimal_code:
+ {
+ int saved_selector;
+ halfword v = tex_scan_int(0, NULL);
+ /*tex The token after the number is dropped when it is a |\relax|, otherwise it is pushed back. */
+ tex_get_x_token(); /* maybe not x here */
+ if (cur_cmd != relax_cmd) {
+ tex_back_input(cur_tok);
+ }
+ push_selector;
+ if (code == to_integer_code) {
+ tex_print_int(v);
+ } else {
+ tex_print_hex(v);
+ }
+ pop_selector;
+ break;
+ }
+ case to_scaled_code:
+ case to_sparse_scaled_code:
+ case to_dimension_code:
+ case to_sparse_dimension_code:
+ {
+ int saved_selector;
+ halfword v = tex_scan_dimen(0, 0, 0, 0, NULL);
+ /*tex Again a trailing |\relax| is dropped. */
+ tex_get_x_token(); /* maybe not x here */
+ if (cur_cmd != relax_cmd) {
+ tex_back_input(cur_tok);
+ }
+ push_selector;
+ /*tex First the number, sparse or not. */
+ switch (code) {
+ case to_sparse_dimension_code:
+ case to_sparse_scaled_code:
+ tex_print_sparse_dimension(v, no_unit);
+ break;
+ default:
+ tex_print_dimension(v, no_unit);
+ break;
+ }
+ /*tex Only the dimension variants get the unit appended. */
+ switch (code) {
+ case to_dimension_code:
+ case to_sparse_dimension_code:
+ tex_print_unit(pt_unit);
+ break;
+ }
+ pop_selector;
+ break;
+ }
+ case to_mathstyle_code:
+ {
+ int saved_selector;
+ halfword v = tex_scan_math_style_identifier(1, 0);
+ push_selector;
+ tex_print_int(v);
+ pop_selector;
+ break;
+ }
+ case lua_function_code:
+ {
+ halfword v = tex_scan_int(0, NULL);
+ /*tex Only positive numbers are accepted here. */
+ if (v > 0) {
+ strnumber u = tex_save_cur_string();
+ lmt_token_state.luacstrings = 0;
+ lmt_function_call(v, 0);
+ tex_restore_cur_string(u);
+ if (lmt_token_state.luacstrings > 0) {
+ tex_lua_string_start();
+ }
+ } else {
+ tex_normal_error("luafunction", "invalid number");
+ }
+ return;
+ }
+ case lua_bytecode_code:
+ {
+ halfword v = tex_scan_int(0, NULL);
+ /*tex Here the accepted range is 0 upto 65535. */
+ if (v < 0 || v > 65535) {
+ tex_normal_error("luabytecode", "invalid number");
+ } else {
+ strnumber u = tex_save_cur_string();
+ lmt_token_state.luacstrings = 0;
+ lmt_bytecode_call(v);
+ tex_restore_cur_string(u);
+ if (lmt_token_state.luacstrings > 0) {
+ tex_lua_string_start();
+ }
+ }
+ return;
+ }
+ case lua_code:
+ {
+ full_scanner_status saved_full_status = tex_save_full_scanner_status();
+ strnumber u = tex_save_cur_string();
+ halfword s = tex_scan_toks_expand(0, NULL, 0);
+ tex_unsave_full_scanner_status(saved_full_status);
+ lmt_token_state.luacstrings = 0;
+ lmt_token_call(s);
+ tex_delete_token_reference(s); /* boils down to flush_list */
+ tex_restore_cur_string(u);
+ if (lmt_token_state.luacstrings > 0) {
+ tex_lua_string_start();
+ }
+ /*tex No further action. */
+ return;
+ }
+ case expanded_code:
+ case semi_expanded_code:
+ {
+ full_scanner_status saved_full_status = tex_save_full_scanner_status();
+ strnumber u = tex_save_cur_string();
+ halfword s = tex_scan_toks_expand(0, NULL, code == semi_expanded_code);
+ tex_unsave_full_scanner_status(saved_full_status);
+ /*tex The scanned list itself is pushed back, only the reference token is released. */
+ if (token_link(s)) {
+ tex_begin_inserted_list(token_link(s));
+ token_link(s) = null;
+ }
+ tex_put_available_token(s);
+ tex_restore_cur_string(u);
+ /*tex No further action. */
+ return;
+ }
+ /* case immediate_assignment_code: */
+ /* case immediate_assigned_code: */
+ /*tex
+ These two were an on-the-road-to-bachotex brain-wave. A first variant did more in
+ sequence till a relax or spacer was seen. These commands permits for instance setting
+ counters in full expansion. However, as we have the more powerful local control
+ mechanisms available these two commands have been dropped in \LUAMETATEX. Performance
+ wise there is not that much to gain from |\immediateassigned| and it's even somewhat
+ limited. So, they're gone now. Actually, one can also use the local control feature in
+ an |\edef|, which {\em is} rather efficient, so we're good anyway. The upgraded code
+ can be found in the archive.
+ */
+ case string_code:
+ {
+ int saved_selector;
+ /*tex The token is fetched with the scanner status temporarily set to normal. */
+ int saved_scanner_status = lmt_input_state.scanner_status;
+ lmt_input_state.scanner_status = scanner_is_normal;
+ tex_get_token();
+ lmt_input_state.scanner_status = saved_scanner_status;
+ push_selector;
+ if (cur_cs) {
+ tex_print_cs(cur_cs);
+ } else {
+ tex_print_tex_str(cur_chr);
+ }
+ pop_selector;
+ break;
+ }
+ case cs_string_code:
+ {
+ int saved_selector;
+ int saved_scanner_status = lmt_input_state.scanner_status;
+ lmt_input_state.scanner_status = scanner_is_normal;
+ tex_get_token();
+ lmt_input_state.scanner_status = saved_scanner_status;
+ push_selector;
+ /*tex The difference with |string_code| is that |tex_print_cs_name| is used. */
+ if (cur_cs) {
+ tex_print_cs_name(cur_cs);
+ } else {
+ tex_print_tex_str(cur_chr);
+ }
+ pop_selector;
+ break;
+ }
+ case detokenized_code:
+ {
+ int saved_selector;
+ int saved_scanner_status = lmt_input_state.scanner_status;
+ halfword t = null;
+ lmt_input_state.scanner_status = scanner_is_normal;
+ tex_get_token();
+ lmt_input_state.scanner_status = saved_scanner_status;
+ /*tex A temporary one token list is shown and then released again. */
+ t = tex_get_available_token(cur_tok);
+ push_selector;
+ tex_show_token_list(t, null, extreme_token_show_max, 0);
+ tex_put_available_token(t);
+ pop_selector;
+ break;
+ }
+ case roman_numeral_code:
+ {
+ int saved_selector;
+ halfword v = tex_scan_int(0, NULL);
+ push_selector;
+ tex_print_roman_int(v);
+ pop_selector;
+ break;
+ }
+ case meaning_code:
+ case meaning_full_code:
+ case meaning_less_code:
+ case meaning_asis_code:
+ {
+ int saved_selector;
+ int saved_scanner_status = lmt_input_state.scanner_status;
+ lmt_input_state.scanner_status = scanner_is_normal;
+ tex_get_token();
+ lmt_input_state.scanner_status = saved_scanner_status;
+ push_selector;
+ tex_print_meaning(code);
+ pop_selector;
+ break;
+ }
+ case uchar_code:
+ {
+ int saved_selector;
+ int chr = tex_scan_char_number(0);
+ push_selector;
+ tex_print_tex_str(chr);
+ pop_selector;
+ break;
+ }
+ case lua_escape_string_code:
+ {
+ lstring escstr;
+ int l = 0;
+ /*tex The escape state is saved, set during the conversion to a string, and restored. */
+ int e = lmt_token_state.in_lua_escape;
+ full_scanner_status saved_full_status = tex_save_full_scanner_status();
+ halfword result = tex_scan_toks_expand(0, NULL, 0);
+ lmt_token_state.in_lua_escape = 1;
+ escstr.s = (unsigned char *) tex_tokenlist_to_tstring(result, 0, &l, 0, 0, 0);
+ escstr.l = (unsigned) l;
+ lmt_token_state.in_lua_escape = e;
+ tex_delete_token_reference(result); /* boils down to flush_list */
+ tex_unsave_full_scanner_status(saved_full_status);
+ /*tex Only a non empty string is converted to tokens and pushed back. */
+ if (escstr.l) {
+ result = lmt_str_toks(escstr);
+ tex_begin_inserted_list(result);
+ }
+ return;
+ }
+ case font_name_code:
+ {
+ int saved_selector;
+ halfword fnt = tex_scan_font_identifier(NULL);
+ push_selector;
+ tex_print_font(fnt);
+ pop_selector;
+ break;
+ }
+ case font_specification_code:
+ {
+ int saved_selector;
+ halfword fnt = tex_scan_font_identifier(NULL);
+ push_selector;
+ /*tex NOTE(review): |strlen| assumes that |font_original(fnt)| is never |NULL|; to be confirmed. */
+ tex_append_string((const unsigned char *) font_original(fnt), (unsigned) strlen(font_original(fnt)));
+ pop_selector;
+ break;
+ }
+ case job_name_code:
+ {
+ int saved_selector;
+ /*tex Without a jobname the log file gets opened first. */
+ if (! lmt_fileio_state.job_name) {
+ tex_open_log_file();
+ }
+ push_selector;
+ tex_aux_print_job_name();
+ pop_selector;
+ break;
+ }
+ case format_name_code:
+ {
+ int saved_selector;
+ if (! lmt_fileio_state.job_name) {
+ tex_open_log_file();
+ }
+ push_selector;
+ tex_print_tex_str(lmt_dump_state.format_name);
+ pop_selector;
+ break;
+ }
+ case luatex_banner_code:
+ {
+ int saved_selector;
+ push_selector;
+ tex_print_str(lmt_engine_state.luatex_banner);
+ pop_selector;
+ break;
+ }
+ default:
+ tex_confusion("convert tokens");
+ break;
+ }
+ /*tex We end up here after a |break|: the collected string becomes the inserted token list. */
+ {
+ halfword head = tex_cur_str_toks(NULL);
+ tex_begin_inserted_list(head);
+ }
+}
+
+/*tex
+ The boolean |in_lua_escape| is keeping track of the lua string escape state.
+*/
+
+strnumber tex_the_convert_string(halfword c, int i)
+{
+    /*tex
+        The value |i| is printed according to conversion code |c| while the selector is set
+        to |new_string_selector_code|. For a code that we handle the resulting string is made
+        and returned, for any other code we return zero. In both cases the selector is
+        restored.
+    */
+    int old_selector = lmt_print_state.selector;
+    strnumber result = 0;
+    lmt_print_state.selector = new_string_selector_code;
+    switch (c) {
+        case number_code:
+        case to_integer_code:
+            tex_print_int(i);
+            break;
+        case to_hexadecimal_code:
+            tex_print_hex(i);
+            break;
+        case to_scaled_code:
+            tex_print_dimension(i, no_unit);
+            break;
+        case to_sparse_scaled_code:
+            tex_print_sparse_dimension(i, no_unit);
+            break;
+        case to_dimension_code:
+            tex_print_dimension(i, pt_unit);
+            break;
+        case to_sparse_dimension_code:
+            tex_print_sparse_dimension(i, pt_unit);
+            break;
+        case roman_numeral_code:
+            tex_print_roman_int(i);
+            break;
+        case uchar_code:
+            tex_print_tex_str(i);
+            break;
+        case font_name_code:
+            tex_print_font(i);
+            break;
+        case font_specification_code:
+            tex_print_str(font_original(i));
+            break;
+        case job_name_code:
+            tex_aux_print_job_name();
+            break;
+        case format_name_code:
+            tex_print_tex_str(lmt_dump_state.format_name);
+            break;
+        case luatex_banner_code:
+            tex_print_str(lmt_engine_state.luatex_banner);
+            break;
+        case font_identifier_code:
+            tex_print_font_identifier(i);
+            break;
+        default:
+            /*tex
+                Not handled here are for instance the mathstyle, \LUA, expanded, string,
+                meaning and escape string conversions: no string is made.
+            */
+            lmt_print_state.selector = old_selector;
+            return 0;
+    }
+    result = tex_make_string();
+    lmt_print_state.selector = old_selector;
+    return result;
+}
+
+/*tex Return a string from tokens list: */
+
+strnumber tex_tokens_to_string(halfword p)
+{
+    /*tex
+        The token list |p| is shown with the selector set to |new_string_selector_code| and
+        the result is returned as string. When that selector is already active we report an
+        error and return the null string instead.
+    */
+    int saved_selector = lmt_print_state.selector;
+    if (saved_selector == new_string_selector_code) {
+        tex_normal_error("tokens", "tokens_to_string() called while selector = new_string");
+        return get_nullstr();
+    }
+    lmt_print_state.selector = new_string_selector_code;
+    tex_token_show(p, extreme_token_show_max);
+    lmt_print_state.selector = saved_selector;
+    return tex_make_string();
+}
+
+/*tex
+
+ The actual token conversion in this function is now functionally equivalent to |show_token_list|,
+ except that it always prints the whole token list. Often the result is not that large, for
+ instance |\directlua| is seldom large. However, this converter is also used for patterns
+ and exceptions where size is more of an issue. For that reason we used to have three variants,
+ one of which (experimentally) used a buffer. At some point, in the manual we were talking of
+ millions of allocations but times have changed.
+
+ Macros were used to inline the appending code (in the three variants), but in the end I decided
+ to just merge all into one function, with a bit more overhead because we need to optionally
+ skip a macro preamble.
+
+ Values like 512 and 128 also work ok. There is not much to gain in optimization here. We used
+ to have 3 mostly overlapping functions, one of which used a buffer. We can probably use a
+ larger default buffer size and larger step and only free when we think it's too large.
+
+*/
+
+/*tex
+ The first value is the size that |tex_tokenlist_to_tstring| allocates when it starts (or starts
+ afresh), the second one is the increment that is used when the buffer has to grow.
+*/
+# define default_buffer_size 512 /*tex This used to be 256 */
+# define default_buffer_step 4096 /*tex When we're larger, we always are much larger. */
+
+// todo: check ret
+
+static void tex_aux_make_room_in_buffer(int a)
+{
+    /*tex
+        We make sure that |a| more bytes plus one fit in the buffer. The buffer grows in
+        multiples of |default_buffer_step| until the request fits, so that a request larger
+        than one step is honoured too.
+
+        The buffer pointer and |bufmax| are only updated when the reallocation succeeded.
+        Before, a failure replaced the buffer pointer by |NULL|, so the old buffer was lost
+        and the next append wrote via a null pointer. Now we quit with a fatal error. This
+        assumes that |aux_reallocate_array| leaves the old block alone on failure, like
+        |realloc| does; to be confirmed.
+    */
+    if (lmt_token_state.bufloc + a + 1 > lmt_token_state.bufmax) {
+        int newmax = lmt_token_state.bufmax;
+        char *tmp;
+        do {
+            newmax += default_buffer_step;
+        } while (lmt_token_state.bufloc + a + 1 > newmax);
+        tmp = aux_reallocate_array(lmt_token_state.buffer, sizeof(unsigned char), newmax, 1);
+        if (tmp) {
+            lmt_token_state.buffer = tmp;
+            lmt_token_state.bufmax = newmax;
+        } else {
+            /*tex There is no way to continue, the callers don't check for room. */
+            tex_fatal_error("out of memory while growing the token buffer");
+        }
+    }
+}
+
+static void tex_aux_append_uchar_to_buffer(int s)
+{
+    /*tex
+        Append character |s| to the buffer, encoded as one upto four \UTF8\ bytes. A value
+        of |0x110000| or more is stored as one single byte, namely the value minus that
+        offset. The code used to subtract |0x11000|, which gave the right byte only because
+        the cast to |char| drops the higher bits; now the offset matches the test.
+
+        Room for four bytes is reserved, whatever the number of bytes that we need.
+    */
+    tex_aux_make_room_in_buffer(4);
+    if (s <= 0x7F) {
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (s);
+    } else if (s <= 0x7FF) {
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xC0 + (s / 0x40));
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (s % 0x40));
+    } else if (s <= 0xFFFF) {
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xE0 + (s / 0x1000));
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x1000) / 0x40));
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x1000) % 0x40));
+    } else if (s >= 0x110000) {
+        /*tex A value beyond the \UNICODE\ range: one byte. */
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (s - 0x110000);
+    } else {
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xF0 + (s / 0x40000));
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x40000) / 0x1000));
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (((s % 0x40000) % 0x1000) / 0x40));
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (((s % 0x40000) % 0x1000) % 0x40));
+    }
+}
+
+static void tex_aux_append_char_to_buffer(int c)
+{
+    /*tex Make room for one byte and store |c| as a single byte. */
+    tex_aux_make_room_in_buffer(1);
+    lmt_token_state.buffer[lmt_token_state.bufloc] = (char) c;
+    lmt_token_state.bufloc++;
+}
+
+/*tex Only errors and unknowns. */
+
+static void tex_aux_append_str_to_buffer(const char *s)
+{
+    /*tex Make room for the whole string first, the terminating zero is not copied. */
+    tex_aux_make_room_in_buffer((int) strlen(s));
+    /*tex Using memcpy will inline and give a larger binary ... and we seldom need this. */
+    for (const char *v = s; *v; v++) {
+        lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (*v);
+    }
+}
+
+/*tex Only bogus csnames. */
+
+static void tex_aux_append_esc_to_buffer(const char *s)
+{
+    /*tex
+        The string |s| is appended, preceded by the current escape character when that one
+        is positive and below |cs_offset_value|.
+    */
+    int esc = escape_char_par;
+    int usable = (esc > 0) && (esc < cs_offset_value);
+    if (usable) {
+        tex_aux_append_uchar_to_buffer(esc);
+    }
+    tex_aux_append_str_to_buffer(s);
+}
+
+/*tex
+ This is true when the character that |aux_str2uni| decodes at the start of |str_string(a)| has
+ category code |letter_cmd| according to |tex_aux_the_cat_code|.
+*/
+# define is_cat_letter(a) (tex_aux_the_cat_code(aux_str2uni(str_string((a)))) == letter_cmd)
+
+/* make two versions: macro and not */
+
+/*tex
+    Serialize the token list that follows the reference node |pp| into the shared buffer
+    |lmt_token_state.buffer| and return that buffer, zero terminated.
+
+    The buffer belongs to |lmt_token_state|: every call restarts at position zero and may even
+    deallocate and reallocate it, so the result has to be used (or copied) before the next call
+    and must not be freed by the caller.
+
+    \startitemize
+    \startitem |pp|: the list head (reference node); when it is zero, or has no successor,
+        |NULL| is returned and |*siz| becomes zero. \stopitem
+    \startitem |inhibit_par|: when set, neither an |end_paragraph_cmd| nor the |par_token|
+        control sequence is serialized. \stopitem
+    \startitem |siz|: optional, receives the number of bytes in the result (without the
+        terminating zero). \stopitem
+    \startitem |skippreamble|: when set |skip| is initialized from |get_token_parameters(pp)|
+        and, as long as it is nonzero, output is suppressed until an |end_match_cmd| with
+        character zero is seen. \stopitem
+    \startitem |nospace|: when set no space is added after control sequence names and a
+        parameter character is not doubled. \stopitem
+    \startitem |strip|: when nonzero and the result has more than one byte, one trailing and
+        one leading byte equal to |strip| are removed. \stopitem
+    \stopitemize
+*/
+
+char *tex_tokenlist_to_tstring(int pp, int inhibit_par, int *siz, int skippreamble, int nospace, int strip)
+{
+    if (pp) {
+        /*tex We need to go beyond the reference. */
+        int p = token_link(pp);
+        if (p) {
+            if (lmt_token_state.bufmax > default_buffer_size) {
+                /* Let's start fresh and small. */
+                aux_deallocate_array(lmt_token_state.buffer);
+                lmt_token_state.buffer = aux_allocate_clear_array(sizeof(unsigned char), default_buffer_size, 1);
+                lmt_token_state.bufmax = default_buffer_size;
+            } else if (! lmt_token_state.buffer) {
+                /* Let's start. */
+                lmt_token_state.buffer = aux_allocate_clear_array(sizeof(unsigned char), default_buffer_size, 1);
+                lmt_token_state.bufmax = default_buffer_size;
+            }
+            lmt_token_state.bufloc = 0;
+            int e = escape_char_par; /*tex The serialization of the escape, normally a backslash. */
+            int n = '0';             /*tex The character after |#|, so |#0| upto |#9| */
+            int min = 0;
+            int max = lmt_token_memory_state.tokens_data.top;
+            int skip = 0;
+            if (skippreamble) {
+                skip = get_token_parameters(pp);
+            }
+            while (p) {
+                if (p < min || p > max) {
+                    /*tex The link points outside token memory, so we quit. */
+                    tex_aux_append_str_to_buffer(error_string_clobbered(31));
+                    break;
+                } else {
+                    int infop = token_info(p);
+                    if (infop < 0) {
+                        /* unlikely, will go after checking */
+                        tex_aux_append_str_to_buffer(error_string_bad(32));
+                    } else if (infop < cs_token_flag) {
+                        /*tex We nearly always end up here because otherwise we have an error. */
+                        int cmd = token_cmd(infop);
+                        int chr = token_chr(infop);
+                        switch (cmd) {
+                            case left_brace_cmd:
+                            case right_brace_cmd:
+                            case math_shift_cmd:
+                            case alignment_tab_cmd:
+                            case superscript_cmd:
+                            case subscript_cmd:
+                            case spacer_cmd:
+                            case letter_cmd:
+                            case other_char_cmd:
+                                if (! skip) {
+                                    tex_aux_append_uchar_to_buffer(chr);
+                                }
+                                break;
+                            case parameter_cmd:
+                                if (! skip) {
+                                    /*tex The parameter character gets doubled unless that is suppressed. */
+                                    if (! nospace && (! lmt_token_state.in_lua_escape && (lmt_expand_state.cs_name_level == 0))) {
+                                        tex_aux_append_uchar_to_buffer(chr);
+                                    }
+                                    tex_aux_append_uchar_to_buffer(chr);
+                                }
+                                break;
+                            case parameter_reference_cmd:
+                                if (! skip) {
+                                    tex_aux_append_char_to_buffer(match_visualizer);
+                                    if (chr <= 9) {
+                                        tex_aux_append_char_to_buffer(chr + '0');
+                                    } else {
+                                        tex_aux_append_char_to_buffer('!');
+                                        goto EXIT;
+                                    }
+                                } else {
+                                    if (chr > 9) {
+                                        goto EXIT;
+                                    }
+                                }
+                                break;
+                            case match_cmd:
+                                if (! skip) {
+                                    tex_aux_append_char_to_buffer(match_visualizer);
+                                }
+                                if (is_valid_match_ref(chr)) {
+                                    ++n;
+                                }
+                                if (! skip) {
+                                    tex_aux_append_char_to_buffer(chr ? chr : '0');
+                                }
+                                if (n > '9') {
+                                    goto EXIT;
+                                }
+                                break;
+                            case end_match_cmd:
+                                if (chr == 0) {
+                                    if (! skip) {
+                                        tex_aux_append_char_to_buffer('-');
+                                        tex_aux_append_char_to_buffer('>');
+                                    }
+                                    /*tex From here on the body is serialized. */
+                                    skip = 0 ;
+                                }
+                                break;
+                            /*
+                            case string_cmd:
+                                c = c + cs_offset_value;
+                                do_make_room((int) str_length(c));
+                                for (int i = 0; i < str_length(c); i++) {
+                                    token_state.buffer[token_state.bufloc++] = str_string(c)[i];
+                                }
+                                break;
+                            */
+                            case end_paragraph_cmd:
+                                if (! inhibit_par && (auto_paragraph_mode(auto_paragraph_text))) {
+                                    tex_aux_append_esc_to_buffer("par");
+                                }
+                                break;
+                            default:
+                                tex_aux_append_str_to_buffer(tex_aux_special_cmd_string(cmd, chr, error_string_bad(33)));
+                                break;
+                        }
+                    } else if (! (inhibit_par && infop == lmt_token_state.par_token)) {
+                        int q = infop - cs_token_flag;
+                        if (q < hash_base) {
+                            if (q == null_cs) {
+                                tex_aux_append_esc_to_buffer("csname");
+                                tex_aux_append_esc_to_buffer("endcsname");
+                            } else {
+                                tex_aux_append_str_to_buffer(error_string_impossible(34));
+                            }
+                        } else if (eqtb_out_of_range(q)) {
+                            tex_aux_append_str_to_buffer(error_string_impossible(35));
+                        } else {
+                            strnumber txt = cs_text(q);
+                            if (txt < 0 || txt >= lmt_string_pool_state.string_pool_data.ptr) {
+                                tex_aux_append_str_to_buffer(error_string_nonexistent(36));
+                            } else {
+                                /*tex We get a copy in |sh| that we have to free ourselves. */
+                                char *sh = tex_makecstring(txt);
+                                char *s = sh;
+                                if (tex_is_active_cs(txt)) {
+                                    /*tex The first three bytes of an active name are not serialized. */
+                                    s = s + 3;
+                                    while (*s) {
+                                        tex_aux_append_char_to_buffer(*s);
+                                        s++;
+                                    }
+                                } else {
+                                    if (e >= 0 && e < 0x110000) {
+                                        tex_aux_append_uchar_to_buffer(e);
+                                    }
+                                    while (*s) {
+                                        tex_aux_append_char_to_buffer(*s);
+                                        s++;
+                                    }
+                                    if ((! nospace) && ((! tex_single_letter(txt)) || is_cat_letter(txt))) {
+                                        tex_aux_append_char_to_buffer(' ');
+                                    }
+                                }
+                                lmt_memory_free(sh);
+                            }
+                        }
+                    }
+                    p = token_link(p);
+                }
+            }
+          EXIT:
+            if (strip && lmt_token_state.bufloc > 1) {
+                if (lmt_token_state.buffer[lmt_token_state.bufloc-1] == strip) {
+                    lmt_token_state.bufloc -= 1;
+                }
+                if (lmt_token_state.bufloc > 1 && lmt_token_state.buffer[0] == strip) {
+                    /*tex
+                        Source and destination overlap here (we shift everything one byte to
+                        the left), so this has to be |memmove| and not |memcpy|.
+                    */
+                    memmove(&lmt_token_state.buffer[0], &lmt_token_state.buffer[1], lmt_token_state.bufloc-1);
+                    lmt_token_state.bufloc -= 1;
+                }
+            }
+            lmt_token_state.buffer[lmt_token_state.bufloc] = '\0';
+            if (siz) {
+                *siz = lmt_token_state.bufloc;
+            }
+            return lmt_token_state.buffer;
+        }
+    }
+    if (siz) {
+        *siz = 0;
+    }
+    return NULL;
+}
+
+/*tex
+
+ The \LUA\ interface needs some extra functions. The functions themselves are quite boring, but
+ they are handy because otherwise this internal stuff has to be accessed from \CCODE\ directly,
+ where lots of the defines are not available.
+
+*/
+
+/*tex Fetch a dimension by index: an internal parameter when |internal| is set, else a register. */
+halfword tex_get_tex_dimen_register(int j, int internal)
+{
+    if (internal) {
+        return dimen_parameter(j);
+    } else {
+        return dimen_register(j);
+    }
+}
+/*tex Fetch a glue value by index: an internal parameter when |internal| is set, else a register. */
+halfword tex_get_tex_skip_register(int j, int internal)
+{
+    if (internal) {
+        return glue_parameter(j);
+    } else {
+        return skip_register(j);
+    }
+}
+/*tex Fetch a math glue value by index: an internal parameter when |internal| is set, else a register. */
+halfword tex_get_tex_mu_skip_register(int j, int internal)
+{
+    if (internal) {
+        return mu_glue_parameter(j);
+    } else {
+        return mu_skip_register(j);
+    }
+}
+/*tex Fetch an integer by index: an internal parameter when |internal| is set, else a register. */
+halfword tex_get_tex_count_register(int j, int internal)
+{
+    if (internal) {
+        return count_parameter(j);
+    } else {
+        return count_register(j);
+    }
+}
+/*tex Fetch an attribute by index: an internal parameter when |internal| is set, else a register. */
+halfword tex_get_tex_attribute_register(int j, int internal)
+{
+    if (internal) {
+        return attribute_parameter(j);
+    } else {
+        return attribute_register(j);
+    }
+}
+/*tex Fetch a box by index: an internal parameter when |internal| is set, else a register. */
+halfword tex_get_tex_box_register(int j, int internal)
+{
+    if (internal) {
+        return box_parameter(j);
+    } else {
+        return box_register(j);
+    }
+}
+
+/*tex
+    Assign |v| to dimension |j|. The global flag is added to |flags| when |\globaldefs| is
+    set. Internal parameters go through |tex_assign_internal_dimen_value|, registers are
+    defined directly.
+*/
+void tex_set_tex_dimen_register(int j, halfword v, int flags, int internal)
+{
+    if (global_defs_par) {
+        flags = add_global_flag(flags);
+    }
+    if (! internal) {
+        tex_word_define(flags, register_dimen_location(j), v);
+        return;
+    }
+    tex_assign_internal_dimen_value(flags, internal_dimen_location(j), v);
+}
+
+/*tex
+    Assign |v| to glue |j|. The global flag is added to |flags| when |\globaldefs| is set.
+    Internal parameters go through |tex_assign_internal_skip_value|, registers are defined
+    directly.
+*/
+void tex_set_tex_skip_register(int j, halfword v, int flags, int internal)
+{
+    if (global_defs_par) {
+        flags = add_global_flag(flags);
+    }
+    if (! internal) {
+        tex_word_define(flags, register_glue_location(j), v);
+        return;
+    }
+    tex_assign_internal_skip_value(flags, internal_glue_location(j), v);
+}
+
+/*tex
+    Assign |v| to math glue |j|. The global flag is added to |flags| when |\globaldefs| is
+    set. Both the internal and the register variant are plain word definitions, only the
+    location differs.
+*/
+void tex_set_tex_mu_skip_register(int j, halfword v, int flags, int internal)
+{
+    if (global_defs_par) {
+        flags = add_global_flag(flags);
+    }
+    if (internal) {
+        tex_word_define(flags, internal_mu_glue_location(j), v);
+    } else {
+        tex_word_define(flags, register_mu_glue_location(j), v);
+    }
+}
+
+/*tex
+    Assign |v| to integer |j|. The global flag is added to |flags| when |\globaldefs| is set.
+    Internal parameters go through |tex_assign_internal_int_value|, registers are defined
+    directly.
+*/
+void tex_set_tex_count_register(int j, halfword v, int flags, int internal)
+{
+    if (global_defs_par) {
+        flags = add_global_flag(flags);
+    }
+    if (! internal) {
+        tex_word_define(flags, register_int_location(j), v);
+        return;
+    }
+    tex_assign_internal_int_value(flags, internal_int_location(j), v);
+}
+
+/*tex
+    Assign |v| to attribute |j|. We keep track of the highest attribute index that has been
+    set, and the global flag is added to |flags| when |\globaldefs| is set.
+
+    NOTE(review): |change_attribute_register| always gets the register location, also when
+    |internal| is set, while the definition itself then targets the internal location; it
+    looks intentional but is worth confirming.
+*/
+void tex_set_tex_attribute_register(int j, halfword v, int flags, int internal)
+{
+    halfword location = internal ? internal_attribute_location(j) : register_attribute_location(j);
+    if (lmt_node_memory_state.max_used_attribute < j) {
+        lmt_node_memory_state.max_used_attribute = j;
+    }
+    if (global_defs_par) {
+        flags = add_global_flag(flags);
+    }
+    /*tex This one has to come before the actual definition. */
+    change_attribute_register(flags, register_attribute_location(j), v);
+    tex_word_define(flags, location, v);
+}
+
+/*tex
+    Assign box |v| to box slot |j|. The global flag is added to |flags| when |\globaldefs| is
+    set. The location and the reference command depend on |internal|.
+*/
+void tex_set_tex_box_register(int j, halfword v, int flags, int internal)
+{
+    if (global_defs_par) {
+        flags = add_global_flag(flags);
+    }
+    if (! internal) {
+        tex_define(flags, register_box_location(j), register_box_reference_cmd, v);
+        return;
+    }
+    tex_define(flags, internal_box_location(j), internal_box_reference_cmd, v);
+}
+
+/*tex
+    Convert the string |s| with |tex_str_toks| into a token list, put a fresh reference node
+    in front of it and assign that list to token register |j| (or to the internal token
+    parameter when |internal| is set). The global flag is added when |\globaldefs| is set.
+*/
+void tex_set_tex_toks_register(int j, lstring s, int flags, int internal)
+{
+    /*tex The reference is allocated first, then the tokens; the link is set afterwards. */
+    halfword reference = get_reference_token();
+    halfword tokens = tex_str_toks(s, NULL);
+    set_token_link(reference, tokens);
+    if (global_defs_par) {
+        flags = add_global_flag(flags);
+    }
+    if (! internal) {
+        tex_define(flags, register_toks_location(j), register_toks_reference_cmd, reference);
+        return;
+    }
+    tex_define(flags, internal_toks_location(j), internal_toks_reference_cmd, reference);
+}
+
+/*tex
+    Like |tex_set_tex_toks_register| but the string |s| is converted by |tex_str_scan_toks|,
+    which also gets |c| passed. The resulting list, with a fresh reference node in front, is
+    assigned to token register |j| or to the internal token parameter.
+*/
+void tex_scan_tex_toks_register(int j, int c, lstring s, int flags, int internal)
+{
+    /*tex The reference is allocated first, then the tokens; the link is set afterwards. */
+    halfword reference = get_reference_token();
+    halfword tokens = tex_str_scan_toks(c, s);
+    set_token_link(reference, tokens);
+    if (global_defs_par) {
+        flags = add_global_flag(flags);
+    }
+    if (! internal) {
+        tex_define(flags, register_toks_location(j), register_toks_reference_cmd, reference);
+        return;
+    }
+    tex_define(flags, internal_toks_location(j), internal_toks_reference_cmd, reference);
+}
+
+/*tex
+    Return the content of token register |j| (or of the internal token parameter) as the
+    result of |tex_tokens_to_string|; an unset register gives |get_nullstr()|.
+*/
+int tex_get_tex_toks_register(int j, int internal)
+{
+    halfword list = internal ? toks_parameter(j) : toks_register(j);
+    if (! list) {
+        return get_nullstr();
+    }
+    return tex_tokens_to_string(list);
+}
+
+/* Options: (0) error when undefined [bad], (1) create [but undefined], (2) ignore [discard] */
+
+/*tex
+    Convert the \UTF8 string |str| of |lstr| bytes into tokens, using catcode table |ct|, and
+    append them after |*tail| (when given and set) or else after |head|. When |head| is zero a
+    new reference token is allocated. The (possibly new) |head| is returned and |*tail|, when
+    given, is updated to the last token. The |option| parameter controls what happens with
+    unknown control sequences: with 1 they are created, with 2 they are dropped and otherwise
+    we backtrack and continue right after the escape character.
+*/
+halfword tex_parse_str_to_tok(halfword head, halfword *tail, halfword ct, const char *str, size_t lstr, int option)
+{
+    halfword current = null;
+    if (! head) {
+        head = get_reference_token();
+    }
+    current = (tail && *tail) ? *tail : head;
+    if (lstr > 0) {
+        const char *limit = str + lstr;
+        while (str < limit) {
+            /*tex hh: |str2uni| could return len too (also elsewhere) */
+            halfword chr = (halfword) aux_str2uni((const unsigned char *) str);
+            halfword tok = null;
+            halfword cat = tex_get_cat_code(ct, chr);
+            str += utf8_size(chr);
+            /*tex
+                This is a relatively simple converter; if more is needed one can just use
+                |tex.print| with a regular |\def| or |\gdef| and feed the string into the
+                regular scanner.
+            */
+            if (cat == comment_cmd) {
+                /*tex The rest of the string is ignored. */
+                break;
+            } else if (cat == ignore_cmd) {
+                continue;
+            } else if (cat == spacer_cmd) {
+                /*tex Whatever the space character was, we store a regular space. */
+                tok = token_val(spacer_cmd, ' ');
+            } else if (cat == escape_cmd) {
+                /*tex We have a potential control sequence so we check for it. */
+                const char *name = str;
+                int length = 0;
+                while (str < limit) {
+                    halfword next = (halfword) aux_str2uni((const unsigned char *) str);
+                    int size = utf8_size(next);
+                    int nextcat = tex_get_cat_code(ct, next);
+                    if (nextcat == letter_cmd) {
+                        str += size;
+                        length += size;
+                        continue;
+                    }
+                    if (nextcat == spacer_cmd) {
+                        /*tex
+                            We ignore a trailing space like normal scanning does, but a space
+                            right after the escape is the name itself.
+                        */
+                        if (length == 0) {
+                            length += size;
+                        }
+                        str += size;
+                    } else if (length == 0) {
+                        /*tex A single non letter makes the name. */
+                        length += size;
+                        str += size;
+                    }
+                    break;
+                }
+                if (length > 0) {
+                    /*tex We have a potential |\cs|; only option 1 creates it when unknown. */
+                    halfword cs = tex_string_locate(name, length, option == 1 ? 1 : 0);
+                    if (cs != undefined_control_sequence) {
+                        tok = cs_token_flag + cs;
+                    } else if (option != 2) {
+                        /*tex We play safe and backtrack: the escape becomes a character token. */
+                        tok = chr + (cat * (1<<21));
+                        str = name;
+                    }
+                    /*tex Otherwise (option 2) we ignore the unknown command. */
+                } else {
+                    /*tex
+                        Nothing follows the escape character, so it is stored as a character
+                        token with its own catcode.
+                    */
+                    tok = chr + (cat * (1 << 21));
+                    str = name;
+                }
+            } else {
+                /*tex
+                    Whatever token, so for instance $x^2$ just works given a tex catcode regime.
+                */
+                tok = chr + (cat * (1<<21));
+            }
+            if (tok) {
+                current = tex_store_new_token(current, tok);
+            }
+        }
+    }
+    if (tail) {
+        *tail = current;
+    }
+    return head;
+}
+
+/*tex So far for the helpers. */
+
+void tex_dump_token_mem(dumpstream f)
+{
+    /*tex
+        It doesn't pay off to prune the available list. We save less than 10K if we do this and
+        it assumes a sequence at the end. It doesn't help that the list is in reverse order so
+        we just dump the lot. But we do check the allocated size. We cheat a bit in reducing
+        the ptr so that we can set the initial counter on loading: every token on the
+        available list is subtracted from |top + 1|.
+    */
+    halfword used = lmt_token_memory_state.tokens_data.top + 1;
+    for (halfword avail = lmt_token_memory_state.available; avail; avail = token_link(avail)) {
+        --used;
+    }
+    lmt_token_memory_state.tokens_data.ptr = used;
+    /*tex The order below has to match the one used when undumping. */
+    dump_int(f, lmt_token_state.null_list); /* the only one left */
+    dump_int(f, lmt_token_memory_state.tokens_data.allocated);
+    dump_int(f, lmt_token_memory_state.tokens_data.top);
+    dump_int(f, lmt_token_memory_state.tokens_data.ptr);
+    dump_int(f, lmt_token_memory_state.available);
+    dump_things(f, lmt_token_memory_state.tokens[0], lmt_token_memory_state.tokens_data.top + 1);
+}
+
+/*tex
+    Read back what |tex_dump_token_mem| wrote, in the very same order: the null list, the
+    bookkeeping fields (allocated, top, ptr), the head of the available list and finally the
+    tokens themselves.
+*/
+void tex_undump_token_mem(dumpstream f)
+{
+    undump_int(f, lmt_token_state.null_list); /* the only one left */
+    undump_int(f, lmt_token_memory_state.tokens_data.allocated);
+    undump_int(f, lmt_token_memory_state.tokens_data.top);
+    undump_int(f, lmt_token_memory_state.tokens_data.ptr);
+    undump_int(f, lmt_token_memory_state.available);
+    /*tex
+        This comes after the bookkeeping has been read and before the tokens get loaded;
+        presumably it allocates the token array according to the values just read (to be
+        confirmed).
+    */
+    tex_initialize_token_mem();
+    undump_things(f, lmt_token_memory_state.tokens[0], lmt_token_memory_state.tokens_data.top + 1);
+}