diff options
Diffstat (limited to 'source/luametatex/source/tex/textoken.c')
-rw-r--r-- | source/luametatex/source/tex/textoken.c | 3511 |
1 files changed, 3511 insertions, 0 deletions
diff --git a/source/luametatex/source/tex/textoken.c b/source/luametatex/source/tex/textoken.c new file mode 100644 index 000000000..0d2415233 --- /dev/null +++ b/source/luametatex/source/tex/textoken.c @@ -0,0 +1,3511 @@ +/* + See license.txt in the root of this project. +*/ + +# include "luametatex.h" + +/*tex Todo: move some helpers to other places. */ + +inline static int tex_aux_the_cat_code(halfword b) +{ + return (lmt_input_state.cur_input.cattable == default_catcode_table_preset) ? + tex_get_cat_code(cat_code_table_par, b) + : ( (lmt_input_state.cur_input.cattable > -0xFF) ? + tex_get_cat_code(lmt_input_state.cur_input.cattable, b) + : ( + - lmt_input_state.cur_input.cattable - 0xFF + ) ) ; +} + +/*tex + + The \TEX\ system does nearly all of its own memory allocation, so that it can readily be + transported into environments that do not have automatic facilities for strings, garbage + collection, etc., and so that it can be in control of what error messages the user receives. + The dynamic storage requirements of \TEX\ are handled by providing two large arrays called + |fixmem| and |varmem| in which consecutive blocks of words are used as nodes by the \TEX\ + routines. + + Pointer variables are indices into this array, or into another array called |eqtb| that + will be explained later. A pointer variable might also be a special flag that lies outside + the bounds of |mem|, so we allow pointers to assume any |halfword| value. The minimum + halfword value represents a null pointer. \TEX\ does not assume that |mem[null]| exists. + + Locations in |fixmem| are used for storing one-word records; a conventional |AVAIL| stack is + used for allocation in this array. + + One can make an argument to switch to standard \CCODE\ allocation but the current approach is + very efficient in memory usage and performence so we stay with it. On the average memory + consumption of \TEX| is not that large, definitely not compared to other programs that deal + with text. + + The big dynamic storage area is named |fixmem| where the smallest location of one|-|word + memory in use is |fix_mem_min| and the largest location of one|-|word memory in use is + |fix_mem_max|. + + The |dyn_used| variable keeps track of how much memory is in use. The head of the list of + available one|-|word nodes is registered in |avail|. The last one-|word node used in |mem| + is |fix_mem_end|. + + All these variables are packed in the structure |token_memory_state|. + +*/ + +token_memory_state_info lmt_token_memory_state = { + .tokens = NULL, + .tokens_data = { + .minimum = min_token_size, + .maximum = max_token_size, + .size = siz_token_size, + .step = stp_token_size, + .allocated = 0, + .itemsize = sizeof(memoryword), + .top = 0, + .ptr = 0, /* used to register usage */ + .initial = 0, + .offset = 0, + }, + .available = 0, + .padding = 0, +}; + +/*tex + + Token data has its own memory space. Again we have some state variables: |temp_token_head| is + the head of a (temporary) list of some kind as are |hold_token_head| and |omit_template|. A + permanently empty list is available in |null_list| and the head of the token list built by + |scan_keyword| is registered in |backup_head|. All these variables are packed in the structure + |token_data| but some have been moved to a more relevant state (so omit and hold are now in the + alignment state). + +*/ + +token_state_info lmt_token_state = { + .null_list = null, + .in_lua_escape = 0, + .force_eof = 0, + .luacstrings = 0, + .par_loc = null, + .par_token = null, + /* .line_par_loc = null, */ /* removed because not really used and useful */ + /* .line_par_token = null, */ /* idem */ + .buffer = NULL, + .bufloc = 0, + .bufmax = 0, + .padding = 0, +}; + +/*tex Some properties are dumped in the format so these are aet already! */ + +# define reserved_token_mem_slots 2 // play safe for slight overuns + +void tex_initialize_token_mem(void) +{ + memoryword *tokens = NULL; + int size = 0; + if (lmt_main_state.run_state == initializing_state) { + size = lmt_token_memory_state.tokens_data.minimum; + } else { + size = lmt_token_memory_state.tokens_data.allocated; + lmt_token_memory_state.tokens_data.initial = lmt_token_memory_state.tokens_data.ptr; + } + if (size > 0) { + tokens = aux_allocate_clear_array(sizeof(memoryword), size, reserved_token_mem_slots); + } + if (tokens) { + lmt_token_memory_state.tokens = tokens; + lmt_token_memory_state.tokens_data.allocated = size; + } else { + tex_overflow_error("tokens", size); + } +} + +static void tex_aux_bump_token_memory(void) +{ + /*tex We need to manage the big dynamic storage area. */ + int size = lmt_token_memory_state.tokens_data.allocated + lmt_token_memory_state.tokens_data.step; + if (size > lmt_token_memory_state.tokens_data.size) { + lmt_run_memory_callback("token", 0); + tex_show_runaway(); + tex_overflow_error("token memory size", lmt_token_memory_state.tokens_data.allocated); + } else { + memoryword *tokens = aux_reallocate_array(lmt_token_memory_state.tokens, sizeof(memoryword), size, reserved_token_mem_slots); + lmt_run_memory_callback("token", tokens ? 1 : 0); + if (tokens) { + lmt_token_memory_state.tokens = tokens; + } else { + /*tex If memory is exhausted, display possible runaway text. */ + tex_show_runaway(); + tex_overflow_error("token memory size", lmt_token_memory_state.tokens_data.allocated); + } + } + memset((void *) (lmt_token_memory_state.tokens + lmt_token_memory_state.tokens_data.allocated + 1), 0, ((size_t) lmt_token_memory_state.tokens_data.step + reserved_token_mem_slots) * sizeof(memoryword)); + lmt_token_memory_state.tokens_data.allocated = size; +} + +void tex_initialize_tokens(void) +{ + lmt_token_memory_state.available = null; + lmt_token_memory_state.tokens_data.top = 0; + lmt_token_state.null_list = tex_get_available_token(null); + lmt_token_state.in_lua_escape = 0; +} + +/*tex + Experiment. It saves some 512K on the \CONTEXT\ format of October 2020. It makes me wonder if I + should spend some time on optimizing token lists (kind of cisc commands as we're currently kind + of risc). +*/ + +void tex_compact_tokens(void) +{ + int nc = 0; + // memoryword *target = allocate_array(sizeof(memoryword), (size_t) token_memory_state.tokens_data.allocated, 0); + memoryword *target = aux_allocate_clear_array(sizeof(memoryword), (size_t) lmt_token_memory_state.tokens_data.allocated, 0); + halfword *mapper = aux_allocate_array(sizeof(halfword), (size_t) lmt_token_memory_state.tokens_data.allocated, 0); + int nofluacmds = 0; + if (target && mapper) { + // memset((void *) target, 0, ((size_t) token_memory_state.tokens_data.allocated) * sizeof(memoryword)); + memset((void *) mapper, -1, ((size_t) lmt_token_memory_state.tokens_data.allocated) * sizeof(halfword)); + memoryword *tokens = lmt_token_memory_state.tokens; + /* also reset available */ + for (int cs = 0; cs < (eqtb_size + lmt_hash_state.hash_data.ptr); cs++) { + switch (eq_type(cs)) { + case call_cmd: + case protected_call_cmd: + case semi_protected_call_cmd: + case tolerant_call_cmd: + case tolerant_protected_call_cmd: + case tolerant_semi_protected_call_cmd: + case internal_toks_reference_cmd: + case register_toks_reference_cmd: + { + halfword v = eq_value(cs); /* ref count token*/ + if (v) { + if (mapper[v] < 0) { + // printf("before =>"); { halfword tt = v; while (tt) { printf("%7d ",tt); tt = token_link(tt); } } printf("\n"); + halfword t = v; + nc++; + mapper[v] = nc; /* new ref count token index */ + while (1) { + target[nc].half1 = tokens[t].half1; /* info cq. ref count */ + t = tokens[t].half0; + if (t) { + nc++; + target[nc-1].half0 = nc; /* link to next */ + } else { + target[nc].half0 = null; /* link to next */ + break; + } + } + // printf("after =>"); { halfword tt = mapper[v]; while (tt) { printf("%7d ",tt); tt = target[tt].half0; } } printf("\n"); + } + eq_value(cs) = mapper[v]; + } + break; + } + case lua_value_cmd: + case lua_call_cmd: + case lua_local_call_cmd: + { + ++nofluacmds; + break; + } + } + } + // print(dump_state.format_identifier); + tex_print_format("tokenlist compacted from %i to %i entries, ", lmt_token_memory_state.tokens_data.top, nc); + if (nofluacmds) { + /*tex + We just mention them because when these are aliased the macro package needs to make + sure that after loading that happens again because registered funciton references + can have changed between format generation and run! + */ + tex_print_format("%i potentially aliased lua call/value entries, ", nofluacmds); + } + lmt_token_memory_state.tokens_data.top = nc; + lmt_token_memory_state.tokens_data.ptr = nc; + aux_deallocate_array(lmt_token_memory_state.tokens); + lmt_token_memory_state.tokens = target; + lmt_token_memory_state.available = null; + } else { + tex_overflow_error("token compaction size", lmt_token_memory_state.tokens_data.allocated); + } +} + + +/*tex + + The function |get_avail| returns a pointer (index) to a new one word node whose |link| field is + |null| (which is just 0). However, \TEX\ will halt if there is no more room left. + + If the available space list is empty, i.e., if |avail = null|, we try first to increase + |fix_mem_end|. If that cannot be done, i.e., if |fix_mem_end = fix_mem_max|, we try to reallocate + array |fixmem|. If, that doesn't work, we have to quit. Users can configure \TEX\ to use a lot of + memory but in some scenarios limitations make sense. + + Remark: we can have a pool of chunks where we get from or just allocate per token (as we have lots + of them that is slow). But then format loading becomes much slower as we need to recreate the + linked list. A no go. In todays terms \TEX\ memory usage is low anyway. + + The freed tokens are kept in a linked list. First we check if we can quickly get one of these. If + that fails, we try to get one from the available pool. If that fails too, we enlarge the pool and + try again. We keep track of the used number of tokens. We also make sure that the tokens links to + nothing. + + One problem is of course that tokens can be scattered over memory. We could have some sorter that + occasionally kicks in but it doesn't pay off. Normally definitions (in the format) are in sequence + but a normal run \unknown\ it would be interesting to know if this impacts the cache. + +*/ + +halfword tex_get_available_token(halfword t) +{ + halfword p = lmt_token_memory_state.available; + if (p) { + lmt_token_memory_state.available = token_link(p); + } else if (lmt_token_memory_state.tokens_data.top < lmt_token_memory_state.tokens_data.allocated) { + p = ++lmt_token_memory_state.tokens_data.top; + } else { + tex_aux_bump_token_memory(); + p = ++lmt_token_memory_state.tokens_data.top; + } + ++lmt_token_memory_state.tokens_data.ptr; + token_link(p) = null; + token_info(p) = t; + return p; +} + +/*tex + + Because we only have forward links, a freed token ends up at the head of the list of available + tokens. + +*/ + +void tex_put_available_token(halfword p) +{ + token_link(p) = lmt_token_memory_state.available; + lmt_token_memory_state.available = p; + --lmt_token_memory_state.tokens_data.ptr; +} + +halfword tex_store_new_token(halfword p, halfword t) +{ + halfword q = tex_get_available_token(t); + token_link(p) = q; + return q; +} + +/*tex + + The procedure |flush_list (p)| frees an entire linked list of oneword nodes that starts at + position |p|. It makes list of single word nodes available. The second variant in principle + is faster but in practice this goes unnoticed. Of course there is a little price to pay for + keeping track of memory usage. + +*/ + +void tex_flush_token_list(halfword head) +{ + if (head) { + halfword current = head; + halfword tail; + int i = 0; + do { + ++i; + tail = current; + current = token_link(tail); + } while (current); + lmt_token_memory_state.tokens_data.ptr -= i; + token_link(tail) = lmt_token_memory_state.available; + lmt_token_memory_state.available = head; + } +} + +void tex_flush_token_list_head_tail(halfword head, halfword tail, int n) +{ + if (head) { + lmt_token_memory_state.tokens_data.ptr -= n; + token_link(tail) = lmt_token_memory_state.available; + lmt_token_memory_state.available = head; + } +} + +void tex_add_token_reference(halfword p) +{ + if (get_token_reference(p) < max_token_reference) { + add_token_reference(p); + } else { + tex_overflow_error("reference count", max_token_reference); + } +} + +void tex_increment_token_reference(halfword p, int n) +{ + if ((get_token_reference(p) + n) < max_token_reference) { + inc_token_reference(p,n); + } else { + tex_overflow_error("reference count", max_token_reference); + } +} + +void tex_delete_token_reference(halfword p) +{ + if (p) { + if (get_token_reference(p)) { + sub_token_reference(p); + } else { + tex_flush_token_list(p); + } + } +} + +/*tex + + A \TEX\ token is either a character or a control sequence, and it is represented internally in + one of two ways: + + \startitemize[n] + \startitem + A character whose ASCII code number is |c| and whose command code is |m| is represented + as the number $2^{21}m+c$; the command code is in the range |1 <= m <= 14|. + \stopitem + \startitem + A control sequence whose |eqtb| address is |p| is represented as the number + |cs_token_flag+p|. Here |cs_token_flag = t =| $2^{25}-1$ is larger than $2^{21}m+c$, yet + it is small enough that |cs_token_flag + p < max_halfword|; thus, a token fits + comfortably in a halfword. + \stopitem + \stopitemize + + A token |t| represents a |left_brace| command if and only if |t < left_brace_limit|; it + represents a |right_brace| command if and only if we have |left_brace_limit <= t < + right_brace_limit|; and it represents a |match| or |end_match| command if and only if + |match_token <= t <= end_match_token|. The following definitions take care of these + token-oriented constants and a few others. + + A token list is a singly linked list of one-word nodes in |mem|, where each word contains a token + and a link. Macro definitions, output routine definitions, marks, |\write| texts, and a few other + things are remembered by \TEX\ in the form of token lists, usually preceded by a node with a + reference count in its |token_ref_count| field. The token stored in location |p| is called + |info(p)|. + + Three special commands appear in the token lists of macro definitions. When |m = match|, it means + that \TEX\ should scan a parameter for the current macro; when |m = end_match|, it means that + parameter matching should end and \TEX\ should start reading the macro text; and when |m = + out_param|, it means that \TEX\ should insert parameter number |c| into the text at this point. + + The enclosing |\char'173| and |\char'175| characters of a macro definition are omitted, but the + final right brace of an output routine is included at the end of its token list. + + Here is an example macro definition that illustrates these conventions. After \TEX\ processes + the text: + + \starttyping + \def\mac a#1#2 \b {#1\-a ##1#2 \#2\} + \stoptyping + + The definition of |\mac| is represented as a token list containing: + + \starttyping + (reference count) letter a match # match # spacer \b end_match + out_param1 \- letter a spacer, mac_param # other_char 1 + out_param2 spacer out_param 2 + \stoptyping + + The procedure |scan_toks| builds such token lists, and |macro_call| does the parameter matching. + + Examples such as |\def \m {\def \m {a} b}| explain why reference counts would be needed even if + \TEX\ had no |\let| operation: When the token list for |\m| is being read, the redefinition of + |\m| changes the |eqtb| entry before the token list has been fully consumed, so we dare not + simply destroy a token list when its control sequence is being redefined. + + If the parameter-matching part of a definition ends with |#{}|, the corresponding token list + will have |{| just before the |end_match| and also at the very end. The first |{| is used to + delimit the parameter; the second one keeps the first from disappearing. + + The |print_meaning| subroutine displays |cur_cmd| and |cur_chr| in symbolic form, including the + expansion of a macro or mark. + +*/ + +void tex_print_meaning(halfword code) +{ + /*tex + + This would make sense but some macro packages don't like it: + + \starttyping + if (cur_cmd == math_given_cmd) { + cur_cmd = math_xgiven_cmd ; + } + \stoptyping + + Eventually we might just do it that way. We also can have |\meaningonly| that omits the + |macro:| and arguments. + */ + int untraced = is_untraced(eq_flag(cur_cs)); + if (! untraced) { + switch (code) { + case meaning_code: + case meaning_full_code: + case meaning_asis_code: + tex_print_cmd_flags(cur_cs, cur_cmd, (code == meaning_full_code || code == meaning_asis_code), code == meaning_asis_code); + break; + } + } + switch (cur_cmd) { + case call_cmd: + case protected_call_cmd: + case semi_protected_call_cmd: + case tolerant_call_cmd: + case tolerant_protected_call_cmd: + case tolerant_semi_protected_call_cmd: + if (untraced) { + tex_print_cs(cur_cs); + return; + } else { + switch (code) { + case meaning_code: + case meaning_full_code: + tex_print_str("macro"); + goto FOLLOWUP; + case meaning_asis_code: + // tex_print_format("%e%C %S ", def_cmd, def_code, cur_cs); + tex_print_cmd_chr(def_cmd, def_code); + tex_print_char(' '); + tex_print_cs(cur_cs); + tex_print_char(' '); + if (cur_chr && token_link(cur_chr)) { + halfword body = get_token_parameters(cur_chr) ? tex_show_token_list(token_link(cur_chr), null, default_token_show_max, 1) : token_link(cur_chr); + tex_print_char('{'); + if (body) { + tex_show_token_list(body, null, default_token_show_max, 0); + } + tex_print_char('}'); + } + return; + } + goto DETAILS; + } + case get_mark_cmd: + tex_print_cmd_chr((singleword) cur_cmd, cur_chr); + tex_print_char(':'); + tex_print_nlp(); + tex_token_show(tex_get_some_mark(cur_chr, 0), default_token_show_max); + return; + case lua_value_cmd: + case lua_call_cmd: + case lua_local_call_cmd: + case lua_protected_call_cmd: + if (untraced) { + tex_print_cs(cur_cs); + return; + } else { + goto DEFAULT; + } + case if_test_cmd: + if (cur_chr > last_if_test_code) { + tex_print_cs(cur_cs); + return; + } else { + goto DEFAULT; + } + default: + DEFAULT: + tex_print_cmd_chr((singleword) cur_cmd, cur_chr); + if (cur_cmd < call_cmd) { + return; + } else { + /* all kind of reference cmds */ + break; + } + } + FOLLOWUP: + tex_print_char(':'); + DETAILS: + tex_print_nlp(); + tex_token_show(cur_chr, default_token_show_max); +} + +/*tex + + The procedure |show_token_list|, which prints a symbolic form of the token list that starts at + a given node |p|, illustrates these conventions. The token list being displayed should not begin + with a reference count. However, the procedure is intended to be robust, so that if the memory + links are awry or if |p| is not really a pointer to a token list, nothing catastrophic will + happen. + + An additional parameter |q| is also given; this parameter is either null or it points to a node + in the token list where a certain magic computation takes place that will be explained later. + Basically, |q| is non-null when we are printing the two-line context information at the time of + an error message; |q| marks the place corresponding to where the second line should begin. + + For example, if |p| points to the node containing the first |a| in the token list above, then + |show_token_list| will print the string + + \starttyping + a#1#2 \b ->#1-a ##1#2 #2 + \stoptyping + + and if |q| points to the node containing the second |a|, the magic computation will be performed + just before the second |a| is printed. + + The generation will stop, and |\ETC.| will be printed, if the length of printing exceeds a given + limit~|l|. Anomalous entries are printed in the form of control sequences that are not followed + by a blank space, e.g., |\BAD.|; this cannot be confused with actual control sequences because a + real control sequence named |BAD| would come out |\BAD |. + + In \LUAMETATEX\ we have some more node types and token types so we also have additional tracing. + Because there is some more granularity in for instance nodes (subtypes) more detail is reported. + +*/ + +static const char *tex_aux_special_cmd_string(halfword cmd, halfword chr, const char *unknown) +{ + switch (cmd) { + case node_cmd : return "[[special cmd: node pointer]]"; + case lua_protected_call_cmd : return "[[special cmd: lua protected call]]"; + case lua_value_cmd : return "[[special cmd: lua value call]]"; + case iterator_value_cmd : return "[[special cmd: iterator value]]"; + case lua_call_cmd : return "[[special cmd: lua call]]"; + case lua_local_call_cmd : return "[[special cmd: lua local call]]"; + case begin_local_cmd : return "[[special cmd: begin local call]]"; + case end_local_cmd : return "[[special cmd: end local call]]"; + // case prefix_cmd : return "[[special cmd: enforced]]"; + case prefix_cmd : return "\\always"; + default : printf("[[unknown cmd: (%i,%i)]\n", cmd, chr); return unknown; + } +} + +halfword nn = 0; + +halfword tex_show_token_list(halfword p, halfword q, int l, int asis) +{ + if (p) { + /*tex the highest parameter number, as an \ASCII\ digit */ + unsigned char n = '0'; + int min = 0; + int max = lmt_token_memory_state.tokens_data.top; + lmt_print_state.tally = 0; +// if (l <= 0) { + l = extreme_token_show_max; +// } + while (p && (lmt_print_state.tally < l)) { + if (p == q) { + /*tex Do magic computation. We only end up here in context showing. */ + tex_set_trick_count(); + } + /*tex Display token |p|, and |return| if there are problems. */ + if (p < min || p > max) { + tex_print_str(error_string_clobbered(41)); + return null; + } else if (token_info(p) >= cs_token_flag) { + // if (! ((print_state.inhibit_par_tokens) && (token_info(p) == token_state.par_token))) { + tex_print_cs_checked(token_info(p) - cs_token_flag); + // } + } else if (token_info(p) < 0) { + tex_print_str(error_string_bad(42)); + } else if (token_info(p) == 0) { + tex_print_str(error_string_bad(44)); + } else { + int cmd = token_cmd(token_info(p)); + int chr = token_chr(token_info(p)); + /* + Display the token (|cmd|,|chr|). The procedure usually \quote {learns} the character + code used for macro parameters by seeing one in a |match| command before it runs + into any |out_param| commands. + + */ + switch (cmd) { + case left_brace_cmd: + case right_brace_cmd: + case math_shift_cmd: + case alignment_tab_cmd: + case superscript_cmd: + case subscript_cmd: + case spacer_cmd: + case letter_cmd: + case other_char_cmd: + case ignore_cmd: /* new */ + tex_print_tex_str(chr); + break; + case parameter_cmd: + if (! lmt_token_state.in_lua_escape && (lmt_expand_state.cs_name_level == 0)) { + tex_print_tex_str(chr); + } + tex_print_tex_str(chr); + break; + case parameter_reference_cmd: + tex_print_tex_str(match_visualizer); + if (chr <= 9) { + tex_print_char(chr + '0'); + } else { + tex_print_char('!'); + return null; + } + break; + case match_cmd: + tex_print_char(match_visualizer); + if (is_valid_match_ref(chr)) { + ++n; + } + tex_print_char(chr ? chr : '0'); + if (n > '9') { + /*tex Can this happen at all? */ + return null; + } else { + break; + } + case end_match_cmd: + if (asis) { + return token_link(p); + } else if (chr == 0) { + tex_print_str("->"); + } + break; + case ignore_something_cmd: + break; + case set_font_cmd: + tex_print_format("[font->%s]", font_original(cur_val)); + break; + case end_paragraph_cmd: + tex_print_format("%e%s", "par "); + break; + default: + tex_print_str(tex_aux_special_cmd_string(cmd, chr, error_string_bad(43))); + break; + } + } + p = token_link(p); + } + if (p) { + tex_print_str_esc("ETC."); + } + } + return p; +} + +/* +# define do_buffer_to_unichar(a,b) do { \ + a = (halfword)str2uni(fileio_state.io_buffer+b); \ + b += utf8_size(a); \ +} while (0) +*/ + +inline halfword get_unichar_from_buffer(int *b) +{ + halfword a = (halfword) ((const unsigned char) *(lmt_fileio_state.io_buffer + *b)); + if (a <= 0x80) { + *b += 1; + } else { + a = (halfword) aux_str2uni(lmt_fileio_state.io_buffer + *b); + *b += utf8_size(a); + } + return a; +} + +/*tex + + Here's the way we sometimes want to display a token list, given a pointer to its reference count; + the pointer may be null. + +*/ + +void tex_token_show(halfword p, int max) +{ + if (p && token_link(p)) { + tex_show_token_list(token_link(p), null, max, 0); + } +} + +/*tex + + The next function, |delete_token_ref|, is called when a pointer to a token list's reference + count is being removed. This means that the token list should disappear if the reference count + was |null|, otherwise the count should be decreased by one. Variable |p| points to the reference + count of a token list that is losing one reference. + +*/ + +int tex_get_char_cat_code(int c) +{ + return tex_aux_the_cat_code(c); +} + +static void tex_aux_invalid_character_error(void) +{ + tex_handle_error( + normal_error_type, + "Text line contains an invalid character", + "A funny symbol that I can't read has just been input. Continue, and I'll forget\n" + "that it ever happened." + ); +} + +static int tex_aux_process_sup_mark(void); + +static int tex_aux_scan_control_sequence(void); + +typedef enum next_line_retval { + next_line_ok, + next_line_return, + next_line_restart +} next_line_retval; + +static next_line_retval tex_aux_next_line(void); + +/*tex + + In case you are getting bored, here is a slightly less trivial routine: Given a string of + lowercase letters, like |pt| or |plus| or |width|, the |scan_keyword| routine checks to see + whether the next tokens of input match this string. The match must be exact, except that + ppercase letters will match their lowercase counterparts; uppercase equivalents are determined + by subtracting |"a" - "A"|, rather than using the |uc_code| table, since \TEX\ uses this + routine only for its own limited set of keywords. + + If a match is found, the characters are effectively removed from the input and |true| is + returned. Otherwise |false| is returned, and the input is left essentially unchanged (except + for the fact that some macros may have been expanded, etc.). + + In \LUATEX\ and its follow up we have more keywords and for instance when scanning a box + specification that is noticeable because the |scan_keyword| function is a little inefficient + in the sense that when there is no match, it will push back what got read so far. So there is + token allocation, pushing a level etc involved. Keep in mind that expansion happens here so what + gets pushing back is not always literally pushing back what we started with. + + In \LUAMETATEX\ we now have a bit different approach. The |scan_mandate_keyword| follows up on + |scan_character| so we have a two step approach. We could actually pass a list of valid keywords + but that would make for a complex function with no real benefits. + +*/ + +halfword tex_scan_character(const char *s, int left_brace, int skip_space, int skip_relax) +{ + halfword save_cur_cs = cur_cs; +// (void) skip_space; /* some day */ + while (1) { + tex_get_x_token(); + switch (cur_cmd) { + case spacer_cmd: + if (skip_space) { + break; + } else { + goto DONE; + } + break; + case relax_cmd: + if (skip_relax) { + break; + } else { + goto DONE; + } + case letter_cmd: + case other_char_cmd: + if (cur_chr <= 'z' && strchr(s, cur_chr)) { + cur_cs = save_cur_cs; + return cur_chr; + } else { + goto DONE; + } + case left_brace_cmd: + if (left_brace) { + cur_cs = save_cur_cs; + return '{'; + } else { + goto DONE; + } + default: + goto DONE; + } + } + DONE: + tex_back_input(cur_tok); + cur_cs = save_cur_cs; + return 0; +} + +void tex_aux_show_keyword_error(const char *s) +{ + tex_handle_error( + normal_error_type, + "Valid keyword expected, likely '%s'", + s, + "You started a keyword but it seems to be an invalid one. The first character(s)\n" + "might give you a clue. You might want to quit unwanted lookahead with \\relax." + ); +} + +/*tex + Scanning an optional keyword starts at the beginning. This means that we can also (for instance) + have a minus or plus sign which means that we have a different loop than with the alternative + that already checked the first character. +*/ + +int tex_scan_optional_keyword(const char *s) +{ + halfword save_cur_cs = cur_cs; + int done = 0; + const char *p = s; + while (*p) { + tex_get_x_token(); + switch (cur_cmd) { + case letter_cmd: + case other_char_cmd: + if ((cur_chr == *p) || (cur_chr == *p - 'a' + 'A')) { + if (*(++p)) { + done = 1; + } else { + cur_cs = save_cur_cs; + return 1; + } + } else if (done) { + goto BAD_NEWS; + } else { + // can be a minus or so ! as in \advance\foo -10 + tex_back_input(cur_tok); + cur_cs = save_cur_cs; + return 1; + } + break; + case spacer_cmd: /* normally spaces are not pushed back */ + if (done) { + goto BAD_NEWS; + } else { + break; + } + // fall through + default: + tex_back_input(cur_tok); + if (done) { + /* unless we accept partial keywords */ + goto BAD_NEWS; + } else { + cur_cs = save_cur_cs; + return 0; + } + } + } + BAD_NEWS: + tex_aux_show_keyword_error(s); + cur_cs = save_cur_cs; + return 0; +} + +/*tex + Here we know that the first character(s) matched so we are in the middle of a keyword already + which means a different loop than the previous one. +*/ + +int tex_scan_mandate_keyword(const char *s, int offset) +{ + halfword save_cur_cs = cur_cs; + int done = 0; + // int done = offset > 0; + const char *p = s + offset; /* offset always > 0 so no issue with +/- */ + while (*p) { + tex_get_x_token(); + switch (cur_cmd) { + case letter_cmd: + case other_char_cmd: + if ((cur_chr == *p) || (cur_chr == *p - 'a' + 'A')) { + if (*(++p)) { + done = 1; + } else { + cur_cs = save_cur_cs; + return 1; + } + } else { + goto BAD_NEWS; + } + break; + // case spacer_cmd: /* normally spaces are not pushed back */ + // case relax_cmd: /* normally not, should be option */ + // if (done) { + // back_input(cur_tok); + // goto BAD_NEWS; + // } else { + // break; + // } + // default: + // goto BAD_NEWS; + case spacer_cmd: /* normally spaces are not pushed back */ + if (done) { + goto BAD_NEWS; + } else { + break; + } + // fall through + default: + tex_back_input(cur_tok); + /* unless we accept partial keywords */ + goto BAD_NEWS; + } + } + BAD_NEWS: + tex_aux_show_keyword_error(s); + cur_cs = save_cur_cs; + return 0; +} + +/* + This is the original scanner with push|-|back. It's a matter of choice: we are more restricted + on the one hand and more loose on the other. +*/ + +int tex_scan_keyword(const char *s) +{ + if (*s) { + halfword h = null; + halfword p = null; + halfword save_cur_cs = cur_cs; + int n = 0; + while (*s) { + /*tex Recursion is possible here! */ + tex_get_x_token(); + if ((cur_cmd == letter_cmd || cur_cmd == other_char_cmd) && ((cur_chr == *s) || (cur_chr == *s - 'a' + 'A'))) { + p = tex_store_new_token(p, cur_tok); + if (! h) { + h = p; + } + n++; + s++; + } else if ((p != h) || (cur_cmd != spacer_cmd)) { + tex_back_input(cur_tok); + if (h) { + tex_begin_backed_up_list(h); + } + cur_cs = save_cur_cs; + return 0; + } + } + if (h) { + tex_flush_token_list_head_tail(h, p, n); + } + cur_cs = save_cur_cs; + return 1; + } else { + /*tex but not with newtokenlib zero keyword simply doesn't match */ + return 0 ; + } +} + +int tex_scan_keyword_case_sensitive(const char *s) +{ + if (*s) { + halfword h = null; + halfword p = null; + halfword save_cur_cs = cur_cs; + int n = 0; + while (*s) { + tex_get_x_token(); + if ((cur_cmd == letter_cmd || cur_cmd == other_char_cmd) && (cur_chr == *s)) { + p = tex_store_new_token(p, cur_tok); + if (! h) { + h = p; + } + n++; + s++; + } else if ((p != h) || (cur_cmd != spacer_cmd)) { + tex_back_input(cur_tok); + if (h) { + tex_begin_backed_up_list(h); + } + cur_cs = save_cur_cs; + return 0; + } + } + if (h) { + tex_flush_token_list_head_tail(h, p, n); + } + cur_cs = save_cur_cs; + return 1; + } else { + return 0 ; + } +} + +/*tex + + We can not return |undefined_control_sequence| under some conditions (inside |shift_case|, + for example). This needs thinking. + +*/ + +halfword tex_active_to_cs(int c, int force) +{ + halfword cs = -1; + if (c > 0) { + /*tex This is not that efficient: we can make a helper that doesn't use an alloc. */ + char utfbytes[8] = { '\xEF', '\xBF', '\xBF', 0 }; + aux_uni2string((char *) &utfbytes[3], c); + cs = tex_string_locate(utfbytes, (size_t) utf8_size(c) + 3, force); + } + if (cs < 0) { + cs = tex_string_locate("\xEF\xBF\xBF", 4, force); /*tex Including the zero sentinel. */ + } + return cs; +} + +/*tex + + The heart of \TEX's input mechanism is the |get_next| procedure, which we shall develop in the + next few sections of the program. Perhaps we shouldn't actually call it the \quote {heart}, + however, because it really acts as \TEX's eyes and mouth, reading the source files and + gobbling them up. And it also helps \TEX\ to regurgitate stored token lists that are to be + processed again. + + The main duty of |get_next| is to input one token and to set |cur_cmd| and |cur_chr| to that + token's command code and modifier. Furthermore, if the input token is a control sequence, the + |eqtb| location of that control sequence is stored in |cur_cs|; otherwise |cur_cs| is set to + zero. + + Underlying this simple description is a certain amount of complexity because of all the cases + that need to be handled. However, the inner loop of |get_next| is reasonably short and fast. + + When |get_next| is asked to get the next token of a |\read| line, it sets |cur_cmd = cur_chr + = cur_cs = 0| in the case that no more tokens appear on that line. (There might not be any + tokens at all, if the |end_line_char| has |ignore| as its catcode.) + + The value of |par_loc| is the |eqtb| address of |\par|. This quantity is needed because a + blank line of input is supposed to be exactly equivalent to the appearance of |\par|; we must + set |cur_cs := par_loc| when detecting a blank line. + + Parts |get_next| are executed more often than any other instructions of \TEX. The global + variable |force_eof| is normally |false|; it is set |true| by an |\endinput| command. + |luacstrings| is the number of lua print statements waiting to be input, it is changed by + |lmt_token_call|. + + If the user has set the |pausing| parameter to some positive value, and if nonstop mode has + not been selected, each line of input is displayed on the terminal and the transcript file, + followed by |=>|. \TEX\ waits for a response. If the response is simply |carriage_return|, + the line is accepted as it stands, otherwise the line typed is used instead of the line in the + file. + + We no longer need the following: + +*/ + +// void firm_up_the_line(void) +// { +// ilimit = fileio_state.io_last; +// } + +/*tex + + The other variant gives less clutter in tracing cache usage when profiling and for some files + (like the manual) also a bit of a speedup. Splitting the switch which gives 10 times less Bim + in vallgrind! See the \LUATEX\ source for that code. + + The big switch changes the state if necessary, and |goto switch| if the current character + should be ignored, or |goto reswitch| if the current character changes to another. + + The n-way switch accomplishes the scanning quickly, assuming that a decent \CCODE\ compiler + has translated the code. Note that the numeric values for |mid_line|, |skip_blanks|, and + |new_line| are spaced apart from each other by |max_char_code+1|, so we can add a character's + command code to the state to get a single number that characterizes both. + + Remark: checking performance indicated that this switch was the cause of many branch prediction + errors but changing it to: + + \starttyping + c = istate + cur_cmd; + if (c == (mid_line_state + letter_cmd) || c == (mid_line_state + other_char_cmd)) { + return 1; + } else if (c >= new_line_state) { + switch (c) { + } + } else if (c >= skip_blanks_state) { + switch (c) { + } + } else if (c >= mid_line_state) { + switch (c) { + } + } else { + istate = mid_line_state; + return 1; + } + \stoptyping + + This gives as many prediction errors. So, we can indeed assume that the compiler does the right + job, or that there is simply no other way. + + When a line is finished a space is emited. When a character of type |spacer| gets through, its + character code is changed to |\ =040|. This means that the \ASCII\ codes for tab and space, and + for the space inserted at the end of a line, will be treated alike when macro parameters are + being matched. We do this since such characters are indistinguishable on most computer terminal + displays. + +*/ + +/* + + c = istate + cur_cmd; + if (c == (mid_line_state + letter_cmd) || c == (mid_line_state + other_char_cmd)) { + return 1; + } else if (c >= new_line_state) { + .... + } + +*/ + +/*tex + + This trick has been dropped when the wrapup mechanism had proven to be useful. The idea was + to backport this to \LUATEX\ but some other \PDFTEX\ compatible parstuff made it there and + backporting par related features becomes too messy. + + \starttyping + lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1; + cur_cs = lmt_token_state.line_par_loc; + cur_cmd = eq_type(cur_cs); + if (cur_cmd == undefined_cs_cmd) { + cur_cs = lmt_token_state.par_loc; + cur_cmd = eq_type(cur_cs); + } + cur_chr = eq_value(cur_cs); + \stoptyping + +*/ + +static int tex_aux_get_next_file(void) +{ + SWITCH: + if (lmt_input_state.cur_input.loc <= lmt_input_state.cur_input.limit) { + /*tex current line not yet finished */ + cur_chr = get_unichar_from_buffer(&lmt_input_state.cur_input.loc); + RESWITCH: + if (lmt_input_state.cur_input.cattable == no_catcode_table_preset) { + /* happens seldom: detokenized line */ + cur_cmd = cur_chr == ' ' ? 10 : 12; + } else { + cur_cmd = tex_aux_the_cat_code(cur_chr); + } + switch (lmt_input_state.cur_input.state + cur_cmd) { + case mid_line_state + ignore_cmd: + case skip_blanks_state + ignore_cmd: + case new_line_state + ignore_cmd: + case skip_blanks_state + spacer_cmd: + case new_line_state + spacer_cmd: + /*tex Cases where character is ignored. */ + goto SWITCH; + case mid_line_state + escape_cmd: + case new_line_state + escape_cmd: + case skip_blanks_state + escape_cmd: + /*tex Scan a control sequence. */ + lmt_input_state.cur_input.state = (unsigned char) tex_aux_scan_control_sequence(); + break; + case mid_line_state + active_char_cmd: + case new_line_state + active_char_cmd: + case skip_blanks_state + active_char_cmd: + /*tex Process an active-character. */ + cur_cs = tex_active_to_cs(cur_chr, ! lmt_hash_state.no_new_cs); + cur_cmd = eq_type(cur_cs); + cur_chr = eq_value(cur_cs); + lmt_input_state.cur_input.state = mid_line_state; + break; + case mid_line_state + superscript_cmd: + case new_line_state + superscript_cmd: + case skip_blanks_state + superscript_cmd: + /*tex We need to check for multiple ^: + (0) always check for ^^ ^^^^ ^^^^^^^ + (1) only check in text mode + (*) never + */ + if (sup_mark_mode_par) { + if (sup_mark_mode_par == 1 && cur_mode != mmode && tex_aux_process_sup_mark()) { + goto RESWITCH; + } + } else if (tex_aux_process_sup_mark()) { + goto RESWITCH; + } else { + /*tex + We provide prescripts and shifted script in math mode and avoid fance |^| + processing in text mode (which is what we do in \CONTEXT). + */ + } + lmt_input_state.cur_input.state = mid_line_state; + break; + case mid_line_state + invalid_char_cmd: + case new_line_state + invalid_char_cmd: + case skip_blanks_state + invalid_char_cmd: + /*tex Decry the invalid character and |goto restart|. */ + tex_aux_invalid_character_error(); + /*tex Because state may be |token_list| now: */ + return 0; + case mid_line_state + spacer_cmd: + /*tex Enter |skip_blanks| state, emit a space. */ + lmt_input_state.cur_input.state = skip_blanks_state; + cur_chr = ' '; + break; + case mid_line_state + end_line_cmd: + /*tex Finish the line. See note above about dropped |\linepar|. */ + lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1; + cur_cmd = spacer_cmd; + cur_chr = ' '; + break; + case skip_blanks_state + end_line_cmd: + case mid_line_state + comment_cmd: + case new_line_state + comment_cmd: + case skip_blanks_state + comment_cmd: + /*tex Finish line, |goto switch|; */ + lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1; + goto SWITCH; + case new_line_state + end_line_cmd: + if (! auto_paragraph_mode(auto_paragraph_go_on)) { + lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1; + } + /*tex Finish line, emit a |\par|; */ + if (auto_paragraph_mode(auto_paragraph_text)) { + cur_cs = null; + cur_cmd = end_paragraph_cmd; + cur_chr = new_line_end_paragraph_code; + // cur_chr = normal_end_paragraph_code; + } else { + cur_cs = lmt_token_state.par_loc; + cur_cmd = eq_type(cur_cs); + cur_chr = eq_value(cur_cs); + } + break; + case skip_blanks_state + left_brace_cmd: + case new_line_state + left_brace_cmd: + lmt_input_state.cur_input.state = mid_line_state; + lmt_input_state.align_state++; + break; + case mid_line_state + left_brace_cmd: + lmt_input_state.align_state++; + break; + case skip_blanks_state + right_brace_cmd: + case new_line_state + right_brace_cmd: + lmt_input_state.cur_input.state = mid_line_state; + lmt_input_state.align_state--; + break; + case mid_line_state + right_brace_cmd: + lmt_input_state.align_state--; + break; + case mid_line_state + math_shift_cmd: + case mid_line_state + alignment_tab_cmd: + case mid_line_state + parameter_cmd: + case mid_line_state + subscript_cmd: + case mid_line_state + letter_cmd: + case mid_line_state + other_char_cmd: + break; + /* + case skip_blanks_state + math_shift_cmd: + case skip_blanks_state + tab_mark_cmd: + case skip_blanks_state + mac_param_cmd: + case skip_blanks_state + sub_mark_cmd: + case skip_blanks_state + letter_cmd: + case skip_blanks_state + other_char_cmd: + case new_line_state + math_shift_cmd: + case new_line_state + tab_mark_cmd: + case new_line_state + mac_param_cmd: + case new_line_state + sub_mark_cmd: + case new_line_state + letter_cmd: + case new_line_state + other_char_cmd: + */ + default: + lmt_input_state.cur_input.state = mid_line_state; + break; + } + } else { + if (! io_token_input(lmt_input_state.cur_input.name)) { + lmt_input_state.cur_input.state = new_line_state; + } + /*tex + + Move to next line of file, or |goto restart| if there is no next line, or |return| if a + |\read| line has finished. + + */ + do { + next_line_retval r = tex_aux_next_line(); + if (r == next_line_restart) { + /*tex This happens more often. */ + return 0; + } else if (r == next_line_return) { + return 1; + } + } while (0); + /* check_interrupt(); */ + goto SWITCH; + } + return 1; +} + +/*tex + + Notice that a code like |^^8| becomes |x| if not followed by a hex digit. We only support a + limited set: + + \starttyping + ^^^^^^XXXXXX + ^^^^XXXXXX + ^^XX ^^<char> + \stoptyping + +*/ + +# define is_hex(a) ((a >= '0' && a <= '9') || (a >= 'a' && a <= 'f')) + + inline static halfword tex_aux_two_hex_to_cur_chr(int c1, int c2) + { + return + 0x10 * (c1 <= '9' ? c1 - '0' : c1 - 'a' + 10) + + 0x01 * (c2 <= '9' ? c2 - '0' : c2 - 'a' + 10); + } + + inline static halfword tex_aux_four_hex_to_cur_chr(int c1, int c2,int c3, int c4) + { + return + 0x1000 * (c1 <= '9' ? c1 - '0' : c1 - 'a' + 10) + + 0x0100 * (c2 <= '9' ? c2 - '0' : c2 - 'a' + 10) + + 0x0010 * (c3 <= '9' ? c3 - '0' : c3 - 'a' + 10) + + 0x0001 * (c4 <= '9' ? c4 - '0' : c4 - 'a' + 10); +} + +inline static halfword tex_aux_six_hex_to_cur_chr(int c1, int c2, int c3, int c4, int c5, int c6) +{ + return + 0x100000 * (c1 <= '9' ? c1 - '0' : c1 - 'a' + 10) + + 0x010000 * (c2 <= '9' ? c2 - '0' : c2 - 'a' + 10) + + 0x001000 * (c3 <= '9' ? c3 - '0' : c3 - 'a' + 10) + + 0x000100 * (c4 <= '9' ? c4 - '0' : c4 - 'a' + 10) + + 0x000010 * (c5 <= '9' ? c5 - '0' : c5 - 'a' + 10) + + 0x000001 * (c6 <= '9' ? c6 - '0' : c6 - 'a' + 10); + +} + +static int tex_aux_process_sup_mark(void) +{ + if (cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc]) { + if (lmt_input_state.cur_input.loc < lmt_input_state.cur_input.limit) { + if ((cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1]) && (cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 2])) { + if ((cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 3]) && (cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 4])) { + if ((lmt_input_state.cur_input.loc + 10) <= lmt_input_state.cur_input.limit) { + /*tex |^^^^^^XXXXXX| */ + int c1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 5]; + int c2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 6]; + int c3 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 7]; + int c4 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 8]; + int c5 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 9]; + int c6 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 10]; + if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4) && is_hex(c5) && is_hex(c6)) { + lmt_input_state.cur_input.loc += 11; + cur_chr = tex_aux_six_hex_to_cur_chr(c1, c2, c3, c4, c5, c6); + return 1; + } else { + tex_handle_error( + normal_error_type, + "^^^^^^ needs six hex digits", + NULL + ); + } + } else { + tex_handle_error( + normal_error_type, + "^^^^^^ needs six hex digits, end of input", + NULL + ); + } + } else if ((lmt_input_state.cur_input.loc + 6) <= lmt_input_state.cur_input.limit) { + /*tex |^^^^XXXX| */ + int c1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 3]; + int c2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 4]; + int c3 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 5]; + int c4 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 6]; + if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4)) { + lmt_input_state.cur_input.loc += 7; + cur_chr = tex_aux_four_hex_to_cur_chr(c1, c2, c3, c4); + return 1; + } else { + tex_handle_error( + normal_error_type, + "^^^^ needs four hex digits", + NULL + ); + } + } else { + tex_handle_error( + normal_error_type, + "^^^^ needs four hex digits, end of input", + NULL + ); + } + } else if ((lmt_input_state.cur_input.loc + 2) <= lmt_input_state.cur_input.limit) { + /*tex |^^XX| */ + int c1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1]; + int c2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 2]; + if (is_hex(c1) && is_hex(c2)) { + lmt_input_state.cur_input.loc += 3; + cur_chr = tex_aux_two_hex_to_cur_chr(c1, c2); + return 1; + } + } + /*tex The single character case: */ + { + int c1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1]; + if (c1 < 0200) { + lmt_input_state.cur_input.loc = lmt_input_state.cur_input.loc + 2; + // if (is_hex(c1) && (iloc <= ilimit)) { + // int c2 = fileio_state.io_buffer[iloc]; + // if (is_hex(c2)) { + // ++iloc; + // cur_chr = two_hex_to_cur_chr(c1, c2); + // return 1; + // } + // } + // /*tex The somewhat odd cases, often special control characters: */ + cur_chr = (c1 < 0100 ? c1 + 0100 : c1 - 0100); + return 1; + } + } + } + } + return 0; +} + +/*tex + + Control sequence names are scanned only when they appear in some line of a file. Once they have + been scanned the first time, their |eqtb| location serves as a unique identification, so \TEX\ + doesn't need to refer to the original name any more except when it prints the equivalent in + symbolic form. + + The program that scans a control sequence has been written carefully in order to avoid the + blowups that might otherwise occur if a malicious user tried something like |\catcode'15 = 0|. + The algorithm might look at |buffer[ilimit + 1]|, but it never looks at |buffer[ilimit + 2]|. + + If expanded characters like |^^A| or |^^df| appear in or just following a control sequence name, + they are converted to single characters in the buffer and the process is repeated, slowly but + surely. + +*/ + +/*tex + + Whenever we reach the following piece of code, we will have |cur_chr = buffer[k - 1]| and |k <= + ilimit + 1| and |cat = get_cat_code(cat_code_table, cur_chr)|. If an expanded code like |^^A| or + |^^df| appears in |buffer[(k - 1) .. (k + 1)]| or |buffer[(k - 1) .. (k + 2)]|, we will store + the corresponding code in |buffer[k - 1]| and shift the rest of the buffer left two or three + places. + +*/ + +static int tex_aux_check_expanded_code(int *kk, halfword *chr) +{ + if (sup_mark_mode_par > 1 || (sup_mark_mode_par == 1 && cur_mode == mmode)) { + return 0; + } else { + int k = *kk; + /* chr is the ^ character or an equivalent one */ + if (lmt_fileio_state.io_buffer[k] == *chr && k < lmt_input_state.cur_input.limit) { + int d = 1; + int l; + if ((*chr == lmt_fileio_state.io_buffer[k + 1]) && (*chr == lmt_fileio_state.io_buffer[k + 2])) { + if ((*chr == lmt_fileio_state.io_buffer[k + 3]) && (*chr == lmt_fileio_state.io_buffer[k + 4])) { + if ((k + 10) <= lmt_input_state.cur_input.limit) { + int c1 = lmt_fileio_state.io_buffer[k + 6 - 1]; + int c2 = lmt_fileio_state.io_buffer[k + 6 ]; + int c3 = lmt_fileio_state.io_buffer[k + 6 + 1]; + int c4 = lmt_fileio_state.io_buffer[k + 6 + 2]; + int c5 = lmt_fileio_state.io_buffer[k + 6 + 3]; + int c6 = lmt_fileio_state.io_buffer[k + 6 + 4]; + if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4) && is_hex(c5) && is_hex(c6)) { + d = 6; + *chr = tex_aux_six_hex_to_cur_chr(c1, c2, c3, c4, c5, c6); + } else { + tex_handle_error( + normal_error_type, + "^^^^^^ needs six hex digits", + NULL + ); + } + } else { + tex_handle_error( + normal_error_type, + "^^^^^^ needs six hex digits, end of input", + NULL + ); + } + } else if ((k + 6) <= lmt_input_state.cur_input.limit) { + int c1 = lmt_fileio_state.io_buffer[k + 4 - 1]; + int c2 = lmt_fileio_state.io_buffer[k + 4 ]; + int c3 = lmt_fileio_state.io_buffer[k + 4 + 1]; + int c4 = lmt_fileio_state.io_buffer[k + 4 + 2]; + if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4)) { + d = 4; + *chr = tex_aux_four_hex_to_cur_chr(c1, c2, c3, c4); + } else { + tex_handle_error( + normal_error_type, + "^^^^ needs four hex digits", + NULL + ); + } + } else { + tex_handle_error( + normal_error_type, + "^^^^ needs four hex digits, end of input", + NULL + ); + } + } else { + int c1 = lmt_fileio_state.io_buffer[k + 1]; + if (c1 < 0200) { /* really ? */ + d = 1; + if (is_hex(c1) && (k + 2) <= lmt_input_state.cur_input.limit) { + int c2 = lmt_fileio_state.io_buffer[k + 2]; + if (is_hex(c2)) { + d = 2; + *chr = tex_aux_two_hex_to_cur_chr(c1, c2); + } else { + *chr = (c1 < 0100 ? c1 + 0100 : c1 - 0100); + } + } else { + *chr = (c1 < 0100 ? c1 + 0100 : c1 - 0100); + } + } + } + if (d > 2) { + d = 2 * d - 1; + } else { + d++; + } + if (*chr <= 0x7F) { + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) *chr; + } else if (*chr <= 0x7FF) { + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xC0 + *chr / 0x40); + k++; + d--; + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + *chr % 0x40); + } else if (*chr <= 0xFFFF) { + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xE0 + *chr / 0x1000); + k++; + d--; + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x1000) / 0x40); + k++; + d--; + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x1000) % 0x40); + } else { + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xF0 + *chr / 0x40000); + k++; + d--; + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x40000) / 0x1000); + k++; + d--; + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + ((*chr % 0x40000) % 0x1000) / 0x40); + k++; + d--; + lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + ((*chr % 0x40000) % 0x1000) % 0x40); + } + l = k; + lmt_input_state.cur_input.limit -= d; + while (l <= lmt_input_state.cur_input.limit) { + lmt_fileio_state.io_buffer[l] = lmt_fileio_state.io_buffer[l + d]; + l++; + } + *kk = k; + cur_chr = *chr; /* hm */ + return 1; + } else { + return 0; + } + } +} + +static int tex_aux_scan_control_sequence(void) +{ + int state = mid_line_state; + if (lmt_input_state.cur_input.loc > lmt_input_state.cur_input.limit) { + /*tex |state| is irrelevant in this case. */ + cur_cs = null_cs; + } else { + /*tex |cat_code(cur_chr)|, usually: */ + while (1) { + int loc = lmt_input_state.cur_input.loc; + halfword chr = get_unichar_from_buffer(&loc); + halfword cat = tex_aux_the_cat_code(chr); + if (cat != letter_cmd || loc > lmt_input_state.cur_input.limit) { + if (cat == spacer_cmd) { + state = skip_blanks_state; + } else { + state = mid_line_state; + if (cat == superscript_cmd && tex_aux_check_expanded_code(&loc, &chr)) { + continue; + } + } + // state = cat == spacer_cmd ? skip_blanks_state : mid_line_state; + // /*tex If an expanded \unknown */ + // if (cat == sup_mark_cmd && check_expanded_code(&loc, chr)) { + // continue; + // } + } else { + state = skip_blanks_state; + do { + chr = get_unichar_from_buffer(&loc); + cat = tex_aux_the_cat_code(chr); + } while (cat == letter_cmd && loc <= lmt_input_state.cur_input.limit); + /*tex If an expanded \unknown */ + if (cat == superscript_cmd && tex_aux_check_expanded_code(&loc, &chr)) { + continue; + } else if (cat != letter_cmd) { + /*tex Backtrack one character which can be \UTF. */ + if (chr <= 0x7F) { + loc -= 1; /* in most cases */ + } else if (chr > 0xFFFF) { + loc -= 4; + } else if (chr > 0x7FF) { + loc -= 3; + } else /* if (cur_chr > 0x7F) */ { + loc -= 2; + } + /*tex Now |k| points to first nonletter. */ + } + } + cur_cs = tex_id_locate(lmt_input_state.cur_input.loc, loc - lmt_input_state.cur_input.loc, ! lmt_hash_state.no_new_cs); + lmt_input_state.cur_input.loc = loc; + break; + } + } + cur_cmd = eq_type(cur_cs); + cur_chr = eq_value(cur_cs); + return state; +} + +/*tex + + All of the easy branches of |get_next| have now been taken care of. There is one more branch. + Conversely, the |file_warning| procedure is invoked when a file ends and some groups entered or + conditionals started while reading from that file are still incomplete. + +*/ + +static void tex_aux_file_warning(void) +{ + halfword cond_ptr = lmt_save_state.save_stack_data.ptr; /*tex saved value of |save_ptr| or |cond_ptr| */ + int cur_if = cur_group; /*tex saved value of |cur_group| or |cur_if| */ + int cur_unless = 0; + int if_step = 0; + int if_unless = 0; + int if_limit = cur_level; /*tex saved value of |cur_level| or |if_limit| */ + int if_line = 0; /*tex saved value of |if_line| */ + lmt_save_state.save_stack_data.ptr = cur_boundary; + while (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].group != lmt_save_state.save_stack_data.ptr) { + --cur_level; + tex_print_nlp(); + tex_print_format("Warning: end of file when %G is incomplete", 1); + cur_group = save_level(lmt_save_state.save_stack_data.ptr); + lmt_save_state.save_stack_data.ptr = save_value(lmt_save_state.save_stack_data.ptr); + } + /*tex Restore old values. */ + lmt_save_state.save_stack_data.ptr = cond_ptr; + cur_level = (quarterword) if_limit; + cur_group = (quarterword) cur_if; + cond_ptr = lmt_condition_state.cond_ptr; + cur_if = lmt_condition_state.cur_if; + cur_unless = lmt_condition_state.cur_unless; + if_step = lmt_condition_state.if_step; + if_unless = lmt_condition_state.if_unless; + if_limit = lmt_condition_state.if_limit; + if_line = lmt_condition_state.if_line; + while (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].if_ptr != lmt_condition_state.cond_ptr) { + /* todo, more info */ + tex_print_nlp(); + tex_print_format("Warning: end of file when %C", if_test_cmd, lmt_condition_state.cur_if); + if (lmt_condition_state.if_limit == fi_code) { + tex_print_str_esc("else"); + } + if (lmt_condition_state.if_line) { + tex_print_format(" entered on line %i", lmt_condition_state.if_line); + } + tex_print_str(" is incomplete"); + lmt_condition_state.cur_if = if_limit_subtype(lmt_condition_state.cond_ptr); + lmt_condition_state.cur_unless = if_limit_unless(lmt_condition_state.cond_ptr); + lmt_condition_state.if_step = if_limit_step(lmt_condition_state.cond_ptr); + lmt_condition_state.if_unless = if_limit_stepunless(lmt_condition_state.cond_ptr); + lmt_condition_state.if_limit = if_limit_type(lmt_condition_state.cond_ptr); + lmt_condition_state.if_line = if_limit_line(lmt_condition_state.cond_ptr); + lmt_condition_state.cond_ptr = node_next(lmt_condition_state.cond_ptr); + } + /*tex restore old values */ + lmt_condition_state.cond_ptr = cond_ptr; + lmt_condition_state.cur_if = cur_if; + lmt_condition_state.cur_unless = cur_unless; + lmt_condition_state.if_step = if_step; + lmt_condition_state.if_unless = if_unless; + lmt_condition_state.if_limit = if_limit; + lmt_condition_state.if_line = if_line; + tex_print_nlp(); + if (tracing_nesting_par > 1) { + tex_show_context(); + } + if (lmt_error_state.history == spotless) { + lmt_error_state.history = warning_issued; + } +} + +static void tex_aux_check_validity(void) +{ + switch (lmt_input_state.scanner_status) { + case scanner_is_normal: + break; + case scanner_is_skipping: + tex_handle_error( + condition_error_type, + "The file ended while I was skipping conditional text.", + "This kind of error happens when you say '\\if...' and forget the\n" + "matching '\\fi'. It can also be that you use '\\orelse' or '\\orunless\n'" + "in the wrong way. Or maybe a forbidden control sequence was encountered." + ); + break; + case scanner_is_defining: + tex_handle_error(runaway_error_type, "The file ended when scanning a definition.", NULL); + break; + case scanner_is_matching: + tex_handle_error(runaway_error_type, "The file ended when scanning an argument.", NULL); + break; + case scanner_is_tolerant: + break; + case scanner_is_aligning: + tex_handle_error(runaway_error_type, "The file ended when scanning an alignment preamble.", NULL); + break; + case scanner_is_absorbing: + tex_handle_error(runaway_error_type, "The file ended when absorbing something.", NULL); + break; + } +} + +static next_line_retval tex_aux_next_line(void) +{ + if (lmt_input_state.cur_input.name > io_initial_input_code) { + /*tex Read next line of file into |buffer|, or |goto restart| if the file has ended. */ + unsigned inhibit_eol = 0; + ++lmt_input_state.input_line; + lmt_fileio_state.io_first = lmt_input_state.cur_input.start; + if (! lmt_token_state.force_eof) { + unsigned force_eol = 0; + switch (lmt_input_state.cur_input.name) { + case io_lua_input_code: + { + halfword n = null; + int cattable = 0; + int partial = 0; + int finalline = 0; + int t = lmt_cstring_input(&n, &cattable, &partial, &finalline); + switch (t) { + case eof_tex_input: + lmt_token_state.force_eof = 1; + break; + case string_tex_input: + /*tex string */ + lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */ + lmt_input_state.cur_input.cattable = (short) cattable; + lmt_input_state.cur_input.partial = (signed char) partial; + if (finalline || partial || cattable == no_catcode_table_preset) { + inhibit_eol = 1; + } + if (! partial) { + lmt_input_state.cur_input.state = new_line_state; + } + break; + case token_tex_input: + /*tex token */ + if (n >= cs_token_flag && eq_type(n - cs_token_flag) == input_cmd && eq_value(n - cs_token_flag) == end_of_input_code && lmt_input_state.cur_input.index > 0) { + tex_end_file_reading(); + } + tex_back_input(n); + return next_line_restart; + case token_list_tex_input: + /*tex token */ + tex_begin_backed_up_list(n); + return next_line_restart; + case node_tex_input: + /*tex node */ + if (node_token_overflow(n)) { + tex_back_input(token_val(ignore_cmd, node_token_lsb(n))); + tex_reinsert_token(token_val(node_cmd, node_token_msb(n))); + return next_line_restart; + } else { + /*tex |0x10FFFF == 1114111| */ + tex_back_input(token_val(node_cmd, n)); + return next_line_restart; + } + default: + lmt_token_state.force_eof = 1; + break; + } + break; + } + case io_token_input_code: + case io_token_eof_input_code: + { + /* can be simplified but room for extensions now */ + halfword n = null; + int cattable = 0; + int partial = 0; + int finalline = 0; + int t = lmt_cstring_input(&n, &cattable, &partial, &finalline); + switch (t) { + case eof_tex_input: + lmt_token_state.force_eof = 1; + if (lmt_input_state.cur_input.name == io_token_eof_input_code && every_eof_par) { + force_eol = 1; + } + break; + case string_tex_input: + /*tex string */ + lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */ + lmt_input_state.cur_input.cattable = (short) cattable; + lmt_input_state.cur_input.partial = (signed char) partial; + inhibit_eol = lmt_input_state.cur_input.name != io_token_eof_input_code; + if (! partial) { + lmt_input_state.cur_input.state = new_line_state; + } + break; + default: + lmt_token_state.force_eof = 1; + break; + } + break; + } + case io_tex_macro_code: + /* what */ + default: + if (tex_lua_input_ln()) { + /*tex Not end of file, set |ilimit|. */ + lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */ + lmt_input_state.cur_input.cattable = default_catcode_table_preset; + } else if (every_eof_par && (! lmt_input_state.in_stack[lmt_input_state.cur_input.index].end_of_file_seen)) { + force_eol = 1; + } else { + tex_aux_check_validity(); + lmt_token_state.force_eof = 1; + } + break; + } + if (force_eol) { + lmt_input_state.cur_input.limit = lmt_fileio_state.io_first - 1; + /* tex Fake one empty line. */ + lmt_input_state.in_stack[lmt_input_state.cur_input.index].end_of_file_seen = 1; + tex_begin_token_list(every_eof_par, every_eof_text); + return next_line_restart; + } + } + if (lmt_token_state.force_eof) { + if (tracing_nesting_par > 0) { + if ((lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].group != cur_boundary) || (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].if_ptr != lmt_condition_state.cond_ptr)) { + if (! io_token_input(lmt_input_state.cur_input.name)) { + /*tex Give warning for some unfinished groups and/or conditionals. */ + tex_aux_file_warning(); + } + } + } + if (io_file_input(lmt_input_state.cur_input.name)) { + tex_report_stop_file(); + --lmt_input_state.open_files; + } + lmt_token_state.force_eof = 0; + tex_end_file_reading(); + return next_line_restart; + } else { + if (inhibit_eol || end_line_char_inactive) { + lmt_input_state.cur_input.limit--; + } else { + lmt_fileio_state.io_buffer[lmt_input_state.cur_input.limit] = (unsigned char) end_line_char_par; + } + lmt_fileio_state.io_first = lmt_input_state.cur_input.limit + 1; + lmt_input_state.cur_input.loc = lmt_input_state.cur_input.start; + /*tex We're ready to read. */ + } + } else if (lmt_input_state.input_stack_data.ptr > 0) { + cur_cmd = 0; + cur_chr = 0; + return next_line_return; + } else { + /*tex A somewhat weird check: */ + switch (lmt_print_state.selector) { + case no_print_selector_code: + case terminal_selector_code: + tex_open_log_file(); + break; + } + tex_handle_error(eof_error_type, "end of file encountered", NULL); + /*tex Just in case it is not handled in a callback: */ + if (lmt_error_state.interaction > nonstop_mode) { + tex_fatal_error("aborting job"); + } + } + return next_line_ok; +} + +/*tex + Let's consider now what happens when |get_next| is looking at a token list. +*/ + +static int tex_aux_get_next_tokenlist(void) +{ + halfword t = token_info(lmt_input_state.cur_input.loc); + /*tex Move to next. */ + lmt_input_state.cur_input.loc = token_link(lmt_input_state.cur_input.loc); + if (t >= cs_token_flag) { + /*tex A control sequence token */ + cur_cs = t - cs_token_flag; + cur_cmd = eq_type(cur_cs); + if (cur_cmd == deep_frozen_dont_expand_cmd) { + /*tex + + Get the next token, suppressing expansion. The present point in the program is + reached only when the |expand| routine has inserted a special marker into the + input. In this special case, |token_info(iloc)| is known to be a control sequence + token, and |token_link(iloc) = null|. + + */ + cur_cs = token_info(lmt_input_state.cur_input.loc) - cs_token_flag; + lmt_input_state.cur_input.loc = null; + cur_cmd = eq_type(cur_cs); + if (cur_cmd > max_command_cmd) { + cur_cmd = relax_cmd; + // cur_chr = no_expand_flag; + cur_chr = no_expand_relax_code; + return 1; + } + } + cur_chr = eq_value(cur_cs); + } else { + cur_cmd = token_cmd(t); + cur_chr = token_chr(t); + switch (cur_cmd) { + case left_brace_cmd: + lmt_input_state.align_state++; + break; + case right_brace_cmd: + lmt_input_state.align_state--; + break; + case parameter_reference_cmd: + /*tex Insert macro parameter and |goto restart|. */ + tex_begin_parameter_list(lmt_input_state.parameter_stack[lmt_input_state.cur_input.parameter_start + cur_chr - 1]); + return 0; + } + } + return 1; +} + +/*tex + + Now we're ready to take the plunge into |get_next| itself. Parts of this routine are executed + more often than any other instructions of \TEX. This sets |cur_cmd|, |cur_chr|, |cur_cs| to + next token. + + Handling alignments is interwoven because there we switch between constructing cells and rows + (node lists) based on templates that are token lists. This is why in several places we find + checks for |align_state|. + +*/ + +void tex_get_next(void) +{ + while (1) { + cur_cs = 0; + if (lmt_input_state.cur_input.state != token_list_state) { + /*tex Input from external file, |goto restart| if no input found. */ + if (! tex_aux_get_next_file()) { + continue; + } else { + /*tex Check align state later on! */ + } + } else if (! lmt_input_state.cur_input.loc) { + /*tex List exhausted, resume previous level. */ + tex_end_token_list(); + continue; + } else if (! tex_aux_get_next_tokenlist()) { + /*tex Parameter needs to be expanded. */ + continue; + } + if ((lmt_input_state.align_state == 0) && (cur_cmd == alignment_tab_cmd || cur_cmd == alignment_cmd)) { + /*tex If an alignment entry has just ended, take appropriate action. */ + tex_insert_alignment_template(); + continue; + } else { + break; + } + } +} + +/*tex + + Since |get_next| is used so frequently in \TEX, it is convenient to define three related + procedures that do a little more: + + \startitemize + \startitem + |get_token| not only sets |cur_cmd| and |cur_chr|, it also sets |cur_tok|, a packed + halfword version of the current token. + \stopitem + \startitem + |get_x_token|, meaning \quote {get an expanded token}, is like |get_token|, but if the + current token turns out to be a user-defined control sequence (i.e., a macro call), or + a conditional, or something like |\topmark| or |\expandafter| or |\csname|, it is + eliminated from the input by beginning the expansion of the macro or the evaluation of + the conditional. + \stopitem + \startitem + |x_token| is like |get_x_token| except that it assumes that |get_next| has already been + called. + \stopitem + \stopitemize + + In fact, these three procedures account for almost every use of |get_next|. No new control + sequences will be defined except during a call of |get_token|, or when |\csname| compresses a + token list, because |no_new_control_sequence| is always |true| at other times. + + This sets |cur_cmd|, |cur_chr|, |cur_tok|. For convenience we also return the token because in + some places we store it and then some direct assignment looks a bit nicer. + +*/ + +halfword tex_get_token(void) +{ + lmt_hash_state.no_new_cs = 0; + tex_get_next(); + lmt_hash_state.no_new_cs = 1; + cur_tok = cur_cs ? cs_token_flag + cur_cs : token_val(cur_cmd, cur_chr); + return cur_tok; +} + +/*tex This changes the string |s| to a token list. */ + +halfword tex_string_to_toks(const char *ss) +{ + const char *s = ss; + const char *se = ss + strlen(s); + /*tex tail of the token list */ + halfword h = null; + halfword p = null; + /*tex new node being added to the token list via |store_new_token| */ + while (s < se) { + halfword t = (halfword) aux_str2uni((const unsigned char *) s); + s += utf8_size(t); + if (t == ' ') { + t = space_token; + } else { + t += other_token; + } + p = tex_store_new_token(p, t); + if (! h) { + h = p; + } + } + return h; +} + +/*tex + + The token lists for macros and for other things like |\mark| and |\output| and |\write| are + produced by a procedure called |scan_toks|. + + Before we get into the details of |scan_toks|, let's consider a much simpler task, that of + converting the current string into a token list. The |str_toks| function does this; it + classifies spaces as type |spacer| and everything else as type |other_char|. + + The token list created by |str_toks| begins at |link(temp_token_head)| and ends at the value + |p| that is returned. If |p = temp_token_head|, the list is empty. + + |lua_str_toks| is almost identical, but it also escapes the three symbols that \LUA\ considers + special while scanning a literal string. + + This changes the string |str_pool[b .. pool_ptr]| to a token list: + +*/ + +static halfword lmt_str_toks(lstring b) /* returns head */ +{ + /*tex index into string */ + unsigned char *k = (unsigned char *) b.s; + /*tex tail of the token list */ + halfword h = null; + halfword p = null; + while (k < (unsigned char *) b.s + b.l) { + /*tex token being appended */ + halfword t = aux_str2uni(k); + k += utf8_size(t); + if (t == ' ') { + t = space_token; + } else { + if ((t == '\\') || (t == '"') || (t == '\'') || (t == 10) || (t == 13)) { + p = tex_store_new_token(p, escape_token); + if (t == 10) { + t = 'n'; + } else if (t == 13) { + t = 'r'; + } + } + t += other_token; + } + p = tex_store_new_token(p, t); + if (! h) { + h = p; + } + } + return h; +} + +/*tex + + Incidentally, the main reason for wanting |str_toks| is the function |the_toks|, which has + similar input/output characteristics. This changes the string |str_pool[b .. pool_ptr]| to a + token list: + +*/ + +halfword tex_str_toks(lstring s, halfword *tail) +{ + halfword h = null; + halfword p = null; + if (s.s) { + unsigned char *k = s.s; + unsigned char *l = k + s.l; + while (k < l) { + halfword t = aux_str2uni(k); + if (t == ' ') { + k += 1; + t = space_token; + } else { + k += utf8_size(t); + t += other_token; + } + p = tex_store_new_token(p, t); + if (! h) { + h = p; + } + } + } + if (tail) { + *tail = null; + } + return h; +} + +halfword tex_cur_str_toks(halfword *tail) +{ + halfword h = null; + halfword p = null; + unsigned char *k = (unsigned char *) lmt_string_pool_state.string_temp; + if (k) { + unsigned char *l = k + lmt_string_pool_state.string_temp_top; + /*tex tail of the token list */ + while (k < l) { + /*tex token being appended */ + halfword t = aux_str2uni(k); + if (t == ' ') { + k += 1; + t = space_token; + } else { + k += utf8_size(t); + t += other_token; + } + p = tex_store_new_token(p, t); + if (! h) { + h = p; + } + } + } + tex_reset_cur_string(); + if (tail) { + *tail = p; + } + return h; +} + +/*tex + + Most of the converter is similar to the one I made for macro so at some point I can make a + helper; also todo: there is no need to go through the pool. + +*/ + +/*tex Change the string |str_pool[b..pool_ptr]| to a token list. */ + +halfword tex_str_scan_toks(int ct, lstring ls) +{ + /*tex index into string */ + unsigned char *k = ls.s; + unsigned char *l = k + ls.l; + /*tex tail of the token list */ + halfword h = null; + halfword p = null; + while (k < l) { + int cc; + /*tex token being appended */ + halfword t = aux_str2uni(k); + k += utf8_size(t); + cc = tex_get_cat_code(ct, t); + if (cc == 0) { + /*tex We have a potential control sequence so we check for it. */ + int lname = 0 ; + int s = 0 ; + int c = 0 ; + unsigned char *name = k ; + while (k < l) { + t = (halfword) aux_str2uni((const unsigned char *) k); + s = utf8_size(t); + c = tex_get_cat_code(ct,t); + if (c == 11) { + k += s ; + lname += s ; + } else if (c == 10) { + /*tex We ignore a trailing space like normal scanning does. */ + k += s ; + break ; + } else { + break ; + } + } + if (s > 0) { + /*tex We have a potential |\cs|. */ + halfword cs = tex_string_locate((const char *) name, lname, 0); + if (cs == undefined_control_sequence) { + /*tex Let's play safe and backtrack. */ + t += cc * (1<<21); + k = name ; + } else { + t = cs_token_flag + cs; + } + } else { + /*tex + Just a character with some meaning, so |\unknown| becomes effectively + |\unknown| assuming that |\\| has some useful meaning of course. + */ + t += cc * (1<<21); + k = name ; + } + } else { + /*tex + Whatever token, so for instance $x^2$ just works given a \TEX\ catcode regime. + */ + t += cc * (1<<21); + } + p = tex_store_new_token(p, t); + if (! h) { + h = p; + } + } + return h; +} + +/* these two can be combined, then we can avoid the h check */ + +static void tex_aux_set_toks_register(halfword loc, singleword cmd, halfword t, int g) +{ + halfword ref = get_reference_token(); + set_token_link(ref, t); + tex_define((g > 0) ? global_flag_bit : 0, loc, cmd == internal_toks_cmd ? internal_toks_reference_cmd : register_toks_reference_cmd, ref); +} + +static void tex_aux_append_copied_toks_list(halfword loc, singleword cmd, int g, halfword s, halfword t) +{ + halfword ref = get_reference_token(); + halfword p = ref; + while (s) { + p = tex_store_new_token(p, token_info(s)); + s = token_link(s); + } + while (t) { + p = tex_store_new_token(p, token_info(t)); + t = token_link(t); + } + tex_define((g > 0) ? global_flag_bit : 0, loc, cmd == internal_toks_cmd ? internal_toks_reference_cmd : register_toks_reference_cmd, ref); +} + +/*tex Public helper: */ + +halfword tex_copy_token_list(halfword h1, halfword *t) +{ + halfword h2 = tex_store_new_token(null, token_info(h1)); + halfword t1 = token_link(h1); + halfword t2 = h2; + while (t1) { + t2 = tex_store_new_token(t2, token_info(t1)); + t1 = token_link(t1); + } + if (t) { + *t = t2; + } + return h2; +} + +/*tex + + At some point I decided to implement the following primitives: + + \starttabulate[|T||T||] + \NC 0 \NC \type {toksapp} \NC 1 \NC \type {etoksapp} \NC \NR + \NC 2 \NC \type {tokspre} \NC 3 \NC \type {etokspre} \NC \NR + \NC 4 \NC \type {gtoksapp} \NC 5 \NC \type {xtoksapp} \NC \NR + \NC 6 \NC \type {gtokspre} \NC 7 \NC \type {xtokspre} \NC \NR + \stoptabulate + + These append and prepend tokens to token lists. In \CONTEXT\ we always had macros doing something + like that. It was only a few years later that I ran again into an article that Taco and I wrote + in 1999 in the NTG Maps about an extension to \ETEX\ (called eetex). The first revelation was + that I had completely forgotten about it, which can be explained by the two decade time-lap. The + second was that Taco actually added that to the program at that time, so I could have used (parts + of) that code. Anyway, among the other proposed (and implemented) features were manipulating + lists and ways to output packed data to the \DVI\ files (numbers packed into 1 upto 4 bytes). + Maybe some day I'll have a go at lists, although with todays computers there is not that much to + gain. Also, \CONTEXT\ progressed to different internals so the urge is no longer there. The also + discussed \SGML\ mode also in no longer that relevant given that we have \LUA. + + If we want to handle macros too we really need to distinguish between toks and macros with + |cur_chr| above, but not now. We can't expand, and have to use |get_r_token| or so. I don't need + it anyway. + + \starttyping + get_r_token(); + if (cur_cmd == call_cmd) { + nt = cur_cs; + target = equiv(nt); + } else { + // some error message + } + \stoptyping +*/ + +# define immediate_permitted(loc,target) ((eq_level(loc) == cur_level) && (get_token_reference(target) == 0)) + +void tex_run_combine_the_toks(void) +{ + halfword source = null; + halfword target = null; + halfword append, expand, global; + halfword nt, ns; + singleword cmd; + /* */ + switch (cur_chr) { + case expanded_toks_code: append = 0; global = 0; expand = 1; break; + case append_toks_code: append = 1; global = 0; expand = 0; break; + case append_expanded_toks_code: append = 1; global = 0; expand = 1; break; + case prepend_toks_code: append = 2; global = 0; expand = 0; break; + case prepend_expanded_toks_code: append = 2; global = 0; expand = 1; break; + case global_expanded_toks_code: append = 0; global = 1; expand = 1; break; + case global_append_toks_code: append = 1; global = 1; expand = 0; break; + case global_append_expanded_toks_code: append = 1; global = 1; expand = 1; break; + case global_prepend_toks_code: append = 2; global = 1; expand = 0; break; + case global_prepend_expanded_toks_code: append = 2; global = 1; expand = 1; break; + default: append = 0; global = 0; expand = 0; break; + } + /*tex The target. */ + tex_get_x_token(); + if (cur_cmd == register_toks_cmd || cur_cmd == internal_toks_cmd) { + nt = eq_value(cur_cs); + cmd = (singleword) cur_cmd; + } else { + /*tex Maybe a number. */ + tex_back_input(cur_tok); + nt = register_toks_location(tex_scan_toks_register_number()); + cmd = register_toks_cmd; + } + target = eq_value(nt); + /*tex The source. */ + do { + tex_get_x_token(); + } while (cur_cmd == spacer_cmd); + if (cur_cmd == left_brace_cmd) { + source = expand ? tex_scan_toks_expand(1, NULL, 0) : tex_scan_toks_normal(1, NULL); + /*tex The action. */ + if (source) { + if (target) { + halfword s = token_link(source); + if (s) { + halfword t = token_link(target); + if (! t) { + /*tex Can this happen? */ + set_token_link(target, s); + token_link(source) = null; + } else { + switch (append) { + case 0: + goto ASSIGN_1; + case 1: + /*append */ + if (immediate_permitted(nt,target)) { + halfword p = t; + while (token_link(p)) { + p = token_link(p); + } + token_link(p) = s; + token_link(source) = null; + } else { + tex_aux_append_copied_toks_list(nt, cmd, global, t, s); + } + break; + case 2: + /* prepend */ + if (immediate_permitted(nt,target)) { + halfword p = s; + while (token_link(p)) { + p = token_link(p); + } + token_link(source) = null; + set_token_link(p, t); + set_token_link(target, s); + } else { + tex_aux_append_copied_toks_list(nt, cmd, global, s, t); + } + break; + } + } + } + } else { + ASSIGN_1: + tex_aux_set_toks_register(nt, cmd, token_link(source), global); + token_link(source) = null; + } + tex_flush_token_list(source); + } + } else { + if (cur_cmd == register_toks_cmd) { + ns = register_toks_number(eq_value(cur_cs)); + } else if (cur_cmd == internal_toks_cmd) { + ns = internal_toks_number(eq_value(cur_cs)); + } else { + ns = tex_scan_toks_register_number(); + } + /*tex The action. */ + source = toks_register(ns); + if (source) { + if (target) { + halfword s = token_link(source); + halfword t = token_link(target); + switch (append) { + case 0: + /*assign */ + goto ASSIGN_2; + case 1: + /*append */ + if (immediate_permitted(nt, target)) { + halfword p = t; + while (token_link(p)) { + p = token_link(p); + } + while (s) { + p = tex_store_new_token(p, token_info(s)); + s = token_link(s); + } + } else { + tex_aux_append_copied_toks_list(nt, cmd, global, t, s); + } + break; + case 2: + if (immediate_permitted(nt, target)) { + halfword h = null; + halfword p = null; + while (s) { + p = tex_store_new_token(p, token_info(s)); + if (! h) { + h = p; + } + s = token_link(s); + } + set_token_link(p, t); + set_token_link(target, h); + } else { + tex_aux_append_copied_toks_list(nt, cmd, global, s, t); + } + break; + } + } else { + ASSIGN_2: + // set_toks_register(nt, source, global); + tex_add_token_reference(source); + eq_value(nt) = source; + } + } + } +} + +/*tex + + This routine, used in the next one, prints the job name, possibly modified by the + |process_jobname| callback. + +*/ + +static void tex_aux_print_job_name(void) +{ + if (lmt_fileio_state.job_name) { + /*tex \CCODE\ strings for jobname before and after processing. */ + char *s = lmt_fileio_state.job_name; + int callback_id = lmt_callback_defined(process_jobname_callback); + if (callback_id > 0) { + char *ss; + int lua_retval = lmt_run_callback(lmt_lua_state.lua_instance, callback_id, "S->S", s, &ss); + if (lua_retval && ss) { + s = ss; + } + } + tex_print_str(s); + } +} + +/*tex + + The procedure |run_convert_tokens| uses |str_toks| to insert the token list for |convert| + functions into the scanner; |\outer| control sequences are allowed to follow |\string| and + |\meaning|. + +*/ + +/*tex Codes not really needed but cleaner when testing */ + +# define push_selector { \ + saved_selector = lmt_print_state.selector; \ + lmt_print_state.selector = new_string_selector_code; \ +} + +# define pop_selector { \ + lmt_print_state.selector = saved_selector; \ +} + +void tex_run_convert_tokens(halfword code) +{ + /*tex Scan the argument for command |c|. */ + switch (code) { + /*tex + The |number_code| is quite popular. Beware, when used with a lua none function, a zero + is injected. We could intercept it at the cost of messy code, but on the other hand, + nothing guarantees that the call returns a number so this side effect can be defended + as a recovery measure. + */ + case number_code: + { + int saved_selector; + halfword v = tex_scan_int(0, NULL); + push_selector; + tex_print_int(v); + pop_selector; + break; + } + case to_integer_code: + case to_hexadecimal_code: + { + int saved_selector; + halfword v = tex_scan_int(0, NULL); + tex_get_x_token(); /* maybe not x here */ + if (cur_cmd != relax_cmd) { + tex_back_input(cur_tok); + } + push_selector; + if (code == to_integer_code) { + tex_print_int(v); + } else { + tex_print_hex(v); + } + pop_selector; + break; + } + case to_scaled_code: + case to_sparse_scaled_code: + case to_dimension_code: + case to_sparse_dimension_code: + { + int saved_selector; + halfword v = tex_scan_dimen(0, 0, 0, 0, NULL); + tex_get_x_token(); /* maybe not x here */ + if (cur_cmd != relax_cmd) { + tex_back_input(cur_tok); + } + push_selector; + switch (code) { + case to_sparse_dimension_code: + case to_sparse_scaled_code: + tex_print_sparse_dimension(v, no_unit); + break; + default: + tex_print_dimension(v, no_unit); + break; + } + switch (code) { + case to_dimension_code: + case to_sparse_dimension_code: + tex_print_unit(pt_unit); + break; + } + pop_selector; + break; + } + case to_mathstyle_code: + { + int saved_selector; + halfword v = tex_scan_math_style_identifier(1, 0); + push_selector; + tex_print_int(v); + pop_selector; + break; + } + case lua_function_code: + { + halfword v = tex_scan_int(0, NULL); + if (v > 0) { + strnumber u = tex_save_cur_string(); + lmt_token_state.luacstrings = 0; + lmt_function_call(v, 0); + tex_restore_cur_string(u); + if (lmt_token_state.luacstrings > 0) { + tex_lua_string_start(); + } + } else { + tex_normal_error("luafunction", "invalid number"); + } + return; + } + case lua_bytecode_code: + { + halfword v = tex_scan_int(0, NULL); + if (v < 0 || v > 65535) { + tex_normal_error("luabytecode", "invalid number"); + } else { + strnumber u = tex_save_cur_string(); + lmt_token_state.luacstrings = 0; + lmt_bytecode_call(v); + tex_restore_cur_string(u); + if (lmt_token_state.luacstrings > 0) { + tex_lua_string_start(); + } + } + return; + } + case lua_code: + { + full_scanner_status saved_full_status = tex_save_full_scanner_status(); + strnumber u = tex_save_cur_string(); + halfword s = tex_scan_toks_expand(0, NULL, 0); + tex_unsave_full_scanner_status(saved_full_status); + lmt_token_state.luacstrings = 0; + lmt_token_call(s); + tex_delete_token_reference(s); /* boils down to flush_list */ + tex_restore_cur_string(u); + if (lmt_token_state.luacstrings > 0) { + tex_lua_string_start(); + } + /*tex No further action. */ + return; + } + case expanded_code: + case semi_expanded_code: + { + full_scanner_status saved_full_status = tex_save_full_scanner_status(); + strnumber u = tex_save_cur_string(); + halfword s = tex_scan_toks_expand(0, NULL, code == semi_expanded_code); + tex_unsave_full_scanner_status(saved_full_status); + if (token_link(s)) { + tex_begin_inserted_list(token_link(s)); + token_link(s) = null; + } + tex_put_available_token(s); + tex_restore_cur_string(u); + /*tex No further action. */ + return; + } + /* case immediate_assignment_code: */ + /* case immediate_assigned_code: */ + /*tex + These two were an on-the-road-to-bachotex brain-wave. A first variant did more in + sequence till a relax or spacer was seen. These commands permits for instance setting + counters in full expansion. However, as we have the more powerful local control + mechanisms available these two commands have been dropped in \LUAMETATEX. Performance + wise there is not that much to gain from |\immediateassigned| and it's even somewhat + limited. So, they're gone now. Actually, one can also use the local control feature in + an |\edef|, which {\em is} rather efficient, so we're good anyway. The upgraded code + can be found in the archive. + */ + case string_code: + { + int saved_selector; + int saved_scanner_status = lmt_input_state.scanner_status; + lmt_input_state.scanner_status = scanner_is_normal; + tex_get_token(); + lmt_input_state.scanner_status = saved_scanner_status; + push_selector; + if (cur_cs) { + tex_print_cs(cur_cs); + } else { + tex_print_tex_str(cur_chr); + } + pop_selector; + break; + } + case cs_string_code: + { + int saved_selector; + int saved_scanner_status = lmt_input_state.scanner_status; + lmt_input_state.scanner_status = scanner_is_normal; + tex_get_token(); + lmt_input_state.scanner_status = saved_scanner_status; + push_selector; + if (cur_cs) { + tex_print_cs_name(cur_cs); + } else { + tex_print_tex_str(cur_chr); + } + pop_selector; + break; + } + case detokenized_code: + { + int saved_selector; + int saved_scanner_status = lmt_input_state.scanner_status; + halfword t = null; + lmt_input_state.scanner_status = scanner_is_normal; + tex_get_token(); + lmt_input_state.scanner_status = saved_scanner_status; + t = tex_get_available_token(cur_tok); + push_selector; + tex_show_token_list(t, null, extreme_token_show_max, 0); + tex_put_available_token(t); + pop_selector; + break; + } + case roman_numeral_code: + { + int saved_selector; + halfword v = tex_scan_int(0, NULL); + push_selector; + tex_print_roman_int(v); + pop_selector; + break; + } + case meaning_code: + case meaning_full_code: + case meaning_less_code: + case meaning_asis_code: + { + int saved_selector; + int saved_scanner_status = lmt_input_state.scanner_status; + lmt_input_state.scanner_status = scanner_is_normal; + tex_get_token(); + lmt_input_state.scanner_status = saved_scanner_status; + push_selector; + tex_print_meaning(code); + pop_selector; + break; + } + case uchar_code: + { + int saved_selector; + int chr = tex_scan_char_number(0); + push_selector; + tex_print_tex_str(chr); + pop_selector; + break; + } + case lua_escape_string_code: + { + lstring escstr; + int l = 0; + int e = lmt_token_state.in_lua_escape; + full_scanner_status saved_full_status = tex_save_full_scanner_status(); + halfword result = tex_scan_toks_expand(0, NULL, 0); + lmt_token_state.in_lua_escape = 1; + escstr.s = (unsigned char *) tex_tokenlist_to_tstring(result, 0, &l, 0, 0, 0); + escstr.l = (unsigned) l; + lmt_token_state.in_lua_escape = e; + tex_delete_token_reference(result); /* boils down to flush_list */ + tex_unsave_full_scanner_status(saved_full_status); + if (escstr.l) { + result = lmt_str_toks(escstr); + tex_begin_inserted_list(result); + } + return; + } + case font_name_code: + { + int saved_selector; + halfword fnt = tex_scan_font_identifier(NULL); + push_selector; + tex_print_font(fnt); + pop_selector; + break; + } + case font_specification_code: + { + int saved_selector; + halfword fnt = tex_scan_font_identifier(NULL); + push_selector; + tex_append_string((const unsigned char *) font_original(fnt), (unsigned) strlen(font_original(fnt))); + pop_selector; + break; + } + case job_name_code: + { + int saved_selector; + if (! lmt_fileio_state.job_name) { + tex_open_log_file(); + } + push_selector; + tex_aux_print_job_name(); + pop_selector; + break; + } + case format_name_code: + { + int saved_selector; + if (! lmt_fileio_state.job_name) { + tex_open_log_file(); + } + push_selector; + tex_print_tex_str(lmt_dump_state.format_name); + pop_selector; + break; + } + case luatex_banner_code: + { + int saved_selector; + push_selector; + tex_print_str(lmt_engine_state.luatex_banner); + pop_selector; + break; + } + default: + tex_confusion("convert tokens"); + break; + } + { + halfword head = tex_cur_str_toks(NULL); + tex_begin_inserted_list(head); + } +} + +/*tex + The boolean |in_lua_escape| is keeping track of the lua string escape state. +*/ + +strnumber tex_the_convert_string(halfword c, int i) +{ + int saved_selector = lmt_print_state.selector; + strnumber ret = 0; + int done = 1 ; + lmt_print_state.selector = new_string_selector_code; + switch (c) { + case number_code: + case to_integer_code: + tex_print_int(i); + break; + case to_hexadecimal_code: + tex_print_hex(i); + break; + case to_scaled_code: + tex_print_dimension(i, no_unit); + break; + case to_sparse_scaled_code: + tex_print_sparse_dimension(i, no_unit); + break; + case to_dimension_code: + tex_print_dimension(i, pt_unit); + break; + case to_sparse_dimension_code: + tex_print_sparse_dimension(i, pt_unit); + break; + /* case to_mathstyle_code: */ + /* case lua_function_code: */ + /* case lua_code: */ + /* case expanded_code: */ + /* case string_code: */ + /* case cs_string_code: */ + case roman_numeral_code: + tex_print_roman_int(i); + break; + /* case meaning_code: */ + case uchar_code: + tex_print_tex_str(i); + break; + /* case lua_escape_string_code: */ + case font_name_code: + tex_print_font(i); + break; + case font_specification_code: + tex_print_str(font_original(i)); + break; + /* case left_margin_kern_code: */ + /* case right_margin_kern_code: */ + /* case math_char_class_code: */ + /* case math_char_fam_code: */ + /* case math_char_slot_code: */ + /* case insert_ht_code: */ + case job_name_code: + tex_aux_print_job_name(); + break; + case format_name_code: + tex_print_tex_str(lmt_dump_state.format_name); + break; + case luatex_banner_code: + tex_print_str(lmt_engine_state.luatex_banner); + break; + case font_identifier_code: + tex_print_font_identifier(i); + break; + default: + done = 0; + break; + } + if (done) { + ret = tex_make_string(); + } + lmt_print_state.selector = saved_selector; + return ret; +} + +/*tex Return a string from tokens list: */ + +strnumber tex_tokens_to_string(halfword p) +{ + if (lmt_print_state.selector == new_string_selector_code) { + tex_normal_error("tokens", "tokens_to_string() called while selector = new_string"); + return get_nullstr(); + } else { + int saved_selector = lmt_print_state.selector; + lmt_print_state.selector = new_string_selector_code; + tex_token_show(p, extreme_token_show_max); + lmt_print_state.selector = saved_selector; + return tex_make_string(); + } +} + +/*tex + + The actual token conversion in this function is now functionally equivalent to |show_token_list|, + except that it always prints the whole token list. Often the result is not that large, for + instance |\directlua| is seldom large. However, this converter is also used for patterns + and exceptions where size is mnore an issue. For that reason we used to have three variants, + one of which (experimentally) used a buffer. At some point, in the manual we were talking of + millions of allocations but times have changed. + + Macros were used to inline the appending code (in the thre variants), but in the end I decided + to just merge all into one function, with a bit more overhead because we need to optionally + skip a macro preamble. + + Values like 512 and 128 also work ok. There is not much to gain in optimization here. We used + to have 3 mostly overlapping functions, one of which used a buffer. We can probably use a + larger default buffer size and larger step and only free when we think it's too large. + +*/ + +# define default_buffer_size 512 /*tex This used to be 256 */ +# define default_buffer_step 4096 /*tex When we're larger, we always are much larger. */ + +// todo: check ret + +static void tex_aux_make_room_in_buffer(int a) +{ + if (lmt_token_state.bufloc + a + 1 > lmt_token_state.bufmax) { + char *tmp = aux_reallocate_array(lmt_token_state.buffer, sizeof(unsigned char), lmt_token_state.bufmax + default_buffer_step, 1); + if (tmp) { + lmt_token_state.bufmax += default_buffer_step; + } else { + // error + } + lmt_token_state.buffer = tmp; + } +} + +static void tex_aux_append_uchar_to_buffer(int s) +{ + tex_aux_make_room_in_buffer(4); + if (s <= 0x7F) { + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (s); + } else if (s <= 0x7FF) { + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xC0 + (s / 0x40)); + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (s % 0x40)); + } else if (s <= 0xFFFF) { + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xE0 + (s / 0x1000)); + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x1000) / 0x40)); + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x1000) % 0x40)); + } else if (s >= 0x110000) { + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (s - 0x11000); + } else { + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xF0 + (s / 0x40000)); + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x40000) / 0x1000)); + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (((s % 0x40000) % 0x1000) / 0x40)); + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (((s % 0x40000) % 0x1000) % 0x40)); + } +} + +static void tex_aux_append_char_to_buffer(int c) +{ + tex_aux_make_room_in_buffer(1); + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (c); +} + +/*tex Only errors and unknowns. */ + +static void tex_aux_append_str_to_buffer(const char *s) +{ + const char *v = s; + tex_aux_make_room_in_buffer((int) strlen(v)); + /*tex Using memcpy will inline and give a larger binary ... and we seldom need this. */ + while (*v) { + lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (*v); + v++; + } +} + +/*tex Only bogus csnames. */ + +static void tex_aux_append_esc_to_buffer(const char *s) +{ + int e = escape_char_par; + if (e > 0 && e < cs_offset_value) { + tex_aux_append_uchar_to_buffer(e); + } + tex_aux_append_str_to_buffer(s); +} + +# define is_cat_letter(a) (tex_aux_the_cat_code(aux_str2uni(str_string((a)))) == letter_cmd) + +/* make two versions: macro and not */ + +char *tex_tokenlist_to_tstring(int pp, int inhibit_par, int *siz, int skippreamble, int nospace, int strip) +{ + if (pp) { + /*tex We need to go beyond the reference. */ + int p = token_link(pp); + if (p) { + if (lmt_token_state.bufmax > default_buffer_size) { + /* Let's start fresh and small. */ + aux_deallocate_array(lmt_token_state.buffer); + lmt_token_state.buffer = aux_allocate_clear_array(sizeof(unsigned char), default_buffer_size, 1); + lmt_token_state.bufmax = default_buffer_size; + } else if (! lmt_token_state.buffer) { + /* Let's start. */ + lmt_token_state.buffer = aux_allocate_clear_array(sizeof(unsigned char), default_buffer_size, 1); + lmt_token_state.bufmax = default_buffer_size; + } + lmt_token_state.bufloc = 0; + int e = escape_char_par; /*tex The serialization of the escape, normally a backlash. */ + int n = '0'; /*tex The character after |#|, so |#0| upto |#9| */ + int min = 0; + int max = lmt_token_memory_state.tokens_data.top; + int skip = 0; + if (skippreamble) { + skip = get_token_parameters(pp); + } + while (p) { + if (p < min || p > max) { + tex_aux_append_str_to_buffer(error_string_clobbered(31)); + break; + } else { + int infop = token_info(p); + if (infop < 0) { + /* unlikely, will go after checking */ + tex_aux_append_str_to_buffer(error_string_bad(32)); + } else if (infop < cs_token_flag) { + /*tex We nearly always end up here because otherwise we have an error. */ + int cmd = token_cmd(infop); + int chr = token_chr(infop); + switch (cmd) { + case left_brace_cmd: + case right_brace_cmd: + case math_shift_cmd: + case alignment_tab_cmd: + case superscript_cmd: + case subscript_cmd: + case spacer_cmd: + case letter_cmd: + case other_char_cmd: + if (! skip) { + tex_aux_append_uchar_to_buffer(chr); + } + break; + case parameter_cmd: + if (! skip) { + if (! nospace && (! lmt_token_state.in_lua_escape && (lmt_expand_state.cs_name_level == 0))) { + tex_aux_append_uchar_to_buffer(chr); + } + tex_aux_append_uchar_to_buffer(chr); + } + break; + case parameter_reference_cmd: + if (! skip) { + tex_aux_append_char_to_buffer(match_visualizer); + if (chr <= 9) { + tex_aux_append_char_to_buffer(chr + '0'); + } else { + tex_aux_append_char_to_buffer('!'); + goto EXIT; + } + } else { + if (chr > 9) { + goto EXIT; + } + } + break; + case match_cmd: + if (! skip) { + tex_aux_append_char_to_buffer(match_visualizer); + } + if (is_valid_match_ref(chr)) { + ++n; + } + if (! skip) { + tex_aux_append_char_to_buffer(chr ? chr : '0'); + } + if (n > '9') { + goto EXIT; + } + break; + case end_match_cmd: + if (chr == 0) { + if (! skip) { + tex_aux_append_char_to_buffer('-'); + tex_aux_append_char_to_buffer('>'); + } + skip = 0 ; + } + break; + /* + case string_cmd: + c = c + cs_offset_value; + do_make_room((int) str_length(c)); + for (int i = 0; i < str_length(c); i++) { + token_state.buffer[token_state.bufloc++] = str_string(c)[i]; + } + break; + */ + case end_paragraph_cmd: + if (! inhibit_par && (auto_paragraph_mode(auto_paragraph_text))) { + tex_aux_append_esc_to_buffer("par"); + } + break; + default: + tex_aux_append_str_to_buffer(tex_aux_special_cmd_string(cmd, chr, error_string_bad(33))); + break; + } + } else if (! (inhibit_par && infop == lmt_token_state.par_token)) { + int q = infop - cs_token_flag; + if (q < hash_base) { + if (q == null_cs) { + tex_aux_append_esc_to_buffer("csname"); + tex_aux_append_esc_to_buffer("endcsname"); + } else { + tex_aux_append_str_to_buffer(error_string_impossible(34)); + } + } else if (eqtb_out_of_range(q)) { + tex_aux_append_str_to_buffer(error_string_impossible(35)); + } else { + strnumber txt = cs_text(q); + if (txt < 0 || txt >= lmt_string_pool_state.string_pool_data.ptr) { + tex_aux_append_str_to_buffer(error_string_nonexistent(36)); + } else { + char *sh = tex_makecstring(txt); + char *s = sh; + if (tex_is_active_cs(txt)) { + s = s + 3; + while (*s) { + tex_aux_append_char_to_buffer(*s); + s++; + } + } else { + if (e >= 0 && e < 0x110000) { + tex_aux_append_uchar_to_buffer(e); + } + while (*s) { + tex_aux_append_char_to_buffer(*s); + s++; + } + if ((! nospace) && ((! tex_single_letter(txt)) || is_cat_letter(txt))) { + tex_aux_append_char_to_buffer(' '); + } + } + lmt_memory_free(sh); + } + } + } + p = token_link(p); + } + } + EXIT: + if (strip && lmt_token_state.bufloc > 1) { + if (lmt_token_state.buffer[lmt_token_state.bufloc-1] == strip) { + lmt_token_state.bufloc -= 1; + } + if (lmt_token_state.bufloc > 1 && lmt_token_state.buffer[0] == strip) { + memcpy(&lmt_token_state.buffer[0], &lmt_token_state.buffer[1], lmt_token_state.bufloc-1); + lmt_token_state.bufloc -= 1; + } + } + lmt_token_state.buffer[lmt_token_state.bufloc] = '\0'; + if (siz) { + *siz = lmt_token_state.bufloc; + } + return lmt_token_state.buffer; + } + } + if (siz) { + *siz = 0; + } + return NULL; +} + +/*tex + + The \LUA\ interface needs some extra functions. The functions themselves are quite boring, but + they are handy because otherwise this internal stuff has to be accessed from \CCODE\ directly, + where lots of the defines are not available. + +*/ + +halfword tex_get_tex_dimen_register (int j, int internal) { return internal ? dimen_parameter(j) : dimen_register(j) ; } +halfword tex_get_tex_skip_register (int j, int internal) { return internal ? glue_parameter(j) : skip_register(j) ; } +halfword tex_get_tex_mu_skip_register (int j, int internal) { return internal ? mu_glue_parameter(j) : mu_skip_register(j); } +halfword tex_get_tex_count_register (int j, int internal) { return internal ? count_parameter(j) : count_register(j) ; } +halfword tex_get_tex_attribute_register (int j, int internal) { return internal ? attribute_parameter(j) : attribute_register(j) ; } +halfword tex_get_tex_box_register (int j, int internal) { return internal ? box_parameter(j) : box_register(j) ; } + +void tex_set_tex_dimen_register(int j, halfword v, int flags, int internal) +{ + if (global_defs_par) { + flags = add_global_flag(flags); + } + if (internal) { + tex_assign_internal_dimen_value(flags, internal_dimen_location(j), v); + } else { + tex_word_define(flags, register_dimen_location(j), v); + } +} + +void tex_set_tex_skip_register(int j, halfword v, int flags, int internal) +{ + if (global_defs_par) { + flags = add_global_flag(flags); + } + if (internal) { + tex_assign_internal_skip_value(flags, internal_glue_location(j), v); + } else { + tex_word_define(flags, register_glue_location(j), v); + } +} + +void tex_set_tex_mu_skip_register(int j, halfword v, int flags, int internal) +{ + if (global_defs_par) { + flags = add_global_flag(flags); + } + tex_word_define(flags, internal ? internal_mu_glue_location(j) : register_mu_glue_location(j), v); +} + +void tex_set_tex_count_register(int j, halfword v, int flags, int internal) +{ + if (global_defs_par) { + flags = add_global_flag(flags); + } + if (internal) { + tex_assign_internal_int_value(flags, internal_int_location(j), v); + } else { + tex_word_define(flags, register_int_location(j), v); + } +} + +void tex_set_tex_attribute_register(int j, halfword v, int flags, int internal) +{ + if (global_defs_par) { + flags = add_global_flag(flags); + } + if (j > lmt_node_memory_state.max_used_attribute) { + lmt_node_memory_state.max_used_attribute = j; + } + change_attribute_register(flags, register_attribute_location(j), v); + tex_word_define(flags, internal ? internal_attribute_location(j) : register_attribute_location(j), v); +} + +void tex_set_tex_box_register(int j, halfword v, int flags, int internal) +{ + if (global_defs_par) { + flags = add_global_flag(flags); + } + if (internal) { + tex_define(flags, internal_box_location(j), internal_box_reference_cmd, v); + } else { + tex_define(flags, register_box_location(j), register_box_reference_cmd, v); + } +} + +void tex_set_tex_toks_register(int j, lstring s, int flags, int internal) +{ + halfword ref = get_reference_token(); + halfword head = tex_str_toks(s, NULL); + set_token_link(ref, head); + if (global_defs_par) { + flags = add_global_flag(flags); + } + if (internal) { + tex_define(flags, internal_toks_location(j), internal_toks_reference_cmd, ref); + } else { + tex_define(flags, register_toks_location(j), register_toks_reference_cmd, ref); + } +} + +void tex_scan_tex_toks_register(int j, int c, lstring s, int flags, int internal) +{ + halfword ref = get_reference_token(); + halfword head = tex_str_scan_toks(c, s); + set_token_link(ref, head); + if (global_defs_par) { + flags = add_global_flag(flags); + } + if (internal) { + tex_define(flags, internal_toks_location(j), internal_toks_reference_cmd, ref); + } else { + tex_define(flags, register_toks_location(j), register_toks_reference_cmd, ref); + } +} + +int tex_get_tex_toks_register(int j, int internal) +{ + halfword t = internal ? toks_parameter(j) : toks_register(j); + if (t) { + return tex_tokens_to_string(t); + } else { + return get_nullstr(); + } +} + +/* Options: (0) error when undefined [bad], (1) create [but undefined], (2) ignore [discard] */ + +halfword tex_parse_str_to_tok(halfword head, halfword *tail, halfword ct, const char *str, size_t lstr, int option) +{ + halfword p = null; + if (! head) { + head = get_reference_token(); + } + p = (tail && *tail) ? *tail : head; + if (lstr > 0) { + const char *se = str + lstr; + while (str < se) { + /*tex hh: |str2uni| could return len too (also elsewhere) */ + halfword u = (halfword) aux_str2uni((const unsigned char *) str); + halfword t = null; + halfword cc = tex_get_cat_code(ct, u); + str += utf8_size(u); + /*tex + This is a relating simple converter; if more is needed one can just use + |tex.print| with a regular |\def| or |\gdef| and feed the string into the + regular scanner. + */ + switch (cc) { + case escape_cmd: + { + /*tex We have a potential control sequence so we check for it. */ + int lname = 0; + const char *name = str; + while (str < se) { + halfword u = (halfword) aux_str2uni((const unsigned char *) str); + int s = utf8_size(u); + int c = tex_get_cat_code(ct, u); + if (c == letter_cmd) { + str += s; + lname += s; + } else if (c == spacer_cmd) { + /*tex We ignore a trailing space like normal scanning does. */ + if (lname == 0) { + // if (u == 32) { + lname += s; + } + str += s; + break ; + } else { + if (lname == 0) { + lname += s; + str += s; + } + break ; + } + } + if (lname > 0) { + /*tex We have a potential |\cs|. */ + halfword cs = tex_string_locate(name, lname, option == 1 ? 1 : 0); /* 1 == create */ + if (cs == undefined_control_sequence) { + if (option == 2) { + /*tex We ignore unknown commands. */ + // t = null; + } else { + /*tex We play safe and backtrack, as we have option 0, but never used anyway. */ + t = u + (cc * (1<<21)); + str = name; + } + } else { + /* We end up here when option is 1. */ + t = cs_token_flag + cs; + } + } else { + /*tex + Just a character with some meaning, so |\unknown| becomes effectively + |\unknown| assuming that |\\| has some useful meaning of course. + */ + t = u + (cc * (1 << 21)); + str = name; + } + break; + } + case comment_cmd: + goto DONE; + case ignore_cmd: + break; + case spacer_cmd: + /* t = u + (cc * (1<<21)); */ + t = token_val(spacer_cmd, ' '); + break; + default: + /*tex + Whatever token, so for instance $x^2$ just works given a tex catcode regime. + */ + t = u + (cc * (1<<21)); + break; + } + if (t) { + p = tex_store_new_token(p, t); + } + } + } + DONE: + if (tail) { + *tail = p; + } + return head; +} + +/*tex So far for the helpers. */ + +void tex_dump_token_mem(dumpstream f) +{ + /*tex + It doesn't pay off to prune the available list. We save less than 10K if we do this and + it assumes a sequence at the end. It doesn't help that the list is in reverse order so + we just dump the lot. But we do check the allocated size. We cheat a bit in reducing + the ptr so that we can set the the initial counter on loading. + */ + halfword p = lmt_token_memory_state.available; + halfword u = lmt_token_memory_state.tokens_data.top + 1; + while (p) { + --u; + p = token_link(p); + } + lmt_token_memory_state.tokens_data.ptr = u; + dump_int(f, lmt_token_state.null_list); /* the only one left */ + dump_int(f, lmt_token_memory_state.tokens_data.allocated); + dump_int(f, lmt_token_memory_state.tokens_data.top); + dump_int(f, lmt_token_memory_state.tokens_data.ptr); + dump_int(f, lmt_token_memory_state.available); + dump_things(f, lmt_token_memory_state.tokens[0], lmt_token_memory_state.tokens_data.top + 1); +} + +void tex_undump_token_mem(dumpstream f) +{ + undump_int(f, lmt_token_state.null_list); /* the only one left */ + undump_int(f, lmt_token_memory_state.tokens_data.allocated); + undump_int(f, lmt_token_memory_state.tokens_data.top); + undump_int(f, lmt_token_memory_state.tokens_data.ptr); + undump_int(f, lmt_token_memory_state.available); + tex_initialize_token_mem(); + undump_things(f, lmt_token_memory_state.tokens[0], lmt_token_memory_state.tokens_data.top + 1); +} |