/*
    See license.txt in the root of this project.
*/

# include "luametatex.h"

/*tex Todo: move some helpers to other places. */

/*tex

    Return the category code of character |b| for the catcode regime of the current input level.
    When that level carries |default_catcode_table_preset| the table selected by
    |cat_code_table_par| is consulted. A value above |-0xFF| is used as the table to look in.
    Any other value is decoded directly as |- cattable - 0xFF| without a table lookup.

*/

inline static int tex_aux_the_cat_code(halfword b)
{
    int regime = lmt_input_state.cur_input.cattable;
    if (regime == default_catcode_table_preset) {
        return tex_get_cat_code(cat_code_table_par, b);
    } else if (regime > -0xFF) {
        return tex_get_cat_code(regime, b);
    } else {
        return - regime - 0xFF;
    }
}

/*tex

    The \TEX\ system does nearly all of its own memory allocation, so that it can readily be
    transported into environments that do not have automatic facilities for strings, garbage
    collection, etc., and so that it can be in control of what error messages the user receives.
    The dynamic storage requirements of \TEX\ are handled by providing two large arrays called
    |fixmem| and |varmem| in which consecutive blocks of words are used as nodes by the \TEX\
    routines.

    Pointer variables are indices into this array, or into another array called |eqtb| that will
    be explained later. A pointer variable might also be a special flag that lies outside the
    bounds of |mem|, so we allow pointers to assume any |halfword| value. The minimum halfword
    value represents a null pointer. \TEX\ does not assume that |mem[null]| exists.

    Locations in |fixmem| are used for storing one-word records; a conventional |AVAIL| stack is
    used for allocation in this array.

    One can make an argument to switch to standard \CCODE\ allocation but the current approach is
    very efficient in memory usage and performance so we stay with it. On the average memory
    consumption of \TEX\ is not that large, definitely not compared to other programs that deal
    with text.

    The big dynamic storage area is named |fixmem| where the smallest location of one|-|word
    memory in use is |fix_mem_min| and the largest location of one|-|word memory in use is
    |fix_mem_max|. The |dyn_used| variable keeps track of how much memory is in use.
The head of the list of available one|-|word nodes is registered in |avail|. The last
    one|-|word node used in |mem| is |fix_mem_end|. All these variables are packed in the
    structure |token_memory_state|.

*/

/*tex
    The token memory bookkeeping. The array itself starts out unallocated; the limits are taken
    from the |*_token_size| constants and every counter starts at zero.
*/

token_memory_state_info lmt_token_memory_state = {
    .tokens      = NULL,
    .tokens_data = {
        .minimum   = min_token_size,
        .maximum   = max_token_size,
        .size      = siz_token_size,
        .step      = stp_token_size,
        .allocated = 0,
        .itemsize  = sizeof(memoryword),
        .top       = 0,
        .ptr       = 0, /* used to register usage */
        .initial   = 0,
        .offset    = 0,
    },
    .available = 0,
    .padding   = 0,
};

/*tex

    Token data has its own memory space. Again we have some state variables: |temp_token_head| is
    the head of a (temporary) list of some kind as are |hold_token_head| and |omit_template|. A
    permanently empty list is available in |null_list| and the head of the token list built by
    |scan_keyword| is registered in |backup_head|.

    All these variables are packed in the structure |token_data| but some have been moved to a
    more relevant state (so omit and hold are now in the alignment state).

*/

token_state_info lmt_token_state = {
    .null_list      = null,
    .in_lua_escape  = 0,
    .force_eof      = 0,
    .luacstrings    = 0,
    .par_loc        = null,
    .par_token      = null,
 /* .line_par_loc   = null, */ /* removed because not really used and useful */
 /* .line_par_token = null, */ /* idem */
    .buffer         = NULL,
    .bufloc         = 0,
    .bufmax         = 0,
    .empty          = null,
};

/*tex Some properties are dumped in the format so these are set already!
*/ # define reserved_token_mem_slots 2 // play safe for slight overuns void tex_initialize_token_mem(void) { memoryword *tokens = NULL; int size = 0; if (lmt_main_state.run_state == initializing_state) { size = lmt_token_memory_state.tokens_data.minimum; } else { size = lmt_token_memory_state.tokens_data.allocated; lmt_token_memory_state.tokens_data.initial = lmt_token_memory_state.tokens_data.ptr; } if (size > 0) { tokens = aux_allocate_clear_array(sizeof(memoryword), size, reserved_token_mem_slots); } if (tokens) { lmt_token_memory_state.tokens = tokens; lmt_token_memory_state.tokens_data.allocated = size; } else { tex_overflow_error("tokens", size); } } static void tex_aux_bump_token_memory(void) { /*tex We need to manage the big dynamic storage area. */ int size = lmt_token_memory_state.tokens_data.allocated + lmt_token_memory_state.tokens_data.step; if (size > lmt_token_memory_state.tokens_data.size) { lmt_run_memory_callback("token", 0); tex_show_runaway(); tex_overflow_error("token memory size", lmt_token_memory_state.tokens_data.allocated); } else { memoryword *tokens = aux_reallocate_array(lmt_token_memory_state.tokens, sizeof(memoryword), size, reserved_token_mem_slots); lmt_run_memory_callback("token", tokens ? 1 : 0); if (tokens) { lmt_token_memory_state.tokens = tokens; } else { /*tex If memory is exhausted, display possible runaway text. */ tex_show_runaway(); tex_overflow_error("token memory size", lmt_token_memory_state.tokens_data.allocated); } } memset((void *) (lmt_token_memory_state.tokens + lmt_token_memory_state.tokens_data.allocated + 1), 0, ((size_t) lmt_token_memory_state.tokens_data.step + reserved_token_mem_slots) * sizeof(memoryword)); lmt_token_memory_state.tokens_data.allocated = size; } void tex_initialize_tokens(void) { lmt_token_memory_state.available = null; lmt_token_memory_state.tokens_data.top = 0; lmt_token_state.null_list = tex_get_available_token(null); lmt_token_state.in_lua_escape = 0; } /*tex Experiment. 
It saves some 512K on the \CONTEXT\ format of October 2020. It makes me wonder if I should spend some time on optimizing token lists (kind of cisc commands as we're currently kind of risc). */ void tex_compact_tokens(void) { int nc = 0; // memoryword *target = allocate_array(sizeof(memoryword), (size_t) token_memory_state.tokens_data.allocated, 0); memoryword *target = aux_allocate_clear_array(sizeof(memoryword), lmt_token_memory_state.tokens_data.allocated, 0); halfword *mapper = aux_allocate_array(sizeof(halfword), lmt_token_memory_state.tokens_data.allocated, 0); int nofluacmds = 0; if (target && mapper) { memoryword *tokens = lmt_token_memory_state.tokens; memset((void *) mapper, -1, ((size_t) lmt_token_memory_state.tokens_data.allocated) * sizeof(halfword)); /* also reset available */ for (int cs = 0; cs < (eqtb_size + lmt_hash_state.hash_data.ptr + 1); cs++) { switch (eq_type(cs)) { case call_cmd: case protected_call_cmd: case semi_protected_call_cmd: case tolerant_call_cmd: case tolerant_protected_call_cmd: case tolerant_semi_protected_call_cmd: case internal_toks_reference_cmd: case register_toks_reference_cmd: { halfword v = eq_value(cs); /* ref count token*/ if (v) { if (mapper[v] < 0) { // printf("before =>"); { halfword tt = v; while (tt) { printf("%7d ",tt); tt = token_link(tt); } } printf("\n"); halfword t = v; nc++; mapper[v] = nc; /* new ref count token index */ while (1) { target[nc].half1 = tokens[t].half1; /* info cq. 
ref count */ t = tokens[t].half0; if (t) { nc++; target[nc-1].half0 = nc; /* link to next */ } else { target[nc].half0 = null; /* link to next */ break; } } // printf("after =>"); { halfword tt = mapper[v]; while (tt) { printf("%7d ",tt); tt = target[tt].half0; } } printf("\n"); } eq_value(cs) = mapper[v]; } break; } case lua_value_cmd: case lua_call_cmd: case lua_local_call_cmd: { ++nofluacmds; break; } } } lmt_token_state.empty = mapper[lmt_token_state.empty]; // print(dump_state.format_identifier); tex_print_format("tokenlist compacted from %i to %i entries, ", lmt_token_memory_state.tokens_data.top, nc); if (nofluacmds) { /*tex We just mention them because when these are aliased the macro package needs to make sure that after loading that happens again because registered funciton references can have changed between format generation and run! */ tex_print_format("%i potentially aliased lua call/value entries, ", nofluacmds); } lmt_token_memory_state.tokens_data.top = nc; lmt_token_memory_state.tokens_data.ptr = nc; aux_deallocate_array(lmt_token_memory_state.tokens); lmt_token_memory_state.tokens = target; lmt_token_memory_state.available = null; } else { tex_overflow_error("token compaction size", lmt_token_memory_state.tokens_data.allocated); } } /*tex The function |get_avail| returns a pointer (index) to a new one word node whose |link| field is |null| (which is just 0). However, \TEX\ will halt if there is no more room left. If the available space list is empty, i.e., if |avail = null|, we try first to increase |fix_mem_end|. If that cannot be done, i.e., if |fix_mem_end = fix_mem_max|, we try to reallocate array |fixmem|. If, that doesn't work, we have to quit. Users can configure \TEX\ to use a lot of memory but in some scenarios limitations make sense. Remark: we can have a pool of chunks where we get from or just allocate per token (as we have lots of them that is slow). But then format loading becomes much slower as we need to recreate the linked list. 
A no go. In todays terms \TEX\ memory usage is low anyway. The freed tokens are kept in a linked list. First we check if we can quickly get one of these. If that fails, we try to get one from the available pool. If that fails too, we enlarge the pool and try again. We keep track of the used number of tokens. We also make sure that the tokens links to nothing. One problem is of course that tokens can be scattered over memory. We could have some sorter that occasionally kicks in but it doesn't pay off. Normally definitions (in the format) are in sequence but a normal run \unknown\ it would be interesting to know if this impacts the cache. */ halfword tex_get_available_token(halfword t) { halfword p = lmt_token_memory_state.available; if (p) { lmt_token_memory_state.available = token_link(p); } else if (lmt_token_memory_state.tokens_data.top < lmt_token_memory_state.tokens_data.allocated) { p = ++lmt_token_memory_state.tokens_data.top; } else { tex_aux_bump_token_memory(); p = ++lmt_token_memory_state.tokens_data.top; } ++lmt_token_memory_state.tokens_data.ptr; token_link(p) = null; token_info(p) = t; return p; } /*tex Because we only have forward links, a freed token ends up at the head of the list of available tokens. */ void tex_put_available_token(halfword p) { token_link(p) = lmt_token_memory_state.available; lmt_token_memory_state.available = p; --lmt_token_memory_state.tokens_data.ptr; } halfword tex_store_new_token(halfword p, halfword t) { halfword q = tex_get_available_token(t); token_link(p) = q; return q; } /*tex The procedure |flush_list (p)| frees an entire linked list of oneword nodes that starts at position |p|. It makes list of single word nodes available. The second variant in principle is faster but in practice this goes unnoticed. Of course there is a little price to pay for keeping track of memory usage. 
*/ void tex_flush_token_list(halfword head) { if (head) { halfword current = head; halfword tail; int i = 0; do { ++i; tail = current; current = token_link(tail); } while (current); lmt_token_memory_state.tokens_data.ptr -= i; token_link(tail) = lmt_token_memory_state.available; lmt_token_memory_state.available = head; } } void tex_flush_token_list_head_tail(halfword head, halfword tail, int n) { if (head) { lmt_token_memory_state.tokens_data.ptr -= n; token_link(tail) = lmt_token_memory_state.available; lmt_token_memory_state.available = head; } } void tex_add_token_reference(halfword p) { if (get_token_reference(p) < max_token_reference) { add_token_reference(p); // } else { // tex_overflow_error("reference count", max_token_reference); } } void tex_increment_token_reference(halfword p, int n) { if ((get_token_reference(p) + n) < max_token_reference) { inc_token_reference(p, n); } else { inc_token_reference(p, max_token_reference - get_token_reference(p)); // } else { // tex_overflow_error("reference count", max_token_reference); } } void tex_delete_token_reference(halfword p) { if (p) { halfword r = get_token_reference(p); if (! r) { tex_flush_token_list(p); } if (r < max_token_reference) { sub_token_reference(p); } } } /*tex A \TEX\ token is either a character or a control sequence, and it is represented internally in one of two ways: \startitemize[n] \startitem A character whose ASCII code number is |c| and whose command code is |m| is represented as the number $2^{21}m+c$; the command code is in the range |1 <= m <= 14|. \stopitem \startitem A control sequence whose |eqtb| address is |p| is represented as the number |cs_token_flag+p|. Here |cs_token_flag = t =| $2^{25}-1$ is larger than $2^{21}m+c$, yet it is small enough that |cs_token_flag + p < max_halfword|; thus, a token fits comfortably in a halfword. 
\stopitem \stopitemize A token |t| represents a |left_brace| command if and only if |t < left_brace_limit|; it represents a |right_brace| command if and only if we have |left_brace_limit <= t < right_brace_limit|; and it represents a |match| or |end_match| command if and only if |match_token <= t <= end_match_token|. The following definitions take care of these token-oriented constants and a few others. A token list is a singly linked list of one-word nodes in |mem|, where each word contains a token and a link. Macro definitions, output routine definitions, marks, |\write| texts, and a few other things are remembered by \TEX\ in the form of token lists, usually preceded by a node with a reference count in its |token_ref_count| field. The token stored in location |p| is called |info(p)|. Three special commands appear in the token lists of macro definitions. When |m = match|, it means that \TEX\ should scan a parameter for the current macro; when |m = end_match|, it means that parameter matching should end and \TEX\ should start reading the macro text; and when |m = out_param|, it means that \TEX\ should insert parameter number |c| into the text at this point. The enclosing |\char'173| and |\char'175| characters of a macro definition are omitted, but the final right brace of an output routine is included at the end of its token list. Here is an example macro definition that illustrates these conventions. After \TEX\ processes the text: \starttyping \def\mac a#1#2 \b {#1\-a ##1#2 \#2\} \stoptyping The definition of |\mac| is represented as a token list containing: \starttyping (reference count) letter a match # match # spacer \b end_match out_param1 \- letter a spacer, mac_param # other_char 1 out_param2 spacer out_param 2 \stoptyping The procedure |scan_toks| builds such token lists, and |macro_call| does the parameter matching. 
Examples such as |\def \m {\def \m {a} b}| explain why reference counts would be needed even
    if \TEX\ had no |\let| operation: When the token list for |\m| is being read, the
    redefinition of |\m| changes the |eqtb| entry before the token list has been fully consumed,
    so we dare not simply destroy a token list when its control sequence is being redefined.

    If the parameter-matching part of a definition ends with |#{}|, the corresponding token list
    will have |{| just before the |end_match| and also at the very end. The first |{| is used to
    delimit the parameter; the second one keeps the first from disappearing.

    The |print_meaning| subroutine displays |cur_cmd| and |cur_chr| in symbolic form, including
    the expansion of a macro or mark.

*/

/*tex
    Print the meaning of the current token (|cur_cmd|, |cur_chr|, |cur_cs|) in the variant that
    is selected by |code|. For an untraced control sequence only its name is shown when it is a
    macro or \LUA\ call. The labels at the end are shared exits: |FOLLOWUP| prints a colon,
    |DETAILS| starts a new line and shows the token list that |cur_chr| refers to.
*/

void tex_print_meaning(halfword code)
{
    /*tex

        This would make sense but some macro packages don't like it:

        \starttyping
        if (cur_cmd == math_given_cmd) {
            cur_cmd = math_xgiven_cmd ;
        }
        \stoptyping

        Eventually we might just do it that way. We also can have |\meaningonly| that omits the
        |macro:| and arguments.
    */
    int untraced = is_untraced(eq_flag(cur_cs));
    if (! untraced) {
        /*tex The flags (prefixes) are only reported in these four variants. */
        switch (code) {
            case meaning_code:
            case meaning_full_code:
            case meaning_ful_code:
            case meaning_asis_code:
                tex_print_cmd_flags(cur_cs, cur_cmd, (code != meaning_code), code == meaning_asis_code);
                break;
        }
    }
    switch (cur_cmd) {
        case call_cmd:
        case protected_call_cmd:
        case semi_protected_call_cmd:
        case tolerant_call_cmd:
        case tolerant_protected_call_cmd:
        case tolerant_semi_protected_call_cmd:
            if (untraced) {
                tex_print_cs(cur_cs);
                return;
            } else {
                /*tex A reference count that sits at its maximum is reported as constant. */
                int constant = (cur_chr && get_token_reference(cur_chr) == max_token_reference);
                switch (code) {
                    case meaning_code:
                    case meaning_full_code:
                    case meaning_ful_code:
                        if (constant) {
                            tex_print_str("constant ");
                        }
                        tex_print_str("macro");
                        if (code == meaning_ful_code) {
                            /*tex This variant stops after the word |macro|. */
                            return;
                        } else {
                            goto FOLLOWUP;
                        }
                    case meaning_asis_code:
                        if (constant) {
                            tex_print_str_esc("constant ");
                        }
                     // tex_print_format("%e%C %S ", def_cmd, def_code, cur_cs);
                        tex_print_cmd_chr(def_cmd, def_code);
                        tex_print_char(' ');
                        tex_print_cs(cur_cs);
                        tex_print_char(' ');
                        if (cur_chr && token_link(cur_chr)) {
                            tex_show_token_list(token_link(cur_chr), get_token_preamble(cur_chr));
                        }
                        return;
                    case meaning_les_code:
                        if (cur_chr && token_link(cur_chr)) {
                            tex_show_token_list(token_link(cur_chr), 2);
                        }
                        return;
                }
                /*tex Any other |code| shows the list without the |macro:| prefix. */
                goto DETAILS;
            }
        case get_mark_cmd:
            tex_print_cmd_chr((singleword) cur_cmd, cur_chr);
            tex_print_char(':');
            tex_print_nlp();
            tex_token_show(tex_get_some_mark(cur_chr, 0));
            return;
        case lua_value_cmd:
        case lua_call_cmd:
        case lua_local_call_cmd:
        case lua_protected_call_cmd:
            if (untraced) {
                tex_print_cs(cur_cs);
                return;
            } else {
                goto DEFAULT;
            }
        case if_test_cmd:
            if (cur_chr > last_if_test_code) {
                tex_print_cs(cur_cs);
                return;
            } else {
                goto DEFAULT;
            }
        default:
          DEFAULT:
            tex_print_cmd_chr((singleword) cur_cmd, cur_chr);
            if (cur_cmd < call_cmd) {
                return;
            } else {
                /* all kind of reference cmds */
                break;
            }
    }
  FOLLOWUP:
    tex_print_char(':');
  DETAILS:
    tex_print_nlp();
    tex_token_show(cur_chr);
}

/*tex

    The procedure |show_token_list|, which prints a symbolic form of the
token list that starts at a given node |p|, illustrates these conventions. The token list being displayed should not begin with a reference count. However, the procedure is intended to be robust, so that if the memory links are awry or if |p| is not really a pointer to a token list, nothing catastrophic will happen. An additional parameter |q| is also given; this parameter is either null or it points to a node in the token list where a certain magic computation takes place that will be explained later. Basically, |q| is non-null when we are printing the two-line context information at the time of an error message; |q| marks the place corresponding to where the second line should begin. For example, if |p| points to the node containing the first |a| in the token list above, then |show_token_list| will print the string \starttyping a#1#2 \b ->#1-a ##1#2 #2 \stoptyping and if |q| points to the node containing the second |a|, the magic computation will be performed just before the second |a| is printed. The generation will stop, and |\ETC.| will be printed, if the length of printing exceeds a given limit~|l|. Anomalous entries are printed in the form of control sequences that are not followed by a blank space, e.g., |\BAD.|; this cannot be confused with actual control sequences because a real control sequence named |BAD| would come out |\BAD |. In \LUAMETATEX\ we have some more node types and token types so we also have additional tracing. Because there is some more granularity in for instance nodes (subtypes) more detail is reported. It made sense to split the |tex_show_token_list| function in two, one specialized for showing the context. That saves some testing and passing arguments. 
*/

/*tex
    Give a descriptive string for a command that has no natural textual form. For a command that
    is not listed here a message goes to |stdout| via |printf| and the |unknown| string that the
    caller passed is returned.
*/

static const char *tex_aux_special_cmd_string(halfword cmd, halfword chr, const char *unknown)
{
    switch (cmd) {
        case node_cmd               : return "[[special cmd: node pointer]]";
        case lua_protected_call_cmd : return "[[special cmd: lua protected call]]";
        case lua_value_cmd          : return "[[special cmd: lua value call]]";
        case iterator_value_cmd     : return "[[special cmd: iterator value]]";
        case lua_call_cmd           : return "[[special cmd: lua call]]";
        case lua_local_call_cmd     : return "[[special cmd: lua local call]]";
        case begin_local_cmd        : return "[[special cmd: begin local call]]";
        case end_local_cmd          : return "[[special cmd: end local call]]";
     // case prefix_cmd             : return "[[special cmd: enforced]]";
        case prefix_cmd             : return "\\always ";
        default                     : printf("[[unknown cmd: (%i,%i)]\n", cmd, chr); return unknown;
    }
}

/*tex
    Print the token list that starts at |p|. The |asis| argument controls what happens at the
    |end_match| token: with value 1 a left brace is printed there and a right brace after the
    last token, with value 2 printing stops there, and with any other value |->| is printed when
    the character code of that token is zero. Printing also stops at a link that is negative or
    beyond the highest token in use, after reporting that.
*/

void tex_show_token_list(halfword p, int asis)
{
    if (p) {
        /*tex This counts the match tokens for which |is_valid_match_ref| holds. */
        unsigned char n = 0;
        int max = lmt_token_memory_state.tokens_data.top;
        while (p) {
            if (p < 0 || p > max) {
                tex_print_str(error_string_clobbered(41));
                return;
            } else if (token_info(p) >= cs_token_flag) {
                tex_print_cs_checked(token_info(p) - cs_token_flag);
            } else if (token_info(p) > 0) {
                int cmd = token_cmd(token_info(p));
                int chr = token_chr(token_info(p));
                switch (cmd) {
                    case left_brace_cmd:
                    case right_brace_cmd:
                    case math_shift_cmd:
                    case alignment_tab_cmd:
                    case superscript_cmd:
                    case subscript_cmd:
                    case spacer_cmd:
                    case letter_cmd:
                    case other_char_cmd:
                    case active_char_cmd:
                    case ignore_cmd:
                        tex_print_tex_str(chr);
                        break;
                    case parameter_cmd:
                        /*tex The character is doubled, unless we are in a \LUA\ escape or csname. */
                        if (! lmt_token_state.in_lua_escape && (lmt_expand_state.cs_name_level == 0)) {
                            tex_print_tex_str(chr);
                        }
                        tex_print_tex_str(chr);
                        break;
                    case parameter_reference_cmd:
                        tex_print_tex_str(match_visualizer);
                        if (chr <= 9) {
                            tex_print_char(chr + '0');
                        } else if (chr <= max_match_count) {
                            tex_print_char(chr + '0' + gap_match_count);
                        } else {
                            tex_print_char('!');
                            return;
                        }
                        break;
                    case match_cmd:
                        tex_print_char(match_visualizer);
                        if (is_valid_match_ref(chr)) {
                            ++n;
                        }
                        tex_print_char(chr ? chr : '0');
                        if (n > max_match_count) {
                            return;
                        } else {
                            break;
                        }
                    case end_match_cmd:
                        switch (asis) {
                            case 1:
                                tex_print_char('{');
                                break;
                            case 2:
                                return;
                            default:
                                if (chr == 0) {
                                    tex_print_str("->");
                                }
                                break;
                        }
                        break;
                    case ignore_something_cmd:
                        break;
                    case set_font_cmd:
                        /*tex NOTE(review): this uses |cur_val| and not |chr|, to be confirmed. */
                        tex_print_format("[font->%s]", font_original(cur_val));
                        break;
                    case end_paragraph_cmd:
                     /* tex_print_format("%e%s", "par "); */
                        tex_print_str_esc("par ");
                        break;
                    default:
                        tex_print_str(tex_aux_special_cmd_string(cmd, chr, error_string_bad(43)));
                        break;
                }
            } else {
                tex_print_str(error_string_bad(42));
            }
            p = token_link(p);
        }
        if (asis == 1) {
            tex_print_char('}');
        }
    }
}

/*tex
    The variant that is used for showing context: the print tally is reset first and when we
    arrive at token |q| the trick count is set. At |end_match| this one always prints |->|.
*/

void tex_show_token_list_context(halfword p, halfword q)
{
    if (p) {
        /*tex the highest parameter number, as an \ASCII\ digit */
        unsigned char n = 0;
        int max = lmt_token_memory_state.tokens_data.top;
        lmt_print_state.tally = 0;
        while (p) {
            if (p == q) {
                /*tex Do magic computation. We only end up here in context showing. */
                tex_set_trick_count();
            }
            /*tex Display token |p|, and |return| if there are problems. */
            if (p < 0 || p > max) {
                tex_print_str(error_string_clobbered(41));
                return;
            } else if (token_info(p) >= cs_token_flag) {
             // if (! ((print_state.inhibit_par_tokens) && (token_info(p) == token_state.par_token))) {
                    tex_print_cs_checked(token_info(p) - cs_token_flag);
             // }
            } else if (token_info(p) > 0) {
                int cmd = token_cmd(token_info(p));
                int chr = token_chr(token_info(p));
                /*
                    Display the token (|cmd|,|chr|). The procedure usually \quote {learns} the
                    character code used for macro parameters by seeing one in a |match| command
                    before it runs into any |out_param| commands. This is probably not true any
                    longer.
                */
                switch (cmd) {
                    case left_brace_cmd:
                    case right_brace_cmd:
                    case math_shift_cmd:
                    case alignment_tab_cmd:
                    case superscript_cmd:
                    case subscript_cmd:
                    case spacer_cmd:
                    case letter_cmd:
                    case other_char_cmd:
                    case active_char_cmd: /* new */
                    case ignore_cmd: /* new */
                        tex_print_tex_str(chr);
                        break;
                    case parameter_cmd:
                        /*tex The character is doubled, unless we are in a \LUA\ escape or csname. */
                        if (! lmt_token_state.in_lua_escape && (lmt_expand_state.cs_name_level == 0)) {
                            tex_print_tex_str(chr);
                        }
                        tex_print_tex_str(chr);
                        break;
                    case parameter_reference_cmd:
                        tex_print_tex_str(match_visualizer);
                        if (chr <= 9) {
                            tex_print_char(chr + '0');
                        } else if (chr <= max_match_count) {
                            tex_print_char(chr + '0' + gap_match_count);
                        } else {
                            tex_print_char('!');
                            return;
                        }
                        break;
                    case match_cmd:
                        tex_print_char(match_visualizer);
                        if (is_valid_match_ref(chr)) {
                            ++n;
                        }
                        tex_print_char(chr ? chr : '0');
                        if (n > max_match_count) {
                            /*tex Can this happen at all? */
                            return;
                        } else {
                            break;
                        }
                    case end_match_cmd:
                        tex_print_str("->");
                        break;
                    case ignore_something_cmd:
                        break;
                    case set_font_cmd:
                        /*tex NOTE(review): this uses |cur_val| and not |chr|, to be confirmed. */
                        tex_print_format("[font->%s]", font_original(cur_val));
                        break;
                    case end_paragraph_cmd:
                     /* tex_print_format("%e%s", "par "); */
                        tex_print_str_esc("par ");
                        break;
                    default:
                        tex_print_str(tex_aux_special_cmd_string(cmd, chr, error_string_bad(43)));
                        break;
                }
         // } else if (token_info(p) == 0) {
         //     tex_print_str(error_string_bad(44));
            } else {
                tex_print_str(error_string_bad(42));
            }
            p = token_link(p);
        }
        /*tex The loop above only ends with a null |p| so this test can't succeed here. */
        if (p) {
            tex_print_str_esc("ETC.");
        }
    }
}

/*
# define do_buffer_to_unichar(a,b) do { \
    a = (halfword)str2uni(fileio_state.io_buffer+b); \
    b += utf8_size(a); \
} while (0)
*/

/*tex
    Fetch the character at position |*b| in the input buffer and move |*b| past it. A byte value
    up to and including |0x80| is taken as is, anything else is decoded by |aux_str2uni_len|,
    which also reports the number of bytes to skip.
*/

inline static halfword get_unichar_from_buffer(int *b)
{
    halfword a = (halfword) ((const unsigned char) *(lmt_fileio_state.io_buffer + *b));
    if (a <= 0x80) {
        *b += 1;
    } else {
        int al;
        a = (halfword) aux_str2uni_len(lmt_fileio_state.io_buffer + *b, &al);
        *b += al;
    }
    return a;
}

/*tex
    Here's the way we sometimes want to display a token list, given a pointer to its reference
    count; the pointer may be null.
*/

void tex_token_show(halfword p)
{
    if (p && token_link(p)) {
        tex_show_token_list(token_link(p), 0);
    }
}

/*tex

    The next function, |delete_token_ref|, is called when a pointer to a token list's reference
    count is being removed.
This means that the token list should disappear if the reference count was |null|,
    otherwise the count should be decreased by one. Variable |p| points to the reference count
    of a token list that is losing one reference.

*/

/*tex The category code of character |c|, as resolved by |tex_aux_the_cat_code|. */

int tex_get_char_cat_code(int c)
{
    return tex_aux_the_cat_code(c);
}

/*tex Report an invalid character in the input as a normal error with a short help text. */

static void tex_aux_invalid_character_error(void)
{
    tex_handle_error(
        normal_error_type,
        "Text line contains an invalid character",
        "A funny symbol that I can't read has just been input. Continue, and I'll forget\n"
        "that it ever happened."
    );
}

/*tex These helpers are only declared here; their definitions are not in this part. */

static int tex_aux_process_sup_mark(void);
static int tex_aux_scan_control_sequence(void);

typedef enum next_line_retval {
    next_line_ok,
    next_line_return,
    next_line_restart
} next_line_retval;

static inline next_line_retval tex_aux_next_line(void);

/*tex

    In case you are getting bored, here is a slightly less trivial routine: Given a string of
    lowercase letters, like |pt| or |plus| or |width|, the |scan_keyword| routine checks to see
    whether the next tokens of input match this string. The match must be exact, except that
    uppercase letters will match their lowercase counterparts; uppercase equivalents are
    determined by subtracting |"a" - "A"|, rather than using the |uc_code| table, since \TEX\
    uses this routine only for its own limited set of keywords.

    If a match is found, the characters are effectively removed from the input and |true| is
    returned. Otherwise |false| is returned, and the input is left essentially unchanged (except
    for the fact that some macros may have been expanded, etc.).

    In \LUATEX\ and its follow up we have more keywords and for instance when scanning a box
    specification that is noticeable because the |scan_keyword| function is a little inefficient
    in the sense that when there is no match, it will push back what got read so far. So there is
    token allocation, pushing a level etc involved. Keep in mind that expansion happens here so
    what gets pushing back is not always literally pushing back what we started with.
In \LUAMETATEX\ we now have a bit different approach. The |scan_mandate_keyword| follows up on |scan_character| so we have a two step approach. We could actually pass a list of valid keywords but that would make for a complex function with no real benefits. */ halfword tex_scan_character(const char *s, int left_brace, int skip_space, int skip_relax) { halfword save_cur_cs = cur_cs; while (1) { tex_get_x_token(); switch (cur_cmd) { case spacer_cmd: if (skip_space) { break; } else { goto DONE; } case relax_cmd: if (skip_relax) { break; } else { goto DONE; } case letter_cmd: case other_char_cmd: if (cur_chr <= 'z' && strchr(s, cur_chr)) { cur_cs = save_cur_cs; return cur_chr; } else { goto DONE; } case left_brace_cmd: if (left_brace) { cur_cs = save_cur_cs; return '{'; } else { goto DONE; } default: goto DONE; } } DONE: tex_back_input(cur_tok); cur_cs = save_cur_cs; return 0; } void tex_aux_show_keyword_error(const char *s) { tex_handle_error( normal_error_type, "Valid keyword expected, likely '%s'", s, "You started a keyword but it seems to be an invalid one. The first character(s)\n" "might give you a clue. You might want to quit unwanted lookahead with \\relax." ); } /*tex Scanning an optional keyword starts at the beginning. This means that we can also (for instance) have a minus or plus sign which means that we have a different loop than with the alternative that already checked the first character. */ int tex_scan_optional_keyword(const char *s) { halfword save_cur_cs = cur_cs; int done = 0; const char *p = s; while (*p) { tex_get_x_token(); switch (cur_cmd) { case letter_cmd: case other_char_cmd: if ((cur_chr == *p) || (cur_chr == *p - 'a' + 'A')) { if (*(++p)) { done = 1; } else { cur_cs = save_cur_cs; return 1; } } else if (done) { goto BAD_NEWS; } else { // can be a minus or so ! 
as in \advance\foo -10 tex_back_input(cur_tok); cur_cs = save_cur_cs; return 1; } break; case spacer_cmd: /* normally spaces are not pushed back */ if (done) { goto BAD_NEWS; } else { break; } // fall through default: tex_back_input(cur_tok); if (done) { /* unless we accept partial keywords */ goto BAD_NEWS; } else { cur_cs = save_cur_cs; return 0; } } } BAD_NEWS: tex_aux_show_keyword_error(s); cur_cs = save_cur_cs; return 0; } /*tex Here we know that the first character(s) matched so we are in the middle of a keyword already which means a different loop than the previous one. */ int tex_scan_mandate_keyword(const char *s, int offset) { halfword save_cur_cs = cur_cs; int done = 0; // int done = offset > 0; const char *p = s + offset; /* offset always > 0 so no issue with +/- */ while (*p) { tex_get_x_token(); switch (cur_cmd) { case letter_cmd: case other_char_cmd: if ((cur_chr == *p) || (cur_chr == *p - 'a' + 'A')) { if (*(++p)) { done = 1; } else { cur_cs = save_cur_cs; return 1; } } else { goto BAD_NEWS; } break; // case spacer_cmd: /* normally spaces are not pushed back */ // case relax_cmd: /* normally not, should be option */ // if (done) { // back_input(cur_tok); // goto BAD_NEWS; // } else { // break; // } // default: // goto BAD_NEWS; case spacer_cmd: /* normally spaces are not pushed back */ if (done) { goto BAD_NEWS; } else { break; } // fall through default: tex_back_input(cur_tok); /* unless we accept partial keywords */ goto BAD_NEWS; } } BAD_NEWS: tex_aux_show_keyword_error(s); cur_cs = save_cur_cs; return 0; } /* This is the original scanner with push|-|back. It's a matter of choice: we are more restricted on the one hand and more loose on the other. */ int tex_scan_keyword(const char *s) { if (*s) { halfword h = null; halfword p = null; halfword save_cur_cs = cur_cs; int n = 0; while (*s) { /*tex Recursion is possible here! 
*/ tex_get_x_token(); if ((cur_cmd == letter_cmd || cur_cmd == other_char_cmd) && ((cur_chr == *s) || (cur_chr == *s - 'a' + 'A'))) { p = tex_store_new_token(p, cur_tok); if (! h) { h = p; } n++; s++; } else if ((p != h) || (cur_cmd != spacer_cmd)) { tex_back_input(cur_tok); if (h) { tex_begin_backed_up_list(h); } cur_cs = save_cur_cs; return 0; } } if (h) { tex_flush_token_list_head_tail(h, p, n); } cur_cs = save_cur_cs; return 1; } else { /*tex but not with newtokenlib zero keyword simply doesn't match */ return 0 ; } } int tex_scan_keyword_case_sensitive(const char *s) { if (*s) { halfword h = null; halfword p = null; halfword save_cur_cs = cur_cs; int n = 0; while (*s) { tex_get_x_token(); if ((cur_cmd == letter_cmd || cur_cmd == other_char_cmd) && (cur_chr == *s)) { p = tex_store_new_token(p, cur_tok); if (! h) { h = p; } n++; s++; } else if ((p != h) || (cur_cmd != spacer_cmd)) { tex_back_input(cur_tok); if (h) { tex_begin_backed_up_list(h); } cur_cs = save_cur_cs; return 0; } } if (h) { tex_flush_token_list_head_tail(h, p, n); } cur_cs = save_cur_cs; return 1; } else { return 0 ; } } /*tex We can not return |undefined_control_sequence| under some conditions (inside |shift_case|, for example). This needs thinking. */ halfword tex_active_to_cs(int c, int force) { halfword cs = -1; if (c >= 0 && c <= max_character_code) { char utfbytes[8] = { active_character_first, active_character_second, active_character_third, 0 }; aux_uni2string((char *) &utfbytes[3], c); cs = tex_string_locate(utfbytes, (size_t) utf8_size(c) + 3, force); } if (cs < 0) { cs = tex_string_locate(active_character_unknown, 4, force); /*tex Including the zero sentinel. */ } return cs; } /*tex The heart of \TEX's input mechanism is the |get_next| procedure, which we shall develop in the next few sections of the program. Perhaps we shouldn't actually call it the \quote {heart}, however, because it really acts as \TEX's eyes and mouth, reading the source files and gobbling them up. 
    And it also helps \TEX\ to regurgitate stored token lists that are to be processed again.

    The main duty of |get_next| is to input one token and to set |cur_cmd| and |cur_chr| to that
    token's command code and modifier. Furthermore, if the input token is a control sequence, the
    |eqtb| location of that control sequence is stored in |cur_cs|; otherwise |cur_cs| is set to
    zero.

    Underlying this simple description is a certain amount of complexity because of all the cases
    that need to be handled. However, the inner loop of |get_next| is reasonably short and fast.

    When |get_next| is asked to get the next token of a |\read| line, it sets |cur_cmd = cur_chr =
    cur_cs = 0| in the case that no more tokens appear on that line. (There might not be any tokens
    at all, if the |end_line_char| has |ignore| as its catcode.)

    The value of |par_loc| is the |eqtb| address of |\par|. This quantity is needed because a blank
    line of input is supposed to be exactly equivalent to the appearance of |\par|; we must set
    |cur_cs := par_loc| when detecting a blank line.

    Parts of |get_next| are executed more often than any other instructions of \TEX.

    The global variable |force_eof| is normally |false|; it is set |true| by an |\endinput| command.
    |luacstrings| is the number of lua print statements waiting to be input, it is changed by
    |lmt_token_call|.

    If the user has set the |pausing| parameter to some positive value, and if nonstop mode has not
    been selected, each line of input is displayed on the terminal and the transcript file, followed
    by |=>|. \TEX\ waits for a response. If the response is simply |carriage_return|, the line is
    accepted as it stands, otherwise the line typed is used instead of the line in the file.

    We no longer need the following:

*/

// void firm_up_the_line(void)
// {
//     ilimit = fileio_state.io_last;
// }

/*tex

    The other variant gives less clutter in tracing cache usage when profiling and for some files
    (like the manual) also a bit of a speedup.
    Splitting the switch which gives 10 times less Bim in valgrind! See the \LUATEX\ source for
    that code.

    The big switch changes the state if necessary, and |goto switch| if the current character
    should be ignored, or |goto reswitch| if the current character changes to another.

    The n-way switch accomplishes the scanning quickly, assuming that a decent \CCODE\ compiler has
    translated the code. Note that the numeric values for |mid_line|, |skip_blanks|, and |new_line|
    are spaced apart from each other by |max_char_code+1|, so we can add a character's command code
    to the state to get a single number that characterizes both.

    Remark: checking performance indicated that this switch was the cause of many branch prediction
    errors but changing it to:

    \starttyping
    c = istate + cur_cmd;
    if (c == (mid_line_state + letter_cmd) || c == (mid_line_state + other_char_cmd)) {
        return 1;
    } else if (c >= new_line_state) {
        switch (c) {
        }
    } else if (c >= skip_blanks_state) {
        switch (c) {
        }
    } else if (c >= mid_line_state) {
        switch (c) {
        }
    } else {
        istate = mid_line_state;
        return 1;
    }
    \stoptyping

    This gives as many prediction errors. So, we can indeed assume that the compiler does the right
    job, or that there is simply no other way.

    When a line is finished a space is emitted. When a character of type |spacer| gets through, its
    character code is changed to |\ =040|. This means that the \ASCII\ codes for tab and space, and
    for the space inserted at the end of a line, will be treated alike when macro parameters are
    being matched. We do this since such characters are indistinguishable on most computer terminal
    displays.

*/

/*
    c = istate + cur_cmd;
    if (c == (mid_line_state + letter_cmd) || c == (mid_line_state + other_char_cmd)) {
        return 1;
    } else if (c >= new_line_state) {
        ....
    }
*/

/*tex

    This trick has been dropped when the wrapup mechanism had proven to be useful.
The idea was to backport this to \LUATEX\ but some other \PDFTEX\ compatible parstuff made it there and backporting par related features becomes too messy. \starttyping lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1; cur_cs = lmt_token_state.line_par_loc; cur_cmd = eq_type(cur_cs); if (cur_cmd == undefined_cs_cmd) { cur_cs = lmt_token_state.par_loc; cur_cmd = eq_type(cur_cs); } cur_chr = eq_value(cur_cs); \stoptyping */ static int tex_aux_get_next_file(void) { SWITCH: if (lmt_input_state.cur_input.loc <= lmt_input_state.cur_input.limit) { /*tex current line not yet finished */ cur_chr = get_unichar_from_buffer(&lmt_input_state.cur_input.loc); RESWITCH: if (lmt_input_state.cur_input.cattable == no_catcode_table_preset) { /* happens seldom: detokenized line */ cur_cmd = cur_chr == ' ' ? spacer_cmd : other_char_cmd; } else { cur_cmd = tex_aux_the_cat_code(cur_chr); } switch (lmt_input_state.cur_input.state + cur_cmd) { case mid_line_state + ignore_cmd: case skip_blanks_state + ignore_cmd: case new_line_state + ignore_cmd: case skip_blanks_state + spacer_cmd: case new_line_state + spacer_cmd: /*tex Cases where character is ignored. */ goto SWITCH; case mid_line_state + escape_cmd: case new_line_state + escape_cmd: case skip_blanks_state + escape_cmd: /*tex Scan a control sequence. */ lmt_input_state.cur_input.state = (unsigned char) tex_aux_scan_control_sequence(); break; case mid_line_state + active_char_cmd: case new_line_state + active_char_cmd: case skip_blanks_state + active_char_cmd: /*tex Process an active-character. */ if ((lmt_input_state.scanner_status == scanner_is_tolerant || lmt_input_state.scanner_status == scanner_is_matching) && tex_pass_active_math_char(cur_chr)) { /*tex We need to intercept a delimiter in arguments. 
*/ } else if ((lmt_input_state.scanner_status == scanner_is_defining || lmt_input_state.scanner_status == scanner_is_absorbing) && tex_pass_active_math_char(cur_chr)) { /*tex We are storing stuff in a token list or macro body. */ } else if ((cur_mode == mmode || lmt_nest_state.math_mode) && tex_check_active_math_char(cur_chr)) { /*tex We have an intercept. */ } else { cur_cs = tex_active_to_cs(cur_chr, ! lmt_hash_state.no_new_cs); cur_cmd = eq_type(cur_cs); cur_chr = eq_value(cur_cs); } lmt_input_state.cur_input.state = mid_line_state; break; case mid_line_state + superscript_cmd: case new_line_state + superscript_cmd: case skip_blanks_state + superscript_cmd: /*tex We need to check for multiple ^: (0) always check for ^^ ^^^^ ^^^^^^^ (1) only check in text mode (*) never */ if (sup_mark_mode_par) { if (sup_mark_mode_par == 1 && cur_mode != mmode && tex_aux_process_sup_mark()) { goto RESWITCH; } } else if (tex_aux_process_sup_mark()) { goto RESWITCH; } else { /*tex We provide prescripts and shifted script in math mode and avoid fance |^| processing in text mode (which is what we do in \CONTEXT). */ } lmt_input_state.cur_input.state = mid_line_state; break; case mid_line_state + invalid_char_cmd: case new_line_state + invalid_char_cmd: case skip_blanks_state + invalid_char_cmd: /*tex Decry the invalid character and |goto restart|. */ tex_aux_invalid_character_error(); /*tex Because state may be |token_list| now: */ return 0; case mid_line_state + spacer_cmd: /*tex Enter |skip_blanks| state, emit a space. */ lmt_input_state.cur_input.state = skip_blanks_state; cur_chr = ' '; break; case mid_line_state + end_line_cmd: /*tex Finish the line. See note above about dropped |\linepar|. 
*/ lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1; cur_cmd = spacer_cmd; cur_chr = ' '; break; case skip_blanks_state + end_line_cmd: case mid_line_state + comment_cmd: case new_line_state + comment_cmd: case skip_blanks_state + comment_cmd: /*tex Finish line, |goto switch|; */ lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1; goto SWITCH; case new_line_state + end_line_cmd: if (! auto_paragraph_mode(auto_paragraph_go_on)) { lmt_input_state.cur_input.loc = lmt_input_state.cur_input.limit + 1; } /*tex Finish line, emit a |\par|; */ if (auto_paragraph_mode(auto_paragraph_text)) { cur_cs = null; cur_cmd = end_paragraph_cmd; cur_chr = new_line_end_paragraph_code; // cur_chr = normal_end_paragraph_code; } else { cur_cs = lmt_token_state.par_loc; cur_cmd = eq_type(cur_cs); cur_chr = eq_value(cur_cs); } break; case skip_blanks_state + left_brace_cmd: case new_line_state + left_brace_cmd: lmt_input_state.cur_input.state = mid_line_state; lmt_input_state.align_state++; break; case mid_line_state + left_brace_cmd: lmt_input_state.align_state++; break; case skip_blanks_state + right_brace_cmd: case new_line_state + right_brace_cmd: lmt_input_state.cur_input.state = mid_line_state; lmt_input_state.align_state--; break; case mid_line_state + right_brace_cmd: lmt_input_state.align_state--; break; case mid_line_state + math_shift_cmd: case mid_line_state + alignment_tab_cmd: case mid_line_state + parameter_cmd: case mid_line_state + subscript_cmd: case mid_line_state + letter_cmd: case mid_line_state + other_char_cmd: break; /* case skip_blanks_state + math_shift_cmd: case skip_blanks_state + alignment_tab_cmd: case skip_blanks_state + parameter_cmd: case skip_blanks_state + subscript_cmd: case skip_blanks_state + letter_cmd: case skip_blanks_state + other_char_cmd: case new_line_state + math_shift_cmd: case new_line_state + alignment_tab_cmd: case new_line_state + parameter_cmd: case new_line_state + subscript_cmd: case new_line_state + 
letter_cmd: case new_line_state + other_char_cmd: */ default: lmt_input_state.cur_input.state = mid_line_state; break; } } else { if (! io_token_input(lmt_input_state.cur_input.name)) { lmt_input_state.cur_input.state = new_line_state; } /*tex Move to next line of file, or |goto restart| if there is no next line, or |return| if a |\read| line has finished. */ do { next_line_retval r = tex_aux_next_line(); if (r == next_line_restart) { /*tex This happens more often. */ return 0; } else if (r == next_line_return) { return 1; } } while (0); /* check_interrupt(); */ goto SWITCH; } return 1; } /*tex Notice that a code like |^^8| becomes |x| if not followed by a hex digit. We only support a limited set: \starttyping ^^^^^^XXXXXX ^^^^XXXXXX ^^XX ^^ \stoptyping */ # define is_hex(a) ((a >= '0' && a <= '9') || (a >= 'a' && a <= 'f')) inline static halfword tex_aux_two_hex_to_cur_chr(int c1, int c2) { return 0x10 * (c1 <= '9' ? c1 - '0' : c1 - 'a' + 10) + 0x01 * (c2 <= '9' ? c2 - '0' : c2 - 'a' + 10); } inline static halfword tex_aux_four_hex_to_cur_chr(int c1, int c2,int c3, int c4) { return 0x1000 * (c1 <= '9' ? c1 - '0' : c1 - 'a' + 10) + 0x0100 * (c2 <= '9' ? c2 - '0' : c2 - 'a' + 10) + 0x0010 * (c3 <= '9' ? c3 - '0' : c3 - 'a' + 10) + 0x0001 * (c4 <= '9' ? c4 - '0' : c4 - 'a' + 10); } inline static halfword tex_aux_six_hex_to_cur_chr(int c1, int c2, int c3, int c4, int c5, int c6) { return 0x100000 * (c1 <= '9' ? c1 - '0' : c1 - 'a' + 10) + 0x010000 * (c2 <= '9' ? c2 - '0' : c2 - 'a' + 10) + 0x001000 * (c3 <= '9' ? c3 - '0' : c3 - 'a' + 10) + 0x000100 * (c4 <= '9' ? c4 - '0' : c4 - 'a' + 10) + 0x000010 * (c5 <= '9' ? c5 - '0' : c5 - 'a' + 10) + 0x000001 * (c6 <= '9' ? 
c6 - '0' : c6 - 'a' + 10); } static int tex_aux_process_sup_mark(void) { if (cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc]) { if (lmt_input_state.cur_input.loc < lmt_input_state.cur_input.limit) { if ((cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1]) && (cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 2])) { if ((cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 3]) && (cur_chr == lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 4])) { if ((lmt_input_state.cur_input.loc + 10) <= lmt_input_state.cur_input.limit) { /*tex |^^^^^^XXXXXX| */ int c1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 5]; int c2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 6]; int c3 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 7]; int c4 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 8]; int c5 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 9]; int c6 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 10]; if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4) && is_hex(c5) && is_hex(c6)) { lmt_input_state.cur_input.loc += 11; cur_chr = tex_aux_six_hex_to_cur_chr(c1, c2, c3, c4, c5, c6); return 1; } else { tex_handle_error( normal_error_type, "^^^^^^ needs six hex digits", NULL ); } } else { tex_handle_error( normal_error_type, "^^^^^^ needs six hex digits, end of input", NULL ); } } else if ((lmt_input_state.cur_input.loc + 6) <= lmt_input_state.cur_input.limit) { /*tex |^^^^XXXX| */ int c1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 3]; int c2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 4]; int c3 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 5]; int c4 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 6]; if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4)) { lmt_input_state.cur_input.loc += 7; cur_chr = tex_aux_four_hex_to_cur_chr(c1, c2, c3, 
c4); return 1; } else { tex_handle_error( normal_error_type, "^^^^ needs four hex digits", NULL ); } } else { tex_handle_error( normal_error_type, "^^^^ needs four hex digits, end of input", NULL ); } } else if ((lmt_input_state.cur_input.loc + 2) <= lmt_input_state.cur_input.limit) { /*tex |^^XX| */ int c1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1]; int c2 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 2]; if (is_hex(c1) && is_hex(c2)) { lmt_input_state.cur_input.loc += 3; cur_chr = tex_aux_two_hex_to_cur_chr(c1, c2); return 1; } } /*tex The single character case: */ { int c1 = lmt_fileio_state.io_buffer[lmt_input_state.cur_input.loc + 1]; if (c1 < 0200) { lmt_input_state.cur_input.loc = lmt_input_state.cur_input.loc + 2; // if (is_hex(c1) && (iloc <= ilimit)) { // int c2 = fileio_state.io_buffer[iloc]; // if (is_hex(c2)) { // ++iloc; // cur_chr = two_hex_to_cur_chr(c1, c2); // return 1; // } // } // /*tex The somewhat odd cases, often special control characters: */ cur_chr = (c1 < 0100 ? c1 + 0100 : c1 - 0100); return 1; } } } } return 0; } /*tex Control sequence names are scanned only when they appear in some line of a file. Once they have been scanned the first time, their |eqtb| location serves as a unique identification, so \TEX\ doesn't need to refer to the original name any more except when it prints the equivalent in symbolic form. The program that scans a control sequence has been written carefully in order to avoid the blowups that might otherwise occur if a malicious user tried something like |\catcode'15 = 0|. The algorithm might look at |buffer[ilimit + 1]|, but it never looks at |buffer[ilimit + 2]|. If expanded characters like |^^A| or |^^df| appear in or just following a control sequence name, they are converted to single characters in the buffer and the process is repeated, slowly but surely. 
*/

/*tex

    Whenever we reach the following piece of code, we will have |cur_chr = buffer[k - 1]| and |k <=
    ilimit + 1| and |cat = get_cat_code(cat_code_table, cur_chr)|. If an expanded code like |^^A| or
    |^^df| appears in |buffer[(k - 1) .. (k + 1)]| or |buffer[(k - 1) .. (k + 2)]|, we will store
    the corresponding code in |buffer[k - 1]| and shift the rest of the buffer left two or three
    places.

    The function returns zero when nothing was changed: either the check is disabled by
    |sup_mark_mode_par| (a value above one, or one while in math mode) or no second superscript
    character follows. Otherwise the buffer has been rewritten in place (the resulting character is
    stored \UTF\ encoded), |limit| has been lowered, |*kk| and |*chr| have been updated, |cur_chr|
    is set to the new character and one is returned.

*/

static int tex_aux_check_expanded_code(int *kk, halfword *chr)
{
    if (sup_mark_mode_par > 1 || (sup_mark_mode_par == 1 && cur_mode == mmode)) {
        return 0;
    } else {
        int k = *kk;
        /* chr is the ^ character or an equivalent one */
        if (lmt_fileio_state.io_buffer[k] == *chr && k < lmt_input_state.cur_input.limit) {
            /*tex
                Here |d| first counts the hex digits (or is one for the single character form); it
                is converted below to the number of buffer positions that become superfluous.
            */
            int d = 1;
            int l;
            if ((*chr == lmt_fileio_state.io_buffer[k + 1]) && (*chr == lmt_fileio_state.io_buffer[k + 2])) {
                if ((*chr == lmt_fileio_state.io_buffer[k + 3]) && (*chr == lmt_fileio_state.io_buffer[k + 4])) {
                    /*tex Six superscript characters: six hex digits are expected. */
                    if ((k + 10) <= lmt_input_state.cur_input.limit) {
                        int c1 = lmt_fileio_state.io_buffer[k + 6 - 1];
                        int c2 = lmt_fileio_state.io_buffer[k + 6    ];
                        int c3 = lmt_fileio_state.io_buffer[k + 6 + 1];
                        int c4 = lmt_fileio_state.io_buffer[k + 6 + 2];
                        int c5 = lmt_fileio_state.io_buffer[k + 6 + 3];
                        int c6 = lmt_fileio_state.io_buffer[k + 6 + 4];
                        if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4) && is_hex(c5) && is_hex(c6)) {
                            d = 6;
                            *chr = tex_aux_six_hex_to_cur_chr(c1, c2, c3, c4, c5, c6);
                        } else {
                            tex_handle_error(
                                normal_error_type,
                                "^^^^^^ needs six hex digits",
                                NULL
                            );
                        }
                    } else {
                        tex_handle_error(
                            normal_error_type,
                            "^^^^^^ needs six hex digits, end of input",
                            NULL
                        );
                    }
                } else if ((k + 6) <= lmt_input_state.cur_input.limit) {
                    /*tex Four superscript characters: four hex digits are expected. */
                    int c1 = lmt_fileio_state.io_buffer[k + 4 - 1];
                    int c2 = lmt_fileio_state.io_buffer[k + 4    ];
                    int c3 = lmt_fileio_state.io_buffer[k + 4 + 1];
                    int c4 = lmt_fileio_state.io_buffer[k + 4 + 2];
                    if (is_hex(c1) && is_hex(c2) && is_hex(c3) && is_hex(c4)) {
                        d = 4;
                        *chr = tex_aux_four_hex_to_cur_chr(c1, c2, c3, c4);
                    } else {
                        tex_handle_error(
                            normal_error_type,
                            "^^^^ needs four hex digits",
                            NULL
                        );
                    }
                } else {
                    tex_handle_error(
                        normal_error_type,
                        "^^^^ needs four hex digits, end of input",
                        NULL
                    );
                }
            } else {
                /*tex Two superscript characters: two hex digits or a single character. */
                int c1 = lmt_fileio_state.io_buffer[k + 1];
                if (c1 < 0200) { /* really ? */
                    d = 1;
                    if (is_hex(c1) && (k + 2) <= lmt_input_state.cur_input.limit) {
                        int c2 = lmt_fileio_state.io_buffer[k + 2];
                        if (is_hex(c2)) {
                            d = 2;
                            *chr = tex_aux_two_hex_to_cur_chr(c1, c2);
                        } else {
                            *chr = (c1 < 0100 ? c1 + 0100 : c1 - 0100);
                        }
                    } else {
                        *chr = (c1 < 0100 ? c1 + 0100 : c1 - 0100);
                    }
                }
            }
            /*tex
                Positions to drop when the result takes one byte: 2 or 3 for the short forms, 7
                for four digits and 11 for six digits. Each extra \UTF\ byte written below lowers
                this by one.
            */
            if (d > 2) {
                d = 2 * d - 1;
            } else {
                d++;
            }
            /*tex Store the character \UTF\ encoded, starting at |buffer[k - 1]|. */
            if (*chr <= 0x7F) {
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) *chr;
            } else if (*chr <= 0x7FF) {
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xC0 + *chr / 0x40);
                k++;
                d--;
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + *chr % 0x40);
            } else if (*chr <= 0xFFFF) {
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xE0 + *chr / 0x1000);
                k++;
                d--;
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x1000) / 0x40);
                k++;
                d--;
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x1000) % 0x40);
            } else {
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0xF0 + *chr / 0x40000);
                k++;
                d--;
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + (*chr % 0x40000) / 0x1000);
                k++;
                d--;
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + ((*chr % 0x40000) % 0x1000) / 0x40);
                k++;
                d--;
                lmt_fileio_state.io_buffer[k - 1] = (unsigned char) (0x80 + ((*chr % 0x40000) % 0x1000) % 0x40);
            }
            /*tex Shift the remainder of the line |d| positions to the left. */
            l = k;
            lmt_input_state.cur_input.limit -= d;
            while (l <= lmt_input_state.cur_input.limit) {
                lmt_fileio_state.io_buffer[l] = lmt_fileio_state.io_buffer[l + d];
                l++;
            }
            *kk = k;
            cur_chr = *chr; /* hm */
            return 1;
        } else {
            return 0;
        }
    }
}

/*tex
    Scan the control sequence name that starts at the current buffer position, set |cur_cs|,
    |cur_cmd| and |cur_chr| accordingly, move the buffer position past the name, and return the
    scanner state that applies afterwards: |skip_blanks_state| after a name made of letters or a
    single space character, |mid_line_state| otherwise. Whenever a superscript notation gets
    collapsed in the buffer the scan starts all over (the |continue|).
*/

static int tex_aux_scan_control_sequence(void)
{
    int state = mid_line_state;
    if (lmt_input_state.cur_input.loc > lmt_input_state.cur_input.limit) {
        /*tex |state| is irrelevant in this case. */
        cur_cs = null_cs;
    } else {
        /*tex |cat_code(cur_chr)|, usually: */
        while (1) {
            /*tex We work with a local position; the real one is only updated at the end. */
            int loc = lmt_input_state.cur_input.loc;
            halfword chr = get_unichar_from_buffer(&loc);
            halfword cat = tex_aux_the_cat_code(chr);
            if (cat != letter_cmd || loc > lmt_input_state.cur_input.limit) {
                /*tex A name of one character (or a letter that ends the line). */
                if (cat == spacer_cmd) {
                    state = skip_blanks_state;
                } else {
                    state = mid_line_state;
                    if (cat == superscript_cmd && tex_aux_check_expanded_code(&loc, &chr)) {
                        continue;
                    }
                }
            } else {
                /*tex A name of letters: collect until a non letter or the end of the line. */
                state = skip_blanks_state;
                do {
                    chr = get_unichar_from_buffer(&loc);
                    cat = tex_aux_the_cat_code(chr);
                } while (cat == letter_cmd && loc <= lmt_input_state.cur_input.limit);
                /*tex If an expanded \unknown */
                if (cat == superscript_cmd && tex_aux_check_expanded_code(&loc, &chr)) {
                    continue;
                } else if (cat != letter_cmd) {
                    /*tex Backtrack one character which can be \UTF. */
                    if (chr <= 0x7F) {
                        loc -= 1; /* in most cases */
                    } else if (chr > 0xFFFF) {
                        loc -= 4;
                    } else if (chr > 0x7FF) {
                        loc -= 3;
                    } else /* if (cur_chr > 0x7F) */ {
                        loc -= 2;
                    }
                    /*tex Now |k| points to first nonletter. */
                }
            }
            cur_cs = tex_id_locate(lmt_input_state.cur_input.loc, loc - lmt_input_state.cur_input.loc, ! lmt_hash_state.no_new_cs);
            lmt_input_state.cur_input.loc = loc;
            break;
        }
    }
    cur_cmd = eq_type(cur_cs);
    cur_chr = eq_value(cur_cs);
    return state;
}

/*tex

    All of the easy branches of |get_next| have now been taken care of. There is one more branch.
    Conversely, the |file_warning| procedure is invoked when a file ends and some groups entered or
    conditionals started while reading from that file are still incomplete.
*/ static void tex_aux_file_warning(void) { { // save_state_info saved_save_stack_data = lmt_save_state; halfword saved_stack_ptr = lmt_save_state.save_stack_data.ptr; quarterword saved_group = cur_group; quarterword saved_level = cur_level; lmt_save_state.save_stack_data.ptr = cur_boundary; while (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].group != lmt_save_state.save_stack_data.ptr) { --cur_level; tex_print_nlp(); tex_print_format("Warning: end of file when %G is incomplete", 1); cur_group = save_level(lmt_save_state.save_stack_data.ptr); lmt_save_state.save_stack_data.ptr = save_value(lmt_save_state.save_stack_data.ptr); } // lmt_save_state = saved_save_stack_data; lmt_save_state.save_stack_data.ptr = saved_stack_ptr; cur_level = saved_level; cur_group = saved_group; } { condition_state_info saved_condition_state = lmt_condition_state; // halfword cond_ptr = lmt_condition_state.cond_ptr; // int cur_if = lmt_condition_state.cur_if; // int cur_unless = lmt_condition_state.cur_unless; // int if_step = lmt_condition_state.if_step; // int if_unless = lmt_condition_state.if_unless; // int if_limit = lmt_condition_state.if_limit; // int if_line = lmt_condition_state.if_line; while (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].if_ptr != lmt_condition_state.cond_ptr) { /* todo, more info */ tex_print_nlp(); tex_print_format("Warning: end of file when %C", if_test_cmd, lmt_condition_state.cur_if); if (lmt_condition_state.if_limit == fi_code) { tex_print_str_esc("else"); } if (lmt_condition_state.if_line) { tex_print_format(" entered on line %i", lmt_condition_state.if_line); } tex_print_str(" is incomplete"); lmt_condition_state.cur_if = if_limit_subtype(lmt_condition_state.cond_ptr); lmt_condition_state.cur_unless = if_limit_unless(lmt_condition_state.cond_ptr); lmt_condition_state.if_step = if_limit_step(lmt_condition_state.cond_ptr); lmt_condition_state.if_unless = if_limit_stepunless(lmt_condition_state.cond_ptr); 
lmt_condition_state.if_limit = if_limit_type(lmt_condition_state.cond_ptr); lmt_condition_state.if_line = if_limit_line(lmt_condition_state.cond_ptr); lmt_condition_state.cond_ptr = node_next(lmt_condition_state.cond_ptr); } lmt_condition_state = saved_condition_state; // lmt_condition_state.cond_ptr = cond_ptr; // lmt_condition_state.cur_if = cur_if; // lmt_condition_state.cur_unless = cur_unless; // lmt_condition_state.if_step = if_step; // lmt_condition_state.if_unless = if_unless; // lmt_condition_state.if_limit = if_limit; // lmt_condition_state.if_line = if_line; } tex_print_nlp(); if (tracing_nesting_par > 1) { tex_show_context(); } if (lmt_error_state.history == spotless) { lmt_error_state.history = warning_issued; } } static void tex_aux_check_validity(void) { switch (lmt_input_state.scanner_status) { case scanner_is_normal: break; case scanner_is_skipping: tex_handle_error( condition_error_type, "The file ended while I was skipping conditional text.", "This kind of error happens when you say '\\if...' and forget the\n" "matching '\\fi'. It can also be that you use '\\orelse' or '\\orunless\n'" "in the wrong way. Or maybe a forbidden control sequence was encountered." ); break; case scanner_is_defining: tex_handle_error(runaway_error_type, "The file ended when scanning a definition.", NULL); break; case scanner_is_matching: tex_handle_error(runaway_error_type, "The file ended when scanning an argument.", NULL); break; case scanner_is_tolerant: break; case scanner_is_aligning: tex_handle_error(runaway_error_type, "The file ended when scanning an alignment preamble.", NULL); break; case scanner_is_absorbing: tex_handle_error(runaway_error_type, "The file ended when absorbing something.", NULL); break; } } static inline next_line_retval tex_aux_next_line(void) { if (lmt_input_state.cur_input.name > io_initial_input_code) { /*tex Read next line of file into |buffer|, or |goto restart| if the file has ended. 
*/ unsigned inhibit_eol = 0; ++lmt_input_state.input_line; lmt_fileio_state.io_first = lmt_input_state.cur_input.start; if (! lmt_token_state.force_eof) { unsigned force_eol = 0; switch (lmt_input_state.cur_input.name) { case io_lua_input_code: { halfword result = null; int cattable = 0; int partial = 0; int finalline = 0; int type = lmt_cstring_input(&result, &cattable, &partial, &finalline); switch (type) { case eof_tex_input: lmt_token_state.force_eof = 1; break; case string_tex_input: /*tex string */ lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */ lmt_input_state.cur_input.cattable = (short) cattable; lmt_input_state.cur_input.partial = (signed char) partial; if (finalline || partial || cattable == no_catcode_table_preset) { inhibit_eol = 1; } if (! partial) { lmt_input_state.cur_input.state = new_line_state; } break; case token_tex_input: /*tex token */ if (result >= cs_token_flag && eq_type(result - cs_token_flag) == input_cmd && eq_value(result - cs_token_flag) == end_of_input_code && lmt_input_state.cur_input.index > 0) { tex_end_file_reading(); } tex_back_input(result); return next_line_restart; case token_list_tex_input: /*tex token */ tex_begin_backed_up_list(result); return next_line_restart; case node_tex_input: /*tex node */ if (node_token_overflow(result)) { tex_back_input(token_val(ignore_cmd, node_token_lsb(result))); tex_reinsert_token(token_val(node_cmd, node_token_msb(result))); return next_line_restart; } else { /*tex |0x10FFFF == 1114111| */ tex_back_input(token_val(node_cmd, result)); return next_line_restart; } default: lmt_token_state.force_eof = 1; break; } break; } case io_token_input_code: case io_token_eof_input_code: { /* can be simplified but room for extensions now */ halfword result = null; int cattable = 0; int partial = 0; int finalline = 0; int type = lmt_cstring_input(&result, &cattable, &partial, &finalline); switch (type) { case eof_tex_input: lmt_token_state.force_eof = 1; if 
(lmt_input_state.cur_input.name == io_token_eof_input_code && every_eof_par) { force_eol = 1; } break; case string_tex_input: /*tex string */ lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */ lmt_input_state.cur_input.cattable = (short) cattable; lmt_input_state.cur_input.partial = (signed char) partial; inhibit_eol = lmt_input_state.cur_input.name != io_token_eof_input_code; if (! partial) { lmt_input_state.cur_input.state = new_line_state; } break; default: lmt_token_state.force_eof = 1; break; } break; } case io_tex_macro_code: /* this can't happen and will fail with the next line check */ default: if (tex_lua_input_ln()) { /*tex Not end of file, set |ilimit|. */ lmt_input_state.cur_input.limit = lmt_fileio_state.io_last; /*tex Was |firm_up_the_line();|. */ lmt_input_state.cur_input.cattable = default_catcode_table_preset; } else if (every_eof_par && (! lmt_input_state.in_stack[lmt_input_state.cur_input.index].end_of_file_seen)) { force_eol = 1; } else { tex_aux_check_validity(); lmt_token_state.force_eof = 1; } break; } if (force_eol) { lmt_input_state.cur_input.limit = lmt_fileio_state.io_first - 1; /* tex Fake one empty line. */ lmt_input_state.in_stack[lmt_input_state.cur_input.index].end_of_file_seen = 1; tex_begin_token_list(every_eof_par, every_eof_text); return next_line_restart; } } if (lmt_token_state.force_eof) { if (tracing_nesting_par > 0) { if ((lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].group != cur_boundary) || (lmt_input_state.in_stack[lmt_input_state.in_stack_data.ptr].if_ptr != lmt_condition_state.cond_ptr)) { if (! io_token_input(lmt_input_state.cur_input.name)) { /*tex Give warning for some unfinished groups and/or conditionals. 
*/ tex_aux_file_warning(); } } } if (io_file_input(lmt_input_state.cur_input.name)) { tex_report_stop_file(); --lmt_input_state.open_files; } lmt_token_state.force_eof = 0; tex_end_file_reading(); return next_line_restart; } else { if (inhibit_eol || end_line_char_inactive) { lmt_input_state.cur_input.limit--; } else { lmt_fileio_state.io_buffer[lmt_input_state.cur_input.limit] = (unsigned char) end_line_char_par; } lmt_fileio_state.io_first = lmt_input_state.cur_input.limit + 1; lmt_input_state.cur_input.loc = lmt_input_state.cur_input.start; /*tex We're ready to read. */ } } else if (lmt_input_state.input_stack_data.ptr > 0) { cur_cmd = 0; cur_chr = 0; return next_line_return; } else { /*tex A somewhat weird check: */ switch (lmt_print_state.selector) { case no_print_selector_code: case terminal_selector_code: tex_open_log_file(); break; } tex_handle_error(eof_error_type, "end of file encountered", NULL); /*tex Just in case it is not handled in a callback: */ if (lmt_error_state.interaction > nonstop_mode) { tex_fatal_error("aborting job"); } } /*tex We're in a loop and restart: */ return next_line_ok; } /*tex Let's consider now what happens when |get_next| is looking at a token list. */ static int tex_aux_get_next_tokenlist(void) { halfword t = token_info(lmt_input_state.cur_input.loc); /*tex Move to next. */ lmt_input_state.cur_input.loc = token_link(lmt_input_state.cur_input.loc); if (t >= cs_token_flag) { /*tex A control sequence token */ cur_cs = t - cs_token_flag; cur_cmd = eq_type(cur_cs); if (cur_cmd == deep_frozen_dont_expand_cmd) { /*tex Get the next token, suppressing expansion. The present point in the program is reached only when the |expand| routine has inserted a special marker into the input. In this special case, |token_info(iloc)| is known to be a control sequence token, and |token_link(iloc) = null|. 
*/ cur_cs = token_info(lmt_input_state.cur_input.loc) - cs_token_flag; lmt_input_state.cur_input.loc = null; cur_cmd = eq_type(cur_cs); if (cur_cmd > max_command_cmd) { cur_cmd = relax_cmd; // cur_chr = no_expand_flag; cur_chr = no_expand_relax_code; return 1; } } cur_chr = eq_value(cur_cs); } else { cur_cmd = token_cmd(t); cur_chr = token_chr(t); switch (cur_cmd) { case left_brace_cmd: lmt_input_state.align_state++; break; case right_brace_cmd: lmt_input_state.align_state--; break; case parameter_reference_cmd: /*tex Insert macro parameter and |goto restart|. */ tex_begin_parameter_list(lmt_input_state.parameter_stack[lmt_input_state.cur_input.parameter_start + cur_chr - 1]); return 0; } } return 1; } /*tex Now we're ready to take the plunge into |get_next| itself. Parts of this routine are executed more often than any other instructions of \TEX. This sets |cur_cmd|, |cur_chr|, |cur_cs| to next token. Handling alignments is interwoven because there we switch between constructing cells and rows (node lists) based on templates that are token lists. This is why in several places we find checks for |align_state|. */ void tex_get_next(void) { while (1) { cur_cs = 0; if (lmt_input_state.cur_input.state != token_list_state) { /*tex Input from external file, |goto restart| if no input found. */ if (! tex_aux_get_next_file()) { continue; } else { /*tex Check align state later on! */ } } else if (! lmt_input_state.cur_input.loc) { /*tex List exhausted, resume previous level. */ tex_end_token_list(); continue; } else if (! tex_aux_get_next_tokenlist()) { /*tex Parameter needs to be expanded. */ continue; } if ((lmt_input_state.align_state == 0) && (cur_cmd == alignment_tab_cmd || cur_cmd == alignment_cmd)) { /*tex If an alignment entry has just ended, take appropriate action. 
*/ tex_insert_alignment_template(); continue; } else { break; } } } void tex_get_next_non_spacer(void) { while (1) { cur_cs = 0; if (lmt_input_state.cur_input.state != token_list_state) { /*tex Input from external file, |goto restart| if no input found. */ if (! tex_aux_get_next_file()) { continue; } else { /*tex Check align state later on! */ } } else if (! lmt_input_state.cur_input.loc) { /*tex List exhausted, resume previous level. */ tex_end_token_list(); continue; } else if (! tex_aux_get_next_tokenlist()) { /*tex Parameter needs to be expanded. */ continue; } switch (cur_cmd) { case spacer_cmd: continue; case alignment_tab_cmd: case alignment_cmd: if (lmt_input_state.align_state == 0) { /*tex If an alignment entry has just ended, take appropriate action. */ tex_insert_alignment_template(); continue; } else { return; } default: return; } } } /*tex Since |get_next| is used so frequently in \TEX, it is convenient to define three related procedures that do a little more: \startitemize \startitem |get_token| not only sets |cur_cmd| and |cur_chr|, it also sets |cur_tok|, a packed halfword version of the current token. \stopitem \startitem |get_x_token|, meaning \quote {get an expanded token}, is like |get_token|, but if the current token turns out to be a user-defined control sequence (i.e., a macro call), or a conditional, or something like |\topmark| or |\expandafter| or |\csname|, it is eliminated from the input by beginning the expansion of the macro or the evaluation of the conditional. \stopitem \startitem |x_token| is like |get_x_token| except that it assumes that |get_next| has already been called. \stopitem \stopitemize In fact, these three procedures account for almost every use of |get_next|. No new control sequences will be defined except during a call of |get_token|, or when |\csname| compresses a token list, because |no_new_control_sequence| is always |true| at other times. This sets |cur_cmd|, |cur_chr|, |cur_tok|. 
For convenience we also return the token because in some places we store it and then some direct assignment looks a bit nicer. */ halfword tex_get_token(void) { lmt_hash_state.no_new_cs = 0; tex_get_next(); lmt_hash_state.no_new_cs = 1; cur_tok = cur_cs ? cs_token_flag + cur_cs : token_val(cur_cmd, cur_chr); return cur_tok; } /*tex The |get_x_or_protected| procedure is like |get_x_token| except that protected macros are not expanded. It sets |cur_cmd|, |cur_chr|, |cur_tok|, and expands non-protected macros. */ void tex_get_x_or_protected(void) { lmt_hash_state.no_new_cs = 0; while (1) { tex_get_next(); if (cur_cmd <= max_command_cmd || is_protected_cmd(cur_cmd)) { break; } else { tex_expand_current_token(); } } cur_tok = cur_cs ? cs_token_flag + cur_cs : token_val(cur_cmd, cur_chr); /* needed afterwards ? */ lmt_hash_state.no_new_cs = 1; } /*tex This changes the string |s| to a token list. */ halfword tex_string_to_toks(const char *ss) { const char *s = ss; const char *se = ss + strlen(s); /*tex tail of the token list */ halfword h = null; halfword p = null; /*tex new node being added to the token list via |store_new_token| */ while (s < se) { int tl; halfword t = (halfword) aux_str2uni_len((const unsigned char *) s, &tl); s += tl; if (t == ' ') { t = space_token; } else { t += other_token; } p = tex_store_new_token(p, t); if (! h) { h = p; } } return h; } /*tex The token lists for macros and for other things like |\mark| and |\output| and |\write| are produced by a procedure called |scan_toks|. Before we get into the details of |scan_toks|, let's consider a much simpler task, that of converting the current string into a token list. The |str_toks| function does this; it classifies spaces as type |spacer| and everything else as type |other_char|. The token list created by |str_toks| begins at |link(temp_token_head)| and ends at the value |p| that is returned. If |p = temp_token_head|, the list is empty. 
|lua_str_toks| is almost identical, but it also escapes the three symbols that \LUA\ considers special while scanning a literal string. */ static halfword lmt_str_toks(lstring b) /* returns head */ { unsigned char *k = (unsigned char *) b.s; halfword head = null; halfword tail = head; while (k < (unsigned char *) b.s + b.l) { int tl; halfword t = aux_str2uni_len(k, &tl); k += tl; if (t == ' ') { t = space_token; } else { if ((t == '\\') || (t == '"') || (t == '\'') || (t == 10) || (t == 13)) { tail = tex_store_new_token(tail, escape_token); if (! head) { head = tail; } if (t == 10) { t = 'n'; } else if (t == 13) { t = 'r'; } } t += other_token; } tail = tex_store_new_token(tail, t); if (! head) { head = tail; } } return head; } /*tex Incidentally, the main reason for wanting |str_toks| is the function |the_toks|, which has similar input/output characteristics. This changes the string |str_pool[b .. pool_ptr]| to a token list: */ halfword tex_str_toks(lstring s, halfword *tail) { halfword h = null; halfword p = null; if (s.s) { unsigned char *k = s.s; unsigned char *l = k + s.l; while (k < l) { int tl; halfword t = aux_str2uni_len(k, &tl); if (t == ' ') { t = space_token; } else { t += other_token; } k += tl; p = tex_store_new_token(p, t); if (! h) { h = p; } } } if (tail) { *tail = null; } return h; } halfword tex_cur_str_toks(halfword *tail) { halfword h = null; halfword p = null; unsigned char *k = (unsigned char *) lmt_string_pool_state.string_temp; if (k) { unsigned char *l = k + lmt_string_pool_state.string_temp_top; /*tex tail of the token list */ while (k < l) { /*tex token being appended */ int tl; halfword t = aux_str2uni_len(k, &tl); if (t == ' ') { t = space_token; } else { t += other_token; } k += tl; p = tex_store_new_token(p, t); if (! 
h) { h = p; } } } tex_reset_cur_string(); if (tail) { *tail = p; } return h; } /*tex Most of the converter is similar to the one I made for macro so at some point I can make a helper; also todo: there is no need to go through the pool. */ /*tex Change the string |str_pool[b..pool_ptr]| to a token list. */ halfword tex_str_scan_toks(int ct, lstring ls) { /*tex index into string */ unsigned char *k = ls.s; unsigned char *l = k + ls.l; /*tex tail of the token list */ halfword h = null; halfword p = null; while (k < l) { int cc; /*tex token being appended */ int lt; halfword t = aux_str2uni_len(k, <); k += lt; cc = tex_get_cat_code(ct, t); if (cc == 0) { /*tex We have a potential control sequence so we check for it. */ int lname = 0 ; int s = 0 ; int c = 0 ; unsigned char *name = k ; while (k < l) { t = (halfword) aux_str2uni_len((const unsigned char *) k, &s); c = tex_get_cat_code(ct,t); if (c == 11) { k += s ; lname += s ; } else if (c == 10) { /*tex We ignore a trailing space like normal scanning does. */ k += s ; break ; } else { break ; } } if (s > 0) { /*tex We have a potential |\cs|. */ halfword cs = tex_string_locate((const char *) name, lname, 0); if (cs == undefined_control_sequence) { /*tex Let's play safe and backtrack. */ t += cc * (1<<21); k = name ; } else { t = cs_token_flag + cs; } } else { /*tex Just a character with some meaning, so |\unknown| becomes effectively |\unknown| assuming that |\\| has some useful meaning of course. */ t += cc * (1<<21); k = name ; } } else { /*tex Whatever token, so for instance $x^2$ just works given a \TEX\ catcode regime. */ t += cc * (1<<21); } p = tex_store_new_token(p, t); if (! h) { h = p; } } return h; } /* these two can be combined, then we can avoid the h check */ static void tex_aux_set_toks_register(halfword loc, singleword cmd, halfword t, int g) { halfword ref = get_reference_token(); set_token_link(ref, t); tex_define((g > 0) ? global_flag_bit : 0, loc, cmd == internal_toks_cmd ? 
internal_toks_reference_cmd : register_toks_reference_cmd, ref); } static void tex_aux_append_copied_toks_list(halfword loc, singleword cmd, int g, halfword s, halfword t) { halfword ref = get_reference_token(); halfword p = ref; while (s) { p = tex_store_new_token(p, token_info(s)); s = token_link(s); } while (t) { p = tex_store_new_token(p, token_info(t)); t = token_link(t); } tex_define((g > 0) ? global_flag_bit : 0, loc, cmd == internal_toks_cmd ? internal_toks_reference_cmd : register_toks_reference_cmd, ref); } /*tex Public helper: */ halfword tex_copy_token_list(halfword h1, halfword *t) { halfword h2 = tex_store_new_token(null, token_info(h1)); halfword t1 = token_link(h1); halfword t2 = h2; while (t1) { t2 = tex_store_new_token(t2, token_info(t1)); t1 = token_link(t1); } if (t) { *t = t2; } return h2; } /*tex At some point I decided to implement the following primitives: \starttabulate[|T||T||] \NC 0 \NC \type {toksapp} \NC 1 \NC \type {etoksapp} \NC \NR \NC 2 \NC \type {tokspre} \NC 3 \NC \type {etokspre} \NC \NR \NC 4 \NC \type {gtoksapp} \NC 5 \NC \type {xtoksapp} \NC \NR \NC 6 \NC \type {gtokspre} \NC 7 \NC \type {xtokspre} \NC \NR \stoptabulate These append and prepend tokens to token lists. In \CONTEXT\ we always had macros doing something like that. It was only a few years later that I ran again into an article that Taco and I wrote in 1999 in the NTG Maps about an extension to \ETEX\ (called eetex). The first revelation was that I had completely forgotten about it, which can be explained by the two decade time-lap. The second was that Taco actually added that to the program at that time, so I could have used (parts of) that code. Anyway, among the other proposed (and implemented) features were manipulating lists and ways to output packed data to the \DVI\ files (numbers packed into 1 upto 4 bytes). Maybe some day I'll have a go at lists, although with todays computers there is not that much to gain. 
Also, \CONTEXT\ progressed to different internals so the urge is no longer there. The also discussed \SGML\ mode also in no longer that relevant given that we have \LUA. If we want to handle macros too we really need to distinguish between toks and macros with |cur_chr| above, but not now. We can't expand, and have to use |get_r_token| or so. I don't need it anyway. \starttyping get_r_token(); if (cur_cmd == call_cmd) { nt = cur_cs; target = equiv(nt); } else { // some error message } \stoptyping */ # define immediate_permitted(loc,target) ((eq_level(loc) == cur_level) && (get_token_reference(target) == 0)) void tex_run_combine_the_toks(void) { halfword source = null; halfword target = null; halfword append, expand, global; halfword nt, ns; singleword cmd; /* */ switch (cur_chr) { case expanded_toks_code: append = 0; global = 0; expand = 1; break; case append_toks_code: append = 1; global = 0; expand = 0; break; case append_expanded_toks_code: append = 1; global = 0; expand = 1; break; case prepend_toks_code: append = 2; global = 0; expand = 0; break; case prepend_expanded_toks_code: append = 2; global = 0; expand = 1; break; case global_expanded_toks_code: append = 0; global = 1; expand = 1; break; case global_append_toks_code: append = 1; global = 1; expand = 0; break; case global_append_expanded_toks_code: append = 1; global = 1; expand = 1; break; case global_prepend_toks_code: append = 2; global = 1; expand = 0; break; case global_prepend_expanded_toks_code: append = 2; global = 1; expand = 1; break; default: append = 0; global = 0; expand = 0; break; } /*tex The target. */ tex_get_x_token(); if (cur_cmd == register_toks_cmd || cur_cmd == internal_toks_cmd) { nt = eq_value(cur_cs); cmd = (singleword) cur_cmd; } else { /*tex Maybe a number. */ tex_back_input(cur_tok); nt = register_toks_location(tex_scan_toks_register_number()); cmd = register_toks_cmd; } target = eq_value(nt); /*tex The source. 
*/ do { tex_get_x_token(); } while (cur_cmd == spacer_cmd); if (cur_cmd == left_brace_cmd) { source = expand ? tex_scan_toks_expand(1, NULL, 0) : tex_scan_toks_normal(1, NULL); /*tex The action. */ if (source) { if (target) { halfword s = token_link(source); if (s) { halfword t = token_link(target); if (! t) { /*tex Can this happen? */ set_token_link(target, s); token_link(source) = null; } else { switch (append) { case 0: goto ASSIGN_1; case 1: /*append */ if (immediate_permitted(nt,target)) { halfword p = t; while (token_link(p)) { p = token_link(p); } token_link(p) = s; token_link(source) = null; } else { tex_aux_append_copied_toks_list(nt, cmd, global, t, s); } break; case 2: /* prepend */ if (immediate_permitted(nt,target)) { halfword p = s; while (token_link(p)) { p = token_link(p); } token_link(source) = null; set_token_link(p, t); set_token_link(target, s); } else { tex_aux_append_copied_toks_list(nt, cmd, global, s, t); } break; } } } } else { ASSIGN_1: tex_aux_set_toks_register(nt, cmd, token_link(source), global); token_link(source) = null; } tex_flush_token_list(source); } } else { if (cur_cmd == register_toks_cmd) { ns = register_toks_number(eq_value(cur_cs)); } else if (cur_cmd == internal_toks_cmd) { ns = internal_toks_number(eq_value(cur_cs)); } else { ns = tex_scan_toks_register_number(); } /*tex The action. */ source = toks_register(ns); if (source) { if (target) { halfword s = token_link(source); halfword t = token_link(target); switch (append) { case 0: /*assign */ goto ASSIGN_2; case 1: /*append */ if (immediate_permitted(nt, target)) { halfword p = t; while (token_link(p)) { p = token_link(p); } while (s) { p = tex_store_new_token(p, token_info(s)); s = token_link(s); } } else { tex_aux_append_copied_toks_list(nt, cmd, global, t, s); } break; case 2: if (immediate_permitted(nt, target)) { halfword h = null; halfword p = null; while (s) { p = tex_store_new_token(p, token_info(s)); if (! 
h) { h = p; } s = token_link(s); } set_token_link(p, t); set_token_link(target, h); } else { tex_aux_append_copied_toks_list(nt, cmd, global, s, t); } break; } } else { ASSIGN_2: // set_toks_register(nt, source, global); tex_add_token_reference(source); eq_value(nt) = source; } } } } /*tex This routine, used in the next one, prints the job name, possibly modified by the |process_jobname| callback. */ static void tex_aux_print_job_name(void) { if (lmt_fileio_state.job_name) { /*tex \CCODE\ strings for jobname before and after processing. */ char *s = lmt_fileio_state.job_name; int callback_id = lmt_callback_defined(process_jobname_callback); if (callback_id > 0) { char *ss; int lua_retval = lmt_run_callback(lmt_lua_state.lua_instance, callback_id, "S->S", s, &ss); if (lua_retval && ss) { s = ss; } } tex_print_str(s); } } /*tex The procedure |run_convert_tokens| uses |str_toks| to insert the token list for |convert| functions into the scanner; |\outer| control sequences are allowed to follow |\string| and |\meaning|. */ /*tex Codes not really needed but cleaner when testing */ # define push_selector { \ saved_selector = lmt_print_state.selector; \ lmt_print_state.selector = new_string_selector_code; \ } # define pop_selector { \ lmt_print_state.selector = saved_selector; \ } void tex_run_convert_tokens(halfword code) { /*tex Scan the argument for command |c|. */ switch (code) { /*tex The |number_code| is quite popular. Beware, when used with a lua none function, a zero is injected. We could intercept it at the cost of messy code, but on the other hand, nothing guarantees that the call returns a number so this side effect can be defended as a recovery measure. 
*/ case number_code: { int saved_selector; halfword v = tex_scan_int(0, NULL); push_selector; tex_print_int(v); pop_selector; break; } case to_integer_code: case to_hexadecimal_code: { int saved_selector; halfword v = tex_scan_int(0, NULL); tex_get_x_token(); /* maybe not x here */ if (cur_cmd != relax_cmd) { tex_back_input(cur_tok); } push_selector; if (code == to_integer_code) { tex_print_int(v); } else { tex_print_hex(v); } pop_selector; break; } case to_scaled_code: case to_sparse_scaled_code: case to_dimension_code: case to_sparse_dimension_code: { int saved_selector; halfword v = tex_scan_dimen(0, 0, 0, 0, NULL); tex_get_x_token(); /* maybe not x here */ if (cur_cmd != relax_cmd) { tex_back_input(cur_tok); } push_selector; switch (code) { case to_sparse_dimension_code: case to_sparse_scaled_code: tex_print_sparse_dimension(v, no_unit); break; default: tex_print_dimension(v, no_unit); break; } switch (code) { case to_dimension_code: case to_sparse_dimension_code: tex_print_unit(pt_unit); break; } pop_selector; break; } case to_mathstyle_code: { int saved_selector; halfword v = tex_scan_math_style_identifier(1, 0); push_selector; tex_print_int(v); pop_selector; break; } case lua_function_code: { halfword v = tex_scan_int(0, NULL); if (v > 0) { strnumber u = tex_save_cur_string(); lmt_token_state.luacstrings = 0; lmt_function_call(v, 0); tex_restore_cur_string(u); if (lmt_token_state.luacstrings > 0) { tex_lua_string_start(); } } else { tex_normal_error("luafunction", "invalid number"); } return; } case lua_bytecode_code: { halfword v = tex_scan_int(0, NULL); if (v < 0 || v > 65535) { tex_normal_error("luabytecode", "invalid number"); } else { strnumber u = tex_save_cur_string(); lmt_token_state.luacstrings = 0; lmt_bytecode_call(v); tex_restore_cur_string(u); if (lmt_token_state.luacstrings > 0) { tex_lua_string_start(); } } return; } case lua_code: { full_scanner_status saved_full_status = tex_save_full_scanner_status(); strnumber u = tex_save_cur_string(); 
halfword s = tex_scan_toks_expand(0, NULL, 0); tex_unsave_full_scanner_status(saved_full_status); lmt_token_state.luacstrings = 0; lmt_token_call(s); tex_delete_token_reference(s); /* boils down to flush_list */ tex_restore_cur_string(u); if (lmt_token_state.luacstrings > 0) { tex_lua_string_start(); } /*tex No further action. */ return; } case expanded_code: case semi_expanded_code: { full_scanner_status saved_full_status = tex_save_full_scanner_status(); strnumber u = tex_save_cur_string(); halfword s = tex_scan_toks_expand(0, NULL, code == semi_expanded_code); tex_unsave_full_scanner_status(saved_full_status); if (token_link(s)) { tex_begin_inserted_list(token_link(s)); token_link(s) = null; } tex_put_available_token(s); tex_restore_cur_string(u); /*tex No further action. */ return; } /* case immediate_assignment_code: */ /* case immediate_assigned_code: */ /*tex These two were an on-the-road-to-bachotex brain-wave. A first variant did more in sequence till a relax or spacer was seen. These commands permits for instance setting counters in full expansion. However, as we have the more powerful local control mechanisms available these two commands have been dropped in \LUAMETATEX. Performance wise there is not that much to gain from |\immediateassigned| and it's even somewhat limited. So, they're gone now. Actually, one can also use the local control feature in an |\edef|, which {\em is} rather efficient, so we're good anyway. The upgraded code can be found in the archive. 
*/ case string_code: { int saved_selector; int saved_scanner_status = lmt_input_state.scanner_status; lmt_input_state.scanner_status = scanner_is_normal; tex_get_token(); lmt_input_state.scanner_status = saved_scanner_status; push_selector; if (cur_cs) { tex_print_cs(cur_cs); } else { tex_print_tex_str(cur_chr); } pop_selector; break; } case cs_string_code: case cs_active_code: { int saved_selector; int saved_scanner_status = lmt_input_state.scanner_status; lmt_input_state.scanner_status = scanner_is_normal; tex_get_token(); lmt_input_state.scanner_status = saved_scanner_status; push_selector; if (code == cs_active_code) { // tex_print_char(active_first); // tex_print_char(active_second); // tex_print_char(active_third); tex_print_str(active_character_namespace); if (cur_cmd == active_char_cmd) { tex_print_char(cur_chr); } else { /*tex So anything else will just inject the hash (abstraction, saves a command). */ tex_back_input(cur_tok); } } else if (cur_cs) { tex_print_cs_name(cur_cs); } else { tex_print_tex_str(cur_chr); } pop_selector; break; } /* case cs_lastname_code: if (lmt_scanner_state.last_cs_name != null_cs) { int saved_selector; push_selector; tex_print_cs_name(lmt_scanner_state.last_cs_name); pop_selector; } break; */ case detokenized_code: /*tex Sort of like |\meaningles| but without the explanationart text. */ { int saved_selector; int saved_scanner_status = lmt_input_state.scanner_status; halfword t = null; lmt_input_state.scanner_status = scanner_is_normal; tex_get_token(); lmt_input_state.scanner_status = saved_scanner_status; t = tex_get_available_token(cur_tok); push_selector; tex_show_token_list(t, 0); tex_put_available_token(t); pop_selector; break; } case detokened_code: /*tex Takes a control sequence or token list. Probably a bad name but so be it. 
*/ { int saved_selector; int saved_scanner_status = lmt_input_state.scanner_status; halfword list = null; lmt_input_state.scanner_status = scanner_is_normal; tex_get_token(); lmt_input_state.scanner_status = saved_scanner_status; switch (cur_cmd) { case call_cmd: case protected_call_cmd: case semi_protected_call_cmd: case tolerant_call_cmd: case tolerant_protected_call_cmd: case tolerant_semi_protected_call_cmd: if (cur_chr) { /* We only serialize macros with no arguments. */ list = token_link(cur_chr); break; } else { goto WHATEVER; } case internal_toks_cmd: case register_toks_cmd: list = token_link(eq_value(cur_chr)); break; case register_cmd: if (cur_chr == tok_val_level) { halfword n = tex_scan_toks_register_number(); list = token_link(toks_register(n)); break; } else { goto WHATEVER; } break; default: WHATEVER: { halfword t = tex_get_available_token(cur_tok); push_selector; tex_show_token_list(t, 0); pop_selector; tex_put_available_token(t); } break; } if (list) { push_selector; tex_show_token_list(list, 2); pop_selector; } break; } case roman_numeral_code: { int saved_selector; halfword v = tex_scan_int(0, NULL); push_selector; tex_print_roman_int(v); pop_selector; break; } case meaning_code: case meaning_full_code: case meaning_less_code: case meaning_ful_code: case meaning_les_code: case meaning_asis_code: { int saved_selector; int saved_scanner_status = lmt_input_state.scanner_status; lmt_input_state.scanner_status = scanner_is_normal; tex_get_token(); lmt_input_state.scanner_status = saved_scanner_status; push_selector; tex_print_meaning(code); pop_selector; break; } case uchar_code: { int saved_selector; int chr = tex_scan_char_number(0); push_selector; tex_print_tex_str(chr); pop_selector; break; } case lua_escape_string_code: /* case lua_token_string_code: */ /* for now rejected: could also be keyword */ { /* tex If I would need it I could probably add support for catcode tables and verbose serialization. 
Maybe we can use some of the other (more efficient) helpers when we have a detokenize variant. We make sure that the escape character is a backslash because these conversions can occur anywhere and are very much related to \LUA\ calls. (Maybe it makes sense to pass it a argument to the serializer.) A |\luatokenstring| primitive doesn't really make sense because \LUATEX\ lacks it and |\luaescapestring| is a compatibility primitive. */ lstring str; int length = 0; int saved_in_lua_escape = lmt_token_state.in_lua_escape; halfword saved_escape_char = escape_char_par; full_scanner_status saved_full_status = tex_save_full_scanner_status(); halfword result = tex_scan_toks_expand(0, NULL, 0); /* halfword result = tex_scan_toks_expand(0, NULL, code == lua_token_string_code); */ lmt_token_state.in_lua_escape = 1; escape_char_par = '\\'; str.s = (unsigned char *) tex_tokenlist_to_tstring(result, 0, &length, 0, 0, 0, 0); str.l = (unsigned) length; lmt_token_state.in_lua_escape = saved_in_lua_escape; escape_char_par = saved_escape_char; tex_delete_token_reference(result); /* boils down to flush_list */ tex_unsave_full_scanner_status(saved_full_status); if (str.l) { result = lmt_str_toks(str); tex_begin_inserted_list(result); } return; } case font_name_code: { int saved_selector; halfword fnt = tex_scan_font_identifier(NULL); push_selector; tex_print_font(fnt); pop_selector; break; } case font_specification_code: { int saved_selector; halfword fnt = tex_scan_font_identifier(NULL); push_selector; tex_append_string((const unsigned char *) font_original(fnt), (unsigned) strlen(font_original(fnt))); pop_selector; break; } case job_name_code: { int saved_selector; if (! lmt_fileio_state.job_name) { tex_open_log_file(); } push_selector; tex_aux_print_job_name(); pop_selector; break; } case format_name_code: { int saved_selector; if (! 
lmt_fileio_state.job_name) { tex_open_log_file(); } push_selector; tex_print_str(lmt_engine_state.dump_name); pop_selector; break; } case luatex_banner_code: { int saved_selector; push_selector; tex_print_str(lmt_engine_state.luatex_banner); pop_selector; break; } default: tex_confusion("convert tokens"); break; } { halfword head = tex_cur_str_toks(NULL); tex_begin_inserted_list(head); } } /*tex The boolean |in_lua_escape| is keeping track of the lua string escape state. */ strnumber tex_the_convert_string(halfword c, int i) { int saved_selector = lmt_print_state.selector; strnumber ret = 0; int done = 1 ; lmt_print_state.selector = new_string_selector_code; switch (c) { case number_code: case to_integer_code: tex_print_int(i); break; case to_hexadecimal_code: tex_print_hex(i); break; case to_scaled_code: tex_print_dimension(i, no_unit); break; case to_sparse_scaled_code: tex_print_sparse_dimension(i, no_unit); break; case to_dimension_code: tex_print_dimension(i, pt_unit); break; case to_sparse_dimension_code: tex_print_sparse_dimension(i, pt_unit); break; /* case to_mathstyle_code: */ /* case lua_function_code: */ /* case lua_code: */ /* case expanded_code: */ /* case string_code: */ /* case cs_string_code: */ case roman_numeral_code: tex_print_roman_int(i); break; /* case meaning_code: */ case uchar_code: tex_print_tex_str(i); break; /* case lua_escape_string_code: */ case font_name_code: tex_print_font(i); break; case font_specification_code: tex_print_str(font_original(i)); break; /* case left_margin_kern_code: */ /* case right_margin_kern_code: */ /* case math_char_class_code: */ /* case math_char_fam_code: */ /* case math_char_slot_code: */ /* case insert_ht_code: */ case job_name_code: tex_aux_print_job_name(); break; case format_name_code: tex_print_str(lmt_engine_state.dump_name); break; case luatex_banner_code: tex_print_str(lmt_engine_state.luatex_banner); break; case font_identifier_code: tex_print_font_identifier(i); break; default: done = 0; break; } 
if (done) { ret = tex_make_string(); } lmt_print_state.selector = saved_selector; return ret; } /*tex Return a string from tokens list: */ strnumber tex_tokens_to_string(halfword p) { if (lmt_print_state.selector == new_string_selector_code) { tex_normal_error("tokens", "tokens_to_string() called while selector = new_string"); return get_nullstr(); } else { int saved_selector = lmt_print_state.selector; lmt_print_state.selector = new_string_selector_code; tex_token_show(p); lmt_print_state.selector = saved_selector; return tex_make_string(); } } /*tex The actual token conversion in this function is now functionally equivalent to |show_token_list|, except that it always prints the whole token list. Often the result is not that large, for instance |\directlua| is seldom large. However, this converter is also used for patterns and exceptions where size is mnore an issue. For that reason we used to have three variants, one of which (experimentally) used a buffer. At some point, in the manual we were talking of millions of allocations but times have changed. Macros were used to inline the appending code (in the thre variants), but in the end I decided to just merge all into one function, with a bit more overhead because we need to optionally skip a macro preamble. Values like 512 and 128 also work ok. There is not much to gain in optimization here. We used to have 3 mostly overlapping functions, one of which used a buffer. We can probably use a larger default buffer size and larger step and only free when we think it's too large. */ # define default_buffer_size 512 /*tex This used to be 256 */ # define default_buffer_step 4096 /*tex When we're larger, we always are much larger. 
*/ // todo: check ret static void tex_aux_make_room_in_buffer(int a) { if (lmt_token_state.bufloc + a + 1 > lmt_token_state.bufmax) { char *tmp = aux_reallocate_array(lmt_token_state.buffer, sizeof(unsigned char), lmt_token_state.bufmax + default_buffer_step, 1); if (tmp) { lmt_token_state.bufmax += default_buffer_step; } else { // error } lmt_token_state.buffer = tmp; } } static void tex_aux_append_uchar_to_buffer(int s) { tex_aux_make_room_in_buffer(4); if (s <= 0x7F) { lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (s); } else if (s <= 0x7FF) { lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xC0 + (s / 0x40)); lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (s % 0x40)); } else if (s <= 0xFFFF) { lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xE0 + (s / 0x1000)); lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x1000) / 0x40)); lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x1000) % 0x40)); } else { lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0xF0 + (s / 0x40000)); lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + ((s % 0x40000) / 0x1000)); lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (((s % 0x40000) % 0x1000) / 0x40)); lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (0x80 + (((s % 0x40000) % 0x1000) % 0x40)); } } static void tex_aux_append_char_to_buffer(int c) { tex_aux_make_room_in_buffer(1); lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (c); } /*tex Only errors and unknowns. */ static void tex_aux_append_str_to_buffer(const char *s) { const char *v = s; tex_aux_make_room_in_buffer((int) strlen(v)); /*tex Using memcpy will inline and give a larger binary ... and we seldom need this. */ while (*v) { lmt_token_state.buffer[lmt_token_state.bufloc++] = (char) (*v); v++; } } /*tex Only bogus csnames. 
*/ static void tex_aux_append_esc_to_buffer(const char *s) { int e = escape_char_par; if (e > 0 && e < cs_offset_value) { tex_aux_append_uchar_to_buffer(e); } tex_aux_append_str_to_buffer(s); } # define is_cat_letter(a) (tex_aux_the_cat_code(aux_str2uni(str_string((a)))) == letter_cmd) /* make two versions: macro and not */ char *tex_tokenlist_to_tstring(int pp, int inhibit_par, int *siz, int skippreamble, int nospace, int strip, int wipe) { if (pp) { /*tex We need to go beyond the reference. */ int p = token_link(pp); if (p) { int e = escape_char_par; /*tex The serialization of the escape, normally a backlash. */ int n = 0; /*tex The character after |#|, so |#0| upto |#9| */ int min = 0; int max = lmt_token_memory_state.tokens_data.top; int skip = 0; int tail = p; int count = 0; if (lmt_token_state.bufmax > default_buffer_size) { /* Let's start fresh and small. */ aux_deallocate_array(lmt_token_state.buffer); lmt_token_state.buffer = aux_allocate_clear_array(sizeof(unsigned char), default_buffer_size, 1); lmt_token_state.bufmax = default_buffer_size; } else if (! lmt_token_state.buffer) { /* Let's start. */ lmt_token_state.buffer = aux_allocate_clear_array(sizeof(unsigned char), default_buffer_size, 1); lmt_token_state.bufmax = default_buffer_size; } lmt_token_state.bufloc = 0; if (skippreamble == 1) { skip = get_token_preamble(pp); } while (p) { if (p < min || p > max) { tex_aux_append_str_to_buffer(error_string_clobbered(31)); break; } else { int infop = token_info(p); if (infop < 0) { /*tex Unlikely, will go after checking (maybe \LUA\ user mess up). */ tex_aux_append_str_to_buffer(error_string_bad(32)); } else if (infop < cs_token_flag) { /*tex We nearly always end up here because otherwise we have an error. 
*/ int cmd = token_cmd(infop); int chr = token_chr(infop); switch (cmd) { case left_brace_cmd: case right_brace_cmd: case math_shift_cmd: case alignment_tab_cmd: case superscript_cmd: case subscript_cmd: case spacer_cmd: case letter_cmd: case other_char_cmd: case active_char_cmd: if (! skip) { tex_aux_append_uchar_to_buffer(chr); } break; case parameter_cmd: if (! skip) { if (! nospace && (! lmt_token_state.in_lua_escape && (lmt_expand_state.cs_name_level == 0))) { tex_aux_append_uchar_to_buffer(chr); } tex_aux_append_uchar_to_buffer(chr); } break; case parameter_reference_cmd: if (! skip) { tex_aux_append_char_to_buffer(match_visualizer); if (chr <= 9) { tex_aux_append_char_to_buffer(chr + '0'); } else if (chr <= max_match_count) { tex_aux_append_char_to_buffer(chr + '0' + gap_match_count); } else { tex_aux_append_char_to_buffer('!'); goto EXIT; } } else { if (chr > max_match_count) { goto EXIT; } } break; case match_cmd: if (! skip) { tex_aux_append_char_to_buffer(match_visualizer); } if (is_valid_match_ref(chr)) { ++n; } if (! skip) { tex_aux_append_char_to_buffer(chr ? chr : '0'); // if (chr <= 9) { // tex_aux_append_char_to_buffer(chr + '0'); // } else if (chr <= max_match_count) { // tex_aux_append_char_to_buffer(chr + '0' + gap_match_count); // } } if (n > max_match_count) { goto EXIT; } break; case end_match_cmd: if (skippreamble == 2) { goto EXIT; } else if (chr == 0) { if (! skip) { tex_aux_append_char_to_buffer('-'); tex_aux_append_char_to_buffer('>'); } skip = 0 ; } break; case end_paragraph_cmd: if (! inhibit_par && (auto_paragraph_mode(auto_paragraph_text))) { tex_aux_append_esc_to_buffer("par"); } break; default: tex_aux_append_str_to_buffer(tex_aux_special_cmd_string(cmd, chr, error_string_bad(33))); break; } } else if (! 
(inhibit_par && infop == lmt_token_state.par_token)) { int q = infop - cs_token_flag; if (q < hash_base) { if (q == null_cs) { tex_aux_append_esc_to_buffer("csname"); tex_aux_append_esc_to_buffer("endcsname"); } else { tex_aux_append_str_to_buffer(error_string_impossible(34)); } } else if (eqtb_out_of_range(q)) { tex_aux_append_str_to_buffer(error_string_impossible(35)); } else { strnumber txt = cs_text(q); if (txt < 0 || txt >= lmt_string_pool_state.string_pool_data.ptr) { tex_aux_append_str_to_buffer(error_string_nonexistent(36)); } else { int allocated = 0; char *sh = tex_makecstring(txt, &allocated); char *s = sh; if (tex_is_active_cs(txt)) { s = s + 3; while (*s) { tex_aux_append_char_to_buffer(*s); s++; } } else { if (e >= 0) { tex_aux_append_uchar_to_buffer(e); } while (*s) { tex_aux_append_char_to_buffer(*s); s++; } if ((! nospace) && ((! tex_single_letter(txt)) || is_cat_letter(txt))) { tex_aux_append_char_to_buffer(' '); } } if (allocated) { lmt_memory_free(sh); } } } } tail = p; ++count; p = token_link(p); } } EXIT: if (strip && lmt_token_state.bufloc > 1) { if (lmt_token_state.buffer[lmt_token_state.bufloc-1] == strip) { lmt_token_state.bufloc -= 1; } if (lmt_token_state.bufloc > 1 && lmt_token_state.buffer[0] == strip) { memcpy(&lmt_token_state.buffer[0], &lmt_token_state.buffer[1], lmt_token_state.bufloc-1); lmt_token_state.bufloc -= 1; } } lmt_token_state.buffer[lmt_token_state.bufloc] = '\0'; if (siz) { *siz = lmt_token_state.bufloc; } if (wipe) { tex_flush_token_list_head_tail(pp, tail, count); } return lmt_token_state.buffer; } else { if (wipe) { tex_put_available_token(pp); } } } if (siz) { *siz = 0; } return NULL; } /*tex The \LUA\ interface needs some extra functions. The functions themselves are quite boring, but they are handy because otherwise this internal stuff has to be accessed from \CCODE\ directly, where lots of the defines are not available. */ /* The bin gets 1.2K smaller if we inline these. 
*/ halfword tex_get_tex_dimen_register (int j, int internal) { return internal ? dimen_parameter(j) : dimen_register(j) ; } halfword tex_get_tex_skip_register (int j, int internal) { return internal ? glue_parameter(j) : skip_register(j) ; } halfword tex_get_tex_mu_skip_register (int j, int internal) { return internal ? mu_glue_parameter(j) : mu_skip_register(j); } halfword tex_get_tex_count_register (int j, int internal) { return internal ? count_parameter(j) : count_register(j) ; } halfword tex_get_tex_posit_register (int j, int internal) { return internal ? posit_parameter(j) : posit_register(j) ; } halfword tex_get_tex_attribute_register (int j, int internal) { return internal ? attribute_parameter(j) : attribute_register(j) ; } halfword tex_get_tex_box_register (int j, int internal) { return internal ? box_parameter(j) : box_register(j) ; } void tex_set_tex_dimen_register(int j, halfword v, int flags, int internal) { if (global_defs_par) { flags = add_global_flag(flags); } if (internal) { tex_assign_internal_dimen_value(flags, internal_dimen_location(j), v); } else { tex_word_define(flags, register_dimen_location(j), v); } } void tex_set_tex_skip_register(int j, halfword v, int flags, int internal) { if (global_defs_par) { flags = add_global_flag(flags); } if (internal) { tex_assign_internal_skip_value(flags, internal_glue_location(j), v); } else { tex_word_define(flags, register_glue_location(j), v); } } void tex_set_tex_mu_skip_register(int j, halfword v, int flags, int internal) { if (global_defs_par) { flags = add_global_flag(flags); } tex_word_define(flags, internal ? 
internal_mu_glue_location(j) : register_mu_glue_location(j), v); } void tex_set_tex_count_register(int j, halfword v, int flags, int internal) { if (global_defs_par) { flags = add_global_flag(flags); } if (internal) { tex_assign_internal_int_value(flags, internal_int_location(j), v); } else { tex_word_define(flags, register_int_location(j), v); } } void tex_set_tex_posit_register(int j, halfword v, int flags, int internal) { if (global_defs_par) { flags = add_global_flag(flags); } if (internal) { tex_assign_internal_posit_value(flags, internal_posit_location(j), v); } else { tex_word_define(flags, register_posit_location(j), v); } } void tex_set_tex_attribute_register(int j, halfword v, int flags, int internal) { if (global_defs_par) { flags = add_global_flag(flags); } if (j > lmt_node_memory_state.max_used_attribute) { lmt_node_memory_state.max_used_attribute = j; } tex_change_attribute_register(flags, register_attribute_location(j), v); tex_word_define(flags, internal ? internal_attribute_location(j) : register_attribute_location(j), v); } void tex_set_tex_box_register(int j, halfword v, int flags, int internal) { if (global_defs_par) { flags = add_global_flag(flags); } if (internal) { tex_define(flags, internal_box_location(j), internal_box_reference_cmd, v); } else { tex_define(flags, register_box_location(j), register_box_reference_cmd, v); } } void tex_set_tex_toks_register(int j, lstring s, int flags, int internal) { halfword ref = get_reference_token(); halfword head = tex_str_toks(s, NULL); set_token_link(ref, head); if (global_defs_par) { flags = add_global_flag(flags); } if (internal) { tex_define(flags, internal_toks_location(j), internal_toks_reference_cmd, ref); } else { tex_define(flags, register_toks_location(j), register_toks_reference_cmd, ref); } } void tex_scan_tex_toks_register(int j, int c, lstring s, int flags, int internal) { halfword ref = get_reference_token(); halfword head = tex_str_scan_toks(c, s); set_token_link(ref, head); if 
(global_defs_par) { flags = add_global_flag(flags); } if (internal) { tex_define(flags, internal_toks_location(j), internal_toks_reference_cmd, ref); } else { tex_define(flags, register_toks_location(j), register_toks_reference_cmd, ref); } } int tex_get_tex_toks_register(int j, int internal) { halfword t = internal ? toks_parameter(j) : toks_register(j); if (t) { return tex_tokens_to_string(t); } else { return get_nullstr(); } } /* Options: (0) error when undefined [bad], (1) create [but undefined], (2) ignore [discard] */ halfword tex_parse_str_to_tok(halfword head, halfword *tail, halfword ct, const char *str, size_t lstr, int option) { halfword p = null; if (! head) { head = get_reference_token(); } p = (tail && *tail) ? *tail : head; if (lstr > 0) { const char *se = str + lstr; while (str < se) { /*tex hh: |str2uni| could return len too (also elsewhere) */ int ul; halfword u = (halfword) aux_str2uni_len((const unsigned char *) str, &ul); halfword t = null; halfword cc = tex_get_cat_code(ct, u); str += ul; /*tex This is a relative simple converter; if more is needed one can just use |tex.print| with a regular |\def| or |\gdef| and feed the string into the regular scanner. */ switch (cc) { case escape_cmd: { /*tex We have a potential control sequence so we check for it. */ int lname = 0; const char *name = str; while (str < se) { int s; halfword u = (halfword) aux_str2uni_len((const unsigned char *) str, &s); int c = tex_get_cat_code(ct, u); if (c == letter_cmd) { str += s; lname += s; } else if (c == spacer_cmd) { /*tex We ignore a trailing space like normal scanning does. */ if (lname == 0) { // if (u == 32) { lname += s; } str += s; break ; } else { if (lname == 0) { lname += s; str += s; } break ; } } if (lname > 0) { /*tex We have a potential |\cs|. */ halfword cs = tex_string_locate(name, lname, option == 1 ? 1 : 0); /* 1 == create */ if (cs == undefined_control_sequence) { if (option == 2) { /*tex We ignore unknown commands. 
*/ // t = null; } else { /*tex We play safe and backtrack, as we have option 0, but never used anyway. */ t = u + (cc * (1<<21)); str = name; } } else { /* We end up here when option is 1. */ t = cs_token_flag + cs; } } else { /*tex Just a character with some meaning, so |\unknown| becomes effectively |\unknown| assuming that |\\| has some useful meaning of course. */ t = u + (cc * (1 << 21)); str = name; } break; } case comment_cmd: goto DONE; case ignore_cmd: break; case spacer_cmd: /* t = u + (cc * (1<<21)); */ t = token_val(spacer_cmd, ' '); break; default: /*tex Whatever token, so for instance $x^2$ just works given a tex catcode regime. */ t = u + (cc * (1<<21)); break; } if (t) { p = tex_store_new_token(p, t); } } } DONE: if (tail) { *tail = p; } return head; } /*tex So far for the helpers. */ void tex_dump_token_mem(dumpstream f) { /*tex It doesn't pay off to prune the available list. We save less than 10K if we do this and it assumes a sequence at the end. It doesn't help that the list is in reverse order so we just dump the lot. But we do check the allocated size. We cheat a bit in reducing the ptr so that we can set the the initial counter on loading. 
*/ halfword p = lmt_token_memory_state.available; halfword u = lmt_token_memory_state.tokens_data.top + 1; while (p) { --u; p = token_link(p); } lmt_token_memory_state.tokens_data.ptr = u; dump_int(f, lmt_token_state.null_list); /* the only one left */ dump_int(f, lmt_token_memory_state.tokens_data.allocated); dump_int(f, lmt_token_memory_state.tokens_data.top); dump_int(f, lmt_token_memory_state.tokens_data.ptr); dump_int(f, lmt_token_memory_state.available); dump_things(f, lmt_token_memory_state.tokens[0], lmt_token_memory_state.tokens_data.top + 1); } void tex_undump_token_mem(dumpstream f) { undump_int(f, lmt_token_state.null_list); /* the only one left */ undump_int(f, lmt_token_memory_state.tokens_data.allocated); undump_int(f, lmt_token_memory_state.tokens_data.top); undump_int(f, lmt_token_memory_state.tokens_data.ptr); undump_int(f, lmt_token_memory_state.available); tex_initialize_token_mem(); undump_things(f, lmt_token_memory_state.tokens[0], lmt_token_memory_state.tokens_data.top + 1); }