1 files changed, 399 insertions, 0 deletions
diff --git a/source/luametatex/source/tex/textoken.h b/source/luametatex/source/tex/textoken.h
new file mode 100644
index 000000000..1996f351c
--- /dev/null
+++ b/source/luametatex/source/tex/textoken.h
@@ -0,0 +1,399 @@
+/*
+    See license.txt in the root of this project.
+*/
+
+# ifndef LMT_TEXTOKEN_H
+# define LMT_TEXTOKEN_H
+
+# include "luametatex.h"
+
+/*tex
+
+    These are constants that can be added to a chr value and then give a token with the right cmd
+    and chr combination, whichs is then equivalent to |token_val (cmd, chr)|. The cmd results from
+    shifting right 21 bits. The following tokens therefore should match the order of the (first
+    bunch) of cmd codes!
+
+    \TEX\ stores the specific match character which defaults to |#|. When tokens get serialized the
+    machinery starts with |match_chr = '#'| but overloads that by the last stored variant. So the
+    last (!) seen |match_chr| in the macro preamble determines what gets used in showing the body.
+    One could argue that this is a buglet but I more see it as a side effect. In practice there is
+    never a mix of such characters used. Anyway, one could as well use the first seen in the
+    preamble and use that for the rest because consistency is better than confusion. Even better is
+    to just always use |#| and store the numbers in preamble match tokens, which opens up
+    possibilities (for strict or tolerant matching, skipping spaces, optional delimiters and even
+    more arguments).
+
+*/
+
+//define cs_token_flag            0x1FFFFFFF
+
+# define node_token_max           0x0FFFFF
+# define node_token_flag          0x100000
+# define node_token_lsb(sum)      (sum & 0x0000FFFF)
+# define node_token_msb(sum)      (((sum & 0xFFFF0000) >> 16) + node_token_flag)
+# define node_token_sum(msb,lsb)  (((msb & 0x0000FFFF) << 16) + lsb)
+# define node_token_overflow(sum) (sum > node_token_max)
+# define node_token_flagged(sum)  (sum > node_token_flag)
+
+/*tex
+    Instead of |fixmem| we use |tokens| because it is dynamic anyway and we then better match variables
+    that deal with managing that. Most was already hidden in a few files anyway.
+*/
+
+typedef struct token_memory_state_info {
+    memoryword  *tokens;      /*tex |memoryword *volatile fixmem;| */
+    memory_data  tokens_data;
+    halfword     available;
+    int          padding;
+} token_memory_state_info;
+
+extern token_memory_state_info lmt_token_memory_state;
+
+typedef enum read_states {
+    reading_normal,      /*tex we're going ahead */
+    reading_just_opened, /*tex newly opened, first line not yet read */
+    reading_closed,      /*tex not open, or at end of file */
+} read_states;
+
+typedef enum lua_input_types {
+    unset_lua_input,
+    string_lua_input,
+    packed_lua_input,
+    token_lua_input,
+    token_list_lua_input,
+    node_lua_input,
+} lua_input_types;
+
+typedef enum tex_input_types {
+    eof_tex_input,
+    string_tex_input,
+    token_tex_input,
+    token_list_tex_input,
+    node_tex_input,
+} tex_input_types;
+
+typedef enum catcode_table_presets {
+    default_catcode_table_preset = -1,
+    no_catcode_table_preset      = -2,
+} catcode_table_presets;
+
+/*tex
+*
+    There are a few temporary head pointers, one is |temp_token_head|. This one we keep because
+    when we expand, we can run into situations where we need that pointer. But, |backup_head| is
+    a real temporary one: we can replace that with local variables. Okay, it is kind of kept in
+    the format file but if it ends up there we're in some kind of troubles anyway. So,
+    |backup_head| is now local and |temp_token_head| only global when we are scanning; in cases
+    where we serialize tokens lists it has been replaced by local variables (and the related
+    functions now keep track of head and tail). This makes sense because in \LUAMETATEX\ we often
+    go between \TEX\ and \LUA\ and this keeps it kind of simple. This also makes clear when we
+    are scanning (the global head is used) and doing something simple with a list. The same is
+    true for |match_token_head| thatmoved to the expand state. The |backup_head| variable is gone
+    because we now use locals.
+
+*/
+
+typedef struct token_state_info {
+    halfword  null_list;        /*tex permanently empty list */
+    int       in_lua_escape;
+    int       force_eof;
+    int       luacstrings;
+    /*tex These are pseudo constants, their value depends on the number of primitives etc. */
+    halfword  par_loc;
+    halfword  par_token;
+ /* halfword  line_par_loc;   */ /*tex See note in textoken.c|. */
+ /* halfword  line_par_token; */ /*tex See note in textoken.c|. */
+    /* */
+    char     *buffer;
+    int       bufloc;
+    int       bufmax;
+    int       padding;
+} token_state_info;
+
+extern token_state_info lmt_token_state;
+
+// # define max_token_reference 0x7FFF /* we can bump to 0xFFFF when we go unsigned here */
+//
+//define token_reference(a)  token_memory_state.tokens[a].half1
+//
+// #define get_token_parameters(a) lmt_token_memory_state.tokens[a].quart2
+// #define get_token_reference(a)  lmt_token_memory_state.tokens[a].quart3
+//
+// #define set_token_parameters(a,b) lmt_token_memory_state.tokens[a].quart2  = (b)
+//
+// #define add_token_reference(a)    lmt_token_memory_state.tokens[a].quart3 += 1
+// #define sub_token_reference(a)    lmt_token_memory_state.tokens[a].quart3 -= 1
+// #define inc_token_reference(a,b)  lmt_token_memory_state.tokens[a].quart3 += (quarterword) (b)
+// #define dec_token_reference(a,b)  lmt_token_memory_state.tokens[a].quart3 -= (quarterword) (b)
+
+# define max_token_reference 0x0FFFFFFF
+
+# define get_token_parameters(a) (lmt_token_memory_state.tokens[a].hulf1 >> 28)
+# define get_token_reference(a)  (lmt_token_memory_state.tokens[a].hulf1 & 0x0FFFFFFF)
+
+# define set_token_parameters(a,b) lmt_token_memory_state.tokens[a].hulf1 += ((b) << 28)  /* normally the variable is still zero here */
+
+# define add_token_reference(a)    lmt_token_memory_state.tokens[a].hulf1 += 1            /* we are way off the parameter count */
+# define sub_token_reference(a)    lmt_token_memory_state.tokens[a].hulf1 -= 1            /* we are way off the parameter count */
+# define inc_token_reference(a,b)  lmt_token_memory_state.tokens[a].hulf1 += (b)          /* we are way off the parameter count */
+# define dec_token_reference(a,b)  lmt_token_memory_state.tokens[a].hulf1 -= (b)          /* we are way off the parameter count */
+
+/* */
+
+# define token_info(a)       lmt_token_memory_state.tokens[a].half1
+# define token_link(a)       lmt_token_memory_state.tokens[a].half0
+# define get_token_info(a)   lmt_token_memory_state.tokens[a].half1
+# define get_token_link(a)   lmt_token_memory_state.tokens[a].half0
+# define set_token_info(a,b) lmt_token_memory_state.tokens[a].half1 = (b)
+# define set_token_link(a,b) lmt_token_memory_state.tokens[a].half0 = (b)
+
+# define token_cmd(A)    ((A) >> cs_offset_bits)
+# define token_chr(A)    ((A) &  cs_offset_max)
+# define token_val(A,B) (((A) << cs_offset_bits) + (B))
+
+/*tex
+    Sometimes we add a value directly. Instead we could use |token_val| on the spot but then we
+    also need different range checkers. We use numbers because we don't have the cmd codes defined
+    yet when we're here. so we can't use for instance |token_val (spacer_cmd, 20)| yet.
+*/
+
+# define left_brace_token        token_val( 1, 0) // token_val(left_brace_cmd,0)
+# define right_brace_token       token_val( 2, 0) // token_val(right_brace_cmd,0)
+# define math_shift_token        token_val( 3, 0) // token_val(math_shift_cmd,0)
+# define alignment_token         token_val( 4, 0)
+# define superscript_token       token_val( 7, 0)
+# define subscript_token         token_val( 8, 0)
+# define ignore_token            token_val( 9, 0) // token_val(ignore_cmd,0)
+# define space_token             token_val(10,32) // token_val(spacer_cmd,32)
+# define letter_token            token_val(11, 0) // token_val(letter_cmd,0)
+# define other_token             token_val(12, 0) // token_val(other_char_cmd,0)
+# define active_token            token_val(13, 0)
+
+# define match_token             token_val(19,0)  // token_val(match_cmd,0)
+# define end_match_token         token_val(20,0)  // token_val(end_match_cmd,0)
+
+# define left_brace_limit  right_brace_token
+# define right_brace_limit math_shift_token
+
+# define octal_token             (other_token  + '\'') /*tex apostrophe, indicates an octal constant */
+# define hex_token               (other_token  + '"')  /*tex double quote, indicates a hex constant */
+# define alpha_token             (other_token  + '`')  /*tex reverse apostrophe, precedes alpha constants */
+# define point_token             (other_token  + '.')  /*tex decimal point */
+# define continental_point_token (other_token  + ',')  /*tex decimal point, Eurostyle */
+# define period_token            (other_token  + '.')  /*tex decimal point */
+# define comma_token             (other_token  + ',')  /*tex decimal comma */
+# define plus_token              (other_token  + '+')
+# define minus_token             (other_token  + '-')
+# define slash_token             (other_token  + '/')
+# define asterisk_token          (other_token  + '*')
+# define colon_token             (other_token  + ':')
+# define semi_colon_token        (other_token  + ';')
+# define equal_token             (other_token  + '=')
+# define less_token              (other_token  + '<')
+# define more_token              (other_token  + '>')
+# define exclamation_token_o     (other_token  + '!')
+# define exclamation_token_l     (letter_token + '!')
+# define underscore_token        (other_token  + '_')
+# define underscore_token_o      (other_token  + '_')
+# define underscore_token_l      (letter_token + '_')
+# define circumflex_token        (other_token  + '^')
+# define circumflex_token_o      (other_token  + '^')
+# define circumflex_token_l      (letter_token + '^')
+# define escape_token            (other_token  + '\\')
+# define left_parent_token       (other_token  + '(')
+# define right_parent_token      (other_token  + ')')
+# define zero_token              (other_token  + '0')  /*tex zero, the smallest digit */
+# define five_token              (other_token  + '5')
+# define seven_token             (other_token  + '7')
+# define nine_token              (other_token  + '9')  /*tex zero, the smallest digit */
+
+# define a_token_l               (letter_token + 'a')  /*tex the smallest special hex digit */
+# define a_token_o               (other_token  + 'a')
+
+# define b_token_l               (letter_token + 'b')  /*tex the smallest special hex digit */
+# define b_token_o               (other_token  + 'b')
+
+# define d_token_l               (letter_token + 'd')
+# define d_token_o               (other_token  + 'd')
+
+# define e_token_l               (letter_token + 'e')
+# define e_token_o               (other_token  + 'e')
+
+# define f_token_l               (letter_token + 'f')  /*tex the largest special hex digit */
+# define f_token_o               (other_token  + 'f')
+
+# define i_token_l               (letter_token + 'i')
+# define i_token_o               (other_token  + 'i')
+
+# define l_token_l               (letter_token + 'l')
+# define l_token_o               (other_token  + 'l')
+
+# define m_token_l               (letter_token + 'm')
+# define m_token_o               (other_token  + 'm')
+
+# define n_token_l               (letter_token + 'n')
+# define n_token_o               (other_token  + 'n')
+
+# define o_token_l               (letter_token + 'o')
+# define o_token_o               (other_token  + 'o')
+
+# define p_token_l               (letter_token + 'p')
+# define p_token_o               (other_token  + 'p')
+
+# define r_token_l               (letter_token + 'r')
+# define r_token_o               (other_token  + 'r')
+
+# define s_token_l               (letter_token + 's')
+# define s_token_o               (other_token  + 's')
+
+# define t_token_l               (letter_token + 't')
+# define t_token_o               (other_token  + 't')
+
+# define u_token_l               (letter_token + 'u')
+# define u_token_o               (other_token  + 'u')
+
+# define x_token_l               (letter_token + 'x')
+# define x_token_o               (other_token  + 'x')
+
+# define A_token_l               (letter_token + 'A')  /*tex the smallest special hex digit */
+# define A_token_o               (other_token  + 'A')
+
+# define E_token_l               (letter_token + 'E')
+# define E_token_o               (other_token  + 'E')
+
+# define F_token_l               (letter_token + 'F')  /*tex the largest special hex digit */
+# define F_token_o               (other_token  + 'F')
+
+# define P_token_l               (letter_token + 'P')  /*tex the largest special hex digit */
+# define P_token_o               (other_token  + 'P')
+
+# define X_token_l               (letter_token + 'X')
+# define X_token_o               (other_token  + 'X')
+
+# define at_token_l              (letter_token + '@')
+# define at_token_o              (other_token  + '@')
+
+# define match_visualizer    '#'
+# define match_spacer        '*'  /* ignore spaces */
+# define match_bracekeeper   '+'  /* keep the braces */
+# define match_thrasher      '-'  /* discard and don't count the argument */
+# define match_par_spacer    '.'  /* ignore pars and spaces */
+# define match_keep_spacer   ','  /* push back space when no match */
+# define match_pruner        '/'  /* remove leading and trailing spaces and pars */
+# define match_continuator   ':'  /* pick up scanning here */
+# define match_quitter       ';'  /* quit scanning */
+# define match_mandate       '='  /* braces are mandate */
+# define match_spacekeeper   '^'  /* keep leading spaces */
+# define match_mandate_keep  '_'  /* braces are mandate and kept */
+# define match_par_command   '@'  /* par delimiter, only internal */
+
+# define spacer_match_token        (match_token + match_spacer)
+# define keep_match_token          (match_token + match_bracekeeper)
+# define thrash_match_token        (match_token + match_thrasher)
+# define par_spacer_match_token    (match_token + match_par_spacer)
+# define keep_spacer_match_token   (match_token + match_keep_spacer)
+# define prune_match_token         (match_token + match_pruner)
+# define continue_match_token      (match_token + match_continuator)
+# define quit_match_token          (match_token + match_quitter)
+# define mandate_match_token       (match_token + match_mandate)
+# define leading_match_token       (match_token + match_spacekeeper)
+# define mandate_keep_match_token  (match_token + match_mandate_keep)
+# define par_command_match_token   (match_token + match_par_command)
+
+# define is_valid_match_ref(r) (r != thrash_match_token && r != spacer_match_token && r != keep_spacer_match_token && r != continue_match_token && r != quit_match_token)
+
+/*tex
+    Managing the head of the list of available one-word nodes. The |get_avail| function has been
+    given a more verbose name. It gets from the pool and should not be confused with |get_token|
+    which reads from the input or token list. The |free_avail| function got renamed to
+    |put_available_token| so we have some symmetry here.
+*/
+
+extern void     tex_compact_tokens            (void);
+extern void     tex_initialize_tokens         (void);
+extern void     tex_initialize_token_mem      (void);
+extern halfword tex_get_available_token       (halfword t);
+extern void     tex_put_available_token       (halfword p);
+extern halfword tex_store_new_token           (halfword p, halfword t);
+extern void     tex_delete_token_reference    (halfword p);
+extern void     tex_add_token_reference       (halfword p);
+extern void     tex_increment_token_reference (halfword p, int n);
+
+# define get_reference_token() tex_get_available_token(null)
+
+/*tex
+
+    The |no_expand_flag| is a special character value that is inserted by |get_next| if it wants to
+    suppress expansion.
+
+*/
+
+# define no_expand_flag special_char /* no_expand_relax_code */
+
+/*tex  A few special values: */
+
+# define default_token_show_min 32
+# define default_token_show_max 2500
+# define extreme_token_show_max 0x3FFFFFFF
+
+/*tex  All kind of helpers: */
+
+extern void       tex_dump_token_mem              (dumpstream f);
+extern void       tex_undump_token_mem            (dumpstream f);
+extern void       tex_print_meaning               (halfword code);
+extern void       tex_flush_token_list            (halfword p);
+extern void       tex_flush_token_list_head_tail  (halfword h, halfword t, int n);
+extern halfword   tex_show_token_list             (halfword p, halfword q, int l, int asis); /* Here |l| will go away. */
+extern void       tex_token_show                  (halfword p, int max);
+/*     void       tex_add_token_ref               (halfword p); */
+/*     void       tex_delete_token_ref            (halfword p); */
+extern void       tex_get_next                    (void);
+extern halfword   tex_scan_character              (const char *s, int left_brace, int skip_space, int skip_relax);
+extern int        tex_scan_optional_keyword       (const char *s);
+extern int        tex_scan_mandate_keyword        (const char *s, int offset);
+extern void       tex_aux_show_keyword_error      (const char *s);
+extern int        tex_scan_keyword                (const char *s);
+extern int        tex_scan_keyword_case_sensitive (const char *s);
+extern halfword   tex_active_to_cs                (int c, int force);
+extern halfword   tex_string_to_toks              (const char *s);
+extern int        tex_get_char_cat_code           (int c);
+extern halfword   tex_get_token                   (void);
+extern halfword   tex_str_toks                    (lstring s, halfword *tail); /* returns head */
+extern halfword   tex_cur_str_toks                (halfword *tail);            /* returns head */
+extern halfword   tex_str_scan_toks               (int c, lstring b);          /* returns head */
+extern void       tex_run_combine_the_toks        (void);
+extern void       tex_run_convert_tokens          (halfword code);
+extern strnumber  tex_the_convert_string          (halfword c, int i);
+extern strnumber  tex_tokens_to_string            (halfword p);
+/*     char      *tex_tokenlist_to_cstring        (int p, int inhibit_par, int *siz); */
+extern char      *tex_tokenlist_to_tstring        (int p, int inhibit_par, int *siz, int skip, int nospace, int strip);
+
+extern halfword   tex_get_tex_dimen_register      (int j, int internal);
+extern halfword   tex_get_tex_skip_register       (int j, int internal);
+extern halfword   tex_get_tex_mu_skip_register    (int j, int internal);
+extern halfword   tex_get_tex_count_register      (int j, int internal);
+extern halfword   tex_get_tex_attribute_register  (int j, int internal);
+extern halfword   tex_get_tex_box_register        (int j, int internal);
+extern halfword   tex_get_tex_toks_register       (int j, int internal);
+
+extern void       tex_set_tex_dimen_register      (int j, halfword v, int flags, int internal);
+extern void       tex_set_tex_skip_register       (int j, halfword v, int flags, int internal);
+extern void       tex_set_tex_mu_skip_register    (int j, halfword v, int flags, int internal);
+extern void       tex_set_tex_count_register      (int j, halfword v, int flags, int internal);
+extern void       tex_set_tex_attribute_register  (int j, halfword v, int flags, int internal);
+extern void       tex_set_tex_box_register        (int j, halfword v, int flags, int internal);
+
+extern void       tex_set_tex_toks_register       (int j,        lstring s, int flags, int internal);
+extern void       tex_scan_tex_toks_register      (int j, int c, lstring s, int flags, int internal);
+
+extern halfword   tex_copy_token_list             (halfword h, halfword *t);
+
+extern halfword   tex_parse_str_to_tok            (halfword head, halfword *tail, halfword ct, const char *str, size_t lstr, int option);
+
+inline int tex_valid_token(int t)
+{
+    return ((t >= 0) && (t <= (int) lmt_token_memory_state.tokens_data.top));
+}
+
+# endif