diff options
Diffstat (limited to 'source/luametatex/source/tex/texexpand.c')
-rw-r--r-- | source/luametatex/source/tex/texexpand.c | 1411 |
1 files changed, 1411 insertions, 0 deletions
diff --git a/source/luametatex/source/tex/texexpand.c b/source/luametatex/source/tex/texexpand.c new file mode 100644 index 000000000..25dcccdf3 --- /dev/null +++ b/source/luametatex/source/tex/texexpand.c @@ -0,0 +1,1411 @@ +/* + See license.txt in the root of this project. +*/ + +# include "luametatex.h" + +/*tex + + Only a dozen or so command codes |> max_command| can possibly be returned by |get_next|; in + increasing order, they are |undefined_cs|, |expand_after|, |no_expand|, |input|, |if_test|, + |fi_or_else|, |cs_name|, |convert|, |the|, |get_mark|, |call|, |long_call|, |outer_call|, + |long_outer_call|, and |end_template|. + + Sometimes, recursive calls to the following |expand| routine may cause exhaustion of the + run-time calling stack, resulting in forced execution stops by the operating system. To + diminish the chance of this happening, a counter is used to keep track of the recursion depth, + in conjunction with a constant called |expand_depth|. + + Note that this does not catch all possible infinite recursion loops, just the ones that + exhaust the application calling stack. The actual maximum value of |expand_depth| is outside + of our control, but the initial setting of |100| should be enough to prevent problems. + +*/ + +expand_state_info lmt_expand_state = { + .limits = { + .minimum = min_expand_depth, + .maximum = max_expand_depth, + .size = min_expand_depth, + .top = 0, + }, + .depth = 0, + .cs_name_level = 0, + .arguments = 0, + .match_token_head = null, + .padding = 0, +}; + + static void tex_aux_macro_call (halfword cs, halfword cmd, halfword chr); +inline static void tex_aux_manufacture_csname (void); +inline static void tex_aux_manufacture_csname_use (void); +inline static void tex_aux_manufacture_csname_future (void); +inline static void tex_aux_inject_last_tested_cs (void); + +/*tex + + We no longer store |match_token_head| in the format file. It is a bit cleaner to just + initialize them. So we free them. 
+ +*/ + +void tex_initialize_expansion(void) +{ + lmt_expand_state.match_token_head = tex_get_available_token(null); +} + +void tex_cleanup_expansion(void) +{ + tex_put_available_token(lmt_expand_state.match_token_head); +} + +halfword tex_expand_match_token_head(void) +{ + return lmt_expand_state.match_token_head; +} + +/*tex + + The |expand| subroutine is used when |cur_cmd > max_command|. It removes a \quote {call} or a + conditional or one of the other special operations just listed. It follows that |expand| might + invoke itself recursively. In all cases, |expand| destroys the current token, but it sets things + up so that the next |get_next| will deliver the appropriate next token. The value of |cur_tok| + need not be known when |expand| is called. + + Since several of the basic scanning routines communicate via global variables, their values are + saved as local variables of |expand| so that recursive calls don't invalidate them. + +*/ + +inline static void tex_aux_expand_after(void) +{ + /*tex + Expand the token after the next token. It takes only a little shuffling to do what \TEX\ + calls |\expandafter|. + */ + halfword t1 = tex_get_token(); + halfword t2 = tex_get_token(); + if (cur_cmd > max_command_cmd) { + tex_expand_current_token(); + } else { + tex_back_input(t2); + } + tex_back_input(t1); +} + +inline static void tex_aux_expand_toks_after(void) +{ + halfword t1 = tex_scan_toks_normal(0, NULL); + halfword t2 = tex_get_token(); + if (cur_cmd > max_command_cmd) { + tex_expand_current_token(); + } else { + tex_back_input(t2); + } + tex_begin_backed_up_list(token_link(t1)); + tex_put_available_token(t1); +} + +/*tex + Here we deal with stuff not in the big switch. Where that is discussed there is mentioning of + it all being a bit messy, also due to the fact that that switch (or actually a lookup table) + also uses the mode for determining what to do. We see no reason to change this model. 
+*/ + +void tex_expand_current_token(void) +{ + ++lmt_expand_state.depth; + if (lmt_expand_state.depth > lmt_expand_state.limits.top) { + if (lmt_expand_state.depth >= lmt_expand_state.limits.size) { + tex_overflow_error("expansion depth", lmt_expand_state.limits.size); + } else { + lmt_expand_state.limits.top += 1; + } + } + /*tex We're okay. */ + { + halfword saved_cur_val = cur_val; + halfword saved_cur_val_level = cur_val_level; + // halfword saved_head = token_link(token_data.backup_head); + if (cur_cmd < first_call_cmd) { + /*tex Expand a nonmacro. */ + if (tracing_commands_par > 1) { + tex_show_cmd_chr(cur_cmd, cur_chr); + } + switch (cur_cmd) { + case expand_after_cmd: + { + int mode = cur_chr; + switch (mode) { + case expand_after_code: + tex_aux_expand_after(); + break; + /* + case expand_after_3_code: + tex_aux_expand_after(); + // fall-through + case expand_after_2_code: + tex_aux_expand_after(); + tex_aux_expand_after(); + break; + */ + case expand_unless_code: + tex_conditional_unless(); + break; + case future_expand_code: + /*tex + This is an experiment: |\futureexpand| (2) which takes |\check \yes + \nop| as arguments. It's not faster, but gives less tracing noise + than a macro. The variant |\futureexpandis| (3) alternative doesn't + inject the gobbles space(s). + */ + tex_get_token(); + { + halfword spa = null; + halfword chr = cur_chr; + halfword cmd = cur_cmd; + halfword yes = tex_get_token(); /* when match */ + halfword nop = tex_get_token(); /* when no match */ + while (1) { + halfword t = tex_get_token(); + if (cur_cmd == spacer_cmd) { + spa = t; + } else { + tex_back_input(t); + break; + } + } + /*tex The value 1 means: same input level. 
*/ + if (cur_cmd == cmd && cur_chr == chr) { + tex_reinsert_token(yes); + } else { + if (spa) { + tex_reinsert_token(space_token); + } + tex_reinsert_token(nop); + } + } + break; + case future_expand_is_code: + tex_get_token(); + { + halfword chr = cur_chr; + halfword cmd = cur_cmd; + halfword yes = tex_get_token(); /* when match */ + halfword nop = tex_get_token(); /* when no match */ + while (1) { + halfword t = tex_get_token(); + if (cur_cmd != spacer_cmd) { + tex_back_input(t); + break; + } + } + tex_reinsert_token((cur_cmd == cmd && cur_chr == chr) ? yes : nop); + } + break; + case future_expand_is_ap_code: + tex_get_token(); + { + halfword chr = cur_chr; + halfword cmd = cur_cmd; + halfword yes = tex_get_token(); /* when match */ + halfword nop = tex_get_token(); /* when no match */ + while (1) { + halfword t = tex_get_token(); + if (cur_cmd != spacer_cmd && cur_cmd != end_paragraph_cmd) { + tex_back_input(t); + break; + } + } + /*tex We stay at the same input level. */ + tex_reinsert_token((cur_cmd == cmd && cur_chr == chr) ? yes : nop); + } + break; + case expand_after_spaces_code: + { + /* maybe two variants: after_spaces and after_par like in the ignores */ + halfword t1 = tex_get_token(); + while (1) { + halfword t2 = tex_get_token(); + if (cur_cmd != spacer_cmd) { + tex_back_input(t2); + break; + } + } + tex_reinsert_token(t1); + break; + } + case expand_after_pars_code: + { + halfword t1 = tex_get_token(); + while (1) { + halfword t2 = tex_get_token(); + if (cur_cmd != spacer_cmd && cur_cmd != end_paragraph_cmd) { + tex_back_input(t2); + break; + } + } + tex_reinsert_token(t1); + break; + } + case expand_token_code: + { + /* we can share code with lmtokenlib .. 
todo */ + halfword cat = tex_scan_category_code(); + halfword chr = tex_scan_char_number(0); + /* too fragile: + halfword tok = null; + switch (cat) { + case letter_cmd: + case other_char_cmd: + case ignore_cmd: + case spacer_cmd: + tok = token_val(cat, chr); + break; + case active_char_cmd: + { + halfword cs = tex_active_to_cs(chr, ! lmt_hash_state.no_new_cs); + if (cs) { + chr = eq_value(cs); + tok = cs_token_flag + cs; + break; + } + } + default: + tok = token_val(other_char_cmd, chr); + break; + } + */ + switch (cat) { + case letter_cmd: + case other_char_cmd: + case ignore_cmd: + case spacer_cmd: + break; + default: + cat = other_char_cmd; + break; + } + tex_back_input(token_val(cat, chr)); + break; + } + case expand_cs_token_code: + { + tex_get_token(); + if (cur_tok >= cs_token_flag) { + halfword cmd = eq_type(cur_cs); + switch (cmd) { + case left_brace_cmd: + case right_brace_cmd: + case math_shift_cmd: + case alignment_tab_cmd: + case superscript_cmd: + case subscript_cmd: + case spacer_cmd: + case letter_cmd: + case other_char_cmd: + cur_tok = token_val(cmd, eq_value(cur_cs)); + break; + } + } + tex_back_input(cur_tok); + break; + } + case expand_code: + { + tex_get_token(); + if (cur_cmd >= first_call_cmd && cur_cmd <= last_call_cmd) { + tex_aux_macro_call(cur_cs, cur_cmd, cur_chr); + } else { + /* Use expand_current_token so that protected lua call are dealt with too? */ + tex_back_input(cur_tok); + } + break; + } + case semi_expand_code: + { + tex_get_token(); + if (is_semi_protected_cmd(cur_cmd)) { + tex_aux_macro_call(cur_cs, cur_cmd, cur_chr); + } else { + /* Use expand_current_token so that protected lua call are dealt with too? */ + tex_back_input(cur_tok); + } + break; + } + case expand_after_toks_code: + { + tex_aux_expand_toks_after(); + break; + } + /* + case expand_after_fi: + { + conditional_after_fi(); + break; + } + */ + } + } + break; + case cs_name_cmd: + /*tex Manufacture a control sequence name. 
*/ + switch (cur_chr) { + case cs_name_code: + tex_aux_manufacture_csname(); + break; + case last_named_cs_code: + tex_aux_inject_last_tested_cs(); + break; + case begin_cs_name_code: + tex_aux_manufacture_csname_use(); + break; + case future_cs_name_code: + tex_aux_manufacture_csname_future(); + break; + } + break; + case no_expand_cmd: + { + /*tex + Suppress expansion of the next token. The implementation of |\noexpand| + is a bit trickier, because it is necessary to insert a special + |dont_expand| marker into \TEX's reading mechanism. This special marker + is processed by |get_next|, but it does not slow down the inner loop. + + Since |\outer| macros might arise here, we must also clear the + |scanner_status| temporarily. + */ + halfword t; + halfword save_scanner_status = lmt_input_state.scanner_status; + lmt_input_state.scanner_status = scanner_is_normal; + t = tex_get_token(); + lmt_input_state.scanner_status = save_scanner_status; + tex_back_input(t); + /*tex Now |start| and |loc| point to the backed-up token |t|. */ + if (t >= cs_token_flag) { + halfword p = tex_get_available_token(deep_frozen_dont_expand_token); + set_token_link(p, lmt_input_state.cur_input.loc); + lmt_input_state.cur_input.start = p; + lmt_input_state.cur_input.loc = p; + } + } + break; + case if_test_cmd: + if (cur_chr < first_real_if_test_code) { + tex_conditional_fi_or_else(); + } else if (cur_chr != if_condition_code) { + tex_conditional_if(cur_chr, 0); + } else { + /*tex The |\ifcondition| primitive is a no-op unless we're in skipping mode. 
*/ + } + break; + case the_cmd: + { + halfword h = tex_the_toks(cur_chr, NULL); + tex_begin_inserted_list(h); + break; + } + case lua_call_cmd: + if (cur_chr > 0) { + strnumber u = tex_save_cur_string(); + lmt_token_state.luacstrings = 0; + lmt_function_call(cur_chr, 0); + tex_restore_cur_string(u); + if (lmt_token_state.luacstrings > 0) { + tex_lua_string_start(); + } + } else { + tex_normal_error("luacall", "invalid number"); + } + break; + case lua_local_call_cmd: + if (cur_chr > 0) { + lua_State *L = lmt_lua_state.lua_instance; + strnumber u = tex_save_cur_string(); + lmt_token_state.luacstrings = 0; + /* todo: use a private table as we can overflow, unless we register early */ + lua_rawgeti(L, LUA_REGISTRYINDEX, cur_chr); + if (lua_pcall(L, 0, 0, 0)) { + tex_formatted_warning("luacall", "local call error: %s", lua_tostring(L, -1)); + } else { + tex_restore_cur_string(u); + if (lmt_token_state.luacstrings > 0) { + tex_lua_string_start(); + } + } + } else { + tex_normal_error("luacall", "invalid number"); + } + break; + case begin_local_cmd: + tex_begin_local_control(); + break; + case convert_cmd: + tex_run_convert_tokens(cur_chr); + break; + case input_cmd: + /*tex Initiate or terminate input from a file */ + switch (cur_chr) { + case normal_input_code: + if (lmt_fileio_state.name_in_progress) { + tex_insert_relax_and_cur_cs(); + } else { + tex_start_input(tex_read_file_name(0, NULL, texinput_extension)); + } + break; + case end_of_input_code: + lmt_token_state.force_eof = 1; + break; + case quit_loop_code: + lmt_main_control_state.quit_loop = 1; + break; + case token_input_code: + tex_tex_string_start(io_token_eof_input_code, cat_code_table_par); + break; + case tex_token_input_code: + tex_tex_string_start(io_token_input_code, cat_code_table_par); + break; + case tokenized_code: + case retokenized_code: + { + /*tex + This variant complements the other expandable primitives but + also supports an optional keyword, who knows when that comes in + handy; what 
goes in is detokenized anyway. For now it is an + undocumented feature. It is likely that there is a |cct| passed + so we don't need to optimize. If needed we can make a version + where this is mandate. + */ + int cattable = (cur_chr == retokenized_code || tex_scan_optional_keyword("catcodetable")) ? tex_scan_int(0, NULL) : cat_code_table_par; + full_scanner_status saved_full_status = tex_save_full_scanner_status(); + strnumber u = tex_save_cur_string(); + halfword s = tex_scan_toks_expand(0, NULL, 0); + tex_unsave_full_scanner_status(saved_full_status); + if (token_link(s)) { + tex_begin_inserted_list(tex_wrapped_token_list(s)); + tex_tex_string_start(io_token_input_code, cattable); + } + tex_put_available_token(s); + tex_restore_cur_string(u); + } + break; + default: + break; + } + break; + case get_mark_cmd: + { + /*tex Insert the appropriate mark text into the scanner. */ + halfword num = 0; + halfword code = cur_chr; + switch (code) { + case top_marks_code: + case first_marks_code: + case bot_marks_code: + case split_first_marks_code: + case split_bot_marks_code: + case current_marks_code: + num = tex_scan_mark_number(); + break; + } + if (tex_valid_mark(num)) { + halfword ptr = tex_get_some_mark(code, num); + if (ptr) { + tex_begin_token_list(ptr, mark_text); + } + } + break; + } + /* + case string_cmd: + { + halfword head = str_toks(str_lstring(cs_offset_value + cur_chr), NULL); + begin_inserted_list(head); + break; + } + */ + default: + /* Maybe ... or maybe an option */ + // if (lmt_expand_state.cs_name_level == 0) { + /*tex Complain about an undefined macro */ + tex_handle_error( + normal_error_type, + "Undefined control sequence %m", cur_cs, + "The control sequence at the end of the top line of your error message was never\n" + "\\def'ed. You can just continue as I'll forget about whatever was undefined." 
+ ); + // } + break; + } + } else if (cur_cmd <= last_call_cmd) { + tex_aux_macro_call(cur_cs, cur_cmd, cur_chr); + } else { + /*tex + Insert a token containing |frozen_endv|. An |end_template| command is effectively + changed to an |endv| command by the following code. (The reason for this is discussed + below; the |frozen_end_template| at the end of the template has passed the + |check_outer_validity| test, so its mission of error detection has been accomplished.) + */ + tex_back_input(deep_frozen_end_template_2_token); + } + cur_val = saved_cur_val; + cur_val_level = saved_cur_val_level; + // set_token_link(token_data.backup_head, saved_head); + } + --lmt_expand_state.depth; +} + +static void tex_aux_complain_missing_csname(void) +{ + tex_handle_error( + back_error_type, + "Missing \\endcsname inserted", + "The control sequence marked <to be read again> should not appear between \\csname\n" + "and \\endcsname." + ); +} + +inline static int tex_aux_uni_to_buffer(unsigned char *b, int m, int c) +{ + if (c <= 0x7F) { + b[m++] = (unsigned char) c; + } else if (c <= 0x7FF) { + b[m++] = (unsigned char) (0xC0 + c / 0x40); + b[m++] = (unsigned char) (0x80 + c % 0x40); + } else if (c <= 0xFFFF) { + b[m++] = (unsigned char) (0xE0 + c / 0x1000); + b[m++] = (unsigned char) (0x80 + (c % 0x1000) / 0x40); + b[m++] = (unsigned char) (0x80 + (c % 0x1000) % 0x40); + } else { + b[m++] = (unsigned char) (0xF0 + c / 0x40000); + b[m++] = (unsigned char) (0x80 + ( c % 0x40000) / 0x1000); + b[m++] = (unsigned char) (0x80 + ((c % 0x40000) % 0x1000) / 0x40); + b[m++] = (unsigned char) (0x80 + ((c % 0x40000) % 0x1000) % 0x40); + } + return m; +} + +/*tex + We also quit on a protected macro call, which is different from \LUATEX\ (and \PDFTEX) but makes + much sense. It also long token lists that never (should) match anyway. 
+*/ + + +static int tex_aux_collect_cs_tokens(halfword *p, int *n) +{ + while (1) { + tex_get_next(); + switch (cur_cmd) { + case left_brace_cmd: + case right_brace_cmd: + case math_shift_cmd: + case alignment_tab_cmd: + /* case end_line_cmd: */ + case parameter_cmd: + case superscript_cmd: + case subscript_cmd: + /* case ignore_cmd: */ + case spacer_cmd: + case letter_cmd: + case other_char_cmd: + // cur_tok = token_val(cur_cmd, cur_chr); + // *p = tex_store_new_token(*p, cur_tok); + *p = tex_store_new_token(*p, token_val(cur_cmd, cur_chr)); + *n += 1; + break; + /* case active_char_cmd: */ + /* case comment_cmd: */ + /* case invalid_char_cmd: */ + /* + case string_cmd: + cur_tok = token_val(cur_cmd, cur_chr); + *p = store_new_token(*p, cur_tok); + *n += str_length(cs_offset_value + cur_chr); + break; + */ + case call_cmd: + case tolerant_call_cmd: + tex_aux_macro_call(cur_cs, cur_cmd, cur_chr); + break; + case end_cs_name_cmd: + return 1; + default: + if (cur_cmd > max_command_cmd && cur_cmd < first_call_cmd) { + tex_expand_current_token(); + } else { + return 0; + } + } + } +} + +int tex_is_valid_csname(void) +{ + halfword cs = null_cs; + int b = 0; + int n = 0; + halfword h = tex_get_available_token(null); + halfword p = h; + lmt_expand_state.cs_name_level += 1; + if (! tex_aux_collect_cs_tokens(&p, &n)) { + do { + tex_get_x_or_protected(); /* we skip unprotected ! */ + } while (cur_cmd != end_cs_name_cmd); + goto FINISH; + } else if (n) { + /*tex Look up the characters of list |n| in the hash table, and set |cur_cs|. */ + int f = lmt_fileio_state.io_first; + if (tex_room_in_buffer(f + n * 4)) { + int m = f; + halfword l = token_link(h); + while (l) { + m = tex_aux_uni_to_buffer(lmt_fileio_state.io_buffer, m, token_chr(token_info(l))); + l = token_link(l); + } + cs = tex_id_locate(f, m - f, 0); /*tex Don't create a new cs! 
*/ + b = (cs != undefined_control_sequence) && (eq_type(cs) != undefined_cs_cmd); + } + } + FINISH: + tex_flush_token_list_head_tail(h, p, n + 1); + lmt_scanner_state.last_cs_name = cs; + lmt_expand_state.cs_name_level -= 1; + cur_cs = cs; + return b; +} + +inline static halfword tex_aux_get_cs_name(void) +{ + halfword h = tex_get_available_token(null); /* hm */ + halfword p = h; + int n = 0; + lmt_expand_state.cs_name_level += 1; + if (tex_aux_collect_cs_tokens(&p, &n)) { + /*tex Look up the characters of list |r| in the hash table, and set |cur_cs|. */ + int siz; + char *s = tex_tokenlist_to_tstring(h, 1, &siz, 0, 0, 0); + cur_cs = (siz > 0) ? tex_string_locate((char *) s, siz, 1) : null_cs; + } else { + tex_aux_complain_missing_csname(); + } + lmt_scanner_state.last_cs_name = cur_cs; + lmt_expand_state.cs_name_level -= 1; + tex_flush_token_list_head_tail(h, p, n); + return cur_cs; +} + +inline static void tex_aux_manufacture_csname(void) +{ + halfword cs = tex_aux_get_cs_name(); + if (eq_type(cs) == undefined_cs_cmd) { + /*tex The |save_stack| might change! 
*/ + tex_eq_define(cs, relax_cmd, relax_code); + } + /*tex The control sequence will now match |\relax| */ + tex_back_input(cs + cs_token_flag); +} + +inline static void tex_aux_manufacture_csname_use(void) +{ + if (tex_is_valid_csname()) { + tex_back_input(cur_cs + cs_token_flag); + } else { + lmt_scanner_state.last_cs_name = deep_frozen_relax_token; + } +} + +inline static void tex_aux_manufacture_csname_future(void) +{ + halfword t = tex_get_token(); + if (tex_is_valid_csname()) { + tex_back_input(cur_cs + cs_token_flag); + } else { + lmt_scanner_state.last_cs_name = deep_frozen_relax_token; + tex_back_input(t); + } +} + +halfword tex_create_csname(void) +{ + halfword cs = tex_aux_get_cs_name(); + if (eq_type(cs) == undefined_cs_cmd) { + tex_eq_define(cs, relax_cmd, relax_code); + } + return cs; // cs + cs_token_flag; +} + +inline static void tex_aux_inject_last_tested_cs(void) +{ + if (lmt_scanner_state.last_cs_name != null_cs) { + tex_back_input(lmt_scanner_state.last_cs_name + cs_token_flag); + } +} + +/*tex + + Sometimes the expansion looks too far ahead, so we want to insert a harmless |\relax| into the + user's input. +*/ + +void tex_insert_relax_and_cur_cs(void) +{ + tex_back_input(cs_token_flag + cur_cs); + tex_reinsert_token(deep_frozen_relax_token); + lmt_input_state.cur_input.token_type = inserted_text; +} + +/*tex + + Here is a recursive procedure that is \TEX's usual way to get the next token of input. It has + been slightly optimized to take account of common cases. + +*/ + +halfword tex_get_x_token(void) +{ + /*tex This code sets |cur_cmd|, |cur_chr|, |cur_tok|, and expands macros. 
*/ + while (1) { + tex_get_next(); + if (cur_cmd <= max_command_cmd) { + break; + } else if (cur_cmd < first_call_cmd) { + tex_expand_current_token(); + } else if (cur_cmd <= last_call_cmd) { + tex_aux_macro_call(cur_cs, cur_cmd, cur_chr); + } else { + cur_cs = deep_frozen_cs_end_template_2_code; + cur_cmd = end_template_cmd; + /*tex Now |cur_chr = token_state.null_list|. */ + break; + } + } + if (cur_cs) { + cur_tok = cs_token_flag + cur_cs; + } else { + cur_tok = token_val(cur_cmd, cur_chr); + } + return cur_tok; +} + +/*tex + + The |get_x_token| procedure is equivalent to two consecutive procedure calls: |get_next; x_token|. + It's |get_x_token| without the initial |get_next|. + +*/ + +void tex_x_token(void) +{ + while (cur_cmd > max_command_cmd) { + tex_expand_current_token(); + tex_get_next(); + } + if (cur_cs) { + cur_tok = cs_token_flag + cur_cs; + } else { + cur_tok = token_val(cur_cmd, cur_chr); + } +} + +/*tex + + A control sequence that has been |\def|'ed by the user is expanded by \TEX's |macro_call| + procedure. Here we also need to deal with marks, but these are discussed elsewhere. + + So let's consider |macro_call| itself, which is invoked when \TEX\ is scanning a control + sequence whose |cur_cmd| is either |call|, |long_call|, |outer_call|, or |long_outer_call|. The + control sequence definition appears in the token list whose reference count is in location + |cur_chr| of |mem|. + + The global variable |long_state| will be set to |call| or to |long_call|, depending on whether + or not the control sequence disallows |\par| in its parameters. The |get_next| routine will set + |long_state| to |outer_call| and emit |\par|, if a file ends or if an |\outer| control sequence + occurs in the midst of an argument. + + The parameters, if any, must be scanned before the macro is expanded. Parameters are token + lists without reference counts. 
They are placed on an auxiliary stack called |pstack| while + they are being scanned, since the |param_stack| may be losing entries during the matching + process. (Note that |param_stack| can't be gaining entries, since |macro_call| is the only + routine that puts anything onto |param_stack|, and it is not recursive.) + + After parameter scanning is complete, the parameters are moved to the |param_stack|. Then the + macro body is fed to the scanner; in other words, |macro_call| places the defined text of the + control sequence at the top of \TEX's input stack, so that |get_next| will proceed to read it + next. + + The global variable |cur_cs| contains the |eqtb| address of the control sequence being expanded, + when |macro_call| begins. If this control sequence has not been declared |\long|, i.e., if its + command code in the |eq_type| field is not |long_call| or |long_outer_call|, its parameters are + not allowed to contain the control sequence |\par|. If an illegal |\par| appears, the macro call + is aborted, and the |\par| will be rescanned. + + Beware: we cannot use |cur_cmd| here because for instance |\bgroup| can be part of an argument + without there being an |\egroup|. We really need to check raw brace tokens (|{}|) here when we + pick up an argument! + + */ + +/*tex + + In \LUAMETATEX\ we have an extended argument definition system. The approach is still the same + and the additional code kind of fits in. There is a bit more testing going on but the overhead + is kept at a minimum so performance is not hit. Macro packages like \CONTEXT\ spend a lot of + time expanding and the extra overhead of the extensions is compensated by some gain in using + them. However, the most important motive is in readability of macro code on the one hand and + the wish for less tracing (due to all this multi-step processing) on the other. It suits me + well. This is definitely a case of |goto| abuse. 
+ +*/ + +static halfword tex_aux_prune_list(halfword h) +{ + halfword t = h; + halfword p = null; + int done = 0; + int last = null; + while (t) { + halfword l = token_link(t); + halfword i = token_info(t); + halfword c = token_cmd(i); + if (c != spacer_cmd && c != end_paragraph_cmd && i != lmt_token_state.par_token) { // c != 0xFF + done = 1; + last = null; + } else if (done) { + if (! last) { + last = p; /* before space */ + } + } else { + h = l; + tex_put_available_token(t); + } + p = t; + t = l; + } + if (last) { + halfword l = token_link(last); + token_link(last) = null; + tex_flush_token_list(l); + } + return h; +} + +int tex_get_parameter_count(void) +{ + int n = 0; + for (int i = lmt_input_state.cur_input.parameter_start; i < lmt_input_state.parameter_stack_data.ptr; i++) { + if (lmt_input_state.parameter_stack[i]) { + ++n; + } else { + break; + } + } + return n; +} + +static void tex_aux_macro_call(halfword cs, halfword cmd, halfword chr) +{ + int tracing = tracing_macros_par > 0; + if (tracing) { + /*tex + Setting |\tracingmacros| to 2 means that elsewhere marks etc are shown so in fact a bit + more detail. However, as we turn that on anyway, using a value of 3 is not that weird + for less info here. Introducing an extra parameter makes no sense. + */ + tex_begin_diagnostic(); + tex_print_cs_checked(cs); + if (is_untraced(eq_flag(cs))) { + tracing = 0; + } else { + if (! get_token_parameters(chr)) { + tex_print_str("->"); + } else { + /* maybe move the preamble scanner to here */ + } + tex_token_show(chr, default_token_show_max); + } + tex_end_diagnostic(); + } + if (get_token_parameters(chr)) { + halfword matchpointer = token_link(chr); + halfword matchtoken = token_info(matchpointer); + int save_scanner_status = lmt_input_state.scanner_status; + halfword save_warning_index = lmt_input_state.warning_index; + int nofscanned = 0; + int nofarguments = 0; + halfword pstack[9]; /* We could go for 15 if we accept |#A-#F|. 
*/ + /*tex + Scan the parameters and make |link(r)| point to the macro body; but |return| if an + illegal |\par| is detected. + + At this point, the reader will find it advisable to review the explanation of token + list format that was presented earlier, since many aspects of that format are of + importance chiefly in the |macro_call| routine. + + The token list might begin with a string of compulsory tokens before the first + |match| or |end_match|. In that case the macro name is supposed to be followed by + those tokens; the following program will set |s=null| to represent this restriction. + Otherwise |s| will be set to the first token of a string that will delimit the next + parameter. + */ + int tolerant = is_tolerant_cmd(cmd); + /*tex the number of tokens or groups (usually) */ + halfword count = 0; + /*tex one step before the last |right_brace| token */ + halfword rightbrace = null; + /*tex the state, currently the character used in parameter */ + int match = 0; + int thrash = 0; + int quitting = 0; + int last = 0; + /*tex current node in parameter token list being built */ + halfword p = null; + /*tex backup pointer for parameter matching */ + halfword s = null; + int spacer = 0; + /*tex + One day I will check the next code for too many tests, no that much branching that it. + The numbers in |#n| are match tokens except the last one, which is has a different + token info. + */ + lmt_input_state.warning_index = cs; + lmt_input_state.scanner_status = tolerant ? scanner_is_tolerant : scanner_is_matching; + /* */ + do { + /*tex + So, can we use a local head here? After all, there is no expansion going on here, + so no need to access |temp_token_head|. On the other hand, it's also used as a + signal, so not now. 
+ */ + RESTART: + set_token_link(lmt_expand_state.match_token_head, null); + AGAIN: + spacer = 0; + LATER: + if (matchtoken < match_token || matchtoken >= end_match_token) { + s = null; + } else { + switch (matchtoken) { + case spacer_match_token: + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + do { + tex_get_token(); + } while (cur_cmd == spacer_cmd); + last = 1; + goto AGAIN; + case mandate_match_token: + match = match_mandate; + goto MANDATE; + case mandate_keep_match_token: + match = match_bracekeeper; + MANDATE: + if (last) { + last = 0; + } else { + tex_get_token(); + last = 1; + } + if (cur_tok < left_brace_limit) { + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + s = matchpointer; + p = lmt_expand_state.match_token_head; + count = 0; + last = 0; + goto GROUPED; + } else { + if (tolerant) { + last = 0; + nofarguments = nofscanned; + tex_back_input(cur_tok); + goto QUITTING; + } else { + last = 0; + tex_back_input(cur_tok); + } + s = null; + goto BAD; + } + break; + case thrash_match_token: + match = 0; + thrash = 1; + break; + case leading_match_token: + match = match_spacekeeper; + break; + case prune_match_token: + match = match_pruner; + break; + case continue_match_token: + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + goto AGAIN; + case quit_match_token: + match = match_quitter; + if (tolerant) { + last = 0; + nofarguments = nofscanned; + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + goto QUITTING; + } else { + break; + } + case par_spacer_match_token: + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + do { + /* discard as we go */ + tex_get_token(); + } while (cur_cmd == spacer_cmd || cur_cmd == end_paragraph_cmd); + last = 1; + goto AGAIN; + case keep_spacer_match_token: + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + do { + 
tex_get_token(); + if (cur_cmd == spacer_cmd) { + spacer = 1; + } else { + break; + } + } while (1); + last = 1; + goto LATER; + case par_command_match_token: + /* this discards till the next par token */ + do { + tex_get_token(); + } while (cur_cmd != end_paragraph_cmd); + goto DELIMITER; + default: + match = matchtoken - match_token; + break; + } + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + s = matchpointer; + p = lmt_expand_state.match_token_head; + count = 0; + } + /*tex + Scan a parameter until its delimiter string has been found; or, if |s = null|, + simply scan the delimiter string. If |info(r)| is a |match| or |end_match| + command, it cannot be equal to any token found by |get_token|. Therefore an + undelimited parameter --- i.e., a |match| that is immediately followed by + |match| or |end_match| --- will always fail the test |cur_tok=info(r)| in the + following algorithm. + */ + CONTINUE: + /*tex Set |cur_tok| to the next token of input. */ + if (last) { + last = 0; + } else { + tex_get_token(); + } + /* is token_cmd reliable here? */ + if (! count && token_cmd(matchtoken) == ignore_cmd) { + if (cur_cmd < ignore_cmd || cur_cmd > other_char_cmd || cur_chr != token_chr(matchtoken)) { + /*tex We could optimize this but it doesn't pay off now. */ + tex_back_input(cur_tok); + } + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + if (s) { + s = matchpointer; + } + goto AGAIN; + } + if (cur_tok == matchtoken) { + /*tex + When we end up here we have a match on a delimiter. Advance |r|; |goto found| + if the parameter delimiter has been fully matched, otherwise |goto continue|. + A slightly subtle point arises here: When the parameter delimiter ends with + |#|, the token list will have a left brace both before and after the + |end_match|. Only one of these should affect the |align_state|, but both will + be scanned, so we must make a correction. 
+ */ + DELIMITER: + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + if (matchtoken >= match_token && matchtoken <= end_match_token) { + if (cur_tok < left_brace_limit) { + --lmt_input_state.align_state; + } + goto FOUND; + } else { + goto CONTINUE; + } + } else if (cur_cmd == ignore_something_cmd && cur_chr == ignore_argument_code) { + quitting = count ? 1 : count ? 2 : 3; + goto FOUND; + } + /*tex + Contribute the recently matched tokens to the current parameter, and |goto continue| + if a partial match is still in effect; but abort if |s = null|. + + When the following code becomes active, we have matched tokens from |s| to the + predecessor of |r|, and we have found that |cur_tok <> info(r)|. An interesting + situation now presents itself: If the parameter is to be delimited by a string such + as |ab|, and if we have scanned |aa|, we want to contribute one |a| to the current + parameter and resume looking for a |b|. The program must account for such partial + matches and for others that can be quite complex. But most of the time we have + |s = r| and nothing needs to be done. + + Incidentally, it is possible for |\par| tokens to sneak in to certain parameters of + non-|\long| macros. For example, consider a case like |\def\a#1\par!{...}| where + the first |\par| is not followed by an exclamation point. In such situations it + does not seem appropriate to prohibit the |\par|, so \TEX\ keeps quiet about this + bending of the rules. + */ + if (s != matchpointer) { + BAD: + if (tolerant) { + quitting = nofscanned ? 1 : count ? 
2 : 3; + tex_back_input(cur_tok); + // last = 0; + goto FOUND; + } else if (s) { + /*tex cycle pointer for backup recovery */ + halfword t = s; + do { + halfword u, v; + if (match) { + p = tex_store_new_token(p, token_info(t)); + } + ++count; + u = token_link(t); + v = s; + while (1) { + if (u == matchpointer) { + if (cur_tok != token_info(v)) { + break; + } else { + matchpointer = token_link(v); + matchtoken = token_info(matchpointer); + goto CONTINUE; + } + } + if (token_info(u) != token_info(v)) { + break; + } else { + u = token_link(u); + v = token_link(v); + } + } + t = token_link(t); + } while (t != matchpointer); + matchpointer = s; + matchtoken = token_info(matchpointer); + /*tex At this point, no tokens are recently matched. */ + } else { + tex_handle_error( + normal_error_type, + "Use of %S doesn't match its definition", + lmt_input_state.warning_index, + "If you say, e.g., '\\def\\a1{...}', then you must always put '1' after '\\a',\n" + "since control sequence names are made up of letters only. The macro here has not\n" + "been followed by the required stuff, so I'm ignoring it." + ); + goto EXIT; + } + } + GROUPED: + if (cur_tok < left_brace_limit) { + /*tex Contribute an entire group to the current parameter. */ + int unbalance = 0; + while (1) { + if (match) { + p = tex_store_new_token(p, cur_tok); + } + if (last) { + last = 0; + } else { + tex_get_token(); + } + if (cur_tok < right_brace_limit) { + if (cur_tok < left_brace_limit) { + ++unbalance; + } else if (unbalance) { + --unbalance; + } else { + break; + } + } + } + rightbrace = p; + if (match) { + p = tex_store_new_token(p, cur_tok); + } + } else if (cur_tok < right_brace_limit) { + /*tex Report an extra right brace and |goto continue|. 
*/ + tex_back_input(cur_tok); + /* moved up: */ + ++lmt_input_state.align_state; + tex_insert_paragraph_token(); + /* till here */ + tex_handle_error( + insert_error_type, + "Argument of %S has an extra }", + lmt_input_state.warning_index, + "I've run across a '}' that doesn't seem to match anything. For example,\n" + "'\\def\\a#1{...}' and '\\a}' would produce this error. The '\\par' that I've just\n" + "inserted will cause me to report a runaway argument that might be the root of the\n" + "problem." ); + goto CONTINUE; + /*tex A white lie; the |\par| won't always trigger a runaway. */ + } else { + /*tex + Store the current token, but |goto continue| if it is a blank space that would + become an undelimited parameter. + */ + if (cur_tok == space_token && matchtoken <= end_match_token && matchtoken >= match_token && matchtoken != leading_match_token) { + goto CONTINUE; + } + if (match) { + p = tex_store_new_token(p, cur_tok); + } + } + ++count; + if (matchtoken > end_match_token || matchtoken < match_token) { + goto CONTINUE; + } + FOUND: + if (s) { + /* + Tidy up the parameter just scanned, and tuck it away. If the parameter consists + of a single group enclosed in braces, we must strip off the enclosing braces. + That's why |rightbrace| was introduced. Actually, in most cases |m == 1|. + */ + if (! 
thrash) { + if (token_info(p) < right_brace_limit && count == 1 && p != lmt_expand_state.match_token_head && match != match_bracekeeper) { + set_token_link(rightbrace, null); + tex_put_available_token(p); + p = token_link(lmt_expand_state.match_token_head); + pstack[nofscanned] = token_link(p); + tex_put_available_token(p); + } else { + pstack[nofscanned] = token_link(lmt_expand_state.match_token_head); + } + if (match == match_pruner) { + pstack[nofscanned] = tex_aux_prune_list(pstack[nofscanned]); + } + ++nofscanned; + if (tracing) { + tex_begin_diagnostic(); + tex_print_format("%c%i<-", match_visualizer, nofscanned); + tex_show_token_list(pstack[nofscanned - 1], null, default_token_show_max, 0); + tex_end_diagnostic(); + } + } else { + thrash = 0; + } + } + /*tex + Now |info(r)| is a token whose command code is either |match| or |end_match|. + */ + if (quitting) { + nofarguments = quitting == 3 ? 0 : quitting == 2 && count == 0 ? 0 : nofscanned; + QUITTING: + if (spacer) { + tex_back_input(space_token); /* experiment */ + } + while (1) { + switch (matchtoken) { + case end_match_token: + goto QUITDONE; + case spacer_match_token: + case thrash_match_token: + case par_spacer_match_token: + case keep_spacer_match_token: + goto NEXTMATCH; + case mandate_match_token: + case leading_match_token: + pstack[nofscanned] = null; + break; + case mandate_keep_match_token: + p = tex_store_new_token(null, left_brace_token); + pstack[nofscanned] = p; + p = tex_store_new_token(p, right_brace_token); + break; + case continue_match_token: + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + quitting = 0; + goto RESTART; + case quit_match_token: + if (quitting) { + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + quitting = 0; + goto RESTART; + } else { + goto NEXTMATCH; + } + default: + if (matchtoken >= match_token && matchtoken < end_match_token) { + pstack[nofscanned] = null; + break; + } else { + goto 
NEXTMATCH; + } + } + nofscanned++; + if (tracing) { + tex_begin_diagnostic(); + tex_print_format("%c%i--", match_visualizer, nofscanned); + tex_end_diagnostic(); + } + NEXTMATCH: + matchpointer = token_link(matchpointer); + matchtoken = token_info(matchpointer); + } + } + } while (matchtoken != end_match_token); + nofarguments = nofscanned; + QUITDONE: + matchpointer = token_link(matchpointer); + /*tex + Feed the macro body and its parameters to the scanner Before we put a new token list on the + input stack, it is wise to clean off all token lists that have recently been depleted. Then + a user macro that ends with a call to itself will not require unbounded stack space. + */ + tex_cleanup_input_state(); + /*tex + We don't really start a list, it's more housekeeping. The starting point is the body and + the later set |loc| reflects that. + */ + tex_begin_macro_list(chr); + /*tex + Beware: here the |name| is used for symbolic locations but also for macro indices but these + are way above the symbolic |token_types| that we use. Better would be to have a dedicated + variable but let's not open up a can of worms now. We can't use |warning_index| combined + with a symbolic name either. We're at |end_match_token| now so we need to advance. + */ + lmt_input_state.cur_input.name = cs; + lmt_input_state.cur_input.loc = matchpointer; + /*tex + This comes last, after the cleanup and the start of the macro list. + */ + if (nofscanned) { + tex_copy_pstack_to_param_stack(&pstack[0], nofscanned); + } + EXIT: + lmt_expand_state.arguments = nofarguments; + lmt_input_state.scanner_status = save_scanner_status; + lmt_input_state.warning_index = save_warning_index; + } else { + tex_cleanup_input_state(); + if (token_link(chr)) { + tex_begin_macro_list(chr); + lmt_expand_state.arguments = 0; + lmt_input_state.cur_input.name = lmt_input_state.warning_index; + lmt_input_state.cur_input.loc = token_link(chr); + } else { + /* We ignore empty bodies but it doesn't gain us that much. 
*/ + } + } +} |