% \iffalse meta-comment % %% File: l3tl-analysis.dtx Copyright (C) 2011-2012 The LaTeX3 Project %% %% It may be distributed and/or modified under the conditions of the %% LaTeX Project Public License (LPPL), either version 1.3c of this %% license or (at your option) any later version. The latest version %% of this license is in the file %% %% http://www.latex-project.org/lppl.txt %% %% This file is part of the "l3experimental bundle" (The Work in LPPL) %% and all files in that bundle must be distributed together. %% %% The released version of this bundle is available from CTAN. %% %% ----------------------------------------------------------------------- %% %% The development version of the bundle can be found at %% %% http://www.latex-project.org/svnroot/experimental/trunk/ %% %% for those people who are interested. %% %%%%%%%%%%% %% NOTE: %% %%%%%%%%%%% %% %% Snapshots taken from the repository represent work in progress and may %% not work or may contain conflicting material! We therefore ask %% people _not_ to put them into distributions, archives, etc. without %% prior consultation with the LaTeX3 Project. 
%% %% ----------------------------------------------------------------------- % %<*driver|package> \RequirePackage{expl3} \GetIdInfo$Id: l3tl-analysis.dtx 4745 2014-05-06 10:41:27Z joseph $ {L3 Experimental token lists analysis} %</driver|package> %<*driver> \documentclass[full]{l3doc} \usepackage{amsmath} \begin{document} \DocInput{\jobname.dtx} \end{document} %</driver> % \fi % % % \title{^^A % The \textsf{l3tl-analysis} package: analysing token lists^^A % \thanks{This file describes v\ExplFileVersion, % last revised \ExplFileDate.}^^A % } % % \author{^^A % The \LaTeX3 Project\thanks % {^^A % E-mail: % \href{mailto:latex-team@latex-project.org} % {latex-team@latex-project.org}^^A % }^^A % } % % \date{Released \ExplFileDate} % % \maketitle % % \begin{documentation} % % \section{\pkg{l3tl-analysis} documentation} % % This module mostly provides internal functions for use in the % \pkg{l3regex} module. However, it provides as a side-effect a user % debugging function, very similar to the \cs{ShowTokens} macro from the % \pkg{ted} package. % % \begin{function}{\tl_show_analysis:N, \tl_show_analysis:n} % \begin{syntax} % \cs{tl_show_analysis:n} \Arg{token list} % \end{syntax} % Displays to the terminal the detailed decomposition of the % \meta{token list} into tokens, showing the category code of each % character token, the meaning of control sequences and active % characters, and the value of registers. % \end{function} % % \subsection{Internal functions} % % \begin{variable}{\s__tl} % The format used to store token lists internally uses the scan mark % \cs{s__tl} as a delimiter. % \end{variable} % % \begin{function}{\__tl_analysis_map_inline:nn} % \begin{syntax} % \cs{__tl_analysis_map_inline:nn} \Arg{token list} \Arg{inline function} % \end{syntax} % Applies the \meta{inline function} to each individual \meta{token} % in the \meta{token list}.
The \meta{inline function} receives three % arguments: % \begin{itemize} % \item \meta{tokens}, which both \texttt{o}-expand and % \texttt{x}-expand to the \meta{token}. The detailed form of % \meta{token} may change in later releases. % \item \meta{catcode}, a capital hexadecimal digit which denotes % the category code of the \meta{token} (0: control sequence, 1: % begin-group, 2: end-group, 3: math shift, 4: alignment tab, 6: % parameter, 7: superscript, 8: subscript, A: space, B: letter, % C: other, D: active). % \item \meta{char code}, a decimal representation of the character % code of the token, $-1$ if it is a control sequence (with % \meta{catcode} $0$). % \end{itemize} % \end{function} % % For optimizations in \pkg{l3regex} (when matching control sequences), % it may be useful to provide a \cs{__tl_analysis_from_str_map_inline:nn} % function, perhaps named \cs{__str_analysis_map_inline:nn}. % % \subsection{Internal format} % % The task of the \pkg{l3tl-analysis} module is to convert token lists % to an internal format which allows us to extract all the relevant % information about individual tokens (category code, character code), % as well as reconstruct the token list quickly. This internal format is % used in \pkg{l3regex} where we need to support arbitrary tokens, and % it is used in conversion functions in \pkg{l3str-convert}, where we wish to % support clusters of characters instead of single tokens. % % We thus need a way to encode any \meta{token} (even begin-group and % end-group character tokens) in a way amenable to manipulating tokens % individually. The best we can do is to find \meta{tokens} which both % \texttt{o}-expand and \texttt{x}-expand to the given % \meta{token}. Collecting more information about the category code and % character code is also useful for regular expressions, since most % regexes are catcode-agnostic.
The internal format thus takes the form % of a succession of items of the form % \begin{quote} % \meta{tokens} \cs{s__tl} \meta{catcode} \meta{char code} \cs{s__tl} % \end{quote} % The \meta{tokens} \texttt{o}- \emph{and} \texttt{x}-expand to the % original token in the token list or to the cluster of tokens % corresponding to one Unicode character in the given encoding (for % \pkg{l3str-convert}). The \meta{catcode} is given as a single hexadecimal % digit, $0$ for control sequences. The \meta{char code} is given as a % decimal number, $-1$ for control sequences. % % Using delimited arguments lets us build the \meta{tokens} % progressively when doing an encoding conversion in \pkg{l3str-convert}. On the % other hand, the delimiter \cs{s__tl} may not appear unbraced in % \meta{tokens}. This is not a problem because we are careful to wrap % control sequences in braces (as an argument to \cs{exp_not:n}) when % converting from a general token list to the internal format. % % The current rule for converting a \meta{token} to a balanced set of % \meta{tokens} which both \texttt{o}-expands and \texttt{x}-expands to % it is the following. % \begin{itemize} % \item A control sequence |\cs| becomes |\exp_not:n { \cs }| % \cs{s__tl} $0$ $-1$ \cs{s__tl}. % \item A begin-group character |{| becomes \cs{exp_after:wN} |{| % \cs{if_false:} |}| \cs{fi:} \cs{s__tl} $1$ \meta{char code} % \cs{s__tl}. % \item An end-group character |}| becomes \cs{if_false:} |{| \cs{fi:} % |}| \cs{s__tl} $2$ \meta{char code} \cs{s__tl}. % \item A character with any other category code becomes % \cs{exp_not:n} \Arg{character} \cs{s__tl} \meta{hex catcode} % \meta{char code} \cs{s__tl}. % \end{itemize} % % ^^A todo: ask LuaTeX list for an \ifx\undefined ^^A which does not add the control sequence in memory.
% % \end{documentation} % % \begin{implementation} % % \section{\pkg{l3tl-analysis} implementation} % % \begin{macrocode} %<*initex|package> % \end{macrocode} % % \begin{macrocode} %<@@=tl_analysis> % \end{macrocode} % % \begin{macrocode} \ProvidesExplPackage {\ExplFileName}{\ExplFileDate}{\ExplFileVersion}{\ExplFileDescription} \RequirePackage{l3str} % \end{macrocode} % % \subsection{Variables and helper functions} % % \begin{variable}{\s__tl} % The scan mark \cs{s__tl} is used as a delimiter in the internal % format. This is more practical than using a quark, because we would % then need to control expansion much more carefully: compare % \cs{__int_value:w} |`#1| \cs{s__tl} with \cs{__int_value:w} |`#1| % \cs{exp_stop_f:} \cs{exp_not:N} \cs{q_mark} to extract a character % code followed by the delimiter in an \texttt{x}-expansion. % \begin{macrocode} \__scan_new:N \s__tl % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_internal_tl} % This token list variable is used to hand the argument of % \cs{tl_show_analysis:n} to \cs{@@_show:N}, which expects a % token list variable rather than an explicit token list. % \begin{macrocode} \tl_new:N \l_@@_internal_tl % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_token} % \begin{variable}{\l_@@_char_token} % The tokens in the token list are probed with the \TeX{} primitive % \tn{futurelet}. We use \cs{l_@@_token} in that % construction. In some cases, we convert the following token to a % string before probing it: then the token variable used is % \cs{l_@@_char_token}. % \begin{macrocode} \cs_new_eq:NN \l_@@_token ? \cs_new_eq:NN \l_@@_char_token ? % \end{macrocode} % \end{variable} % \end{variable} % % \begin{variable}{\l_@@_normal_int} % The number of normal (\texttt{N}-type argument) tokens since the % last special token. % \begin{macrocode} \int_new:N \l_@@_normal_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_index_int} % During the first pass, this is the index in the array being built.
% During the second pass, it is equal to the maximum index in the % array from the first pass. % \begin{macrocode} \int_new:N \l_@@_index_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_nesting_int} % Nesting depth of explicit begin-group and end-group characters % during the first pass. This lets us detect the end of the token list % without a reserved end-marker. % \begin{macrocode} \int_new:N \l_@@_nesting_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_type_int} % When encountering special characters, we record their \enquote{type} % in this integer. % \begin{macrocode} \int_new:N \l_@@_type_int % \end{macrocode} % \end{variable} % % \begin{variable}{\g_@@_result_tl} % The result of the conversion is stored in this token list, with a % succession of items of the form % \begin{quote} % \meta{tokens} \cs{s__tl} \meta{catcode} \meta{char code} \cs{s__tl} % \end{quote} % \begin{macrocode} \tl_new:N \g_@@_result_tl % \end{macrocode} % \end{variable} % % \begin{macro}[int, EXP]{\@@_extract_charcode:} % \begin{macro}[aux, EXP]{\@@_extract_charcode_aux:w} % Extracting the character code from the meaning of % \cs{l_@@_token}. This has no error checking, and should % only be assumed to work for begin-group and end-group character % tokens. It produces a number in the form |`|\meta{char}. % The auxiliary removes the first two space-delimited words of the % meaning and leaves a left quote in front of what follows. % \begin{macrocode} \cs_new_nopar:Npn \@@_extract_charcode: { \exp_after:wN \@@_extract_charcode_aux:w \token_to_meaning:N \l_@@_token } \cs_new:Npn \@@_extract_charcode_aux:w #1 ~ #2 ~ { ` } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int, EXP]{\@@_cs_space_count:NN} % \begin{macro}[aux, EXP]{\@@_cs_space_count:w} % \begin{macro}[aux, EXP]{\@@_cs_space_count_end:w} % Counts the number of spaces in the string representation of its % second argument, as well as the number of characters following the % last space in that representation, and feeds the two numbers as % semicolon-delimited arguments to the first argument.
When this % function is used, the escape character is printable and non-space. % \begin{macrocode} \cs_new:Npn \@@_cs_space_count:NN #1 #2 { \exp_after:wN #1 \__int_value:w \__int_eval:w \c_zero \exp_after:wN \@@_cs_space_count:w \token_to_str:N #2 \fi: \@@_cs_space_count_end:w ; ~ ! } \cs_new:Npn \@@_cs_space_count:w #1 ~ { \if_false: #1 #1 \fi: + \c_one \@@_cs_space_count:w } \cs_new:Npn \@@_cs_space_count_end:w ; #1 \fi: #2 ! { \exp_after:wN ; \__int_value:w \str_count_ignore_spaces:n {#1} ; } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % % \subsection{Plan of attack} % % Our goal is to produce a token list of the form roughly % \begin{quote} % \meta{token 1} \cs{s__tl} \meta{catcode 1} \meta{char code 1} \cs{s__tl} \\ % \meta{token 2} \cs{s__tl} \meta{catcode 2} \meta{char code 2} \cs{s__tl} \\ % \ldots{} \\ % \meta{token N} \cs{s__tl} \meta{catcode N} \meta{char code N} \cs{s__tl} % \end{quote} % Most but not all tokens can be grabbed as an undelimited % (\texttt{N}-type) argument by \TeX{}. The plan is to have a two-pass % system. In the first pass, locate special tokens, and store them in % various \tn{toks} registers. In the second pass, which is done within % an \texttt{x}-expanding assignment, normal tokens are taken in as % \texttt{N}-type arguments, and special tokens are retrieved from the % \tn{toks} registers, and removed from the input stream by some means. % The whole process takes linear time, because we avoid building the % result one item at a time. % % To ease the difficult first pass, we first do some setup with % \cs{@@_setup:n}. Active characters set equal to non-active % characters cause trouble, so we disable all active characters by % setting them equal to \texttt{undefined} locally. The escape % character, which is made unprintable during the setup, is set to be % printable at the start of the first pass (backslash, but this later % oscillates between slash and backslash): this makes it possible to % distinguish characters from control sequences.
% % A token has two characteristics: its \tn{meaning}, and what it looks % like for \TeX{} when it is in scanning mode (\emph{e.g.}, when % capturing parameters for a macro). For our purposes, we distinguish % the following meanings: % \begin{itemize} % \item begin-group token (category code $1$), either space (character % code $32$), or non-space; % \item end-group token (category code $2$), either space (character % code $32$), or non-space; % \item space token (category code $10$, character code $32$); % \item anything else (then the token is always an \texttt{N}-type % argument). % \end{itemize} % The token itself can \enquote{look like} one of the following: % \begin{itemize} % \item a non-active character, in which case its meaning is % automatically that associated to its character code and category % code, we call it a \enquote{true} character; % \item an active character (we eliminate those in the setup step); % \item a control sequence. % \end{itemize} % The only tokens which are not valid \texttt{N}-type arguments are true % begin-group characters, true end-group characters, and true spaces. % We will detect those characters by scanning ahead with \tn{futurelet}, % then distinguishing true characters from control sequences set equal % to them using the \tn{string} representation. % % The second pass is a simple exercise in expandable loops. % % \begin{macro}[int]{\@@:n} % Everything is done within a group, and all definitions will be % local. We use \cs{group_align_safe_begin/end:} to avoid problems in % case \cs{@@:n} is used within an alignment and its argument % contains alignment tab tokens.
% \begin{macrocode} \cs_new_protected:Npn \@@:n #1 { \group_begin: \group_align_safe_begin: \@@_setup:n {#1} \@@_a:n {#1} \@@_b:n {#1} \group_align_safe_end: \group_end: } % \end{macrocode} % \end{macro} % % \subsection{Setup} % % \begin{macro}[int]{\@@_setup:n} % \begin{macro}[aux]{\@@_disable_loop:N} % Active characters can cause problems later on in the processing, % so the first step is to disable them, by setting them to % \texttt{undefined}. Since Unicode contains too many characters % to loop over all of them, we instead loop over the input token % list as a string: any active character in the token list % must appear in its string representation. The string is shortened % a little by making the escape character unprintable. The active % space must be disabled separately (the loop skips over it otherwise), % and we end the loop by feeding an odd non-\texttt{N}-type % argument to the looping macro. % For each character |#1| of that string, \cs{@@_disable_loop:N} sets % the \tn{lccode} of the null character to the character code of |#1|, % then lowercases an active null character to obtain the active % character with that code, and sets it equal to \texttt{undefined}. % \begin{macrocode} \cs_new_protected:Npn \@@_setup:n #1 { \int_set_eq:NN \tex_escapechar:D \c_minus_one \exp_after:wN \@@_disable_loop:N \tl_to_str:n {#1} { ~ } { ? ~ \__prg_break: } \__prg_break_point: } \group_begin: \char_set_catcode_active:N \^^@ \cs_new_protected:Npn \@@_disable_loop:N #1 { \tex_lccode:D \c_zero `#1 ~ \tl_to_lowercase:n { \tex_let:D ^^@ } \tex_undefined:D \@@_disable_loop:N } \group_end: % \end{macrocode} % \end{macro} % \end{macro} % % \subsection{First pass} % % The goal of this pass is to detect special (non-\texttt{N}-type) tokens, % and count how many \texttt{N}-type tokens lie between special tokens. % Also, we wish to store some representation of each special token % in a \tn{toks} register. % % After the setup step, we have $11$ types of tokens: % \begin{itemize} % \item[1.] a true non-space begin-group character; % \item[2.] a true space begin-group character; % \item[3.] a true non-space end-group character; % \item[4.] a true space end-group character; % \item[5.] a true space blank space character; % \item[6.]
an undefined active character; % \item[7.] any other true character; % \item[8.] a control sequence equal to a begin-group token (category code $1$); % \item[9.] a control sequence equal to an end-group token (category code $2$); % \item[10.] a control sequence equal to a space token % (character code $32$, category code $10$); % \item[11.] any other control sequence. % \end{itemize} % Our first tool is \tn{futurelet}. This cannot distinguish % case $8$ from $1$ or $2$, nor case $9$ from $3$ or $4$, % nor case $10$ from case $5$. Those cases will be distinguished % by applying the \tn{string} primitive to the following token, % after possibly changing the escape character to ensure that % a control sequence's string representation cannot be mistaken % for the true character. % % In cases $6$, $7$, and $11$, the following token is a valid % \texttt{N}-type argument, so we grab it and distinguish the case % of a character from a control sequence: in the latter case, % \cs{str_tail:n} \Arg{token} is non-empty, because the % escape character is printable. % % \begin{macro}[int]{\@@_a:n} % We read tokens one by one using \tn{futurelet}. % While performing the loop, we keep track of the number of % true begin-group characters minus the number of % true end-group characters in \cs{l_@@_nesting_int}. % This reaches $-1$ when we read the closing brace. % The escape character is set to backslash (character code $92$) % before the loop starts. Each special token which is stored % increments \cs{l_@@_index_int} (see \cs{@@_a_store:}), including % the closing brace, so the index is decremented once the loop has % ended. % \begin{macrocode} \cs_new_protected:Npn \@@_a:n #1 { \int_set:Nn \tex_escapechar:D { 92 } \int_zero:N \l_@@_normal_int \int_zero:N \l_@@_index_int \int_zero:N \l_@@_nesting_int \if_false: { \fi: \@@_a_loop:w #1 } \int_decr:N \l_@@_index_int } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_a_loop:w} % Look at the next token with \tn{futurelet}, then check its type % with \cs{@@_a_type:w}. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_a_loop:w { \tex_futurelet:D \l_@@_token \@@_a_type:w } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_a_type:w} % At this point, \cs{l_@@_token} holds the meaning % of the following token.
We store in \cs{l_@@_type_int} % the meaning of the token ahead: % \begin{itemize} % \item $0$ space token; % \item $1$ begin-group token; % \item $-1$ end-group token; % \item $2$ other. % \end{itemize} % The values $0$, $1$, $-1$ correspond to how much a true such % character changes the nesting level ($2$ is used only here, % and is irrelevant later). Then call the auxiliary for each case. % In the \cs{if_case:w} construction, the value $-1$ is caught by % the \cs{else:} branch. % Note that nesting conditionals here is safe because we only skip % over \cs{l_@@_token} if it matches with one of the % character tokens (hence is not a primitive conditional). % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_a_type:w { \l_@@_type_int = \if_meaning:w \l_@@_token \c_space_token \c_zero \else: \if_catcode:w \exp_not:N \l_@@_token \c_group_begin_token \c_one \else: \if_catcode:w \exp_not:N \l_@@_token \c_group_end_token \c_minus_one \else: \c_two \fi: \fi: \fi: \if_case:w \l_@@_type_int \exp_after:wN \@@_a_space:w \or: \exp_after:wN \@@_a_bgroup:w \or: \exp_after:wN \@@_a_safe:N \else: \exp_after:wN \@@_a_egroup:w \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_a_space:w} % \begin{macro}[aux]{\@@_a_space_test:w} % In this branch, the following token's meaning is a blank space. % Apply \tn{string} to that token: if it is a control sequence % the result starts with the escape character; otherwise it is % a true blank space, whose string representation is also a blank space. % We test for that in \cs{@@_a_space_test:w}, % after grabbing as \cs{l_@@_char_token} the first character % of the string representation. % Also, since \cs{@@_a_store:} expects the special token to be % stored in the relevant \tn{toks} register, we do that. The extra % \cs{exp_not:n} is unnecessary of course, but it makes the treatment % of all tokens more homogeneous. % If we discover that the next token was actually a control sequence % instead of a true space, then we step the counter of normal tokens.
% We now have in front of us the whole string representation of % the control sequence, including potential spaces; those will appear % to be true spaces later in this pass. Hence, all other branches of % the code in this first pass need to consider the string representation, % so that the second pass does not need to test the meaning of tokens, % only strings. % The test is scheduled with \tn{afterassignment}: % \cs{@@_a_space_test:w} is inserted once the first character of the % string representation has been assigned to \cs{l_@@_char_token}. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_a_space:w { \tex_afterassignment:D \@@_a_space_test:w \exp_after:wN \cs_set_eq:NN \exp_after:wN \l_@@_char_token \token_to_str:N } \cs_new_protected_nopar:Npn \@@_a_space_test:w { \if_meaning:w \l_@@_char_token \c_space_token \tex_toks:D \l_@@_index_int { \exp_not:n { ~ } } \@@_a_store: \else: \int_incr:N \l_@@_normal_int \fi: \@@_a_loop:w } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int]{\@@_a_bgroup:w, \@@_a_egroup:w} % \begin{macro}[aux]{\@@_a_group:nw} % \begin{macro}[aux]{\@@_a_group_test:w} % The token might be either a true character token with % catcode $1$ or $2$, or it could be a control sequence. % The only tricky case is if the character code happens % to be equal to the escape character: then we change % the escape character from backslash to solidus or back, % so that the string representation of the true character % and of a control sequence set equal to it start differently. % Then probe what the first character of that string % representation is: this is the place where we need % \cs{l_@@_char_token} to be a separate control % sequence from \cs{l_@@_token}, to compare them. % The escape character is toggled by replacing it with $139$ minus its % current value, which exchanges $92$ (backslash) and $47$ (solidus).
% \begin{macrocode} \group_begin: \char_set_catcode_group_begin:N \^^@ \char_set_catcode_group_end:N \^^E \cs_new_protected_nopar:Npn \@@_a_bgroup:w { \@@_a_group:nw { \exp_after:wN ^^@ \if_false: ^^E \fi: } } \char_set_catcode_group_begin:N \^^B \char_set_catcode_group_end:N \^^@ \cs_new_protected_nopar:Npn \@@_a_egroup:w { \@@_a_group:nw { \if_false: ^^B \fi: ^^@ } } \group_end: \cs_new_protected:Npn \@@_a_group:nw #1 { \tex_lccode:D \c_zero = \@@_extract_charcode: \scan_stop: \tl_to_lowercase:n { \tex_toks:D \l_@@_index_int {#1} } \if_int_compare:w \tex_lccode:D \c_zero = \tex_escapechar:D \int_set:Nn \tex_escapechar:D { 139 - \tex_escapechar:D } \fi: \tex_afterassignment:D \@@_a_group_test:w \exp_after:wN \cs_set_eq:NN \exp_after:wN \l_@@_char_token \token_to_str:N } \cs_new_protected_nopar:Npn \@@_a_group_test:w { \if_charcode:w \l_@@_token \l_@@_char_token \@@_a_store: \else: \int_incr:N \l_@@_normal_int \fi: \@@_a_loop:w } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % % \begin{macro}[int]{\@@_a_store:} % This function is called each time we meet a special token; % at this point, the \tn{toks} register \cs{l_@@_index_int} % holds a token list which expands to the given special token. % Also, the value of \cs{l_@@_type_int} indicates which case % we are in: % \begin{itemize} % \item $-1$ end-group character; % \item $0$ space character; % \item $1$ begin-group character. % \end{itemize} % We need to distinguish further the case of a space character % (code $32$) from other character codes, because those will % behave differently in the second pass. Namely, after testing % the \tn{lccode} of $0$ (which holds the present character code) % we change the cases above to % \begin{itemize} % \item $-2$ space end-group character; % \item $-1$ non-space end-group character; % \item $0$ space blank space character; % \item $1$ non-space begin-group character; % \item $2$ space begin-group character.
% \end{itemize} % This has the property that non-space characters correspond to odd % values of \cs{l_@@_type_int}. % The number of normal tokens, and the type of special token, % are packed into a \tn{skip} register: the natural length holds the % number of normal tokens and the stretch component holds the type, % both expressed in \texttt{sp}. % Finally, we check whether we reached the last closing brace, in which % case we stop by disabling the looping function (locally). % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_a_store: { \tex_advance:D \l_@@_nesting_int \l_@@_type_int \if_int_compare:w \tex_lccode:D \c_zero = \c_thirty_two \tex_multiply:D \l_@@_type_int \c_two \fi: \tex_skip:D \l_@@_index_int = \l_@@_normal_int sp plus \l_@@_type_int sp \scan_stop: \int_incr:N \l_@@_index_int \int_zero:N \l_@@_normal_int \if_int_compare:w \l_@@_nesting_int = \c_minus_one \cs_set_eq:NN \@@_a_loop:w \scan_stop: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_a_safe:N} % \begin{macro}[aux]{\@@_a_cs:ww} % This should be the simplest case: since the upcoming token is safe, % we can simply grab it in a second pass. However, other branches of % the code must pass their tokens through \tn{string}, hence we do it % here as well, with some optimizations. If the token is a single % character (including space), the \cs{if_charcode:w} test yields % true, and we simply count one \enquote{normal} token. On the other % hand, if the token is a control sequence, we should replace it by % its string representation for compatibility with other code % branches. Instead of slowly looping through the characters with % the main code, we use the knowledge of how the second pass works: % if the control sequence name contains no space, count that token % as a number of normal tokens equal to its string length.
If the % control sequence contains spaces, they should be registered as % special characters by increasing \cs{l_@@_index_int} % (no need to carefully count characters between each space), and % all characters after the last space should be counted in the % following sequence of \enquote{normal} tokens. % The auxiliary \cs{@@_a_cs:ww} receives the number of spaces as |#1| % and the number of characters after the last space as |#2|, both % computed by \cs{@@_cs_space_count:NN}. % \begin{macrocode} \cs_new_protected:Npn \@@_a_safe:N #1 { \if_charcode:w \scan_stop: \exp_after:wN \use_none:n \token_to_str:N #1 \prg_do_nothing: \scan_stop: \int_incr:N \l_@@_normal_int \else: \@@_cs_space_count:NN \@@_a_cs:ww #1 \fi: \@@_a_loop:w } \cs_new_protected:Npn \@@_a_cs:ww #1; #2; { \if_int_compare:w #1 > \c_zero \tex_skip:D \l_@@_index_int = \__int_eval:w \l_@@_normal_int + \c_one sp \scan_stop: \tex_advance:D \l_@@_index_int #1 \exp_stop_f: \l_@@_normal_int #2 \exp_stop_f: \else: \tex_advance:D \l_@@_normal_int #2 \exp_stop_f: \fi: } % \end{macrocode} % \end{macro} % \end{macro} % % \subsection{Second pass} % % The second pass is an exercise in expandable loops. % All the necessary information is stored in \tn{skip} % and \tn{toks} registers. % % \begin{macro}[int]{\@@_b:n} % \begin{macro}[int, EXP]{\@@_b_loop:w} % Start the loop with the index $0$. No need for an end-marker: % the loop will stop by itself when the last index is read. % We will repeatedly oscillate between reading long stretches % of normal tokens, and reading special tokens. % \begin{macrocode} \cs_new_protected:Npn \@@_b:n #1 { \tl_gset:Nx \g_@@_result_tl { \@@_b_loop:w 0; #1 \__prg_break_point: } } \cs_new:Npn \@@_b_loop:w #1; { \exp_after:wN \@@_b_normals:ww \__int_value:w \tex_skip:D #1 ; #1 ; } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int, EXP]{\@@_b_normals:ww} % \begin{macro}[aux, EXP]{\@@_b_normal:wwN} % The first argument is the number of normal tokens which remain % to be read, and the second argument is the index in the array % produced in the first step.
% A character's string representation is always one character long, % while a control sequence is always longer (we have set the escape % character to a printable value). In both cases, we leave % \cs{exp_not:n} \Arg{token} \cs{s__tl} in the input stream % (after \texttt{x}-expansion). Here, \cs{exp_not:n} is used % rather than \cs{exp_not:N} because |#3| could be \cs{s__tl}, % hence must be hidden behind braces in the result. % \begin{macrocode} \cs_new:Npn \@@_b_normals:ww #1; { \if_int_compare:w #1 = \c_zero \@@_b_special:w \fi: \@@_b_normal:wwN #1; } \cs_new:Npn \@@_b_normal:wwN #1; #2; #3 { \exp_not:n { \exp_not:n { #3 } } \s__tl \if_charcode:w \scan_stop: \exp_after:wN \use_none:n \token_to_str:N #3 \prg_do_nothing: \scan_stop: \exp_after:wN \@@_b_char:Nww \else: \exp_after:wN \@@_b_cs:Nww \fi: #3 #1; #2; } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int, EXP]{\@@_b_char:Nww} % If the normal token we grab is a character, leave % \meta{catcode} \meta{char code} followed by \cs{s__tl} % in the input stream, and call \cs{@@_b_normals:ww} % with its first argument decremented. % The letters \texttt{A}, \texttt{B} and \texttt{C} used as % hexadecimal digits are given category code \enquote{other} for the % definition, and the digit \texttt{D}, left for a character whose % meaning is \texttt{undefined}, is obtained by uppercasing a % question mark. A character which passes none of the tests gets the % digit $6$. % \begin{macrocode} \group_begin: \char_set_catcode_other:N A \char_set_catcode_other:N B \char_set_catcode_other:N C \char_set_uccode:nn { `? } { `D } \tl_to_uppercase:n { \cs_new:Npn \@@_b_char:Nww #1 { \if_meaning:w #1 \tex_undefined:D ?
\else: \if_catcode:w #1 \c_catcode_other_token C \else: \if_catcode:w #1 \c_catcode_letter_token B \else: \if_catcode:w #1 \c_math_toggle_token 3 \else: \if_catcode:w #1 \c_alignment_token 4 \else: \if_catcode:w #1 \c_math_superscript_token 7 \else: \if_catcode:w #1 \c_math_subscript_token 8 \else: \if_catcode:w #1 \c_space_token A \else: 6 \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \__int_value:w `#1 \s__tl \exp_after:wN \@@_b_normals:ww \int_use:N \__int_eval:w \c_minus_one + } } \group_end: % \end{macrocode} % \end{macro} % % \begin{macro}[int, EXP]{\@@_b_cs:Nww} % \begin{macro}[aux, EXP]{\@@_b_cs_test:ww} % If the token we grab is a control sequence, leave % |0 -1| (as category code and character code) in the input stream, % followed by \cs{s__tl}, % and call \cs{@@_b_normals:ww} with updated arguments. % In \cs{@@_b_cs_test:ww}, |#1| and |#2| are the two numbers produced % by \cs{@@_cs_space_count:NN} (number of spaces, and number of % characters after the last space), |#3| is the number of normal % tokens which remain to be read, and |#4| is the current index. % \begin{macrocode} \cs_new:Npn \@@_b_cs:Nww #1 { 0 -1 \s__tl \@@_cs_space_count:NN \@@_b_cs_test:ww #1 } \cs_new:Npn \@@_b_cs_test:ww #1 ; #2 ; #3 ; #4 ; { \exp_after:wN \@@_b_normals:ww \int_use:N \__int_eval:w \if_int_compare:w #1 = \c_zero #3 \else: \tex_skip:D \__int_eval:w #4 + #1 \__int_eval_end: \fi: - #2 \exp_after:wN ; \int_use:N \__int_eval:w #4 + #1 ; } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int, EXP]{\@@_b_special:w} % \begin{macro}[aux, EXP]{\@@_b_special_char:wN} % \begin{macro}[aux, EXP]{\@@_b_special_space:w} % Here, |#1| is the current index in the array built in the first pass. % Check now whether we reached the end (we shouldn't keep the trailing % end-group character that marked the end of the token list in the % first pass). % Unpack the \tn{toks} register: when \texttt{x}-expanding again, % we will get the special token. % Then leave the category code in the input stream, followed by % the character code, and call \cs{@@_b_loop:w} with the next index. % The category code is deduced from the stretch component of the % \tn{skip} register; an odd value means a non-space character, whose % character code is read from its string representation.
% \begin{macrocode} \group_begin: \char_set_catcode_other:N A \cs_new:Npn \@@_b_special:w \fi: \@@_b_normal:wwN 0 ; #1 ; { \fi: \if_int_compare:w #1 = \l_@@_index_int \exp_after:wN \__prg_break: \fi: \tex_the:D \tex_toks:D #1 \s__tl \if_case:w \etex_gluestretch:D \tex_skip:D #1 \exp_stop_f: A \or: 1 \or: 1 \else: 2 \fi: \if_int_odd:w \etex_gluestretch:D \tex_skip:D #1 \exp_stop_f: \exp_after:wN \@@_b_special_char:wN \int_use:N \else: \exp_after:wN \@@_b_special_space:w \int_use:N \fi: \__int_eval:w \c_one + #1 \exp_after:wN ; \token_to_str:N } \group_end: \cs_new:Npn \@@_b_special_char:wN #1 ; #2 { \__int_value:w `#2 \s__tl \@@_b_loop:w #1 ; } \cs_new:Npn \@@_b_special_space:w #1 ; ~ { 32 \s__tl \@@_b_loop:w #1 ; } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % % \subsection{Mapping through the analysis} % % \begin{macro}[int]{\@@_map_inline:nn} % \begin{macro}[aux]{\@@_map_inline_aux:Nn} % First obtain the analysis of the token list into % \cs{g_@@_result_tl}. To allow nested mappings, increase the % nesting depth \cs{g__prg_map_int} (shared between all modules), then % define the looping macro, which has a name specific to that nesting % depth. That looping macro grabs the \meta{tokens}, \meta{catcode} and % \meta{char code}; it checks for the end of the loop with % \cs{use_none:n} |##2|, normally empty, but which becomes % \cs{tl_map_break:} at the end; it then performs the user's code % |#2|, and loops by calling itself. When the loop ends, remember to % decrease the nesting depth. % \begin{macrocode} \cs_new_protected:Npn \@@_map_inline:nn #1 { \@@:n {#1} \int_gincr:N \g__prg_map_int \exp_args:Nc \@@_map_inline_aux:Nn { @@_map_inline_ \int_use:N \g__prg_map_int :wNw } } \cs_new_protected:Npn \@@_map_inline_aux:Nn #1#2 { \cs_gset_protected:Npn #1 ##1 \s__tl ##2 ##3 \s__tl { \use_none:n ##2 #2 #1 } \exp_after:wN #1 \g_@@_result_tl \s__tl { ?
\tl_map_break: } \s__tl \__prg_break_point:Nn \tl_map_break: { \int_gdecr:N \g__prg_map_int } } % \end{macrocode} % \end{macro} % \end{macro} % % \subsection{Showing the results} % % \begin{macro}{\tl_show_analysis:N, \tl_show_analysis:n} % \begin{macro}[int]{\@@_show:N} % Add to \cs{@@:n} a third pass to display tokens to the terminal. % The \texttt{n} variant first stores its argument in % \cs{l_@@_internal_tl}, so that \cs{@@_show:N} always receives a % token list variable. % \begin{macrocode} \cs_new_protected:Npn \tl_show_analysis:N #1 { \exp_args:No \@@:n {#1} \@@_show:N #1 } \cs_new_protected:Npn \tl_show_analysis:n #1 { \@@:n {#1} \tl_set:Nn \l_@@_internal_tl {#1} \@@_show:N \l_@@_internal_tl } \cs_new_protected:Npn \@@_show:N #1 { \group_begin: \use:x { \group_end: \exp_not:n { \__msg_show_variable:Nnn #1 } { tl-analysis } { \exp_after:wN \@@_show_loop:wNw \g_@@_result_tl \s__tl { ? \__prg_break: } \s__tl \__prg_break_point: } } } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux, rEXP]{\@@_show_loop:wNw} % Here, |#1| \texttt{o}- and \texttt{x}-expands to the token; % |#2| is the category code (one uppercase hexadecimal digit), % $0$ for control sequences; % |#3| is the character code, which we ignore. % In the cases of control sequences and active characters, % the meaning may overflow one line, and we want to truncate % it. Those cases are thus separated out. % The loop ends when |#2| is the end marker inserted by % \cs{@@_show:N}: \cs{use_none:n} |#2| then leaves % \cs{__prg_break:}, whereas it leaves nothing for an ordinary % category code digit. % \begin{macrocode} \cs_new:Npn \@@_show_loop:wNw #1 \s__tl #2 #3 \s__tl { \use_none:n #2 \exp_not:n { \\ > \ \ } \if_int_compare:w "#2 = \c_zero \exp_after:wN \@@_show_cs:n \else: \if_int_compare:w "#2 = \c_thirteen \exp_after:wN \exp_after:wN \exp_after:wN \@@_show_active:n \else: \exp_after:wN \exp_after:wN \exp_after:wN \@@_show_normal:n \fi: \fi: {#1} \@@_show_loop:wNw } % \end{macrocode} % \end{macro} % % \begin{macro}[aux, rEXP]{\@@_show_normal:n} % Non-active characters are a simple matter of printing % the character, and its meaning. Our test suite checks that % begin-group and end-group characters do not mess up % \TeX{}'s alignment status.
%    \begin{macrocode}
\cs_new:Npn \@@_show_normal:n #1
  {
    \exp_after:wN \token_to_str:N #1 ~
    ( \exp_after:wN \token_to_meaning:N #1 )
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_show_value:N}
%   This expands to the value of |#1| if it has any.  For a token which
%   is not expandable and which is a \tn{chardef}, a \tn{mathchardef},
%   or a dimension, integer, skip or toks register, the function leaves
%   an equal sign followed by the result of applying \tn{the} to |#1|.
%   For any other token it leaves nothing: either the outer test fails,
%   or \cs{use_none:nnn} removes the final \cs{use:n} construction.
%   The caller, \cs{@@_show_long:nn}, \texttt{f}-expands this function,
%   and \texttt{f}-expansion stops at the first non-expandable token,
%   here the equal sign.  We must thus expand \tn{the} \emph{before}
%   leaving the equal sign in the input stream, otherwise the length
%   test and the truncation done by \cs{@@_show_long_aux:nnnn} would
%   act on the name of the register rather than on its value.  The
%   value is turned into a string at the same time, so that the
%   contents of a toks register are never expanded further.
%    \begin{macrocode}
\cs_new:Npn \@@_show_value:N #1
  {
    \token_if_expandable:NF #1
      {
        \token_if_chardef:NTF       #1 \__prg_break: { }
        \token_if_mathchardef:NTF   #1 \__prg_break: { }
        \token_if_dim_register:NTF  #1 \__prg_break: { }
        \token_if_int_register:NTF  #1 \__prg_break: { }
        \token_if_skip_register:NTF #1 \__prg_break: { }
        \token_if_toks_register:NTF #1 \__prg_break: { }
        \use_none:nnn
        \__prg_break_point:
        \use:n
          {
            \exp_after:wN =
            \etex_detokenize:D \exp_after:wN { \tex_the:D #1 }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[aux, rEXP]{\@@_show_cs:n}
% \begin{macro}[aux, rEXP]{\@@_show_active:n}
% \begin{macro}[aux, rEXP]{\@@_show_long:nn}
% \begin{macro}[aux, rEXP]{\@@_show_long_aux:nnnn}
%   Control sequences and active characters are printed in the same way,
%   making sure not to go beyond the \cs{l_iow_line_count_int}.  In case
%   of an overflow, we replace the last characters by
%   \cs{c_@@_show_etc_str}.
%    \begin{macrocode}
\cs_new:Npn \@@_show_cs:n #1
  {
    \exp_after:wN \@@_show_long:nn \exp_after:wN {#1}
      { control~sequence= }
  }
\cs_new:Npn \@@_show_active:n #1
  {
    \exp_after:wN \@@_show_long:nn \exp_after:wN {#1}
      { active~character= }
  }
\cs_new:Npn \@@_show_long:nn #1
  {
    \@@_show_long_aux:oofn
      { \token_to_str:N #1 }
      { \token_to_meaning:N #1 }
      { \@@_show_value:N #1 }
  }
\cs_new:Npn \@@_show_long_aux:nnnn #1#2#3#4
  {
    \int_compare:nNnTF
      { \l_iow_line_count_int - 3 }
      < { \str_count:n { #1 ~ ( #4 #2 #3 ) } }
      {
        \str_range:nnn { #1 ~ ( #4 #2 #3 ) } { 1 }
          {
            \l_iow_line_count_int - 3
            - \str_count:N \c_@@_show_etc_str
          }
        \c_@@_show_etc_str
      }
      { #1 ~ ( #4 #2 #3 ) }
  }
\cs_generate_variant:Nn \@@_show_long_aux:nnnn { oof }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{Messages}
%
% \begin{variable}{\c_@@_show_etc_str}
%   This token list takes the place of the last characters of a line
%   when a control sequence (or active character) together with its
%   meaning is too long to fit in one line of the terminal.
%    \begin{macrocode}
\tl_const:Nx \c_@@_show_etc_str % (
  { \token_to_str:N \ETC.) }
%    \end{macrocode}
% \end{variable}
%
%    \begin{macrocode}
\__msg_kernel_new:nnn { kernel } { show-tl-analysis }
  {
    The~token~list~
    \str_if_eq:nnF {#1} { \l_@@_internal_tl }
      { \token_to_str:N #1 ~ }
    \tl_if_empty:NTF #1
      { is~empty }
      { contains~the~tokens: }
  }
%    \end{macrocode}
%
%    \begin{macrocode}
%
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex