% \iffalse meta-comment
%
%% File: l3unicode-data.dtx Copyright(C) 2014-2015 The LaTeX3 Project
%%
%% It may be distributed and/or modified under the conditions of the
%% LaTeX Project Public License (LPPL), either version 1.3c of this
%% license or (at your option) any later version.  The latest version
%% of this license is in the file
%%
%%    http://www.latex-project.org/lppl.txt
%%
%% This file is part of the "l3kernel bundle" (The Work in LPPL)
%% and all files in that bundle must be distributed together.
%%
%% The released version of this bundle is available from CTAN.
%%
%% -----------------------------------------------------------------------
%%
%% The development version of the bundle can be found at
%%
%%    http://www.latex-project.org/svnroot/experimental/trunk/
%%
%% for those people who are interested.
%%
%%%%%%%%%%%
%% NOTE: %%
%%%%%%%%%%%
%%
%%   Snapshots taken from the repository represent work in progress and may
%%   not work or may contain conflicting material!  We therefore ask
%%   people _not_ to put them into distributions, archives, etc. without
%%   prior consultation with the LaTeX Project Team.
%%
%% -----------------------------------------------------------------------
%%
%
% Both the driver and the script need \pkg{expl3}: as the script runs with
% plain \TeX{}, set up in generic mode.
%<*driver|script>
\input expl3-generic\relax
\GetIdInfo$Id: l3unicode-data.dtx 5544 2015-03-01 18:30:00Z joseph $
  {L3 Case data script}
%</driver|script>
%
% The same approach as used in \pkg{DocStrip}: if \cs{documentclass}
% is undefined then skip the driver, allowing the file to be used directly.
% This works as the \cs{fi} is only seen if \LaTeX{} is not in use. The odd
% \cs{jobname} business allows the extraction to work with \LaTeX{} provided
% an appropriate \texttt{.ins} file is set up.
%<*gobble>
\ifx\jobname\relax
  \let\documentclass\undefined
\fi
\begingroup\expandafter\expandafter\expandafter\endgroup
\expandafter\ifx\csname documentclass\endcsname\relax
\else
  \csname fi\endcsname
%</gobble>
%
%<*driver>
\documentclass[full]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%<*gobble>
\fi
%</gobble>
%</driver>
% \fi
%
% \title{^^A
%   The \textsf{l3unicode-data} script\\Unicode data script^^A
%   \thanks{This file describes v\ExplFileVersion,
%      last revised \ExplFileDate.}^^A
% }
%
% \author{^^A
%  The \LaTeX3 Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released \ExplFileDate}
%
% \maketitle
%
% \begin{documentation}
%
% The Unicode Consortium provide comprehensive data on the standard
% mapping of characters (or more formally codepoints) when carrying
% out various different case-changing functions:
% \begin{itemize}
%   \item Uppercasing
%   \item Lowercasing
%   \item Titlecasing (used for the first codepoint of a word:
%     may be subtly different to uppercasing)
%   \item Folding (removing case for comparison purposes: close
%     but not identical to lowercasing)
% \end{itemize}
% This data is available in machine readable format, such that many of
% the basics of case changing can be set up on an automated basis.
%
% This file provides a script which will read the raw Unicode files
% and convert the material to a form which can be used by \pkg{expl3}.
% As the conversions here cover the entire Unicode range, this cannot
% be carried out by \pdfTeX{}: at present, the script works only
% with \LuaTeX{}.
%
% Note that this file is designed such that running \LaTeX{} will typeset
% the documentation using any engine: the script will be run if the file
% is processed by plain \TeX{}, specifically the |luatex| command.
% This process requires the files |CaseFolding.txt|, |SpecialCasing.txt|
% and |UnicodeData.txt| from the \href{http://www.unicode.org/}^^A
% {Unicode Consortium website}.
%
% The file produced by this script, |l3unicode-data.def|, contains
% appropriate definitions for all of the data structures used by \pkg{expl3}
% for Unicode transformations. It also provides appropriate alternative
% definitions for use with \pdfTeX{}.
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3unicode-data} Implementation}
%
%    \begin{macrocode}
%<*script>
%    \end{macrocode}
%
% The driver part has loaded \pkg{expl3}: turn on the syntax environment.
%    \begin{macrocode}
\ExplSyntaxOn
%    \end{macrocode}
%
% \subsection{Setup}
%
% \begin{variable}{\l__unicode_tmp_tl}
%   Scratch space.
%    \begin{macrocode}
\tl_new:N \l__unicode_tmp_tl
%    \end{macrocode}
% \end{variable}
%
% The first step is to generate a series of temporary variables to
% contain the data as it's extracted. This requires a nested loop
% over the ten decimal digits, giving $100$ token lists for each of
% the lower and upper case mappings. The two digits in each name are
% the tens and units digits of the code point, so each token list
% holds the mappings for code points sharing the same value modulo $100$.
%    \begin{macrocode}
\tl_map_inline:nn { 0123456789 }
  {
    \tl_map_inline:nn { 0123456789 }
      {
        \tl_new:c { l__unicode_lower_ #1 _X_ ##1 _tl }
        \tl_new:c { l__unicode_upper_ #1 _X_ ##1 _tl }
      }
  }
%    \end{macrocode}
%
% \begin{variable}{\l__unicode_compat_seq}
%   A sequence to hold the list of compatibility chars currently defined by
%   Unicode. This is needed for both case mapping and case folding (it's
%   defined by information in the master file |UnicodeData.txt|).
%    \begin{macrocode}
\seq_new:N \l__unicode_compat_seq
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}^^A
%   {
%     \l__unicode_lower_exceptions_tl ,
%     \l__unicode_title_exceptions_tl ,
%     \l__unicode_upper_exceptions_tl
%   }
%   Exceptions to the usual mappings.
%    \begin{macrocode}
\tl_new:N \l__unicode_lower_exceptions_tl
\tl_new:N \l__unicode_title_exceptions_tl
\tl_new:N \l__unicode_upper_exceptions_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\__unicode_file_map:nn}
%   A utility function for handling file reading.
%   The first argument is the name of the file to open on
%   \cs{g__unicode_data_ior}; the second is the inline code applied to
%   each line of that file (read as a string and available as |#1| in
%   the code). If the stream is at end-of-file immediately after opening
%   then an error is raised; otherwise the stream is closed once the
%   mapping has finished.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_file_map:nn #1#2
  {
    \ior_open:Nn \g__unicode_data_ior {#1}
    \ior_if_eof:NTF \g__unicode_data_ior
      { \__msg_kernel_error:nnn { unicode } { file-not-found } {#1} }
      {
        \ior_str_map_inline:Nn \g__unicode_data_ior {#2}
        \ior_close:N \g__unicode_data_ior
      }
  }
\__msg_kernel_new:nnn { unicode } { file-not-found }
  { Could~not~find~data~file~'#1'. }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\g__unicode_data_ior}
% \begin{variable}{\g__unicode_result_iow}
%   Streams for reading and writing the data.
%    \begin{macrocode}
\ior_new:N \g__unicode_data_ior
\iow_new:N \g__unicode_result_iow
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% Open the data file for writing.
%    \begin{macrocode}
\iow_open:Nn \g__unicode_result_iow { l3unicode-data.def }
%    \end{macrocode}
%
% \subsection{Verbatim copying}
%
% \begin{macro}[int]{\__unicode_verb:}
% \begin{macro}[aux]{\__unicode_verb_auxi:w, \__unicode_verb_auxii:w}
% \begin{macro}[int]{\__unicode_verb_end:}
%   There are various bits of code which need to be transferred into the data
%   file from the source. This has to take place as part of the general writing
%   process so needs to be done without using DocStrip. That is achieved by
%   having a verbatim-copy mechanism available: this is all set up here.
%   As the line containing the \cs{__unicode_verb:} function will end up with a
%   (category code $12$) space at the start, there is a dedicated function to
%   clear this part up.
%   Copying is line based: each line is grabbed up to the (category code
%   $12$) end-of-line character and written to the output stream, until a
%   line consisting of exactly the string form of \cs{__unicode_verb_end:}
%   is found. The end marker must therefore be on a line of its own with
%   no leading white space.
%    \begin{macrocode}
\group_begin:
  \char_set_catcode_other:n { `\^^M }%
  \cs_new_protected:Npn \__unicode_verb:%
    {%
      \group_begin:%
        \char_set_catcode_other:n { `\^^M }%
        \tex_endlinechar:D = `\^^M%
        \clist_map_inline:nn%
          { \\ , \{ , \} , \# , \^ , \% , \ }%
          { \char_set_catcode_other:n { `##1 } }%
        \__unicode_verb_auxi:w%
    }%
  \cs_new_protected:Npn \__unicode_verb_auxi:w#1^^M%
    {%
      \exp_after:wN \__unicode_verb_auxii:w \use_none:n #1 ^^M }%
  \cs_new_protected:Npn \__unicode_verb_auxii:w#1^^M%
    {%
      \str_if_eq_x:nnTF {#1} { \token_to_str:N \__unicode_verb_end: }%
        { \group_end: }%
        {%
          \iow_now:Nn \g__unicode_result_iow {#1}%
          \__unicode_verb_auxii:w%
        }%
    }%
\group_end:%
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{Shared data}
%
% There are various lines that now need to go at the start of the file.
% First, there is some header information.
%    \begin{macrocode}
\__unicode_verb:
%% This is the file l3unicode-data.def
%% generated using the script l3unicode-data.dtx.
%%
%% The data here are derived from the files
%% - UnicodeData.txt
%% - SpecialCasing.txt
%% - CaseFolding.txt
%% which are maintained by the Unicode Consortium.
%%
\__unicode_verb_end:
%    \end{macrocode}
%
% \begin{macro}[EXP]{\__unicode_zero_fill:N}
%   A short piece of reusable code for the header: expands to the value
%   of the integer |#1|, preceded by a |0| when that value is below ten.
%    \begin{macrocode}
\cs_new:Npn \__unicode_zero_fill:N #1
  { \int_compare:nNnT #1 < \c_ten { 0 } \int_use:N #1 }
%    \end{macrocode}
% \end{macro}
%
% Automatically include the current date.
%    \begin{macrocode}
\iow_now:Nx \g__unicode_result_iow
  {
    \iow_char:N \% \iow_char:N \% \c_space_tl
    Generated~on~
    \int_use:N \tex_year:D -
    \__unicode_zero_fill:N \tex_month:D -
    \__unicode_zero_fill:N \tex_day:D .
  }
\iow_now:Nx \g__unicode_result_iow
  { \iow_char:N \% \iow_char:N \% }
%    \end{macrocode}
% Write an identification line to the file: the file data here can't be set
% automatically and so will need to be edited by hand. As such, the data here
% is the standard SVN filler.
%    \begin{macrocode}
\iow_now:Nx \g__unicode_result_iow
  {
    \exp_not:N \ProvidesExplFile
    { l3unicode-data.def } ~
    {
      \int_use:N \tex_year:D /
      \__unicode_zero_fill:N \tex_month:D /
      \__unicode_zero_fill:N \tex_day:D
    } ~
    { -1 } ~
    { L3~Unicode~data }
  }
%    \end{macrocode}
%
% \subsection{\pdfTeX{} support}
%
% As \pdfTeX{} does not support Unicode input natively, most of the data
% here will not be useful. Rather than use two separate mechanisms for
% each function depending on the engine, the system is designed such that
% \enquote{truncated} data structures are provided for \pdfTeX{}. These
% are coded here for direct transfer to the |.def| file, which can then
% abort loading when \pdfTeX{} is in use.
%
% The idea here is simple: map over all of the letters of the Latin
% alphabet and create appropriate token lists, then add all of the rest
% of the data structures.
%
% After the mapping, the small number of fixed data structures that are
% used for the special case conversions are created. These are mainly empty,
% but for cases where a match is possible (as the test char is in the \pdfTeX{}
% range), no-op data is included (as the \emph{output} would be out-of-range).
%
% Everything between \cs{__unicode_verb:} and \cs{__unicode_verb_end:} is
% copied line-by-line to the output file rather than executed here, so no
% comments may be added inside this block.
%    \begin{macrocode}
\__unicode_verb:
\pdftex_if_engine:T
  {
    \group_begin:
      \cs_set_protected:Npn \__unicode_tmp:NN #1#2
        {
          \quark_if_recursion_tail_stop:N #1
          \exp_after:wN \__unicode_tmp:NNNNNNN
            \tex_number:D \__int_eval:w `#1 \exp_after:wN \__int_eval_end:
            \tex_number:D \__int_eval:w 100 + `#2 \__int_eval_end:
            #1 #2
          \__unicode_tmp:NN
        }
      \cs_set_protected:Npn \__unicode_tmp:NNNNNNN #1#2#3#4#5#6#7
        {
          \tl_const:cx
            { c__unicode_fold_ #1 _X_ #2 _ tl } { #6#7 }
          \tl_const:cn
            { c__unicode_lower_ #1 _X_ #2 _ tl } { #6#7 }
          \tl_const:cn
            { c__unicode_upper_ #4 _X_ #5 _ tl } { #7#6 }
        }
      \__unicode_tmp:NN
        AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz
        \q_recursion_tail ?
      \q_recursion_stop
    \group_end:
    \int_step_inline:nnnn { 0 } { 1 } { 9 }
      {
        \int_step_inline:nnnn { 0 } { 1 } { 9 }
          {
            \clist_map_inline:nn { fold , lower , upper }
              {
                \tl_if_exist:cF { c__unicode_ ####1 _ #1 _X_ ##1 _ tl }
                  {
                    \tl_const:cn { c__unicode_ ####1 _ #1 _X_ ##1 _ tl } { }
                  }
              }
          }
      }
    \tl_const:Nn \c__unicode_lower_exceptions_tl { }
    \tl_const:Nn \c__unicode_mixed_exceptions_tl { }
    \tl_const:Nn \c__unicode_upper_exceptions_tl { }
    \tl_const:Nn \c__unicode_std_sigma_tl { }
    \tl_const:Nn \c__unicode_final_sigma_tl { }
    \tl_const:Nn \c__unicode_accents_lt_tl { }
    \tl_const:Nn \c__unicode_dot_above_tl { }
    \tl_const:Nn \c__unicode_dotless_i_tl { I }
    \tl_const:Nn \c__unicode_dotted_I_tl { i }
    \tl_const:Nn \c__unicode_i_ogonek_tl { }
    \tl_const:Nn \c__unicode_I_ogonek_tl { }
    \tex_endinput:D
  }
\__unicode_verb_end:
%    \end{macrocode}
%
% \subsection{Upper/lower/title casing}
%
% \begin{macro}{\__unicode_parse_line:w}
% \begin{macro}[aux]{\__unicode_parse_line_auxi:w}
% \begin{macro}[aux]{\__unicode_parse_line_auxii:nw}
% \begin{macro}[aux]{\__unicode_parse_line_auxiii:nw}
% \begin{macro}[aux]{\__unicode_parse_line_auxiv:nnn}
% \begin{macro}[aux]{\__unicode_parse_line_auxv:wnnn}
% \begin{macro}[aux]{\__unicode_parse_line_auxvi:nnNNn}
% \begin{macro}[aux]{\__unicode_brace:n}
%   When parsing |UnicodeData.txt| there are no comments or blank lines. Case
%   mappings when present here are one-to-one and so are easy to deal with. The
%   slight complication here is that the lines are rather long, so a multi-part
%   approach is needed to grab the correct parts of the line as arguments. Of
%   the first set of arguments, the two that are needed are |#1| (the code
%   point) and |#6| (details about the code point which may include the fact
%   it's a compatibility char).
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_parse_line:w
  #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 ; #9 ;
  { \__unicode_parse_line_auxi:w #1 ; #6 ; }
%    \end{macrocode}
%   With some data items removed, at this stage the hexadecimal
%   representation of the char is |#1|, any compatibility char information is
%   in |#2|, the upper case code point is |#6|, the lower case one |#7| and
%   the title case one |#8|. These may or may not be present and the upper and
%   title case values may be identical. The compatibility data is first
%   extracted into a sequence, then the main information is processed. Where
%   there are values for upper/lower case, they are saved into the appropriate
%   token lists. For title case, the number of exceptions is small and so
%   they are saved in a single dedicated space. Note that there is a space at
%   the end of |#8| as we are reading the data in with spaces not ignored: that
%   has to be allowed for to get the equality test right. The
%   \enquote{business end} of the code here is inside a rescan block so the
%   later parts of the code do not need to be concerned with string
%   \emph{versus} standard category codes.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_parse_line_auxi:w
  #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 \q_stop
  {
    \use:x
      {
        \__unicode_parse_line_auxii:nw {#1}
          #2 \tl_to_str:n { <compat> } \c_space_tl \exp_not:N \q_stop
      }
    \tl_rescan:nn { }
      {
        \__unicode_parse_line_auxiv:nnn {#1} {#6} { upper }
        \__unicode_parse_line_auxiv:nnn {#1} {#7} { lower }
        \bool_if:nF
          {
            \tl_if_empty_p:n {#6} ||
            \str_if_eq_p:nn {#6} {#8}
          }
          {
            \seq_if_in:NnTF \l__unicode_compat_seq {#1}
              { \cs_set:Npn \__unicode_brace:n ##1 { { ##1 } } }
              { \cs_set_eq:NN \__unicode_brace:n \use:n }
            \tl_put_right:Nx \l__unicode_title_exceptions_tl
              {
                \__unicode_brace:n { \luatex_Uchar:D "#1 \c_space_tl }
                \__unicode_brace:n { \luatex_Uchar:D "#8 \c_space_tl }
              }
          }
      }
  }
%    \end{macrocode}
%   Compatibility chars have information as the marker |<compat>| then a list
%   of one to three resulting code points. The one-to-one cases are not an
%   issue for dealing with the data, so it's only the more complex versions
%   that need to be recorded. The marker followed by a space is appended to
%   the data by the caller so that the delimited argument here always finds
%   a match: if the marker was not in the data itself then |#3| is blank and
%   nothing is done.
%    \begin{macrocode}
\use:x
  {
    \cs_new_protected:Npn \exp_not:N \__unicode_parse_line_auxii:nw
      ##1##2 \tl_to_str:n { <compat> } ~ ##3 \exp_not:N \q_stop
  }
  {
    \tl_if_blank:nF {#3}
      { \__unicode_parse_line_auxiii:nw {#1} #3 ~ \q_stop }
  }
\cs_new_protected:Npn \__unicode_parse_line_auxiii:nw #1#2 ~ #3 \q_stop
  {
    \tl_if_blank:nF {#3}
      { \seq_put_right:Nn \l__unicode_compat_seq {#1} }
  }
%    \end{macrocode}
%   The array structure here is in two parts, one for upper and one
%   for lower case mappings. Adding $1000000$ to the code point gives a
%   fixed-width decimal number from which the tens and units digits
%   (|#6| and |#7| of the second auxiliary) can be picked out directly.
%   The third argument implied by the name of the first auxiliary is left
%   in the input stream and is picked up as the final argument of
%   \cs{__unicode_parse_line_auxvi:nnNNn}.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_parse_line_auxiv:nnn #1#2
  {
    \exp_last_unbraced:Nf \__unicode_parse_line_auxv:wnnn
      { \int_eval:n { 1000000 + "#1 } } \q_stop {#1} {#2}
  }
\cs_new_protected:Npn \__unicode_parse_line_auxv:wnnn
  #1#2#3#4#5#6#7 \q_stop #8#9
  { \__unicode_parse_line_auxvi:nnNNn {#8} {#9} #6 #7 }
%    \end{macrocode}
%   Store data as long as there is something to actually do in a mapping.
%   For entries in the compatibility list there is a need to add braces
%   around the chars in case there is any normalisation during file reading.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_parse_line_auxvi:nnNNn #1#2#3#4#5
  {
    \tl_if_empty:nF {#2}
      {
        \seq_if_in:NnTF \l__unicode_compat_seq {#1}
          { \cs_set:Npn \__unicode_brace:n ##1 { { ##1 } } }
          { \cs_set_eq:NN \__unicode_brace:n \use:n }
        \tl_put_right:cx { l__unicode_ #5 _ #3 _X_ #4 _tl }
          {
            \__unicode_brace:n { \luatex_Uchar:D "#1 \c_space_tl }
            \__unicode_brace:n { \luatex_Uchar:D "#2 \c_space_tl }
          }
      }
  }
\cs_new_eq:NN \__unicode_brace:n \use:n
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% Everything is set up and so the read loop can take place.
%    \begin{macrocode}
\__unicode_file_map:nn { UnicodeData.txt }
  { \__unicode_parse_line:w #1 \q_stop }
%    \end{macrocode}
%
% \begin{macro}{\__unicode_parse_line:w}
% \begin{macro}[aux]{\__unicode_parse_line_auxii:w}
% \begin{macro}[aux]{\__unicode_parse_line_auxiii:nnn}
% \begin{macro}[aux]{\__unicode_parse_line_auxiv:nwn}
%   The file |SpecialCasing.txt| uses C-style comments and may contain
%   blank lines: those two awkward situations need to be filtered out before
%   parsing the real data in the line.
%    \begin{macrocode}
\cs_set_protected:Npn \__unicode_parse_line:w #1 \q_stop
  {
    \tl_if_blank:nF {#1}
      {
        \str_if_eq_x:nnF { \tl_head:n {#1} } { \cs_to_str:N \# }
          { \__unicode_parse_line_auxii:w #1 \q_stop }
      }
  }
%    \end{macrocode}
%   Here, |#1| is the code point for the input, |#2| is the lower case mapping,
%   |#3| the title case mapping and |#4| the upper case mapping: all three
%   mappings are always given even if they are also in |UnicodeData.txt|. As
%   most of the title case exceptions are also upper case exceptions, a test is
%   made so that we are only storing truly useful exceptions for title case.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_parse_line_auxii:w
  #1 ;~ #2 ;~ #3 ;~ #4 ; #5 \q_stop
  {
    \__unicode_parse_line_auxiii:nnn {#1} {#2} { lower }
    \str_if_eq:nnF {#3} {#4}
      { \__unicode_parse_line_auxiii:nnn {#1} {#3} { title } }
    \__unicode_parse_line_auxiii:nnn {#1} {#4} { upper }
  }
%    \end{macrocode}
%   For each mapping there may be one, two or three code points in the
%   output. After a bit of a trick to allow for ease of parsing, we check if
%   there are at least two numbers for the case-changed char. If there are,
%   then save the exception. If not, then the value will also be in the main
%   table and we can ignore it here. There is also a test to see if the
%   current value is a title case exception: they don't need extra braces
%   for those.
%
%   The trick is the use of \cs{use:n}: it keeps the two space tokens
%   which follow |#2| apart in the source, so that both are present as
%   delimiters when \cs{__unicode_parse_line_auxiv:nwn} grabs its
%   arguments.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_parse_line_auxiii:nnn #1#2#3
  {
    \use:n { \__unicode_parse_line_auxiv:nwn {#1} #2 ~ } ~ \q_stop {#3}
  }
\cs_new_protected:Npn \__unicode_parse_line_auxiv:nwn
  #1#2 ~ #3 ~ #4 \q_stop #5
  {
    \tl_if_empty:nF {#3}
      {
        \seq_if_in:NnTF \l__unicode_compat_seq {#1}
          { \cs_set:Npn \__unicode_brace:n ##1 { { ##1 } } }
          { \cs_set_eq:NN \__unicode_brace:n \use:n }
        \tl_rescan:nn { }
          {
            \tl_put_right:cx { l__unicode_ #5 _exceptions_tl }
              {
                \__unicode_brace:n { \luatex_Uchar:D "#1 \c_space_tl }
                {
                  \luatex_Uchar:D "#2 \c_space_tl
                  \luatex_Uchar:D "#3 \c_space_tl
                  \tl_if_empty:nF {#4}
                    { \luatex_Uchar:D "#4 \c_space_tl }
                }
              }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% Parsing set up, read the special cases file. The input contains both
% general special cases and ones dependent on context. We only want to read
% the former, so there is a check for the line that splits the two:
% at that point, simply stop parsing.
%    \begin{macrocode}
\__unicode_file_map:nn { SpecialCasing.txt }
  {
    \str_if_eq_x:nnTF {#1}
      { \cs_to_str:N \# \c_space_tl Conditional~Mappings }
      { \ior_map_break: }
      { \__unicode_parse_line:w #1 \q_stop }
  }
%    \end{macrocode}
%
% Saving the data uses a single file, with the two arrays followed by the
% exceptions. The lower case token lists are cleared once written as they
% are reused later to collect the case folding data. The title case
% exceptions are written under the name \enquote{mixed}.
%    \begin{macrocode}
\tl_map_inline:nn { 0123456789 }
  {
    \tl_map_inline:nn { 0123456789 }
      {
        \iow_now:Nx \g__unicode_result_iow
          {
            \tl_const:cn
              { ~ c__unicode_lower_ #1 _X_ ##1 _tl ~ } ~
              { ~ \exp_not:v { l__unicode_lower_ #1 _X_ ##1 _tl } ~ }
          }
        \tl_clear:c { l__unicode_lower_ #1 _X_ ##1 _tl }
      }
  }
\tl_map_inline:nn { 0123456789 }
  {
    \tl_map_inline:nn { 0123456789 }
      {
        \iow_now:Nx \g__unicode_result_iow
          {
            \tl_const:cn
              { ~ c__unicode_upper_ #1 _X_ ##1 _tl ~ } ~
              { ~ \exp_not:v { l__unicode_upper_ #1 _X_ ##1 _tl } ~ }
          }
      }
  }
\clist_map_inline:nn { upper , lower , title }
  {
    \iow_now:Nx \g__unicode_result_iow
      {
        \tl_const:Nn
          \exp_not:c
            {
              c__unicode_
              \str_if_eq:nnTF {#1} { title } { mixed } {#1}
              _exceptions_tl
            }
          { \exp_not:v { l__unicode_ #1 _ exceptions_tl } }
      }
  }
%    \end{macrocode}
%
% Data for the special cases is now stored. This is mainly a series of simple
% token lists with appropriate names and content, but there is also one place
% where a small mapping list is required.
% The recursion here takes pairs of arguments, the first being the name
% part of the constant to create and the second the hexadecimal code
% point it should contain, and stops at the recursion tail marker.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_special_case:nn #1#2
  {
    \quark_if_recursion_tail_stop:n {#1}
    \iow_now:Nx \g__unicode_result_iow
      {
        \tl_const:Nn
          \exp_not:c { c__unicode_ #1 _tl }
          { ~ \luatex_Uchar:D "#2 \c_space_tl \c_space_tl }
      }
    \__unicode_special_case:nn
  }
\__unicode_special_case:nn
  { std_sigma   } { 03C3 }
  { final_sigma } { 03C2 }
  { dotless_i   } { 0131 }
  { dot_above   } { 0307 }
  { dotted_I    } { 0130 }
  { i_ogonek    } { 012F }
  { I_ogonek    } { 012E }
  \q_recursion_tail { }
  \q_recursion_stop
\iow_now:Nx \g__unicode_result_iow
  {
    \tl_const:Nn \exp_not:N \c__unicode_accents_lt_tl
      {
        \luatex_Uchar:D "00CC
          {
            \luatex_Uchar:D "0069
            \luatex_Uchar:D "0307
            \luatex_Uchar:D "0300
          }
        \luatex_Uchar:D "00CD
          {
            \luatex_Uchar:D "0069
            \luatex_Uchar:D "0307
            \luatex_Uchar:D "0301
          }
        \luatex_Uchar:D "0128
          {
            \luatex_Uchar:D "0069
            \luatex_Uchar:D "0307
            \luatex_Uchar:D "0303
          }
      }
  }
%    \end{macrocode}
%
% \subsection{Case folding}
%
% \begin{macro}{\__unicode_parse_line:w}
% \begin{macro}[aux]{\__unicode_parse_line_auxi:Nw}
% \begin{macro}[aux]{\__unicode_parse_line_auxii:w}
% \begin{macro}[aux]{\__unicode_parse_line_auxiii:nw}
% \begin{macro}[aux]{\__unicode_parse_line_auxiv:nn}
% \begin{macro}[aux]{\__unicode_parse_line_auxv:wnn}
%   As for |SpecialCasing.txt|, the format of |CaseFolding.txt| allows both
%   blank lines and C-style comments starting with |#|.
%    \begin{macrocode}
\cs_set_protected:Npn \__unicode_parse_line:w #1 \q_stop
  {
    \tl_if_blank:nF {#1}
      { \__unicode_parse_line_auxi:Nw #1 \q_stop }
  }
\cs_set_protected:Npn \__unicode_parse_line_auxi:Nw #1#2 \q_stop
  {
    \str_if_eq_x:nnF { \exp_not:n {#1} } { \cs_to_str:N \# }
      { \__unicode_parse_line_auxii:w #1#2 \q_stop }
  }
%    \end{macrocode}
%   For lines actually containing data, there will be four entries separated by
%   |;| tokens: the hex code for the char itself, which folding r\'{e}gimes
%   the line applies to, the hex code(s) for the folded char and a
%   description. Of these, we need all but the last one.
%   In the simple
%   case of core foldings, the mapping is one-to-one and this information
%   can be passed directly to the next stage. We also handle the full
%   mappings (dropping simple ones plus any Turkic variation): an additional
%   step is needed to parse this case.
%    \begin{macrocode}
\cs_set_protected:Npn \__unicode_parse_line_auxii:w
  #1 ;~ #2 ; #3 ; #4 \q_stop
  {
    \str_if_eq:nnTF {#2} { C }
      {
        \__unicode_parse_line_auxiv:nn {#1}
          { \luatex_Uchar:D "#3 \c_space_tl }
      }
      {
        \str_if_eq:nnT {#2} { F }
          { \__unicode_parse_line_auxiii:nw {#1} #3 ~ \q_stop }
      }
  }
%    \end{macrocode}
%   Full folding produces two or three Unicode code points from a single
%   input char. To deal with this, we split the relevant part of the input
%   and check how many chars to generate. The entire folding output is
%   braced so that when read back \TeX{} will see this as a group in our
%   replacement code: the only exceptions occur when the input char is on
%   the compatibility list, as that would lead to an extra set of braces.
%    \begin{macrocode}
\cs_set_protected:Npn \__unicode_parse_line_auxiii:nw
  #1 ~ #2 ~ #3 ~ #4 \q_stop
  {
    \seq_if_in:NnTF \l__unicode_compat_seq {#1}
      { \cs_set_eq:NN \__unicode_brace:n \use:n }
      { \cs_set:Npn \__unicode_brace:n ##1 { { ##1 } } }
    \exp_args:Nno \__unicode_parse_line_auxiv:nn {#1}
      {
        \__unicode_brace:n
          {
            \luatex_Uchar:D "#2 \c_space_tl
            \luatex_Uchar:D "#3 \c_space_tl
            \tl_if_empty:nF {#4} { \luatex_Uchar:D "#4 \c_space_tl }
          }
      }
  }
%    \end{macrocode}
%   The final stage of extracting the mapping is to split the various cases
%   up such that comparison and replacement does not need to check every
%   character. That is done by taking the charcode modulo $100$: this splits
%   the list of chars into $100$ much shorter lists. With that done, the
%   input and output chars are added to the appropriate token lists.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_parse_line_auxiv:nn #1#2
  {
    \exp_last_unbraced:Nf \__unicode_parse_line_auxv:wnn
      { \int_eval:n { 1000000 + "#1 } } \q_stop {#1} {#2}
  }
%    \end{macrocode}
%   As the input is read in string mode, there is a need for a rescan
%   here since \tn{Uchar} requires letters for hexadecimal digits
%   beyond~$9$. The folding data is collected in the token lists used
%   earlier for the lower case mappings.
%    \begin{macrocode}
\cs_new_protected:Npn \__unicode_parse_line_auxv:wnn
  #1#2#3#4#5#6#7 \q_stop #8#9
  {
    \seq_if_in:NnTF \l__unicode_compat_seq {#8}
      { \cs_set:Npn \__unicode_brace:n ##1 { { ##1 } } }
      { \cs_set_eq:NN \__unicode_brace:n \use:n }
    \tl_rescan:nn { }
      {
        \tl_put_right:cx { l__unicode_lower_ #6 _X_ #7 _tl }
          {
            \__unicode_brace:n { \luatex_Uchar:D "#8 \c_space_tl }
            \__unicode_brace:n { #9 }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% The main loop can now take place, reading the source data and saving all of
% the information in the token list array.
%    \begin{macrocode}
\__unicode_file_map:nn { CaseFolding.txt }
  { \__unicode_parse_line:w #1 \q_stop }
%    \end{macrocode}
%
% Write the processed data to the |.def| file.
%    \begin{macrocode}
\tl_map_inline:nn { 0123456789 }
  {
    \tl_map_inline:nn { 0123456789 }
      {
        \iow_now:Nx \g__unicode_result_iow
          {
            \tl_const:cn
              { ~ c__unicode_fold_ #1 _X_ ##1 _tl ~ } ~
              { ~ \exp_not:v { l__unicode_lower_ #1 _X_ ##1 _tl } ~ }
          }
        \tl_clear:c { l__unicode_lower_ #1 _X_ ##1 _tl }
      }
  }
%    \end{macrocode}
%
% Job done, end the \TeX{} run.
%    \begin{macrocode}
\iow_close:N \g__unicode_result_iow
\tex_end:D
%    \end{macrocode}
%
%    \begin{macrocode}
%</script>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex