Updates from Fredrik Lundh <effbot@telia.com> about Unicode-related behavior.
2024-12-05 07:43:50 +08:00 · 2000-09-25 17:52:40 +00:00 · 2000-09-25 17:52:40 +00:00 · e53793bf4c
commit e53793bf4c
parent af57431701
1 changed files with 43 additions and 31 deletions
--- a/Doc/lib/libre.tex
+++ b/Doc/lib/libre.tex
@@ -175,13 +175,14 @@ Extensions usually do not create a new group;
 \regexp{(?P<\var{name}>...)} is the only exception to this rule.
 Following are the currently supported extensions.

-\item[\code{(?iLmsx)}] (One or more letters from the set \character{i},
-\character{L}, \character{m}, \character{s}, \character{x}.)  The group matches
-the empty string; the letters set the corresponding flags
-(\constant{re.I}, \constant{re.L}, \constant{re.M}, \constant{re.S},
-\constant{re.X}) for the entire regular expression.  This is useful if
-you wish to include the flags as part of the regular expression, instead
-of passing a \var{flag} argument to the \function{compile()} function. 
+\item[\code{(?iLmsux)}] (One or more letters from the set \character{i},
+\character{L}, \character{m}, \character{s}, \character{u},
+\character{x}.)  The group matches the empty string; the letters set
+the corresponding flags (\constant{re.I}, \constant{re.L},
+\constant{re.M}, \constant{re.S}, \constant{re.U}, \constant{re.X})
+for the entire regular expression.  This is useful if you wish to
+include the flags as part of the regular expression, instead of
+passing a \var{flag} argument to the \function{compile()} function.

 \item[\code{(?:...)}] A non-grouping version of regular parentheses.
 Matches whatever regular expression is inside the parentheses, but the
@@ -227,7 +228,6 @@ resulting RE will match the second character.  For example,

 \begin{list}{}{\leftmargin 0.7in \labelwidth 0.65in}

-%
 \item[\code{\e \var{number}}] Matches the contents of the group of the
 same number.  Groups are numbered starting from 1.  For example,
 \regexp{(.+) \e 1} matches \code{'the the'} or \code{'55 55'}, but not
@@ -238,45 +238,50 @@ is 0, or \var{number} is 3 octal digits long, it will not be interpreted
 as a group match, but as the character with octal value \var{number}.
 Inside the \character{[} and \character{]} of a character class, all numeric
 escapes are treated as characters. 
-%
+
 \item[\code{\e A}] Matches only at the start of the string.
-%
+
 \item[\code{\e b}] Matches the empty string, but only at the
 beginning or end of a word.  A word is defined as a sequence of
 alphanumeric characters, so the end of a word is indicated by
 whitespace or a non-alphanumeric character.  Inside a character range,
 \regexp{\e b} represents the backspace character, for compatibility with
 Python's string literals.
-%
+
 \item[\code{\e B}] Matches the empty string, but only when it is
 \emph{not} at the beginning or end of a word.
-%
+
 \item[\code{\e d}]Matches any decimal digit; this is
 equivalent to the set \regexp{[0-9]}.
-%
+
 \item[\code{\e D}]Matches any non-digit character; this is
 equivalent to the set \regexp{[{\^}0-9]}.
-%
+
 \item[\code{\e s}]Matches any whitespace character; this is
 equivalent to the set \regexp{[ \e t\e n\e r\e f\e v]}.
-%
+
 \item[\code{\e S}]Matches any non-whitespace character; this is
 equivalent to the set \regexp{[\^\ \e t\e n\e r\e f\e v]}.
-%
-\item[\code{\e w}]When the \constant{LOCALE} flag is not specified,
+
+\item[\code{\e w}]When the \constant{LOCALE} and \constant{UNICODE}
+flags are not specified,
 matches any alphanumeric character; this is equivalent to the set
 \regexp{[a-zA-Z0-9_]}.  With \constant{LOCALE}, it will match the set
-\regexp{[0-9_]} plus whatever characters are defined as letters for the
-current locale.
-%
-\item[\code{\e W}]When the \constant{LOCALE} flag is not specified,
-matches any non-alphanumeric character; this is equivalent to the set
-\regexp{[{\^}a-zA-Z0-9_]}.   With \constant{LOCALE}, it will match any
-character not in the set \regexp{[0-9_]}, and not defined as a letter
-for the current locale.
+\regexp{[0-9_]} plus whatever characters are defined as letters for
+the current locale.  If \constant{UNICODE} is set, this will match the
+characters \regexp{[0-9_]} plus whatever is classified as alphanumeric
+in the Unicode character properties database.
+
+\item[\code{\e W}]When the \constant{LOCALE} and \constant{UNICODE}
+flags are not specified, matches any non-alphanumeric character; this
+is equivalent to the set \regexp{[{\^}a-zA-Z0-9_]}.   With
+\constant{LOCALE}, it will match any character not in the set
+\regexp{[0-9_]}, and not defined as a letter for the current locale.
+If \constant{UNICODE} is set, this will match anything other than
+\regexp{[0-9_]} and characters marked as alphanumeric in the Unicode
+character properties database.

 \item[\code{\e Z}]Matches only at the end of the string.
-%

 \item[\code{\e \e}] Matches a literal backslash.

@@ -354,8 +359,8 @@ lowercase letters, too.  This is not affected by the current locale.

 \begin{datadesc}{L}
 \dataline{LOCALE}
-Make \regexp{\e w}, \regexp{\e W}, \regexp{\e b},
-\regexp{\e B}, dependent on the current locale. 
+Make \regexp{\e w}, \regexp{\e W}, \regexp{\e b}, and
+\regexp{\e B} dependent on the current locale. 
 \end{datadesc}

 \begin{datadesc}{M}
@@ -372,9 +377,16 @@ newline (if any) at the end of the string.

 \begin{datadesc}{S}
 \dataline{DOTALL}
-Make the \character{.} special character match any character at all, including a
-newline; without this flag, \character{.} will match anything \emph{except}
-a newline.
+Make the \character{.} special character match any character at all,
+including a newline; without this flag, \character{.} will match
+anything \emph{except} a newline.
+\end{datadesc}
+
+\begin{datadesc}{U}
+\dataline{UNICODE}
+Make \regexp{\e w}, \regexp{\e W}, \regexp{\e b}, and
+\regexp{\e B} dependent on the Unicode character properties database.
+\versionadded{2.0}
 \end{datadesc}

 \begin{datadesc}{X}