Upgrade PCRE lib to 6.7

This commit is contained in:
Ilia Alshanetsky 2006-08-30 20:00:23 +00:00
parent 307b3bcbb4
commit 45debc52ef
39 changed files with 3707 additions and 1281 deletions

View File

@ -5,7 +5,7 @@ ARG_WITH("pcre-regex", "Perl Compatible Regular Expressions", "yes");
if (PHP_PCRE_REGEX == "yes") {
EXTENSION("pcre", "php_pcre.c", PHP_PCRE_REGEX_SHARED,
"-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DNO_RECURSE -Iext/pcre/pcrelib");
"-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DNO_RECURSE -Iext/pcre/pcrelib");
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucp_searchfuncs.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_info.c pcre_maketables.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_try_flipped.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
ADD_DEF_FILE("ext\\pcre\\php_pcre.def");

View File

@ -13,7 +13,7 @@ PHP_ARG_WITH(pcre-regex,for PCRE support,
if test "$PHP_PCRE_REGEX" != "no"; then
if test "$PHP_PCRE_REGEX" = "yes"; then
PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -I@ext_srcdir@/pcrelib)
PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -I@ext_srcdir@/pcrelib)
PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
PHP_INSTALL_HEADERS([ext/pcre], [php_pcre.h pcrelib/])
AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])
@ -51,7 +51,7 @@ if test "$PHP_PCRE_REGEX" != "no"; then
AC_DEFINE(HAVE_PCRE, 1, [ ])
PHP_ADD_INCLUDE($PCRE_INCDIR)
PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000)
PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000)
fi
PHP_SUBST(PCRE_SHARED_LIBADD)
fi

View File

@ -8,7 +8,7 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2005 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
All rights reserved
@ -17,7 +17,7 @@ THE C++ WRAPPER LIBRARY
Written by: Google Inc.
Copyright (c) 2005 Google Inc
Copyright (c) 2006 Google Inc
All rights reserved
####

View File

@ -22,7 +22,7 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2005 University of Cambridge
Copyright (c) 1997-2006 University of Cambridge
All rights reserved.
@ -31,7 +31,7 @@ THE C++ WRAPPER FUNCTIONS
Contributed by: Google Inc.
Copyright (c) 2005, Google Inc.
Copyright (c) 2006, Google Inc.
All rights reserved.

View File

@ -1,6 +1,175 @@
ChangeLog for PCRE
------------------
Version 6.7 04-Jul-06
---------------------
1. In order to handle tests when input lines are enormously long, pcretest has
been re-factored so that it automatically extends its buffers when
necessary. The code is crude, but this _is_ just a test program. The
default size has been increased from 32K to 50K.
2. The code in pcre_study() was using the value of the re argument before
testing it for NULL. (Of course, in any sensible call of the function, it
won't be NULL.)
3. The memmove() emulation function in pcre_internal.h, which is used on
systems that lack both memmove() and bcopy() - that is, hardly ever -
was missing a "static" storage class specifier.
4. When UTF-8 mode was not set, PCRE looped when compiling certain patterns
containing an extended class (one that cannot be represented by a bitmap
because it contains high-valued characters or Unicode property items, e.g.
[\pZ]). Almost always one would set UTF-8 mode when processing such a
pattern, but PCRE should not loop if you do not (it no longer does).
[Detail: two cases were found: (a) a repeated subpattern containing an
extended class; (b) a recursive reference to a subpattern that followed a
previous extended class. It wasn't skipping over the extended class
correctly when UTF-8 mode was not set.]
5. A negated single-character class was not being recognized as fixed-length
in lookbehind assertions such as (?<=[^f]), leading to an incorrect
compile error "lookbehind assertion is not fixed length".
6. The RunPerlTest auxiliary script was showing an unexpected difference
between PCRE and Perl for UTF-8 tests. It turns out that it is hard to
write a Perl script that can interpret lines of an input file either as
byte characters or as UTF-8, which is what "perltest" was being required to
do for the non-UTF-8 and UTF-8 tests, respectively. Essentially what you
can't do is switch easily at run time between having the "use utf8;" pragma
or not. In the end, I fudged it by using the RunPerlTest script to insert
"use utf8;" explicitly for the UTF-8 tests.
7. In multiline (/m) mode, PCRE was matching ^ after a terminating newline at
the end of the subject string, contrary to the documentation and to what
Perl does. This was true of both matching functions. Now it matches only at
the start of the subject and immediately after *internal* newlines.
8. A call of pcre_fullinfo() from pcretest to get the option bits was passing
a pointer to an int instead of a pointer to an unsigned long int. This
caused problems on 64-bit systems.
9. Applied a patch from the folks at Google to pcrecpp.cc, to fix "another
instance of the 'standard' template library not being so standard".
10. There was no check on the number of named subpatterns nor the maximum
length of a subpattern name. The product of these values is used to compute
the size of the memory block for a compiled pattern. By supplying a very
long subpattern name and a large number of named subpatterns, the size
computation could be caused to overflow. This is now prevented by limiting
the length of names to 32 characters, and the number of named subpatterns
to 10,000.
11. Subpatterns that are repeated with specific counts have to be replicated in
the compiled pattern. The size of memory for this was computed from the
length of the subpattern and the repeat count. The latter is limited to
65535, but there was no limit on the former, meaning that integer overflow
could in principle occur. The compiled length of a repeated subpattern is
now limited to 30,000 bytes in order to prevent this.
12. Added the optional facility to have named substrings with the same name.
13. Added the ability to use a named substring as a condition, using the
Python syntax: (?(name)yes|no). This overloads (?(R)... and names that
are numbers (not recommended). Forward references are permitted.
14. Added forward references in named backreferences (if you see what I mean).
15. In UTF-8 mode, with the PCRE_DOTALL option set, a quantified dot in the
pattern could run off the end of the subject. For example, the pattern
"(?s)(.{1,5})"8 did this with the subject "ab".
16. If PCRE_DOTALL or PCRE_MULTILINE were set, pcre_dfa_exec() behaved as if
PCRE_CASELESS was set when matching characters that were quantified with ?
or *.
17. A character class other than a single negated character that had a minimum
but no maximum quantifier - for example [ab]{6,} - was not handled
correctly by pcre_dfa_exec(). It would match only one character.
18. A valid (though odd) pattern that looked like a POSIX character
class but used an invalid character after [ (for example [[,abc,]]) caused
pcre_compile() to give the error "Failed: internal error: code overflow" or
in some cases to crash with a glibc free() error. This could even happen if
the pattern terminated after [[ but there just happened to be a sequence of
letters, a binary zero, and a closing ] in the memory that followed.
19. Perl's treatment of octal escapes in the range \400 to \777 has changed
over the years. Originally (before any Unicode support), just the bottom 8
bits were taken. Thus, for example, \500 really meant \100. Nowadays the
output from "man perlunicode" includes this:
The regular expression compiler produces polymorphic opcodes. That
is, the pattern adapts to the data and automatically switches to
the Unicode character scheme when presented with Unicode data--or
instead uses a traditional byte scheme when presented with byte
data.
Sadly, a wide octal escape does not cause a switch, and in a string with
no other multibyte characters, these octal escapes are treated as before.
Thus, in Perl, the pattern /\500/ actually matches \100 but the pattern
/\500|\x{1ff}/ matches \500 or \777 because the whole thing is treated as a
Unicode string.
I have not perpetrated such confusion in PCRE. Up till now, it took just
the bottom 8 bits, as in old Perl. I have now made octal escapes with
values greater than \377 illegal in non-UTF-8 mode. In UTF-8 mode they
translate to the appropriate multibyte character.
20. Applied some refactoring to reduce the number of warnings from Microsoft
and Borland compilers. This has included removing the fudge introduced
seven years ago for the OS/2 compiler (see 2.02/2 below) because it caused
a warning about an unused variable.
21. PCRE has not included VT (character 0x0b) in the set of whitespace
characters since release 4.0, because Perl (from release 5.004) does not.
[Or at least, is documented not to: some releases seem to be in conflict
with the documentation.] However, when a pattern was studied with
pcre_study() and all its branches started with \s, PCRE still included VT
as a possible starting character. Of course, this did no harm; it just
caused an unnecessary match attempt.
22. Removed a now-redundant internal flag bit that recorded the fact that case
dependency changed within the pattern. This was once needed for "required
byte" processing, but is no longer used. This recovers a now-scarce options
bit. Also moved the least significant internal flag bit to the most-
significant bit of the word, which was not previously used (hangover from
the days when it was an int rather than a uint) to free up another bit for
the future.
23. Added support for CRLF line endings as well as CR and LF. As well as the
default being selectable at build time, it can now be changed at runtime
via the PCRE_NEWLINE_xxx flags. There are now options for pcregrep to
specify that it is scanning data with non-default line endings.
24. Changed the definition of CXXLINK to make it agree with the definition of
LINK in the Makefile, by changing LDFLAGS to CXXFLAGS.
25. Applied Ian Taylor's patches to avoid using another stack frame for tail
recursions. This makes a big difference to stack usage for some patterns.
26. If a subpattern containing a named recursion or subroutine reference such
as (?P>B) was quantified, for example (xxx(?P>B)){3}, the calculation of
the space required for the compiled pattern went wrong and gave too small a
value. Depending on the environment, this could lead to "Failed: internal
error: code overflow at offset 49" or "glibc detected double free or
corruption" errors.
27. Applied patches from Google (a) to support the new newline modes and (b) to
advance over multibyte UTF-8 characters in GlobalReplace.
28. Changed free() to pcre_free() in pcredemo.c. Apparently this makes a
difference for some implementations of PCRE on some Windows versions.
29. Added some extra testing facilities to pcretest:
\q<number> in a data line sets the "match limit" value
\Q<number> in a data line sets the "match recursion limit" value
-S <number> sets the stack size, where <number> is in megabytes
The -S option isn't available for Windows.
Version 6.6 06-Feb-06
---------------------

View File

@ -31,7 +31,7 @@ THE C++ WRAPPER FUNCTIONS
Contributed by: Google Inc.
Copyright (c) 2005, Google Inc.
Copyright (c) 2006, Google Inc.
All rights reserved.

View File

@ -1,6 +1,17 @@
News about PCRE releases
------------------------
Release 6.7 04-Jul-06
---------------------
The main additions to this release are the ability to use the same name for
multiple sets of parentheses, and support for CRLF line endings in both the
library and pcregrep (and in pcretest for testing).
Thanks to Ian Taylor, the stack usage for many kinds of pattern has been
significantly reduced for certain subject strings.
Release 6.5 01-Feb-06
---------------------

View File

@ -34,7 +34,7 @@ Documentation for PCRE
----------------------
If you install PCRE in the normal way, you will end up with an installed set of
man pages whose names all start with "pcre". The one that is called "pcre"
man pages whose names all start with "pcre". The one that is just called "pcre"
lists all the others. In addition to these man pages, the PCRE documentation is
supplied in two other forms; however, as there is no standard place to install
them, they are left in the doc directory of the unpacked source distribution.
@ -114,15 +114,17 @@ library. You can read more about them in the pcrebuild man page.
. If, in addition to support for UTF-8 character strings, you want to include
support for the \P, \p, and \X sequences that recognize Unicode character
properties, you must add --enable-unicode-properties to the "configure"
command. This adds about 90K to the size of the library (in the form of a
command. This adds about 30K to the size of the library (in the form of a
property table); only the basic two-letter properties such as Lu are
supported.
. You can build PCRE to recognize either CR or LF as the newline character,
instead of whatever your compiler uses for "\n", by adding --newline-is-cr or
--newline-is-lf to the "configure" command, respectively. Only do this if you
really understand what you are doing. On traditional Unix-like systems, the
newline character is LF.
. You can build PCRE to recognize either CR or LF or the sequence CRLF as
indicating the end of a line. Whatever you specify at build time is the
default; the caller of PCRE can change the selection at run time. The default
newline indicator is a single LF character (the Unix standard). You can
specify the default newline indicator by adding --newline-is-cr or
--newline-is-lf or --newline-is-crlf to the "configure" command,
respectively.
. When called via the POSIX interface, PCRE uses malloc() to get additional
storage for processing capturing parentheses if there are more than 10 of
@ -142,6 +144,16 @@ library. You can read more about them in the pcrebuild man page.
pcre_exec() can supply their own value. There is discussion on the pcreapi
man page.
. There is a separate counter that limits the depth of recursive function calls
during a matching process. This also has a default of ten million, which is
essentially "unlimited". You can change the default by setting, for example,
--with-match-limit-recursion=500000
Recursive function calls use up the runtime stack; running out of stack can
cause programs to crash in strange ways. There is a discussion about stack
sizes in the pcrestack man page.
. The default maximum compiled pattern size is around 64K. You can increase
this by adding --with-link-size=3 to the "configure" command. You can
increase it even more by setting --with-link-size=4, but this is unlikely
@ -165,7 +177,6 @@ library. You can read more about them in the pcrebuild man page.
The "configure" script builds eight files for the basic C library:
. pcre.h is the header file for C programs that call PCRE
. Makefile is the makefile that builds the library
. config.h contains build-time configuration options for the library
. pcre-config is a script that shows the settings of "configure" options
@ -432,25 +443,24 @@ The distribution should contain the following files:
pcre_info.c )
pcre_maketables.c )
pcre_ord2utf8.c )
pcre_printint.c )
pcre_refcount.c )
pcre_study.c )
pcre_tables.c )
pcre_try_flipped.c )
pcre_ucp_findchar.c )
pcre_ucp_searchfuncs.c)
pcre_valid_utf8.c )
pcre_version.c )
pcre_xclass.c )
ucp_findchar.c )
ucp.h ) source for the code that is used for
ucpinternal.h ) Unicode property handling
ucptable.c )
ucptypetable.c )
pcre.in "source" for the header for the external API; pcre.h
is built from this by "configure"
pcre_printint.src ) debugging function that is #included in pcretest, and
) can also be #included in pcre_compile()
pcre.h the public PCRE header file
pcreposix.h header for the external POSIX wrapper API
pcre_internal.h header for internal use
ucp.h ) headers concerned with
ucpinternal.h ) Unicode property handling
config.in template for config.h, which is built by configure
pcrecpp.h the header file for the C++ wrapper
@ -477,8 +487,9 @@ The distribution should contain the following files:
RunGrepTest.in template for a Unix shell script for pcregrep tests
config.guess ) files used by libtool,
config.sub ) used only when building a shared library
config.h.in "source" for the config.h header file
configure a configuring shell script (built by autoconf)
configure.in the autoconf input used to build configure
configure.ac the autoconf input used to build configure
doc/Tech.Notes notes on the encoding
doc/*.3 man page sources for the PCRE functions
doc/*.1 man page sources for pcregrep and pcretest
@ -506,7 +517,6 @@ The distribution should contain the following files:
libpcre.def
libpcreposix.def
pcre.def
(D) Auxiliary file for VPASCAL
@ -515,4 +525,4 @@ The distribution should contain the following files:
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
January 2006
June 2006

View File

@ -1,6 +1,9 @@
Technical Notes about PCRE
--------------------------
These are very rough technical notes that record potentially useful information
about PCRE internals.
Historical note 1
-----------------
@ -21,13 +24,14 @@ the pattern, as is expected in Unix and Perl-style regular expressions.
Historical note 2
-----------------
By contrast, the code originally written by Henry Spencer and subsequently
heavily modified for Perl actually compiles the expression twice: once in a
dummy mode in order to find out how much store will be needed, and then for
real. The execution function operates by backtracking and maximizing (or,
optionally, minimizing in Perl) the amount of the subject that matches
individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's
terminology.
By contrast, the code originally written by Henry Spencer (which was
subsequently heavily modified for Perl) compiles the expression twice: once in
a dummy mode in order to find out how much store will be needed, and then for
real. (The Perl version probably doesn't do this any more; I'm talking about
the original library.) The execution function operates by backtracking and
maximizing (or, optionally, minimizing in Perl) the amount of the subject that
matches individual wild portions of the pattern. This is an "NFA algorithm" in
Friedl's terminology.
OK, here's the real stuff
-------------------------
@ -43,7 +47,7 @@ then a second pass to do the real compile - which may use a bit less than the
predicted amount of store. The idea is that this is going to turn out faster
because the first pass is degenerate and the second pass can just store stuff
straight into the vector, which it knows is big enough. It does make the
compiling functions bigger, of course, but they have got quite big anyway to
compiling functions bigger, of course, but they have become quite big anyway to
handle all the Perl stuff.
Traditional matching function
@ -63,7 +67,7 @@ pcre_dfa_exec(). This implements a DFA matching algorithm that searches
simultaneously for all possible matches that start at one point in the subject
string. (Going back to my roots: see Historical Note 1 above.) This function
interprets the same compiled pattern data as pcre_exec(); however, not all the
facilities are available, and those that are don't always work in quite the
facilities are available, and those that are do not always work in quite the
same way. See the user documentation for details.
Format of compiled patterns
@ -157,10 +161,12 @@ Match by Unicode property
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
character by testing its Unicode property (the \p and \P escape sequences).
Each is followed by a single byte that encodes the desired property value.
Each is followed by two bytes that encode the desired property as a type and a
value.
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by two
bytes: OP_PROP or OP_NOTPROP and then the desired property value.
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
three bytes: OP_PROP or OP_NOTPROP and then the desired property type and
value.
Matching literal characters
@ -339,4 +345,4 @@ at compile time, and so does not cause anything to be put into the compiled
data.
Philip Hazel
January 2006
June 2006

File diff suppressed because it is too large Load Diff

View File

@ -55,9 +55,9 @@ cannot run ./configure. As it now stands, this file need not be edited in that
circumstance. */
#define PCRE_MAJOR 6
#define PCRE_MINOR 6
#define PCRE_MINOR 7
#define PCRE_PRERELEASE
#define PCRE_DATE 06-Feb-2006
#define PCRE_DATE 04-Jul-2006
/* Win32 uses DLL by default; it needs special stuff for exported functions
when building PCRE. */
@ -116,6 +116,10 @@ extern "C" {
#define PCRE_DFA_SHORTEST 0x00010000
#define PCRE_DFA_RESTART 0x00020000
#define PCRE_FIRSTLINE 0x00040000
#define PCRE_DUPNAMES 0x00080000
#define PCRE_NEWLINE_CR 0x00100000
#define PCRE_NEWLINE_LF 0x00200000
#define PCRE_NEWLINE_CRLF 0x00300000
/* Exec-time and get/set-time error codes */
@ -269,6 +273,8 @@ PCRE_DATA_SCOPE int pcre_fullinfo(const pcre *, const pcre_extra *, int,
PCRE_DATA_SCOPE int pcre_get_named_substring(const pcre *, const char *,
int *, int, const char *, const char **);
PCRE_DATA_SCOPE int pcre_get_stringnumber(const pcre *, const char *);
PCRE_DATA_SCOPE int pcre_get_stringtable_entries(const pcre *, const char *,
char **, char **);
PCRE_DATA_SCOPE int pcre_get_substring(const char *, int *, int, int,
const char **);
PCRE_DATA_SCOPE int pcre_get_substring_list(const char *, int *, int,

View File

@ -42,6 +42,7 @@ POSSIBILITY OF SUCH DAMAGE.
supporting internal functions that are not used by other modules. */
#define NLBLOCK cd /* The block containing newline information */
#include "pcre_internal.h"
@ -190,7 +191,7 @@ static const char *error_texts[] = {
"unrecognized character after (?<",
/* 25 */
"lookbehind assertion is not fixed length",
"malformed number after (?(",
"malformed number or name after (?(",
"conditional group contains more than two branches",
"assertion expected after (?(",
"(?R or (?digits must be followed by )",
@ -210,12 +211,17 @@ static const char *error_texts[] = {
"recursive call could loop indefinitely",
"unrecognized character after (?P",
"syntax error after (?P",
"two named groups have the same name",
"two named subpatterns have the same name",
"invalid UTF-8 string",
/* 45 */
"support for \\P, \\p, and \\X has not been compiled",
"malformed \\P or \\p sequence",
"unknown property name after \\P or \\p"
"unknown property name after \\P or \\p",
"subpattern name is too long (maximum 32 characters)",
"too many named subpatterns (maximum 10,000)",
/* 50 */
"repeated subpattern is too long",
"octal value is greater than \\377 (not in UTF-8 mode)"
};
@ -460,13 +466,16 @@ else
}
/* \0 always starts an octal number, but we may drop through to here with a
larger first octal digit. */
larger first octal digit. The original code used just to take the least
significant 8 bits of octal numbers (I think this is what early Perls used
to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
than 3 octal digits. */
case '0':
c -= '0';
while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
c = c * 8 + *(++ptr) - '0';
c &= 255; /* Take least significant 8 bits */
if (!utf8 && c > 255) *errorcodeptr = ERR51;
break;
/* \x is complicated. \x{ddd} is a character number which can be greater
@ -762,6 +771,48 @@ return p;
/*************************************************
* Find forward referenced named subpattern *
*************************************************/
/* This function scans along a pattern looking for capturing subpatterns, and
counting them. If it finds a named pattern that matches the name it is given,
it returns its number. This is used for forward references to named
subpatterns. The caller expects that if (?P< is encountered, the name will be
terminated by '>' because that is checked in the first pass; nevertheless the
scan for '>' also stops at the terminating zero so that a malformed pattern can
never cause a read beyond the end of the string.

The scan is purely textual: a backslash hides the character that follows it,
"(?(" (a condition) and other "(?" groups that are not "(?P<" are not counted,
and no special treatment is given to '[' (character classes).

Arguments:
  ptr          current position in the pattern (zero-terminated)
  count        current count of capturing parens
  name         name to seek (need not be zero-terminated)
  namelen      name length

Returns:       the number of the named subpattern, or -1 if not found
*/
static int
find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)
{
const uschar *thisname;

for (; *ptr != 0; ptr++)
  {
  /* An escaped character can never open a group; step over it. */

  if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }
  if (*ptr != '(') continue;

  /* A plain "(" is an unnamed capturing group. */

  if (ptr[1] != '?') { count++; continue; }

  /* "(?(" starts a condition: neither parenthesis captures, so skip both. */

  if (ptr[2] == '(') { ptr += 2; continue; }

  /* Any other "(?" item that is not "(?P<" does not capture. */

  if (ptr[2] != 'P' || ptr[3] != '<') continue;

  /* A named capturing group: count it and isolate its name. */

  count++;
  ptr += 4;
  thisname = ptr;
  while (*ptr != 0 && *ptr != '>') ptr++;

  /* Unterminated name: stop here rather than let the loop increment step past
  the end of the pattern. */

  if (*ptr == 0) return -1;

  /* strncmp() takes char pointers; uschar is unsigned, so cast explicitly. */

  if (namelen == ptr - thisname &&
      strncmp((const char *)name, (const char *)thisname, (size_t)namelen) == 0)
    return count;
  }

return -1;
}
/*************************************************
* Find first significant op code *
*************************************************/
@ -917,6 +968,7 @@ for (;;)
case OP_CHAR:
case OP_CHARNC:
case OP_NOT:
branchlength++;
cc += 2;
#ifdef SUPPORT_UTF8
@ -1031,14 +1083,19 @@ Returns: pointer to the opcode for the bracket, or NULL if not found
static const uschar *
find_bracket(const uschar *code, BOOL utf8, int number)
{
#ifndef SUPPORT_UTF8
utf8 = utf8; /* Stop pedantic compilers complaining */
#endif
for (;;)
{
register int c = *code;
if (c == OP_END) return NULL;
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
the table is zero; the actual length is stored in the compiled code. */
if (c == OP_XCLASS) code += GET(code, 1);
/* Handle bracketed group */
else if (c > OP_BRA)
{
int n = c - OP_BRA;
@ -1046,17 +1103,16 @@ for (;;)
if (n == number) return (uschar *)code;
code += _pcre_OP_lengths[OP_BRA];
}
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
that are followed by a character may be followed by a multi-byte character.
The length in the table is a minimum, so we have to scan along to skip the
extra bytes. All opcodes are less than 128, so we can use relatively
efficient code. */
else
{
code += _pcre_OP_lengths[c];
#ifdef SUPPORT_UTF8
/* In UTF-8 mode, opcodes that are followed by a character may be followed
by a multi-byte character. The length in the table is a minimum, so we have
to scan along to skip the extra bytes. All opcodes are less than 128, so we
can use relatively efficient code. */
if (utf8) switch(c)
{
case OP_CHAR:
@ -1072,16 +1128,7 @@ for (;;)
case OP_MINQUERY:
while ((*code & 0xc0) == 0x80) code++;
break;
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
the table is zero; the actual length is stored in the compiled code. */
case OP_XCLASS:
code += GET(code, 1) + 1;
break;
}
#endif
}
}
}
@ -1105,30 +1152,34 @@ Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
static const uschar *
find_recurse(const uschar *code, BOOL utf8)
{
#ifndef SUPPORT_UTF8
utf8 = utf8; /* Stop pedantic compilers complaining */
#endif
for (;;)
{
register int c = *code;
if (c == OP_END) return NULL;
else if (c == OP_RECURSE) return code;
if (c == OP_RECURSE) return code;
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
the table is zero; the actual length is stored in the compiled code. */
if (c == OP_XCLASS) code += GET(code, 1);
/* All bracketed groups have the same length. */
else if (c > OP_BRA)
{
code += _pcre_OP_lengths[OP_BRA];
}
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
that are followed by a character may be followed by a multi-byte character.
The length in the table is a minimum, so we have to scan along to skip the
extra bytes. All opcodes are less than 128, so we can use relatively
efficient code. */
else
{
code += _pcre_OP_lengths[c];
#ifdef SUPPORT_UTF8
/* In UTF-8 mode, opcodes that are followed by a character may be followed
by a multi-byte character. The length in the table is a minimum, so we have
to scan along to skip the extra bytes. All opcodes are less than 128, so we
can use relatively efficient code. */
if (utf8) switch(c)
{
case OP_CHAR:
@ -1144,16 +1195,7 @@ for (;;)
case OP_MINQUERY:
while ((*code & 0xc0) == 0x80) code++;
break;
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
the table is zero; the actual length is stored in the compiled code. */
case OP_XCLASS:
code += GET(code, 1) + 1;
break;
}
#endif
}
}
}
@ -1569,7 +1611,6 @@ int greedy_default, greedy_non_default;
int firstbyte, reqbyte;
int zeroreqbyte, zerofirstbyte;
int req_caseopt, reqvary, tempreqvary;
int condcount = 0;
int options = *optionsptr;
int after_manual_callout = 0;
register int c;
@ -1683,10 +1724,14 @@ for (;; ptr++)
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
/* The space before the ; is to avoid a warning on a silly compiler
on the Macintosh. */
while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
if (c != 0) continue; /* Else fall through to handle end of string */
while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
if (*ptr != 0)
{
ptr += cd->nllen - 1;
continue;
}
/* Else fall through to handle end of string */
c = 0;
}
}
@ -2851,37 +2896,91 @@ for (;; ptr++)
case '(':
bravalue = OP_COND; /* Conditional group */
/* Condition to test for recursion */
/* A condition can be a number, referring to a numbered group, a name,
referring to a named group, 'R', referring to recursion, or an
assertion. There are two unfortunate ambiguities, caused by history.
(a) 'R' can be the recursive thing or the name 'R', and (b) a number
could be a name that consists of digits. In both cases, we look for a
name first; if not found, we try the other cases. If the first
character after (?( is a word character, we know the rest up to ) will
also be word characters because the syntax was checked in the first
pass. */
if (ptr[1] == 'R')
if ((cd->ctypes[ptr[1]] & ctype_word) != 0)
{
code[1+LINK_SIZE] = OP_CREF;
PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
int i, namelen;
int condref = 0;
const uschar *name;
uschar *slot = cd->name_table;
/* This is needed for all successful cases. */
skipbytes = 3;
ptr += 3;
}
/* Condition to test for a numbered subpattern match. We know that
if a digit follows ( then there will just be digits until ) because
the syntax was checked in the first pass. */
/* Read the name, but also get it as a number if it's all digits */
else if ((digitab[ptr[1]] && ctype_digit) != 0)
{
int condref; /* Don't amalgamate; some compilers */
condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
if (condref == 0)
name = ++ptr;
while (*ptr != ')')
{
*errorcodeptr = ERR35;
if (condref >= 0)
condref = ((digitab[*ptr] & ctype_digit) != 0)?
condref * 10 + *ptr - '0' : -1;
ptr++;
}
namelen = ptr - name;
ptr++;
for (i = 0; i < cd->names_found; i++)
{
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
slot += cd->name_entry_size;
}
/* Found a previous named subpattern */
if (i < cd->names_found)
{
condref = GET2(slot, 0);
code[1+LINK_SIZE] = OP_CREF;
PUT2(code, 2+LINK_SIZE, condref);
}
/* Search the pattern for a forward reference */
else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)
{
code[1+LINK_SIZE] = OP_CREF;
PUT2(code, 2+LINK_SIZE, i);
}
/* Check for 'R' for recursion */
else if (namelen == 1 && *name == 'R')
{
code[1+LINK_SIZE] = OP_CREF;
PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
}
/* Check for a subpattern number */
else if (condref > 0)
{
code[1+LINK_SIZE] = OP_CREF;
PUT2(code, 2+LINK_SIZE, condref);
}
/* Either an unidentified subpattern, or a reference to (?(0) */
else
{
*errorcodeptr = (condref == 0)? ERR35: ERR15;
goto FAILED;
}
ptr++;
code[1+LINK_SIZE] = OP_CREF;
PUT2(code, 2+LINK_SIZE, condref);
skipbytes = 3;
}
/* For conditions that are assertions, we just fall through, having
set bravalue above. */
break;
case '=': /* Positive lookahead */
@ -2953,10 +3052,13 @@ for (;; ptr++)
{
if (slot[2+namelen] == 0)
{
*errorcodeptr = ERR43;
goto FAILED;
if ((options & PCRE_DUPNAMES) == 0)
{
*errorcodeptr = ERR43;
goto FAILED;
}
}
crc = -1; /* Current name is substring */
else crc = -1; /* Current name is substring */
}
if (crc < 0)
{
@ -2989,14 +3091,18 @@ for (;; ptr++)
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
slot += cd->name_entry_size;
}
if (i >= cd->names_found)
if (i < cd->names_found) /* Back reference */
{
recno = GET2(slot, 0);
}
else if ((recno = /* Forward back reference */
find_named_parens(ptr, *brackets, name, namelen)) <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
}
recno = GET2(slot, 0);
if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
/* Back reference */
@ -3036,9 +3142,8 @@ for (;; ptr++)
regex in case it doesn't exist. */
*code = OP_END;
called = (recno == 0)?
cd->start_code : find_bracket(cd->start_code, utf8, recno);
called = (recno == 0)? cd->start_code :
find_bracket(cd->start_code, utf8, recno);
if (called == NULL)
{
*errorcodeptr = ERR15;
@ -3085,6 +3190,7 @@ for (;; ptr++)
case '-': optset = &unset; break;
case 'i': *optset |= PCRE_CASELESS; break;
case 'J': *optset |= PCRE_DUPNAMES; break;
case 'm': *optset |= PCRE_MULTILINE; break;
case 's': *optset |= PCRE_DOTALL; break;
case 'x': *optset |= PCRE_EXTENDED; break;
@ -3201,7 +3307,7 @@ for (;; ptr++)
else if (bravalue == OP_COND)
{
uschar *tc = code;
condcount = 0;
int condcount = 0;
do {
condcount++;
@ -3906,13 +4012,14 @@ return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
PCRE_DATA_SCOPE pcre *
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
const char **errorptr, int *erroroffset, const unsigned char *tables)
{
real_pcre *re;
int length = 1 + LINK_SIZE; /* For initial BRA plus length */
int c, firstbyte, reqbyte;
int c, firstbyte, reqbyte, newline;
int bracount = 0;
int branch_extra = 0;
int branch_newextra;
@ -3933,6 +4040,7 @@ uschar *code;
const uschar *codestart;
const uschar *ptr;
compile_data compile_block;
compile_data *cd = &compile_block;
int brastack[BRASTACK_SIZE];
uschar bralenstack[BRASTACK_SIZE];
@ -3986,18 +4094,42 @@ if ((options & ~PUBLIC_OPTIONS) != 0)
/* Set up pointers to the individual character tables */
if (tables == NULL) tables = _pcre_default_tables;
compile_block.lcc = tables + lcc_offset;
compile_block.fcc = tables + fcc_offset;
compile_block.cbits = tables + cbits_offset;
compile_block.ctypes = tables + ctypes_offset;
cd->lcc = tables + lcc_offset;
cd->fcc = tables + fcc_offset;
cd->cbits = tables + cbits_offset;
cd->ctypes = tables + ctypes_offset;
/* Handle different types of newline. The two bits give four cases. The current
code allows for one- or two-byte sequences. */
switch (options & PCRE_NEWLINE_CRLF)
{
default: newline = NEWLINE; break; /* Compile-time default */
case PCRE_NEWLINE_CR: newline = '\r'; break;
case PCRE_NEWLINE_LF: newline = '\n'; break;
case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
}
if (newline > 255)
{
cd->nllen = 2;
cd->nl[0] = (newline >> 8) & 255;
cd->nl[1] = newline & 255;
}
else
{
cd->nllen = 1;
cd->nl[0] = newline;
}
/* Maximum back reference and backref bitmap. This is updated for numeric
references during the first pass, but for named references during the actual
compile pass. The bitmap records up to 31 back references to help in deciding
whether (.*) can be treated as anchored or not. */
compile_block.top_backref = 0;
compile_block.backref_map = 0;
cd->top_backref = 0;
cd->backref_map = 0;
/* Reflect pattern for debugging output */
@ -4031,14 +4163,16 @@ while ((c = *(++ptr)) != 0)
if ((options & PCRE_EXTENDED) != 0)
{
if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
/* The space before the ; is to avoid a warning on a silly compiler
on the Macintosh. */
while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
if (c == 0) break;
continue;
while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
if (*ptr != 0)
{
ptr += cd->nllen - 1;
continue;
}
break; /* End loop at end of pattern */
}
}
@ -4128,9 +4262,9 @@ while ((c = *(++ptr)) != 0)
if (c <= -ESC_REF)
{
int refnum = -c - ESC_REF;
compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
if (refnum > compile_block.top_backref)
compile_block.top_backref = refnum;
cd->backref_map |= (refnum < 32)? (1 << refnum) : 1;
if (refnum > cd->top_backref)
cd->top_backref = refnum;
length += 2; /* For single back reference */
if (ptr[1] == '{' && is_counted_repeat(ptr+2))
{
@ -4284,7 +4418,9 @@ while ((c = *(++ptr)) != 0)
/* Check the syntax for POSIX stuff. The bits we actually handle are
checked during the real compile phase. */
else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
else if (*ptr == '[' &&
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
check_posix_syntax(ptr, &ptr, cd))
{
ptr++;
class_optcount = 10; /* Make sure > 1 */
@ -4517,6 +4653,61 @@ while ((c = *(++ptr)) != 0)
ptr += 2;
break;
/* Named subpatterns are an extension copied from Python */
case 'P':
ptr += 3;
/* Handle the definition of a named subpattern */
if (*ptr == '<')
{
const uschar *p; /* Don't amalgamate; some compilers */
p = ++ptr; /* grumble at autoincrement in declaration */
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
if (*ptr != '>')
{
errorcode = ERR42;
goto PCRE_ERROR_RETURN;
}
name_count++;
if (name_count > MAX_NAME_COUNT)
{
errorcode = ERR49;
goto PCRE_ERROR_RETURN;
}
if (ptr - p > max_name_size)
{
max_name_size = (ptr - p);
if (max_name_size > MAX_NAME_SIZE)
{
errorcode = ERR48;
goto PCRE_ERROR_RETURN;
}
}
capturing = TRUE; /* Named parentheses are always capturing */
break; /* Go handle capturing parentheses */
}
/* Handle back references and recursive calls to named subpatterns */
if (*ptr == '=' || *ptr == '>')
{
length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */
while ((cd->ctypes[*(++ptr)] & ctype_word) != 0);
if (*ptr != ')')
{
errorcode = ERR42;
goto PCRE_ERROR_RETURN;
}
goto RECURSE_CHECK_QUANTIFIED;
}
/* Unknown character after (?P */
errorcode = ERR41;
goto PCRE_ERROR_RETURN;
/* (?R) specifies a recursive call to the regex, which is an extension
to provide the facility which can be obtained by (?p{perl-code}) in
Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
@ -4542,8 +4733,10 @@ while ((c = *(++ptr)) != 0)
/* If this item is quantified, it will get wrapped inside brackets so
as to use the code for quantified brackets. We jump down and use the
code that handles this for real brackets. */
code that handles this for real brackets. Come here from code for
named recursions/subroutines. */
RECURSE_CHECK_QUANTIFIED:
if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
{
length += 2 + 2 * LINK_SIZE; /* to make bracketed */
@ -4567,48 +4760,6 @@ while ((c = *(++ptr)) != 0)
length += 2 + 2*LINK_SIZE;
continue;
/* Named subpatterns are an extension copied from Python */
case 'P':
ptr += 3;
/* Handle the definition of a named subpattern */
if (*ptr == '<')
{
const uschar *p; /* Don't amalgamate; some compilers */
p = ++ptr; /* grumble at autoincrement in declaration */
while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
if (*ptr != '>')
{
errorcode = ERR42;
goto PCRE_ERROR_RETURN;
}
name_count++;
if (ptr - p > max_name_size) max_name_size = (ptr - p);
capturing = TRUE; /* Named parentheses are always capturing */
break;
}
/* Handle back references and recursive calls to named subpatterns */
if (*ptr == '=' || *ptr == '>')
{
length += 2 + 2*LINK_SIZE; /* Allow for the automatic "once" */
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
if (*ptr != ')')
{
errorcode = ERR42;
goto PCRE_ERROR_RETURN;
}
break;
}
/* Unknown character after (?P */
errorcode = ERR41;
goto PCRE_ERROR_RETURN;
/* Lookbehinds are in Perl from version 5.005 */
case '<':
@ -4624,19 +4775,17 @@ while ((c = *(++ptr)) != 0)
/* Conditionals are in Perl from version 5.005. The bracket must either
be followed by a number (for bracket reference) or by an assertion
group, or (a PCRE extension) by 'R' for a recursion test. */
group. PCRE extends this by allowing a name to reference a named group;
unfortunately, previously 'R' was implemented for a recursion test.
When this is compiled, we look for the named group 'R' first. At this
point we just do a basic syntax check. */
case '(':
if (ptr[3] == 'R' && ptr[4] == ')')
if ((cd->ctypes[ptr[3]] & ctype_word) != 0)
{
ptr += 4;
length += 3;
}
else if ((digitab[ptr[3]] & ctype_digit) != 0)
{
ptr += 4;
length += 3;
while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
if (*ptr != ')')
{
errorcode = ERR26;
@ -4675,6 +4824,11 @@ while ((c = *(++ptr)) != 0)
*optset |= PCRE_CASELESS;
continue;
case 'J':
*optset |= PCRE_DUPNAMES;
options |= PCRE_JCHANGED; /* Record that it changed */
continue;
case 'm':
*optset |= PCRE_MULTILINE;
continue;
@ -4740,16 +4894,13 @@ while ((c = *(++ptr)) != 0)
will lead to an over-estimate on the length, but this shouldn't
matter very much. We also have to allow for resetting options at
the start of any alternations, which we do by setting
branch_newextra to 2. Finally, we record whether the case-dependent
flag ever changes within the regex. This is used by the "required
character" code. */
branch_newextra to 2. */
case ':':
if (((set|unset) & PCRE_IMS) != 0)
{
length += 4;
branch_newextra = 2;
if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
}
goto END_OPTIONS;
@ -4829,6 +4980,12 @@ while ((c = *(++ptr)) != 0)
{
duplength = length - brastack[--brastackptr];
branch_extra = bralenstack[brastackptr];
/* This is a paranoid check to stop integer overflow later on */
if (duplength > MAX_DUPLENGTH)
{
errorcode = ERR50;
goto PCRE_ERROR_RETURN;
}
}
else duplength = 0;
@ -4933,7 +5090,8 @@ if (length > MAX_PATTERN_SIZE)
}
/* Compute the size of data block needed and get it, either from malloc or
externally provided function. */
externally provided function. Integer overflow should no longer be possible
because nowadays we limit the maximum values of name_count and max_name_size. */
size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
re = (real_pcre *)(pcre_malloc)(size);
@ -4963,14 +5121,14 @@ re->nullpad = NULL;
/* The starting points of the name/number translation table and of the code are
passed around in the compile data block. */
compile_block.names_found = 0;
compile_block.name_entry_size = max_name_size + 3;
compile_block.name_table = (uschar *)re + re->name_table_offset;
codestart = compile_block.name_table + re->name_entry_size * re->name_count;
compile_block.start_code = codestart;
compile_block.start_pattern = (const uschar *)pattern;
compile_block.req_varyopt = 0;
compile_block.nopartial = FALSE;
cd->names_found = 0;
cd->name_entry_size = max_name_size + 3;
cd->name_table = (uschar *)re + re->name_table_offset;
codestart = cd->name_table + re->name_entry_size * re->name_count;
cd->start_code = codestart;
cd->start_pattern = (const uschar *)pattern;
cd->req_varyopt = 0;
cd->nopartial = FALSE;
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, errorcode will be set non-zero, so we don't need to look at the result
@ -4981,11 +5139,11 @@ code = (uschar *)codestart;
*code = OP_BRA;
bracount = 0;
(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
&errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
&errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd);
re->top_bracket = bracount;
re->top_backref = compile_block.top_backref;
re->top_backref = cd->top_backref;
if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
/* If not reached end of pattern on success, there's an excess bracket. */
@ -5031,7 +5189,7 @@ start with ^. and also when all branches start with .* for non-DOTALL matches.
if ((options & PCRE_ANCHORED) == 0)
{
int temp_options = options;
if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
re->options |= PCRE_ANCHORED;
else
{
@ -5041,10 +5199,10 @@ if ((options & PCRE_ANCHORED) == 0)
{
int ch = firstbyte & 255;
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
compile_block.fcc[ch] == ch)? ch : firstbyte;
cd->fcc[ch] == ch)? ch : firstbyte;
re->options |= PCRE_FIRSTSET;
}
else if (is_startline(codestart, 0, compile_block.backref_map))
else if (is_startline(codestart, 0, cd->backref_map))
re->options |= PCRE_STARTLINE;
}
}
@ -5058,7 +5216,7 @@ if (reqbyte >= 0 &&
{
int ch = reqbyte & 255;
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
re->options |= PCRE_REQCHSET;
}
@ -5072,11 +5230,10 @@ printf("Length = %d top_bracket = %d top_backref = %d\n",
if (re->options != 0)
{
printf("%s%s%s%s%s%s%s%s%s%s\n",
printf("%s%s%s%s%s%s%s%s%s\n",
((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
((re->options & PCRE_DOTALL) != 0)? "dotall " : "",

View File

@ -42,7 +42,7 @@ POSSIBILITY OF SUCH DAMAGE.
pattern matching using an NFA algorithm, trying to mimic Perl as closely as
possible. There are also some static supporting functions. */
#define NLBLOCK md /* The block containing newline information */
#include "pcre_internal.h"
@ -275,7 +275,7 @@ typedef struct heapframe {
long int Xims;
eptrblock *Xeptrb;
int Xflags;
int Xrdepth;
unsigned int Xrdepth;
/* Function local variables */
@ -374,16 +374,16 @@ Returns: MATCH_MATCH if matched ) these values are >= 0
static int
match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
int flags, int rdepth)
int flags, unsigned int rdepth)
{
/* These variables do not need to be preserved over recursion in this function,
so they can be ordinary variables in all cases. Mark them with "register"
because they are used a lot in loops. */
register int rrc; /* Returns from recursive calls */
register int i; /* Used for loops not involving calls to RMATCH() */
register int c; /* Character values not kept over RMATCH() calls */
register BOOL utf8; /* Local copy of UTF-8 flag for speed */
register int rrc; /* Returns from recursive calls */
register int i; /* Used for loops not involving calls to RMATCH() */
register unsigned int c; /* Character values not kept over RMATCH() calls */
register BOOL utf8; /* Local copy of UTF-8 flag for speed */
/* When recursion is not being used, all "local" variables that have to be
preserved over calls to RMATCH() are part of a "frame" which is obtained from
@ -527,6 +527,13 @@ prop_fail_result = 0;
prop_test_variable = NULL;
#endif
/* This label is used for tail recursion, which is used in a few cases even
when NO_RECURSE is not defined, in order to reduce the amount of stack that is
used. Thanks to Ian Taylor for noticing this possibility and sending the
original patch. */
TAIL_RECURSE:
/* OK, now we can get on with the real code of the function. Recursive calls
are specified by the macro RMATCH and RRETURN is used to return. When
NO_RECURSE is *not* defined, these just turn into a recursive call to match()
@ -542,7 +549,12 @@ if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
original_ims = ims; /* Save for resetting on ')' */
#ifdef SUPPORT_UTF8
utf8 = md->utf8; /* Local copy of the flag */
#else
utf8 = FALSE;
#endif
/* At the start of a bracketed group, add the current subject pointer to the
stack of such pointers, to be re-instated at the end of the group when we hit
@ -642,21 +654,38 @@ for (;;)
{
case OP_BRA: /* Non-capturing bracket: optimized */
DPRINTF(("start bracket 0\n"));
do
/* Loop for all the alternatives */
for (;;)
{
/* When we get to the final alternative within the brackets, we would
return the result of a recursive call to match() whatever happened. We
can reduce stack usage by turning this into a tail recursion. */
if (ecode[GET(ecode, 1)] != OP_ALT)
{
ecode += 1 + LINK_SIZE;
flags = match_isgroup;
DPRINTF(("bracket 0 tail recursion\n"));
goto TAIL_RECURSE;
}
/* For non-final alternatives, continue the loop for a NOMATCH result;
otherwise return. */
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
match_isgroup);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
}
while (*ecode == OP_ALT);
DPRINTF(("bracket 0 failed\n"));
RRETURN(MATCH_NOMATCH);
/* Control never reaches here. */
/* Conditional group: compilation checked that there are no more than
two branches. If the condition is false, skipping the first branch takes us
past the end if there is only one branch, but that's OK because that is
exactly what going to the ket would do. */
exactly what going to the ket would do. As there is only one branch to be
obeyed, we can use tail recursion to avoid using another stack frame. */
case OP_COND:
if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
@ -665,10 +694,9 @@ for (;;)
condition = (offset == CREF_RECURSE * 2)?
(md->recursive != NULL) :
(offset < offset_top && md->offset_vector[offset] >= 0);
RMATCH(rrc, eptr, ecode + (condition?
(LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
offset_top, md, ims, eptrb, match_isgroup);
RRETURN(rrc);
ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1));
flags = match_isgroup;
goto TAIL_RECURSE;
}
/* The condition is an assertion. Call match() to evaluate it - setting
@ -688,9 +716,13 @@ for (;;)
RRETURN(rrc); /* Need braces because of following else */
}
else ecode += GET(ecode, 1);
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
match_isgroup);
RRETURN(rrc);
/* We are now at the branch that is to be obeyed. As there is only one,
we can use tail recursion to avoid using another stack frame. */
ecode += 1 + LINK_SIZE;
flags = match_isgroup;
goto TAIL_RECURSE;
}
/* Control never reaches here */
@ -945,71 +977,72 @@ for (;;)
the end of a normal bracket, leaving the subject pointer. */
case OP_ONCE:
prev = ecode;
saved_eptr = eptr;
do
{
prev = ecode;
saved_eptr = eptr;
do
{
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
eptrb, match_isgroup);
if (rrc == MATCH_MATCH) break;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
/* If hit the end of the group (which could be repeated), fail */
if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
/* Continue as from after the assertion, updating the offsets high water
mark, since extracts may have been taken. */
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
offset_top = md->end_offset_top;
eptr = md->end_match_ptr;
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*ecode == OP_KET || eptr == saved_eptr)
{
ecode += 1+LINK_SIZE;
break;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. We need to reset any options
that changed within the bracket before re-running it, so check the next
opcode. */
if (ecode[1+LINK_SIZE] == OP_OPT)
{
ims = (ims & ~PCRE_IMS) | ecode[4];
DPRINTF(("ims set to %02lx at group repeat\n", ims));
}
if (*ecode == OP_KETRMIN)
{
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
else /* OP_KETRMAX */
{
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
eptrb, match_isgroup);
if (rrc == MATCH_MATCH) break;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode,1);
}
RRETURN(MATCH_NOMATCH);
while (*ecode == OP_ALT);
/* If we hit the end of the group (which could be repeated), fail */
if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
/* Continue as from after the assertion, updating the offsets high water
mark, since extracts may have been taken. */
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
offset_top = md->end_offset_top;
eptr = md->end_match_ptr;
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*ecode == OP_KET || eptr == saved_eptr)
{
ecode += 1+LINK_SIZE;
break;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. The second "call" of match()
uses tail recursion, to avoid using another stack frame. We need to reset
any options that changed within the bracket before re-running it, so
check the next opcode. */
if (ecode[1+LINK_SIZE] == OP_OPT)
{
ims = (ims & ~PCRE_IMS) | ecode[4];
DPRINTF(("ims set to %02lx at group repeat\n", ims));
}
if (*ecode == OP_KETRMIN)
{
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode = prev;
flags = match_isgroup;
goto TAIL_RECURSE;
}
else /* OP_KETRMAX */
{
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += 1 + LINK_SIZE;
flags = 0;
goto TAIL_RECURSE;
}
/* Control never gets here */
/* An alternation is the end of a branch; scan along to find the end of the
bracketed group and go to there. */
@ -1053,114 +1086,114 @@ for (;;)
case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
prev = ecode - GET(ecode, 1);
saved_eptr = eptrb->epb_saved_eptr;
/* Back up the stack of bracket start pointers. */
eptrb = eptrb->epb_prev;
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
*prev == OP_ONCE)
{
prev = ecode - GET(ecode, 1);
saved_eptr = eptrb->epb_saved_eptr;
md->end_match_ptr = eptr; /* For ONCE */
md->end_offset_top = offset_top;
RRETURN(MATCH_MATCH);
}
/* Back up the stack of bracket start pointers. */
/* In all other cases except a conditional group we have to check the
group number back at the start and if necessary complete handling an
extraction by setting the offsets and bumping the high water mark. */
eptrb = eptrb->epb_prev;
if (*prev != OP_COND)
{
number = *prev - OP_BRA;
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
*prev == OP_ONCE)
{
md->end_match_ptr = eptr; /* For ONCE */
md->end_offset_top = offset_top;
RRETURN(MATCH_MATCH);
}
/* For extended extraction brackets (large number), we have to fish out
the number from a dummy opcode at the start. */
/* In all other cases except a conditional group we have to check the
group number back at the start and if necessary complete handling an
extraction by setting the offsets and bumping the high water mark. */
if (*prev != OP_COND)
{
number = *prev - OP_BRA;
/* For extended extraction brackets (large number), we have to fish out
the number from a dummy opcode at the start. */
if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
offset = number << 1;
if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
offset = number << 1;
#ifdef DEBUG
printf("end bracket %d", number);
printf("\n");
printf("end bracket %d", number);
printf("\n");
#endif
/* Test for a numbered group. This includes groups called as a result
of recursion. Note that whole-pattern recursion is coded as a recurse
into group 0, so it won't be picked up here. Instead, we catch it when
the OP_END is reached. */
/* Test for a numbered group. This includes groups called as a result
of recursion. Note that whole-pattern recursion is coded as a recurse
into group 0, so it won't be picked up here. Instead, we catch it when
the OP_END is reached. */
if (number > 0)
if (number > 0)
{
md->capture_last = number;
if (offset >= md->offset_max) md->offset_overflow = TRUE; else
{
md->capture_last = number;
if (offset >= md->offset_max) md->offset_overflow = TRUE; else
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
md->offset_vector[offset+1] = eptr - md->start_subject;
if (offset_top <= offset) offset_top = offset + 2;
}
/* Handle a recursively called group. Restore the offsets
appropriately and continue from after the call. */
if (md->recursive != NULL && md->recursive->group_num == number)
{
recursion_info *rec = md->recursive;
DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
md->recursive = rec->prevrec;
md->start_match = rec->save_start;
memcpy(md->offset_vector, rec->offset_save,
rec->saved_max * sizeof(int));
ecode = rec->after_call;
ims = original_ims;
break;
}
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
md->offset_vector[offset+1] = eptr - md->start_subject;
if (offset_top <= offset) offset_top = offset + 2;
}
}
/* Reset the value of the ims flags, in case they got changed during
the group. */
/* Handle a recursively called group. Restore the offsets
appropriately and continue from after the call. */
ims = original_ims;
DPRINTF(("ims reset to %02lx\n", ims));
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*ecode == OP_KET || eptr == saved_eptr)
{
ecode += 1 + LINK_SIZE;
break;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. */
if (*ecode == OP_KETRMIN)
{
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
else /* OP_KETRMAX */
{
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (md->recursive != NULL && md->recursive->group_num == number)
{
recursion_info *rec = md->recursive;
DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
md->recursive = rec->prevrec;
md->start_match = rec->save_start;
memcpy(md->offset_vector, rec->offset_save,
rec->saved_max * sizeof(int));
ecode = rec->after_call;
ims = original_ims;
break;
}
}
}
RRETURN(MATCH_NOMATCH);
/* Reset the value of the ims flags, in case they got changed during
the group. */
ims = original_ims;
DPRINTF(("ims reset to %02lx\n", ims));
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*ecode == OP_KET || eptr == saved_eptr)
{
ecode += 1 + LINK_SIZE;
break;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. In the second case, we can use
tail recursion to avoid using another stack frame. */
if (*ecode == OP_KETRMIN)
{
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode = prev;
flags = match_isgroup;
goto TAIL_RECURSE;
}
else /* OP_KETRMAX */
{
RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += 1 + LINK_SIZE;
flags = 0;
goto TAIL_RECURSE;
}
/* Control never gets here */
/* Start of subject unless notbol, or after internal newline if multiline */
@ -1168,7 +1201,10 @@ for (;;)
if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr != md->start_subject && eptr[-1] != NEWLINE)
if (eptr != md->start_subject &&
(eptr == md->end_subject ||
eptr < md->start_subject + md->nllen ||
!IS_NEWLINE(eptr - md->nllen)))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
@ -1196,7 +1232,7 @@ for (;;)
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr < md->end_subject)
{ if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
{ if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
else
{ if (md->noteol) RRETURN(MATCH_NOMATCH); }
ecode++;
@ -1207,14 +1243,14 @@ for (;;)
if (md->noteol) RRETURN(MATCH_NOMATCH);
if (!md->endonly)
{
if (eptr < md->end_subject - 1 ||
(eptr == md->end_subject - 1 && *eptr != NEWLINE))
if (eptr != md->end_subject &&
(eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
}
}
/* ... else fall through */
/* ... else fall through for endonly */
/* End of subject assertion (\z) */
@ -1226,8 +1262,9 @@ for (;;)
/* End of subject or ending \n assertion (\Z) */
case OP_EODN:
if (eptr < md->end_subject - 1 ||
(eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
if (eptr != md->end_subject &&
(eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
@ -1280,13 +1317,14 @@ for (;;)
/* Match a single character type; inline for speed */
case OP_ANY:
if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
RRETURN(MATCH_NOMATCH);
if ((ims & PCRE_DOTALL) == 0)
{
if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
RRETURN(MATCH_NOMATCH);
}
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
#ifdef SUPPORT_UTF8
if (utf8)
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
#endif
ecode++;
break;
@ -2573,8 +2611,11 @@ for (;;)
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject ||
(*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
((ims & PCRE_DOTALL) == 0 &&
eptr <= md->end_subject - md->nllen &&
IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
break;
@ -2659,7 +2700,11 @@ for (;;)
if ((ims & PCRE_DOTALL) == 0)
{
for (i = 1; i <= min; i++)
if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
{
if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
RRETURN(MATCH_NOMATCH);
eptr++;
}
}
else eptr += min;
break;
@ -2829,13 +2874,15 @@ for (;;)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
if (fi >= max || eptr >= md->end_subject ||
(ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(ctype)
{
case OP_ANY:
if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
case OP_ANY: /* This is the DOTALL case */
break;
case OP_ANYBYTE:
@ -2884,12 +2931,15 @@ for (;;)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
if (fi >= max || eptr >= md->end_subject ||
((ims & PCRE_DOTALL) == 0 &&
eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
c = *eptr++;
switch(ctype)
{
case OP_ANY:
if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
case OP_ANY: /* This is the DOTALL case */
break;
case OP_ANYBYTE:
@ -3075,9 +3125,9 @@ for (;;)
{
case OP_ANY:
/* Special code is required for UTF8, but when the maximum is unlimited
we don't need it, so we repeat the non-UTF8 code. This is probably
worth it, because .* is quite a common idiom. */
/* Special code is required for UTF8, but when the maximum is
unlimited we don't need it, so we repeat the non-UTF8 code. This is
probably worth it, because .* is quite a common idiom. */
if (max < INT_MAX)
{
@ -3085,7 +3135,9 @@ for (;;)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
if (eptr >= md->end_subject ||
(eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
break;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
@ -3094,6 +3146,7 @@ for (;;)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
@ -3108,7 +3161,9 @@ for (;;)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
if (eptr >= md->end_subject ||
(eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
break;
eptr++;
}
break;
@ -3222,7 +3277,9 @@ for (;;)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || *eptr == NEWLINE) break;
if (eptr >= md->end_subject ||
(eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
break;
eptr++;
}
break;
@ -3419,7 +3476,8 @@ int rc, resetcount, ocount;
int first_byte = -1;
int req_byte = -1;
int req_byte2 = -1;
unsigned long int ims = 0;
int newline;
unsigned long int ims;
BOOL using_temporary_offsets = FALSE;
BOOL anchored;
BOOL startline;
@ -3427,6 +3485,7 @@ BOOL firstline;
BOOL first_byte_caseless = FALSE;
BOOL req_byte_caseless = FALSE;
match_data match_block;
match_data *md = &match_block;
const uschar *tables;
const uschar *start_bits = NULL;
USPTR start_match = (USPTR)subject + start_offset;
@ -3451,9 +3510,9 @@ if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
the default values. */
study = NULL;
match_block.match_limit = MATCH_LIMIT;
match_block.match_limit_recursion = MATCH_LIMIT_RECURSION;
match_block.callout_data = NULL;
md->match_limit = MATCH_LIMIT;
md->match_limit_recursion = MATCH_LIMIT_RECURSION;
md->callout_data = NULL;
/* The table pointer is always in native byte order. */
@ -3465,11 +3524,11 @@ if (extra_data != NULL)
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
match_block.match_limit = extra_data->match_limit;
md->match_limit = extra_data->match_limit;
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
match_block.match_limit_recursion = extra_data->match_limit_recursion;
md->match_limit_recursion = extra_data->match_limit_recursion;
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
match_block.callout_data = extra_data->callout_data;
md->callout_data = extra_data->callout_data;
if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
}
@ -3499,39 +3558,64 @@ firstline = (re->options & PCRE_FIRSTLINE) != 0;
/* The code starts after the real_pcre block and the capture name table. */
match_block.start_code = (const uschar *)external_re + re->name_table_offset +
md->start_code = (const uschar *)external_re + re->name_table_offset +
re->name_count * re->name_entry_size;
match_block.start_subject = (USPTR)subject;
match_block.start_offset = start_offset;
match_block.end_subject = match_block.start_subject + length;
end_subject = match_block.end_subject;
md->start_subject = (USPTR)subject;
md->start_offset = start_offset;
md->end_subject = md->start_subject + length;
end_subject = md->end_subject;
match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
match_block.utf8 = (re->options & PCRE_UTF8) != 0;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
md->utf8 = (re->options & PCRE_UTF8) != 0;
match_block.notbol = (options & PCRE_NOTBOL) != 0;
match_block.noteol = (options & PCRE_NOTEOL) != 0;
match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
match_block.partial = (options & PCRE_PARTIAL) != 0;
match_block.hitend = FALSE;
md->notbol = (options & PCRE_NOTBOL) != 0;
md->noteol = (options & PCRE_NOTEOL) != 0;
md->notempty = (options & PCRE_NOTEMPTY) != 0;
md->partial = (options & PCRE_PARTIAL) != 0;
md->hitend = FALSE;
match_block.recursive = NULL; /* No recursion at top level */
md->recursive = NULL; /* No recursion at top level */
match_block.lcc = tables + lcc_offset;
match_block.ctypes = tables + ctypes_offset;
md->lcc = tables + lcc_offset;
md->ctypes = tables + ctypes_offset;
/* Handle different types of newline. The two bits give four cases. If nothing
is set at run time, whatever was used at compile time applies. */
switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
PCRE_NEWLINE_CRLF)
{
default: newline = NEWLINE; break; /* Compile-time default */
case PCRE_NEWLINE_CR: newline = '\r'; break;
case PCRE_NEWLINE_LF: newline = '\n'; break;
case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
}
if (newline > 255)
{
md->nllen = 2;
md->nl[0] = (newline >> 8) & 255;
md->nl[1] = newline & 255;
}
else
{
md->nllen = 1;
md->nl[0] = newline;
}
/* Partial matching is supported only for a restricted set of regexes at the
moment. */
if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
return PCRE_ERROR_BADPARTIAL;
/* Check a UTF-8 string if required. Unfortunately there's no way of passing
back the character offset. */
#ifdef SUPPORT_UTF8
if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
return PCRE_ERROR_BADUTF8;
@ -3563,17 +3647,17 @@ ocount = offsetcount - (offsetcount % 3);
if (re->top_backref > 0 && re->top_backref >= ocount/3)
{
ocount = re->top_backref * 3 + 3;
match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
using_temporary_offsets = TRUE;
DPRINTF(("Got memory to hold back references\n"));
}
else match_block.offset_vector = offsets;
else md->offset_vector = offsets;
match_block.offset_end = ocount;
match_block.offset_max = (2*ocount)/3;
match_block.offset_overflow = FALSE;
match_block.capture_last = -1;
md->offset_end = ocount;
md->offset_max = (2*ocount)/3;
md->offset_overflow = FALSE;
md->capture_last = -1;
/* Compute the minimum number of offsets that we need to reset each time. Doing
this makes a huge difference to execution time when there aren't many brackets
@ -3586,9 +3670,9 @@ if (resetcount > offsetcount) resetcount = ocount;
never be used unless previously set, but they get saved and restored, and so we
initialize them to avoid reading uninitialized locations. */
if (match_block.offset_vector != NULL)
if (md->offset_vector != NULL)
{
register int *iptr = match_block.offset_vector + ocount;
register int *iptr = md->offset_vector + ocount;
register int *iend = iptr - resetcount/2 + 1;
while (--iptr >= iend) *iptr = -1;
}
@ -3605,7 +3689,7 @@ if (!anchored)
{
first_byte = re->first_byte & 255;
if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
first_byte = match_block.lcc[first_byte];
first_byte = md->lcc[first_byte];
}
else
if (!startline && study != NULL &&
@ -3632,9 +3716,9 @@ do
/* Reset the maximum number of extractions we might see. */
if (match_block.offset_vector != NULL)
if (md->offset_vector != NULL)
{
register int *iptr = match_block.offset_vector;
register int *iptr = md->offset_vector;
register int *iend = iptr + resetcount;
while (iptr < iend) *iptr++ = -1;
}
@ -3648,7 +3732,7 @@ do
if (firstline)
{
USPTR t = start_match;
while (t < save_end_subject && *t != '\n') t++;
while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
end_subject = t;
}
@ -3658,20 +3742,22 @@ do
{
if (first_byte_caseless)
while (start_match < end_subject &&
match_block.lcc[*start_match] != first_byte)
md->lcc[*start_match] != first_byte)
start_match++;
else
while (start_match < end_subject && *start_match != first_byte)
start_match++;
}
/* Or to just after \n for a multiline match if possible */
/* Or to just after a linebreak for a multiline match if possible */
else if (startline)
{
if (start_match > match_block.start_subject + start_offset)
if (start_match >= md->start_subject + md->nllen +
start_offset)
{
while (start_match < end_subject && start_match[-1] != NEWLINE)
while (start_match <= end_subject &&
!IS_NEWLINE(start_match - md->nllen))
start_match++;
}
}
@ -3693,7 +3779,7 @@ do
#ifdef DEBUG /* Sigh. Some compilers never learn. */
printf(">>>> Match against: ");
pchars(start_match, end_subject - start_match, TRUE, &match_block);
pchars(start_match, end_subject - start_match, TRUE, md);
printf("\n");
#endif
@ -3715,7 +3801,7 @@ do
if (req_byte >= 0 &&
end_subject - start_match < REQ_BYTE_MAX &&
!match_block.partial)
!md->partial)
{
register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
@ -3759,11 +3845,10 @@ do
those back references that we can. In this case there need not be overflow
if certain parts of the pattern were not used. */
match_block.start_match = start_match;
match_block.match_call_count = 0;
md->start_match = start_match;
md->match_call_count = 0;
rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
match_isgroup, 0);
rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0);
/* When the result is no match, if the subject's first character was a
newline and the PCRE_FIRSTLINE option is set, break (which will return
@ -3774,10 +3859,13 @@ do
if (rc == MATCH_NOMATCH)
{
if (firstline && *start_match == NEWLINE) break;
if (firstline &&
start_match <= md->end_subject - md->nllen &&
IS_NEWLINE(start_match))
break;
start_match++;
#ifdef SUPPORT_UTF8
if (match_block.utf8)
if (md->utf8)
while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
start_match++;
#endif
@ -3797,23 +3885,23 @@ do
{
if (offsetcount >= 4)
{
memcpy(offsets + 2, match_block.offset_vector + 2,
memcpy(offsets + 2, md->offset_vector + 2,
(offsetcount - 2) * sizeof(int));
DPRINTF(("Copied offsets from temporary memory\n"));
}
if (match_block.end_offset_top > offsetcount)
match_block.offset_overflow = TRUE;
if (md->end_offset_top > offsetcount)
md->offset_overflow = TRUE;
DPRINTF(("Freeing temporary memory\n"));
(pcre_free)(match_block.offset_vector);
(pcre_free)(md->offset_vector);
}
rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
rc = md->offset_overflow? 0 : md->end_offset_top/2;
if (offsetcount < 2) rc = 0; else
{
offsets[0] = start_match - match_block.start_subject;
offsets[1] = match_block.end_match_ptr - match_block.start_subject;
offsets[0] = start_match - md->start_subject;
offsets[1] = md->end_match_ptr - md->start_subject;
}
DPRINTF((">>>> returning %d\n", rc));
@ -3827,10 +3915,10 @@ while (!anchored && start_match <= end_subject);
if (using_temporary_offsets)
{
DPRINTF(("Freeing temporary memory\n"));
(pcre_free)(match_block.offset_vector);
(pcre_free)(md->offset_vector);
}
if (match_block.partial && match_block.hitend)
if (md->partial && md->hitend)
{
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
return PCRE_ERROR_PARTIAL;

View File

@ -50,8 +50,8 @@ for these functions came from Scott Wimer. */
* Find number for named string *
*************************************************/
/* This function is used by the two extraction functions below, as well
as being generally available.
/* This function is used by the get_first_set() function below, as well
as being generally available. It assumes that names are unique.
Arguments:
code the compiled regex
@ -93,6 +93,113 @@ return PCRE_ERROR_NOSUBSTRING;
/*************************************************
* Find (multiple) entries for named string *
*************************************************/
/* This is used by the get_first_set() function below, as well as being
generally available. It is used when duplicated names are permitted.
Arguments:
code the compiled regex
stringname the name whose entries required
firstptr where to put the pointer to the first entry
lastptr where to put the pointer to the last entry
Returns: the length of each entry, or a negative number
(PCRE_ERROR_NOSUBSTRING) if not found
*/
int
pcre_get_stringtable_entries(const pcre *code, const char *stringname,
  char **firstptr, char **lastptr)
{
int status;
int entrysize;
int lo, hi;
uschar *nametable;
uschar *table_end;

/* Fetch the number of entries, the size of one entry, and the table itself.
Any failure reported by pcre_fullinfo() is passed straight back, and an empty
table means the name cannot be present. */

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &hi);
if (status != 0) return status;
if (hi <= 0) return PCRE_ERROR_NOSUBSTRING;

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize);
if (status != 0) return status;

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable);
if (status != 0) return status;

table_end = nametable + entrysize * (hi - 1);

/* Binary chop over the half-open range [lo, hi). The name in each entry
starts two bytes in, after the group number. */

for (lo = 0; hi > lo; )
  {
  int mid = (hi + lo) / 2;
  uschar *hit = nametable + entrysize * mid;
  int cmp = strcmp(stringname, (char *)(hit + 2));

  if (cmp != 0)
    {
    if (cmp > 0) lo = mid + 1; else hi = mid;
    }
  else
    {
    /* Found one entry; widen in both directions over neighbouring entries
    that carry the same name. */

    uschar *lower = hit;
    uschar *upper = hit;

    for (; lower > nametable; lower -= entrysize)
      {
      if (strcmp(stringname, (char *)(lower - entrysize + 2)) != 0) break;
      }

    for (; upper < table_end; upper += entrysize)
      {
      if (strcmp(stringname, (char *)(upper + entrysize + 2)) != 0) break;
      }

    *firstptr = (char *)lower;
    *lastptr = (char *)upper;
    return entrysize;
    }
  }

return PCRE_ERROR_NOSUBSTRING;
}
/*************************************************
* Find first set of multiple named strings *
*************************************************/
/* This function allows for duplicate names in the table of named substrings.
It returns the number of the first one that was set in a pattern match.
Arguments:
code the compiled regex
stringname the name of the capturing substring
ovector the vector of matched substrings
Returns: the number of the first that is set,
or the number of the last one if none are set,
or a negative number on error
*/
static int
get_first_set(const pcre *code, const char *stringname, int *ovector)
{
const real_pcre *re = (const real_pcre *)code;
int entrysize;
char *first, *last;
uschar *entry;

/* When neither PCRE_DUPNAMES nor PCRE_JCHANGED is set in the pattern's
options, the plain single-name lookup is used. */

if ((re->options & (PCRE_DUPNAMES | PCRE_JCHANGED)) == 0)
  return pcre_get_stringnumber(code, stringname);

/* Otherwise get the range of table entries for this name. A non-positive
result (including a negative error code) is passed straight back. */

entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last);
if (entrysize <= 0) return entrysize;

/* Each entry starts with the group number as two bytes, most significant
first. Return the first group whose start offset in ovector is set. The
caller's ovector must be large enough to be indexed by every group number
listed for this name; no bound is checked here. */

for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize)
  {
  int n = (entry[0] << 8) + entry[1];
  if (ovector[n*2] >= 0) return n;
  }

/* None of the groups is set: fall back to the number held in the first table
entry. The bytes must be read as unsigned; going through the plain char
pointer would sign-extend any byte >= 0x80 where char is signed and yield a
wrong (possibly negative) group number.
NOTE(review): the header comment for this function says the *last* entry's
number is returned in this case, but the code uses the first entry. */

entry = (uschar *)first;
return (entry[0] << 8) + entry[1];
}
/*************************************************
* Copy captured string to given buffer *
*************************************************/
@ -142,7 +249,8 @@ return yield;
*************************************************/
/* This function copies a single captured substring into a given buffer,
identifying it by name.
identifying it by name. If the regex permits duplicate names, the first
substring that is set is chosen.
Arguments:
code the compiled regex
@ -168,7 +276,7 @@ int
pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector,
int stringcount, const char *stringname, char *buffer, int size)
{
int n = pcre_get_stringnumber(code, stringname);
int n = get_first_set(code, stringname, ovector);
if (n <= 0) return n;
return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size);
}
@ -299,7 +407,8 @@ return yield;
*************************************************/
/* This function copies a single captured substring, identified by name, into
new store.
new store. If the regex permits duplicate names, the first substring that is
set is chosen.
Arguments:
code the compiled regex
@ -324,9 +433,10 @@ int
pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
int stringcount, const char *stringname, const char **stringptr)
{
int n = pcre_get_stringnumber(code, stringname);
int n = get_first_set(code, stringname, ovector);
if (n <= 0) return n;
return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
}

View File

@ -118,6 +118,14 @@ Unix, where it is defined in sys/types, so use "uschar" instead. */
typedef unsigned char uschar;
/* PCRE is able to support 3 different kinds of newline (CR, LF, CRLF). The
following macro is used to package up testing for newlines. NLBLOCK is defined
in the various modules to indicate in which datablock the parameters exist. */
/* IS_NEWLINE(p) is true when the bytes at p equal the newline string held in
NLBLOCK->nl. The second byte, (p)[1], is examined only when the first byte
matches and NLBLOCK->nllen is not 1, so for a two-byte newline the caller must
ensure that two bytes are readable at p. The argument is evaluated more than
once, so it must not have side effects. */

#define IS_NEWLINE(p) \
((p)[0] == NLBLOCK->nl[0] && \
(NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]))
/* When PCRE is compiled as a C++ library, the subject pointer can be replaced
with a custom type. This makes it possible, for example, to allow pcre_exec()
to process subject strings that are discontinuous by using a smart pointer
@ -164,7 +172,7 @@ case in PCRE. */
#if HAVE_BCOPY
#define memmove(a, b, c) bcopy(b, a, c)
#else /* HAVE_BCOPY */
void *
static void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
size_t i;
@ -377,16 +385,17 @@ Standard C system should have one. */
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
/* Private options flags start at the most significant end of the four bytes,
but skip the top bit so we can use ints for convenience without getting tangled
with negative values. The public options defined in pcre.h start at the least
significant end. Make sure they don't overlap! */
/* Private options flags start at the most significant end of the four bytes.
The public options defined in pcre.h start at the least significant end. Make
sure they don't overlap! The bits are getting a bit scarce now -- when we run
out, there is a dummy word in the structure that could be used for the private
bits. */
#define PCRE_NOPARTIAL 0x80000000 /* can't use partial with this regex */
#define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
#define PCRE_REQCHSET 0x20000000 /* req_byte is set */
#define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
#define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
#define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */
#define PCRE_JCHANGED 0x08000000 /* j option changes within regex */
/* Options for the "extra" block produced by pcre_study(). */
@ -398,15 +407,17 @@ time, run time, or study time, respectively. */
#define PUBLIC_OPTIONS \
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE)
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
PCRE_DUPNAMES|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
PCRE_PARTIAL)
PCRE_PARTIAL|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
#define PUBLIC_DFA_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART)
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_CR| \
PCRE_NEWLINE_LF)
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
@ -534,7 +545,7 @@ enum {
OP_DOLL, /* 20 End of line - varies with multiline switch */
OP_CHAR, /* 21 Match one character, casefully */
OP_CHARNC, /* 22 Match one character, caselessly */
OP_NOT, /* 23 Match anything but the following char */
OP_NOT, /* 23 Match one character, not the following one */
OP_STAR, /* 24 The maximizing and minimizing versions of */
OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
@ -714,7 +725,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 };
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51 };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
@ -778,6 +790,8 @@ typedef struct compile_data {
unsigned int backref_map; /* Bitmap of low back refs */
int req_varyopt; /* "After variable item" flag for reqbyte */
BOOL nopartial; /* Set TRUE if partial won't work */
int nllen; /* 1 or 2 for newline string length */
uschar nl[4]; /* Newline string */
} compile_data;
/* Structure for maintaining a chain of pointers to the currently incomplete
@ -802,11 +816,11 @@ typedef struct recursion_info {
/* When compiling in a mode that doesn't use recursive calls to match(),
a structure is used to remember local variables on the heap. It is defined in
pcre.c, close to the match() function, so that it is easy to keep it in step
with any changes of local variable. However, the pointer to the current frame
must be saved in some "static" place over a longjmp(). We declare the
structure here so that we can put a pointer in the match_data structure.
NOTE: This isn't used for a "normal" compilation of pcre. */
pcre_exec.c, close to the match() function, so that it is easy to keep it in
step with any changes of local variable. However, the pointer to the current
frame must be saved in some "static" place over a longjmp(). We declare the
structure here so that we can put a pointer in the match_data structure. NOTE:
This isn't used for a "normal" compilation of pcre. */
struct heapframe;
@ -820,6 +834,8 @@ typedef struct match_data {
int *offset_vector; /* Offset vector */
int offset_end; /* One past the end */
int offset_max; /* The maximum usable for return data */
int nllen; /* 1 or 2 for newline string length */
uschar nl[4]; /* Newline string */
const uschar *lcc; /* Points to lower casing table */
const uschar *ctypes; /* Points to table of type maps */
BOOL offset_overflow; /* Set if too many extractions */
@ -853,6 +869,8 @@ typedef struct dfa_match_data {
const uschar *tables; /* Character tables */
int moptions; /* Match options */
int poptions; /* Pattern options */
int nllen; /* 1 or 2 for newline string length */
uschar nl[4]; /* Newline string */
void *callout_data; /* To pass back to callouts */
} dfa_match_data;
@ -926,7 +944,7 @@ sense, but are not part of the PCRE public API. */
extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
extern int _pcre_ucp_findprop(const int, int *, int *);
extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
extern int _pcre_ucp_othercase(const int);
extern int _pcre_valid_utf8(const uschar *, int);
extern BOOL _pcre_xclass(int, const uschar *);

View File

@ -130,7 +130,9 @@ for (i = 0; i < 256; i++)
meta-character, which in this sense is any character that terminates a run
of data characters. */
if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; *p++ = x; }
if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
*p++ = x;
}
return yield;
}

View File

@ -111,9 +111,9 @@ for (i = _pcre_utt_size; i >= 0; i--)
}
return (i >= 0)? _pcre_utt[i].name : "??";
#else
ptype = ptype; /* Avoid compiler warning */
pvalue = pvalue;
return "??";
/* It gets harder and harder to shut off unwanted compiler warnings. */
ptype = ptype * pvalue;
return (ptype == pvalue)? "??" : "??";
#endif
}
@ -182,32 +182,26 @@ for(;;)
break;
case OP_CHAR:
fprintf(f, " ");
do
{
fprintf(f, " ");
do
{
code++;
code += 1 + print_char(f, code, utf8);
}
while (*code == OP_CHAR);
fprintf(f, "\n");
continue;
code++;
code += 1 + print_char(f, code, utf8);
}
break;
while (*code == OP_CHAR);
fprintf(f, "\n");
continue;
case OP_CHARNC:
fprintf(f, " NC ");
do
{
fprintf(f, " NC ");
do
{
code++;
code += 1 + print_char(f, code, utf8);
}
while (*code == OP_CHARNC);
fprintf(f, "\n");
continue;
code++;
code += 1 + print_char(f, code, utf8);
}
break;
while (*code == OP_CHARNC);
fprintf(f, "\n");
continue;
case OP_KETRMAX:
case OP_KETRMIN:

View File

@ -95,6 +95,13 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
{
register int c;
#if 0
/* ========================================================================= */
/* The following comment and code was inserted in January 1999. In May 2006,
when it was observed to cause compiler warnings about unused values, I took it
out again. If anybody is still using OS/2, they will have to put it back
manually. */
/* This next statement and the later reference to dummy are here in order to
trick the optimizer of the IBM C compiler for OS/2 into generating correct
code. Apparently IBM isn't going to fix the problem, and we would rather not
@ -102,6 +109,8 @@ disable optimization (in this module it actually makes a big difference, and
the pcre module can use all the optimization it can get). */
volatile int dummy;
/* ========================================================================= */
#endif
do
{
@ -159,7 +168,11 @@ do
case OP_BRAMINZERO:
if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
return FALSE;
/* =========================================================================
See the comment at the head of this function concerning the next line,
which was an old fudge for the benefit of OS/2.
dummy = 1;
========================================================================= */
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1+LINK_SIZE;
break;
@ -215,15 +228,29 @@ do
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
discard it. */
case OP_NOT_WHITESPACE:
for (c = 0; c < 32; c++)
start_bits[c] |= ~cd->cbits[c+cbit_space];
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
start_bits[c] |= ~d;
}
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
discard it. */
case OP_WHITESPACE:
for (c = 0; c < 32; c++)
start_bits[c] |= cd->cbits[c+cbit_space];
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
start_bits[c] |= d;
}
try_next = FALSE;
break;
@ -277,14 +304,28 @@ do
start_bits[c] |= cd->cbits[c+cbit_digit];
break;
/* The cbit_space table has vertical tab as whitespace; we have to
discard it. */
case OP_NOT_WHITESPACE:
for (c = 0; c < 32; c++)
start_bits[c] |= ~cd->cbits[c+cbit_space];
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
start_bits[c] |= ~d;
}
break;
/* The cbit_space table has vertical tab as whitespace; we have to
discard it. */
case OP_WHITESPACE:
for (c = 0; c < 32; c++)
start_bits[c] |= cd->cbits[c+cbit_space];
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
start_bits[c] |= d;
}
break;
case OP_NOT_WORDCHAR:
@ -408,10 +449,9 @@ uschar start_bits[32];
pcre_extra *extra;
pcre_study_data *study;
const uschar *tables;
const real_pcre *re = (const real_pcre *)external_re;
uschar *code = (uschar *)re + re->name_table_offset +
(re->name_count * re->name_entry_size);
uschar *code;
compile_data compile_block;
const real_pcre *re = (const real_pcre *)external_re;
*errorptr = NULL;
@ -427,6 +467,9 @@ if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
return NULL;
}
code = (uschar *)re + re->name_table_offset +
(re->name_count * re->name_entry_size);
/* For an anchored pattern, or an unanchored pattern that has a first char, or
a multiline pattern that matches only at "line starts", no further processing
at present. */

View File

@ -62,8 +62,8 @@ Arguments:
Returns: the flipped value
*/
static long int
byteflip(long int value, int n)
static unsigned long int
byteflip(unsigned long int value, int n)
{
if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
return ((value & 0x000000ff) << 24) |

View File

@ -79,7 +79,7 @@ Returns: the character type category
*/
int
_pcre_ucp_findprop(const int c, int *type_ptr, int *script_ptr)
_pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr)
{
int bot = 0;
int top = sizeof(ucp_table)/sizeof(cnode);

View File

@ -332,6 +332,30 @@ bool RE::Replace(const StringPiece& rewrite,
return true;
}
// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
static int NewlineMode(int pcre_options) {
  // TODO: if we can make it threadsafe, cache this var
  int newline_mode = 0;
  /* if (newline_mode) return newline_mode; */  // do this once it's cached
  if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)) {
    // An explicit newline option was supplied: use exactly those bits.
    newline_mode = (pcre_options &
                    (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF));
  } else {
    // No explicit option: ask the library for its configured value.
    // Initialised so that a failing pcre_config() call cannot leave us
    // reading an indeterminate value below.
    int newline = 0;
    pcre_config(PCRE_CONFIG_NEWLINE, &newline);
    if (newline == 10)           // '\n'
      newline_mode = PCRE_NEWLINE_LF;
    else if (newline == 13)      // '\r'
      newline_mode = PCRE_NEWLINE_CR;
    else if (newline == 3338)    // ('\r' << 8) | '\n'
      newline_mode = PCRE_NEWLINE_CRLF;
    else
      // Always-false assertion carrying a message. Negating the literal
      // avoids comparing the addresses of two string literals, whose result
      // is unspecified and which compilers warn about.
      assert(!"Unexpected return value from pcre_config(NEWLINE)");
  }
  return newline_mode;
}
int RE::GlobalReplace(const StringPiece& rewrite,
string *str) const {
int count = 0;
@ -350,9 +374,27 @@ int RE::GlobalReplace(const StringPiece& rewrite,
if (matchstart == matchend && matchstart == lastend) {
// advance one character if we matched an empty string at the same
// place as the last match occurred
if (start < static_cast<int>(str->length()))
out.push_back((*str)[start]);
start++;
matchend = start + 1;
// If the current char is CR and we're in CRLF mode, skip LF too.
// Note it's better to call pcre_fullinfo() than to examine
// all_options(), since options_ could have changed between
// compile-time and now, but this is simpler and safe enough.
if (start+1 < static_cast<int>(str->length()) &&
(*str)[start] == '\r' && (*str)[start+1] == '\n' &&
NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF) {
matchend++;
}
// We also need to advance more than one char if we're in utf8 mode.
#ifdef SUPPORT_UTF8
if (options_.utf8()) {
while (matchend < static_cast<int>(str->length()) &&
((*str)[matchend] & 0xc0) == 0x80)
matchend++;
}
#endif
if (matchend <= static_cast<int>(str->length()))
out.append(*str, start, matchend - start);
start = matchend;
} else {
out.append(*str, start, matchstart - start);
Rewrite(&out, rewrite, *str, vec, matches);

View File

@ -32,6 +32,7 @@
// TODO: Test extractions for PartialMatch/Consume
#include <stdio.h>
#include <cassert>
#include <vector>
#include "config.h"
#include "pcrecpp.h"
@ -259,17 +260,71 @@ static void TestReplace() {
"aaaaa",
"bbaaaaa",
"bbabbabbabbabbabb" },
{ "b*",
"bb",
"aa\naa\n",
"bbaa\naa\n",
"bbabbabb\nbbabbabb\nbb" },
{ "b*",
"bb",
"aa\raa\r",
"bbaa\raa\r",
"bbabbabb\rbbabbabb\rbb" },
{ "b*",
"bb",
"aa\r\naa\r\n",
"bbaa\r\naa\r\n",
"bbabbabb\r\nbbabbabb\r\nbb" },
#ifdef SUPPORT_UTF8
{ "b*",
"bb",
"\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", // utf8
"bb\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8",
"bb\xE3\x83\x9B""bb""\xE3\x83\xBC""bb""\xE3\x83\xA0""bb""\xE3\x81\xB8""bb" },
{ "b*",
"bb",
"\xE3\x83\x9B\r\n\xE3\x83\xBC\r\xE3\x83\xA0\n\xE3\x81\xB8\r\n", // utf8
"bb\xE3\x83\x9B\r\n\xE3\x83\xBC\r\xE3\x83\xA0\n\xE3\x81\xB8\r\n",
("bb\xE3\x83\x9B""bb\r\nbb""\xE3\x83\xBC""bb\rbb""\xE3\x83\xA0"
"bb\nbb""\xE3\x81\xB8""bb\r\nbb") },
#endif
{ "", NULL, NULL, NULL, NULL }
};
#ifdef SUPPORT_UTF8
const bool support_utf8 = true;
#else
const bool support_utf8 = false;
#endif
for (const ReplaceTest *t = tests; t->original != NULL; ++t) {
RE re(t->regexp, RE_Options(PCRE_NEWLINE_CRLF).set_utf8(support_utf8));
assert(re.error().empty());
string one(t->original);
CHECK(RE(t->regexp).Replace(t->rewrite, &one));
CHECK(re.Replace(t->rewrite, &one));
CHECK_EQ(one, t->single);
string all(t->original);
CHECK(RE(t->regexp).GlobalReplace(t->rewrite, &all) > 0);
CHECK(re.GlobalReplace(t->rewrite, &all) > 0);
CHECK_EQ(all, t->global);
}
// One final test: test \r\n replacement when we're not in CRLF mode
{
RE re("b*", RE_Options(PCRE_NEWLINE_CR).set_utf8(support_utf8));
assert(re.error().empty());
string all("aa\r\naa\r\n");
CHECK(re.GlobalReplace("bb", &all) > 0);
CHECK_EQ(all, string("bbabbabb\rbb\nbbabbabb\rbb\nbb"));
}
{
RE re("b*", RE_Options(PCRE_NEWLINE_LF).set_utf8(support_utf8));
assert(re.error().empty());
string all("aa\r\naa\r\n");
CHECK(re.GlobalReplace("bb", &all) > 0);
CHECK_EQ(all, string("bbabbabb\rbb\nbbabbabb\rbb\nbb"));
}
// TODO: test what happens when no PCRE_NEWLINE_* flag is set.
// Alas, the answer depends on how pcre was compiled.
}
static void TestExtract() {

View File

@ -117,7 +117,7 @@ if (rc < 0)
*/
default: printf("Matching error %d\n", rc); break;
}
free(re); /* Release memory used for the compiled pattern */
pcre_free(re); /* Release memory used for the compiled pattern */
return 1;
}
@ -223,8 +223,8 @@ if (namecount <= 0) printf("No named substrings\n"); else
if (!find_all)
{
free(re); /* Release the memory used for the compiled pattern */
return 0; /* Finish unless -g was given */
pcre_free(re); /* Release the memory used for the compiled pattern */
return 0; /* Finish unless -g was given */
}
/* Loop for second and subsequent matches */
@ -276,7 +276,7 @@ for (;;)
if (rc < 0)
{
printf("Matching error %d\n", rc);
free(re); /* Release memory used for the compiled pattern */
pcre_free(re); /* Release memory used for the compiled pattern */
return 1;
}
@ -317,7 +317,7 @@ for (;;)
} /* End of loop to find second and subsequent matches */
printf("\n");
free(re); /* Release memory used for the compiled pattern */
pcre_free(re); /* Release memory used for the compiled pattern */
return 0;
}

View File

@ -56,7 +56,7 @@ POSSIBILITY OF SUCH DAMAGE.
typedef int BOOL;
#define VERSION "4.2 09-Jan-2006"
#define VERSION "4.3 01-Jun-2006"
#define MAX_PATTERN_COUNT 100
#if BUFSIZ > 8192
@ -100,10 +100,14 @@ static const char *jfriedl_prefix = "";
static const char *jfriedl_postfix = "";
#endif
static int endlinebyte = '\n'; /* Last byte of endline sequence */
static int endlineextra = 0; /* Extra bytes for endline sequence */
static char *colour_string = (char *)"1;31";
static char *colour_option = NULL;
static char *dee_option = NULL;
static char *DEE_option = NULL;
static char *newline = NULL;
static char *pattern_filename = NULL;
static char *stdin_name = (char *)"(standard input)";
static char *locale = NULL;
@ -185,6 +189,7 @@ static option_item optionlist[] = {
{ OP_STRING, N_LABEL, &stdin_name, "label=name", "set name for standard input" },
{ OP_STRING, N_LOCALE, &locale, "locale=locale", "use the named locale" },
{ OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" },
{ OP_STRING, 'N', &newline, "newline=type", "specify newline type (CR, LR, CRLF)" },
{ OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" },
{ OP_NODATA, 'o', NULL, "only-matching", "show only the part of the line that matched" },
{ OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" },
@ -493,8 +498,9 @@ if (after_context > 0 && lastmatchnumber > 0)
char *pp = lastmatchrestart;
if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
while (*pp != '\n') pp++;
fwrite(lastmatchrestart, 1, pp - lastmatchrestart + 1, stdout);
while (*pp != endlinebyte) pp++;
fwrite(lastmatchrestart, 1, pp - lastmatchrestart + (1 + endlineextra),
stdout);
lastmatchrestart = pp + 1;
}
hyphenpending = TRUE;
@ -566,7 +572,7 @@ while (ptr < endptr)
that any match is constrained to be in the first line. */
linelength = 0;
while (t < endptr && *t++ != '\n') linelength++;
while (t < endptr && *t++ != endlinebyte) linelength++;
length = multiline? endptr - ptr : linelength;
@ -705,7 +711,7 @@ while (ptr < endptr)
while (p < ptr && linecount < after_context)
{
while (*p != '\n') p++;
while (*p != endlinebyte) p++;
p++;
linecount++;
}
@ -719,8 +725,9 @@ while (ptr < endptr)
char *pp = lastmatchrestart;
if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
while (*pp != '\n') pp++;
fwrite(lastmatchrestart, 1, pp - lastmatchrestart + 1, stdout);
while (*pp != endlinebyte) pp++;
fwrite(lastmatchrestart, 1, pp - lastmatchrestart +
(1 + endlineextra), stdout);
lastmatchrestart = pp + 1;
}
if (lastmatchrestart != ptr) hyphenpending = TRUE;
@ -748,7 +755,7 @@ while (ptr < endptr)
{
linecount++;
p--;
while (p > buffer && p[-1] != '\n') p--;
while (p > buffer && p[-1] != endlinebyte) p--;
}
if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted)
@ -759,8 +766,8 @@ while (ptr < endptr)
char *pp = p;
if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", linenumber - linecount--);
while (*pp != '\n') pp++;
fwrite(p, 1, pp - p + 1, stdout); /* In case binary zero */
while (*pp != endlinebyte) pp++;
fwrite(p, 1, pp - p + (1 + endlineextra), stdout);
p = pp + 1;
}
}
@ -777,14 +784,14 @@ while (ptr < endptr)
/* In multiline mode, we want to print to the end of the line in which
the end of the matched string is found, so we adjust linelength and the
line number appropriately. Because the PCRE_FIRSTLINE option is set, the
start of the match will always be before the first \n character. */
start of the match will always be before the first newline sequence. */
if (multiline)
{
char *endmatch = ptr + offsets[1];
t = ptr;
while (t < endmatch) { if (*t++ == '\n') linenumber++; }
while (endmatch < endptr && *endmatch != '\n') endmatch++;
while (t < endmatch) { if (*t++ == endlinebyte) linenumber++; }
while (endmatch < endptr && *endmatch != endlinebyte) endmatch++;
linelength = endmatch - ptr;
}
@ -1206,7 +1213,7 @@ return FALSE;
*************************************************/
/* When the -F option has been used, each string may be a list of strings,
separated by newlines. They will be matched literally.
separated by line breaks. They will be matched literally.
Arguments:
pattern the pattern string
@ -1227,10 +1234,10 @@ if ((process_options & PO_FIXED_STRINGS) != 0)
char buffer[MBUFTHIRD];
for(;;)
{
char *p = strchr(pattern, '\n');
char *p = strchr(pattern, endlinebyte);
if (p == NULL)
return compile_single_pattern(pattern, options, filename, count);
sprintf(buffer, "%.*s", p - pattern, pattern);
sprintf(buffer, "%.*s", p - pattern - endlineextra, pattern);
pattern = p + 1;
if (!compile_single_pattern(buffer, options, filename, count))
return FALSE;
@ -1260,6 +1267,16 @@ char *patterns[MAX_PATTERN_COUNT];
const char *locale_from = "--locale";
const char *error;
/* Set the default line ending value from the default in the PCRE library. */
(void)pcre_config(PCRE_CONFIG_NEWLINE, &i);
switch(i)
{
default: newline = (char *)"lf"; break;
case '\r': newline = (char *)"cr"; break;
case ('\r' << 8) | '\n': newline = (char *)"crlf"; break;
}
/* Process the options */
for (i = 1; i < argc; i++)
@ -1543,6 +1560,28 @@ if (colour_option != NULL && strcmp(colour_option, "never") != 0)
}
}
/* Interpret the newline type; the default settings are Unix-like. */
if (strcmp(newline, "cr") == 0 || strcmp(newline, "CR") == 0)
{
pcre_options |= PCRE_NEWLINE_CR;
endlinebyte = '\r';
}
else if (strcmp(newline, "lf") == 0 || strcmp(newline, "LF") == 0)
{
pcre_options |= PCRE_NEWLINE_LF;
}
else if (strcmp(newline, "crlf") == 0 || strcmp(newline, "CRLF") == 0)
{
pcre_options |= PCRE_NEWLINE_CRLF;
endlineextra = 1;
}
else
{
fprintf(stderr, "pcregrep: Invalid newline specifier \"%s\"\n", newline);
return 2;
}
/* Interpret the text values for -d and -D */
if (dee_option != NULL)

View File

@ -77,7 +77,7 @@ static const int eint[] = {
REG_ASSERT, /* internal error: code overflow */
REG_BADPAT, /* unrecognized character after (?< */
REG_BADPAT, /* lookbehind assertion is not fixed length */
REG_BADPAT, /* malformed number after (?( */
REG_BADPAT, /* malformed number or name after (?( */
REG_BADPAT, /* conditional group contains more than two branches */
REG_BADPAT, /* assertion expected after (?( */
REG_BADPAT, /* (?R or (?digits must be followed by ) */
@ -94,11 +94,15 @@ static const int eint[] = {
REG_BADPAT, /* recursive call could loop indefinitely */
REG_BADPAT, /* unrecognized character after (?P */
REG_BADPAT, /* syntax error after (?P */
REG_BADPAT, /* two named groups have the same name */
REG_BADPAT, /* two named subpatterns have the same name */
REG_BADPAT, /* invalid UTF-8 string */
REG_BADPAT, /* support for \P, \p, and \X has not been compiled */
REG_BADPAT, /* malformed \P or \p sequence */
REG_BADPAT /* unknown property name after \P or \p */
REG_BADPAT, /* unknown property name after \P or \p */
REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */
REG_BADPAT, /* too many named subpatterns (maximum 10,000) */
REG_BADPAT, /* repeated subpattern is too long */
REG_BADPAT /* octal value is greater than \377 (not in UTF-8 mode) */
};
/* Table of texts corresponding to POSIX error codes */

View File

@ -44,6 +44,10 @@ POSSIBILITY OF SUCH DAMAGE.
#include <locale.h>
#include <errno.h>
#ifndef _WIN32
#include <sys/resource.h>
#endif
#define PCRE_SPY /* For Win32 build, import data, not export */
/* We include pcre_internal.h because we need the internal info for displaying
@ -101,11 +105,6 @@ function (define NOINFOCHECK). */
#define LOOPREPEAT 500000
#define BUFFER_SIZE 30000
#define PBUFFER_SIZE BUFFER_SIZE
#define DBUFFER_SIZE BUFFER_SIZE
/* Static variables */
static FILE *outfile;
@ -119,10 +118,95 @@ static int show_malloc;
static int use_utf8;
static size_t gotten_store;
/* The buffers grow automatically if very long input lines are encountered. */
static int buffer_size = 50000;
static uschar *buffer = NULL;
static uschar *dbuffer = NULL;
static uschar *pbuffer = NULL;
/*************************************************
* Read or extend an input line *
*************************************************/
/* Read one input line into the shared input buffer, starting at a given
position inside it. Patterns and data lines may be continued over several
physical lines, so the caller chooses where the new text is appended.

Whenever 1000 bytes or fewer remain between the current read position and the
end of the buffer, all three working buffers (buffer, dbuffer, pbuffer) are
replaced by freshly allocated ones of twice the size. The contents of buffer
and pbuffer are carried over; dbuffer's contents are not. An allocation
failure prints a message on stderr and terminates the program.

Arguments:
  f          the file to read
  start      where in buffer to start (this *must* be within buffer)

Returns:     pointer to the start of the newly read data; this equals start
               unless the buffers were reallocated, in which case it is the
               corresponding position in the new buffer
             NULL if EOF was reached before any data was read
*/

static uschar *
extend_inputline(FILE *f, uschar *start)
{
uschar *here = start;

for (;;)
  {
  int room = buffer_size - (here - buffer);
  int got;

  /* Too little space left: double every buffer, then retry the read. */

  if (room <= 1000)
    {
    int grown_size = 2*buffer_size;
    uschar *grown_buffer = (unsigned char *)malloc(grown_size);
    uschar *grown_dbuffer = (unsigned char *)malloc(grown_size);
    uschar *grown_pbuffer = (unsigned char *)malloc(grown_size);

    if (grown_buffer == NULL || grown_dbuffer == NULL || grown_pbuffer == NULL)
      {
      fprintf(stderr, "pcretest: malloc(%d) failed\n", grown_size);
      exit(1);
      }

    memcpy(grown_buffer, buffer, buffer_size);
    memcpy(grown_pbuffer, pbuffer, buffer_size);

    /* Re-express both cursors relative to the new block while the old base
    address is still available. */

    start = grown_buffer + (start - buffer);
    here = grown_buffer + (here - buffer);

    free(buffer);
    free(dbuffer);
    free(pbuffer);

    buffer = grown_buffer;
    dbuffer = grown_dbuffer;
    pbuffer = grown_pbuffer;
    buffer_size = grown_size;
    continue;
    }

  /* EOF (or a read error): report "nothing read" only if no earlier pass of
  this loop stored any text. */

  if (fgets((char *)here, room, f) == NULL)
    return (here == start)? NULL : start;

  /* A trailing newline means the whole line has arrived; otherwise keep
  appending after what has been read so far. */

  got = (int)strlen((char *)here);
  if (got > 0 && here[got - 1] == '\n') return start;
  here += got;
  }

return NULL;  /* Control never gets here */
}
/*************************************************
* Read number from string *
*************************************************/
@ -159,19 +243,19 @@ return(result);
and returns the value of the character.
Argument:
buffer a pointer to the byte vector
vptr a pointer to an int to receive the value
utf8bytes a pointer to the byte vector
vptr a pointer to an int to receive the value
Returns: > 0 => the number of bytes consumed
-6 to 0 => malformed UTF-8 character at offset = (-return)
Returns: > 0 => the number of bytes consumed
-6 to 0 => malformed UTF-8 character at offset = (-return)
*/
#if !defined NOUTF8
static int
utf82ord(unsigned char *buffer, int *vptr)
utf82ord(unsigned char *utf8bytes, int *vptr)
{
int c = *buffer++;
int c = *utf8bytes++;
int d = c;
int i, j, s;
@ -191,7 +275,7 @@ d = (c & utf8_table3[i]) << s;
for (j = 0; j < i; j++)
{
c = *buffer++;
c = *utf8bytes++;
if ((c & 0xc0) != 0x80) return -(j+1);
s -= 6;
d |= (c & 0x3f) << s;
@ -222,24 +306,24 @@ and encodes it as a UTF-8 character in 0 to 6 bytes.
Arguments:
cvalue the character value
buffer pointer to buffer for result - at least 6 bytes long
utf8bytes pointer to buffer for result - at least 6 bytes long
Returns: number of characters placed in the buffer
*/
static int
ord2utf8(int cvalue, uschar *buffer)
ord2utf8(int cvalue, uschar *utf8bytes)
{
register int i, j;
for (i = 0; i < utf8_table1_size; i++)
if (cvalue <= utf8_table1[i]) break;
buffer += i;
utf8bytes += i;
for (j = i; j > 0; j--)
{
*buffer-- = 0x80 | (cvalue & 0x3f);
*utf8bytes-- = 0x80 | (cvalue & 0x3f);
cvalue >>= 6;
}
*buffer = utf8_table2[i] | cvalue;
*utf8bytes = utf8_table2[i] | cvalue;
return i + 1;
}
@ -461,8 +545,8 @@ if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
* Byte flipping function *
*************************************************/
static long int
byteflip(long int value, int n)
static unsigned long int
byteflip(unsigned long int value, int n)
{
if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
return ((value & 0x000000ff) << 24) |
@ -525,6 +609,32 @@ return count;
/*************************************************
* Check newline indicator *
*************************************************/
/* Translate a newline keyword into the corresponding PCRE option. This is
used both at compile and run-time to check for <xxx> escapes. The text at p
must begin with one of the lower-case forms "cr>", "lf>" or "crlf>"; anything
after the closing '>' is ignored. If none of them matches, a message quoting
the text is written to f.

Arguments:
  p           points after the leading '<'
  f           file for error message

Returns:      PCRE_NEWLINE_CR, PCRE_NEWLINE_LF or PCRE_NEWLINE_CRLF,
              or 0 if the keyword is not recognised
*/

static int
check_newline(uschar *p, FILE *f)
{
static const struct { const char *text; int flag; } kinds[] = {
  { "cr>",   PCRE_NEWLINE_CR },
  { "lf>",   PCRE_NEWLINE_LF },
  { "crlf>", PCRE_NEWLINE_CRLF }
  };
size_t i;

for (i = 0; i < sizeof(kinds)/sizeof(kinds[0]); i++)
  {
  if (strncmp((char *)p, kinds[i].text, strlen(kinds[i].text)) == 0)
    return kinds[i].flag;
  }

fprintf(f, "Unknown newline type at: <%s\n", p);
return 0;
}
/*************************************************
* Main Program *
*************************************************/
@ -553,16 +663,23 @@ int debug = 0;
int done = 0;
int all_use_dfa = 0;
int yield = 0;
int stack_size;
unsigned char *buffer;
unsigned char *dbuffer;
/* These vectors store, end-to-end, a list of captured substring names. Assume
that 1024 is plenty long enough for the few names we'll be testing. */
uschar copynames[1024];
uschar getnames[1024];
uschar *copynamesptr;
uschar *getnamesptr;
/* Get buffers from malloc() so that Electric Fence will check their misuse
when I am debugging. */
when I am debugging. They grow automatically when very long lines are read. */
buffer = (unsigned char *)malloc(BUFFER_SIZE);
dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
pbuffer = (unsigned char *)malloc(PBUFFER_SIZE);
buffer = (unsigned char *)malloc(buffer_size);
dbuffer = (unsigned char *)malloc(buffer_size);
pbuffer = (unsigned char *)malloc(buffer_size);
/* The outfile variable is static so that new_malloc can use it. The _setmode()
stuff is some magic that I don't understand, but which apparently does good
@ -596,6 +713,28 @@ while (argc > 1 && argv[op][0] == '-')
op++;
argc--;
}
else if (strcmp(argv[op], "-S") == 0 && argc > 2 &&
((stack_size = get_value((unsigned char *)argv[op+1], &endptr)),
*endptr == 0))
{
#ifdef _WIN32
printf("PCRE: -S not supported on this OS\n");
exit(1);
#else
int rc;
struct rlimit rlim;
getrlimit(RLIMIT_STACK, &rlim);
rlim.rlim_cur = stack_size * 1024 * 1024;
rc = setrlimit(RLIMIT_STACK, &rlim);
if (rc != 0)
{
printf("PCRE: setrlimit() failed with error %d\n", rc);
exit(1);
}
op++;
argc--;
#endif
}
#if !defined NOPOSIX
else if (strcmp(argv[op], "-p") == 0) posix = 1;
#endif
@ -609,7 +748,8 @@ while (argc > 1 && argv[op][0] == '-')
(void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
printf(" %sUnicode properties support\n", rc? "" : "No ");
(void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
printf(" Newline sequence is %s\n", (rc == '\r')? "CR" :
(rc == '\n')? "LF" : "CRLF");
(void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
printf(" Internal link size = %d\n", rc);
(void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
@ -625,7 +765,7 @@ while (argc > 1 && argv[op][0] == '-')
else
{
printf("** Unknown or malformed option %s\n", argv[op]);
printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
printf("Usage: pcretest [options] [<input> [<output>]]\n");
printf(" -C show PCRE compile-time options and exit\n");
printf(" -d debug: show compiled code; implies -i\n");
#if !defined NODFA
@ -637,6 +777,7 @@ while (argc > 1 && argv[op][0] == '-')
#if !defined NOPOSIX
printf(" -p use POSIX interface\n");
#endif
printf(" -S <n> set stack size to <n> megabytes\n");
printf(" -s output store (memory) used information\n"
" -t time compilation and execution\n");
yield = 1;
@ -723,7 +864,7 @@ while (!done)
use_utf8 = 0;
if (infile == stdin) printf(" re> ");
if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL) break;
if (extend_inputline(infile, buffer) == NULL) break;
if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
fflush(outfile);
@ -735,7 +876,7 @@ while (!done)
if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
{
unsigned long int magic;
unsigned long int magic, get_options;
uschar sbuf[8];
FILE *f;
@ -783,8 +924,8 @@ while (!done)
/* Need to know if UTF-8 for printing data strings */
new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
use_utf8 = (options & PCRE_UTF8) != 0;
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
use_utf8 = (get_options & PCRE_UTF8) != 0;
/* Now see if there is any following study data */
@ -838,16 +979,8 @@ while (!done)
pp++;
}
if (*pp != 0) break;
len = BUFFER_SIZE - (pp - buffer);
if (len < 256)
{
fprintf(outfile, "** Expression too long - missing delimiter?\n");
goto SKIP_DATA;
}
if (infile == stdin) printf(" > ");
if (fgets((char *)pp, len, infile) == NULL)
if ((pp = extend_inputline(infile, pp)) == NULL)
{
fprintf(outfile, "** Unexpected EOF\n");
done = 1;
@ -893,6 +1026,7 @@ while (!done)
case 'F': do_flip = 1; break;
case 'G': do_G = 1; break;
case 'I': do_showinfo = 1; break;
case 'J': options |= PCRE_DUPNAMES; break;
case 'M': log_store = 1; break;
case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
@ -927,6 +1061,15 @@ while (!done)
*pp = 0;
break;
case '<':
{
int x = check_newline(pp, outfile);
if (x == 0) goto SKIP_DATA;
options |= x;
while (*pp++ != '>');
}
break;
case '\r': /* So that it works in Windows */
case '\n':
case ' ':
@ -961,7 +1104,7 @@ while (!done)
if (rc != 0)
{
(void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
(void)regerror(rc, &preg, (char *)buffer, buffer_size);
fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
goto SKIP_DATA;
}
@ -1002,7 +1145,7 @@ while (!done)
{
for (;;)
{
if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
if (extend_inputline(infile, buffer) == NULL)
{
done = 1;
goto CONTINUE;
@ -1163,13 +1306,13 @@ while (!done)
if (do_flip)
{
all_options = byteflip(all_options, sizeof(all_options));
}
}
if ((all_options & PCRE_NOPARTIAL) != 0)
fprintf(outfile, "Partial matching not supported\n");
if (get_options == 0) fprintf(outfile, "No options\n");
else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s%s\n",
else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
@ -1181,14 +1324,30 @@ while (!done)
((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
((get_options & PCRE_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "",
((get_options & PCRE_UTF8) != 0)? " utf8" : "",
((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "");
((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");
if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
fprintf(outfile, "Case state changes\n");
switch (get_options & PCRE_NEWLINE_CRLF)
{
case PCRE_NEWLINE_CR:
fprintf(outfile, "Forced newline sequence: CR\n");
break;
case PCRE_NEWLINE_LF:
fprintf(outfile, "Forced newline sequence: LF\n");
break;
case PCRE_NEWLINE_CRLF:
fprintf(outfile, "Forced newline sequence: CRLF\n");
break;
default:
break;
}
if (first_char == -1)
{
fprintf(outfile, "First char at start or follows \\n\n");
fprintf(outfile, "First char at start or follows newline\n");
}
else if (first_char < 0)
{
@ -1343,6 +1502,12 @@ while (!done)
options = 0;
*copynames = 0;
*getnames = 0;
copynamesptr = copynames;
getnamesptr = getnames;
pcre_callout = callout;
first_callout = 1;
callout_extra = 0;
@ -1351,15 +1516,24 @@ while (!done)
callout_fail_id = -1;
show_malloc = 0;
if (infile == stdin) printf("data> ");
if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
{
done = 1;
goto CONTINUE;
}
if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
if (extra != NULL) extra->flags &=
~(PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION);
len = 0;
for (;;)
{
if (infile == stdin) printf("data> ");
if (extend_inputline(infile, buffer + len) == NULL)
{
if (len > 0) break;
done = 1;
goto CONTINUE;
}
if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
len = (int)strlen((char *)buffer);
if (buffer[len-1] == '\n') break;
}
len = (int)strlen((char *)buffer);
while (len > 0 && isspace(buffer[len-1])) len--;
buffer[len] = 0;
if (len == 0) break;
@ -1389,6 +1563,17 @@ while (!done)
c -= '0';
while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
c = c * 8 + *p++ - '0';
#if !defined NOUTF8
if (use_utf8 && c > 255)
{
unsigned char buff8[8];
int ii, utn;
utn = ord2utf8(c, buff8);
for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
c = buff8[ii]; /* Last byte */
}
#endif
break;
case 'x':
@ -1450,14 +1635,14 @@ while (!done)
}
else if (isalnum(*p))
{
uschar name[256];
uschar *npp = name;
uschar *npp = copynamesptr;
while (isalnum(*p)) *npp++ = *p++;
*npp++ = 0;
*npp = 0;
n = pcre_get_stringnumber(re, (char *)name);
n = pcre_get_stringnumber(re, (char *)copynamesptr);
if (n < 0)
fprintf(outfile, "no parentheses with name \"%s\"\n", name);
else copystrings |= 1 << n;
fprintf(outfile, "no parentheses with name \"%s\"\n", copynamesptr);
copynamesptr = npp;
}
else if (*p == '+')
{
@ -1518,14 +1703,14 @@ while (!done)
}
else if (isalnum(*p))
{
uschar name[256];
uschar *npp = name;
uschar *npp = getnamesptr;
while (isalnum(*p)) *npp++ = *p++;
*npp++ = 0;
*npp = 0;
n = pcre_get_stringnumber(re, (char *)name);
n = pcre_get_stringnumber(re, (char *)getnamesptr);
if (n < 0)
fprintf(outfile, "no parentheses with name \"%s\"\n", name);
else getstrings |= 1 << n;
fprintf(outfile, "no parentheses with name \"%s\"\n", getnamesptr);
getnamesptr = npp;
}
continue;
@ -1564,6 +1749,28 @@ while (!done)
options |= PCRE_PARTIAL;
continue;
case 'Q':
while(isdigit(*p)) n = n * 10 + *p++ - '0';
if (extra == NULL)
{
extra = (pcre_extra *)malloc(sizeof(pcre_extra));
extra->flags = 0;
}
extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
extra->match_limit_recursion = n;
continue;
case 'q':
while(isdigit(*p)) n = n * 10 + *p++ - '0';
if (extra == NULL)
{
extra = (pcre_extra *)malloc(sizeof(pcre_extra));
extra->flags = 0;
}
extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
extra->match_limit = n;
continue;
#if !defined NODFA
case 'R':
options |= PCRE_DFA_RESTART;
@ -1581,6 +1788,15 @@ while (!done)
case '?':
options |= PCRE_NO_UTF8_CHECK;
continue;
case '<':
{
int x = check_newline(p, outfile);
if (x == 0) goto NEXT_DATA;
options |= x;
while (*p++ != '>');
}
continue;
}
*q++ = c;
}
@ -1611,7 +1827,7 @@ while (!done)
if (rc != 0)
{
(void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
(void)regerror(rc, &preg, (char *)buffer, buffer_size);
fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
}
else if ((((const pcre *)preg.re_pcre)->options & PCRE_NO_AUTO_CAPTURE)
@ -1690,7 +1906,7 @@ while (!done)
extra->flags = 0;
}
count = check_match_limit(re, extra, bptr, len, start_offset,
(void)check_match_limit(re, extra, bptr, len, start_offset,
options|g_notempty, use_offsets, use_size_offsets,
PCRE_EXTRA_MATCH_LIMIT, &(extra->match_limit),
PCRE_ERROR_MATCHLIMIT, "match()");
@ -1778,7 +1994,7 @@ while (!done)
{
if ((copystrings & (1 << i)) != 0)
{
char copybuffer[16];
char copybuffer[256];
int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
i, copybuffer, sizeof(copybuffer));
if (rc < 0)
@ -1788,6 +2004,19 @@ while (!done)
}
}
for (copynamesptr = copynames;
*copynamesptr != 0;
copynamesptr += (int)strlen((char*)copynamesptr) + 1)
{
char copybuffer[256];
int rc = pcre_copy_named_substring(re, (char *)bptr, use_offsets,
count, (char *)copynamesptr, copybuffer, sizeof(copybuffer));
if (rc < 0)
fprintf(outfile, "copy substring %s failed %d\n", copynamesptr, rc);
else
fprintf(outfile, " C %s (%d) %s\n", copybuffer, rc, copynamesptr);
}
for (i = 0; i < 32; i++)
{
if ((getstrings & (1 << i)) != 0)
@ -1800,12 +2029,27 @@ while (!done)
else
{
fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
/* free((void *)substring); */
pcre_free_substring(substring);
}
}
}
for (getnamesptr = getnames;
*getnamesptr != 0;
getnamesptr += (int)strlen((char*)getnamesptr) + 1)
{
const char *substring;
int rc = pcre_get_named_substring(re, (char *)bptr, use_offsets,
count, (char *)getnamesptr, &substring);
if (rc < 0)
fprintf(outfile, "copy substring %s failed %d\n", getnamesptr, rc);
else
{
fprintf(outfile, " G %s (%d) %s\n", substring, rc, getnamesptr);
pcre_free_substring(substring);
}
}
if (getlist)
{
const char **stringlist;
@ -1905,6 +2149,8 @@ while (!done)
len -= use_offsets[1];
}
} /* End of loop for /g and /G */
NEXT_DATA: continue;
} /* End of loop for data lines */
CONTINUE:

View File

@ -39,4 +39,11 @@ eighteen
nineteen
twenty
Here follows some CR/LF/CRLF test data.
abc
def
ghi
jkl
This is the last line of this file.

View File

@ -1474,11 +1474,11 @@
/(abc)\323/
abc\xd3
/(abc)\500/
/(abc)\100/
abc\x40
abc\100
/(abc)\5000/
/(abc)\1000/
abc\x400
abc\x40\x30
abc\1000
@ -3847,4 +3847,41 @@
** Failers
abcddefg
/(?<![^f]oo)(bar)/
foobarX
** Failers
boobarX
/(?<![^f])X/
offX
** Failers
onyX
/(?<=[^f])X/
onyX
** Failers
offX
/^/mg
a\nb\nc\n
\
/(?<=C\n)^/mg
A\nC\nC\n
/(?:(?(1)a|b)(X))+/
bXaX
/(?:(?(1)\1a|b)(X|Y))+/
bXXaYYaY
bXYaXXaX
/()()()()()()()()()(?:(?(10)\10a|b)(X|Y))+/
bXXaYYaY
/[[,abc,]+]/
abc]
a,b]
[a,b,c]
/ End of testinput1 /

View File

@ -733,7 +733,7 @@
Ab
AB
/[\200-\410]/
/[\200-\110]/
/^(?(0)f|b)oo/
@ -1490,4 +1490,157 @@
/\x{0000ff}/
/^((?P<A>a1)|(?P<A>a2)b)/
/^((?P<A>a1)|(?P<A>a2)b)/J
a1b\CA
a2b\CA
** Failers
a1b\CZ\CA
/^(?P<A>a)(?P<A>b)/J
ab\CA
/^(?P<A>a)(?P<A>b)|cd/J
ab\CA
cd\CA
/^(?P<A>a)(?P<A>b)|cd(?P<A>ef)(?P<A>gh)/J
cdefgh\CA
/^((?P<A>a1)|(?P<A>a2)b)/J
a1b\GA
a2b\GA
** Failers
a1b\GZ\GA
/^(?P<A>a)(?P<A>b)/J
ab\GA
/^(?P<A>a)(?P<A>b)|cd/J
ab\GA
cd\GA
/^(?P<A>a)(?P<A>b)|cd(?P<A>ef)(?P<A>gh)/J
cdefgh\GA
/(?J)^((?P<A>a1)|(?P<A>a2)b)/
a1b\CA
a2b\CA
/^(?P<A>a) (?J:(?P<B>b)(?P<B>c)) (?P<A>d)/
/ In this next test, J is not set at the outer level; consequently it isn't
set in the pattern's options; consequently pcre_get_named_substring() produces
a random value. /x
/^(?P<A>a) (?J:(?P<B>b)(?P<B>c)) (?P<C>d)/
a bc d\CA\CB\CC
/^(?P<A>a)?(?(A)a|b)/
aabc
bc
** Failers
abc
/(?:(?(ZZ)a|b)(?P<ZZ>X))+/
bXaX
/(?:(?(2y)a|b)(X))+/
/(?:(?(ZA)a|b)(?P<ZZ>X))+/
/(?:(?(ZZ)a|b)(?(ZZ)a|b)(?P<ZZ>X))+/
bbXaaX
/(?:(?(ZZ)a|\(b\))\\(?P<ZZ>X))+/
(b)\\Xa\\X
/(?P<ABC/
/(?:(?(A)(?P=A)a|b)(?P<A>X|Y))+/
bXXaYYaY
bXYaXXaX
/()()()()()()()()()(?:(?(A)(?P=A)a|b)(?P<A>X|Y))+/
bXXaYYaY
/\777/
/\s*,\s*/S
\x0b,\x0b
\x0c,\x0d
/^abc/m
xyz\nabc
xyz\nabc\<lf>
xyz\r\nabc\<lf>
xyz\rabc\<cr>
xyz\r\nabc\<crlf>
** Failers
xyz\nabc\<cr>
xyz\r\nabc\<cr>
xyz\nabc\<crlf>
xyz\rabc\<crlf>
xyz\rabc\<lf>
/abc$/m
xyzabc
xyzabc\n
xyzabc\npqr
xyzabc\r\<cr>
xyzabc\rpqr\<cr>
xyzabc\r\n\<crlf>
xyzabc\r\npqr\<crlf>
** Failers
xyzabc\r
xyzabc\rpqr
xyzabc\r\n
xyzabc\r\npqr
/^abc/m<cr>
xyz\rabcdef
xyz\nabcdef\<lf>
** Failers
xyz\nabcdef
/^abc/m<lf>
xyz\nabcdef
xyz\rabcdef\<cr>
** Failers
xyz\rabcdef
/^abc/m<crlf>
xyz\r\nabcdef
xyz\rabcdef\<cr>
** Failers
xyz\rabcdef
/^abc/m<bad>
/abc/
xyz\rabc\<bad>
abc
/.*/
abc\ndef
abc\rdef
abc\r\ndef
\<cr>abc\ndef
\<cr>abc\rdef
\<cr>abc\r\ndef
\<crlf>abc\ndef
\<crlf>abc\rdef
\<crlf>abc\r\ndef
/\w+(.)(.)?def/s
abc\ndef
abc\rdef
abc\r\ndef
+((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)+
/* this is a C style comment */\M
/(?P<B>25[0-5]|2[0-4]\d|[01]?\d?\d)(?:\.(?P>B)){3}/
/ End of testinput2 /

View File

@ -510,7 +510,14 @@
/^\x{85}$/8i
\x{85}
/^ሴ/8
/^\ሴ/8
"(?s)(.{1,5})"8
abcdefg
ab
/ End of testinput4 /

View File

@ -265,4 +265,10 @@
/^\ሴ/8D
/\777/I
/\777/8I
\x{1ff}
\777
/ End of testinput5 /

View File

@ -738,4 +738,13 @@
\x{1c5}XY
AXY
/^(\p{Z}[^\p{C}\p{Z}]+)*$/
\xa0!
/^[\pL](abc)(?1)/
AabcabcYZ
/([\pL]=(abc))*X/
L=abcX
/ End of testinput6 /

View File

@ -1909,11 +1909,11 @@
/(abc)\323/
abc\xd3
/(abc)\500/
/(abc)\100/
abc\x40
abc\100
/(abc)\5000/
/(abc)\1000/
abc\x400
abc\x40\x30
abc\1000
@ -4019,4 +4019,100 @@
123\P
4\P\R
/^/mg
a\nb\nc\n
\
/(?<=C\n)^/mg
A\nC\nC\n
/(?s)A?B/
AB
aB
/(?s)A*B/
AB
aB
/(?m)A?B/
AB
aB
/(?m)A*B/
AB
aB
/Content-Type\x3A[^\r\n]{6,}/
Content-Type:xxxxxyyy
/Content-Type\x3A[^\r\n]{6,}z/
Content-Type:xxxxxyyyz
/Content-Type\x3A[^a]{6,}/
Content-Type:xxxyyy
/Content-Type\x3A[^a]{6,}z/
Content-Type:xxxyyyz
/^abc/m
xyz\nabc
xyz\nabc\<lf>
xyz\r\nabc\<lf>
xyz\rabc\<cr>
xyz\r\nabc\<crlf>
** Failers
xyz\nabc\<cr>
xyz\r\nabc\<cr>
xyz\nabc\<crlf>
xyz\rabc\<crlf>
xyz\rabc\<lf>
/abc$/m
xyzabc
xyzabc\n
xyzabc\npqr
xyzabc\r\<cr>
xyzabc\rpqr\<cr>
xyzabc\r\n\<crlf>
xyzabc\r\npqr\<crlf>
** Failers
xyzabc\r
xyzabc\rpqr
xyzabc\r\n
xyzabc\r\npqr
/^abc/m<cr>
xyz\rabcdef
xyz\nabcdef\<lf>
** Failers
xyz\nabcdef
/^abc/m<lf>
xyz\nabcdef
xyz\rabcdef\<cr>
** Failers
xyz\rabcdef
/^abc/m<crlf>
xyz\r\nabcdef
xyz\rabcdef\<cr>
** Failers
xyz\rabcdef
/.*/
abc\ndef
abc\rdef
abc\r\ndef
\<cr>abc\ndef
\<cr>abc\rdef
\<cr>abc\r\ndef
\<crlf>abc\ndef
\<crlf>abc\rdef
\<crlf>abc\r\ndef
/\w+(.)(.)?def/s
abc\ndef
abc\rdef
abc\r\ndef
/ End of testinput7 /

View File

@ -2127,7 +2127,7 @@ No match
0: abc\xd3
1: abc
/(abc)\500/
/(abc)\100/
abc\x40
0: abc@
1: abc
@ -2135,7 +2135,7 @@ No match
0: abc@
1: abc
/(abc)\5000/
/(abc)\1000/
abc\x400
0: abc@0
1: abc
@ -6282,4 +6282,76 @@ No match
abcddefg
No match
/(?<![^f]oo)(bar)/
foobarX
0: bar
1: bar
** Failers
No match
boobarX
No match
/(?<![^f])X/
offX
0: X
** Failers
No match
onyX
No match
/(?<=[^f])X/
onyX
0: X
** Failers
No match
offX
No match
/^/mg
a\nb\nc\n
0:
0:
0:
\
0:
/(?<=C\n)^/mg
A\nC\nC\n
0:
/(?:(?(1)a|b)(X))+/
bXaX
0: bXaX
1: X
/(?:(?(1)\1a|b)(X|Y))+/
bXXaYYaY
0: bXXaYYaY
1: Y
bXYaXXaX
0: bX
1: X
/()()()()()()()()()(?:(?(10)\10a|b)(X|Y))+/
bXXaYYaY
0: bX
1:
2:
3:
4:
5:
6:
7:
8:
9:
10: X
/[[,abc,]+]/
abc]
0: abc]
a,b]
0: a,b]
[a,b,c]
0: [a,b,c]
/ End of testinput1 /

View File

@ -115,14 +115,14 @@ Failed: unrecognized character after (? at offset 2
Capturing subpattern count = 0
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
Need char = 'b'
/.*?b/
Capturing subpattern count = 0
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
Need char = 'b'
/cat|dog|elephant/
@ -326,7 +326,7 @@ No need char
Capturing subpattern count = 3
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
No need char
defabc
0: defabc
@ -517,7 +517,6 @@ No need char
/(^b|(?i)^d)/
Capturing subpattern count = 1
Options: anchored
Case state changes
No first char
No need char
@ -552,13 +551,13 @@ Starting byte set: b c x y
/(^a|^b)/m
Capturing subpattern count = 1
Options: multiline
First char at start or follows \n
First char at start or follows newline
No need char
/(?i)(^a|^b)/m
Capturing subpattern count = 1
Options: caseless multiline
First char at start or follows \n
First char at start or follows newline
No need char
/(a)(?(1)a|b|c)/
@ -568,13 +567,13 @@ Failed: conditional group contains more than two branches at offset 13
Failed: conditional group contains more than two branches at offset 12
/(?(1a)/
Failed: malformed number after (?( at offset 4
Failed: reference to non-existent subpattern at offset 6
/(?(?i))/
Failed: assertion expected after (?( at offset 3
/(?(abc))/
Failed: assertion expected after (?( at offset 3
Failed: reference to non-existent subpattern at offset 7
/(?(?<ab))/
Failed: unrecognized character after (?< at offset 5
@ -592,7 +591,6 @@ Capturing subpattern count = 1
Max back reference = 1
Partial matching not supported
No options
Case state changes
First char = 'b' (caseless)
Need char = 'h' (caseless)
@ -609,7 +607,6 @@ Need char = 'h' (caseless)
------------------------------------------------------------------
Capturing subpattern count = 1
No options
Case state changes
First char = 'b' (caseless)
No need char
Study returned NULL
@ -618,7 +615,6 @@ Study returned NULL
Capturing subpattern count = 1
Partial matching not supported
No options
Case state changes
No first char
No need char
Starting byte set: C a b c d
@ -664,7 +660,7 @@ No need char
/^abc/m
Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
First char at start or follows newline
Need char = 'c'
/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/
@ -721,7 +717,7 @@ No match
/^(?<=foo\n)bar/m
Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
First char at start or follows newline
Need char = 'r'
foo\nbarbar
0: bar
@ -737,7 +733,7 @@ No match
/(?>^abc)/m
Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
First char at start or follows newline
Need char = 'c'
abc
0: abc
@ -782,7 +778,6 @@ No match
/(?<=ab(?i)x|y|z)/
Capturing subpattern count = 0
No options
Case state changes
No first char
No need char
@ -790,7 +785,7 @@ No need char
Capturing subpattern count = 2
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
No need char
alphabetabcd
0: alphabetabcd
@ -803,7 +798,6 @@ No need char
/(?<=ab(?i)x(?-i)y|(?i)z|b)ZZ/
Capturing subpattern count = 0
No options
Case state changes
First char = 'Z'
Need char = 'Z'
abxyZZ
@ -966,7 +960,7 @@ Failed: unrecognized character after (? at offset 3
Failed: unrecognized character after (? at offset 3
/(?(1?)a|b)/
Failed: malformed number after (?( at offset 4
Failed: malformed number or name after (?( at offset 4
/(?(1)a|b|c)/
Failed: conditional group contains more than two branches at offset 10
@ -1021,7 +1015,7 @@ No need char
abcdefghijklmnopqrstuvwxyz\C1
0: abcdefghijklmnopqrst
1: abcdefghijklmnopqrst
copy substring 1 failed -6
1C abcdefghijklmnopqrst (20)
abcdefghijklmnopqrstuvwxyz\G1
0: abcdefghijklmnopqrst
1: abcdefghijklmnopqrst
@ -1054,7 +1048,7 @@ No need char
abcdefghijklmnopqrstuvwxyz\C1\G1\L
0: abcdefghijklmnop
1: abcdefghijklmnop
copy substring 1 failed -6
1C abcdefghijklmnop (16)
1G abcdefghijklmnop (16)
0L abcdefghijklmnop
1L abcdefghijklmnop
@ -1128,7 +1122,7 @@ Need char = 'd'
Capturing subpattern count = 0
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
Need char = 'X'
/.*X/Ds
@ -1161,7 +1155,7 @@ Need char = 'X'
Capturing subpattern count = 1
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
No need char
/(.*X|^B)/Ds
@ -1221,7 +1215,7 @@ No need char
Capturing subpattern count = 0
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
No need char
/\Biss\B/+
@ -1306,7 +1300,7 @@ No need char
Capturing subpattern count = 0
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
Need char = 's'
abciss\nxyzisspqr
0: abciss
@ -1365,7 +1359,7 @@ No need char
/^ab\n/mg+
Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
First char at start or follows newline
Need char = 10
ab\nab\ncd
0: ab\x0a
@ -2223,7 +2217,6 @@ No need char
/((?-i)[[:lower:]])[[:lower:]]/i
Capturing subpattern count = 1
Options: caseless
Case state changes
No first char
No need char
ab
@ -2240,11 +2233,11 @@ No match
AB
No match
/[\200-\410]/
/[\200-\110]/
Failed: range out of order in character class at offset 9
/^(?(0)f|b)oo/
Failed: invalid condition (?(0) at offset 5
Failed: invalid condition (?(0) at offset 6
/This one's here because of the large output vector needed/
Capturing subpattern count = 0
@ -2761,7 +2754,6 @@ No need char
------------------------------------------------------------------
Capturing subpattern count = 0
No options
Case state changes
First char = 'a'
Need char = 'b' (caseless)
ab
@ -2787,7 +2779,6 @@ No match
------------------------------------------------------------------
Capturing subpattern count = 1
No options
Case state changes
First char = 'a'
Need char = 'b' (caseless)
ab
@ -3370,7 +3361,7 @@ No need char
Capturing subpattern count = 1
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
No need char
/(.*)\d+\1/Is
@ -3393,7 +3384,7 @@ Capturing subpattern count = 2
Max back reference = 2
Partial matching not supported
No options
First char at start or follows \n
First char at start or follows newline
Need char = 'z'
/((.*))\d+\1/I
@ -3430,7 +3421,6 @@ Need char = 'z' (caseless)
/(?=abc)(?i).xyz/I
Capturing subpattern count = 0
No options
Case state changes
First char = 'a'
Need char = 'z' (caseless)
@ -3553,7 +3543,7 @@ Need char = 'b'
/^a/mI
Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
First char at start or follows newline
Need char = 'a'
abcde
0: a
@ -3580,7 +3570,6 @@ Starting byte set: A B a b
/[ab](?i)cd/IS
Capturing subpattern count = 0
No options
Case state changes
No first char
Need char = 'd' (caseless)
Starting byte set: a b
@ -4503,12 +4492,12 @@ No first char
Need char = 'z'
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\M
Minimum match() limit = 8
Minimum match() recursion limit = 7
Minimum match() recursion limit = 6
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaz\M
Minimum match() limit = 32768
Minimum match() recursion limit = 43
Minimum match() recursion limit = 42
No match
/(aaa(?C1)bbb|ab)/
@ -4555,18 +4544,19 @@ Need char = 'h'
1: cd
2: gh
1C cd (2)
2G gh (2)
G gh (2) two
abcdefgh\Cone\Ctwo
0: abcdefgh
1: cd
2: gh
1C cd (2)
2C gh (2)
C cd (2) one
C gh (2) two
abcdefgh\Cthree
no parentheses with name "three"
0: abcdefgh
1: cd
2: gh
copy substring three failed -7
/(?P<Tes>)(?P<Test>)/D
------------------------------------------------------------------
@ -4616,18 +4606,18 @@ Need char = 'a'
0: zzaa
1: zz
2: aa
1C zz (2)
C zz (2) Z
zzaa\CA
0: zzaa
1: zz
2: aa
2C aa (2)
C aa (2) A
/(?P<x>eks)(?P<x>eccs)/
Failed: two named groups have the same name at offset 16
Failed: two named subpatterns have the same name at offset 16
/(?P<abc>abc(?P<def>def)(?P<abc>xyz))/
Failed: two named groups have the same name at offset 31
Failed: two named subpatterns have the same name at offset 31
"\[((?P<elem>\d+)(,(?P>elem))*)\]"
Capturing subpattern count = 3
@ -5769,7 +5759,6 @@ Failed: number too big in {} quantifier at offset 15
Capturing subpattern count = 1
Max back reference = 1
No options
Case state changes
First char = 'a' (caseless)
Need char = 'B'
abcdefghijklAkB
@ -6059,6 +6048,505 @@ No options
First char = 255
No need char
/^((?P<A>a1)|(?P<A>a2)b)/
Failed: two named subpatterns have the same name at offset 18
/^((?P<A>a1)|(?P<A>a2)b)/J
Capturing subpattern count = 3
Named capturing subpatterns:
A 2
A 3
Options: anchored dupnames
No first char
No need char
a1b\CA
0: a1
1: a1
2: a1
C a1 (2) A
a2b\CA
0: a2b
1: a2b
2: <unset>
3: a2
C a2 (2) A
** Failers
No match
a1b\CZ\CA
no parentheses with name "Z"
0: a1
1: a1
2: a1
copy substring Z failed -7
C a1 (2) A
/^(?P<A>a)(?P<A>b)/J
Capturing subpattern count = 2
Named capturing subpatterns:
A 1
A 2
Options: anchored dupnames
No first char
No need char
ab\CA
0: ab
1: a
2: b
C a (1) A
/^(?P<A>a)(?P<A>b)|cd/J
Capturing subpattern count = 2
Named capturing subpatterns:
A 1
A 2
Options: dupnames
No first char
No need char
ab\CA
0: ab
1: a
2: b
C a (1) A
cd\CA
0: cd
copy substring A failed -7
/^(?P<A>a)(?P<A>b)|cd(?P<A>ef)(?P<A>gh)/J
Capturing subpattern count = 4
Named capturing subpatterns:
A 1
A 2
A 3
A 4
Options: dupnames
No first char
No need char
cdefgh\CA
0: cdefgh
1: <unset>
2: <unset>
3: ef
4: gh
C ef (2) A
/^((?P<A>a1)|(?P<A>a2)b)/J
Capturing subpattern count = 3
Named capturing subpatterns:
A 2
A 3
Options: anchored dupnames
No first char
No need char
a1b\GA
0: a1
1: a1
2: a1
G a1 (2) A
a2b\GA
0: a2b
1: a2b
2: <unset>
3: a2
G a2 (2) A
** Failers
No match
a1b\GZ\GA
no parentheses with name "Z"
0: a1
1: a1
2: a1
copy substring Z failed -7
G a1 (2) A
/^(?P<A>a)(?P<A>b)/J
Capturing subpattern count = 2
Named capturing subpatterns:
A 1
A 2
Options: anchored dupnames
No first char
No need char
ab\GA
0: ab
1: a
2: b
G a (1) A
/^(?P<A>a)(?P<A>b)|cd/J
Capturing subpattern count = 2
Named capturing subpatterns:
A 1
A 2
Options: dupnames
No first char
No need char
ab\GA
0: ab
1: a
2: b
G a (1) A
cd\GA
0: cd
copy substring A failed -7
/^(?P<A>a)(?P<A>b)|cd(?P<A>ef)(?P<A>gh)/J
Capturing subpattern count = 4
Named capturing subpatterns:
A 1
A 2
A 3
A 4
Options: dupnames
No first char
No need char
cdefgh\GA
0: cdefgh
1: <unset>
2: <unset>
3: ef
4: gh
G ef (2) A
/(?J)^((?P<A>a1)|(?P<A>a2)b)/
Capturing subpattern count = 3
Named capturing subpatterns:
A 2
A 3
Options: anchored dupnames
No first char
No need char
a1b\CA
0: a1
1: a1
2: a1
C a1 (2) A
a2b\CA
0: a2b
1: a2b
2: <unset>
3: a2
C a2 (2) A
/^(?P<A>a) (?J:(?P<B>b)(?P<B>c)) (?P<A>d)/
Failed: two named subpatterns have the same name at offset 38
/ In this next test, J is not set at the outer level; consequently it isn't
set in the pattern's options; consequently pcre_get_named_substring() produces
a random value. /x
Capturing subpattern count = 1
Options: extended
First char = 'I'
Need char = 'e'
/^(?P<A>a) (?J:(?P<B>b)(?P<B>c)) (?P<C>d)/
Capturing subpattern count = 4
Named capturing subpatterns:
A 1
B 2
B 3
C 4
Options: anchored
No first char
No need char
a bc d\CA\CB\CC
0: a bc d
1: a
2: b
3: c
4: d
C a (1) A
C b (1) B
C d (1) C
/^(?P<A>a)?(?(A)a|b)/
Capturing subpattern count = 1
Named capturing subpatterns:
A 1
Options: anchored
No first char
No need char
aabc
0: aa
1: a
bc
0: b
** Failers
No match
abc
No match
/(?:(?(ZZ)a|b)(?P<ZZ>X))+/
Capturing subpattern count = 1
Named capturing subpatterns:
ZZ 1
No options
No first char
Need char = 'X'
bXaX
0: bXaX
1: X
/(?:(?(2y)a|b)(X))+/
Failed: reference to non-existent subpattern at offset 9
/(?:(?(ZA)a|b)(?P<ZZ>X))+/
Failed: reference to non-existent subpattern at offset 9
/(?:(?(ZZ)a|b)(?(ZZ)a|b)(?P<ZZ>X))+/
Capturing subpattern count = 1
Named capturing subpatterns:
ZZ 1
No options
No first char
Need char = 'X'
bbXaaX
0: bbXaaX
1: X
/(?:(?(ZZ)a|\(b\))\\(?P<ZZ>X))+/
Capturing subpattern count = 1
Named capturing subpatterns:
ZZ 1
No options
No first char
Need char = 'X'
(b)\\Xa\\X
0: (b)\Xa\X
1: X
/(?P<ABC/
Failed: syntax error after (?P at offset 7
/(?:(?(A)(?P=A)a|b)(?P<A>X|Y))+/
Capturing subpattern count = 1
Max back reference = 1
Named capturing subpatterns:
A 1
No options
No first char
No need char
bXXaYYaY
0: bXXaYYaY
1: Y
bXYaXXaX
0: bX
1: X
/()()()()()()()()()(?:(?(A)(?P=A)a|b)(?P<A>X|Y))+/
Capturing subpattern count = 10
Max back reference = 10
Named capturing subpatterns:
A 10
No options
No first char
No need char
bXXaYYaY
0: bXXaYYaY
1:
2:
3:
4:
5:
6:
7:
8:
9:
10: Y
/\777/
Failed: octal value is greater than \377 (not in UTF-8 mode) at offset 3
/\s*,\s*/S
Capturing subpattern count = 0
Partial matching not supported
No options
No first char
Need char = ','
Starting byte set: \x09 \x0a \x0c \x0d \x20 ,
\x0b,\x0b
0: ,
\x0c,\x0d
0: \x0c,\x0d
/^abc/m
Capturing subpattern count = 0
Options: multiline
First char at start or follows newline
Need char = 'c'
xyz\nabc
0: abc
xyz\nabc\<lf>
0: abc
xyz\r\nabc\<lf>
0: abc
xyz\rabc\<cr>
0: abc
xyz\r\nabc\<crlf>
0: abc
** Failers
No match
xyz\nabc\<cr>
No match
xyz\r\nabc\<cr>
No match
xyz\nabc\<crlf>
No match
xyz\rabc\<crlf>
No match
xyz\rabc\<lf>
No match
/abc$/m
Capturing subpattern count = 0
Options: multiline
First char = 'a'
Need char = 'c'
xyzabc
0: abc
xyzabc\n
0: abc
xyzabc\npqr
0: abc
xyzabc\r\<cr>
0: abc
xyzabc\rpqr\<cr>
0: abc
xyzabc\r\n\<crlf>
0: abc
xyzabc\r\npqr\<crlf>
0: abc
** Failers
No match
xyzabc\r
No match
xyzabc\rpqr
No match
xyzabc\r\n
No match
xyzabc\r\npqr
No match
/^abc/m<cr>
Capturing subpattern count = 0
Options: multiline
Forced newline sequence: CR
First char at start or follows newline
Need char = 'c'
xyz\rabcdef
0: abc
xyz\nabcdef\<lf>
0: abc
** Failers
No match
xyz\nabcdef
No match
/^abc/m<lf>
Capturing subpattern count = 0
Options: multiline
Forced newline sequence: LF
First char at start or follows newline
Need char = 'c'
xyz\nabcdef
0: abc
xyz\rabcdef\<cr>
0: abc
** Failers
No match
xyz\rabcdef
No match
/^abc/m<crlf>
Capturing subpattern count = 0
Options: multiline
Forced newline sequence: CRLF
First char at start or follows newline
Need char = 'c'
xyz\r\nabcdef
0: abc
xyz\rabcdef\<cr>
0: abc
** Failers
No match
xyz\rabcdef
No match
/^abc/m<bad>
Unknown newline type at: <bad>
/abc/
Capturing subpattern count = 0
No options
First char = 'a'
Need char = 'c'
xyz\rabc\<bad>
Unknown newline type at: <bad>
abc
0: abc
/.*/
Capturing subpattern count = 0
Partial matching not supported
No options
First char at start or follows newline
No need char
abc\ndef
0: abc
abc\rdef
0: abc\x0ddef
abc\r\ndef
0: abc\x0d
\<cr>abc\ndef
0: abc\x0adef
\<cr>abc\rdef
0: abc
\<cr>abc\r\ndef
0: abc
\<crlf>abc\ndef
0: abc\x0adef
\<crlf>abc\rdef
0: abc\x0ddef
\<crlf>abc\r\ndef
0: abc
/\w+(.)(.)?def/s
Capturing subpattern count = 2
Partial matching not supported
Options: dotall
No first char
Need char = 'f'
abc\ndef
0: abc\x0adef
1: \x0a
abc\rdef
0: abc\x0ddef
1: \x0d
abc\r\ndef
0: abc\x0d\x0adef
1: \x0d
2: \x0a
+((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)+
Capturing subpattern count = 1
Partial matching not supported
No options
No first char
No need char
/* this is a C style comment */\M
Minimum match() limit = 120
Minimum match() recursion limit = 6
0: /* this is a C style comment */
1: /* this is a C style comment */
/(?P<B>25[0-5]|2[0-4]\d|[01]?\d?\d)(?:\.(?P>B)){3}/
Capturing subpattern count = 1
Named capturing subpatterns:
B 1
No options
No first char
Need char = '.'
/ End of testinput2 /
Capturing subpattern count = 0
No options

View File

@ -898,8 +898,20 @@ No match
\x{85}
0: \x{85}
/^ሴ/8
0: \x{1234}
/^\ሴ/8
0: \x{1234}
"(?s)(.{1,5})"8
abcdefg
0: abcde
1: abcde
ab
0: ab
1: ab
/ End of testinput4 /

View File

@ -1107,4 +1107,17 @@ Options: anchored utf8
No first char
No need char
/\777/I
Failed: octal value is greater than \377 (not in UTF-8 mode) at offset 3
/\777/8I
Capturing subpattern count = 0
Options: utf8
First char = 199
Need char = 191
\x{1ff}
0: \x{1ff}
\777
0: \x{1ff}
/ End of testinput5 /

View File

@ -1394,4 +1394,20 @@ No match
AXY
No match
/^(\p{Z}[^\p{C}\p{Z}]+)*$/
\xa0!
0: \xa0!
1: \xa0!
/^[\pL](abc)(?1)/
AabcabcYZ
0: Aabcabc
1: abc
/([\pL]=(abc))*X/
L=abcX
0: L=abcX
1: L=abc
2: abc
/ End of testinput6 /

View File

@ -3004,13 +3004,13 @@ No match
abc\xd3
0: abc\xd3
/(abc)\500/
/(abc)\100/
abc\x40
0: abc@
abc\100
0: abc@
/(abc)\5000/
/(abc)\1000/
abc\x400
0: abc@0
abc\x40\x30
@ -6523,4 +6523,210 @@ Partial match: 123
4\P\R
0: 4
/^/mg
a\nb\nc\n
0:
0:
0:
\
0:
/(?<=C\n)^/mg
A\nC\nC\n
0:
/(?s)A?B/
AB
0: AB
aB
0: B
/(?s)A*B/
AB
0: AB
aB
0: B
/(?m)A?B/
AB
0: AB
aB
0: B
/(?m)A*B/
AB
0: AB
aB
0: B
/Content-Type\x3A[^\r\n]{6,}/
Content-Type:xxxxxyyy
0: Content-Type:xxxxxyyy
1: Content-Type:xxxxxyy
2: Content-Type:xxxxxy
/Content-Type\x3A[^\r\n]{6,}z/
Content-Type:xxxxxyyyz
0: Content-Type:xxxxxyyyz
/Content-Type\x3A[^a]{6,}/
Content-Type:xxxyyy
0: Content-Type:xxxyyy
/Content-Type\x3A[^a]{6,}z/
Content-Type:xxxyyyz
0: Content-Type:xxxyyyz
/^abc/m
xyz\nabc
0: abc
xyz\nabc\<lf>
0: abc
xyz\r\nabc\<lf>
0: abc
xyz\rabc\<cr>
0: abc
xyz\r\nabc\<crlf>
0: abc
** Failers
No match
xyz\nabc\<cr>
No match
xyz\r\nabc\<cr>
No match
xyz\nabc\<crlf>
No match
xyz\rabc\<crlf>
No match
xyz\rabc\<lf>
No match
/abc$/m
xyzabc
0: abc
xyzabc\n
0: abc
xyzabc\npqr
0: abc
xyzabc\r\<cr>
0: abc
xyzabc\rpqr\<cr>
0: abc
xyzabc\r\n\<crlf>
0: abc
xyzabc\r\npqr\<crlf>
0: abc
** Failers
No match
xyzabc\r
No match
xyzabc\rpqr
No match
xyzabc\r\n
No match
xyzabc\r\npqr
No match
/^abc/m<cr>
xyz\rabcdef
0: abc
xyz\nabcdef\<lf>
0: abc
** Failers
No match
xyz\nabcdef
No match
/^abc/m<lf>
xyz\nabcdef
0: abc
xyz\rabcdef\<cr>
0: abc
** Failers
No match
xyz\rabcdef
No match
/^abc/m<crlf>
xyz\r\nabcdef
0: abc
xyz\rabcdef\<cr>
0: abc
** Failers
No match
xyz\rabcdef
No match
/.*/
abc\ndef
0: abc
1: ab
2: a
3:
abc\rdef
0: abc\x0ddef
1: abc\x0dde
2: abc\x0dd
3: abc\x0d
4: abc
5: ab
6: a
7:
abc\r\ndef
0: abc\x0d
1: abc
2: ab
3: a
4:
\<cr>abc\ndef
0: abc\x0adef
1: abc\x0ade
2: abc\x0ad
3: abc\x0a
4: abc
5: ab
6: a
7:
\<cr>abc\rdef
0: abc
1: ab
2: a
3:
\<cr>abc\r\ndef
0: abc
1: ab
2: a
3:
\<crlf>abc\ndef
0: abc\x0adef
1: abc\x0ade
2: abc\x0ad
3: abc\x0a
4: abc
5: ab
6: a
7:
\<crlf>abc\rdef
0: abc\x0ddef
1: abc\x0dde
2: abc\x0dd
3: abc\x0d
4: abc
5: ab
6: a
7:
\<crlf>abc\r\ndef
0: abc
1: ab
2: a
3:
/\w+(.)(.)?def/s
abc\ndef
0: abc\x0adef
abc\rdef
0: abc\x0ddef
abc\r\ndef
0: abc\x0d\x0adef
/ End of testinput7 /