Merge branch 'PHP-5.3' into PHP-5.4

* PHP-5.3:
  Fixed bug #63284 PCRE upgrade to 8.31
This commit is contained in:
Anatoliy Belsky 2012-10-19 09:51:58 +02:00
commit d2fa182f25
59 changed files with 40225 additions and 29027 deletions

View File

@ -3,7 +3,7 @@
EXTENSION("pcre", "php_pcre.c", false /* never shared */,
"-Iext/pcre/pcrelib");
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucd.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_info.c pcre_maketables.c pcre_newline.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_try_flipped.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucd.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_maketables.c pcre_newline.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
ADD_DEF_FILE("ext\\pcre\\php_pcre.def");
AC_DEFINE('HAVE_BUNDLED_PCRE', 1, 'Using bundled PCRE library');

View File

@ -55,9 +55,9 @@ PHP_ARG_WITH(pcre-regex,,
pcrelib_sources="pcrelib/pcre_chartables.c pcrelib/pcre_ucd.c \
pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c \
pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c \
pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c \
pcrelib/pcre_maketables.c pcrelib/pcre_newline.c \
pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c \
pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c \
pcrelib/pcre_tables.c pcrelib/pcre_valid_utf8.c \
pcrelib/pcre_version.c pcrelib/pcre_xclass.c"
PHP_NEW_EXTENSION(pcre, $pcrelib_sources php_pcre.c, no,,-I@ext_srcdir@/pcrelib)
PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)

View File

@ -8,16 +8,38 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
All rights reserved
PCRE JUST-IN-TIME COMPILATION SUPPORT
-------------------------------------
Written by: Zoltan Herczeg
Email local part: hzmester
Email domain: freemail.hu
Copyright(c) 2010-2012 Zoltan Herczeg
All rights reserved.
STACK-LESS JUST-IN-TIME COMPILER
--------------------------------
Written by: Zoltan Herczeg
Email local part: hzmester
Email domain: freemail.hu
Copyright(c) 2009-2012 Zoltan Herczeg
All rights reserved.
THE C++ WRAPPER LIBRARY
-----------------------
Written by: Google Inc.
Copyright (c) 2007-2010 Google Inc
Copyright (c) 2007-2012 Google Inc
All rights reserved
####

View File

@ -1,6 +1,699 @@
ChangeLog for PCRE
------------------
Version 8.31 06-July-2012
-------------------------
1. Fixing a wrong JIT test case and some compiler warnings.
2. Removed a bashism from the RunTest script.
3. Add a cast to pcre_exec.c to fix the warning "unary minus operator applied
to unsigned type, result still unsigned" that was given by an MS compiler
on encountering the code "-sizeof(xxx)".
4. Partial matching support is added to the JIT compiler.
5. Fixed several bugs concerned with partial matching of items that consist
of more than one character:
(a) /^(..)\1/ did not partially match "aba" because checking references was
done on an "all or nothing" basis. This also applied to repeated
references.
(b) \R did not give a hard partial match if \r was found at the end of the
subject.
(c) \X did not give a hard partial match after matching one or more
characters at the end of the subject.
(d) When newline was set to CRLF, a pattern such as /a$/ did not recognize
a partial match for the string "\r".
(e) When newline was set to CRLF, the metacharacter "." did not recognize
a partial match for a CR character at the end of the subject string.
6. If JIT is requested using /S++ or -s++ (instead of just /S+ or -s+) when
running pcretest, the text "(JIT)" is added to the output whenever JIT is
actually used to run the match.
7. Individual JIT compile options can be set in pcretest by following -s+[+]
or /S+[+] with a digit between 1 and 7.
8. OP_NOT now supports any UTF character not just single-byte ones.
9. (*MARK) control verb is now supported by the JIT compiler.
10. The command "./RunTest list" lists the available tests without actually
running any of them. (Because I keep forgetting what they all are.)
11. Add PCRE_INFO_MAXLOOKBEHIND.
12. Applied a (slightly modified) user-supplied patch that improves performance
when the heap is used for recursion (compiled with --disable-stack-for-
recursion). Instead of malloc and free for each heap frame each time a
logical recursion happens, frames are retained on a chain and re-used where
possible. This sometimes gives as much as 30% improvement.
13. As documented, (*COMMIT) is now confined to within a recursive subpattern
call.
14. As documented, (*COMMIT) is now confined to within a positive assertion.
15. It is now possible to link pcretest with libedit as an alternative to
libreadline.
16. (*COMMIT) control verb is now supported by the JIT compiler.
17. The Unicode data tables have been updated to Unicode 6.1.0.
18. Added --file-list option to pcregrep.
19. Added binary file support to pcregrep, including the -a, --binary-files,
-I, and --text options.
20. The madvise function is renamed to posix_madvise for QNX compatibility
reasons. Fixed by Giuseppe D'Angelo.
21. Fixed a bug for backward assertions with REVERSE 0 in the JIT compiler.
22. Changed the option for creating symbolic links for 16-bit man pages from
-s to -sf so that re-installing does not cause issues.
23. Support PCRE_NO_START_OPTIMIZE in JIT as (*MARK) support requires it.
24. Fixed a very old bug in pcretest that caused errors with restarted DFA
matches in certain environments (the workspace was not being correctly
retained). Also added to pcre_dfa_exec() a simple plausibility check on
some of the workspace data at the beginning of a restart.
25. \s*\R was auto-possessifying the \s* when it should not, whereas \S*\R
was not doing so when it should - probably a typo introduced by SVN 528
(change 8.10/14).
26. When PCRE_UCP was not set, \w+\x{c4} was incorrectly auto-possessifying the
\w+ when the character tables indicated that \x{c4} was a word character.
There were several related cases, all because the tests for doing a table
lookup were testing for characters less than 127 instead of 255.
27. If a pattern contains capturing parentheses that are not used in a match,
their slots in the ovector are set to -1. For those that are higher than
any matched groups, this happens at the end of processing. In the case when
there were back references that the ovector was too small to contain
(causing temporary malloc'd memory to be used during matching), and the
highest capturing number was not used, memory off the end of the ovector
was incorrectly being set to -1. (It was using the size of the temporary
memory instead of the true size.)
28. To catch bugs like 27 using valgrind, when pcretest is asked to specify an
ovector size, it uses memory at the end of the block that it has got.
29. Check for an overlong MARK name and give an error at compile time. The
limit is 255 for the 8-bit library and 65535 for the 16-bit library.
30. JIT compiler update.
31. JIT is now supported on jailbroken iOS devices. Thanks to Ruiger
Rill for the patch.
32. Put spaces around SLJIT_PRINT_D in the JIT compiler. Required by CXX11.
33. Variable renamings in the PCRE-JIT compiler. No functionality change.
34. Fixed typos in pcregrep: in two places there was SUPPORT_LIBZ2 instead of
SUPPORT_LIBBZ2. This caused a build problem when bzip2 but not gzip (zlib)
was enabled.
35. Improve JIT code generation for greedy plus quantifier.
36. When /((?:a?)*)*c/ or /((?>a?)*)*c/ was matched against "aac", it set group
1 to "aa" instead of to an empty string. The bug affected repeated groups
that could potentially match an empty string.
37. Optimizing single character iterators in JIT.
38. Wide characters specified with \uxxxx in JavaScript mode are now subject to
the same checks as \x{...} characters in non-JavaScript mode. Specifically,
codepoints that are too big for the mode are faulted, and in a UTF mode,
disallowed codepoints are also faulted.
39. If PCRE was compiled with UTF support, in three places in the DFA
matcher there was code that should only have been obeyed in UTF mode, but
was being obeyed unconditionally. In 8-bit mode this could cause incorrect
processing when bytes with values greater than 127 were present. In 16-bit
mode the bug would be provoked by values in the range 0xdc00 to 0xdfff. In
both cases the values are those that cannot be the first data item in a UTF
character. The three items that might have provoked this were recursions,
possessively repeated groups, and atomic groups.
40. Ensure that libpcre is explicitly listed in the link commands for pcretest
and pcregrep, because some OS require shared objects to be explicitly
passed to ld, causing the link step to fail if they are not.
41. There were two incorrect #ifdefs in pcre_study.c, meaning that, in 16-bit
mode, patterns that started with \h* or \R* might be incorrectly matched.
Version 8.30 04-February-2012
-----------------------------
1. Renamed "isnumber" as "is_a_number" because in some Mac environments this
name is defined in ctype.h.
2. Fixed a bug in fixed-length calculation for lookbehinds that would show up
only in quite long subpatterns.
3. Removed the function pcre_info(), which has been obsolete and deprecated
since it was replaced by pcre_fullinfo() in February 2000.
4. For a non-anchored pattern, if (*SKIP) was given with a name that did not
match a (*MARK), and the match failed at the start of the subject, a
reference to memory before the start of the subject could occur. This bug
was introduced by fix 17 of release 8.21.
5. A reference to an unset group with zero minimum repetition was giving
totally wrong answers (in non-JavaScript-compatibility mode). For example,
/(another)?(\1?)test/ matched against "hello world test". This bug was
introduced in release 8.13.
6. Add support for 16-bit character strings (a large amount of work involving
many changes and refactorings).
7. RunGrepTest failed on msys because \r\n was replaced by whitespace when the
command "pattern=`printf 'xxx\r\njkl'`" was run. The pattern is now taken
from a file.
8. Ovector size of 2 is also supported by JIT based pcre_exec (the ovector size
rounding is not applied in this particular case).
9. The invalid Unicode surrogate codepoints U+D800 to U+DFFF are now rejected
if they appear, or are escaped, in patterns.
10. Get rid of a number of -Wunused-but-set-variable warnings.
11. The pattern /(?=(*:x))(q|)/ matches an empty string, and returns the mark
"x". The similar pattern /(?=(*:x))((*:y)q|)/ did not return a mark at all.
Oddly, Perl behaves the same way. PCRE has been fixed so that this pattern
also returns the mark "x". This bug applied to capturing parentheses,
non-capturing parentheses, and atomic parentheses. It also applied to some
assertions.
12. Stephen Kelly's patch to CMakeLists.txt allows it to parse the version
information out of configure.ac instead of relying on pcre.h.generic, which
is not stored in the repository.
13. Applied Dmitry V. Levin's patch for a more portable method for linking with
-lreadline.
14. ZH added PCRE_CONFIG_JITTARGET; added its output to pcretest -C.
15. Applied Graycode's patch to put the top-level frame on the stack rather
than the heap when not using the stack for recursion. This gives a
performance improvement in many cases when recursion is not deep.
16. Experimental code added to "pcretest -C" to output the stack frame size.
Version 8.21 12-Dec-2011
------------------------
1. Updating the JIT compiler.
2. JIT compiler now supports OP_NCREF, OP_RREF and OP_NRREF. New test cases
are added as well.
3. Fix cache-flush issue on PowerPC (It is still an experimental JIT port).
PCRE_EXTRA_TABLES is not supported by JIT, and should be checked before
calling _pcre_jit_exec. Some extra comments are added.
4. (*MARK) settings inside atomic groups that do not contain any capturing
parentheses, for example, (?>a(*:m)), were not being passed out. This bug
was introduced by change 18 for 8.20.
5. Supporting of \x, \U and \u in JavaScript compatibility mode based on the
ECMA-262 standard.
6. Lookbehinds such as (?<=a{2}b) that contained a fixed repetition were
erroneously being rejected as "not fixed length" if PCRE_CASELESS was set.
This bug was probably introduced by change 9 of 8.13.
7. While fixing 6 above, I noticed that a number of other items were being
incorrectly rejected as "not fixed length". This arose partly because newer
opcodes had not been added to the fixed-length checking code. I have (a)
corrected the bug and added tests for these items, and (b) arranged for an
error to occur if an unknown opcode is encountered while checking for fixed
length instead of just assuming "not fixed length". The items that were
rejected were: (*ACCEPT), (*COMMIT), (*FAIL), (*MARK), (*PRUNE), (*SKIP),
(*THEN), \h, \H, \v, \V, and single character negative classes with fixed
repetitions, e.g. [^a]{3}, with and without PCRE_CASELESS.
8. A possessively repeated conditional subpattern such as (?(?=c)c|d)++ was
being incorrectly compiled and would have given unpredictable results.
9. A possessively repeated subpattern with minimum repeat count greater than
one behaved incorrectly. For example, (A){2,}+ behaved as if it was
(A)(A)++ which meant that, after a subsequent mismatch, backtracking into
the first (A) could occur when it should not.
10. Add a cast and remove a redundant test from the code.
11. JIT should use pcre_malloc/pcre_free for allocation.
12. Updated pcre-config so that it no longer shows -L/usr/lib, which seems
best practice nowadays, and helps with cross-compiling. (If the exec_prefix
is anything other than /usr, -L is still shown).
13. In non-UTF-8 mode, \C is now supported in lookbehinds and DFA matching.
14. Perl does not support \N without a following name in a [] class; PCRE now
also gives an error.
15. If a forward reference was repeated with an upper limit of around 2000,
it caused the error "internal error: overran compiling workspace". The
maximum number of forward references (including repeats) was limited by the
internal workspace, and dependent on the LINK_SIZE. The code has been
rewritten so that the workspace expands (via pcre_malloc) if necessary, and
the default depends on LINK_SIZE. There is a new upper limit (for safety)
of around 200,000 forward references. While doing this, I also speeded up
the filling in of repeated forward references.
16. A repeated forward reference in a pattern such as (a)(?2){2}(.) was
incorrectly expecting the subject to contain another "a" after the start.
17. When (*SKIP:name) is activated without a corresponding (*MARK:name) earlier
in the match, the SKIP should be ignored. This was not happening; instead
the SKIP was being treated as NOMATCH. For patterns such as
/A(*MARK:A)A+(*SKIP:B)Z|AAC/ this meant that the AAC branch was never
tested.
18. The behaviour of (*MARK), (*PRUNE), and (*THEN) has been reworked and is
now much more compatible with Perl, in particular in cases where the result
is a non-match for a non-anchored pattern. For example, if
/b(*:m)f|a(*:n)w/ is matched against "abc", the non-match returns the name
"m", where previously it did not return a name. A side effect of this
change is that for partial matches, the last encountered mark name is
returned, as for non matches. A number of tests that were previously not
Perl-compatible have been moved into the Perl-compatible test files. The
refactoring has had the pleasing side effect of removing one argument from
the match() function, thus reducing its stack requirements.
19. If the /S+ option was used in pcretest to study a pattern using JIT,
subsequent uses of /S (without +) incorrectly behaved like /S+.
21. Retrieve executable code size support for the JIT compiler and fixing
some warnings.
22. A caseless match of a UTF-8 character whose other case uses fewer bytes did
not work when the shorter character appeared right at the end of the
subject string.
23. Added some (int) casts to non-JIT modules to reduce warnings on 64-bit
systems.
24. Added PCRE_INFO_JITSIZE to pass on the value from (21) above, and also
output it when the /M option is used in pcretest.
25. The CheckMan script was not being included in the distribution. Also, added
an explicit "perl" to run Perl scripts from the PrepareRelease script
because this is reportedly needed in Windows.
26. If study data was being saved in a file and studying had not found a set of
"starts with" bytes for the pattern, the data written to the file (though
never used) was taken from uninitialized memory and so caused valgrind to
complain.
27. Updated RunTest.bat as provided by Sheri Pierce.
28. Fixed a possible uninitialized memory bug in pcre_jit_compile.c.
29. Computation of memory usage for the table of capturing group names was
giving an unnecessarily large value.
Version 8.20 21-Oct-2011
------------------------
1. Change 37 of 8.13 broke patterns like [:a]...[b:] because it thought it had
a POSIX class. After further experiments with Perl, which convinced me that
Perl has bugs and confusions, a closing square bracket is no longer allowed
in a POSIX name. This bug also affected patterns with classes that started
with full stops.
2. If a pattern such as /(a)b|ac/ is matched against "ac", there is no
captured substring, but while checking the failing first alternative,
substring 1 is temporarily captured. If the output vector supplied to
pcre_exec() was not big enough for this capture, the yield of the function
was still zero ("insufficient space for captured substrings"). This cannot
be totally fixed without adding another stack variable, which seems a lot
of expense for an edge case. However, I have improved the situation in cases
such as /(a)(b)x|abc/ matched against "abc", where the return code
indicates that fewer than the maximum number of slots in the ovector have
been set.
3. Related to (2) above: when there are more back references in a pattern than
slots in the output vector, pcre_exec() uses temporary memory during
matching, and copies in the captures as far as possible afterwards. It was
using the entire output vector, but this conflicts with the specification
that only 2/3 is used for passing back captured substrings. Now it uses
only the first 2/3, for compatibility. This is, of course, another edge
case.
4. Zoltan Herczeg's just-in-time compiler support has been integrated into the
main code base, and can be used by building with --enable-jit. When this is
done, pcregrep automatically uses it unless --disable-pcregrep-jit or the
runtime --no-jit option is given.
5. When the number of matches in a pcre_dfa_exec() run exactly filled the
ovector, the return from the function was zero, implying that there were
other matches that did not fit. The correct "exactly full" value is now
returned.
6. If a subpattern that was called recursively or as a subroutine contained
(*PRUNE) or any other control that caused it to give a non-standard return,
invalid errors such as "Error -26 (nested recursion at the same subject
position)" or even infinite loops could occur.
7. If a pattern such as /a(*SKIP)c|b(*ACCEPT)|/ was studied, it stopped
computing the minimum length on reaching *ACCEPT, and so ended up with the
wrong value of 1 rather than 0. Further investigation indicates that
computing a minimum subject length in the presence of *ACCEPT is difficult
(think back references, subroutine calls), and so I have changed the code
so that no minimum is registered for a pattern that contains *ACCEPT.
8. If (*THEN) was present in the first (true) branch of a conditional group,
it was not handled as intended. [But see 16 below.]
9. Replaced RunTest.bat and CMakeLists.txt with improved versions provided by
Sheri Pierce.
10. A pathological pattern such as /(*ACCEPT)a/ was miscompiled, thinking that
the first byte in a match must be "a".
11. Change 17 for 8.13 increased the recursion depth for patterns like
/a(?:.)*?a/ drastically. I've improved things by remembering whether a
pattern contains any instances of (*THEN). If it does not, the old
optimizations are restored. It would be nice to do this on a per-group
basis, but at the moment that is not feasible.
12. In some environments, the output of pcretest -C is CRLF terminated. This
broke RunTest's code that checks for the link size. A single white space
character after the value is now allowed for.
13. RunTest now checks for the "fr" locale as well as for "fr_FR" and "french".
For "fr", it uses the Windows-specific input and output files.
14. If (*THEN) appeared in a group that was called recursively or as a
subroutine, it did not work as intended. [But see next item.]
15. Consider the pattern /A (B(*THEN)C) | D/ where A, B, C, and D are complex
pattern fragments (but not containing any | characters). If A and B are
matched, but there is a failure in C so that it backtracks to (*THEN), PCRE
was behaving differently to Perl. PCRE backtracked into A, but Perl goes to
D. In other words, Perl considers parentheses that do not contain any |
characters to be part of a surrounding alternative, whereas PCRE was
treating (B(*THEN)C) the same as (B(*THEN)C|(*FAIL)) -- which Perl handles
differently. PCRE now behaves in the same way as Perl, except in the case
of subroutine/recursion calls such as (?1) which have in any case always
been different (but PCRE had them first :-).
16. Related to 15 above: Perl does not treat the | in a conditional group as
creating alternatives. Such a group is treated in the same way as an
ordinary group without any | characters when processing (*THEN). PCRE has
been changed to match Perl's behaviour.
17. If a user had set PCREGREP_COLO(U)R to something other than 1:31, the
RunGrepTest script failed.
18. Change 22 for 8.13 caused atomic groups to use more stack. This is
inevitable for groups that contain captures, but it can lead to a lot of
stack use in large patterns. The old behaviour has been restored for atomic
groups that do not contain any capturing parentheses.
19. If the PCRE_NO_START_OPTIMIZE option was set for pcre_compile(), it did not
suppress the check for a minimum subject length at run time. (If it was
given to pcre_exec() or pcre_dfa_exec() it did work.)
20. Fixed an ASCII-dependent infelicity in pcretest that would have made it
fail to work when decoding hex characters in data strings in EBCDIC
environments.
21. It appears that in at least one Mac OS environment, the isxdigit() function
is implemented as a macro that evaluates to its argument more than once,
contravening the C 90 Standard (I haven't checked a later standard). There
was an instance in pcretest which caused it to go wrong when processing
\x{...} escapes in subject strings. The code has been rewritten to avoid using
things like p++ in the argument of isxdigit().
Version 8.13 16-Aug-2011
------------------------
1. The Unicode data tables have been updated to Unicode 6.0.0.
2. Two minor typos in pcre_internal.h have been fixed.
3. Added #include <string.h> to pcre_scanner_unittest.cc, pcrecpp.cc, and
pcrecpp_unittest.cc. They are needed for strcmp(), memset(), and strchr()
in some environments (e.g. Solaris 10/SPARC using Sun Studio 12U2).
4. There were a number of related bugs in the code for matching backreferences
caselessly in UTF-8 mode when codes for the characters concerned were
different numbers of bytes. For example, U+023A and U+2C65 are an upper
and lower case pair, using 2 and 3 bytes, respectively. The main bugs were:
(a) A reference to 3 copies of a 2-byte code matched only 2 of a 3-byte
code. (b) A reference to 2 copies of a 3-byte code would not match 2 of a
2-byte code at the end of the subject (it thought there wasn't enough data
left).
5. Comprehensive information about what went wrong is now returned by
pcre_exec() and pcre_dfa_exec() when the UTF-8 string check fails, as long
as the output vector has at least 2 elements. The offset of the start of
the failing character and a reason code are placed in the vector.
6. When the UTF-8 string check fails for pcre_compile(), the offset that is
now returned is for the first byte of the failing character, instead of the
last byte inspected. This is an incompatible change, but I hope it is small
enough not to be a problem. It makes the returned offset consistent with
pcre_exec() and pcre_dfa_exec().
7. pcretest now gives a text phrase as well as the error number when
pcre_exec() or pcre_dfa_exec() fails; if the error is a UTF-8 check
failure, the offset and reason code are output.
8. When \R was used with a maximizing quantifier it failed to skip backwards
over a \r\n pair if the subsequent match failed. Instead, it just skipped
back over a single character (\n). This seems wrong (because it treated the
two characters as a single entity when going forwards), conflicts with the
documentation that \R is equivalent to (?>\r\n|\n|...etc), and makes the
behaviour of \R* different to (\R)*, which also seems wrong. The behaviour
has been changed.
9. Some internal refactoring has changed the processing so that the handling
of the PCRE_CASELESS and PCRE_MULTILINE options is done entirely at compile
time (the PCRE_DOTALL option was changed this way some time ago: version
7.7 change 16). This has made it possible to abolish the OP_OPT op code,
which was always a bit of a fudge. It also means that there is one less
argument for the match() function, which reduces its stack requirements
slightly. This change also fixes an incompatibility with Perl: the pattern
(?i:([^b]))(?1) should not match "ab", but previously PCRE gave a match.
10. More internal refactoring has drastically reduced the number of recursive
calls to match() for possessively repeated groups such as (abc)++ when
using pcre_exec().
11. While implementing 10, a number of bugs in the handling of groups were
discovered and fixed:
(?<=(a)+) was not diagnosed as invalid (non-fixed-length lookbehind).
(a|)*(?1) gave a compile-time internal error.
((a|)+)+ did not notice that the outer group could match an empty string.
(^a|^)+ was not marked as anchored.
(.*a|.*)+ was not marked as matching at start or after a newline.
12. Yet more internal refactoring has removed another argument from the match()
function. Special calls to this function are now indicated by setting a
value in a variable in the "match data" data block.
13. Be more explicit in pcre_study() instead of relying on "default" for
opcodes that mean there is no starting character; this means that when new
ones are added and accidentally left out of pcre_study(), testing should
pick them up.
14. The -s option of pcretest has been documented for ages as being an old
synonym of -m (show memory usage). I have changed it to mean "force study
for every regex", that is, assume /S for every regex. This is similar to -i
and -d etc. It's slightly incompatible, but I'm hoping nobody is still
using it. It makes it easier to run collections of tests with and without
study enabled, and thereby test pcre_study() more easily. All the standard
tests are now run with and without -s (but some patterns can be marked as
"never study" - see 20 below).
15. When (*ACCEPT) was used in a subpattern that was called recursively, the
restoration of the capturing data to the outer values was not happening
correctly.
16. If a recursively called subpattern ended with (*ACCEPT) and matched an
empty string, and PCRE_NOTEMPTY was set, pcre_exec() thought the whole
pattern had matched an empty string, and so incorrectly returned a no
match.
17. There was optimizing code for the last branch of non-capturing parentheses,
and also for the obeyed branch of a conditional subexpression, which used
tail recursion to cut down on stack usage. Unfortunately, now that there is
the possibility of (*THEN) occurring in these branches, tail recursion is
no longer possible because the return has to be checked for (*THEN). These
two optimizations have therefore been removed. [But see 8.20/11 above.]
18. If a pattern containing \R was studied, it was assumed that \R always
matched two bytes, thus causing the minimum subject length to be
incorrectly computed because \R can also match just one byte.
19. If a pattern containing (*ACCEPT) was studied, the minimum subject length
was incorrectly computed.
20. If /S is present twice on a test pattern in pcretest input, it now
*disables* studying, thereby overriding the use of -s on the command line
(see 14 above). This is necessary for one or two tests to keep the output
identical in both cases.
21. When (*ACCEPT) was used in an assertion that matched an empty string and
PCRE_NOTEMPTY was set, PCRE applied the non-empty test to the assertion.
22. When an atomic group that contained a capturing parenthesis was
successfully matched, but the branch in which it appeared failed, the
capturing was not being forgotten if a higher numbered group was later
captured. For example, /(?>(a))b|(a)c/ when matching "ac" set capturing
group 1 to "a", when in fact it should be unset. This applied to multi-
branched capturing and non-capturing groups, repeated or not, and also to
positive assertions (capturing in negative assertions does not happen
in PCRE) and also to nested atomic groups.
23. Add the ++ qualifier feature to pcretest, to show the remainder of the
subject after a captured substring, to make it easier to tell which of a
number of identical substrings has been captured.
24. The way atomic groups are processed by pcre_exec() has been changed so that
if they are repeated, backtracking one repetition now resets captured
values correctly. For example, if ((?>(a+)b)+aabab) is matched against
"aaaabaaabaabab" the value of captured group 2 is now correctly recorded as
"aaa". Previously, it would have been "a". As part of this code
refactoring, the way recursive calls are handled has also been changed.
25. If an assertion condition captured any substrings, they were not passed
back unless some other capturing happened later. For example, if
(?(?=(a))a) was matched against "a", no capturing was returned.
26. When studying a pattern that contained subroutine calls or assertions,
the code for finding the minimum length of a possible match was handling
direct recursions such as (xxx(?1)|yyy) but not mutual recursions (where
group 1 called group 2 while simultaneously a separate group 2 called group
1). A stack overflow occurred in this case. I have fixed this by limiting
the recursion depth to 10.
27. Updated RunTest.bat in the distribution to the version supplied by Tom
Fortmann. This supports explicit test numbers on the command line, and has
argument validation and error reporting.
28. An instance of \X with an unlimited repeat could fail if at any point the
first character it looked at was a mark character.
29. Some minor code refactoring concerning Unicode properties and scripts
should reduce the stack requirement of match() slightly.
30. Added the '=' option to pcretest to check the setting of unused capturing
slots at the end of the pattern, which are documented as being -1, but are
not included in the return count.
31. If \k was not followed by a braced, angle-bracketed, or quoted name, PCRE
compiled something random. Now it gives a compile-time error (as does
Perl).
32. A *MARK encountered during the processing of a positive assertion is now
recorded and passed back (compatible with Perl).
33. If --only-matching or --colour was set on a pcregrep call whose pattern
had alternative anchored branches, the search for a second match in a line
was done as if at the line start. Thus, for example, /^01|^02/ incorrectly
matched the line "0102" twice. The same bug affected patterns that started
with a backwards assertion. For example /\b01|\b02/ also matched "0102"
twice.
34. Previously, PCRE did not allow quantification of assertions. However, Perl
does, and because of capturing effects, quantifying parenthesized
assertions may at times be useful. Quantifiers are now allowed for
parenthesized assertions.
35. A minor code tidy in pcre_compile() when checking options for \R usage.
36. \g was being checked for fancy things in a character class, when it should
just be a literal "g".
37. PCRE was rejecting [:a[:digit:]] whereas Perl was not. It seems that the
appearance of a nested POSIX class supersedes an apparent external class.
For example, [:a[:digit:]b:] matches "a", "b", ":", or a digit. Also,
unescaped square brackets may also appear as part of class names. For
example, [:a[:abc]b:] gives unknown class "[:abc]b:]". PCRE now behaves
more like Perl. (But see 8.20/1 above.)
38. PCRE was giving an error for \N with a braced quantifier such as {1,} (this
was because it thought it was \N{name}, which is not supported).
39. Add minix to OS list not supporting the -S option in pcretest.
40. PCRE tries to detect cases of infinite recursion at compile time, but it
cannot analyze patterns in sufficient detail to catch mutual recursions
such as ((?1))((?2)). There is now a runtime test that gives an error if a
subgroup is called recursively as a subpattern for a second time at the
same position in the subject string. In previous releases this might have
been caught by the recursion limit, or it might have run out of stack.
41. A pattern such as /(?(R)a+|(?R)b)/ is quite safe, as the recursion can
happen only once. PCRE was, however, incorrectly giving a compile time error
"recursive call could loop indefinitely" because it cannot analyze the
pattern in sufficient detail. The compile time test no longer happens when
PCRE is compiling a conditional subpattern, but actual runaway loops are
now caught at runtime (see 40 above).
42. It seems that Perl allows any characters other than a closing parenthesis
to be part of the NAME in (*MARK:NAME) and other backtracking verbs. PCRE
has been changed to be the same.
43. Updated configure.ac to put in more quoting round AC_LANG_PROGRAM etc. so
as not to get warnings when autogen.sh is called. Also changed
AC_PROG_LIBTOOL (deprecated) to LT_INIT (the current macro).
44. To help people who use pcregrep to scan files containing exceedingly long
lines, the following changes have been made:
(a) The default value of the buffer size parameter has been increased from
8K to 20K. (The actual buffer used is three times this size.)
(b) The default can be changed by ./configure --with-pcregrep-bufsize when
PCRE is built.
(c) A --buffer-size=n option has been added to pcregrep, to allow the size
to be set at run time.
(d) Numerical values in pcregrep options can be followed by K or M, for
example --buffer-size=50K.
(e) If a line being scanned overflows pcregrep's buffer, an error is now
given and the return code is set to 2.
45. Add a pointer to the latest mark to the callout data block.
46. The pattern /.(*F)/, when applied to "abc" with PCRE_PARTIAL_HARD, gave a
partial match of an empty string instead of no match. This was specific to
the use of ".".
47. The pattern /f.*/8s, when applied to "for" with PCRE_PARTIAL_HARD, gave a
complete match instead of a partial match. This bug was dependent on both
the PCRE_UTF8 and PCRE_DOTALL options being set.
48. For a pattern such as /\babc|\bdef/ pcre_study() was failing to set up the
starting byte set, because \b was not being ignored.
Version 8.12 15-Jan-2011
------------------------

View File

@ -2,7 +2,8 @@ Technical Notes about PCRE
--------------------------
These are very rough technical notes that record potentially useful information
about PCRE internals.
about PCRE internals. For information about testing PCRE, see the pcretest
documentation and the comment at the head of the RunTest file.
Historical note 1
@ -48,6 +49,18 @@ complexity in Perl regular expressions, I couldn't do this. In any case, a
first pass through the pattern is helpful for other reasons.
Support for 16-bit data strings
-------------------------------
From release 8.30, PCRE supports 16-bit as well as 8-bit data strings, by being
compilable in either 8-bit or 16-bit modes, or both. Thus, two different
libraries can be created. In the description that follows, the word "short" is
used for a 16-bit data quantity, and the word "unit" is used for a quantity
that is a byte in 8-bit mode and a short in 16-bit mode. However, so as not to
over-complicate the text, the names of PCRE functions are given in 8-bit form
only.
Computing the memory requirement: how it was
--------------------------------------------
@ -68,7 +81,7 @@ things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then
I had a flash of inspiration as to how I could run the real compile function in
a "fake" mode that enables it to compute how much memory it would need, while
actually only ever using a few hundred bytes of working memory, and without too
many tests of the mode that might slow it down. So I re-factored the compiling
many tests of the mode that might slow it down. So I refactored the compiling
functions to work this way. This got rid of about 600 lines of source. It
should make future maintenance and development easier. As this was such a major
change, I never released 6.8, instead upping the number to 7.0 (other quite
@ -88,7 +101,10 @@ The "traditional", and original, matching function is called pcre_exec(), and
it implements an NFA algorithm, similar to the original Henry Spencer algorithm
and the way that Perl works. This is not surprising, since it is intended to be
as compatible with Perl as possible. This is the function most users of PCRE
will use most of the time.
will use most of the time. From release 8.20, if PCRE is compiled with
just-in-time (JIT) support, and studying a compiled pattern with JIT is
successful, the JIT code is run instead of the normal pcre_exec() code, but the
result is the same.
Supplementary matching function
@ -108,28 +124,38 @@ needed at compile time to produce a traditional FSM where only one state is
ever active at once. I believe some other regex matchers work this way.
Changeable options
------------------
The /i, /m, or /s options (PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL) may
change in the middle of patterns. From PCRE 8.13, their processing is handled
entirely at compile time by generating different opcodes for the different
settings. The runtime functions do not need to keep track of an options state
any more.
Format of compiled patterns
---------------------------
The compiled form of a pattern is a vector of bytes, containing items of
variable length. The first byte in an item is an opcode, and the length of the
item is either implicit in the opcode or contained in the data bytes that
follow it.
The compiled form of a pattern is a vector of units (bytes in 8-bit mode, or
shorts in 16-bit mode), containing items of variable length. The first unit in
an item contains an opcode, and the length of the item is either implicit in
the opcode or contained in the data that follows it.
In many cases below LINK_SIZE data values are specified for offsets within the
compiled pattern. The default value for LINK_SIZE is 2, but PCRE can be
compiled to use 3-byte or 4-byte values for these offsets (impairing the
performance). This is necessary only when patterns whose compiled length is
greater than 64K are going to be processed. In this description, we assume the
"normal" compilation options. Data values that are counts (e.g. for
quantifiers) are always just two bytes long.
A list of the opcodes follows:
In many cases listed below, LINK_SIZE data values are specified for offsets
within the compiled pattern. LINK_SIZE always specifies a number of bytes. The
default value for LINK_SIZE is 2, but PCRE can be compiled to use 3-byte or
4-byte values for these offsets, although this impairs the performance. (3-byte
LINK_SIZE values are available only in 8-bit mode.) Specifying a LINK_SIZE
larger than 2 is necessary only when patterns whose compiled length is greater
than 64K are going to be processed. In this description, we assume the "normal"
compilation options. Data values that are counts (e.g. for quantifiers) are
always just two bytes long (one short in 16-bit mode).
Opcodes with no following data
------------------------------
These items are all just one byte long
These items are all just one unit long
OP_END end of pattern
OP_ANY match any one character other than newline
@ -138,7 +164,8 @@ These items are all just one byte long
OP_SOD match start of data: \A
OP_SOM, start of match (subject + offset): \G
OP_SET_SOM, set start of match (\K)
OP_CIRC ^ (start of data, or after \n in multiline)
OP_CIRC ^ (start of data)
OP_CIRCM ^ multiline mode (start of data or after newline)
OP_NOT_WORD_BOUNDARY \B
OP_WORD_BOUNDARY \b
OP_NOT_DIGIT \D
@ -153,7 +180,8 @@ These items are all just one byte long
OP_WORDCHAR \w
OP_EODN match end of data or \n at end: \Z
OP_EOD match end of data: \z
OP_DOLL $ (end of data, or before \n in multiline)
OP_DOLL $ (end of data, or before final newline)
OP_DOLLM $ multiline mode (end of data or before newline)
OP_EXTUNI match an extended Unicode character
OP_ANYNL match any Unicode newline sequence
@ -164,49 +192,57 @@ These items are all just one byte long
OP_SKIP ) indicating which parentheses must be closed.
Backtracking control verbs with data
------------------------------------
OP_THEN is followed by a LINK_SIZE offset, which is the distance back to the
start of the current branch.
Backtracking control verbs with (optional) data
-----------------------------------------------
OP_MARK is followed by the mark name, preceded by a one-byte length, and
followed by a binary zero. For (*PRUNE), (*SKIP), and (*THEN) with arguments,
the opcodes OP_PRUNE_ARG, OP_SKIP_ARG, and OP_THEN_ARG are used. For the first
two, the name follows immediately; for OP_THEN_ARG, it follows the LINK_SIZE
offset value.
(*THEN) without an argument generates the opcode OP_THEN and no following data.
OP_MARK is followed by the mark name, preceded by a one-unit length, and
followed by a binary zero. For (*PRUNE), (*SKIP), and (*THEN) with arguments,
the opcodes OP_PRUNE_ARG, OP_SKIP_ARG, and OP_THEN_ARG are used, with the name
following in the same format.
Matching literal characters
---------------------------
The OP_CHAR opcode is followed by a single character that is to be matched
casefully. For caseless matching, OP_CHARI is used. In UTF-8 or UTF-16 modes,
the character may be more than one unit long.
Repeating single characters
---------------------------
The common repeats (*, +, ?) when applied to a single character use the
following opcodes:
The common repeats (*, +, ?), when applied to a single character, use the
following opcodes, which come in caseful and caseless versions:
OP_STAR
OP_MINSTAR
OP_POSSTAR
OP_PLUS
OP_MINPLUS
OP_POSPLUS
OP_QUERY
OP_MINQUERY
OP_POSQUERY
Caseful Caseless
OP_STAR OP_STARI
OP_MINSTAR OP_MINSTARI
OP_POSSTAR OP_POSSTARI
OP_PLUS OP_PLUSI
OP_MINPLUS OP_MINPLUSI
OP_POSPLUS OP_POSPLUSI
OP_QUERY OP_QUERYI
OP_MINQUERY OP_MINQUERYI
OP_POSQUERY OP_POSQUERYI
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
Those with "MIN" in their name are the minimizing versions. Those with "POS" in
their names are possessive versions. Each is followed by the character that is
to be repeated. Other repeats make use of
Each opcode is followed by the character that is to be repeated. In ASCII mode,
these are two-unit items; in UTF-8 or UTF-16 modes, the length is variable.
Those with "MIN" in their names are the minimizing versions. Those with "POS"
in their names are possessive versions. Other repeats make use of these
opcodes:
OP_UPTO
OP_MINUPTO
OP_POSUPTO
OP_EXACT
Caseful Caseless
OP_UPTO OP_UPTOI
OP_MINUPTO OP_MINUPTOI
OP_POSUPTO OP_POSUPTOI
OP_EXACT OP_EXACTI
which are followed by a two-byte count (most significant first) and the
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
Each of these is followed by a two-byte (one short) count (most significant
byte first in 8-bit mode) and then the repeated character. OP_UPTO matches from
0 to the given number. A repeat with a non-zero minimum and a fixed maximum is
coded as an OP_EXACT followed by an OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
Repeating character types
@ -214,7 +250,7 @@ Repeating character types
Repeats of things like \d are done exactly as for single characters, except
that instead of a character, the opcode for the type is stored in the data
byte. The opcodes are:
unit. The opcodes are:
OP_TYPESTAR
OP_TYPEMINSTAR
@ -236,65 +272,58 @@ Match by Unicode property
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
character by testing its Unicode property (the \p and \P escape sequences).
Each is followed by two bytes that encode the desired property as a type and a
Each is followed by two units that encode the desired property as a type and a
value.
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
three bytes: OP_PROP or OP_NOTPROP and then the desired property type and
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
three units: OP_PROP or OP_NOTPROP, and then the desired property type and
value.
Matching literal characters
---------------------------
The OP_CHAR opcode is followed by a single character that is to be matched
casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
character may be more than one byte long. (Earlier versions of PCRE used
multi-character strings, but this was changed to allow some new features to be
added.)
Character classes
-----------------
If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
class, and OP_NOT for a negative one (that is, for something like [^a]).
However, in UTF-8 mode, the use of OP_NOT applies only to characters with
values < 128, because OP_NOT is confined to single bytes.
If there is only one character in the class, OP_CHAR or OP_CHARI is used for a
positive class, and OP_NOT or OP_NOTI for a negative one (that is, for
something like [^a]).
Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
negated, single-character class. The normal ones (OP_STAR etc.) are used for a
repeated positive single-character class.
Another set of 13 repeating opcodes (called OP_NOTSTAR etc.) are used for
repeated, negated, single-character classes. The normal single-character
opcodes (OP_STAR, etc.) are used for repeated positive single-character
classes.
When there's more than one character in a class and all the characters are less
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
one. In either case, the opcode is followed by a 32-byte bit map containing a 1
bit for every character that is acceptable. The bits are counted from the least
significant end of each byte.
When there is more than one character in a class and all the characters are
less than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a
negative one. In either case, the opcode is followed by a 32-byte (16-short)
bit map containing a 1 bit for every character that is acceptable. The bits are
counted from the least significant end of each unit. In caseless mode, bits for
both cases are set.
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
subject characters with values greater than 256 can be handled correctly. For
OP_CLASS they don't match, whereas for OP_NCLASS they do.
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8/16 mode,
subject characters with values greater than 255 can be handled correctly. For
OP_CLASS they do not match, whereas for OP_NCLASS they do.
For classes containing characters with values > 255, OP_XCLASS is used. It
optionally uses a bit map (if any characters lie within it), followed by a list
of pairs and single characters. There is a flag character that indicates
whether it's a positive or a negative class.
For classes containing characters with values greater than 255, OP_XCLASS is
used. It optionally uses a bit map (if any characters lie within it), followed
by a list of pairs (for a range) and single characters. In caseless mode, both
cases are explicitly listed. There is a flag character that indicates whether
it is a positive or a negative class.
Back references
---------------
OP_REF is followed by two bytes containing the reference number.
OP_REF (caseful) or OP_REFI (caseless) is followed by two bytes (one short)
containing the reference number.
Repeating character classes and back references
-----------------------------------------------
Single-character classes are handled specially (see above). This section
applies to OP_CLASS and OP_REF. In both cases, the repeat information follows
the base item. The matching code looks at the following opcode to see if it is
one of
applies to OP_CLASS and OP_REF[I]. In both cases, the repeat information
follows the base item. The matching code looks at the following opcode to see
if it is one of
OP_CRSTAR
OP_CRMINSTAR
@ -305,10 +334,10 @@ one of
OP_CRRANGE
OP_CRMINRANGE
All but the last two are just single-byte items. The others are followed by
four bytes of data, comprising the minimum and maximum repeat counts. There are
no special possessive opcodes for these repeats; a possessive repeat is
compiled into an atomic group.
All but the last two are just single-unit items. The others are followed by
four bytes (two shorts) of data, comprising the minimum and maximum repeat
counts. There are no special possessive opcodes for these repeats; a possessive
repeat is compiled into an atomic group.
Brackets and alternation
@ -318,7 +347,8 @@ A pair of non-capturing (round) brackets is wrapped round each expression at
compile time, so alternation always happens in the context of brackets.
[Note for North Americans: "bracket" to some English speakers, including
myself, can be round, square, curly, or pointy. Hence this usage.]
myself, can be round, square, curly, or pointy. Hence this usage rather than
"parentheses".]
Non-capturing brackets use the opcode OP_BRA. Originally PCRE was limited to 99
capturing brackets and it used a different opcode for each one. From release
@ -330,16 +360,17 @@ A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
next alternative OP_ALT or, if there aren't any branches, to the matching
OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
the next one, or to the OP_KET opcode. For capturing brackets, the bracket
number immediately follows the offset, always as a 2-byte item.
number immediately follows the offset, always as a 2-byte (one short) item.
OP_KET is used for subpatterns that do not repeat indefinitely, while
OP_KET is used for subpatterns that do not repeat indefinitely, and
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
positive number) the offset back to the matching bracket opcode.
maximally respectively (see below for possessive repetitions). All three are
followed by LINK_SIZE bytes giving (as a positive number) the offset back to
the matching bracket opcode.
If a subpattern is quantified such that it is permitted to match zero times, it
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
single-byte opcodes that tell the matcher that skipping the following
single-unit opcodes that tell the matcher that skipping the following
subpattern entirely is a valid branch. In the case of the first two, not
skipping the pattern is also valid (greedy and non-greedy). The third is used
when a pattern has the quantifier {0,0}. It cannot be entirely discarded,
@ -362,6 +393,15 @@ final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher
that it needs to check for matching an empty string when it hits OP_KETRMIN or
OP_KETRMAX, and if so, to break the loop.
Possessive brackets
-------------------
When a repeated group (capturing or non-capturing) is marked as possessive by
the "+" notation, e.g. (abc)++, different opcodes are used. Their names all
have POS on the end, e.g. OP_BRAPOS instead of OP_BRA and OP_SCBRAPOS instead
of OP_SCBRA. The end of such a group is marked by OP_KETRPOS. If the minimum
repetition is zero, the group is preceded by OP_BRAPOSZERO.
Assertions
----------
@ -369,11 +409,11 @@ Assertions
Forward assertions are just like other subpatterns, but starting with one of
the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
is OP_REVERSE, followed by a two byte count of the number of characters to move
back the pointer in the subject string. When operating in UTF-8 mode, the count
is a character count rather than a byte count. A separate count is present in
each alternative of a lookbehind assertion, allowing them to have different
fixed lengths.
is OP_REVERSE, followed by a two-byte (one short) count of the number of
characters to move back the pointer in the subject string. In ASCII mode, the
count is a number of units, but in UTF-8/16 mode each character may occupy more
than one unit. A separate count is present in each alternative of a lookbehind
assertion, allowing them to have different fixed lengths.
Once-only (atomic) subpatterns
@ -390,14 +430,15 @@ Conditional subpatterns
These are like other subpatterns, but they start with the opcode OP_COND, or
OP_SCOND for one that might match an empty string in an unbounded repeat. If
the condition is a back reference, this is stored at the start of the
subpattern using the opcode OP_CREF followed by two bytes containing the
reference number. OP_NCREF is used instead if the reference was generated by
name (so that the runtime code knows to check for duplicate names).
subpattern using the opcode OP_CREF followed by two bytes (one short)
containing the reference number. OP_NCREF is used instead if the reference was
generated by name (so that the runtime code knows to check for duplicate
names).
If the condition is "in recursion" (coded as "(?(R)"), or "in recursion of
group x" (coded as "(?(Rx)"), the group number is stored at the start of the
subpattern using the opcode OP_RREF or OP_NRREF (cf OP_NCREF), and a value of
zero for "the whole pattern". For a DEFINE condition, just the single byte
zero for "the whole pattern". For a DEFINE condition, just the single unit
OP_DEF is used (it has no associated data). Otherwise, a conditional subpattern
always starts with one of the assertions.
@ -416,25 +457,12 @@ are not strictly a recursion.
Callout
-------
OP_CALLOUT is followed by one byte of data that holds a callout number in the
OP_CALLOUT is followed by one unit of data that holds a callout number in the
range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
cases there follows a two-byte value giving the offset in the pattern to the
start of the following item, and another two-byte item giving the length of the
next item.
cases there follows a two-byte (one short) value giving the offset in the
pattern to the start of the following item, and another two-byte (one short)
item giving the length of the next item.
Changing options
----------------
If any of the /i, /m, or /s options are changed within a pattern, an OP_OPT
opcode is compiled, followed by one byte containing the new settings of these
flags. If there are several alternatives, there is an occurrence of OP_OPT at
the start of all those following the first options change, to set appropriate
options for the start of the alternative. Immediately after the end of the
group there is another such item to reset the flags to their previous values. A
change of flag right at the very start of the pattern can be handled entirely
at compile time, and so does not cause anything to be put into the compiled
data.
Philip Hazel
October 2010
February 2012

View File

@ -9,7 +9,9 @@ specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.
The basic library functions are written in C and are freestanding. Also
included in the distribution is a set of C++ wrapper functions.
included in the distribution is a set of C++ wrapper functions, and a
just-in-time compiler that can be used to optimize pattern matching. These
are both optional features that can be omitted when the library is built.
THE BASIC LIBRARY FUNCTIONS
@ -22,7 +24,29 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
All rights reserved.
PCRE JUST-IN-TIME COMPILATION SUPPORT
-------------------------------------
Written by: Zoltan Herczeg
Email local part: hzmester
Email domain: freemail.hu
Copyright(c) 2010-2012 Zoltan Herczeg
All rights reserved.
STACK-LESS JUST-IN-TIME COMPILER
--------------------------------
Written by: Zoltan Herczeg
Email local part: hzmester
Email domain: freemail.hu
Copyright(c) 2009-2012 Zoltan Herczeg
All rights reserved.
@ -31,7 +55,7 @@ THE C++ WRAPPER FUNCTIONS
Contributed by: Google Inc.
Copyright (c) 2007-2010, Google Inc.
Copyright (c) 2007-2012, Google Inc.
All rights reserved.

View File

@ -1,6 +1,82 @@
News about PCRE releases
------------------------
Release 8.31 06-July-2012
-------------------------
This is mainly a bug-fixing release, with a small number of developments:
. The JIT compiler now supports partial matching and the (*MARK) and
(*COMMIT) verbs.
. PCRE_INFO_MAXLOOKBEHIND can be used to find the longest lookbehind in a
pattern.
. There should be a performance improvement when using the heap instead of the
stack for recursion.
. pcregrep can now be linked with libedit as an alternative to libreadline.
. pcregrep now has a --file-list option where the list of files to scan is
given as a file.
. pcregrep now recognizes binary files and there are related options.
. The Unicode tables have been updated to 6.1.0.
As always, the full list of changes is in the ChangeLog file.
Release 8.30 04-February-2012
-----------------------------
Release 8.30 introduces a major new feature: support for 16-bit character
strings, compiled as a separate library. There are a few changes to the
8-bit library, in addition to some bug fixes.
. The pcre_info() function, which has been obsolete for over 10 years, has
been removed.
. When a compiled pattern was saved to a file and later reloaded on a host
with different endianness, PCRE used automatically to swap the bytes in some
of the data fields. With the advent of the 16-bit library, where more of this
swapping is needed, it is no longer done automatically. Instead, the bad
endianness is detected and a specific error is given. The user can then call
a new function called pcre_pattern_to_host_byte_order() (or an equivalent
16-bit function) to do the swap.
. In UTF-8 mode, the values 0xd800 to 0xdfff are not legal Unicode
code points and are now faulted. (They are the so-called "surrogates"
that are reserved for coding high values in UTF-16.)
Release 8.21 12-Dec-2011
------------------------
This is almost entirely a bug-fix release. The only new feature is the ability
to obtain the size of the memory used by the JIT compiler.
Release 8.20 21-Oct-2011
------------------------
The main change in this release is the inclusion of Zoltan Herczeg's
just-in-time compiler support, which can be accessed by building PCRE with
--enable-jit. Large performance benefits can be had in many situations. 8.20
also fixes an unfortunate bug that was introduced in 8.13 as well as tidying up
a number of infelicities and differences from Perl.
Release 8.13 16-Aug-2011
------------------------
This is mainly a bug-fix release. There has been a lot of internal refactoring.
The Unicode tables have been updated. The only new feature in the library is
the passing of *MARK information to callouts. Some additions have been made to
pcretest to make testing easier and more comprehensive. There is a new option
for pcregrep to adjust its internal buffer size.
Release 8.12 15-Jan-2011
------------------------

View File

@ -1,501 +1,7 @@
Compiling PCRE on non-Unix systems
----------------------------------
This document contains the following sections:
This has been renamed to better reflect its contents. Please see the file
NON-AUTOTOOLS-BUILD for details of how to build PCRE without using autotools.
General
Generic instructions for the PCRE C library
The C++ wrapper functions
Building for virtual Pascal
Stack size in Windows environments
Linking programs in Windows environments
Comments about Win32 builds
Building PCRE on Windows with CMake
Use of relative paths with CMake on Windows
Testing with RunTest.bat
Building under Windows with BCC5.5
Building PCRE on OpenVMS
Building PCRE on Stratus OpenVOS
GENERAL
I (Philip Hazel) have no experience of Windows or VMS systems and how their
libraries work. The items in the PCRE distribution and Makefile that relate to
anything other than Unix-like systems are untested by me.
There are some other comments and files (including some documentation in CHM
format) in the Contrib directory on the FTP site:
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
If you want to compile PCRE for a non-Unix system (especially for a system that
does not support "configure" and "make" files), note that the basic PCRE
library consists entirely of code written in Standard C, and so should compile
successfully on any system that has a Standard C compiler and library. The C++
wrapper functions are a separate issue (see below).
The PCRE distribution includes a "configure" file for use by the Configure/Make
build system, as found in many Unix-like environments. There is also support
for CMake, which some users prefer, especially in Windows environments.
There are some instructions for CMake under Windows in the section entitled
"Building PCRE with CMake" below. CMake can also be used to build PCRE in
Unix-like systems.
GENERIC INSTRUCTIONS FOR THE PCRE C LIBRARY
The following are generic comments about building the PCRE C library "by hand".
(1) Copy or rename the file config.h.generic as config.h, and edit the macro
settings that it contains to whatever is appropriate for your environment.
In particular, if you want to force a specific value for newline, you can
define the NEWLINE macro. When you compile any of the PCRE modules, you
must specify -DHAVE_CONFIG_H to your compiler so that config.h is included
in the sources.
An alternative approach is not to edit config.h, but to use -D on the
compiler command line to make any changes that you need to the
configuration options. In this case -DHAVE_CONFIG_H must not be set.
NOTE: There have been occasions when the way in which certain parameters
in config.h are used has changed between releases. (In the configure/make
world, this is handled automatically.) When upgrading to a new release,
you are strongly advised to review config.h.generic before re-using what
you had previously.
(2) Copy or rename the file pcre.h.generic as pcre.h.
(3) EITHER:
Copy or rename file pcre_chartables.c.dist as pcre_chartables.c.
OR:
Compile dftables.c as a stand-alone program (using -DHAVE_CONFIG_H if
you have set up config.h), and then run it with the single argument
"pcre_chartables.c". This generates a set of standard character tables
and writes them to that file. The tables are generated using the default
C locale for your system. If you want to use a locale that is specified
by LC_xxx environment variables, add the -L option to the dftables
command. You must use this method if you are building on a system that
uses EBCDIC code.
The tables in pcre_chartables.c are defaults. The caller of PCRE can
specify alternative tables at run time.
(4) Ensure that you have the following header files:
pcre_internal.h
ucp.h
(5) Also ensure that you have the following file, which is #included as source
when building a debugging version of PCRE, and is also used by pcretest.
pcre_printint.src
(6) Compile the following source files, setting -DHAVE_CONFIG_H as a compiler
option if you have set up config.h with your configuration, or else use
other -D settings to change the configuration as required.
pcre_chartables.c
pcre_compile.c
pcre_config.c
pcre_dfa_exec.c
pcre_exec.c
pcre_fullinfo.c
pcre_get.c
pcre_globals.c
pcre_info.c
pcre_maketables.c
pcre_newline.c
pcre_ord2utf8.c
pcre_refcount.c
pcre_study.c
pcre_tables.c
pcre_try_flipped.c
pcre_ucd.c
pcre_valid_utf8.c
pcre_version.c
pcre_xclass.c
Make sure that you include -I. in the compiler command (or equivalent for
an unusual compiler) so that all included PCRE header files are first
sought in the current directory. Otherwise you run the risk of picking up
a previously-installed file from somewhere else.
(7) Now link all the compiled code into an object library in whichever form
your system keeps such libraries. This is the basic PCRE C library. If
your system has static and shared libraries, you may have to do this once
for each type.
(8) Similarly, if you want to build the POSIX wrapper functions, ensure that
you have the pcreposix.h file and then compile pcreposix.c (remembering
-DHAVE_CONFIG_H if necessary). Link the result (on its own) as the
pcreposix library.
(9) Compile the test program pcretest.c (again, don't forget -DHAVE_CONFIG_H).
This needs the functions in the PCRE library when linking. It also needs
the pcreposix wrapper functions unless you compile it with -DNOPOSIX. The
pcretest.c program also needs the pcre_printint.src source file, which it
#includes.
(10) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. Note that the
supplied files are in Unix format, with just LF characters as line
terminators. You may need to edit them to change this if your system uses
a different convention. If you are using Windows, you probably should use
the wintestinput3 file instead of testinput3 (and the corresponding output
file). This is a locale test; wintestinput3 sets the locale to "french"
rather than "fr_FR", and there are some minor output differences.
(11) If you want to use the pcregrep command, compile and link pcregrep.c; it
uses only the basic PCRE library (it does not need the pcreposix library).
THE C++ WRAPPER FUNCTIONS
The PCRE distribution also contains some C++ wrapper functions and tests,
contributed by Google Inc. On a system that can use "configure" and "make",
the functions are automatically built into a library called pcrecpp. It should
be straightforward to compile the .cc files manually on other systems. The
files called xxx_unittest.cc are test programs for each of the corresponding
xxx.cc files.
BUILDING FOR VIRTUAL PASCAL
A script for building PCRE using Borland's C++ compiler for use with VPASCAL
was contributed by Alexander Tokarev. Stefan Weber updated the script and added
additional files. The following files in the distribution are for building PCRE
for use with VP/Borland: makevp_c.txt, makevp_l.txt, makevp.bat, pcregexp.pas.
STACK SIZE IN WINDOWS ENVIRONMENTS
The default processor stack size of 1Mb in some Windows environments is too
small for matching patterns that need much recursion. In particular, test 2 may
fail because of this. Normally, running out of stack causes a crash, but there
have been cases where the test program has just died silently. See your linker
documentation for how to increase stack size if you experience problems. The
Linux default of 8Mb is a reasonable choice for the stack, though even that can
be too small for some pattern/subject combinations.
PCRE has a compile configuration option to disable the use of stack for
recursion so that heap is used instead. However, pattern matching is
significantly slower when this is done. There is more about stack usage in the
"pcrestack" documentation.
LINKING PROGRAMS IN WINDOWS ENVIRONMENTS
If you want to statically link a program against a PCRE library in the form of
a non-dll .a file, you must define PCRE_STATIC before including pcre.h or
pcrecpp.h, otherwise the pcre_malloc() and pcre_free() exported functions will
be declared __declspec(dllimport), with unwanted results.
CALLING CONVENTIONS IN WINDOWS ENVIRONMENTS
It is possible to compile programs to use different calling conventions using
MSVC. Search the web for "calling conventions" for more information. To make it
easier to change the calling convention for the exported functions in the
PCRE library, the macro PCRE_CALL_CONVENTION is present in all the external
definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is
not set, it defaults to empty; the default calling convention is then used
(which is what is wanted most of the time).
COMMENTS ABOUT WIN32 BUILDS (see also "BUILDING PCRE WITH CMAKE" below)
There are two ways of building PCRE using the "configure, make, make install"
paradigm on Windows systems: using MinGW or using Cygwin. These are not at all
the same thing; they are completely different from each other. There is also
support for building using CMake, which some users find a more straightforward
way of building PCRE under Windows. However, the tests are not run
automatically when CMake is used.
The MinGW home page (http://www.mingw.org/) says this:
MinGW: A collection of freely available and freely distributable Windows
specific header files and import libraries combined with GNU toolsets that
allow one to produce native Windows programs that do not rely on any
3rd-party C runtime DLLs.
The Cygwin home page (http://www.cygwin.com/) says this:
Cygwin is a Linux-like environment for Windows. It consists of two parts:
. A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing
substantial Linux API functionality
. A collection of tools which provide Linux look and feel.
The Cygwin DLL currently works with all recent, commercially released x86 32
bit and 64 bit versions of Windows, with the exception of Windows CE.
On both MinGW and Cygwin, PCRE should build correctly using:
./configure && make && make install
This should create two libraries called libpcre and libpcreposix, and, if you
have enabled building the C++ wrapper, a third one called libpcrecpp. These are
independent libraries: when you link with libpcreposix or libpcrecpp you must
also link with libpcre, which contains the basic functions. (Some earlier
releases of PCRE included the basic libpcre functions in libpcreposix. This no
longer happens.)
A user submitted a special-purpose patch that makes it easy to create
"pcre.dll" under mingw32 using the "msys" environment. It provides "pcre.dll"
as a special target. If you use this target, no other files are built, and in
particular, the pcretest and pcregrep programs are not built. An example of how
this might be used is:
./configure --enable-utf --disable-cpp CFLAGS="-O3 -s"; make pcre.dll
Using Cygwin's compiler generates libraries and executables that depend on
cygwin1.dll. If a library that is generated this way is distributed,
cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL
licence, this forces not only PCRE to be under the GPL, but also the entire
application. A distributor who wants to keep their own code proprietary must
purchase an appropriate Cygwin licence.
MinGW has no such restrictions. The MinGW compiler generates a library or
executable that can run standalone on Windows without any third party dll or
licensing issues.
But there is more complication:
If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is
to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a
front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's
gcc and MinGW's gcc). So, a user can:
. Build native binaries by using MinGW or by getting Cygwin and using
-mno-cygwin.
. Build binaries that depend on cygwin1.dll by using Cygwin with the normal
compiler flags.
The test files that are supplied with PCRE are in Unix format, with LF
characters as line terminators. It may be necessary to change the line
terminators in order to get some of the tests to work.
BUILDING PCRE ON WINDOWS WITH CMAKE
CMake is an alternative configuration facility that can be used instead of the
traditional Unix "configure". CMake creates project files (make files, solution
files, etc.) tailored to numerous development environments, including Visual
Studio, Borland, Msys, MinGW, NMake, and Unix. The following instructions
were contributed by a PCRE user.
1. Install the latest CMake version available from http://www.cmake.org/, and
ensure that cmake\bin is on your path.
2. Unzip (retaining folder structure) the PCRE source tree into a source
directory such as C:\pcre.
3. Create a new, empty build directory, for example C:\pcre\build\
4. Run cmake-gui from the Shell environment of your build tool, for example,
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++.
5. Enter C:\pcre\pcre-xx and C:\pcre\build for the source and build
directories, respectively.
6. Hit the "Configure" button.
7. Select the particular IDE / build tool that you are using (Visual
Studio, MSYS makefiles, MinGW makefiles, etc.)
8. The GUI will then list several configuration options. This is where
you can enable UTF-8 support or other PCRE optional features.
9. Hit "Configure" again. The adjacent "Generate" button should now be
active.
10. Hit "Generate".
11. The build directory should now contain a usable build system, be it a
solution file for Visual Studio, makefiles for MinGW, etc. Exit from
cmake-gui and use the generated build system with your compiler or IDE.
USE OF RELATIVE PATHS WITH CMAKE ON WINDOWS
A PCRE user comments as follows:
I thought that others may want to know the current state of
CMAKE_USE_RELATIVE_PATHS support on Windows.
Here it is:
-- AdditionalIncludeDirectories is only partially modified (only the
first path - see below)
-- Only some of the contained file paths are modified - shown below for
pcre.vcproj
-- It properly modifies
I am sure CMake people can fix that if they want to. Until then one will
need to replace existing absolute paths in project files with relative
paths manually (e.g. from VS) - relative to project file location. I did
just that before being told to try CMAKE_USE_RELATIVE_PATHS. Not a big
deal.
AdditionalIncludeDirectories="E:\builds\pcre\build;E:\builds\pcre\pcre-7.5;"
AdditionalIncludeDirectories=".;E:\builds\pcre\pcre-7.5;"
RelativePath="pcre.h">
RelativePath="pcre_chartables.c">
RelativePath="pcre_chartables.c.rule">
TESTING WITH RUNTEST.BAT
1. Copy RunTest.bat into the directory where pcretest.exe has been created.
2. Edit RunTest.bat and insert a line that identifies the relative location of
the pcre source, e.g.:
set srcdir=..\pcre-7.4-RC3
3. Run RunTest.bat from a command shell environment. Test outputs will
automatically be compared to expected results, and discrepancies will
be identified in the console output.
4. To test pcrecpp, run pcrecpp_unittest.exe, pcre_stringpiece_unittest.exe and
pcre_scanner_unittest.exe.
BUILDING UNDER WINDOWS WITH BCC5.5
Michael Roy sent these comments about building PCRE under Windows with BCC5.5:
Some of the core BCC libraries have a version of PCRE from 1998 built in,
which can lead to pcre_exec() giving an erroneous PCRE_ERROR_NULL from a
version mismatch. I'm including an easy workaround below, if you'd like to
include it in the non-unix instructions:
When linking a project with BCC5.5, pcre.lib must be included before any of
the libraries cw32.lib, cw32i.lib, cw32mt.lib, and cw32mti.lib on the command
line.
BUILDING UNDER WINDOWS CE WITH VISUAL STUDIO 200x
Vincent Richomme sent a zip archive of files to help with this process. They
can be found in the file "pcre-vsbuild.zip" in the Contrib directory of the FTP
site.
BUILDING PCRE ON OPENVMS
Dan Mooney sent the following comments about building PCRE on OpenVMS. They
relate to an older version of PCRE that used fewer source files, so the exact
commands will need changing. See the current list of source files above.
"It was quite easy to compile and link the library. I don't have a formal
make file but the attached file [reproduced below] contains the OpenVMS DCL
commands I used to build the library. I had to add #define
POSIX_MALLOC_THRESHOLD 10 to pcre.h since it was not defined anywhere.
The library was built on:
O/S: HP OpenVMS v7.3-1
Compiler: Compaq C v6.5-001-48BCD
Linker: vA13-01
The test results did not match 100% due to the issues you mention in your
documentation regarding isprint(), iscntrl(), isgraph() and ispunct(). I
modified some of the character tables temporarily and was able to get the
results to match. Tests using the fr locale did not match since I don't have
that locale loaded. The study size was always reported to be 3 less than the
value in the standard test output files."
=========================
$! This DCL procedure builds PCRE on OpenVMS
$!
$! I followed the instructions in the non-unix-use file in the distribution.
$!
$ COMPILE == "CC/LIST/NOMEMBER_ALIGNMENT/PREFIX_LIBRARY_ENTRIES=ALL_ENTRIES
$ COMPILE DFTABLES.C
$ LINK/EXE=DFTABLES.EXE DFTABLES.OBJ
$ RUN DFTABLES.EXE/OUTPUT=CHARTABLES.C
$ COMPILE MAKETABLES.C
$ COMPILE GET.C
$ COMPILE STUDY.C
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
$! did not seem to be defined anywhere.
$! I edited pcre.h and added #DEFINE SUPPORT_UTF8 to enable UTF8 support.
$ COMPILE PCRE.C
$ LIB/CREATE PCRE MAKETABLES.OBJ, GET.OBJ, STUDY.OBJ, PCRE.OBJ
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
$! did not seem to be defined anywhere.
$ COMPILE PCREPOSIX.C
$ LIB/CREATE PCREPOSIX PCREPOSIX.OBJ
$ COMPILE PCRETEST.C
$ LINK/EXE=PCRETEST.EXE PCRETEST.OBJ, PCRE/LIB, PCREPOSIX/LIB
$! C programs that want access to command line arguments must be
$! defined as a symbol
$ PCRETEST :== "$ SYS$ROADSUSERS:[DMOONEY.REGEXP]PCRETEST.EXE"
$! Arguments must be enclosed in quotes.
$ PCRETEST "-C"
$! Test results:
$!
$! The test results did not match 100%. The functions isprint(), iscntrl(),
$! isgraph() and ispunct() on OpenVMS must not produce the same results
$! as the system that built the test output files provided with the
$! distribution.
$!
$! The study size did not match and was always 3 less on OpenVMS.
$!
$! Locale could not be set to fr
$!
=========================
BUILDING PCRE ON STRATUS OPENVOS
These notes on the port of PCRE to VOS (lightly edited) were supplied by
Ashutosh Warikoo, whose email address has the local part awarikoo and the
domain nse.co.in. The port was for version 7.9 in August 2009.
1. Building PCRE
I built pcre on OpenVOS Release 17.0.1at using GNU Tools 3.4a without any
problems. I used the following packages to build PCRE:
ftp://ftp.stratus.com/pub/vos/posix/ga/posix.save.evf.gz
Please read and follow the instructions that come with these packages. To start
the build of pcre, from the root of the package type:
./build.sh
2. Installing PCRE
Once you have successfully built PCRE, login to the SysAdmin group, switch to
the root user, and type
[ !create_dir (master_disk)>usr --if needed ]
[ !create_dir (master_disk)>usr>local --if needed ]
!gmake install
This installs PCRE and its man pages into /usr/local. You can add
(master_disk)>usr>local>bin to your command search paths, or if you are in
BASH, add /usr/local/bin to the PATH environment variable.
4. Restrictions
This port requires readline library optionally. However during the build I
faced some yet unexplored errors while linking with readline. As it was an
optional component I chose to disable it.
5. Known Problems
I ran the test suite, but you will have to be your own judge of whether this
command, and this port, suits your purposes. If you find any problems that
appear to be related to the port itself, please let me know. Please see the
build.log file in the root of the package also.
=========================
Last Updated: 26 May 2010
****
####

View File

@ -18,11 +18,12 @@ The contents of this README file are:
The PCRE APIs
Documentation for PCRE
Contributions by users of PCRE
Building PCRE on non-Unix systems
Building PCRE on Unix-like systems
Retrieving configuration information on Unix-like systems
Shared libraries on Unix-like systems
Cross-compiling on Unix-like systems
Building PCRE on non-Unix-like systems
Building PCRE without using autotools
Building PCRE using autotools
Retrieving configuration information
Shared libraries
Cross-compiling using autotools
Using HP's ANSI C++ compiler (aCC)
Using PCRE from MySQL
Making new tarballs
@ -34,16 +35,19 @@ The contents of this README file are:
The PCRE APIs
-------------
PCRE is written in C, and it has its own API. The distribution also includes a
set of C++ wrapper functions (see the pcrecpp man page for details), courtesy
of Google Inc.
PCRE is written in C, and it has its own API. There are two sets of functions,
one for the 8-bit library, which processes strings of bytes, and one for the
16-bit library, which processes strings of 16-bit values. The distribution also
includes a set of C++ wrapper functions (see the pcrecpp man page for details),
courtesy of Google Inc., which can be used to call the 8-bit PCRE library from
C++.
In addition, there is a set of C wrapper functions that are based on the POSIX
regular expression API (see the pcreposix man page). These end up in the
library called libpcreposix. Note that this just provides a POSIX calling
interface to PCRE; the regular expressions themselves still follow Perl syntax
and semantics. The POSIX API is restricted, and does not give full access to
all of PCRE's facilities.
In addition, there is a set of C wrapper functions (again, just for the 8-bit
library) that are based on the POSIX regular expression API (see the pcreposix
man page). These end up in the library called libpcreposix. Note that this just
provides a POSIX calling interface to PCRE; the regular expressions themselves
still follow Perl syntax and semantics. The POSIX API is restricted, and does
not give full access to all of PCRE's facilities.
The header file for the POSIX-style functions is called pcreposix.h. The
official POSIX name is regex.h, but I did not want to risk possible problems
@ -106,36 +110,45 @@ Windows (I myself do not use Windows). Nowadays there is more Windows support
in the standard distribution, so these contributions have been archived.
Building PCRE on non-Unix systems
---------------------------------
Building PCRE on non-Unix-like systems
--------------------------------------
For a non-Unix system, please read the comments in the file NON-UNIX-USE,
though if your system supports the use of "configure" and "make" you may be
able to build PCRE in the same way as for Unix-like systems. PCRE can also be
configured in many platform environments using the GUI facility provided by
CMake's cmake-gui command. This creates Makefiles, solution files, etc.
For a non-Unix-like system, please read the comments in the file
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
"make" you may be able to build PCRE using autotools in the same way as for
many Unix-like systems.
PCRE can also be configured using the GUI facility provided by CMake's
cmake-gui command. This creates Makefiles, solution files, etc. The file
NON-AUTOTOOLS-BUILD has information about CMake.
PCRE has been compiled on many different operating systems. It should be
straightforward to build PCRE on any system that has a Standard C compiler and
library, because it uses only Standard C functions.
Building PCRE on Unix-like systems
----------------------------------
Building PCRE without using autotools
-------------------------------------
The use of autotools (in particular, libtool) is problematic in some
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
file for ways of building PCRE without using autotools.
Building PCRE using autotools
-----------------------------
If you are using HP's ANSI C++ compiler (aCC), please see the special note
in the section entitled "Using HP's ANSI C++ compiler (aCC)" below.
The following instructions assume the use of the widely used "configure, make,
make install" process. There is also support for CMake in the PCRE
distribution; there are some comments about using CMake in the NON-UNIX-USE
file, though it can also be used in Unix-like systems.
The following instructions assume the use of the widely used "configure; make;
make install" (autotools) process.
To build PCRE on a Unix-like system, first run the "configure" command from the
PCRE distribution directory, with your current directory set to the directory
where you want the files to be created. This command is a standard GNU
"autoconf" configuration script, for which generic instructions are supplied in
the file INSTALL.
To build PCRE on a system that supports autotools, first run the "configure"
command from the PCRE distribution directory, with your current directory set
to the directory where you want the files to be created. This command is a
standard GNU "autoconf" configuration script, for which generic instructions
are supplied in the file INSTALL.
Most commonly, people build PCRE within its own distribution directory, and in
this case, on many systems, just running "./configure" is sufficient. However,
@ -143,9 +156,9 @@ the usual methods of changing standard defaults are available. For example:
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
specifies that the C compiler should be run with the flags '-O2 -Wall' instead
of the default, and that "make install" should install PCRE under /opt/local
instead of the default /usr/local.
This command specifies that the C compiler should be run with the flags '-O2
-Wall' instead of the default, and that "make install" should install PCRE
under /opt/local instead of the default /usr/local.
If you want to build in a different directory, just run "configure" with that
directory as current. For example, suppose you have unpacked the PCRE source
@ -159,27 +172,59 @@ possible to build it as a C++ library, though the provided building apparatus
does not have any features to support this.
There are some optional features that can be included or omitted from the PCRE
library. You can read more about them in the pcrebuild man page.
library. They are also documented in the pcrebuild man page.
. If you want to suppress the building of the C++ wrapper library, you can add
--disable-cpp to the "configure" command. Otherwise, when "configure" is run,
it will try to find a C++ compiler and C++ header files, and if it succeeds,
it will try to build the C++ wrapper.
. By default, both shared and static libraries are built. You can change this
by adding one of these options to the "configure" command:
--disable-shared
--disable-static
(See also "Shared libraries on Unix-like systems" below.)
. By default, only the 8-bit library is built. If you add --enable-pcre16 to
the "configure" command, the 16-bit library is also built. If you want only
the 16-bit library, use "./configure --enable-pcre16 --disable-pcre8".
. If you are building the 8-bit library and want to suppress the building of
the C++ wrapper library, you can add --disable-cpp to the "configure"
command. Otherwise, when "configure" is run without --disable-pcre8, it will
try to find a C++ compiler and C++ header files, and if it succeeds, it will
try to build the C++ wrapper.
. If you want to include support for just-in-time compiling, which can give
large performance improvements on certain platforms, add --enable-jit to the
"configure" command. This support is available only for certain hardware
architectures. If you try to enable it on an unsupported architecture, there
will be a compile time error.
. When JIT support is enabled, pcregrep automatically makes use of it, unless
you add --disable-pcregrep-jit to the "configure" command.
. If you want to make use of the support for UTF-8 Unicode character strings in
PCRE, you must add --enable-utf8 to the "configure" command. Without it, the
code for handling UTF-8 is not included in the library. Even when included,
it still has to be enabled by an option at run time. When PCRE is compiled
with this option, its input can only either be ASCII or UTF-8, even when
running on EBCDIC platforms. It is not possible to use both --enable-utf8 and
--enable-ebcdic at the same time.
the 8-bit library, or UTF-16 Unicode character strings in the 16-bit library,
you must add --enable-utf to the "configure" command. Without it, the code
for handling UTF-8 and UTF-16 is not included in the relevant library. Even
when --enable-utf is included, the use of a UTF encoding still has to be
enabled by an option at run time. When PCRE is compiled with this option, its
input can only either be ASCII or UTF-8/16, even when running on EBCDIC
platforms. It is not possible to use both --enable-utf and --enable-ebcdic at
the same time.
. If, in addition to support for UTF-8 character strings, you want to include
support for the \P, \p, and \X sequences that recognize Unicode character
properties, you must add --enable-unicode-properties to the "configure"
command. This adds about 30K to the size of the library (in the form of a
property table); only the basic two-letter properties such as Lu are
supported.
. There are no separate options for enabling UTF-8 and UTF-16 independently
because that would allow ridiculous settings such as requesting UTF-16
support while building only the 8-bit library. However, the option
--enable-utf8 is retained for backwards compatibility with earlier releases
that did not support 16-bit character strings. It is synonymous with
--enable-utf. It is not possible to configure one library with UTF support
and the other without in the same configuration.
. If, in addition to support for UTF-8/16 character strings, you want to
include support for the \P, \p, and \X sequences that recognize Unicode
character properties, you must add --enable-unicode-properties to the
"configure" command. This adds about 30K to the size of the library (in the
form of a property table); only the basic two-letter properties such as Lu
are supported.
. You can build PCRE to recognize either CR or LF or the sequence CRLF or any
of the preceding, or any of the Unicode newline sequences as indicating the
@ -232,10 +277,11 @@ library. You can read more about them in the pcrebuild man page.
sizes in the pcrestack man page.
. The default maximum compiled pattern size is around 64K. You can increase
this by adding --with-link-size=3 to the "configure" command. You can
increase it even more by setting --with-link-size=4, but this is unlikely
ever to be necessary. Increasing the internal link size will reduce
performance.
this by adding --with-link-size=3 to the "configure" command. In the 8-bit
library, PCRE then uses three bytes instead of two for offsets to different
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
the same as --with-link-size=4, which (in both libraries) uses four-byte
offsets. Increasing the internal link size reduces performance.
. You can build PCRE so that its internal match() function that is called from
pcre_exec() does not call itself recursively. Instead, it uses memory blocks
@ -247,9 +293,10 @@ library. You can read more about them in the pcrebuild man page.
on the "configure" command. PCRE runs more slowly in this mode, but it may be
necessary in environments with limited stack sizes. This applies only to the
pcre_exec() function; it does not apply to pcre_dfa_exec(), which does not
use deeply nested recursion. There is a discussion about stack sizes in the
pcrestack man page.
normal execution of the pcre_exec() function; if JIT support is being
successfully used, it is not relevant. Equally, it does not apply to
pcre_dfa_exec(), which does not use deeply nested recursion. There is a
discussion about stack sizes in the pcrestack man page.
. For speed, PCRE uses four tables for manipulating and identifying characters
whose code point values are less than 256. By default, it uses a set of
@ -269,27 +316,37 @@ library. You can read more about them in the pcrebuild man page.
This automatically implies --enable-rebuild-chartables (see above). However,
when PCRE is built this way, it always operates in EBCDIC. It cannot support
both EBCDIC and UTF-8.
both EBCDIC and UTF-8/16.
. It is possible to compile pcregrep to use libz and/or libbz2, in order to
read .gz and .bz2 files (respectively), by specifying one or both of
. The pcregrep program currently supports only 8-bit data files, and so
requires the 8-bit PCRE library. It is possible to compile pcregrep to use
libz and/or libbz2, in order to read .gz and .bz2 files (respectively), by
specifying one or both of
--enable-pcregrep-libz
--enable-pcregrep-libbz2
Of course, the relevant libraries must be installed on your system.
. It is possible to compile pcretest so that it links with the libreadline
library, by specifying
. The default size of internal buffer used by pcregrep can be set by, for
example:
--enable-pcretest-libreadline
--with-pcregrep-bufsize=50K
The default value is 20K.
. It is possible to compile pcretest so that it links with the libreadline
or libedit libraries, by specifying, respectively,
--enable-pcretest-libreadline or --enable-pcretest-libedit
If this is done, when pcretest's input is from a terminal, it reads it using
the readline() function. This provides line-editing and history facilities.
Note that libreadline is GPL-licenced, so if you distribute a binary of
pcretest linked in this way, there may be licensing issues.
pcretest linked in this way, there may be licensing issues. These can be
avoided by linking with libedit (which has a BSD licence) instead.
Setting this option causes the -lreadline option to be added to the pcretest
Enabling libreadline causes the -lreadline option to be added to the pcretest
build. In many operating environments with a system-installed readline
library this is sufficient. However, in some environments (e.g. if an
unmodified distribution version of readline is in use), it may be necessary
@ -302,37 +359,42 @@ library. You can read more about them in the pcrebuild man page.
The "configure" script builds the following files for the basic C library:
. Makefile is the makefile that builds the library
. config.h contains build-time configuration options for the library
. pcre.h is the public PCRE header file
. pcre-config is a script that shows the settings of "configure" options
. libpcre.pc is data for the pkg-config command
. libtool is a script that builds shared and/or static libraries
. RunTest is a script for running tests on the basic C library
. RunGrepTest is a script for running tests on the pcregrep command
. Makefile the makefile that builds the library
. config.h build-time configuration options for the library
. pcre.h the public PCRE header file
. pcre-config script that shows the building settings such as CFLAGS
that were set for "configure"
. libpcre.pc ) data for the pkg-config command
. libpcre16.pc )
. libpcreposix.pc )
. libtool script that builds shared and/or static libraries
Versions of config.h and pcre.h are distributed in the PCRE tarballs under the
names config.h.generic and pcre.h.generic. These are provided for those who
have to build PCRE without using "configure" or CMake. If you use "configure"
or CMake, the .generic versions are not used.
If a C++ compiler is found, the following files are also built:
When building the 8-bit library, if a C++ compiler is found, the following
files are also built:
. libpcrecpp.pc is data for the pkg-config command
. pcrecpparg.h is a header file for programs that call PCRE via the C++ wrapper
. pcre_stringpiece.h is the header for the C++ "stringpiece" functions
. libpcrecpp.pc data for the pkg-config command
. pcrecpparg.h header file for calling PCRE via the C++ wrapper
. pcre_stringpiece.h header for the C++ "stringpiece" functions
The "configure" script also creates config.status, which is an executable
script that can be run to recreate the configuration, and config.log, which
contains compiler output from tests that "configure" runs.
Once "configure" has run, you can run "make". It builds two libraries, called
libpcre and libpcreposix, a test program called pcretest, and the pcregrep
command. If a C++ compiler was found on your system, "make" also builds the C++
wrapper library, which is called libpcrecpp, and some test programs called
pcrecpp_unittest, pcre_scanner_unittest, and pcre_stringpiece_unittest.
Building the C++ wrapper can be disabled by adding --disable-cpp to the
"configure" command.
Once "configure" has run, you can run "make". This builds either or both of the
libraries libpcre and libpcre16, and a test program called pcretest. If you
enabled JIT support with --enable-jit, a test program called pcre_jit_test is
built as well.
If the 8-bit library is built, libpcreposix and the pcregrep command are also
built, and if a C++ compiler was found on your system, and you did not disable
it with --disable-cpp, "make" builds the C++ wrapper library, which is called
libpcrecpp, as well as some test programs called pcrecpp_unittest,
pcre_scanner_unittest, and pcre_stringpiece_unittest.
The command "make check" runs all the appropriate tests. Details of the PCRE
tests are given below in a separate section of this document.
@ -343,16 +405,19 @@ system. The following are installed (file names are all relative to the
Commands (bin):
pcretest
pcregrep
pcregrep (if 8-bit support is enabled)
pcre-config
Libraries (lib):
libpcre
libpcreposix
libpcrecpp (if C++ support is enabled)
libpcre16 (if 16-bit support is enabled)
libpcre (if 8-bit support is enabled)
libpcreposix (if 8-bit support is enabled)
libpcrecpp (if 8-bit and C++ support is enabled)
Configuration information (lib/pkgconfig):
libpcre16.pc
libpcre.pc
libpcreposix.pc
libpcrecpp.pc (if C++ support is enabled)
Header files (include):
@ -366,6 +431,7 @@ system. The following are installed (file names are all relative to the
Man pages (share/man/man{1,3}):
pcregrep.1
pcretest.1
pcre-config.1
pcre.3
pcre*.3 (lots more pages, all starting "pcre")
@ -380,17 +446,18 @@ system. The following are installed (file names are all relative to the
LICENCE
NEWS
README
pcre.txt (a concatenation of the man(3) pages)
pcretest.txt the pcretest man page
pcregrep.txt the pcregrep man page
pcre.txt (a concatenation of the man(3) pages)
pcretest.txt the pcretest man page
pcregrep.txt the pcregrep man page
pcre-config.txt the pcre-config man page
If you want to remove PCRE from your system, you can run "make uninstall".
This removes all the files that "make install" installed. However, it does not
remove any directories, because these are often shared with other programs.
Retrieving configuration information on Unix-like systems
---------------------------------------------------------
Retrieving configuration information
------------------------------------
Running "make install" installs the command pcre-config, which can be used to
recall information about the PCRE configuration and installation. For example:
@ -415,8 +482,8 @@ The data is held in *.pc files that are installed in a directory called
<prefix>/lib/pkgconfig.
Shared libraries on Unix-like systems
-------------------------------------
Shared libraries
----------------
The default distribution builds PCRE as shared libraries and static libraries,
as long as the operating system supports shared libraries. Shared library
@ -441,8 +508,8 @@ Then run "make" in the usual way. Similarly, you can use --disable-static to
build only shared libraries.
Cross-compiling on Unix-like systems
------------------------------------
Cross-compiling using autotools
-------------------------------
You can specify CC and CFLAGS in the normal way to the "configure" command, in
order to cross-compile PCRE for some other host. However, you should NOT
@ -514,30 +581,49 @@ script creates the .txt and HTML forms of the documentation from the man pages.
Testing PCRE
------------
To test the basic PCRE library on a Unix system, run the RunTest script that is
created by the configuring process. There is also a script called RunGrepTest
that tests the options of the pcregrep command. If the C++ wrapper library is
built, three test programs called pcrecpp_unittest, pcre_scanner_unittest, and
pcre_stringpiece_unittest are also built.
To test the basic PCRE library on a Unix-like system, run the RunTest script.
There is another script called RunGrepTest that tests the options of the
pcregrep command. If the C++ wrapper library is built, three test programs
called pcrecpp_unittest, pcre_scanner_unittest, and pcre_stringpiece_unittest
are also built. When JIT support is enabled, another test program called
pcre_jit_test is built.
Both the scripts and all the program tests are run if you obey "make check" or
"make test". For other systems, see the instructions in NON-UNIX-USE.
"make test". For other environments, see the instructions in
NON-AUTOTOOLS-BUILD.
The RunTest script runs the pcretest test program (which is documented in its
own man page) on each of the testinput files in the testdata directory in
turn, and compares the output with the contents of the corresponding testoutput
files. A file called testtry is used to hold the main output from pcretest
(testsavedregex is also used as a working file). To run pcretest on just one of
the test files, give its number as an argument to RunTest, for example:
own man page) on each of the relevant testinput files in the testdata
directory, and compares the output with the contents of the corresponding
testoutput files. Some tests are relevant only when certain build-time options
were selected. For example, the tests for UTF-8/16 support are run only if
--enable-utf was used. RunTest outputs a comment when it skips a test.
RunTest 2
Many of the tests that are not skipped are run up to three times. The second
run forces pcre_study() to be called for all patterns except for a few in some
tests that are marked "never study" (see the pcretest program for how this is
done). If JIT support is available, the non-DFA tests are run a third time,
this time with a forced pcre_study() with the PCRE_STUDY_JIT_COMPILE option.
The first test file can also be fed directly into the perltest.pl script to
check that Perl gives the same results. The only difference you should see is
in the first few lines, where the Perl version is given instead of the PCRE
version.
When both 8-bit and 16-bit support is enabled, the entire set of tests is run
twice, once for each library. If you want to run just one set of tests, call
RunTest with either the -8 or -16 option.
The second set of tests check pcre_fullinfo(), pcre_info(), pcre_study(),
RunTest uses a file called testtry to hold the main output from pcretest.
Other files whose names begin with "test" are used as working files in some
tests. To run pcretest on just one or more specific test files, give their
numbers as arguments to RunTest, for example:
RunTest 2 7 11
You can also call RunTest with the single argument "list" to cause it to output
a list of tests.
The first test file can be fed directly into the perltest.pl script to check
that Perl gives the same results. The only difference you should see is in the
first few lines, where the Perl version is given instead of the PCRE version.
The second set of tests checks pcre_fullinfo(), pcre_study(),
pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
detection, and run-time flags that are specific to PCRE, as well as the POSIX
wrapper API. It also uses the debugging flags to check some of the internals of
@ -572,33 +658,32 @@ RunTest.bat. The version of RunTest.bat included with PCRE 7.4 and above uses
Windows versions of test 2. More info on using RunTest.bat is included in the
document entitled NON-UNIX-USE.]
The fourth test checks the UTF-8 support. It is not run automatically unless
PCRE is built with UTF-8 support. To do this you must set --enable-utf8 when
running "configure". This file can be also fed directly to the perltest.pl
script, provided you are running Perl 5.8 or higher.
The fourth test checks the UTF-8/16 support, and the fifth test checks error
handling and internal UTF features of PCRE that are not relevant to Perl. The
sixth and seventh tests do the same for Unicode character properties support.
The fifth test checks error handling with UTF-8 encoding, and internal UTF-8
features of PCRE that are not relevant to Perl.
The eighth, ninth, and tenth tests check the pcre_dfa_exec() alternative
matching function, in non-UTF-8/16 mode, UTF-8/16 mode, and UTF-8/16 mode with
Unicode property support, respectively.
The sixth test (which is Perl-5.10 compatible) checks the support for Unicode
character properties. It is not run automatically unless PCRE is built with
Unicode property support. To do this you must set --enable-unicode-properties
when running "configure".
The eleventh test checks some internal offsets and code size features; it is
run only when the default "link size" of 2 is set (in other cases the sizes
change) and when Unicode property support is enabled.
The seventh, eighth, and ninth tests check the pcre_dfa_exec() alternative
matching function, in non-UTF-8 mode, UTF-8 mode, and UTF-8 mode with Unicode
property support, respectively. The eighth and ninth tests are not run
automatically unless PCRE is built with the relevant support.
The twelfth test is run only when JIT support is available, and the thirteenth
test is run only when JIT support is not available. They test some JIT-specific
features such as information output from pcretest about JIT compilation.
The tenth test checks some internal offsets and code size features; it is run
only when the default "link size" of 2 is set (in other cases the sizes
change).
The fourteenth, fifteenth, and sixteenth tests are run only in 8-bit mode, and
the seventeenth, eighteenth, and nineteenth tests are run only in 16-bit mode.
These are tests that generate different output in the two modes. They are for
general cases, UTF-8/16 support, and Unicode property support, respectively.
The eleventh test checks out features that are new in Perl 5.10, and the
twelfth test checks a number of internals and non-Perl features concerned with
Unicode property support. It is not run automatically unless PCRE is built with
Unicode property support. To do this you must set --enable-unicode-properties
when running "configure".
The twentieth test is run only in 16-bit mode. It tests some specific 16-bit
features of the DFA matching engine.
The twenty-first and twenty-second tests are run only in 16-bit mode, when the
link size is set to 2. They test reloading pre-compiled patterns.
Character tables
@ -658,7 +743,9 @@ will cause PCRE to malfunction.
File manifest
-------------
The distribution should contain the following files:
The distribution should contain the files listed below. Where a file name is
given as pcre[16]_xxx it means that there are two files, one with the name
pcre_xxx and the other with the name pcre16_xxx.
(A) Source files of the PCRE library functions and their headers:
@ -667,33 +754,40 @@ The distribution should contain the following files:
pcre_chartables.c.dist a default set of character tables that assume ASCII
coding; used, unless --enable-rebuild-chartables is
specified, by copying to pcre_chartables.c
specified, by copying to pcre[16]_chartables.c
pcreposix.c )
pcre_compile.c )
pcre_config.c )
pcre_dfa_exec.c )
pcre_exec.c )
pcre_fullinfo.c )
pcre_get.c ) sources for the functions in the library,
pcre_globals.c ) and some internal functions that they use
pcre_info.c )
pcre_maketables.c )
pcre_newline.c )
pcre[16]_byte_order.c )
pcre[16]_compile.c )
pcre[16]_config.c )
pcre[16]_dfa_exec.c )
pcre[16]_exec.c )
pcre[16]_fullinfo.c )
pcre[16]_get.c ) sources for the functions in the library,
pcre[16]_globals.c ) and some internal functions that they use
pcre[16]_jit_compile.c )
pcre[16]_maketables.c )
pcre[16]_newline.c )
pcre[16]_refcount.c )
pcre[16]_string_utils.c )
pcre[16]_study.c )
pcre[16]_tables.c )
pcre[16]_ucd.c )
pcre[16]_version.c )
pcre[16]_xclass.c )
pcre_ord2utf8.c )
pcre_refcount.c )
pcre_study.c )
pcre_tables.c )
pcre_try_flipped.c )
pcre_ucd.c )
pcre_valid_utf8.c )
pcre_version.c )
pcre_xclass.c )
pcre_printint.src ) debugging function that is #included in pcretest,
pcre16_ord2utf16.c )
pcre16_utf16_utils.c )
pcre16_valid_utf16.c )
pcre[16]_printint.c ) debugging function that is used by pcretest,
) and can also be #included in pcre_compile()
pcre.h.in template for pcre.h when built by "configure"
pcreposix.h header for the external POSIX wrapper API
pcre_internal.h header for internal use
sljit/* 16 files that make up the JIT compiler
ucp.h header for Unicode property handling
config.h.in template for config.h, which is built by "configure"
@ -730,7 +824,8 @@ The distribution should contain the following files:
Makefile.am ) the automake input that was used to create
) Makefile.in
NEWS important changes in this release
NON-UNIX-USE notes on building PCRE on non-Unix systems
NON-UNIX-USE the previous name for NON-AUTOTOOLS-BUILD
NON-AUTOTOOLS-BUILD notes on building PCRE without using autotools
PrepareRelease script to make preparations for "make dist"
README this file
RunTest a Unix shell script for running tests
@ -751,6 +846,7 @@ The distribution should contain the following files:
doc/pcretest.txt plain text documentation of test program
doc/perltest.txt plain text documentation of Perl test program
install-sh a shell script for installing files
libpcre16.pc.in template for libpcre16.pc for pkg-config
libpcre.pc.in template for libpcre.pc for pkg-config
libpcreposix.pc.in template for libpcreposix.pc for pkg-config
libpcrecpp.pc.in template for libpcrecpp.pc for pkg-config
@ -760,17 +856,20 @@ The distribution should contain the following files:
mkinstalldirs script for making install directories
perltest.pl Perl test program
pcre-config.in source of script which retains PCRE information
pcre_jit_test.c test program for the JIT compiler
pcrecpp_unittest.cc )
pcre_scanner_unittest.cc ) test programs for the C++ wrapper
pcre_stringpiece_unittest.cc )
testdata/testinput* test data for main library tests
testdata/testoutput* expected test results
testdata/grep* input and output for pcregrep tests
testdata/* other supporting test files
(D) Auxiliary files for cmake support
cmake/COPYING-CMAKE-SCRIPTS
cmake/FindPackageHandleStandardArgs.cmake
cmake/FindEditline.cmake
cmake/FindReadline.cmake
CMakeLists.txt
config-cmake.h.in
@ -796,4 +895,4 @@ The distribution should contain the following files:
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 19 January 2010
Last updated: 18 June 2012

View File

@ -282,7 +282,7 @@ them both to 0; an emulation function will be used. */
#define PACKAGE_NAME "PCRE"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE 8.12"
#define PACKAGE_STRING "PCRE 8.31"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre"
@ -291,7 +291,7 @@ them both to 0; an emulation function will be used. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "8.12"
#define PACKAGE_VERSION "8.31"
/* If you are compiling for a system other than a Unix-like system or
@ -347,7 +347,7 @@ them both to 0; an emulation function will be used. */
/* Version number of package */
#ifndef VERSION
#define VERSION "8.12"
#define VERSION "8.31"
#endif
/* Define to empty if `const' does not conform to ANSI C. */

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -112,7 +112,7 @@ fprintf(f,
"#endif\n\n"
"#include \"pcre_internal.h\"\n\n");
fprintf(f,
"const unsigned char _pcre_default_tables[] = {\n\n"
"const pcre_uint8 PRIV(default_tables)[] = {\n\n"
"/* This table is a lower casing table. */\n\n");
fprintf(f, " ");

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, to be #included by
applications that call the PCRE functions.
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */
#define PCRE_MAJOR 8
#define PCRE_MINOR 12
#define PCRE_MINOR 31
#define PCRE_PRERELEASE
#define PCRE_DATE 2011-01-15
#define PCRE_DATE 2012-07-06
/* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE, the appropriate
@ -98,28 +98,37 @@ extern "C" {
/* Options. Some are compile-time only, some are run-time only, and some are
both, so we keep them all distinct. However, almost all the bits in the options
word are now used. In the long run, we may have to re-use some of the
compile-time only bits for runtime options, or vice versa. */
compile-time only bits for runtime options, or vice versa. In the comments
below, "compile", "exec", and "DFA exec" mean that the option is permitted to
be set for those functions; "used in" means that an option may be set only for
compile, but is subsequently referenced in exec and/or DFA exec. Any of the
compile-time options may be inspected during studying (and therefore JIT
compiling). */
#define PCRE_CASELESS 0x00000001 /* Compile */
#define PCRE_MULTILINE 0x00000002 /* Compile */
#define PCRE_DOTALL 0x00000004 /* Compile */
#define PCRE_EXTENDED 0x00000008 /* Compile */
#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile, used in exec, DFA exec */
#define PCRE_EXTRA 0x00000040 /* Compile */
#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */
#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
#define PCRE_UNGREEDY 0x00000200 /* Compile */
#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
#define PCRE_UTF8 0x00000800 /* Compile */
/* The next two are also used in exec and DFA exec */
#define PCRE_UTF8 0x00000800 /* Compile (same as PCRE_UTF16) */
#define PCRE_UTF16 0x00000800 /* Compile (same as PCRE_UTF8) */
#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
/* The next two are also used in exec and DFA exec */
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile (same as PCRE_NO_UTF16_CHECK) */
#define PCRE_NO_UTF16_CHECK 0x00002000 /* Compile (same as PCRE_NO_UTF8_CHECK) */
#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
#define PCRE_PARTIAL_SOFT 0x00008000 /* Exec, DFA exec */
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */
#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */
#define PCRE_FIRSTLINE 0x00040000 /* Compile */
#define PCRE_FIRSTLINE 0x00040000 /* Compile, used in exec, DFA exec */
#define PCRE_DUPNAMES 0x00080000 /* Compile */
#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */
#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */
@ -128,41 +137,82 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */
#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */
#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile, used in exec */
#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */
#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */
#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */
#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */
#define PCRE_UCP 0x20000000 /* Compile */
#define PCRE_UCP 0x20000000 /* Compile, used in exec, DFA exec */
/* Exec-time and get/set-time error codes */
#define PCRE_ERROR_NOMATCH (-1)
#define PCRE_ERROR_NULL (-2)
#define PCRE_ERROR_BADOPTION (-3)
#define PCRE_ERROR_BADMAGIC (-4)
#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
#define PCRE_ERROR_NOMEMORY (-6)
#define PCRE_ERROR_NOSUBSTRING (-7)
#define PCRE_ERROR_MATCHLIMIT (-8)
#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
#define PCRE_ERROR_BADUTF8 (-10)
#define PCRE_ERROR_BADUTF8_OFFSET (-11)
#define PCRE_ERROR_PARTIAL (-12)
#define PCRE_ERROR_BADPARTIAL (-13)
#define PCRE_ERROR_INTERNAL (-14)
#define PCRE_ERROR_BADCOUNT (-15)
#define PCRE_ERROR_DFA_UITEM (-16)
#define PCRE_ERROR_DFA_UCOND (-17)
#define PCRE_ERROR_DFA_UMLIMIT (-18)
#define PCRE_ERROR_DFA_WSSIZE (-19)
#define PCRE_ERROR_DFA_RECURSE (-20)
#define PCRE_ERROR_RECURSIONLIMIT (-21)
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
#define PCRE_ERROR_BADNEWLINE (-23)
#define PCRE_ERROR_BADOFFSET (-24)
#define PCRE_ERROR_SHORTUTF8 (-25)
#define PCRE_ERROR_NOMATCH (-1)
#define PCRE_ERROR_NULL (-2)
#define PCRE_ERROR_BADOPTION (-3)
#define PCRE_ERROR_BADMAGIC (-4)
#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
#define PCRE_ERROR_NOMEMORY (-6)
#define PCRE_ERROR_NOSUBSTRING (-7)
#define PCRE_ERROR_MATCHLIMIT (-8)
#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
#define PCRE_ERROR_BADUTF8 (-10) /* Same for 8/16 */
#define PCRE_ERROR_BADUTF16 (-10) /* Same for 8/16 */
#define PCRE_ERROR_BADUTF8_OFFSET (-11) /* Same for 8/16 */
#define PCRE_ERROR_BADUTF16_OFFSET (-11) /* Same for 8/16 */
#define PCRE_ERROR_PARTIAL (-12)
#define PCRE_ERROR_BADPARTIAL (-13)
#define PCRE_ERROR_INTERNAL (-14)
#define PCRE_ERROR_BADCOUNT (-15)
#define PCRE_ERROR_DFA_UITEM (-16)
#define PCRE_ERROR_DFA_UCOND (-17)
#define PCRE_ERROR_DFA_UMLIMIT (-18)
#define PCRE_ERROR_DFA_WSSIZE (-19)
#define PCRE_ERROR_DFA_RECURSE (-20)
#define PCRE_ERROR_RECURSIONLIMIT (-21)
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
#define PCRE_ERROR_BADNEWLINE (-23)
#define PCRE_ERROR_BADOFFSET (-24)
#define PCRE_ERROR_SHORTUTF8 (-25)
#define PCRE_ERROR_SHORTUTF16 (-25) /* Same for 8/16 */
#define PCRE_ERROR_RECURSELOOP (-26)
#define PCRE_ERROR_JIT_STACKLIMIT (-27)
#define PCRE_ERROR_BADMODE (-28)
#define PCRE_ERROR_BADENDIANNESS (-29)
#define PCRE_ERROR_DFA_BADRESTART (-30)
/* Specific error codes for UTF-8 validity checks */
#define PCRE_UTF8_ERR0 0
#define PCRE_UTF8_ERR1 1
#define PCRE_UTF8_ERR2 2
#define PCRE_UTF8_ERR3 3
#define PCRE_UTF8_ERR4 4
#define PCRE_UTF8_ERR5 5
#define PCRE_UTF8_ERR6 6
#define PCRE_UTF8_ERR7 7
#define PCRE_UTF8_ERR8 8
#define PCRE_UTF8_ERR9 9
#define PCRE_UTF8_ERR10 10
#define PCRE_UTF8_ERR11 11
#define PCRE_UTF8_ERR12 12
#define PCRE_UTF8_ERR13 13
#define PCRE_UTF8_ERR14 14
#define PCRE_UTF8_ERR15 15
#define PCRE_UTF8_ERR16 16
#define PCRE_UTF8_ERR17 17
#define PCRE_UTF8_ERR18 18
#define PCRE_UTF8_ERR19 19
#define PCRE_UTF8_ERR20 20
#define PCRE_UTF8_ERR21 21
/* Specific error codes for UTF-16 validity checks */
#define PCRE_UTF16_ERR0 0
#define PCRE_UTF16_ERR1 1
#define PCRE_UTF16_ERR2 2
#define PCRE_UTF16_ERR3 3
#define PCRE_UTF16_ERR4 4
/* Request types for pcre_fullinfo() */
@ -183,6 +233,9 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_INFO_JCHANGED 13
#define PCRE_INFO_HASCRORLF 14
#define PCRE_INFO_MINLENGTH 15
#define PCRE_INFO_JIT 16
#define PCRE_INFO_JITSIZE 17
#define PCRE_INFO_MAXLOOKBEHIND 18
/* Request types for pcre_config(). Do not re-arrange, in order to remain
compatible. */
@ -196,8 +249,18 @@ compatible. */
#define PCRE_CONFIG_UNICODE_PROPERTIES 6
#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
#define PCRE_CONFIG_BSR 8
#define PCRE_CONFIG_JIT 9
#define PCRE_CONFIG_UTF16 10
#define PCRE_CONFIG_JITTARGET 11
/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine
/* Request types for pcre_study(). Do not re-arrange, in order to remain
compatible. */
#define PCRE_STUDY_JIT_COMPILE 0x0001
#define PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE 0x0002
#define PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE 0x0004
/* Bit flags for the pcre[16]_extra structure. Do not re-arrange or redefine
these bits, just add new ones on the end, in order to remain compatible. */
#define PCRE_EXTRA_STUDY_DATA 0x0001
@ -206,12 +269,33 @@ these bits, just add new ones on the end, in order to remain compatible. */
#define PCRE_EXTRA_TABLES 0x0008
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
#define PCRE_EXTRA_MARK 0x0020
#define PCRE_EXTRA_EXECUTABLE_JIT 0x0040
/* Types */
struct real_pcre; /* declaration; the definition is private */
typedef struct real_pcre pcre;
struct real_pcre16; /* declaration; the definition is private */
typedef struct real_pcre16 pcre16;
struct real_pcre_jit_stack; /* declaration; the definition is private */
typedef struct real_pcre_jit_stack pcre_jit_stack;
struct real_pcre16_jit_stack; /* declaration; the definition is private */
typedef struct real_pcre16_jit_stack pcre16_jit_stack;
/* If PCRE is compiled with 16 bit character support, PCRE_UCHAR16 must contain
a 16 bit wide unsigned data type. Otherwise it can be a dummy data type since
pcre16 functions are not implemented. There is a check for this in pcre_internal.h. */
#ifndef PCRE_UCHAR16
#define PCRE_UCHAR16 unsigned short
#endif
#ifndef PCRE_SPTR16
#define PCRE_SPTR16 const PCRE_UCHAR16 *
#endif
/* When PCRE is compiled as a C++ library, the subject pointer type can be
replaced with a custom type. For conventional use, the public interface is a
const char *. */
@ -232,8 +316,22 @@ typedef struct pcre_extra {
const unsigned char *tables; /* Pointer to character tables */
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
unsigned char **mark; /* For passing back a mark pointer */
void *executable_jit; /* Contains a pointer to a compiled jit code */
} pcre_extra;
/* The 16-bit counterpart of pcre_extra: same structure as above, but with 16
bit char pointers. It is the type returned by pcre16_study() and accepted by
pcre16_exec(), pcre16_dfa_exec(), pcre16_fullinfo() and pcre16_free_study().
The "flags" word holds the PCRE_EXTRA_xxx bits, which say which of the other
fields are set. The "mark" field is declared as PCRE_UCHAR16 ** here, where
pcre_extra uses unsigned char **. */
typedef struct pcre16_extra {
unsigned long int flags; /* Bits for which fields are set */
void *study_data; /* Opaque data from pcre16_study() */
unsigned long int match_limit; /* Maximum number of calls to match() */
void *callout_data; /* Data passed back in callouts */
const unsigned char *tables; /* Pointer to character tables */
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
PCRE_UCHAR16 **mark; /* For passing back a mark pointer */
void *executable_jit; /* Pointer to compiled JIT code */
} pcre16_extra;
/* The structure for passing out data via the pcre_callout_function. We use a
structure so that new fields can be added on the end in future versions,
without changing the API of the function, thereby allowing old clients to work
@ -254,9 +352,33 @@ typedef struct pcre_callout_block {
/* ------------------- Added for Version 1 -------------------------- */
int pattern_position; /* Offset to next item in the pattern */
int next_item_length; /* Length of next item in the pattern */
/* ------------------- Added for Version 2 -------------------------- */
const unsigned char *mark; /* Pointer to current mark or NULL */
/* ------------------------------------------------------------------ */
} pcre_callout_block;
/* The 16-bit counterpart of pcre_callout_block: same structure as above, but
with 16 bit char pointers. This is the block type taken by the pcre16_callout
function. The "subject" field is a PCRE_SPTR16 and "mark" is a
const PCRE_UCHAR16 *. The "version" field identifies which of the sections
below are present; the section markers record the version in which each group
of fields was added. */
typedef struct pcre16_callout_block {
int version; /* Identifies version of block */
/* ------------------------ Version 0 ------------------------------- */
int callout_number; /* Number compiled into pattern */
int *offset_vector; /* The offset vector */
PCRE_SPTR16 subject; /* The subject being matched */
int subject_length; /* The length of the subject */
int start_match; /* Offset to start of this match attempt */
int current_position; /* Where we currently are in the subject */
int capture_top; /* Max current capture */
int capture_last; /* Most recently closed capture */
void *callout_data; /* Data passed in with the call */
/* ------------------- Added for Version 1 -------------------------- */
int pattern_position; /* Offset to next item in the pattern */
int next_item_length; /* Length of next item in the pattern */
/* ------------------- Added for Version 2 -------------------------- */
const PCRE_UCHAR16 *mark; /* Pointer to current mark or NULL */
/* ------------------------------------------------------------------ */
} pcre16_callout_block;
/* Indirection for store get and free functions. These can be set to
alternative malloc/free functions if required. Special ones are used in the
non-recursive case for "frames". There is also an optional callout function
@ -269,47 +391,114 @@ PCRE_EXP_DECL void (*pcre_free)(void *);
PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t);
PCRE_EXP_DECL void (*pcre_stack_free)(void *);
PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *);
PCRE_EXP_DECL void *(*pcre16_malloc)(size_t);
PCRE_EXP_DECL void (*pcre16_free)(void *);
PCRE_EXP_DECL void *(*pcre16_stack_malloc)(size_t);
PCRE_EXP_DECL void (*pcre16_stack_free)(void *);
PCRE_EXP_DECL int (*pcre16_callout)(pcre16_callout_block *);
#else /* VPCOMPAT */
PCRE_EXP_DECL void *pcre_malloc(size_t);
PCRE_EXP_DECL void pcre_free(void *);
PCRE_EXP_DECL void *pcre_stack_malloc(size_t);
PCRE_EXP_DECL void pcre_stack_free(void *);
PCRE_EXP_DECL int pcre_callout(pcre_callout_block *);
PCRE_EXP_DECL void *pcre16_malloc(size_t);
PCRE_EXP_DECL void pcre16_free(void *);
PCRE_EXP_DECL void *pcre16_stack_malloc(size_t);
PCRE_EXP_DECL void pcre16_stack_free(void *);
PCRE_EXP_DECL int pcre16_callout(pcre16_callout_block *);
#endif /* VPCOMPAT */
/* User defined callback which provides a stack just before the match starts. */
typedef pcre_jit_stack *(*pcre_jit_callback)(void *);
typedef pcre16_jit_stack *(*pcre16_jit_callback)(void *);
/* Exported PCRE functions */
PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
const unsigned char *);
PCRE_EXP_DECL pcre16 *pcre16_compile(PCRE_SPTR16, int, const char **, int *,
const unsigned char *);
PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **,
int *, const unsigned char *);
PCRE_EXP_DECL pcre16 *pcre16_compile2(PCRE_SPTR16, int, int *, const char **,
int *, const unsigned char *);
PCRE_EXP_DECL int pcre_config(int, void *);
PCRE_EXP_DECL int pcre16_config(int, void *);
PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *,
int *, int, const char *, char *, int);
PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *,
int);
PCRE_EXP_DECL int pcre16_copy_named_substring(const pcre16 *, PCRE_SPTR16,
int *, int, PCRE_SPTR16, PCRE_UCHAR16 *, int);
PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int,
char *, int);
PCRE_EXP_DECL int pcre16_copy_substring(PCRE_SPTR16, int *, int, int,
PCRE_UCHAR16 *, int);
PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *,
const char *, int, int, int, int *, int , int *, int);
PCRE_EXP_DECL int pcre16_dfa_exec(const pcre16 *, const pcre16_extra *,
PCRE_SPTR16, int, int, int, int *, int , int *, int);
PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR,
int, int, int, int *, int);
PCRE_EXP_DECL int pcre16_exec(const pcre16 *, const pcre16_extra *,
PCRE_SPTR16, int, int, int, int *, int);
PCRE_EXP_DECL void pcre_free_substring(const char *);
PCRE_EXP_DECL void pcre16_free_substring(PCRE_SPTR16);
PCRE_EXP_DECL void pcre_free_substring_list(const char **);
PCRE_EXP_DECL void pcre16_free_substring_list(PCRE_SPTR16 *);
PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int,
void *);
PCRE_EXP_DECL int pcre16_fullinfo(const pcre16 *, const pcre16_extra *, int,
void *);
PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *,
int *, int, const char *, const char **);
PCRE_EXP_DECL int pcre16_get_named_substring(const pcre16 *, PCRE_SPTR16,
int *, int, PCRE_SPTR16, PCRE_SPTR16 *);
PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *);
PCRE_EXP_DECL int pcre16_get_stringnumber(const pcre16 *, PCRE_SPTR16);
PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *,
char **, char **);
PCRE_EXP_DECL int pcre16_get_stringtable_entries(const pcre16 *, PCRE_SPTR16,
PCRE_UCHAR16 **, PCRE_UCHAR16 **);
PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int,
const char **);
PCRE_EXP_DECL int pcre16_get_substring(PCRE_SPTR16, int *, int, int,
PCRE_SPTR16 *);
PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
const char ***);
PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
PCRE_EXP_DECL int pcre16_get_substring_list(PCRE_SPTR16, int *, int,
PCRE_SPTR16 **);
PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
PCRE_EXP_DECL const unsigned char *pcre16_maketables(void);
PCRE_EXP_DECL int pcre_refcount(pcre *, int);
PCRE_EXP_DECL int pcre16_refcount(pcre16 *, int);
PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
PCRE_EXP_DECL pcre16_extra *pcre16_study(const pcre16 *, int, const char **);
PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
PCRE_EXP_DECL void pcre16_free_study(pcre16_extra *);
PCRE_EXP_DECL const char *pcre_version(void);
PCRE_EXP_DECL const char *pcre16_version(void);
/* Utility functions for byte order swaps. */
PCRE_EXP_DECL int pcre_pattern_to_host_byte_order(pcre *, pcre_extra *,
const unsigned char *);
PCRE_EXP_DECL int pcre16_pattern_to_host_byte_order(pcre16 *, pcre16_extra *,
const unsigned char *);
PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *,
PCRE_SPTR16, int, int *, int);
/* JIT compiler related functions. */
PCRE_EXP_DECL pcre_jit_stack *pcre_jit_stack_alloc(int, int);
PCRE_EXP_DECL pcre16_jit_stack *pcre16_jit_stack_alloc(int, int);
PCRE_EXP_DECL void pcre_jit_stack_free(pcre_jit_stack *);
PCRE_EXP_DECL void pcre16_jit_stack_free(pcre16_jit_stack *);
PCRE_EXP_DECL void pcre_assign_jit_stack(pcre_extra *,
pcre_jit_callback, void *);
PCRE_EXP_DECL void pcre16_assign_jit_stack(pcre16_extra *,
pcre16_jit_callback, void *);
#ifdef __cplusplus
} /* extern "C" */

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -43,6 +43,9 @@ POSSIBILITY OF SUCH DAMAGE.
#include "config.h"
/* Keep the original link size. */
static int real_link_size = LINK_SIZE;
#include "pcre_internal.h"
@ -60,18 +63,41 @@ Arguments:
Returns: 0 if data returned, negative on error
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_config(int what, void *where)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_config(int what, void *where)
#endif
{
switch (what)
{
case PCRE_CONFIG_UTF8:
#ifdef SUPPORT_UTF8
#if defined COMPILE_PCRE16
*((int *)where) = 0;
return PCRE_ERROR_BADOPTION;
#else
#if defined SUPPORT_UTF
*((int *)where) = 1;
#else
*((int *)where) = 0;
#endif
break;
#endif
case PCRE_CONFIG_UTF16:
#if defined COMPILE_PCRE8
*((int *)where) = 0;
return PCRE_ERROR_BADOPTION;
#else
#if defined SUPPORT_UTF
*((int *)where) = 1;
#else
*((int *)where) = 0;
#endif
break;
#endif
case PCRE_CONFIG_UNICODE_PROPERTIES:
#ifdef SUPPORT_UCP
@ -81,6 +107,22 @@ switch (what)
#endif
break;
case PCRE_CONFIG_JIT:
#ifdef SUPPORT_JIT
*((int *)where) = 1;
#else
*((int *)where) = 0;
#endif
break;
case PCRE_CONFIG_JITTARGET:
#ifdef SUPPORT_JIT
*((const char **)where) = PRIV(jit_get_target)();
#else
*((const char **)where) = NULL;
#endif
break;
case PCRE_CONFIG_NEWLINE:
*((int *)where) = NEWLINE;
break;
@ -94,7 +136,7 @@ switch (what)
break;
case PCRE_CONFIG_LINK_SIZE:
*((int *)where) = LINK_SIZE;
*((int *)where) = real_link_size;
break;
case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -63,13 +63,17 @@ Arguments:
Returns: 0 if data returned, negative on error
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
void *where)
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data,
int what, void *where)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_fullinfo(const pcre16 *argument_re, const pcre16_extra *extra_data,
int what, void *where)
#endif
{
real_pcre internal_re;
pcre_study_data internal_study;
const real_pcre *re = (const real_pcre *)argument_re;
const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
const pcre_study_data *study = NULL;
if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
@ -77,12 +81,18 @@ if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
/* Check that the first field in the block is the magic number. If it is not,
return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
means that the pattern is likely compiled with different endianness. */
if (re->magic_number != MAGIC_NUMBER)
{
re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
if (re == NULL) return PCRE_ERROR_BADMAGIC;
if (study != NULL) study = &internal_study;
}
return re->magic_number == REVERSED_MAGIC_NUMBER?
PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
/* Check that this pattern was compiled in the correct bit mode */
if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
switch (what)
{
@ -98,6 +108,18 @@ switch (what)
*((size_t *)where) = (study == NULL)? 0 : study->size;
break;
case PCRE_INFO_JITSIZE:
#ifdef SUPPORT_JIT
*((size_t *)where) =
(extra_data != NULL &&
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
extra_data->executable_jit != NULL)?
PRIV(jit_get_size)(extra_data->executable_jit) : 0;
#else
*((size_t *)where) = 0;
#endif
break;
case PCRE_INFO_CAPTURECOUNT:
*((int *)where) = re->top_bracket;
break;
@ -108,7 +130,7 @@ switch (what)
case PCRE_INFO_FIRSTBYTE:
*((int *)where) =
((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
break;
@ -116,7 +138,7 @@ switch (what)
block, not the internal copy (with flipped integer fields). */
case PCRE_INFO_FIRSTTABLE:
*((const uschar **)where) =
*((const pcre_uint8 **)where) =
(study != NULL && (study->flags & PCRE_STUDY_MAPPED) != 0)?
((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
break;
@ -124,12 +146,18 @@ switch (what)
case PCRE_INFO_MINLENGTH:
*((int *)where) =
(study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0)?
study->minlength : -1;
(int)(study->minlength) : -1;
break;
case PCRE_INFO_JIT:
*((int *)where) = extra_data != NULL &&
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
extra_data->executable_jit != NULL;
break;
case PCRE_INFO_LASTLITERAL:
*((int *)where) =
((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
((re->flags & PCRE_REQCHSET) != 0)? re->req_char : -1;
break;
case PCRE_INFO_NAMEENTRYSIZE:
@ -141,11 +169,11 @@ switch (what)
break;
case PCRE_INFO_NAMETABLE:
*((const uschar **)where) = (const uschar *)re + re->name_table_offset;
*((const pcre_uchar **)where) = (const pcre_uchar *)re + re->name_table_offset;
break;
case PCRE_INFO_DEFAULT_TABLES:
*((const uschar **)where) = (const uschar *)(_pcre_default_tables);
*((const pcre_uint8 **)where) = (const pcre_uint8 *)(PRIV(default_tables));
break;
/* From release 8.00 this will always return TRUE because NOPARTIAL is
@ -163,6 +191,10 @@ switch (what)
*((int *)where) = (re->flags & PCRE_HASCRORLF) != 0;
break;
case PCRE_INFO_MAXLOOKBEHIND:
*((int *)where) = re->max_lookbehind;
break;
default: return PCRE_ERROR_BADOPTION;
}

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -63,14 +63,20 @@ Returns: the number of the named parentheses, or a negative number
(PCRE_ERROR_NOSUBSTRING) if not found
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_get_stringnumber(const pcre *code, const char *stringname)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_get_stringnumber(const pcre16 *code, PCRE_SPTR16 stringname)
#endif
{
int rc;
int entrysize;
int top, bot;
uschar *nametable;
pcre_uchar *nametable;
#ifdef COMPILE_PCRE8
if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
return rc;
if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
@ -79,14 +85,26 @@ if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
return rc;
if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
return rc;
#endif
#ifdef COMPILE_PCRE16
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
return rc;
if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
return rc;
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
return rc;
#endif
bot = 0;
while (top > bot)
{
int mid = (top + bot) / 2;
uschar *entry = nametable + entrysize*mid;
int c = strcmp(stringname, (char *)(entry + 2));
if (c == 0) return (entry[0] << 8) + entry[1];
pcre_uchar *entry = nametable + entrysize*mid;
int c = STRCMP_UC_UC((pcre_uchar *)stringname,
(pcre_uchar *)(entry + IMM2_SIZE));
if (c == 0) return GET2(entry, 0);
if (c > 0) bot = mid + 1; else top = mid;
}
@ -112,15 +130,22 @@ Returns: the length of each entry, or a negative number
(PCRE_ERROR_NOSUBSTRING) if not found
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_get_stringtable_entries(const pcre *code, const char *stringname,
char **firstptr, char **lastptr)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_get_stringtable_entries(const pcre16 *code, PCRE_SPTR16 stringname,
PCRE_UCHAR16 **firstptr, PCRE_UCHAR16 **lastptr)
#endif
{
int rc;
int entrysize;
int top, bot;
uschar *nametable, *lastentry;
pcre_uchar *nametable, *lastentry;
#ifdef COMPILE_PCRE8
if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
return rc;
if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
@ -129,30 +154,49 @@ if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
return rc;
if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
return rc;
#endif
#ifdef COMPILE_PCRE16
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
return rc;
if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
return rc;
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
return rc;
#endif
lastentry = nametable + entrysize * (top - 1);
bot = 0;
while (top > bot)
{
int mid = (top + bot) / 2;
uschar *entry = nametable + entrysize*mid;
int c = strcmp(stringname, (char *)(entry + 2));
pcre_uchar *entry = nametable + entrysize*mid;
int c = STRCMP_UC_UC((pcre_uchar *)stringname,
(pcre_uchar *)(entry + IMM2_SIZE));
if (c == 0)
{
uschar *first = entry;
uschar *last = entry;
pcre_uchar *first = entry;
pcre_uchar *last = entry;
while (first > nametable)
{
if (strcmp(stringname, (char *)(first - entrysize + 2)) != 0) break;
if (STRCMP_UC_UC((pcre_uchar *)stringname,
(pcre_uchar *)(first - entrysize + IMM2_SIZE)) != 0) break;
first -= entrysize;
}
while (last < lastentry)
{
if (strcmp(stringname, (char *)(last + entrysize + 2)) != 0) break;
if (STRCMP_UC_UC((pcre_uchar *)stringname,
(pcre_uchar *)(last + entrysize + IMM2_SIZE)) != 0) break;
last += entrysize;
}
#ifdef COMPILE_PCRE8
*firstptr = (char *)first;
*lastptr = (char *)last;
#else
*firstptr = (PCRE_UCHAR16 *)first;
*lastptr = (PCRE_UCHAR16 *)last;
#endif
return entrysize;
}
if (c > 0) bot = mid + 1; else top = mid;
@ -180,23 +224,39 @@ Returns: the number of the first that is set,
or a negative number on error
*/
#ifdef COMPILE_PCRE8
static int
get_first_set(const pcre *code, const char *stringname, int *ovector)
#else
static int
get_first_set(const pcre16 *code, PCRE_SPTR16 stringname, int *ovector)
#endif
{
const real_pcre *re = (const real_pcre *)code;
const REAL_PCRE *re = (const REAL_PCRE *)code;
int entrysize;
pcre_uchar *entry;
#ifdef COMPILE_PCRE8
char *first, *last;
uschar *entry;
#else
PCRE_UCHAR16 *first, *last;
#endif
#ifdef COMPILE_PCRE8
if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0)
return pcre_get_stringnumber(code, stringname);
entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last);
#else
if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0)
return pcre16_get_stringnumber(code, stringname);
entrysize = pcre16_get_stringtable_entries(code, stringname, &first, &last);
#endif
if (entrysize <= 0) return entrysize;
for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize)
for (entry = (pcre_uchar *)first; entry <= (pcre_uchar *)last; entry += entrysize)
{
int n = (entry[0] << 8) + entry[1];
int n = GET2(entry, 0);
if (ovector[n*2] >= 0) return n;
}
return (first[0] << 8) + first[1];
return GET2(entry, 0);
}
@ -229,9 +289,15 @@ Returns: if successful:
PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_copy_substring(const char *subject, int *ovector, int stringcount,
int stringnumber, char *buffer, int size)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_copy_substring(PCRE_SPTR16 subject, int *ovector, int stringcount,
int stringnumber, PCRE_UCHAR16 *buffer, int size)
#endif
{
int yield;
if (stringnumber < 0 || stringnumber >= stringcount)
@ -239,7 +305,7 @@ if (stringnumber < 0 || stringnumber >= stringcount)
stringnumber *= 2;
yield = ovector[stringnumber+1] - ovector[stringnumber];
if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
memcpy(buffer, subject + ovector[stringnumber], yield);
memcpy(buffer, subject + ovector[stringnumber], IN_UCHARS(yield));
buffer[yield] = 0;
return yield;
}
@ -274,13 +340,25 @@ Returns: if successful:
PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector,
int stringcount, const char *stringname, char *buffer, int size)
pcre_copy_named_substring(const pcre *code, const char *subject,
int *ovector, int stringcount, const char *stringname,
char *buffer, int size)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_copy_named_substring(const pcre16 *code, PCRE_SPTR16 subject,
int *ovector, int stringcount, PCRE_SPTR16 stringname,
PCRE_UCHAR16 *buffer, int size)
#endif
{
int n = get_first_set(code, stringname, ovector);
if (n <= 0) return n;
#ifdef COMPILE_PCRE8
return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size);
#else
return pcre16_copy_substring(subject, ovector, stringcount, n, buffer, size);
#endif
}
@ -306,29 +384,39 @@ Returns: if successful: 0
PCRE_ERROR_NOMEMORY (-6) failed to get store
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
const char ***listptr)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_get_substring_list(PCRE_SPTR16 subject, int *ovector, int stringcount,
PCRE_SPTR16 **listptr)
#endif
{
int i;
int size = sizeof(char *);
int size = sizeof(pcre_uchar *);
int double_count = stringcount * 2;
char **stringlist;
char *p;
pcre_uchar **stringlist;
pcre_uchar *p;
for (i = 0; i < double_count; i += 2)
size += sizeof(char *) + ovector[i+1] - ovector[i] + 1;
size += sizeof(pcre_uchar *) + IN_UCHARS(ovector[i+1] - ovector[i] + 1);
stringlist = (char **)(pcre_malloc)(size);
stringlist = (pcre_uchar **)(PUBL(malloc))(size);
if (stringlist == NULL) return PCRE_ERROR_NOMEMORY;
#ifdef COMPILE_PCRE8
*listptr = (const char **)stringlist;
p = (char *)(stringlist + stringcount + 1);
#else
*listptr = (PCRE_SPTR16 *)stringlist;
#endif
p = (pcre_uchar *)(stringlist + stringcount + 1);
for (i = 0; i < double_count; i += 2)
{
int len = ovector[i+1] - ovector[i];
memcpy(p, subject + ovector[i], len);
memcpy(p, subject + ovector[i], IN_UCHARS(len));
*stringlist++ = p;
p += len;
*p++ = 0;
@ -345,16 +433,22 @@ return 0;
*************************************************/
/* This function exists for the benefit of people calling PCRE from non-C
programs that can call its functions, but not free() or (pcre_free)() directly.
programs that can call its functions, but not free() or (PUBL(free))()
directly.
Argument: the result of a previous pcre_get_substring_list()
Returns: nothing
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
pcre_free_substring_list(const char **pointer)
#else
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
pcre16_free_substring_list(PCRE_SPTR16 *pointer)
#endif
{
(pcre_free)((void *)pointer);
(PUBL(free))((void *)pointer);
}
@ -384,21 +478,31 @@ Returns: if successful:
PCRE_ERROR_NOSUBSTRING (-7) substring not present
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_get_substring(const char *subject, int *ovector, int stringcount,
int stringnumber, const char **stringptr)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_get_substring(PCRE_SPTR16 subject, int *ovector, int stringcount,
int stringnumber, PCRE_SPTR16 *stringptr)
#endif
{
int yield;
char *substring;
pcre_uchar *substring;
if (stringnumber < 0 || stringnumber >= stringcount)
return PCRE_ERROR_NOSUBSTRING;
stringnumber *= 2;
yield = ovector[stringnumber+1] - ovector[stringnumber];
substring = (char *)(pcre_malloc)(yield + 1);
substring = (pcre_uchar *)(PUBL(malloc))(IN_UCHARS(yield + 1));
if (substring == NULL) return PCRE_ERROR_NOMEMORY;
memcpy(substring, subject + ovector[stringnumber], yield);
memcpy(substring, subject + ovector[stringnumber], IN_UCHARS(yield));
substring[yield] = 0;
*stringptr = substring;
#ifdef COMPILE_PCRE8
*stringptr = (const char *)substring;
#else
*stringptr = (PCRE_SPTR16)substring;
#endif
return yield;
}
@ -431,13 +535,25 @@ Returns: if successful:
PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
int stringcount, const char *stringname, const char **stringptr)
pcre_get_named_substring(const pcre *code, const char *subject,
int *ovector, int stringcount, const char *stringname,
const char **stringptr)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_get_named_substring(const pcre16 *code, PCRE_SPTR16 subject,
int *ovector, int stringcount, PCRE_SPTR16 stringname,
PCRE_SPTR16 *stringptr)
#endif
{
int n = get_first_set(code, stringname, ovector);
if (n <= 0) return n;
#ifdef COMPILE_PCRE8
return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
#else
return pcre16_get_substring(subject, ovector, stringcount, n, stringptr);
#endif
}
@ -448,16 +564,22 @@ return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
*************************************************/
/* This function exists for the benefit of people calling PCRE from non-C
programs that can call its functions, but not free() or (pcre_free)() directly.
programs that can call its functions, but not free() or (PUBL(free))()
directly.
Argument: the result of a previous pcre_get_substring()
Returns: nothing
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
pcre_free_substring(const char *pointer)
#else
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
pcre16_free_substring(PCRE_SPTR16 pointer)
#endif
{
(pcre_free)((void *)pointer);
(PUBL(free))((void *)pointer);
}
/* End of pcre_get.c */

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -65,18 +65,18 @@ static void LocalPcreFree(void* aPtr)
{
free(aPtr);
}
PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = LocalPcreMalloc;
PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = LocalPcreFree;
PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = LocalPcreMalloc;
PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = LocalPcreFree;
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
PCRE_EXP_DATA_DEFN void *(*PUBL(malloc))(size_t) = LocalPcreMalloc;
PCRE_EXP_DATA_DEFN void (*PUBL(free))(void *) = LocalPcreFree;
PCRE_EXP_DATA_DEFN void *(*PUBL(stack_malloc))(size_t) = LocalPcreMalloc;
PCRE_EXP_DATA_DEFN void (*PUBL(stack_free))(void *) = LocalPcreFree;
PCRE_EXP_DATA_DEFN int (*PUBL(callout))(PUBL(callout_block) *) = NULL;
#elif !defined VPCOMPAT
PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = malloc;
PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = free;
PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = malloc;
PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = free;
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
PCRE_EXP_DATA_DEFN void *(*PUBL(malloc))(size_t) = malloc;
PCRE_EXP_DATA_DEFN void (*PUBL(free))(void *) = free;
PCRE_EXP_DATA_DEFN void *(*PUBL(stack_malloc))(size_t) = malloc;
PCRE_EXP_DATA_DEFN void (*PUBL(stack_free))(void *) = free;
PCRE_EXP_DATA_DEFN int (*PUBL(callout))(PUBL(callout_block) *) = NULL;
#endif
/* End of pcre_globals.c */

View File

@ -1,91 +0,0 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains the external function pcre_info(), which gives some
information about a compiled pattern. However, use of this function is now
deprecated, as it has been superseded by pcre_fullinfo(). */
#include "config.h"
#include "pcre_internal.h"
/*************************************************
* (Obsolete) Return info about compiled pattern *
*************************************************/
/* This is the original "info" function. It picks potentially useful data out
of the private structure, but its interface was too rigid. It remains for
backwards compatibility. The public options are passed back in an int - though
the re->options field has been expanded to a long int, all the public options
are at the low end of it, and so even on 16-bit systems this will still be OK.
Therefore, I haven't changed the API for pcre_info().
Arguments:
argument_re points to compiled code
optptr where to pass back the options
first_byte where to pass back the first character,
or -1 if multiline and all branches start ^,
or -2 otherwise
Returns: number of capturing subpatterns
or negative values on error
*/
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
{
real_pcre internal_re;
const real_pcre *re = (const real_pcre *)argument_re;
if (re == NULL) return PCRE_ERROR_NULL;
if (re->magic_number != MAGIC_NUMBER)
{
re = _pcre_try_flipped(re, &internal_re, NULL, NULL);
if (re == NULL) return PCRE_ERROR_BADMAGIC;
}
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
if (first_byte != NULL)
*first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
return re->top_bracket;
}
/* End of pcre_info.c */

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -57,21 +57,26 @@ compilation of dftables.c, in which case the macro DFTABLES is defined. */
/* This function builds a set of character tables for use by PCRE and returns
a pointer to them. They are built using the ctype functions, and consequently
their contents will depend upon the current locale setting. When compiled as
part of the library, the store is obtained via pcre_malloc(), but when compiled
inside dftables, use malloc().
part of the library, the store is obtained via PUBL(malloc)(), but when
compiled inside dftables, use malloc().
Arguments: none
Returns: pointer to the contiguous block of data
*/
#ifdef COMPILE_PCRE8
const unsigned char *
pcre_maketables(void)
#else
const unsigned char *
pcre16_maketables(void)
#endif
{
unsigned char *yield, *p;
int i;
#ifndef DFTABLES
yield = (unsigned char*)(pcre_malloc)(tables_length);
yield = (unsigned char*)(PUBL(malloc))(tables_length);
#else
yield = (unsigned char*)malloc(tables_length);
#endif

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -65,16 +65,25 @@ Arguments:
type the newline type
endptr pointer to the end of the string
lenptr where to return the length
utf8 TRUE if in utf8 mode
utf TRUE if in utf mode
Returns: TRUE or FALSE
*/
BOOL
_pcre_is_newline(USPTR ptr, int type, USPTR endptr, int *lenptr, BOOL utf8)
PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr,
BOOL utf)
{
int c;
if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
(void)utf;
#ifdef SUPPORT_UTF
if (utf)
{
GETCHAR(c, ptr);
}
else
#endif /* SUPPORT_UTF */
c = *ptr;
if (type == NLTYPE_ANYCRLF) switch(c)
{
@ -93,9 +102,15 @@ else switch(c)
case 0x000c: *lenptr = 1; return TRUE; /* FF */
case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
return TRUE; /* CR */
case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
#ifdef COMPILE_PCRE8
case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 3; return TRUE; /* PS */
#else
case 0x0085: /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 1; return TRUE; /* PS */
#endif /* COMPILE_PCRE8 */
default: return FALSE;
}
}
@ -114,26 +129,27 @@ Arguments:
type the newline type
startptr pointer to the start of the string
lenptr where to return the length
utf8 TRUE if in utf8 mode
utf TRUE if in utf mode
Returns: TRUE or FALSE
*/
BOOL
_pcre_was_newline(USPTR ptr, int type, USPTR startptr, int *lenptr, BOOL utf8)
PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr,
BOOL utf)
{
int c;
(void)utf;
ptr--;
#ifdef SUPPORT_UTF8
if (utf8)
#ifdef SUPPORT_UTF
if (utf)
{
BACKCHAR(ptr);
GETCHAR(c, ptr);
}
else c = *ptr;
#else /* no UTF-8 support */
c = *ptr;
#endif /* SUPPORT_UTF8 */
else
#endif /* SUPPORT_UTF */
c = *ptr;
if (type == NLTYPE_ANYCRLF) switch(c)
{
@ -150,9 +166,15 @@ else switch(c)
case 0x000b: /* VT */
case 0x000c: /* FF */
case 0x000d: *lenptr = 1; return TRUE; /* CR */
case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
#ifdef COMPILE_PCRE8
case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 3; return TRUE; /* PS */
#else
case 0x0085: /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 1; return TRUE; /* PS */
#endif /* COMPILE_PCRE8 */
default: return FALSE;
}
}

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -50,35 +50,45 @@ character value into a UTF8 string. */
* Convert character value to UTF-8 *
*************************************************/
/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 0 to 6 bytes.
/* This function takes an integer value in the range 0 - 0x10ffff
and encodes it as a UTF-8 character in 1 to 6 pcre_uchars.
Arguments:
cvalue the character value
buffer pointer to buffer for result - at least 6 bytes long
buffer pointer to buffer for result - at least 6 pcre_uchars long
Returns: number of characters placed in the buffer
*/
int
_pcre_ord2utf8(int cvalue, uschar *buffer)
PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer)
{
#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UTF
register int i, j;
for (i = 0; i < _pcre_utf8_table1_size; i++)
if (cvalue <= _pcre_utf8_table1[i]) break;
/* Checking invalid cvalue character, encoded as invalid UTF-16 character.
Should never happen in practice. */
if ((cvalue & 0xf800) == 0xd800 || cvalue >= 0x110000)
cvalue = 0xfffe;
for (i = 0; i < PRIV(utf8_table1_size); i++)
if ((int)cvalue <= PRIV(utf8_table1)[i]) break;
buffer += i;
for (j = i; j > 0; j--)
{
*buffer-- = 0x80 | (cvalue & 0x3f);
cvalue >>= 6;
}
*buffer = _pcre_utf8_table2[i] | cvalue;
*buffer = PRIV(utf8_table2)[i] | cvalue;
return i + 1;
#else
(void)(cvalue); /* Keep compiler happy; this function won't ever be */
(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
(void)(buffer); /* called when SUPPORT_UTF is not defined. */
return 0;
#endif
}

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -66,11 +66,18 @@ Returns: the (possibly updated) count value (a non-negative number), or
a negative error number
*/
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_refcount(pcre *argument_re, int adjust)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_refcount(pcre16 *argument_re, int adjust)
#endif
{
real_pcre *re = (real_pcre *)argument_re;
REAL_PCRE *re = (REAL_PCRE *)argument_re;
if (re == NULL) return PCRE_ERROR_NULL;
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
re->ref_count = (-adjust > re->ref_count)? 0 :
(adjust + re->ref_count > 65535)? 65535 :
re->ref_count + adjust;

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -37,6 +37,7 @@ POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
#ifndef PCRE_INCLUDED
/* This module contains some fixed tables that are used by more than one of the
PCRE code modules. The tables are also #included by the pcretest program, which
@ -48,11 +49,12 @@ clashes with the library. */
#include "pcre_internal.h"
#endif /* PCRE_INCLUDED */
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
the definition is next to the definition of the opcodes in pcre_internal.h. */
const uschar _pcre_OP_lengths[] = { OP_LENGTHS };
const pcre_uint8 PRIV(OP_lengths)[] = { OP_LENGTHS };
@ -63,31 +65,38 @@ const uschar _pcre_OP_lengths[] = { OP_LENGTHS };
/* These are the breakpoints for different numbers of bytes in a UTF-8
character. */
#ifdef SUPPORT_UTF8
#if (defined SUPPORT_UTF && defined COMPILE_PCRE8) \
|| (defined PCRE_INCLUDED && defined SUPPORT_PCRE16)
const int _pcre_utf8_table1[] =
/* These tables are also required by pcretest in 16 bit mode. */
const int PRIV(utf8_table1)[] =
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
const int _pcre_utf8_table1_size = sizeof(_pcre_utf8_table1)/sizeof(int);
const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int);
/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes. */
const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
const int PRIV(utf8_table2)[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/* Table of the number of extra bytes, indexed by the first byte masked with
0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
const uschar _pcre_utf8_table4[] = {
const pcre_uint8 PRIV(utf8_table4)[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
#endif /* (SUPPORT_UTF && COMPILE_PCRE8) || (PCRE_INCLUDED && SUPPORT_PCRE16)*/
#ifdef SUPPORT_UTF
/* Table to translate from particular type value to the general value. */
const int _pcre_ucp_gentype[] = {
const int PRIV(ucp_gentype)[] = {
ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
@ -98,6 +107,21 @@ const int _pcre_ucp_gentype[] = {
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
};
#ifdef SUPPORT_JIT
/* This table reverses PRIV(ucp_gentype). It holds one pair of particular type
values for each general category, in the order C, L, M, N, P, S, Z, and is only
compiled when SUPPORT_JIT is defined. Going by the names, each pair looks like
the first and last particular type of its category. NOTE(review): that reading
relies on the ordering of the ucp_* values, which are defined elsewhere -
confirm before depending on it. We can save the cost
of a memory load. */
const int PRIV(ucp_typerange)[] = {
ucp_Cc, ucp_Cs,
ucp_Ll, ucp_Lu,
ucp_Mc, ucp_Mn,
ucp_Nd, ucp_No,
ucp_Pc, ucp_Ps,
ucp_Sc, ucp_So,
ucp_Zl, ucp_Zs,
};
#endif /* SUPPORT_JIT */
/* The pcre_utt[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
@ -108,7 +132,7 @@ table itself. Maintenance is more error-prone, but frequent changes to this
data are unlikely.
July 2008: There is now a script called maint/GenerateUtt.py that can be used
to generate this data instead of maintaining it entirely by hand.
to generate this data automatically instead of maintaining it by hand.
The script was updated in March 2009 to generate a new EBCDIC-compliant
version. Like all other character and string literals that are compared against
@ -121,8 +145,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
@ -131,6 +157,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Carian0 STR_C STR_a STR_r STR_i STR_a STR_n "\0"
#define STRING_Cc0 STR_C STR_c "\0"
#define STRING_Cf0 STR_C STR_f "\0"
#define STRING_Chakma0 STR_C STR_h STR_a STR_k STR_m STR_a "\0"
#define STRING_Cham0 STR_C STR_h STR_a STR_m "\0"
#define STRING_Cherokee0 STR_C STR_h STR_e STR_r STR_o STR_k STR_e STR_e "\0"
#define STRING_Cn0 STR_C STR_n "\0"
@ -184,9 +211,13 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
#define STRING_M0 STR_M "\0"
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
#define STRING_Mc0 STR_M STR_c "\0"
#define STRING_Me0 STR_M STR_e "\0"
#define STRING_Meetei_Mayek0 STR_M STR_e STR_e STR_t STR_e STR_i STR_UNDERSCORE STR_M STR_a STR_y STR_e STR_k "\0"
#define STRING_Meroitic_Cursive0 STR_M STR_e STR_r STR_o STR_i STR_t STR_i STR_c STR_UNDERSCORE STR_C STR_u STR_r STR_s STR_i STR_v STR_e "\0"
#define STRING_Meroitic_Hieroglyphs0 STR_M STR_e STR_r STR_o STR_i STR_t STR_i STR_c STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
#define STRING_Miao0 STR_M STR_i STR_a STR_o "\0"
#define STRING_Mn0 STR_M STR_n "\0"
#define STRING_Mongolian0 STR_M STR_o STR_n STR_g STR_o STR_l STR_i STR_a STR_n "\0"
#define STRING_Myanmar0 STR_M STR_y STR_a STR_n STR_m STR_a STR_r "\0"
@ -220,11 +251,13 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Samaritan0 STR_S STR_a STR_m STR_a STR_r STR_i STR_t STR_a STR_n "\0"
#define STRING_Saurashtra0 STR_S STR_a STR_u STR_r STR_a STR_s STR_h STR_t STR_r STR_a "\0"
#define STRING_Sc0 STR_S STR_c "\0"
#define STRING_Sharada0 STR_S STR_h STR_a STR_r STR_a STR_d STR_a "\0"
#define STRING_Shavian0 STR_S STR_h STR_a STR_v STR_i STR_a STR_n "\0"
#define STRING_Sinhala0 STR_S STR_i STR_n STR_h STR_a STR_l STR_a "\0"
#define STRING_Sk0 STR_S STR_k "\0"
#define STRING_Sm0 STR_S STR_m "\0"
#define STRING_So0 STR_S STR_o "\0"
#define STRING_Sora_Sompeng0 STR_S STR_o STR_r STR_a STR_UNDERSCORE STR_S STR_o STR_m STR_p STR_e STR_n STR_g "\0"
#define STRING_Sundanese0 STR_S STR_u STR_n STR_d STR_a STR_n STR_e STR_s STR_e "\0"
#define STRING_Syloti_Nagri0 STR_S STR_y STR_l STR_o STR_t STR_i STR_UNDERSCORE STR_N STR_a STR_g STR_r STR_i "\0"
#define STRING_Syriac0 STR_S STR_y STR_r STR_i STR_a STR_c "\0"
@ -233,6 +266,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Tai_Le0 STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_e "\0"
#define STRING_Tai_Tham0 STR_T STR_a STR_i STR_UNDERSCORE STR_T STR_h STR_a STR_m "\0"
#define STRING_Tai_Viet0 STR_T STR_a STR_i STR_UNDERSCORE STR_V STR_i STR_e STR_t "\0"
#define STRING_Takri0 STR_T STR_a STR_k STR_r STR_i "\0"
#define STRING_Tamil0 STR_T STR_a STR_m STR_i STR_l "\0"
#define STRING_Telugu0 STR_T STR_e STR_l STR_u STR_g STR_u "\0"
#define STRING_Thaana0 STR_T STR_h STR_a STR_a STR_n STR_a "\0"
@ -251,15 +285,17 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Zp0 STR_Z STR_p "\0"
#define STRING_Zs0 STR_Z STR_s "\0"
const char _pcre_utt_names[] =
const char PRIV(utt_names)[] =
STRING_Any0
STRING_Arabic0
STRING_Armenian0
STRING_Avestan0
STRING_Balinese0
STRING_Bamum0
STRING_Batak0
STRING_Bengali0
STRING_Bopomofo0
STRING_Brahmi0
STRING_Braille0
STRING_Buginese0
STRING_Buhid0
@ -268,6 +304,7 @@ const char _pcre_utt_names[] =
STRING_Carian0
STRING_Cc0
STRING_Cf0
STRING_Chakma0
STRING_Cham0
STRING_Cherokee0
STRING_Cn0
@ -321,9 +358,13 @@ const char _pcre_utt_names[] =
STRING_Lydian0
STRING_M0
STRING_Malayalam0
STRING_Mandaic0
STRING_Mc0
STRING_Me0
STRING_Meetei_Mayek0
STRING_Meroitic_Cursive0
STRING_Meroitic_Hieroglyphs0
STRING_Miao0
STRING_Mn0
STRING_Mongolian0
STRING_Myanmar0
@ -357,11 +398,13 @@ const char _pcre_utt_names[] =
STRING_Samaritan0
STRING_Saurashtra0
STRING_Sc0
STRING_Sharada0
STRING_Shavian0
STRING_Sinhala0
STRING_Sk0
STRING_Sm0
STRING_So0
STRING_Sora_Sompeng0
STRING_Sundanese0
STRING_Syloti_Nagri0
STRING_Syriac0
@ -370,6 +413,7 @@ const char _pcre_utt_names[] =
STRING_Tai_Le0
STRING_Tai_Tham0
STRING_Tai_Viet0
STRING_Takri0
STRING_Tamil0
STRING_Telugu0
STRING_Thaana0
@ -388,146 +432,156 @@ const char _pcre_utt_names[] =
STRING_Zp0
STRING_Zs0;
const ucp_type_table _pcre_utt[] = {
const ucp_type_table PRIV(utt)[] = {
{ 0, PT_ANY, 0 },
{ 4, PT_SC, ucp_Arabic },
{ 11, PT_SC, ucp_Armenian },
{ 20, PT_SC, ucp_Avestan },
{ 28, PT_SC, ucp_Balinese },
{ 37, PT_SC, ucp_Bamum },
{ 43, PT_SC, ucp_Bengali },
{ 51, PT_SC, ucp_Bopomofo },
{ 60, PT_SC, ucp_Braille },
{ 68, PT_SC, ucp_Buginese },
{ 77, PT_SC, ucp_Buhid },
{ 83, PT_GC, ucp_C },
{ 85, PT_SC, ucp_Canadian_Aboriginal },
{ 105, PT_SC, ucp_Carian },
{ 112, PT_PC, ucp_Cc },
{ 115, PT_PC, ucp_Cf },
{ 118, PT_SC, ucp_Cham },
{ 123, PT_SC, ucp_Cherokee },
{ 132, PT_PC, ucp_Cn },
{ 135, PT_PC, ucp_Co },
{ 138, PT_SC, ucp_Common },
{ 145, PT_SC, ucp_Coptic },
{ 152, PT_PC, ucp_Cs },
{ 155, PT_SC, ucp_Cuneiform },
{ 165, PT_SC, ucp_Cypriot },
{ 173, PT_SC, ucp_Cyrillic },
{ 182, PT_SC, ucp_Deseret },
{ 190, PT_SC, ucp_Devanagari },
{ 201, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 222, PT_SC, ucp_Ethiopic },
{ 231, PT_SC, ucp_Georgian },
{ 240, PT_SC, ucp_Glagolitic },
{ 251, PT_SC, ucp_Gothic },
{ 258, PT_SC, ucp_Greek },
{ 264, PT_SC, ucp_Gujarati },
{ 273, PT_SC, ucp_Gurmukhi },
{ 282, PT_SC, ucp_Han },
{ 286, PT_SC, ucp_Hangul },
{ 293, PT_SC, ucp_Hanunoo },
{ 301, PT_SC, ucp_Hebrew },
{ 308, PT_SC, ucp_Hiragana },
{ 317, PT_SC, ucp_Imperial_Aramaic },
{ 334, PT_SC, ucp_Inherited },
{ 344, PT_SC, ucp_Inscriptional_Pahlavi },
{ 366, PT_SC, ucp_Inscriptional_Parthian },
{ 389, PT_SC, ucp_Javanese },
{ 398, PT_SC, ucp_Kaithi },
{ 405, PT_SC, ucp_Kannada },
{ 413, PT_SC, ucp_Katakana },
{ 422, PT_SC, ucp_Kayah_Li },
{ 431, PT_SC, ucp_Kharoshthi },
{ 442, PT_SC, ucp_Khmer },
{ 448, PT_GC, ucp_L },
{ 450, PT_LAMP, 0 },
{ 453, PT_SC, ucp_Lao },
{ 457, PT_SC, ucp_Latin },
{ 463, PT_SC, ucp_Lepcha },
{ 470, PT_SC, ucp_Limbu },
{ 476, PT_SC, ucp_Linear_B },
{ 485, PT_SC, ucp_Lisu },
{ 490, PT_PC, ucp_Ll },
{ 493, PT_PC, ucp_Lm },
{ 496, PT_PC, ucp_Lo },
{ 499, PT_PC, ucp_Lt },
{ 502, PT_PC, ucp_Lu },
{ 505, PT_SC, ucp_Lycian },
{ 512, PT_SC, ucp_Lydian },
{ 519, PT_GC, ucp_M },
{ 521, PT_SC, ucp_Malayalam },
{ 531, PT_PC, ucp_Mc },
{ 534, PT_PC, ucp_Me },
{ 537, PT_SC, ucp_Meetei_Mayek },
{ 550, PT_PC, ucp_Mn },
{ 553, PT_SC, ucp_Mongolian },
{ 563, PT_SC, ucp_Myanmar },
{ 571, PT_GC, ucp_N },
{ 573, PT_PC, ucp_Nd },
{ 576, PT_SC, ucp_New_Tai_Lue },
{ 588, PT_SC, ucp_Nko },
{ 592, PT_PC, ucp_Nl },
{ 595, PT_PC, ucp_No },
{ 598, PT_SC, ucp_Ogham },
{ 604, PT_SC, ucp_Ol_Chiki },
{ 613, PT_SC, ucp_Old_Italic },
{ 624, PT_SC, ucp_Old_Persian },
{ 636, PT_SC, ucp_Old_South_Arabian },
{ 654, PT_SC, ucp_Old_Turkic },
{ 665, PT_SC, ucp_Oriya },
{ 671, PT_SC, ucp_Osmanya },
{ 679, PT_GC, ucp_P },
{ 681, PT_PC, ucp_Pc },
{ 684, PT_PC, ucp_Pd },
{ 687, PT_PC, ucp_Pe },
{ 690, PT_PC, ucp_Pf },
{ 693, PT_SC, ucp_Phags_Pa },
{ 702, PT_SC, ucp_Phoenician },
{ 713, PT_PC, ucp_Pi },
{ 716, PT_PC, ucp_Po },
{ 719, PT_PC, ucp_Ps },
{ 722, PT_SC, ucp_Rejang },
{ 729, PT_SC, ucp_Runic },
{ 735, PT_GC, ucp_S },
{ 737, PT_SC, ucp_Samaritan },
{ 747, PT_SC, ucp_Saurashtra },
{ 758, PT_PC, ucp_Sc },
{ 761, PT_SC, ucp_Shavian },
{ 769, PT_SC, ucp_Sinhala },
{ 777, PT_PC, ucp_Sk },
{ 780, PT_PC, ucp_Sm },
{ 783, PT_PC, ucp_So },
{ 786, PT_SC, ucp_Sundanese },
{ 796, PT_SC, ucp_Syloti_Nagri },
{ 809, PT_SC, ucp_Syriac },
{ 816, PT_SC, ucp_Tagalog },
{ 824, PT_SC, ucp_Tagbanwa },
{ 833, PT_SC, ucp_Tai_Le },
{ 840, PT_SC, ucp_Tai_Tham },
{ 849, PT_SC, ucp_Tai_Viet },
{ 858, PT_SC, ucp_Tamil },
{ 864, PT_SC, ucp_Telugu },
{ 871, PT_SC, ucp_Thaana },
{ 878, PT_SC, ucp_Thai },
{ 883, PT_SC, ucp_Tibetan },
{ 891, PT_SC, ucp_Tifinagh },
{ 900, PT_SC, ucp_Ugaritic },
{ 909, PT_SC, ucp_Vai },
{ 913, PT_ALNUM, 0 },
{ 917, PT_PXSPACE, 0 },
{ 921, PT_SPACE, 0 },
{ 925, PT_WORD, 0 },
{ 929, PT_SC, ucp_Yi },
{ 932, PT_GC, ucp_Z },
{ 934, PT_PC, ucp_Zl },
{ 937, PT_PC, ucp_Zp },
{ 940, PT_PC, ucp_Zs }
{ 43, PT_SC, ucp_Batak },
{ 49, PT_SC, ucp_Bengali },
{ 57, PT_SC, ucp_Bopomofo },
{ 66, PT_SC, ucp_Brahmi },
{ 73, PT_SC, ucp_Braille },
{ 81, PT_SC, ucp_Buginese },
{ 90, PT_SC, ucp_Buhid },
{ 96, PT_GC, ucp_C },
{ 98, PT_SC, ucp_Canadian_Aboriginal },
{ 118, PT_SC, ucp_Carian },
{ 125, PT_PC, ucp_Cc },
{ 128, PT_PC, ucp_Cf },
{ 131, PT_SC, ucp_Chakma },
{ 138, PT_SC, ucp_Cham },
{ 143, PT_SC, ucp_Cherokee },
{ 152, PT_PC, ucp_Cn },
{ 155, PT_PC, ucp_Co },
{ 158, PT_SC, ucp_Common },
{ 165, PT_SC, ucp_Coptic },
{ 172, PT_PC, ucp_Cs },
{ 175, PT_SC, ucp_Cuneiform },
{ 185, PT_SC, ucp_Cypriot },
{ 193, PT_SC, ucp_Cyrillic },
{ 202, PT_SC, ucp_Deseret },
{ 210, PT_SC, ucp_Devanagari },
{ 221, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 242, PT_SC, ucp_Ethiopic },
{ 251, PT_SC, ucp_Georgian },
{ 260, PT_SC, ucp_Glagolitic },
{ 271, PT_SC, ucp_Gothic },
{ 278, PT_SC, ucp_Greek },
{ 284, PT_SC, ucp_Gujarati },
{ 293, PT_SC, ucp_Gurmukhi },
{ 302, PT_SC, ucp_Han },
{ 306, PT_SC, ucp_Hangul },
{ 313, PT_SC, ucp_Hanunoo },
{ 321, PT_SC, ucp_Hebrew },
{ 328, PT_SC, ucp_Hiragana },
{ 337, PT_SC, ucp_Imperial_Aramaic },
{ 354, PT_SC, ucp_Inherited },
{ 364, PT_SC, ucp_Inscriptional_Pahlavi },
{ 386, PT_SC, ucp_Inscriptional_Parthian },
{ 409, PT_SC, ucp_Javanese },
{ 418, PT_SC, ucp_Kaithi },
{ 425, PT_SC, ucp_Kannada },
{ 433, PT_SC, ucp_Katakana },
{ 442, PT_SC, ucp_Kayah_Li },
{ 451, PT_SC, ucp_Kharoshthi },
{ 462, PT_SC, ucp_Khmer },
{ 468, PT_GC, ucp_L },
{ 470, PT_LAMP, 0 },
{ 473, PT_SC, ucp_Lao },
{ 477, PT_SC, ucp_Latin },
{ 483, PT_SC, ucp_Lepcha },
{ 490, PT_SC, ucp_Limbu },
{ 496, PT_SC, ucp_Linear_B },
{ 505, PT_SC, ucp_Lisu },
{ 510, PT_PC, ucp_Ll },
{ 513, PT_PC, ucp_Lm },
{ 516, PT_PC, ucp_Lo },
{ 519, PT_PC, ucp_Lt },
{ 522, PT_PC, ucp_Lu },
{ 525, PT_SC, ucp_Lycian },
{ 532, PT_SC, ucp_Lydian },
{ 539, PT_GC, ucp_M },
{ 541, PT_SC, ucp_Malayalam },
{ 551, PT_SC, ucp_Mandaic },
{ 559, PT_PC, ucp_Mc },
{ 562, PT_PC, ucp_Me },
{ 565, PT_SC, ucp_Meetei_Mayek },
{ 578, PT_SC, ucp_Meroitic_Cursive },
{ 595, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 616, PT_SC, ucp_Miao },
{ 621, PT_PC, ucp_Mn },
{ 624, PT_SC, ucp_Mongolian },
{ 634, PT_SC, ucp_Myanmar },
{ 642, PT_GC, ucp_N },
{ 644, PT_PC, ucp_Nd },
{ 647, PT_SC, ucp_New_Tai_Lue },
{ 659, PT_SC, ucp_Nko },
{ 663, PT_PC, ucp_Nl },
{ 666, PT_PC, ucp_No },
{ 669, PT_SC, ucp_Ogham },
{ 675, PT_SC, ucp_Ol_Chiki },
{ 684, PT_SC, ucp_Old_Italic },
{ 695, PT_SC, ucp_Old_Persian },
{ 707, PT_SC, ucp_Old_South_Arabian },
{ 725, PT_SC, ucp_Old_Turkic },
{ 736, PT_SC, ucp_Oriya },
{ 742, PT_SC, ucp_Osmanya },
{ 750, PT_GC, ucp_P },
{ 752, PT_PC, ucp_Pc },
{ 755, PT_PC, ucp_Pd },
{ 758, PT_PC, ucp_Pe },
{ 761, PT_PC, ucp_Pf },
{ 764, PT_SC, ucp_Phags_Pa },
{ 773, PT_SC, ucp_Phoenician },
{ 784, PT_PC, ucp_Pi },
{ 787, PT_PC, ucp_Po },
{ 790, PT_PC, ucp_Ps },
{ 793, PT_SC, ucp_Rejang },
{ 800, PT_SC, ucp_Runic },
{ 806, PT_GC, ucp_S },
{ 808, PT_SC, ucp_Samaritan },
{ 818, PT_SC, ucp_Saurashtra },
{ 829, PT_PC, ucp_Sc },
{ 832, PT_SC, ucp_Sharada },
{ 840, PT_SC, ucp_Shavian },
{ 848, PT_SC, ucp_Sinhala },
{ 856, PT_PC, ucp_Sk },
{ 859, PT_PC, ucp_Sm },
{ 862, PT_PC, ucp_So },
{ 865, PT_SC, ucp_Sora_Sompeng },
{ 878, PT_SC, ucp_Sundanese },
{ 888, PT_SC, ucp_Syloti_Nagri },
{ 901, PT_SC, ucp_Syriac },
{ 908, PT_SC, ucp_Tagalog },
{ 916, PT_SC, ucp_Tagbanwa },
{ 925, PT_SC, ucp_Tai_Le },
{ 932, PT_SC, ucp_Tai_Tham },
{ 941, PT_SC, ucp_Tai_Viet },
{ 950, PT_SC, ucp_Takri },
{ 956, PT_SC, ucp_Tamil },
{ 962, PT_SC, ucp_Telugu },
{ 969, PT_SC, ucp_Thaana },
{ 976, PT_SC, ucp_Thai },
{ 981, PT_SC, ucp_Tibetan },
{ 989, PT_SC, ucp_Tifinagh },
{ 998, PT_SC, ucp_Ugaritic },
{ 1007, PT_SC, ucp_Vai },
{ 1011, PT_ALNUM, 0 },
{ 1015, PT_PXSPACE, 0 },
{ 1019, PT_SPACE, 0 },
{ 1023, PT_WORD, 0 },
{ 1027, PT_SC, ucp_Yi },
{ 1030, PT_GC, ucp_Z },
{ 1032, PT_PC, ucp_Zl },
{ 1035, PT_PC, ucp_Zp },
{ 1038, PT_PC, ucp_Zs }
};
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
#endif /* SUPPORT_UTF8 */
#endif /* SUPPORT_UTF */
/* End of pcre_tables.c */

View File

@ -1,137 +0,0 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains an internal function that tests a compiled pattern to
see if it was compiled with the opposite endianness. If so, it uses an
auxiliary local function to flip the appropriate bytes. */
#include "config.h"
#include "pcre_internal.h"
/*************************************************
*         Flip bytes in an integer               *
*************************************************/

/* Reverse the byte order of a 2-byte or 4-byte quantity. This is used once the
magic number in a regex has failed to match, first to find out whether the
pattern was compiled on a host of different endianness, and then to convert
the other multi-byte values of such a pattern.

Arguments:
  value       the number to flip
  n           the number of bytes to flip; 2 selects the 16-bit swap, and any
                other value is handled as a 4-byte quantity

Returns:      the flipped value
*/

static unsigned long int
byteflip(unsigned long int value, int n)
{
/* Pick out the four low-order bytes, least significant first. Anything above
bit 31 is ignored, exactly as if the value had been masked. */
const unsigned long int b0 = value & 0xff;
const unsigned long int b1 = (value >> 8) & 0xff;
const unsigned long int b2 = (value >> 16) & 0xff;
const unsigned long int b3 = (value >> 24) & 0xff;

if (n == 2)
  return (b0 << 8) | b1;

return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
}
/*************************************************
*       Test for a byte-flipped compiled regex   *
*************************************************/

/* This function is called from pcre_exec(), pcre_dfa_exec(), and also from
pcre_fullinfo(), and only after the native MAGIC_NUMBER test has failed. It
decides whether the regex is byte-flipped, that is, whether it was compiled on
a system of opposite endianness. When it is, the relevant values are flipped
into the caller-supplied blocks; the original regex and study data are left
untouched.

Arguments:
  re              points to the regex
  internal_re     points to a new regex block
  study           points to study data, or NULL
  internal_study  points to a new study block

Returns:          the new block if it is indeed a byte-flipped regex
                  NULL if it is not
*/

/* Local helpers: copy one field from src to dst with its bytes reversed. The
second form narrows the result explicitly for the 16-bit fields. */

#define FLIP_FIELD(dst, src, field) \
  ((dst)->field = byteflip((src)->field, sizeof((src)->field)))
#define FLIP_FIELD16(dst, src, field) \
  ((dst)->field = (pcre_uint16)byteflip((src)->field, sizeof((src)->field)))

real_pcre *
_pcre_try_flipped(const real_pcre *re, real_pcre *internal_re,
  const pcre_study_data *study, pcre_study_data *internal_study)
{
/* If the magic number is wrong even after flipping, this is not a flipped
regex at all. */

if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
  return NULL;

/* Start from a straight copy so that every field not flipped below is carried
over as it stands. Note that this includes the magic number itself, which is
not rewritten here. */

*internal_re = *re;

FLIP_FIELD(internal_re, re, size);
FLIP_FIELD(internal_re, re, options);
FLIP_FIELD16(internal_re, re, flags);
FLIP_FIELD16(internal_re, re, top_bracket);
FLIP_FIELD16(internal_re, re, top_backref);
FLIP_FIELD16(internal_re, re, first_byte);
FLIP_FIELD16(internal_re, re, req_byte);
FLIP_FIELD16(internal_re, re, name_table_offset);
FLIP_FIELD16(internal_re, re, name_entry_size);
FLIP_FIELD16(internal_re, re, name_count);

/* Study data is optional; without it there is nothing more to convert. */

if (study == NULL)
  return internal_re;

*internal_study = *study;   /* Carries over the fields that are not flipped */

FLIP_FIELD(internal_study, study, size);
FLIP_FIELD(internal_study, study, flags);
FLIP_FIELD(internal_study, study, minlength);

return internal_re;
}

#undef FLIP_FIELD
#undef FLIP_FIELD16
/* End of pcre_tryflipped.c */

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -52,127 +52,246 @@ strings. */
*************************************************/
/* This function is called (optionally) at the start of compile or match, to
validate that a supposed UTF-8 string is actually valid. The early check means
check that a supposed UTF-8 string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying
an invalid string are then undefined.
can be turned off for maximum performance, but the consequences of supplying an
invalid string are then undefined.
Originally, this function checked according to RFC 2279, allowing for values in
the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
the canonical format. Once somebody had pointed out RFC 3629 to me (it
obsoletes 2279), additional restrictions were applied. The values are now
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
subrange 0xd000 to 0xdfff is excluded.
subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
characters is still checked.
From release 8.13 more information about the details of the error are passed
back in the returned value:
PCRE_UTF8_ERR0 No error
PCRE_UTF8_ERR1 Missing 1 byte at the end of the string
PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string
PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string
PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string
PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string
PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80
PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80
PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80
PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80
PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80
PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629
PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629
PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
PCRE_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
PCRE_UTF8_ERR15 Overlong 2-byte sequence
PCRE_UTF8_ERR16 Overlong 3-byte sequence
PCRE_UTF8_ERR17 Overlong 4-byte sequence
PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
Arguments:
string points to the string
length length of string, or -1 if the string is zero-terminated
errp pointer to an error position offset variable
Returns: < 0 if the string is a valid UTF-8 string
>= 0 otherwise; the value is the offset of the bad byte
Bad bytes can be:
. An isolated byte whose most significant bits are 0x80, because this
can only correctly appear within a UTF-8 character;
. A byte whose most significant bits are 0xc0, but whose other bits indicate
that there are more than 3 additional bytes (i.e. an RFC 2279 starting
byte, which is no longer valid under RFC 3629);
.
The returned offset may also be equal to the length of the string; this means
that one or more bytes is missing from the final UTF-8 character.
Returns: = 0 if the string is a valid UTF-8 string
> 0 otherwise, setting the offset of the bad character
*/
int
_pcre_valid_utf8(USPTR string, int length)
PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
{
#ifdef SUPPORT_UTF8
register USPTR p;
#ifdef SUPPORT_UTF
register PCRE_PUCHAR p;
if (length < 0)
{
for (p = string; *p != 0; p++);
length = p - string;
length = (int)(p - string);
}
for (p = string; length-- > 0; p++)
{
register int ab;
register int c = *p;
if (c < 128) continue;
if (c < 0xc0) return p - string;
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
if (ab > 3) return p - string; /* Too many for RFC 3629 */
if (length < ab) return p + 1 + length - string; /* Missing bytes */
length -= ab;
register int ab, c, d;
c = *p;
if (c < 128) continue; /* ASCII character */
if (c < 0xc0) /* Isolated 10xx xxxx byte */
{
*erroroffset = (int)(p - string);
return PCRE_UTF8_ERR20;
}
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
{
*erroroffset = (int)(p - string);
return PCRE_UTF8_ERR21;
}
ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */
if (length < ab)
{
*erroroffset = (int)(p - string); /* Missing bytes */
return ab - length; /* Codes ERR1 to ERR5 */
}
length -= ab; /* Length remaining */
/* Check top bits in the second byte */
if ((*(++p) & 0xc0) != 0x80) return p - string;
/* Check for overlong sequences for each different length, and for the
excluded range 0xd000 to 0xdfff. */
if (((d = *(++p)) & 0xc0) != 0x80)
{
*erroroffset = (int)(p - string) - 1;
return PCRE_UTF8_ERR6;
}
/* For each length, check that the remaining bytes start with the 0x80 bit
set and not the 0x40 bit. Then check for an overlong sequence, and for the
excluded range 0xd800 to 0xdfff. */
switch (ab)
{
/* Check for xx00 000x (overlong sequence) */
/* 2-byte character. No further bytes to check for 0x80. Check first byte
for xx00 000x (overlong sequence). */
case 1:
if ((c & 0x3e) == 0) return p - string;
continue; /* We know there aren't any more bytes to check */
case 1: if ((c & 0x3e) == 0)
{
*erroroffset = (int)(p - string) - 1;
return PCRE_UTF8_ERR15;
}
break;
/* Check for 1110 0000, xx0x xxxx (overlong sequence) or
1110 1101, 1010 xxxx (0xd000 - 0xdfff) */
/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
for 1110 0000, xx0x xxxx (overlong sequence) or
1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
case 2:
if ((c == 0xe0 && (*p & 0x20) == 0) ||
(c == 0xed && *p >= 0xa0))
return p - string;
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR7;
}
if (c == 0xe0 && (d & 0x20) == 0)
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR16;
}
if (c == 0xed && d >= 0xa0)
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR14;
}
break;
/* Check for 1111 0000, xx00 xxxx (overlong sequence) or
greater than 0x0010ffff (f4 8f bf bf) */
/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
character greater than 0x0010ffff (f4 8f bf bf) */
case 3:
if ((c == 0xf0 && (*p & 0x30) == 0) ||
(c > 0xf4 ) ||
(c == 0xf4 && *p > 0x8f))
return p - string;
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR8;
}
if (c == 0xf0 && (d & 0x30) == 0)
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR17;
}
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR13;
}
break;
#if 0
/* These cases can no longer occur, as we restrict to a maximum of four
bytes nowadays. Leave the code here in case we ever want to add an option
for longer sequences. */
/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
rejected by the length test below. However, we do the appropriate tests
here so that overlong sequences get diagnosed, and also in case there is
ever an option for handling these larger code points. */
/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
1111 1000, xx00 0xxx */
/* Check for 1111 1000, xx00 0xxx */
case 4:
if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
*erroroffset = (int)(p - string) - 4;
return PCRE_UTF8_ERR9;
}
if (c == 0xf8 && (d & 0x38) == 0)
{
*erroroffset = (int)(p - string) - 4;
return PCRE_UTF8_ERR18;
}
break;
/* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
1111 1100, xx00 00xx. */
case 5:
if (c == 0xfe || c == 0xff ||
(c == 0xfc && (*p & 0x3c) == 0)) return p - string;
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
*erroroffset = (int)(p - string) - 4;
return PCRE_UTF8_ERR9;
}
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
{
*erroroffset = (int)(p - string) - 5;
return PCRE_UTF8_ERR10;
}
if (c == 0xfc && (d & 0x3c) == 0)
{
*erroroffset = (int)(p - string) - 5;
return PCRE_UTF8_ERR19;
}
break;
#endif
}
/* Check for valid bytes after the 2nd, if any; all must start 10 */
while (--ab > 0)
/* Character is valid under RFC 2279, but 5-byte and 6-byte characters are
excluded by RFC 3629. The pointer p is currently at the last byte of the
character. */
if (ab > 3)
{
if ((*(++p) & 0xc0) != 0x80) return p - string;
*erroroffset = (int)(p - string) - ab;
return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
}
}
#else
#else /* SUPPORT_UTF */
(void)(string); /* Keep picky compilers happy */
(void)(length);
#endif
return -1;
return PCRE_UTF8_ERR0; /* This indicates success */
}
/* End of pcre_valid_utf8.c */

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -77,8 +77,13 @@ I could find no way of detecting that a macro is defined as an empty string at
pre-processor time. This hack uses a standard trick for avoiding calling
the STRING macro with an empty argument when doing the test. */
#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
pcre_version(void)
#else
PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
pcre16_version(void)
#endif
{
return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -62,39 +62,63 @@ Returns: TRUE if character matches, else FALSE
*/
BOOL
_pcre_xclass(int c, const uschar *data)
PRIV(xclass)(int c, const pcre_uchar *data, BOOL utf)
{
int t;
BOOL negated = (*data & XCL_NOT) != 0;
(void)utf;
#ifdef COMPILE_PCRE8
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
utf = TRUE;
#endif
/* Character values < 256 are matched against a bitmap, if one is present. If
not, we still carry on, because there may be ranges that start below 256 in the
additional data. */
if (c < 256)
{
if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
return !negated; /* char found */
if ((*data & XCL_MAP) != 0 &&
(((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
return !negated; /* char found */
}
/* First skip the bit map if present. Then match against the list of Unicode
properties or large chars or ranges that end with a large char. We won't ever
encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
if ((*data++ & XCL_MAP) != 0) data += 32;
if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);
while ((t = *data++) != XCL_END)
{
int x, y;
if (t == XCL_SINGLE)
{
GETCHARINC(x, data);
#ifdef SUPPORT_UTF
if (utf)
{
GETCHARINC(x, data); /* macro generates multiple statements */
}
else
#endif
x = *data++;
if (c == x) return !negated;
}
else if (t == XCL_RANGE)
{
GETCHARINC(x, data);
GETCHARINC(y, data);
#ifdef SUPPORT_UTF
if (utf)
{
GETCHARINC(x, data); /* macro generates multiple statements */
GETCHARINC(y, data); /* macro generates multiple statements */
}
else
#endif
{
x = *data++;
y = *data++;
}
if (c >= x && c <= y) return !negated;
}
@ -115,7 +139,7 @@ while ((t = *data++) != XCL_END)
break;
case PT_GC:
if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP))
if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP))
return !negated;
break;
@ -128,28 +152,28 @@ while ((t = *data++) != XCL_END)
break;
case PT_ALNUM:
if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
_pcre_ucp_gentype[prop->chartype] == ucp_N) == (t == XCL_PROP))
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP))
return !negated;
break;
case PT_SPACE: /* Perl space */
if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
== (t == XCL_PROP))
return !negated;
break;
case PT_PXSPACE: /* POSIX space */
if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
return !negated;
break;
case PT_WORD:
if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
_pcre_ucp_gentype[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
== (t == XCL_PROP))
return !negated;
break;

View File

@ -248,7 +248,7 @@ if (namecount <= 0) printf("No named substrings\n"); else
* more than one byte. *
* *
* However, there is a complication concerned with newlines. When the *
* newline convention is such that CRLF is a valid newline, we want must *
* newline convention is such that CRLF is a valid newline, we must *
* advance by two characters rather than one. The newline convention can *
* be set in the regex by (*CR), etc.; if not, we must find the default. *
*************************************************************************/

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -150,6 +150,16 @@ static const int eint[] = {
REG_BADPAT, /* (*MARK) must have an argument */
REG_INVARG, /* this version of PCRE is not compiled with PCRE_UCP support */
REG_BADPAT, /* \c must be followed by an ASCII character */
REG_BADPAT, /* \k is not followed by a braced, angle-bracketed, or quoted name */
/* 70 */
REG_BADPAT, /* internal error: unknown opcode in find_fixedlength() */
REG_BADPAT, /* \N is not supported in a class */
REG_BADPAT, /* too many forward references */
REG_BADPAT, /* disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) */
REG_BADPAT, /* invalid UTF-16 string (should not occur) */
/* 75 */
REG_BADPAT, /* overlong MARK name */
REG_BADPAT /* character value in \u.... sequence is too large */
};
/* Table of texts corresponding to POSIX error codes */
@ -220,7 +230,7 @@ return length + addlength;
PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION
regfree(regex_t *preg)
{
(pcre_free)(preg->re_pcre);
(PUBL(free))(preg->re_pcre);
}
@ -265,11 +275,12 @@ should not happen, but we all make mistakes), return REG_BADPAT. */
if (preg->re_pcre == NULL)
{
return (errorcode < sizeof(eint)/sizeof(const int))?
return (errorcode < (int)(sizeof(eint)/sizeof(const int)))?
eint[errorcode] : REG_BADPAT;
}
preg->re_nsub = pcre_info((const pcre *)preg->re_pcre, NULL, NULL);
(void)pcre_fullinfo((const pcre *)preg->re_pcre, NULL, PCRE_INFO_CAPTURECOUNT,
&(preg->re_nsub));
return 0;
}
@ -395,6 +406,7 @@ switch(rc)
case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE;
case PCRE_ERROR_BADUTF8: return REG_INVARG;
case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG;
case PCRE_ERROR_BADMODE: return REG_INVARG;
default: return REG_ASSERT;
}
}

View File

@ -9,7 +9,7 @@
Compatible Regular Expression library. It defines the things POSIX says should
be there. I hope.
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without

View File

@ -602,6 +602,8 @@ ABOVE theatre
AB.VE
AB.VE the turtle
010203040506
PUT NEW DATA ABOVE THIS LINE.
=============================

View File

@ -1,6 +1,5 @@
X one
X two X three X four
X five
X two X three X four X five
X six
X seven…X eightX nineX ten

View File

@ -10,7 +10,7 @@ RC=0
7:PATTERN at the start of a line.
8:In the middle of a line, PATTERN appears.
10:This pattern is in lower case.
608:Check up on PATTERN near the end.
610:Check up on PATTERN near the end.
RC=0
---------------------------- Test 4 ------------------------------
4
@ -19,7 +19,7 @@ RC=0
./testdata/grepinput:7:PATTERN at the start of a line.
./testdata/grepinput:8:In the middle of a line, PATTERN appears.
./testdata/grepinput:10:This pattern is in lower case.
./testdata/grepinput:608:Check up on PATTERN near the end.
./testdata/grepinput:610:Check up on PATTERN near the end.
./testdata/grepinputx:3:Here is the pattern again.
./testdata/grepinputx:5:Pattern
./testdata/grepinputx:42:This line contains pattern not on a line by itself.
@ -28,7 +28,7 @@ RC=0
7:PATTERN at the start of a line.
8:In the middle of a line, PATTERN appears.
10:This pattern is in lower case.
608:Check up on PATTERN near the end.
610:Check up on PATTERN near the end.
3:Here is the pattern again.
5:Pattern
42:This line contains pattern not on a line by itself.
@ -323,10 +323,10 @@ RC=0
./testdata/grepinput-9-
./testdata/grepinput:10:This pattern is in lower case.
--
./testdata/grepinput-605-PUT NEW DATA ABOVE THIS LINE.
./testdata/grepinput-606-=============================
./testdata/grepinput-607-
./testdata/grepinput:608:Check up on PATTERN near the end.
./testdata/grepinput-607-PUT NEW DATA ABOVE THIS LINE.
./testdata/grepinput-608-=============================
./testdata/grepinput-609-
./testdata/grepinput:610:Check up on PATTERN near the end.
--
./testdata/grepinputx-1-This is a second file of input for the pcregrep tests.
./testdata/grepinputx-2-
@ -348,8 +348,8 @@ RC=0
./testdata/grepinput-12-Here follows a whole lot of stuff that makes the file over 24K long.
./testdata/grepinput-13-
--
./testdata/grepinput:608:Check up on PATTERN near the end.
./testdata/grepinput-609-This is the last line of this file.
./testdata/grepinput:610:Check up on PATTERN near the end.
./testdata/grepinput-611-This is the last line of this file.
--
./testdata/grepinputx:3:Here is the pattern again.
./testdata/grepinputx-4-
@ -380,6 +380,7 @@ RC=0
---------------------------- Test 37 -----------------------------
aaaaa0
aaaaa2
010203040506
RC=0
======== STDERR ========
pcregrep: pcre_exec() gave error -8 while matching this text:
@ -390,7 +391,7 @@ pcregrep: pcre_exec() gave error -8 while matching this text:
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
pcregrep: Error -8 or -21 means that a resource limit was exceeded.
pcregrep: Error -8, -21 or -27 means that a resource limit was exceeded.
pcregrep: Check your regex for nested unlimited loops.
---------------------------- Test 38 ------------------------------
This line contains a binary zero here >< for testing.
@ -514,7 +515,7 @@ This is a file of miscellaneous text that is used as test data for checking
that the pcregrep command is working correctly. The file must be more than 24K
long so that it needs more than a single read
pcregrep: Error -8 or -21 means that a resource limit was exceeded.
pcregrep: Error -8, -21 or -27 means that a resource limit was exceeded.
pcregrep: Check your regex for nested unlimited loops.
RC=1
---------------------------- Test 63 -----------------------------
@ -524,7 +525,7 @@ This is a file of miscellaneous text that is used as test data for checking
that the pcregrep command is working correctly. The file must be more than 24K
long so that it needs more than a single read
pcregrep: Error -8 or -21 means that a resource limit was exceeded.
pcregrep: Error -8, -21 or -27 means that a resource limit was exceeded.
pcregrep: Check your regex for nested unlimited loops.
RC=1
---------------------------- Test 64 ------------------------------
@ -593,3 +594,77 @@ RC=0
triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
RC=0
---------------------------- Test 71 -----------------------------
01
RC=0
---------------------------- Test 72 -----------------------------
010203040506
RC=0
---------------------------- Test 73 -----------------------------
01
RC=0
---------------------------- Test 74 -----------------------------
01
02
RC=0
---------------------------- Test 75 -----------------------------
010203040506
RC=0
---------------------------- Test 76 -----------------------------
01
02
RC=0
---------------------------- Test 77 -----------------------------
01
03
RC=0
---------------------------- Test 78 -----------------------------
010203040506
RC=0
---------------------------- Test 79 -----------------------------
01
03
RC=0
---------------------------- Test 80 -----------------------------
01
RC=0
---------------------------- Test 81 -----------------------------
010203040506
RC=0
---------------------------- Test 82 -----------------------------
01
RC=0
---------------------------- Test 83 -----------------------------
pcregrep: line 4 of file ./testdata/grepinput3 is too long for the internal buffer
pcregrep: check the --buffer-size option
RC=2
---------------------------- Test 84 -----------------------------
testdata/grepinputv:fox jumps
testdata/grepinputx:complete pair
testdata/grepinputx:That was a complete pair
testdata/grepinputx:complete pair
RC=0
---------------------------- Test 85 -----------------------------
./testdata/grepinput3:Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
RC=0
---------------------------- Test 86 -----------------------------
Binary file ./testdata/grepbinary matches
RC=0
---------------------------- Test 87 -----------------------------
RC=1
---------------------------- Test 88 -----------------------------
Binary file ./testdata/grepbinary matches
RC=0
---------------------------- Test 89 -----------------------------
RC=1
---------------------------- Test 90 -----------------------------
RC=1
---------------------------- Test 91 -----------------------------
The quick brown fx jumps over the lazy dog.
RC=0
---------------------------- Test 92 -----------------------------
The quick brown fx jumps over the lazy dog.
RC=0
---------------------------- Test 93 -----------------------------
The quick brown fx jumps over the lazy dog.
RC=0

View File

@ -1,11 +1,12 @@
---------------------------- Test U1 ------------------------------
1:X one
2:X two 3:X three 4:X four
5:X five
2:X two 3:X three 4:X four 5:X five
6:X six
7:X seven…8:X eight9:X nine10:X ten
RC=0
---------------------------- Test U2 ------------------------------
12-Before 111
13-Before 22214-Before 333…15:Match
16-After 111
17-After 22218-After 333
RC=0

View File

@ -1,22 +1,16 @@
---------------------------- Test N1 ------------------------------
1:abc
2:def
---------------------------- Test N2 ------------------------------
1:abc
def
1:abc 2:def ---------------------------- Test N2 ------------------------------
1:abc def
2:ghi
jkl---------------------------- Test N3 ------------------------------
2:def
3:
2:def 3:
ghi
jkl---------------------------- Test N4 ------------------------------
2:ghi
jkl---------------------------- Test N5 ------------------------------
1:abc
2:def
1:abc 2:def
3:ghi
4:jkl---------------------------- Test N6 ------------------------------
1:abc
2:def
1:abc 2:def
3:ghi
4:jkl

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,6 @@
/-- This set of tests if for UTF-8 support, excluding Unicode properties. It is
compatible with all versions of Perl 5. --/
/-- This set of tests is for UTF support, excluding Unicode properties. It is
compatible with all versions of Perl >= 5.10 and both the 8-bit and 16-bit
PCRE libraries. --/
/a.b/8
acb
@ -126,31 +127,6 @@
*** Failers
XYZ
/X(\C{3})/8
X\x{1234}
/X(\C{4})/8
X\x{1234}YZ
/X\C*/8
XYZabcdce
/X\C*?/8
XYZabcde
/X\C{3,5}/8
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
X\x{1234}\x{512}YZ
/X\C{3,5}?/8
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
/[^a]+/8g
bcd
\x{100}aY\x{256}Z
@ -456,17 +432,6 @@
\x{150}X
\x{200}X
/a\Cb/
aXb
a\nb
/a\Cb/8
aXb
a\nb
/a\C\Cb/8
a\x{100}b
/[z-\x{100}]/8i
z
Z
@ -644,4 +609,16 @@
/A*/g8
AAB\x{123}BAA
/(abc)\1/8i
abc
/(abc)\1/8
abc
/a(*:a\x{1234}b)/8K
abc
/a(*:a£b)/8K
abc
/-- End of testinput4 --/

View File

@ -1,72 +1,36 @@
/-- This set of tests checks the API, internals, and non-Perl stuff for UTF-8
support, excluding Unicode properties. --/
/-- This set of tests checks the API, internals, and non-Perl stuff for UTF
support, excluding Unicode properties. However, tests that give different
results in 8-bit and 16-bit modes are excluded (see tests 16 and 17). --/
/\x{100}/8DZ
/\x{1000}/8DZ
/\x{10000}/8DZ
/\x{100000}/8DZ
/\x{1000000}/8DZ
/\x{4000000}/8DZ
/\x{7fffFFFF}/8DZ
/[\x{ff}]/8DZ
/[\x{100}]/8DZ
/\x{110000}/8DZ
/\x{ffffffff}/8
/\x{100000000}/8
/\x{d800}/8
/\x{dfff}/8
/\x{d7ff}/8
/\x{e000}/8
/^\x{100}a\x{1234}/8
\x{100}a\x{1234}bcd
/\x80/8DZ
/\xff/8DZ
/\x{0041}\x{2262}\x{0391}\x{002e}/DZ8
\x{0041}\x{2262}\x{0391}\x{002e}
/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
\x{D55c}\x{ad6d}\x{C5B4}
/\x{65e5}\x{672c}\x{8a9e}/DZ8
\x{65e5}\x{672c}\x{8a9e}
/\x{80}/DZ8
/\x{084}/DZ8
/\x{104}/DZ8
/\x{861}/DZ8
/\x{212ab}/DZ8
/.{3,5}X/DZ8
\x{212ab}\x{212ab}\x{212ab}\x{861}X
/.{3,5}?/DZ8
\x{212ab}\x{212ab}\x{212ab}\x{861}
/(?<=\C)X/8
Should produce an error diagnostic
/-- This one is here not because it's different to Perl, but because the way
the captured single-byte is displayed. (In Perl it becomes a character, and you
can't tell the difference.) --/
/X(\C)(.*)/8
X\x{1234}
X\nabc
/^[ab]/8DZ
bar
*** Failers
@ -81,26 +45,6 @@ can't tell the difference.) --/
*** Failers
aaa
/[^ab\xC0-\xF0]/8SDZ
\x{f1}
\x{bf}
\x{100}
\x{1000}
*** Failers
\x{c0}
\x{f0}
/Ä€{3,4}/8SDZ
\x{100}\x{100}\x{100}\x{100\x{100}
/(\x{100}+|x)/8SDZ
/(\x{100}*a|x)/8SDZ
/(\x{100}{0,2}a|x)/8SDZ
/(\x{100}{1,2}a|x)/8SDZ
/\x{100}*(\d+|"(?1)")/8
1234
"1234"
@ -111,33 +55,17 @@ can't tell the difference.) --/
*** Failers
\x{100}\x{100}abcd
/\x{100}/8DZ
/\x{100}*/8DZ
/a\x{100}*/8DZ
/ab\x{100}*/8DZ
/a\x{100}\x{101}*/8DZ
/a\x{100}\x{101}+/8DZ
/\x{100}*A/8DZ
A
/\x{100}*\d(?R)/8DZ
/[^\x{c4}]/DZ
/[^\x{c4}]/8DZ
/[\x{100}]/8DZ
\x{100}
Z\x{100}
\x{100}Z
*** Failers
/[Z\x{100}]/8DZ
Z\x{100}
\x{100}
@ -162,13 +90,8 @@ can't tell the difference.) --/
/[\xFF]/DZ
>\xff<
/[\xff]/DZ8
>\x{ff}<
/[^\xFF]/DZ
/[^\xff]/8DZ
/[Ä-Ü]/8
Ö # Matches without Study
\x{d6}
@ -185,45 +108,6 @@ can't tell the difference.) --/
Ö <-- Same with Study
\x{d6}
/[Ã]/8
/Ã/8
/ÃÃÃxxx/8
/ÃÃÃxxx/8?DZ
/abc/8
Ã]
Ã
ÃÃÃ
ÃÃÃ\?
/anything/8
\xc0\x80
\xc1\x8f
\xe0\x9f\x80
\xf0\x8f\x80\x80
\xf8\x87\x80\x80\x80
\xfc\x83\x80\x80\x80\x80
\xfe\x80\x80\x80\x80\x80
\xff\x80\x80\x80\x80\x80
\xc3\x8f
\xe0\xaf\x80
\xe1\x80\x80
\xf0\x9f\x80\x80
\xf1\x8f\x80\x80
\xf8\x88\x80\x80\x80
\xf9\x87\x80\x80\x80
\xfc\x84\x80\x80\x80\x80
\xfd\x83\x80\x80\x80\x80
\?\xf8\x88\x80\x80\x80
\?\xf9\x87\x80\x80\x80
\?\xfc\x84\x80\x80\x80\x80
\?\xfd\x83\x80\x80\x80\x80
/\x{100}abc(xyz(?1))/8DZ
/[^\x{100}]abc(xyz(?1))/8DZ
/[ab\x{100}]abc(xyz(?1))/8DZ
@ -243,17 +127,8 @@ can't tell the difference.) --/
/\w/8
\x{100}X
/a\x{1234}b/P8
a\x{1234}b
/^\ሴ/8DZ
/\777/I
/\777/8I
\x{1ff}
\777
/\x{100}*\d/8DZ
/\x{100}*\s/8DZ
@ -266,12 +141,6 @@ can't tell the difference.) --/
/\x{100}*\W/8DZ
/\x{100}+\x{200}/8DZ
/\x{100}+X/8DZ
/X+\x{200}/8DZ
/()()()()()()()()()()
()()()()()()()()()()
()()()()()()()()()()
@ -283,8 +152,6 @@ can't tell the difference.) --/
/^[\QĀ\E-\QŐ\E]/BZ8
/^[\QÄ€\E-\QÅ<51>\E/BZ8
/^abc./mgx8<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
@ -379,23 +246,6 @@ can't tell the difference.) --/
/.*$/8<any>
\x{1ec5}
/-- This tests the stricter UTF-8 check according to RFC 3629. --/
/X/8
\x{0}\x{d7ff}\x{e000}\x{10ffff}
\x{d800}
\x{d800}\?
\x{da00}
\x{da00}\?
\x{dfff}
\x{dfff}\?
\x{110000}
\x{110000}\?
\x{2000000}
\x{2000000}\?
\x{7fffffff}
\x{7fffffff}\?
/a\Rb/I8<bsr_anycrlf>
a\rb
a\nb
@ -454,16 +304,10 @@ can't tell the difference.) --/
/(\x{de})\1/
\x{de}\x{de}
\x{123}
/X/8f<any>
A\x{1ec5}ABCXYZ
/(*UTF8)\x{1234}/
abcd\x{1234}pqr
/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
/Xa{2,4}b/8
X\P
Xa\P
@ -745,53 +589,184 @@ can't tell the difference.) --/
/X\W{3}X/8
\PX
/\h/SI
/\h/SI8
ABC\x{09}
ABC\x{20}
ABC\x{a0}
ABC\x{1680}
ABC\x{180e}
ABC\x{2000}
ABC\x{202f}
ABC\x{205f}
ABC\x{3000}
/\v/SI
/\v/SI8
ABC\x{0a}
ABC\x{0b}
ABC\x{0c}
ABC\x{0d}
ABC\x{85}
ABC\x{2028}
/\R/SI
/\R/SI8
/\h*A/SI8
CDBABC
/\v+A/SI8
/\s?xxx\s/8SI
/\sxxx\s/8T1
AB\x{85}xxx\x{a0}XYZ
AB\x{a0}xxx\x{85}XYZ
/\sxxx\s/I8ST1
AB\x{85}xxx\x{a0}XYZ
AB\x{a0}xxx\x{85}XYZ
/\S \S/8T1
\x{a2} \x{84}
/\S \S/I8ST1
\x{a2} \x{84}
A Z
'A#хц'8x<any>BZ
'A#хц
PQ'8x<any>BZ
/a+#хaa
z#XX?/8x<any>BZ
/a+#хaa
z#х?/8x<any>BZ
/\g{A}xxx#bXX(?'A'123) (?'A'456)/8x<any>BZ
/\g{A}xxx#bх(?'A'123) (?'A'456)/8x<any>BZ
/^\cģ/8
/(\R*)(.)/s8
\r\n
\r\r\n\n\r
\r\r\n\n\r\n
/(\R)*(.)/s8
\r\n
\r\r\n\n\r
\r\r\n\n\r\n
/[^\x{1234}]+/iS8I
/[^\x{1234}]+?/iS8I
/[^\x{1234}]++/iS8I
/[^\x{1234}]{2}/iS8I
//<bsr_anycrlf><bsr_unicode>
/f.*/
\P\Pfor
/f.*/s
\P\Pfor
/f.*/8
\P\Pfor
/f.*/8s
\P\Pfor
/\x{d7ff}\x{e000}/8
/\x{d800}/8
/\x{dfff}/8
/\h+/8
\x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000}
\x{3001}\x{2fff}\x{200a}\x{a0}\x{2000}
/[\h\x{e000}]+/8BZ
\x{1681}\x{200b}\x{1680}\x{2000}\x{202f}\x{3000}
\x{3001}\x{2fff}\x{200a}\x{a0}\x{2000}
/\H+/8
\x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f}
\x{2000}\x{200a}\x{1fff}\x{200b}
\x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060}
\x{a0}\x{3000}\x{9f}\x{a1}\x{2fff}\x{3001}
/[\H\x{d7ff}]+/8BZ
\x{1680}\x{180e}\x{167f}\x{1681}\x{180d}\x{180f}
\x{2000}\x{200a}\x{1fff}\x{200b}
\x{202f}\x{205f}\x{202e}\x{2030}\x{205e}\x{2060}
\x{a0}\x{3000}\x{9f}\x{a1}\x{2fff}\x{3001}
/\v+/8
\x{2027}\x{2030}\x{2028}\x{2029}
\x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
/[\v\x{e000}]+/8BZ
\x{2027}\x{2030}\x{2028}\x{2029}
\x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
/\V+/8
\x{2028}\x{2029}\x{2027}\x{2030}
\x{85}\x0a\x0b\x0c\x0d\x09\x0e\x{84}\x{86}
/[\V\x{d7ff}]+/8BZ
\x{2028}\x{2029}\x{2027}\x{2030}
\x{85}\x0a\x0b\x0c\x0d\x09\x0e\x{84}\x{86}
/\R+/8<bsr_unicode>
\x{2027}\x{2030}\x{2028}\x{2029}
\x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
/(..)\1/8
ab\P
aba\P
abab\P
/(..)\1/8i
ab\P
abA\P
aBAb\P
/(..)\1{2,}/8
ab\P
aba\P
abab\P
ababa\P
ababab\P
ababab\P\P
abababa\P
abababa\P\P
/(..)\1{2,}/8i
ab\P
aBa\P
aBAb\P
AbaBA\P
abABAb\P
aBAbaB\P\P
abABabA\P
abaBABa\P\P
/(..)\1{2,}?x/8i
ab\P
abA\P
aBAb\P
abaBA\P
abAbaB\P
abaBabA\P
abAbABaBx\P
/./8<CRLF>
\r\P
\r\P\P
/.{2,3}/8<CRLF>
\r\P
\r\P\P
\r\r\P
\r\r\P\P
\r\r\r\P
\r\r\r\P\P
/.{2,3}?/8<CRLF>
\r\P
\r\P\P
\r\r\P
\r\r\P\P
\r\r\r\P
\r\r\r\P\P
/[^\x{100}][^\x{1234}][^\x{ffff}][^\x{10000}][^\x{10ffff}]/8BZ
/[^\x{100}][^\x{1234}][^\x{ffff}][^\x{10000}][^\x{10ffff}]/8BZi
/[^\x{100}]*[^\x{10000}]+[^\x{10ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{fffff}]{5,6}+/8BZ
/[^\x{100}]*[^\x{10000}]+[^\x{10ffff}]??[^\x{8000}]{4,}[^\x{7fff}]{2,9}?[^\x{fffff}]{5,6}+/8BZi
/(?<=\x{1234}\x{1234})\bxy/I8
/(?<!^)ETA/8
ETA
/\u0100/<JS>8BZ
/[\u0100-\u0200]/<JS>8BZ
/\ud800/<JS>8
/-- End of testinput5 --/

View File

@ -655,6 +655,7 @@
A\x80
/^[\p{Arabic}]/8
\x{604}
\x{60e}
\x{656}
\x{657}
@ -670,7 +671,6 @@
\x{6ef}
\x{6fa}
** Failers
\x{600}
\x{650}
\x{651}
\x{652}
@ -688,7 +688,6 @@
\x{61f}
\x{964}
\x{965}
\x{970}
/^\p{Inherited}/8
\x{64b}
@ -802,4 +801,18 @@
** Failers
a\xFCb
/ⱥ/8i
Ⱥx
Ⱥ
/[ⱥ]/8i
Ⱥx
Ⱥ
/Ⱥ/8i
Ⱥ
/-- End of testinput6 --/

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,6 @@
/-- This set of tests if for UTF-8 support, excluding Unicode properties. It is
compatible with all versions of Perl 5. --/
/-- This set of tests is for UTF support, excluding Unicode properties. It is
compatible with all versions of Perl >= 5.10 and both the 8-bit and 16-bit
PCRE libraries. --/
/a.b/8
acb
@ -255,46 +256,6 @@ No match
XYZ
No match
/X(\C{3})/8
X\x{1234}
0: X\x{1234}
1: \x{1234}
/X(\C{4})/8
X\x{1234}YZ
0: X\x{1234}Y
1: \x{1234}Y
/X\C*/8
XYZabcdce
0: XYZabcdce
/X\C*?/8
XYZabcde
0: X
/X\C{3,5}/8
Xabcdefg
0: Xabcde
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
0: X\x{1234}\x{512}
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}
/X\C{3,5}?/8
Xabcdefg
0: Xabc
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}
X\x{1234}\x{512}
0: X\x{1234}
/[^a]+/8g
bcd
0: bcd
@ -791,22 +752,6 @@ No match
\x{200}X
No match
/a\Cb/
aXb
0: aXb
a\nb
0: a\x0ab
/a\Cb/8
aXb
0: aXb
a\nb
0: a\x{0a}b
/a\C\Cb/8
a\x{100}b
0: a\x{100}b
/[z-\x{100}]/8i
z
0: z
@ -1128,4 +1073,22 @@ No match
0: AA
0:
/(abc)\1/8i
abc
No match
/(abc)\1/8
abc
No match
/a(*:a\x{1234}b)/8K
abc
0: a
MK: a\x{1234}b
/a(*:a£b)/8K
abc
0: a
MK: a\x{a3}b
/-- End of testinput4 --/

File diff suppressed because it is too large Load Diff

View File

@ -1114,6 +1114,8 @@ No match
0: A\x80
/^[\p{Arabic}]/8
\x{604}
0: \x{604}
\x{60e}
0: \x{60e}
\x{656}
@ -1143,8 +1145,6 @@ No match
\x{6fa}
0: \x{6fa}
** Failers
No match
\x{600}
No match
\x{650}
No match
@ -1176,8 +1176,6 @@ No match
0: \x{964}
\x{965}
0: \x{965}
\x{970}
0: \x{970}
/^\p{Inherited}/8
\x{64b}
@ -1353,4 +1351,26 @@ No match
a\xFCb
No match
/ⱥ/8i
0: \x{2c65}
Ⱥx
0: \x{23a}
Ⱥ
0: \x{23a}
/[ⱥ]/8i
0: \x{2c65}
Ⱥx
0: \x{23a}
Ⱥ
0: \x{23a}
/Ⱥ/8i
Ⱥ
0: \x{23a}
0: \x{2c65}
/-- End of testinput6 --/

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -153,7 +153,19 @@ enum {
ucp_Old_Turkic,
ucp_Samaritan,
ucp_Tai_Tham,
ucp_Tai_Viet
ucp_Tai_Viet,
/* New for Unicode 6.0.0: */
ucp_Batak,
ucp_Brahmi,
ucp_Mandaic,
/* New for Unicode 6.1.0: */
ucp_Chakma,
ucp_Meroitic_Cursive,
ucp_Meroitic_Hieroglyphs,
ucp_Miao,
ucp_Sharada,
ucp_Sora_Sompeng,
ucp_Takri
};
#endif