mirror of
https://github.com/php/php-src.git
synced 2024-12-02 22:34:55 +08:00
Merge branch 'PHP-5.3' into PHP-5.4
* PHP-5.3: Fixed bug #63284 PCRE upgrade to 8.31
This commit is contained in:
commit
d2fa182f25
@ -3,7 +3,7 @@
|
||||
|
||||
EXTENSION("pcre", "php_pcre.c", false /* never shared */,
|
||||
"-Iext/pcre/pcrelib");
|
||||
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucd.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_info.c pcre_maketables.c pcre_newline.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_try_flipped.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
|
||||
ADD_SOURCES("ext/pcre/pcrelib", "pcre_chartables.c pcre_ucd.c pcre_compile.c pcre_config.c pcre_exec.c pcre_fullinfo.c pcre_get.c pcre_globals.c pcre_maketables.c pcre_newline.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c", "pcre");
|
||||
ADD_DEF_FILE("ext\\pcre\\php_pcre.def");
|
||||
|
||||
AC_DEFINE('HAVE_BUNDLED_PCRE', 1, 'Using bundled PCRE library');
|
||||
|
@ -55,9 +55,9 @@ PHP_ARG_WITH(pcre-regex,,
|
||||
pcrelib_sources="pcrelib/pcre_chartables.c pcrelib/pcre_ucd.c \
|
||||
pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c \
|
||||
pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c \
|
||||
pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c \
|
||||
pcrelib/pcre_maketables.c pcrelib/pcre_newline.c \
|
||||
pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c \
|
||||
pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c \
|
||||
pcrelib/pcre_tables.c pcrelib/pcre_valid_utf8.c \
|
||||
pcrelib/pcre_version.c pcrelib/pcre_xclass.c"
|
||||
PHP_NEW_EXTENSION(pcre, $pcrelib_sources php_pcre.c, no,,-I@ext_srcdir@/pcrelib)
|
||||
PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
|
||||
|
@ -8,16 +8,38 @@ Email domain: cam.ac.uk
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
All rights reserved
|
||||
|
||||
|
||||
PCRE JUST-IN-TIME COMPILATION SUPPORT
|
||||
-------------------------------------
|
||||
|
||||
Written by: Zoltan Herczeg
|
||||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2012 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
STACK-LESS JUST-IN-TIME COMPILER
|
||||
--------------------------------
|
||||
|
||||
Written by: Zoltan Herczeg
|
||||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2012 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
THE C++ WRAPPER LIBRARY
|
||||
-----------------------
|
||||
|
||||
Written by: Google Inc.
|
||||
|
||||
Copyright (c) 2007-2010 Google Inc
|
||||
Copyright (c) 2007-2012 Google Inc
|
||||
All rights reserved
|
||||
|
||||
####
|
||||
|
@ -1,6 +1,699 @@
|
||||
ChangeLog for PCRE
|
||||
------------------
|
||||
|
||||
Version 8.31 06-July-2012
|
||||
-------------------------
|
||||
|
||||
1. Fixing a wrong JIT test case and some compiler warnings.
|
||||
|
||||
2. Removed a bashism from the RunTest script.
|
||||
|
||||
3. Add a cast to pcre_exec.c to fix the warning "unary minus operator applied
|
||||
to unsigned type, result still unsigned" that was given by an MS compiler
|
||||
on encountering the code "-sizeof(xxx)".
|
||||
|
||||
4. Partial matching support is added to the JIT compiler.
|
||||
|
||||
5. Fixed several bugs concerned with partial matching of items that consist
|
||||
of more than one character:
|
||||
|
||||
(a) /^(..)\1/ did not partially match "aba" because checking references was
|
||||
done on an "all or nothing" basis. This also applied to repeated
|
||||
references.
|
||||
|
||||
(b) \R did not give a hard partial match if \r was found at the end of the
|
||||
subject.
|
||||
|
||||
(c) \X did not give a hard partial match after matching one or more
|
||||
characters at the end of the subject.
|
||||
|
||||
(d) When newline was set to CRLF, a pattern such as /a$/ did not recognize
|
||||
a partial match for the string "\r".
|
||||
|
||||
(e) When newline was set to CRLF, the metacharacter "." did not recognize
|
||||
a partial match for a CR character at the end of the subject string.
|
||||
|
||||
6. If JIT is requested using /S++ or -s++ (instead of just /S+ or -s+) when
|
||||
running pcretest, the text "(JIT)" is added to the output whenever JIT is
|
||||
actually used to run the match.
|
||||
|
||||
7. Individual JIT compile options can be set in pcretest by following -s+[+]
|
||||
or /S+[+] with a digit between 1 and 7.
|
||||
|
||||
8. OP_NOT now supports any UTF character not just single-byte ones.
|
||||
|
||||
9. (*MARK) control verb is now supported by the JIT compiler.
|
||||
|
||||
10. The command "./RunTest list" lists the available tests without actually
|
||||
running any of them. (Because I keep forgetting what they all are.)
|
||||
|
||||
11. Add PCRE_INFO_MAXLOOKBEHIND.
|
||||
|
||||
12. Applied a (slightly modified) user-supplied patch that improves performance
|
||||
when the heap is used for recursion (compiled with --disable-stack-for-
|
||||
recursion). Instead of malloc and free for each heap frame each time a
|
||||
logical recursion happens, frames are retained on a chain and re-used where
|
||||
possible. This sometimes gives as much as 30% improvement.
|
||||
|
||||
13. As documented, (*COMMIT) is now confined to within a recursive subpattern
|
||||
call.
|
||||
|
||||
14. As documented, (*COMMIT) is now confined to within a positive assertion.
|
||||
|
||||
15. It is now possible to link pcretest with libedit as an alternative to
|
||||
libreadline.
|
||||
|
||||
16. (*COMMIT) control verb is now supported by the JIT compiler.
|
||||
|
||||
17. The Unicode data tables have been updated to Unicode 6.1.0.
|
||||
|
||||
18. Added --file-list option to pcregrep.
|
||||
|
||||
19. Added binary file support to pcregrep, including the -a, --binary-files,
|
||||
-I, and --text options.
|
||||
|
||||
20. The madvise function is renamed to posix_madvise for QNX compatibility
|
||||
reasons. Fixed by Giuseppe D'Angelo.
|
||||
|
||||
21. Fixed a bug for backward assertions with REVERSE 0 in the JIT compiler.
|
||||
|
||||
22. Changed the option for creating symbolic links for 16-bit man pages from
|
||||
-s to -sf so that re-installing does not cause issues.
|
||||
|
||||
23. Support PCRE_NO_START_OPTIMIZE in JIT as (*MARK) support requires it.
|
||||
|
||||
24. Fixed a very old bug in pcretest that caused errors with restarted DFA
|
||||
matches in certain environments (the workspace was not being correctly
|
||||
retained). Also added to pcre_dfa_exec() a simple plausibility check on
|
||||
some of the workspace data at the beginning of a restart.
|
||||
|
||||
25. \s*\R was auto-possessifying the \s* when it should not, whereas \S*\R
|
||||
was not doing so when it should - probably a typo introduced by SVN 528
|
||||
(change 8.10/14).
|
||||
|
||||
26. When PCRE_UCP was not set, \w+\x{c4} was incorrectly auto-possessifying the
|
||||
\w+ when the character tables indicated that \x{c4} was a word character.
|
||||
There were several related cases, all because the tests for doing a table
|
||||
lookup were testing for characters less than 127 instead of 255.
|
||||
|
||||
27. If a pattern contains capturing parentheses that are not used in a match,
|
||||
their slots in the ovector are set to -1. For those that are higher than
|
||||
any matched groups, this happens at the end of processing. In the case when
|
||||
there were back references that the ovector was too small to contain
|
||||
(causing temporary malloc'd memory to be used during matching), and the
|
||||
highest capturing number was not used, memory off the end of the ovector
|
||||
was incorrectly being set to -1. (It was using the size of the temporary
|
||||
memory instead of the true size.)
|
||||
|
||||
28. To catch bugs like 27 using valgrind, when pcretest is asked to specify an
|
||||
ovector size, it uses memory at the end of the block that it has got.
|
||||
|
||||
29. Check for an overlong MARK name and give an error at compile time. The
|
||||
limit is 255 for the 8-bit library and 65535 for the 16-bit library.
|
||||
|
||||
30. JIT compiler update.
|
||||
|
||||
31. JIT is now supported on jailbroken iOS devices. Thanks to Ruiger
|
||||
Rill for the patch.
|
||||
|
||||
32. Put spaces around SLJIT_PRINT_D in the JIT compiler. Required by CXX11.
|
||||
|
||||
33. Variable renamings in the PCRE-JIT compiler. No functionality change.
|
||||
|
||||
34. Fixed typos in pcregrep: in two places there was SUPPORT_LIBZ2 instead of
|
||||
SUPPORT_LIBBZ2. This caused a build problem when bzip2 but not gzip (zlib)
|
||||
was enabled.
|
||||
|
||||
35. Improve JIT code generation for greedy plus quantifier.
|
||||
|
||||
36. When /((?:a?)*)*c/ or /((?>a?)*)*c/ was matched against "aac", it set group
|
||||
1 to "aa" instead of to an empty string. The bug affected repeated groups
|
||||
that could potentially match an empty string.
|
||||
|
||||
37. Optimizing single character iterators in JIT.
|
||||
|
||||
38. Wide characters specified with \uxxxx in JavaScript mode are now subject to
|
||||
the same checks as \x{...} characters in non-JavaScript mode. Specifically,
|
||||
codepoints that are too big for the mode are faulted, and in a UTF mode,
|
||||
disallowed codepoints are also faulted.
|
||||
|
||||
39. If PCRE was compiled with UTF support, in three places in the DFA
|
||||
matcher there was code that should only have been obeyed in UTF mode, but
|
||||
was being obeyed unconditionally. In 8-bit mode this could cause incorrect
|
||||
processing when bytes with values greater than 127 were present. In 16-bit
|
||||
mode the bug would be provoked by values in the range 0xdc00 to 0xdfff. In
|
||||
both cases the values are those that cannot be the first data item in a UTF
|
||||
character. The three items that might have provoked this were recursions,
|
||||
possessively repeated groups, and atomic groups.
|
||||
|
||||
40. Ensure that libpcre is explicitly listed in the link commands for pcretest
|
||||
and pcregrep, because some OS require shared objects to be explicitly
|
||||
passed to ld, causing the link step to fail if they are not.
|
||||
|
||||
41. There were two incorrect #ifdefs in pcre_study.c, meaning that, in 16-bit
|
||||
mode, patterns that started with \h* or \R* might be incorrectly matched.
|
||||
|
||||
|
||||
Version 8.30 04-February-2012
|
||||
-----------------------------
|
||||
|
||||
1. Renamed "isnumber" as "is_a_number" because in some Mac environments this
|
||||
name is defined in ctype.h.
|
||||
|
||||
2. Fixed a bug in fixed-length calculation for lookbehinds that would show up
|
||||
only in quite long subpatterns.
|
||||
|
||||
3. Removed the function pcre_info(), which has been obsolete and deprecated
|
||||
since it was replaced by pcre_fullinfo() in February 2000.
|
||||
|
||||
4. For a non-anchored pattern, if (*SKIP) was given with a name that did not
|
||||
match a (*MARK), and the match failed at the start of the subject, a
|
||||
reference to memory before the start of the subject could occur. This bug
|
||||
was introduced by fix 17 of release 8.21.
|
||||
|
||||
5. A reference to an unset group with zero minimum repetition was giving
|
||||
totally wrong answers (in non-JavaScript-compatibility mode). For example,
|
||||
/(another)?(\1?)test/ matched against "hello world test". This bug was
|
||||
introduced in release 8.13.
|
||||
|
||||
6. Add support for 16-bit character strings (a large amount of work involving
|
||||
many changes and refactorings).
|
||||
|
||||
7. RunGrepTest failed on msys because \r\n was replaced by whitespace when the
|
||||
command "pattern=`printf 'xxx\r\njkl'`" was run. The pattern is now taken
|
||||
from a file.
|
||||
|
||||
8. Ovector size of 2 is also supported by JIT based pcre_exec (the ovector size
|
||||
rounding is not applied in this particular case).
|
||||
|
||||
9. The invalid Unicode surrogate codepoints U+D800 to U+DFFF are now rejected
|
||||
if they appear, or are escaped, in patterns.
|
||||
|
||||
10. Get rid of a number of -Wunused-but-set-variable warnings.
|
||||
|
||||
11. The pattern /(?=(*:x))(q|)/ matches an empty string, and returns the mark
|
||||
"x". The similar pattern /(?=(*:x))((*:y)q|)/ did not return a mark at all.
|
||||
Oddly, Perl behaves the same way. PCRE has been fixed so that this pattern
|
||||
also returns the mark "x". This bug applied to capturing parentheses,
|
||||
non-capturing parentheses, and atomic parentheses. It also applied to some
|
||||
assertions.
|
||||
|
||||
12. Stephen Kelly's patch to CMakeLists.txt allows it to parse the version
|
||||
information out of configure.ac instead of relying on pcre.h.generic, which
|
||||
is not stored in the repository.
|
||||
|
||||
13. Applied Dmitry V. Levin's patch for a more portable method for linking with
|
||||
-lreadline.
|
||||
|
||||
14. ZH added PCRE_CONFIG_JITTARGET; added its output to pcretest -C.
|
||||
|
||||
15. Applied Graycode's patch to put the top-level frame on the stack rather
|
||||
than the heap when not using the stack for recursion. This gives a
|
||||
performance improvement in many cases when recursion is not deep.
|
||||
|
||||
16. Experimental code added to "pcretest -C" to output the stack frame size.
|
||||
|
||||
|
||||
Version 8.21 12-Dec-2011
|
||||
------------------------
|
||||
|
||||
1. Updating the JIT compiler.
|
||||
|
||||
2. JIT compiler now supports OP_NCREF, OP_RREF and OP_NRREF. New test cases
|
||||
are added as well.
|
||||
|
||||
3. Fix cache-flush issue on PowerPC (It is still an experimental JIT port).
|
||||
PCRE_EXTRA_TABLES is not supported by JIT, and should be checked before
|
||||
calling _pcre_jit_exec. Some extra comments are added.
|
||||
|
||||
4. (*MARK) settings inside atomic groups that do not contain any capturing
|
||||
parentheses, for example, (?>a(*:m)), were not being passed out. This bug
|
||||
was introduced by change 18 for 8.20.
|
||||
|
||||
5. Supporting of \x, \U and \u in JavaScript compatibility mode based on the
|
||||
ECMA-262 standard.
|
||||
|
||||
6. Lookbehinds such as (?<=a{2}b) that contained a fixed repetition were
|
||||
erroneously being rejected as "not fixed length" if PCRE_CASELESS was set.
|
||||
This bug was probably introduced by change 9 of 8.13.
|
||||
|
||||
7. While fixing 6 above, I noticed that a number of other items were being
|
||||
incorrectly rejected as "not fixed length". This arose partly because newer
|
||||
opcodes had not been added to the fixed-length checking code. I have (a)
|
||||
corrected the bug and added tests for these items, and (b) arranged for an
|
||||
error to occur if an unknown opcode is encountered while checking for fixed
|
||||
length instead of just assuming "not fixed length". The items that were
|
||||
rejected were: (*ACCEPT), (*COMMIT), (*FAIL), (*MARK), (*PRUNE), (*SKIP),
|
||||
(*THEN), \h, \H, \v, \V, and single character negative classes with fixed
|
||||
repetitions, e.g. [^a]{3}, with and without PCRE_CASELESS.
|
||||
|
||||
8. A possessively repeated conditional subpattern such as (?(?=c)c|d)++ was
|
||||
being incorrectly compiled and would have given unpredictable results.
|
||||
|
||||
9. A possessively repeated subpattern with minimum repeat count greater than
|
||||
one behaved incorrectly. For example, (A){2,}+ behaved as if it was
|
||||
(A)(A)++ which meant that, after a subsequent mismatch, backtracking into
|
||||
the first (A) could occur when it should not.
|
||||
|
||||
10. Add a cast and remove a redundant test from the code.
|
||||
|
||||
11. JIT should use pcre_malloc/pcre_free for allocation.
|
||||
|
||||
12. Updated pcre-config so that it no longer shows -L/usr/lib, which seems
|
||||
best practice nowadays, and helps with cross-compiling. (If the exec_prefix
|
||||
is anything other than /usr, -L is still shown).
|
||||
|
||||
13. In non-UTF-8 mode, \C is now supported in lookbehinds and DFA matching.
|
||||
|
||||
14. Perl does not support \N without a following name in a [] class; PCRE now
|
||||
also gives an error.
|
||||
|
||||
15. If a forward reference was repeated with an upper limit of around 2000,
|
||||
it caused the error "internal error: overran compiling workspace". The
|
||||
maximum number of forward references (including repeats) was limited by the
|
||||
internal workspace, and dependent on the LINK_SIZE. The code has been
|
||||
rewritten so that the workspace expands (via pcre_malloc) if necessary, and
|
||||
the default depends on LINK_SIZE. There is a new upper limit (for safety)
|
||||
of around 200,000 forward references. While doing this, I also speeded up
|
||||
the filling in of repeated forward references.
|
||||
|
||||
16. A repeated forward reference in a pattern such as (a)(?2){2}(.) was
|
||||
incorrectly expecting the subject to contain another "a" after the start.
|
||||
|
||||
17. When (*SKIP:name) is activated without a corresponding (*MARK:name) earlier
|
||||
in the match, the SKIP should be ignored. This was not happening; instead
|
||||
the SKIP was being treated as NOMATCH. For patterns such as
|
||||
/A(*MARK:A)A+(*SKIP:B)Z|AAC/ this meant that the AAC branch was never
|
||||
tested.
|
||||
|
||||
18. The behaviour of (*MARK), (*PRUNE), and (*THEN) has been reworked and is
|
||||
now much more compatible with Perl, in particular in cases where the result
|
||||
is a non-match for a non-anchored pattern. For example, if
|
||||
/b(*:m)f|a(*:n)w/ is matched against "abc", the non-match returns the name
|
||||
"m", where previously it did not return a name. A side effect of this
|
||||
change is that for partial matches, the last encountered mark name is
|
||||
returned, as for non matches. A number of tests that were previously not
|
||||
Perl-compatible have been moved into the Perl-compatible test files. The
|
||||
refactoring has had the pleasing side effect of removing one argument from
|
||||
the match() function, thus reducing its stack requirements.
|
||||
|
||||
19. If the /S+ option was used in pcretest to study a pattern using JIT,
|
||||
subsequent uses of /S (without +) incorrectly behaved like /S+.
|
||||
|
||||
21. Retrieve executable code size support for the JIT compiler and fixing
|
||||
some warnings.
|
||||
|
||||
22. A caseless match of a UTF-8 character whose other case uses fewer bytes did
|
||||
not work when the shorter character appeared right at the end of the
|
||||
subject string.
|
||||
|
||||
23. Added some (int) casts to non-JIT modules to reduce warnings on 64-bit
|
||||
systems.
|
||||
|
||||
24. Added PCRE_INFO_JITSIZE to pass on the value from (21) above, and also
|
||||
output it when the /M option is used in pcretest.
|
||||
|
||||
25. The CheckMan script was not being included in the distribution. Also, added
|
||||
an explicit "perl" to run Perl scripts from the PrepareRelease script
|
||||
because this is reportedly needed in Windows.
|
||||
|
||||
26. If study data was being saved in a file and studying had not found a set of
|
||||
"starts with" bytes for the pattern, the data written to the file (though
|
||||
never used) was taken from uninitialized memory and so caused valgrind to
|
||||
complain.
|
||||
|
||||
27. Updated RunTest.bat as provided by Sheri Pierce.
|
||||
|
||||
28. Fixed a possible uninitialized memory bug in pcre_jit_compile.c.
|
||||
|
||||
29. Computation of memory usage for the table of capturing group names was
|
||||
giving an unnecessarily large value.
|
||||
|
||||
|
||||
Version 8.20 21-Oct-2011
|
||||
------------------------
|
||||
|
||||
1. Change 37 of 8.13 broke patterns like [:a]...[b:] because it thought it had
|
||||
a POSIX class. After further experiments with Perl, which convinced me that
|
||||
Perl has bugs and confusions, a closing square bracket is no longer allowed
|
||||
in a POSIX name. This bug also affected patterns with classes that started
|
||||
with full stops.
|
||||
|
||||
2. If a pattern such as /(a)b|ac/ is matched against "ac", there is no
|
||||
captured substring, but while checking the failing first alternative,
|
||||
substring 1 is temporarily captured. If the output vector supplied to
|
||||
pcre_exec() was not big enough for this capture, the yield of the function
|
||||
was still zero ("insufficient space for captured substrings"). This cannot
|
||||
be totally fixed without adding another stack variable, which seems a lot
|
||||
of expense for an edge case. However, I have improved the situation in cases
|
||||
such as /(a)(b)x|abc/ matched against "abc", where the return code
|
||||
indicates that fewer than the maximum number of slots in the ovector have
|
||||
been set.
|
||||
|
||||
3. Related to (2) above: when there are more back references in a pattern than
|
||||
slots in the output vector, pcre_exec() uses temporary memory during
|
||||
matching, and copies in the captures as far as possible afterwards. It was
|
||||
using the entire output vector, but this conflicts with the specification
|
||||
that only 2/3 is used for passing back captured substrings. Now it uses
|
||||
only the first 2/3, for compatibility. This is, of course, another edge
|
||||
case.
|
||||
|
||||
4. Zoltan Herczeg's just-in-time compiler support has been integrated into the
|
||||
main code base, and can be used by building with --enable-jit. When this is
|
||||
done, pcregrep automatically uses it unless --disable-pcregrep-jit or the
|
||||
runtime --no-jit option is given.
|
||||
|
||||
5. When the number of matches in a pcre_dfa_exec() run exactly filled the
|
||||
ovector, the return from the function was zero, implying that there were
|
||||
other matches that did not fit. The correct "exactly full" value is now
|
||||
returned.
|
||||
|
||||
6. If a subpattern that was called recursively or as a subroutine contained
|
||||
(*PRUNE) or any other control that caused it to give a non-standard return,
|
||||
invalid errors such as "Error -26 (nested recursion at the same subject
|
||||
position)" or even infinite loops could occur.
|
||||
|
||||
7. If a pattern such as /a(*SKIP)c|b(*ACCEPT)|/ was studied, it stopped
|
||||
computing the minimum length on reaching *ACCEPT, and so ended up with the
|
||||
wrong value of 1 rather than 0. Further investigation indicates that
|
||||
computing a minimum subject length in the presence of *ACCEPT is difficult
|
||||
(think back references, subroutine calls), and so I have changed the code
|
||||
so that no minimum is registered for a pattern that contains *ACCEPT.
|
||||
|
||||
8. If (*THEN) was present in the first (true) branch of a conditional group,
|
||||
it was not handled as intended. [But see 16 below.]
|
||||
|
||||
9. Replaced RunTest.bat and CMakeLists.txt with improved versions provided by
|
||||
Sheri Pierce.
|
||||
|
||||
10. A pathological pattern such as /(*ACCEPT)a/ was miscompiled, thinking that
|
||||
the first byte in a match must be "a".
|
||||
|
||||
11. Change 17 for 8.13 increased the recursion depth for patterns like
|
||||
/a(?:.)*?a/ drastically. I've improved things by remembering whether a
|
||||
pattern contains any instances of (*THEN). If it does not, the old
|
||||
optimizations are restored. It would be nice to do this on a per-group
|
||||
basis, but at the moment that is not feasible.
|
||||
|
||||
12. In some environments, the output of pcretest -C is CRLF terminated. This
|
||||
broke RunTest's code that checks for the link size. A single white space
|
||||
character after the value is now allowed for.
|
||||
|
||||
13. RunTest now checks for the "fr" locale as well as for "fr_FR" and "french".
|
||||
For "fr", it uses the Windows-specific input and output files.
|
||||
|
||||
14. If (*THEN) appeared in a group that was called recursively or as a
|
||||
subroutine, it did not work as intended. [But see next item.]
|
||||
|
||||
15. Consider the pattern /A (B(*THEN)C) | D/ where A, B, C, and D are complex
|
||||
pattern fragments (but not containing any | characters). If A and B are
|
||||
matched, but there is a failure in C so that it backtracks to (*THEN), PCRE
|
||||
was behaving differently to Perl. PCRE backtracked into A, but Perl goes to
|
||||
D. In other words, Perl considers parentheses that do not contain any |
|
||||
characters to be part of a surrounding alternative, whereas PCRE was
|
||||
treating (B(*THEN)C) the same as (B(*THEN)C|(*FAIL)) -- which Perl handles
|
||||
differently. PCRE now behaves in the same way as Perl, except in the case
|
||||
of subroutine/recursion calls such as (?1) which have in any case always
|
||||
been different (but PCRE had them first :-).
|
||||
|
||||
16. Related to 15 above: Perl does not treat the | in a conditional group as
|
||||
creating alternatives. Such a group is treated in the same way as an
|
||||
ordinary group without any | characters when processing (*THEN). PCRE has
|
||||
been changed to match Perl's behaviour.
|
||||
|
||||
17. If a user had set PCREGREP_COLO(U)R to something other than 1:31, the
|
||||
RunGrepTest script failed.
|
||||
|
||||
18. Change 22 for version 13 caused atomic groups to use more stack. This is
|
||||
inevitable for groups that contain captures, but it can lead to a lot of
|
||||
stack use in large patterns. The old behaviour has been restored for atomic
|
||||
groups that do not contain any capturing parentheses.
|
||||
|
||||
19. If the PCRE_NO_START_OPTIMIZE option was set for pcre_compile(), it did not
|
||||
suppress the check for a minimum subject length at run time. (If it was
|
||||
given to pcre_exec() or pcre_dfa_exec() it did work.)
|
||||
|
||||
20. Fixed an ASCII-dependent infelicity in pcretest that would have made it
|
||||
fail to work when decoding hex characters in data strings in EBCDIC
|
||||
environments.
|
||||
|
||||
21. It appears that in at least one Mac OS environment, the isxdigit() function
|
||||
is implemented as a macro that evaluates to its argument more than once,
|
||||
contravening the C 90 Standard (I haven't checked a later standard). There
|
||||
was an instance in pcretest which caused it to go wrong when processing
|
||||
\x{...} escapes in subject strings. The code has been rewritten to avoid using
|
||||
things like p++ in the argument of isxdigit().
|
||||
|
||||
|
||||
Version 8.13 16-Aug-2011
|
||||
------------------------
|
||||
|
||||
1. The Unicode data tables have been updated to Unicode 6.0.0.
|
||||
|
||||
2. Two minor typos in pcre_internal.h have been fixed.
|
||||
|
||||
3. Added #include <string.h> to pcre_scanner_unittest.cc, pcrecpp.cc, and
|
||||
pcrecpp_unittest.cc. They are needed for strcmp(), memset(), and strchr()
|
||||
in some environments (e.g. Solaris 10/SPARC using Sun Studio 12U2).
|
||||
|
||||
4. There were a number of related bugs in the code for matching backreferences
|
||||
caselessly in UTF-8 mode when codes for the characters concerned were
|
||||
different numbers of bytes. For example, U+023A and U+2C65 are an upper
|
||||
and lower case pair, using 2 and 3 bytes, respectively. The main bugs were:
|
||||
(a) A reference to 3 copies of a 2-byte code matched only 2 of a 3-byte
|
||||
code. (b) A reference to 2 copies of a 3-byte code would not match 2 of a
|
||||
2-byte code at the end of the subject (it thought there wasn't enough data
|
||||
left).
|
||||
|
||||
5. Comprehensive information about what went wrong is now returned by
|
||||
pcre_exec() and pcre_dfa_exec() when the UTF-8 string check fails, as long
|
||||
as the output vector has at least 2 elements. The offset of the start of
|
||||
the failing character and a reason code are placed in the vector.
|
||||
|
||||
6. When the UTF-8 string check fails for pcre_compile(), the offset that is
|
||||
now returned is for the first byte of the failing character, instead of the
|
||||
last byte inspected. This is an incompatible change, but I hope it is small
|
||||
enough not to be a problem. It makes the returned offset consistent with
|
||||
pcre_exec() and pcre_dfa_exec().
|
||||
|
||||
7. pcretest now gives a text phrase as well as the error number when
|
||||
pcre_exec() or pcre_dfa_exec() fails; if the error is a UTF-8 check
|
||||
failure, the offset and reason code are output.
|
||||
|
||||
8. When \R was used with a maximizing quantifier it failed to skip backwards
|
||||
over a \r\n pair if the subsequent match failed. Instead, it just skipped
|
||||
back over a single character (\n). This seems wrong (because it treated the
|
||||
two characters as a single entity when going forwards), conflicts with the
|
||||
documentation that \R is equivalent to (?>\r\n|\n|...etc), and makes the
|
||||
behaviour of \R* different to (\R)*, which also seems wrong. The behaviour
|
||||
has been changed.
|
||||
|
||||
9. Some internal refactoring has changed the processing so that the handling
|
||||
of the PCRE_CASELESS and PCRE_MULTILINE options is done entirely at compile
|
||||
time (the PCRE_DOTALL option was changed this way some time ago: version
|
||||
7.7 change 16). This has made it possible to abolish the OP_OPT op code,
|
||||
which was always a bit of a fudge. It also means that there is one less
|
||||
argument for the match() function, which reduces its stack requirements
|
||||
slightly. This change also fixes an incompatibility with Perl: the pattern
|
||||
(?i:([^b]))(?1) should not match "ab", but previously PCRE gave a match.
|
||||
|
||||
10. More internal refactoring has drastically reduced the number of recursive
|
||||
calls to match() for possessively repeated groups such as (abc)++ when
|
||||
using pcre_exec().
|
||||
|
||||
11. While implementing 10, a number of bugs in the handling of groups were
|
||||
discovered and fixed:
|
||||
|
||||
(?<=(a)+) was not diagnosed as invalid (non-fixed-length lookbehind).
|
||||
(a|)*(?1) gave a compile-time internal error.
|
||||
((a|)+)+ did not notice that the outer group could match an empty string.
|
||||
(^a|^)+ was not marked as anchored.
|
||||
(.*a|.*)+ was not marked as matching at start or after a newline.
|
||||
|
||||
12. Yet more internal refactoring has removed another argument from the match()
|
||||
function. Special calls to this function are now indicated by setting a
|
||||
value in a variable in the "match data" data block.
|
||||
|
||||
13. Be more explicit in pcre_study() instead of relying on "default" for
|
||||
opcodes that mean there is no starting character; this means that when new
|
||||
ones are added and accidentally left out of pcre_study(), testing should
|
||||
pick them up.
|
||||
|
||||
14. The -s option of pcretest has been documented for ages as being an old
|
||||
synonym of -m (show memory usage). I have changed it to mean "force study
|
||||
for every regex", that is, assume /S for every regex. This is similar to -i
|
||||
and -d etc. It's slightly incompatible, but I'm hoping nobody is still
|
||||
using it. It makes it easier to run collections of tests with and without
|
||||
study enabled, and thereby test pcre_study() more easily. All the standard
|
||||
tests are now run with and without -s (but some patterns can be marked as
|
||||
"never study" - see 20 below).
|
||||
|
||||
15. When (*ACCEPT) was used in a subpattern that was called recursively, the
|
||||
restoration of the capturing data to the outer values was not happening
|
||||
correctly.
|
||||
|
||||
16. If a recursively called subpattern ended with (*ACCEPT) and matched an
|
||||
empty string, and PCRE_NOTEMPTY was set, pcre_exec() thought the whole
|
||||
pattern had matched an empty string, and so incorrectly returned a no
|
||||
match.
|
||||
|
||||
17. There was optimizing code for the last branch of non-capturing parentheses,
|
||||
and also for the obeyed branch of a conditional subexpression, which used
|
||||
tail recursion to cut down on stack usage. Unfortunately, now that there is
|
||||
the possibility of (*THEN) occurring in these branches, tail recursion is
|
||||
no longer possible because the return has to be checked for (*THEN). These
|
||||
two optimizations have therefore been removed. [But see 8.20/11 above.]
|
||||
|
||||
18. If a pattern containing \R was studied, it was assumed that \R always
|
||||
matched two bytes, thus causing the minimum subject length to be
|
||||
incorrectly computed because \R can also match just one byte.
|
||||
|
||||
19. If a pattern containing (*ACCEPT) was studied, the minimum subject length
|
||||
was incorrectly computed.
|
||||
|
||||
20. If /S is present twice on a test pattern in pcretest input, it now
|
||||
*disables* studying, thereby overriding the use of -s on the command line
|
||||
(see 14 above). This is necessary for one or two tests to keep the output
|
||||
identical in both cases.
|
||||
|
||||
21. When (*ACCEPT) was used in an assertion that matched an empty string and
|
||||
PCRE_NOTEMPTY was set, PCRE applied the non-empty test to the assertion.
|
||||
|
||||
22. When an atomic group that contained a capturing parenthesis was
|
||||
successfully matched, but the branch in which it appeared failed, the
|
||||
capturing was not being forgotten if a higher numbered group was later
|
||||
captured. For example, /(?>(a))b|(a)c/ when matching "ac" set capturing
|
||||
group 1 to "a", when in fact it should be unset. This applied to multi-
|
||||
branched capturing and non-capturing groups, repeated or not, and also to
|
||||
positive assertions (capturing in negative assertions does not happen
|
||||
in PCRE) and also to nested atomic groups.
|
||||
|
||||
23. Add the ++ qualifier feature to pcretest, to show the remainder of the
|
||||
subject after a captured substring, to make it easier to tell which of a
|
||||
number of identical substrings has been captured.
|
||||
|
||||
24. The way atomic groups are processed by pcre_exec() has been changed so that
|
||||
if they are repeated, backtracking one repetition now resets captured
|
||||
values correctly. For example, if ((?>(a+)b)+aabab) is matched against
|
||||
"aaaabaaabaabab" the value of captured group 2 is now correctly recorded as
|
||||
"aaa". Previously, it would have been "a". As part of this code
|
||||
refactoring, the way recursive calls are handled has also been changed.
|
||||
|
||||
25. If an assertion condition captured any substrings, they were not passed
|
||||
back unless some other capturing happened later. For example, if
|
||||
(?(?=(a))a) was matched against "a", no capturing was returned.
|
||||
|
||||
26. When studying a pattern that contained subroutine calls or assertions,
|
||||
the code for finding the minimum length of a possible match was handling
|
||||
direct recursions such as (xxx(?1)|yyy) but not mutual recursions (where
|
||||
group 1 called group 2 while simultaneously a separate group 2 called group
|
||||
1). A stack overflow occurred in this case. I have fixed this by limiting
|
||||
the recursion depth to 10.
|
||||
|
||||
27. Updated RunTest.bat in the distribution to the version supplied by Tom
|
||||
Fortmann. This supports explicit test numbers on the command line, and has
|
||||
argument validation and error reporting.
|
||||
|
||||
28. An instance of \X with an unlimited repeat could fail if at any point the
|
||||
first character it looked at was a mark character.
|
||||
|
||||
29. Some minor code refactoring concerning Unicode properties and scripts
|
||||
should reduce the stack requirement of match() slightly.
|
||||
|
||||
30. Added the '=' option to pcretest to check the setting of unused capturing
|
||||
slots at the end of the pattern, which are documented as being -1, but are
|
||||
not included in the return count.
|
||||
|
||||
31. If \k was not followed by a braced, angle-bracketed, or quoted name, PCRE
|
||||
compiled something random. Now it gives a compile-time error (as does
|
||||
Perl).
|
||||
|
||||
32. A *MARK encountered during the processing of a positive assertion is now
|
||||
recorded and passed back (compatible with Perl).
|
||||
|
||||
33. If --only-matching or --colour was set on a pcregrep call whose pattern
|
||||
had alternative anchored branches, the search for a second match in a line
|
||||
was done as if at the line start. Thus, for example, /^01|^02/ incorrectly
|
||||
matched the line "0102" twice. The same bug affected patterns that started
|
||||
with a backwards assertion. For example /\b01|\b02/ also matched "0102"
|
||||
twice.
|
||||
|
||||
34. Previously, PCRE did not allow quantification of assertions. However, Perl
|
||||
does, and because of capturing effects, quantifying parenthesized
|
||||
assertions may at times be useful. Quantifiers are now allowed for
|
||||
parenthesized assertions.
|
||||
|
||||
35. A minor code tidy in pcre_compile() when checking options for \R usage.
|
||||
|
||||
36. \g was being checked for fancy things in a character class, when it should
|
||||
just be a literal "g".
|
||||
|
||||
37. PCRE was rejecting [:a[:digit:]] whereas Perl was not. It seems that the
|
||||
appearance of a nested POSIX class supersedes an apparent external class.
|
||||
For example, [:a[:digit:]b:] matches "a", "b", ":", or a digit. Also,
|
||||
unescaped square brackets may also appear as part of class names. For
|
||||
example, [:a[:abc]b:] gives unknown class "[:abc]b:]". PCRE now behaves
|
||||
more like Perl. (But see 8.20/1 above.)
|
||||
|
||||
38. PCRE was giving an error for \N with a braced quantifier such as {1,} (this
|
||||
was because it thought it was \N{name}, which is not supported).
|
||||
|
||||
39. Add minix to OS list not supporting the -S option in pcretest.
|
||||
|
||||
40. PCRE tries to detect cases of infinite recursion at compile time, but it
|
||||
cannot analyze patterns in sufficient detail to catch mutual recursions
|
||||
such as ((?1))((?2)). There is now a runtime test that gives an error if a
|
||||
subgroup is called recursively as a subpattern for a second time at the
|
||||
same position in the subject string. In previous releases this might have
|
||||
been caught by the recursion limit, or it might have run out of stack.
|
||||
|
||||
41. A pattern such as /(?(R)a+|(?R)b)/ is quite safe, as the recursion can
|
||||
happen only once. PCRE was, however, incorrectly giving a compile time error
|
||||
"recursive call could loop indefinitely" because it cannot analyze the
|
||||
pattern in sufficient detail. The compile time test no longer happens when
|
||||
PCRE is compiling a conditional subpattern, but actual runaway loops are
|
||||
now caught at runtime (see 40 above).
|
||||
|
||||
42. It seems that Perl allows any characters other than a closing parenthesis
|
||||
to be part of the NAME in (*MARK:NAME) and other backtracking verbs. PCRE
|
||||
has been changed to be the same.
|
||||
|
||||
43. Updated configure.ac to put in more quoting round AC_LANG_PROGRAM etc. so
|
||||
as not to get warnings when autogen.sh is called. Also changed
|
||||
AC_PROG_LIBTOOL (deprecated) to LT_INIT (the current macro).
|
||||
|
||||
44. To help people who use pcregrep to scan files containing exceedingly long
|
||||
lines, the following changes have been made:
|
||||
|
||||
(a) The default value of the buffer size parameter has been increased from
|
||||
8K to 20K. (The actual buffer used is three times this size.)
|
||||
|
||||
(b) The default can be changed by ./configure --with-pcregrep-bufsize when
|
||||
PCRE is built.
|
||||
|
||||
(c) A --buffer-size=n option has been added to pcregrep, to allow the size
|
||||
to be set at run time.
|
||||
|
||||
(d) Numerical values in pcregrep options can be followed by K or M, for
|
||||
example --buffer-size=50K.
|
||||
|
||||
(e) If a line being scanned overflows pcregrep's buffer, an error is now
|
||||
given and the return code is set to 2.
|
||||
|
||||
45. Add a pointer to the latest mark to the callout data block.
|
||||
|
||||
46. The pattern /.(*F)/, when applied to "abc" with PCRE_PARTIAL_HARD, gave a
|
||||
partial match of an empty string instead of no match. This was specific to
|
||||
the use of ".".
|
||||
|
||||
47. The pattern /f.*/8s, when applied to "for" with PCRE_PARTIAL_HARD, gave a
|
||||
complete match instead of a partial match. This bug was dependent on both
|
||||
the PCRE_UTF8 and PCRE_DOTALL options being set.
|
||||
|
||||
48. For a pattern such as /\babc|\bdef/ pcre_study() was failing to set up the
|
||||
starting byte set, because \b was not being ignored.
|
||||
|
||||
|
||||
Version 8.12 15-Jan-2011
|
||||
------------------------
|
||||
|
||||
|
@ -2,7 +2,8 @@ Technical Notes about PCRE
|
||||
--------------------------
|
||||
|
||||
These are very rough technical notes that record potentially useful information
|
||||
about PCRE internals.
|
||||
about PCRE internals. For information about testing PCRE, see the pcretest
|
||||
documentation and the comment at the head of the RunTest file.
|
||||
|
||||
|
||||
Historical note 1
|
||||
@ -48,6 +49,18 @@ complexity in Perl regular expressions, I couldn't do this. In any case, a
|
||||
first pass through the pattern is helpful for other reasons.
|
||||
|
||||
|
||||
Support for 16-bit data strings
|
||||
-------------------------------
|
||||
|
||||
From release 8.30, PCRE supports 16-bit as well as 8-bit data strings, by being
|
||||
compilable in either 8-bit or 16-bit modes, or both. Thus, two different
|
||||
libraries can be created. In the description that follows, the word "short" is
|
||||
used for a 16-bit data quantity, and the word "unit" is used for a quantity
|
||||
that is a byte in 8-bit mode and a short in 16-bit mode. However, so as not to
|
||||
over-complicate the text, the names of PCRE functions are given in 8-bit form
|
||||
only.
|
||||
|
||||
|
||||
Computing the memory requirement: how it was
|
||||
--------------------------------------------
|
||||
|
||||
@ -68,7 +81,7 @@ things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then
|
||||
I had a flash of inspiration as to how I could run the real compile function in
|
||||
a "fake" mode that enables it to compute how much memory it would need, while
|
||||
actually only ever using a few hundred bytes of working memory, and without too
|
||||
many tests of the mode that might slow it down. So I re-factored the compiling
|
||||
many tests of the mode that might slow it down. So I refactored the compiling
|
||||
functions to work this way. This got rid of about 600 lines of source. It
|
||||
should make future maintenance and development easier. As this was such a major
|
||||
change, I never released 6.8, instead upping the number to 7.0 (other quite
|
||||
@ -88,7 +101,10 @@ The "traditional", and original, matching function is called pcre_exec(), and
|
||||
it implements an NFA algorithm, similar to the original Henry Spencer algorithm
|
||||
and the way that Perl works. This is not surprising, since it is intended to be
|
||||
as compatible with Perl as possible. This is the function most users of PCRE
|
||||
will use most of the time.
|
||||
will use most of the time. From release 8.20, if PCRE is compiled with
|
||||
just-in-time (JIT) support, and studying a compiled pattern with JIT is
|
||||
successful, the JIT code is run instead of the normal pcre_exec() code, but the
|
||||
result is the same.
|
||||
|
||||
|
||||
Supplementary matching function
|
||||
@ -108,28 +124,38 @@ needed at compile time to produce a traditional FSM where only one state is
|
||||
ever active at once. I believe some other regex matchers work this way.
|
||||
|
||||
|
||||
Changeable options
|
||||
------------------
|
||||
|
||||
The /i, /m, or /s options (PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL) may
|
||||
change in the middle of patterns. From PCRE 8.13, their processing is handled
|
||||
entirely at compile time by generating different opcodes for the different
|
||||
settings. The runtime functions do not need to keep track of an options state
|
||||
any more.
|
||||
|
||||
|
||||
Format of compiled patterns
|
||||
---------------------------
|
||||
|
||||
The compiled form of a pattern is a vector of bytes, containing items of
|
||||
variable length. The first byte in an item is an opcode, and the length of the
|
||||
item is either implicit in the opcode or contained in the data bytes that
|
||||
follow it.
|
||||
The compiled form of a pattern is a vector of units (bytes in 8-bit mode, or
|
||||
shorts in 16-bit mode), containing items of variable length. The first unit in
|
||||
an item contains an opcode, and the length of the item is either implicit in
|
||||
the opcode or contained in the data that follows it.
|
||||
|
||||
In many cases below LINK_SIZE data values are specified for offsets within the
|
||||
compiled pattern. The default value for LINK_SIZE is 2, but PCRE can be
|
||||
compiled to use 3-byte or 4-byte values for these offsets (impairing the
|
||||
performance). This is necessary only when patterns whose compiled length is
|
||||
greater than 64K are going to be processed. In this description, we assume the
|
||||
"normal" compilation options. Data values that are counts (e.g. for
|
||||
quantifiers) are always just two bytes long.
|
||||
|
||||
A list of the opcodes follows:
|
||||
In many cases listed below, LINK_SIZE data values are specified for offsets
|
||||
within the compiled pattern. LINK_SIZE always specifies a number of bytes. The
|
||||
default value for LINK_SIZE is 2, but PCRE can be compiled to use 3-byte or
|
||||
4-byte values for these offsets, although this impairs the performance. (3-byte
|
||||
LINK_SIZE values are available only in 8-bit mode.) Specifying a LINK_SIZE
|
||||
larger than 2 is necessary only when patterns whose compiled length is greater
|
||||
than 64K are going to be processed. In this description, we assume the "normal"
|
||||
compilation options. Data values that are counts (e.g. for quantifiers) are
|
||||
always just two bytes long (one short in 16-bit mode).
|
||||
|
||||
Opcodes with no following data
|
||||
------------------------------
|
||||
|
||||
These items are all just one byte long
|
||||
These items are all just one unit long
|
||||
|
||||
OP_END end of pattern
|
||||
OP_ANY match any one character other than newline
|
||||
@ -138,7 +164,8 @@ These items are all just one byte long
|
||||
OP_SOD match start of data: \A
|
||||
OP_SOM, start of match (subject + offset): \G
|
||||
OP_SET_SOM, set start of match (\K)
|
||||
OP_CIRC ^ (start of data, or after \n in multiline)
|
||||
OP_CIRC ^ (start of data)
|
||||
OP_CIRCM ^ multiline mode (start of data or after newline)
|
||||
OP_NOT_WORD_BOUNDARY \B
|
||||
OP_WORD_BOUNDARY \b
|
||||
OP_NOT_DIGIT \D
|
||||
@ -153,7 +180,8 @@ These items are all just one byte long
|
||||
OP_WORDCHAR \w
|
||||
OP_EODN match end of data or \n at end: \Z
|
||||
OP_EOD match end of data: \z
|
||||
OP_DOLL $ (end of data, or before \n in multiline)
|
||||
OP_DOLL $ (end of data, or before final newline)
|
||||
OP_DOLLM $ multiline mode (end of data or before newline)
|
||||
OP_EXTUNI match an extended Unicode character
|
||||
OP_ANYNL match any Unicode newline sequence
|
||||
|
||||
@ -164,49 +192,57 @@ These items are all just one byte long
|
||||
OP_SKIP ) indicating which parentheses must be closed.
|
||||
|
||||
|
||||
Backtracking control verbs with data
|
||||
------------------------------------
|
||||
|
||||
OP_THEN is followed by a LINK_SIZE offset, which is the distance back to the
|
||||
start of the current branch.
|
||||
Backtracking control verbs with (optional) data
|
||||
-----------------------------------------------
|
||||
|
||||
OP_MARK is followed by the mark name, preceded by a one-byte length, and
|
||||
followed by a binary zero. For (*PRUNE), (*SKIP), and (*THEN) with arguments,
|
||||
the opcodes OP_PRUNE_ARG, OP_SKIP_ARG, and OP_THEN_ARG are used. For the first
|
||||
two, the name follows immediately; for OP_THEN_ARG, it follows the LINK_SIZE
|
||||
offset value.
|
||||
(*THEN) without an argument generates the opcode OP_THEN and no following data.
|
||||
OP_MARK is followed by the mark name, preceded by a one-unit length, and
|
||||
followed by a binary zero. For (*PRUNE), (*SKIP), and (*THEN) with arguments,
|
||||
the opcodes OP_PRUNE_ARG, OP_SKIP_ARG, and OP_THEN_ARG are used, with the name
|
||||
following in the same format.
|
||||
|
||||
|
||||
Matching literal characters
|
||||
---------------------------
|
||||
|
||||
The OP_CHAR opcode is followed by a single character that is to be matched
|
||||
casefully. For caseless matching, OP_CHARI is used. In UTF-8 or UTF-16 modes,
|
||||
the character may be more than one unit long.
|
||||
|
||||
|
||||
Repeating single characters
|
||||
---------------------------
|
||||
|
||||
The common repeats (*, +, ?) when applied to a single character use the
|
||||
following opcodes:
|
||||
The common repeats (*, +, ?), when applied to a single character, use the
|
||||
following opcodes, which come in caseful and caseless versions:
|
||||
|
||||
OP_STAR
|
||||
OP_MINSTAR
|
||||
OP_POSSTAR
|
||||
OP_PLUS
|
||||
OP_MINPLUS
|
||||
OP_POSPLUS
|
||||
OP_QUERY
|
||||
OP_MINQUERY
|
||||
OP_POSQUERY
|
||||
Caseful Caseless
|
||||
OP_STAR OP_STARI
|
||||
OP_MINSTAR OP_MINSTARI
|
||||
OP_POSSTAR OP_POSSTARI
|
||||
OP_PLUS OP_PLUSI
|
||||
OP_MINPLUS OP_MINPLUSI
|
||||
OP_POSPLUS OP_POSPLUSI
|
||||
OP_QUERY OP_QUERYI
|
||||
OP_MINQUERY OP_MINQUERYI
|
||||
OP_POSQUERY OP_POSQUERYI
|
||||
|
||||
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
|
||||
Those with "MIN" in their name are the minimizing versions. Those with "POS" in
|
||||
their names are possessive versions. Each is followed by the character that is
|
||||
to be repeated. Other repeats make use of
|
||||
Each opcode is followed by the character that is to be repeated. In ASCII mode,
|
||||
these are two-unit items; in UTF-8 or UTF-16 modes, the length is variable.
|
||||
Those with "MIN" in their names are the minimizing versions. Those with "POS"
|
||||
in their names are possessive versions. Other repeats make use of these
|
||||
opcodes:
|
||||
|
||||
OP_UPTO
|
||||
OP_MINUPTO
|
||||
OP_POSUPTO
|
||||
OP_EXACT
|
||||
Caseful Caseless
|
||||
OP_UPTO OP_UPTOI
|
||||
OP_MINUPTO OP_MINUPTOI
|
||||
OP_POSUPTO OP_POSUPTOI
|
||||
OP_EXACT OP_EXACTI
|
||||
|
||||
which are followed by a two-byte count (most significant first) and the
|
||||
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
|
||||
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
|
||||
OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
|
||||
Each of these is followed by a two-byte (one short) count (most significant
|
||||
byte first in 8-bit mode) and then the repeated character. OP_UPTO matches from
|
||||
0 to the given number. A repeat with a non-zero minimum and a fixed maximum is
|
||||
coded as an OP_EXACT followed by an OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
|
||||
|
||||
|
||||
Repeating character types
|
||||
@ -214,7 +250,7 @@ Repeating character types
|
||||
|
||||
Repeats of things like \d are done exactly as for single characters, except
|
||||
that instead of a character, the opcode for the type is stored in the data
|
||||
byte. The opcodes are:
|
||||
unit. The opcodes are:
|
||||
|
||||
OP_TYPESTAR
|
||||
OP_TYPEMINSTAR
|
||||
@ -236,65 +272,58 @@ Match by Unicode property
|
||||
|
||||
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
|
||||
character by testing its Unicode property (the \p and \P escape sequences).
|
||||
Each is followed by two bytes that encode the desired property as a type and a
|
||||
Each is followed by two units that encode the desired property as a type and a
|
||||
value.
|
||||
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
|
||||
three bytes: OP_PROP or OP_NOTPROP and then the desired property type and
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
|
||||
three units: OP_PROP or OP_NOTPROP, and then the desired property type and
|
||||
value.
|
||||
|
||||
|
||||
Matching literal characters
|
||||
---------------------------
|
||||
|
||||
The OP_CHAR opcode is followed by a single character that is to be matched
|
||||
casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
|
||||
character may be more than one byte long. (Earlier versions of PCRE used
|
||||
multi-character strings, but this was changed to allow some new features to be
|
||||
added.)
|
||||
|
||||
|
||||
Character classes
|
||||
-----------------
|
||||
|
||||
If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
|
||||
class, and OP_NOT for a negative one (that is, for something like [^a]).
|
||||
However, in UTF-8 mode, the use of OP_NOT applies only to characters with
|
||||
values < 128, because OP_NOT is confined to single bytes.
|
||||
If there is only one character in the class, OP_CHAR or OP_CHARI is used for a
|
||||
positive class, and OP_NOT or OP_NOTI for a negative one (that is, for
|
||||
something like [^a]).
|
||||
|
||||
Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
|
||||
negated, single-character class. The normal ones (OP_STAR etc.) are used for a
|
||||
repeated positive single-character class.
|
||||
Another set of 13 repeating opcodes (called OP_NOTSTAR etc.) are used for
|
||||
repeated, negated, single-character classes. The normal single-character
|
||||
opcodes (OP_STAR, etc.) are used for repeated positive single-character
|
||||
classes.
|
||||
|
||||
When there's more than one character in a class and all the characters are less
|
||||
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
|
||||
one. In either case, the opcode is followed by a 32-byte bit map containing a 1
|
||||
bit for every character that is acceptable. The bits are counted from the least
|
||||
significant end of each byte.
|
||||
When there is more than one character in a class and all the characters are
|
||||
less than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a
|
||||
negative one. In either case, the opcode is followed by a 32-byte (16-short)
|
||||
bit map containing a 1 bit for every character that is acceptable. The bits are
|
||||
counted from the least significant end of each unit. In caseless mode, bits for
|
||||
both cases are set.
|
||||
|
||||
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
|
||||
subject characters with values greater than 256 can be handled correctly. For
|
||||
OP_CLASS they don't match, whereas for OP_NCLASS they do.
|
||||
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8/16 mode,
|
||||
subject characters with values greater than 255 can be handled correctly. For
|
||||
OP_CLASS they do not match, whereas for OP_NCLASS they do.
|
||||
|
||||
For classes containing characters with values > 255, OP_XCLASS is used. It
|
||||
optionally uses a bit map (if any characters lie within it), followed by a list
|
||||
of pairs and single characters. There is a flag character that indicates
|
||||
whether it's a positive or a negative class.
|
||||
For classes containing characters with values greater than 255, OP_XCLASS is
|
||||
used. It optionally uses a bit map (if any characters lie within it), followed
|
||||
by a list of pairs (for a range) and single characters. In caseless mode, both
|
||||
cases are explicitly listed. There is a flag character that indicates whether
|
||||
it is a positive or a negative class.
|
||||
|
||||
|
||||
Back references
|
||||
---------------
|
||||
|
||||
OP_REF is followed by two bytes containing the reference number.
|
||||
OP_REF (caseful) or OP_REFI (caseless) is followed by two bytes (one short)
|
||||
containing the reference number.
|
||||
|
||||
|
||||
Repeating character classes and back references
|
||||
-----------------------------------------------
|
||||
|
||||
Single-character classes are handled specially (see above). This section
|
||||
applies to OP_CLASS and OP_REF. In both cases, the repeat information follows
|
||||
the base item. The matching code looks at the following opcode to see if it is
|
||||
one of
|
||||
applies to OP_CLASS and OP_REF[I]. In both cases, the repeat information
|
||||
follows the base item. The matching code looks at the following opcode to see
|
||||
if it is one of
|
||||
|
||||
OP_CRSTAR
|
||||
OP_CRMINSTAR
|
||||
@ -305,10 +334,10 @@ one of
|
||||
OP_CRRANGE
|
||||
OP_CRMINRANGE
|
||||
|
||||
All but the last two are just single-byte items. The others are followed by
|
||||
four bytes of data, comprising the minimum and maximum repeat counts. There are
|
||||
no special possessive opcodes for these repeats; a possessive repeat is
|
||||
compiled into an atomic group.
|
||||
All but the last two are just single-unit items. The others are followed by
|
||||
four bytes (two shorts) of data, comprising the minimum and maximum repeat
|
||||
counts. There are no special possessive opcodes for these repeats; a possessive
|
||||
repeat is compiled into an atomic group.
|
||||
|
||||
|
||||
Brackets and alternation
|
||||
@ -318,7 +347,8 @@ A pair of non-capturing (round) brackets is wrapped round each expression at
|
||||
compile time, so alternation always happens in the context of brackets.
|
||||
|
||||
[Note for North Americans: "bracket" to some English speakers, including
|
||||
myself, can be round, square, curly, or pointy. Hence this usage.]
|
||||
myself, can be round, square, curly, or pointy. Hence this usage rather than
|
||||
"parentheses".]
|
||||
|
||||
Non-capturing brackets use the opcode OP_BRA. Originally PCRE was limited to 99
|
||||
capturing brackets and it used a different opcode for each one. From release
|
||||
@ -330,16 +360,17 @@ A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
|
||||
next alternative OP_ALT or, if there aren't any branches, to the matching
|
||||
OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
|
||||
the next one, or to the OP_KET opcode. For capturing brackets, the bracket
|
||||
number immediately follows the offset, always as a 2-byte item.
|
||||
number immediately follows the offset, always as a 2-byte (one short) item.
|
||||
|
||||
OP_KET is used for subpatterns that do not repeat indefinitely, while
|
||||
OP_KET is used for subpatterns that do not repeat indefinitely, and
|
||||
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
|
||||
maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
|
||||
positive number) the offset back to the matching bracket opcode.
|
||||
maximally respectively (see below for possessive repetitions). All three are
|
||||
followed by LINK_SIZE bytes giving (as a positive number) the offset back to
|
||||
the matching bracket opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
|
||||
single-byte opcodes that tell the matcher that skipping the following
|
||||
single-unit opcodes that tell the matcher that skipping the following
|
||||
subpattern entirely is a valid branch. In the case of the first two, not
|
||||
skipping the pattern is also valid (greedy and non-greedy). The third is used
|
||||
when a pattern has the quantifier {0,0}. It cannot be entirely discarded,
|
||||
@ -362,6 +393,15 @@ final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher
|
||||
that it needs to check for matching an empty string when it hits OP_KETRMIN or
|
||||
OP_KETRMAX, and if so, to break the loop.
|
||||
|
||||
Possessive brackets
|
||||
-------------------
|
||||
|
||||
When a repeated group (capturing or non-capturing) is marked as possessive by
|
||||
the "+" notation, e.g. (abc)++, different opcodes are used. Their names all
|
||||
have POS on the end, e.g. OP_BRAPOS instead of OP_BRA and OP_SCBRAPOS instead
|
||||
of OP_SCBRA. The end of such a group is marked by OP_KETRPOS. If the minimum
|
||||
repetition is zero, the group is preceded by OP_BRAPOSZERO.
|
||||
|
||||
|
||||
Assertions
|
||||
----------
|
||||
@ -369,11 +409,11 @@ Assertions
|
||||
Forward assertions are just like other subpatterns, but starting with one of
|
||||
the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
|
||||
OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
|
||||
is OP_REVERSE, followed by a two byte count of the number of characters to move
|
||||
back the pointer in the subject string. When operating in UTF-8 mode, the count
|
||||
is a character count rather than a byte count. A separate count is present in
|
||||
each alternative of a lookbehind assertion, allowing them to have different
|
||||
fixed lengths.
|
||||
is OP_REVERSE, followed by a two-byte (one short) count of the number of
|
||||
characters to move back the pointer in the subject string. In ASCII mode, the
|
||||
count is a number of units, but in UTF-8/16 mode each character may occupy more
|
||||
than one unit. A separate count is present in each alternative of a lookbehind
|
||||
assertion, allowing them to have different fixed lengths.
|
||||
|
||||
|
||||
Once-only (atomic) subpatterns
|
||||
@ -390,14 +430,15 @@ Conditional subpatterns
|
||||
These are like other subpatterns, but they start with the opcode OP_COND, or
|
||||
OP_SCOND for one that might match an empty string in an unbounded repeat. If
|
||||
the condition is a back reference, this is stored at the start of the
|
||||
subpattern using the opcode OP_CREF followed by two bytes containing the
|
||||
reference number. OP_NCREF is used instead if the reference was generated by
|
||||
name (so that the runtime code knows to check for duplicate names).
|
||||
subpattern using the opcode OP_CREF followed by two bytes (one short)
|
||||
containing the reference number. OP_NCREF is used instead if the reference was
|
||||
generated by name (so that the runtime code knows to check for duplicate
|
||||
names).
|
||||
|
||||
If the condition is "in recursion" (coded as "(?(R)"), or "in recursion of
|
||||
group x" (coded as "(?(Rx)"), the group number is stored at the start of the
|
||||
subpattern using the opcode OP_RREF or OP_NRREF (cf OP_NCREF), and a value of
|
||||
zero for "the whole pattern". For a DEFINE condition, just the single byte
|
||||
zero for "the whole pattern". For a DEFINE condition, just the single unit
|
||||
OP_DEF is used (it has no associated data). Otherwise, a conditional subpattern
|
||||
always starts with one of the assertions.
|
||||
|
||||
@ -416,25 +457,12 @@ are not strictly a recursion.
|
||||
Callout
|
||||
-------
|
||||
|
||||
OP_CALLOUT is followed by one byte of data that holds a callout number in the
|
||||
OP_CALLOUT is followed by one unit of data that holds a callout number in the
|
||||
range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
|
||||
cases there follows a two-byte value giving the offset in the pattern to the
|
||||
start of the following item, and another two-byte item giving the length of the
|
||||
next item.
|
||||
cases there follows a two-byte (one short) value giving the offset in the
|
||||
pattern to the start of the following item, and another two-byte (one short)
|
||||
item giving the length of the next item.
|
||||
|
||||
|
||||
Changing options
|
||||
----------------
|
||||
|
||||
If any of the /i, /m, or /s options are changed within a pattern, an OP_OPT
|
||||
opcode is compiled, followed by one byte containing the new settings of these
|
||||
flags. If there are several alternatives, there is an occurrence of OP_OPT at
|
||||
the start of all those following the first options change, to set appropriate
|
||||
options for the start of the alternative. Immediately after the end of the
|
||||
group there is another such item to reset the flags to their previous values. A
|
||||
change of flag right at the very start of the pattern can be handled entirely
|
||||
at compile time, and so does not cause anything to be put into the compiled
|
||||
data.
|
||||
|
||||
Philip Hazel
|
||||
October 2010
|
||||
February 2012
|
||||
|
@ -9,7 +9,9 @@ specified below. The documentation for PCRE, supplied in the "doc"
|
||||
directory, is distributed under the same terms as the software itself.
|
||||
|
||||
The basic library functions are written in C and are freestanding. Also
|
||||
included in the distribution is a set of C++ wrapper functions.
|
||||
included in the distribution is a set of C++ wrapper functions, and a
|
||||
just-in-time compiler that can be used to optimize pattern matching. These
|
||||
are both optional features that can be omitted when the library is built.
|
||||
|
||||
|
||||
THE BASIC LIBRARY FUNCTIONS
|
||||
@ -22,7 +24,29 @@ Email domain: cam.ac.uk
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
PCRE JUST-IN-TIME COMPILATION SUPPORT
|
||||
-------------------------------------
|
||||
|
||||
Written by: Zoltan Herczeg
|
||||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2012 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
STACK-LESS JUST-IN-TIME COMPILER
|
||||
--------------------------------
|
||||
|
||||
Written by: Zoltan Herczeg
|
||||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2012 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
@ -31,7 +55,7 @@ THE C++ WRAPPER FUNCTIONS
|
||||
|
||||
Contributed by: Google Inc.
|
||||
|
||||
Copyright (c) 2007-2010, Google Inc.
|
||||
Copyright (c) 2007-2012, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -1,6 +1,82 @@
|
||||
News about PCRE releases
|
||||
------------------------
|
||||
|
||||
Release 8.31 06-July-2012
|
||||
-------------------------
|
||||
|
||||
This is mainly a bug-fixing release, with a small number of developments:
|
||||
|
||||
. The JIT compiler now supports partial matching and the (*MARK) and
|
||||
(*COMMIT) verbs.
|
||||
|
||||
. PCRE_INFO_MAXLOOKBEHIND can be used to find the longest lookbehind in a
|
||||
pattern.
|
||||
|
||||
. There should be a performance improvement when using the heap instead of the
|
||||
stack for recursion.
|
||||
|
||||
. pcregrep can now be linked with libedit as an alternative to libreadline.
|
||||
|
||||
. pcregrep now has a --file-list option where the list of files to scan is
|
||||
given as a file.
|
||||
|
||||
. pcregrep now recognizes binary files and there are related options.
|
||||
|
||||
. The Unicode tables have been updated to 6.1.0.
|
||||
|
||||
As always, the full list of changes is in the ChangeLog file.
|
||||
|
||||
|
||||
Release 8.30 04-February-2012
|
||||
-----------------------------
|
||||
|
||||
Release 8.30 introduces a major new feature: support for 16-bit character
|
||||
strings, compiled as a separate library. There are a few changes to the
|
||||
8-bit library, in addition to some bug fixes.
|
||||
|
||||
. The pcre_info() function, which has been obsolete for over 10 years, has
|
||||
been removed.
|
||||
|
||||
. When a compiled pattern was saved to a file and later reloaded on a host
|
||||
with different endianness, PCRE used automatically to swap the bytes in some
|
||||
of the data fields. With the advent of the 16-bit library, where more of this
|
||||
swapping is needed, it is no longer done automatically. Instead, the bad
|
||||
endianness is detected and a specific error is given. The user can then call
|
||||
a new function called pcre_pattern_to_host_byte_order() (or an equivalent
|
||||
16-bit function) to do the swap.
|
||||
|
||||
. In UTF-8 mode, the values 0xd800 to 0xdfff are not legal Unicode
|
||||
code points and are now faulted. (They are the so-called "surrogates"
|
||||
that are reserved for coding high values in UTF-16.)
|
||||
|
||||
|
||||
Release 8.21 12-Dec-2011
|
||||
------------------------
|
||||
|
||||
This is almost entirely a bug-fix release. The only new feature is the ability
|
||||
to obtain the size of the memory used by the JIT compiler.
|
||||
|
||||
|
||||
Release 8.20 21-Oct-2011
|
||||
------------------------
|
||||
|
||||
The main change in this release is the inclusion of Zoltan Herczeg's
|
||||
just-in-time compiler support, which can be accessed by building PCRE with
|
||||
--enable-jit. Large performance benefits can be had in many situations. 8.20
|
||||
also fixes an unfortunate bug that was introduced in 8.13 as well as tidying up
|
||||
a number of infelicities and differences from Perl.
|
||||
|
||||
|
||||
Release 8.13 16-Aug-2011
|
||||
------------------------
|
||||
|
||||
This is mainly a bug-fix release. There has been a lot of internal refactoring.
|
||||
The Unicode tables have been updated. The only new feature in the library is
|
||||
the passing of *MARK information to callouts. Some additions have been made to
|
||||
pcretest to make testing easier and more comprehensive. There is a new option
|
||||
for pcregrep to adjust its internal buffer size.
|
||||
|
||||
|
||||
Release 8.12 15-Jan-2011
|
||||
------------------------
|
||||
|
||||
|
@ -1,501 +1,7 @@
|
||||
Compiling PCRE on non-Unix systems
|
||||
----------------------------------
|
||||
|
||||
This document contains the following sections:
|
||||
This has been renamed to better reflect its contents. Please see the file
|
||||
NON-AUTOTOOLS-BUILD for details of how to build PCRE without using autotools.
|
||||
|
||||
General
|
||||
Generic instructions for the PCRE C library
|
||||
The C++ wrapper functions
|
||||
Building for virtual Pascal
|
||||
Stack size in Windows environments
|
||||
Linking programs in Windows environments
|
||||
Comments about Win32 builds
|
||||
Building PCRE on Windows with CMake
|
||||
Use of relative paths with CMake on Windows
|
||||
Testing with RunTest.bat
|
||||
Building under Windows with BCC5.5
|
||||
Building PCRE on OpenVMS
|
||||
Building PCRE on Stratus OpenVOS
|
||||
|
||||
|
||||
GENERAL
|
||||
|
||||
I (Philip Hazel) have no experience of Windows or VMS systems and how their
|
||||
libraries work. The items in the PCRE distribution and Makefile that relate to
|
||||
anything other than Unix-like systems are untested by me.
|
||||
|
||||
There are some other comments and files (including some documentation in CHM
|
||||
format) in the Contrib directory on the FTP site:
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
|
||||
|
||||
If you want to compile PCRE for a non-Unix system (especially for a system that
|
||||
does not support "configure" and "make" files), note that the basic PCRE
|
||||
library consists entirely of code written in Standard C, and so should compile
|
||||
successfully on any system that has a Standard C compiler and library. The C++
|
||||
wrapper functions are a separate issue (see below).
|
||||
|
||||
The PCRE distribution includes a "configure" file for use by the Configure/Make
|
||||
build system, as found in many Unix-like environments. There is also support
|
||||
for CMake, which some users prefer, especially in Windows environments.
|
||||
There are some instructions for CMake under Windows in the section entitled
|
||||
"Building PCRE with CMake" below. CMake can also be used to build PCRE in
|
||||
Unix-like systems.
|
||||
|
||||
|
||||
GENERIC INSTRUCTIONS FOR THE PCRE C LIBRARY
|
||||
|
||||
The following are generic comments about building the PCRE C library "by hand".
|
||||
|
||||
(1) Copy or rename the file config.h.generic as config.h, and edit the macro
|
||||
settings that it contains to whatever is appropriate for your environment.
|
||||
In particular, if you want to force a specific value for newline, you can
|
||||
define the NEWLINE macro. When you compile any of the PCRE modules, you
|
||||
must specify -DHAVE_CONFIG_H to your compiler so that config.h is included
|
||||
in the sources.
|
||||
|
||||
An alternative approach is not to edit config.h, but to use -D on the
|
||||
compiler command line to make any changes that you need to the
|
||||
configuration options. In this case -DHAVE_CONFIG_H must not be set.
|
||||
|
||||
NOTE: There have been occasions when the way in which certain parameters
|
||||
in config.h are used has changed between releases. (In the configure/make
|
||||
world, this is handled automatically.) When upgrading to a new release,
|
||||
you are strongly advised to review config.h.generic before re-using what
|
||||
you had previously.
|
||||
|
||||
(2) Copy or rename the file pcre.h.generic as pcre.h.
|
||||
|
||||
(3) EITHER:
|
||||
Copy or rename file pcre_chartables.c.dist as pcre_chartables.c.
|
||||
|
||||
OR:
|
||||
Compile dftables.c as a stand-alone program (using -DHAVE_CONFIG_H if
|
||||
you have set up config.h), and then run it with the single argument
|
||||
"pcre_chartables.c". This generates a set of standard character tables
|
||||
and writes them to that file. The tables are generated using the default
|
||||
C locale for your system. If you want to use a locale that is specified
|
||||
by LC_xxx environment variables, add the -L option to the dftables
|
||||
command. You must use this method if you are building on a system that
|
||||
uses EBCDIC code.
|
||||
|
||||
The tables in pcre_chartables.c are defaults. The caller of PCRE can
|
||||
specify alternative tables at run time.
|
||||
|
||||
(4) Ensure that you have the following header files:
|
||||
|
||||
pcre_internal.h
|
||||
ucp.h
|
||||
|
||||
(5) Also ensure that you have the following file, which is #included as source
|
||||
when building a debugging version of PCRE, and is also used by pcretest.
|
||||
|
||||
pcre_printint.src
|
||||
|
||||
(6) Compile the following source files, setting -DHAVE_CONFIG_H as a compiler
|
||||
option if you have set up config.h with your configuration, or else use
|
||||
other -D settings to change the configuration as required.
|
||||
|
||||
pcre_chartables.c
|
||||
pcre_compile.c
|
||||
pcre_config.c
|
||||
pcre_dfa_exec.c
|
||||
pcre_exec.c
|
||||
pcre_fullinfo.c
|
||||
pcre_get.c
|
||||
pcre_globals.c
|
||||
pcre_info.c
|
||||
pcre_maketables.c
|
||||
pcre_newline.c
|
||||
pcre_ord2utf8.c
|
||||
pcre_refcount.c
|
||||
pcre_study.c
|
||||
pcre_tables.c
|
||||
pcre_try_flipped.c
|
||||
pcre_ucd.c
|
||||
pcre_valid_utf8.c
|
||||
pcre_version.c
|
||||
pcre_xclass.c
|
||||
|
||||
Make sure that you include -I. in the compiler command (or equivalent for
|
||||
an unusual compiler) so that all included PCRE header files are first
|
||||
sought in the current directory. Otherwise you run the risk of picking up
|
||||
a previously-installed file from somewhere else.
|
||||
|
||||
(7) Now link all the compiled code into an object library in whichever form
|
||||
your system keeps such libraries. This is the basic PCRE C library. If
|
||||
your system has static and shared libraries, you may have to do this once
|
||||
for each type.
|
||||
|
||||
(8) Similarly, if you want to build the POSIX wrapper functions, ensure that
|
||||
you have the pcreposix.h file and then compile pcreposix.c (remembering
|
||||
-DHAVE_CONFIG_H if necessary). Link the result (on its own) as the
|
||||
pcreposix library.
|
||||
|
||||
(9) Compile the test program pcretest.c (again, don't forget -DHAVE_CONFIG_H).
|
||||
This needs the functions in the PCRE library when linking. It also needs
|
||||
the pcreposix wrapper functions unless you compile it with -DNOPOSIX. The
|
||||
pcretest.c program also needs the pcre_printint.src source file, which it
|
||||
#includes.
|
||||
|
||||
(10) Run pcretest on the testinput files in the testdata directory, and check
|
||||
that the output matches the corresponding testoutput files. Note that the
|
||||
supplied files are in Unix format, with just LF characters as line
|
||||
terminators. You may need to edit them to change this if your system uses
|
||||
a different convention. If you are using Windows, you probably should use
|
||||
the wintestinput3 file instead of testinput3 (and the corresponding output
|
||||
file). This is a locale test; wintestinput3 sets the locale to "french"
|
||||
rather than "fr_FR", and there are some minor output differences.
|
||||
|
||||
(11) If you want to use the pcregrep command, compile and link pcregrep.c; it
|
||||
uses only the basic PCRE library (it does not need the pcreposix library).
|
||||
|
||||
|
||||
THE C++ WRAPPER FUNCTIONS
|
||||
|
||||
The PCRE distribution also contains some C++ wrapper functions and tests,
|
||||
contributed by Google Inc. On a system that can use "configure" and "make",
|
||||
the functions are automatically built into a library called pcrecpp. It should
|
||||
be straightforward to compile the .cc files manually on other systems. The
|
||||
files called xxx_unittest.cc are test programs for each of the corresponding
|
||||
xxx.cc files.
|
||||
|
||||
|
||||
BUILDING FOR VIRTUAL PASCAL
|
||||
|
||||
A script for building PCRE using Borland's C++ compiler for use with VPASCAL
|
||||
was contributed by Alexander Tokarev. Stefan Weber updated the script and added
|
||||
additional files. The following files in the distribution are for building PCRE
|
||||
for use with VP/Borland: makevp_c.txt, makevp_l.txt, makevp.bat, pcregexp.pas.
|
||||
|
||||
|
||||
STACK SIZE IN WINDOWS ENVIRONMENTS
|
||||
|
||||
The default processor stack size of 1Mb in some Windows environments is too
|
||||
small for matching patterns that need much recursion. In particular, test 2 may
|
||||
fail because of this. Normally, running out of stack causes a crash, but there
|
||||
have been cases where the test program has just died silently. See your linker
|
||||
documentation for how to increase stack size if you experience problems. The
|
||||
Linux default of 8Mb is a reasonable choice for the stack, though even that can
|
||||
be too small for some pattern/subject combinations.
|
||||
|
||||
PCRE has a compile configuration option to disable the use of stack for
|
||||
recursion so that heap is used instead. However, pattern matching is
|
||||
significantly slower when this is done. There is more about stack usage in the
|
||||
"pcrestack" documentation.
|
||||
|
||||
|
||||
LINKING PROGRAMS IN WINDOWS ENVIRONMENTS
|
||||
|
||||
If you want to statically link a program against a PCRE library in the form of
|
||||
a non-dll .a file, you must define PCRE_STATIC before including pcre.h or
|
||||
pcrecpp.h, otherwise the pcre_malloc() and pcre_free() exported functions will
|
||||
be declared __declspec(dllimport), with unwanted results.
|
||||
|
||||
|
||||
CALLING CONVENTIONS IN WINDOWS ENVIRONMENTS
|
||||
|
||||
It is possible to compile programs to use different calling conventions using
|
||||
MSVC. Search the web for "calling conventions" for more information. To make it
|
||||
easier to change the calling convention for the exported functions in the
|
||||
PCRE library, the macro PCRE_CALL_CONVENTION is present in all the external
|
||||
definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is
|
||||
not set, it defaults to empty; the default calling convention is then used
|
||||
(which is what is wanted most of the time).
|
||||
|
||||
|
||||
COMMENTS ABOUT WIN32 BUILDS (see also "BUILDING PCRE WITH CMAKE" below)
|
||||
|
||||
There are two ways of building PCRE using the "configure, make, make install"
|
||||
paradigm on Windows systems: using MinGW or using Cygwin. These are not at all
|
||||
the same thing; they are completely different from each other. There is also
|
||||
support for building using CMake, which some users find a more straightforward
|
||||
way of building PCRE under Windows. However, the tests are not run
|
||||
automatically when CMake is used.
|
||||
|
||||
The MinGW home page (http://www.mingw.org/) says this:
|
||||
|
||||
MinGW: A collection of freely available and freely distributable Windows
|
||||
specific header files and import libraries combined with GNU toolsets that
|
||||
allow one to produce native Windows programs that do not rely on any
|
||||
3rd-party C runtime DLLs.
|
||||
|
||||
The Cygwin home page (http://www.cygwin.com/) says this:
|
||||
|
||||
Cygwin is a Linux-like environment for Windows. It consists of two parts:
|
||||
|
||||
. A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing
|
||||
substantial Linux API functionality
|
||||
|
||||
. A collection of tools which provide Linux look and feel.
|
||||
|
||||
The Cygwin DLL currently works with all recent, commercially released x86 32
|
||||
bit and 64 bit versions of Windows, with the exception of Windows CE.
|
||||
|
||||
On both MinGW and Cygwin, PCRE should build correctly using:
|
||||
|
||||
./configure && make && make install
|
||||
|
||||
This should create two libraries called libpcre and libpcreposix, and, if you
|
||||
have enabled building the C++ wrapper, a third one called libpcrecpp. These are
|
||||
independent libraries: when you link with libpcreposix or libpcrecpp you must
|
||||
also link with libpcre, which contains the basic functions. (Some earlier
|
||||
releases of PCRE included the basic libpcre functions in libpcreposix. This no
|
||||
longer happens.)
|
||||
|
||||
A user submitted a special-purpose patch that makes it easy to create
|
||||
"pcre.dll" under mingw32 using the "msys" environment. It provides "pcre.dll"
|
||||
as a special target. If you use this target, no other files are built, and in
|
||||
particular, the pcretest and pcregrep programs are not built. An example of how
|
||||
this might be used is:
|
||||
|
||||
./configure --enable-utf --disable-cpp CFLAGS="-O3 -s"; make pcre.dll
|
||||
|
||||
Using Cygwin's compiler generates libraries and executables that depend on
|
||||
cygwin1.dll. If a library that is generated this way is distributed,
|
||||
cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL
|
||||
licence, this forces not only PCRE to be under the GPL, but also the entire
|
||||
application. A distributor who wants to keep their own code proprietary must
|
||||
purchase an appropriate Cygwin licence.
|
||||
|
||||
MinGW has no such restrictions. The MinGW compiler generates a library or
|
||||
executable that can run standalone on Windows without any third party dll or
|
||||
licensing issues.
|
||||
|
||||
But there is more complication:
|
||||
|
||||
If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is
|
||||
to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a
|
||||
front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's
|
||||
gcc and MinGW's gcc). So, a user can:
|
||||
|
||||
. Build native binaries by using MinGW or by getting Cygwin and using
|
||||
-mno-cygwin.
|
||||
|
||||
. Build binaries that depend on cygwin1.dll by using Cygwin with the normal
|
||||
compiler flags.
|
||||
|
||||
The test files that are supplied with PCRE are in Unix format, with LF
|
||||
characters as line terminators. It may be necessary to change the line
|
||||
terminators in order to get some of the tests to work.
|
||||
|
||||
|
||||
BUILDING PCRE ON WINDOWS WITH CMAKE
|
||||
|
||||
CMake is an alternative configuration facility that can be used instead of the
|
||||
traditional Unix "configure". CMake creates project files (make files, solution
|
||||
files, etc.) tailored to numerous development environments, including Visual
|
||||
Studio, Borland, Msys, MinGW, NMake, and Unix. The following instructions
|
||||
were contributed by a PCRE user.
|
||||
|
||||
1. Install the latest CMake version available from http://www.cmake.org/, and
|
||||
ensure that cmake\bin is on your path.
|
||||
|
||||
2. Unzip (retaining folder structure) the PCRE source tree into a source
|
||||
directory such as C:\pcre.
|
||||
|
||||
3. Create a new, empty build directory, for example C:\pcre\build\
|
||||
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++.
|
||||
|
||||
5. Enter C:\pcre\pcre-xx and C:\pcre\build for the source and build
|
||||
directories, respectively.
|
||||
|
||||
6. Hit the "Configure" button.
|
||||
|
||||
7. Select the particular IDE / build tool that you are using (Visual
|
||||
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
||||
|
||||
8. The GUI will then list several configuration options. This is where
|
||||
you can enable UTF-8 support or other PCRE optional features.
|
||||
|
||||
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
||||
active.
|
||||
|
||||
10. Hit "Generate".
|
||||
|
||||
11. The build directory should now contain a usable build system, be it a
|
||||
solution file for Visual Studio, makefiles for MinGW, etc. Exit from
|
||||
cmake-gui and use the generated build system with your compiler or IDE.
|
||||
|
||||
|
||||
USE OF RELATIVE PATHS WITH CMAKE ON WINDOWS
|
||||
|
||||
A PCRE user comments as follows:
|
||||
|
||||
I thought that others may want to know the current state of
|
||||
CMAKE_USE_RELATIVE_PATHS support on Windows.
|
||||
|
||||
Here it is:
|
||||
-- AdditionalIncludeDirectories is only partially modified (only the
|
||||
first path - see below)
|
||||
-- Only some of the contained file paths are modified - shown below for
|
||||
pcre.vcproj
|
||||
-- It properly modifies
|
||||
|
||||
I am sure CMake people can fix that if they want to. Until then one will
|
||||
need to replace existing absolute paths in project files with relative
|
||||
paths manually (e.g. from VS) - relative to project file location. I did
|
||||
just that before being told to try CMAKE_USE_RELATIVE_PATHS. Not a big
|
||||
deal.
|
||||
|
||||
AdditionalIncludeDirectories="E:\builds\pcre\build;E:\builds\pcre\pcre-7.5;"
|
||||
AdditionalIncludeDirectories=".;E:\builds\pcre\pcre-7.5;"
|
||||
|
||||
RelativePath="pcre.h">
|
||||
RelativePath="pcre_chartables.c">
|
||||
RelativePath="pcre_chartables.c.rule">
|
||||
|
||||
|
||||
TESTING WITH RUNTEST.BAT
|
||||
|
||||
1. Copy RunTest.bat into the directory where pcretest.exe has been created.
|
||||
|
||||
2. Edit RunTest.bat and insert a line that identifies the relative location of
|
||||
the pcre source, e.g.:
|
||||
|
||||
set srcdir=..\pcre-7.4-RC3
|
||||
|
||||
3. Run RunTest.bat from a command shell environment. Test outputs will
|
||||
automatically be compared to expected results, and discrepancies will
|
||||
be identified in the console output.
|
||||
|
||||
4. To test pcrecpp, run pcrecpp_unittest.exe, pcre_stringpiece_unittest.exe and
|
||||
pcre_scanner_unittest.exe.
|
||||
|
||||
|
||||
BUILDING UNDER WINDOWS WITH BCC5.5
|
||||
|
||||
Michael Roy sent these comments about building PCRE under Windows with BCC5.5:
|
||||
|
||||
Some of the core BCC libraries have a version of PCRE from 1998 built in,
|
||||
which can lead to pcre_exec() giving an erroneous PCRE_ERROR_NULL from a
|
||||
version mismatch. I'm including an easy workaround below, if you'd like to
|
||||
include it in the non-unix instructions:
|
||||
|
||||
When linking a project with BCC5.5, pcre.lib must be included before any of
|
||||
the libraries cw32.lib, cw32i.lib, cw32mt.lib, and cw32mti.lib on the command
|
||||
line.
|
||||
|
||||
|
||||
BUILDING UNDER WINDOWS CE WITH VISUAL STUDIO 200x
|
||||
|
||||
Vincent Richomme sent a zip archive of files to help with this process. They
|
||||
can be found in the file "pcre-vsbuild.zip" in the Contrib directory of the FTP
|
||||
site.
|
||||
|
||||
|
||||
BUILDING PCRE ON OPENVMS
|
||||
|
||||
Dan Mooney sent the following comments about building PCRE on OpenVMS. They
|
||||
relate to an older version of PCRE that used fewer source files, so the exact
|
||||
commands will need changing. See the current list of source files above.
|
||||
|
||||
"It was quite easy to compile and link the library. I don't have a formal
|
||||
make file but the attached file [reproduced below] contains the OpenVMS DCL
|
||||
commands I used to build the library. I had to add #define
|
||||
POSIX_MALLOC_THRESHOLD 10 to pcre.h since it was not defined anywhere.
|
||||
|
||||
The library was built on:
|
||||
O/S: HP OpenVMS v7.3-1
|
||||
Compiler: Compaq C v6.5-001-48BCD
|
||||
Linker: vA13-01
|
||||
|
||||
The test results did not match 100% due to the issues you mention in your
|
||||
documentation regarding isprint(), iscntrl(), isgraph() and ispunct(). I
|
||||
modified some of the character tables temporarily and was able to get the
|
||||
results to match. Tests using the fr locale did not match since I don't have
|
||||
that locale loaded. The study size was always reported to be 3 less than the
|
||||
value in the standard test output files."
|
||||
|
||||
=========================
|
||||
$! This DCL procedure builds PCRE on OpenVMS
|
||||
$!
|
||||
$! I followed the instructions in the non-unix-use file in the distribution.
|
||||
$!
|
||||
$ COMPILE == "CC/LIST/NOMEMBER_ALIGNMENT/PREFIX_LIBRARY_ENTRIES=ALL_ENTRIES
|
||||
$ COMPILE DFTABLES.C
|
||||
$ LINK/EXE=DFTABLES.EXE DFTABLES.OBJ
|
||||
$ RUN DFTABLES.EXE/OUTPUT=CHARTABLES.C
|
||||
$ COMPILE MAKETABLES.C
|
||||
$ COMPILE GET.C
|
||||
$ COMPILE STUDY.C
|
||||
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
|
||||
$! did not seem to be defined anywhere.
|
||||
$! I edited pcre.h and added #DEFINE SUPPORT_UTF8 to enable UTF8 support.
|
||||
$ COMPILE PCRE.C
|
||||
$ LIB/CREATE PCRE MAKETABLES.OBJ, GET.OBJ, STUDY.OBJ, PCRE.OBJ
|
||||
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
|
||||
$! did not seem to be defined anywhere.
|
||||
$ COMPILE PCREPOSIX.C
|
||||
$ LIB/CREATE PCREPOSIX PCREPOSIX.OBJ
|
||||
$ COMPILE PCRETEST.C
|
||||
$ LINK/EXE=PCRETEST.EXE PCRETEST.OBJ, PCRE/LIB, PCREPOSIX/LIB
|
||||
$! C programs that want access to command line arguments must be
|
||||
$! defined as a symbol
|
||||
$ PCRETEST :== "$ SYS$ROADSUSERS:[DMOONEY.REGEXP]PCRETEST.EXE"
|
||||
$! Arguments must be enclosed in quotes.
|
||||
$ PCRETEST "-C"
|
||||
$! Test results:
|
||||
$!
|
||||
$! The test results did not match 100%. The functions isprint(), iscntrl(),
|
||||
$! isgraph() and ispunct() on OpenVMS must not produce the same results
|
||||
$! as the system that built the test output files provided with the
|
||||
$! distribution.
|
||||
$!
|
||||
$! The study size did not match and was always 3 less on OpenVMS.
|
||||
$!
|
||||
$! Locale could not be set to fr
|
||||
$!
|
||||
=========================
|
||||
|
||||
|
||||
BUILDING PCRE ON STRATUS OPENVOS
|
||||
|
||||
These notes on the port of PCRE to VOS (lightly edited) were supplied by
|
||||
Ashutosh Warikoo, whose email address has the local part awarikoo and the
|
||||
domain nse.co.in. The port was for version 7.9 in August 2009.
|
||||
|
||||
1. Building PCRE
|
||||
|
||||
I built pcre on OpenVOS Release 17.0.1at using GNU Tools 3.4a without any
|
||||
problems. I used the following packages to build PCRE:
|
||||
|
||||
ftp://ftp.stratus.com/pub/vos/posix/ga/posix.save.evf.gz
|
||||
|
||||
Please read and follow the instructions that come with these packages. To start
|
||||
the build of pcre, from the root of the package type:
|
||||
|
||||
./build.sh
|
||||
|
||||
2. Installing PCRE
|
||||
|
||||
Once you have successfully built PCRE, login to the SysAdmin group, switch to
|
||||
the root user, and type
|
||||
|
||||
[ !create_dir (master_disk)>usr --if needed ]
|
||||
[ !create_dir (master_disk)>usr>local --if needed ]
|
||||
!gmake install
|
||||
|
||||
This installs PCRE and its man pages into /usr/local. You can add
|
||||
(master_disk)>usr>local>bin to your command search paths, or if you are in
|
||||
BASH, add /usr/local/bin to the PATH environment variable.
|
||||
|
||||
4. Restrictions
|
||||
|
||||
This port requires readline library optionally. However during the build I
|
||||
faced some yet unexplored errors while linking with readline. As it was an
|
||||
optional component I chose to disable it.
|
||||
|
||||
5. Known Problems
|
||||
|
||||
I ran the test suite, but you will have to be your own judge of whether this
|
||||
command, and this port, suits your purposes. If you find any problems that
|
||||
appear to be related to the port itself, please let me know. Please see the
|
||||
build.log file in the root of the package also.
|
||||
|
||||
|
||||
=========================
|
||||
Last Updated: 26 May 2010
|
||||
****
|
||||
####
|
||||
|
@ -18,11 +18,12 @@ The contents of this README file are:
|
||||
The PCRE APIs
|
||||
Documentation for PCRE
|
||||
Contributions by users of PCRE
|
||||
Building PCRE on non-Unix systems
|
||||
Building PCRE on Unix-like systems
|
||||
Retrieving configuration information on Unix-like systems
|
||||
Shared libraries on Unix-like systems
|
||||
Cross-compiling on Unix-like systems
|
||||
Building PCRE on non-Unix-like systems
|
||||
Building PCRE without using autotools
|
||||
Building PCRE using autotools
|
||||
Retrieving configuration information
|
||||
Shared libraries
|
||||
Cross-compiling using autotools
|
||||
Using HP's ANSI C++ compiler (aCC)
|
||||
Using PCRE from MySQL
|
||||
Making new tarballs
|
||||
@ -34,16 +35,19 @@ The contents of this README file are:
|
||||
The PCRE APIs
|
||||
-------------
|
||||
|
||||
PCRE is written in C, and it has its own API. The distribution also includes a
|
||||
set of C++ wrapper functions (see the pcrecpp man page for details), courtesy
|
||||
of Google Inc.
|
||||
PCRE is written in C, and it has its own API. There are two sets of functions,
|
||||
one for the 8-bit library, which processes strings of bytes, and one for the
|
||||
16-bit library, which processes strings of 16-bit values. The distribution also
|
||||
includes a set of C++ wrapper functions (see the pcrecpp man page for details),
|
||||
courtesy of Google Inc., which can be used to call the 8-bit PCRE library from
|
||||
C++.
|
||||
|
||||
In addition, there is a set of C wrapper functions that are based on the POSIX
|
||||
regular expression API (see the pcreposix man page). These end up in the
|
||||
library called libpcreposix. Note that this just provides a POSIX calling
|
||||
interface to PCRE; the regular expressions themselves still follow Perl syntax
|
||||
and semantics. The POSIX API is restricted, and does not give full access to
|
||||
all of PCRE's facilities.
|
||||
In addition, there is a set of C wrapper functions (again, just for the 8-bit
|
||||
library) that are based on the POSIX regular expression API (see the pcreposix
|
||||
man page). These end up in the library called libpcreposix. Note that this just
|
||||
provides a POSIX calling interface to PCRE; the regular expressions themselves
|
||||
still follow Perl syntax and semantics. The POSIX API is restricted, and does
|
||||
not give full access to all of PCRE's facilities.
|
||||
|
||||
The header file for the POSIX-style functions is called pcreposix.h. The
|
||||
official POSIX name is regex.h, but I did not want to risk possible problems
|
||||
@ -106,36 +110,45 @@ Windows (I myself do not use Windows). Nowadays there is more Windows support
|
||||
in the standard distribution, so these contributions have been archived.
|
||||
|
||||
|
||||
Building PCRE on non-Unix systems
|
||||
---------------------------------
|
||||
Building PCRE on non-Unix-like systems
|
||||
--------------------------------------
|
||||
|
||||
For a non-Unix system, please read the comments in the file NON-UNIX-USE,
|
||||
though if your system supports the use of "configure" and "make" you may be
|
||||
able to build PCRE in the same way as for Unix-like systems. PCRE can also be
|
||||
configured in many platform environments using the GUI facility provided by
|
||||
CMake's cmake-gui command. This creates Makefiles, solution files, etc.
|
||||
For a non-Unix-like system, please read the comments in the file
|
||||
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
|
||||
"make" you may be able to build PCRE using autotools in the same way as for
|
||||
many Unix-like systems.
|
||||
|
||||
PCRE can also be configured using the GUI facility provided by CMake's
|
||||
cmake-gui command. This creates Makefiles, solution files, etc. The file
|
||||
NON-AUTOTOOLS-BUILD has information about CMake.
|
||||
|
||||
PCRE has been compiled on many different operating systems. It should be
|
||||
straightforward to build PCRE on any system that has a Standard C compiler and
|
||||
library, because it uses only Standard C functions.
|
||||
|
||||
|
||||
Building PCRE on Unix-like systems
|
||||
----------------------------------
|
||||
Building PCRE without using autotools
|
||||
-------------------------------------
|
||||
|
||||
The use of autotools (in particular, libtool) is problematic in some
|
||||
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
|
||||
file for ways of building PCRE without using autotools.
|
||||
|
||||
|
||||
Building PCRE using autotools
|
||||
-----------------------------
|
||||
|
||||
If you are using HP's ANSI C++ compiler (aCC), please see the special note
|
||||
in the section entitled "Using HP's ANSI C++ compiler (aCC)" below.
|
||||
|
||||
The following instructions assume the use of the widely used "configure, make,
|
||||
make install" process. There is also support for CMake in the PCRE
|
||||
distribution; there are some comments about using CMake in the NON-UNIX-USE
|
||||
file, though it can also be used in Unix-like systems.
|
||||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE on a Unix-like system, first run the "configure" command from the
|
||||
PCRE distribution directory, with your current directory set to the directory
|
||||
where you want the files to be created. This command is a standard GNU
|
||||
"autoconf" configuration script, for which generic instructions are supplied in
|
||||
the file INSTALL.
|
||||
To build PCRE on a system that supports autotools, first run the "configure"
|
||||
command from the PCRE distribution directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
Most commonly, people build PCRE within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
@ -143,9 +156,9 @@ the usual methods of changing standard defaults are available. For example:
|
||||
|
||||
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
|
||||
|
||||
specifies that the C compiler should be run with the flags '-O2 -Wall' instead
|
||||
of the default, and that "make install" should install PCRE under /opt/local
|
||||
instead of the default /usr/local.
|
||||
This command specifies that the C compiler should be run with the flags '-O2
|
||||
-Wall' instead of the default, and that "make install" should install PCRE
|
||||
under /opt/local instead of the default /usr/local.
|
||||
|
||||
If you want to build in a different directory, just run "configure" with that
|
||||
directory as current. For example, suppose you have unpacked the PCRE source
|
||||
@ -159,27 +172,59 @@ possible to build it as a C++ library, though the provided building apparatus
|
||||
does not have any features to support this.
|
||||
|
||||
There are some optional features that can be included or omitted from the PCRE
|
||||
library. You can read more about them in the pcrebuild man page.
|
||||
library. They are also documented in the pcrebuild man page.
|
||||
|
||||
. If you want to suppress the building of the C++ wrapper library, you can add
|
||||
--disable-cpp to the "configure" command. Otherwise, when "configure" is run,
|
||||
it will try to find a C++ compiler and C++ header files, and if it succeeds,
|
||||
it will try to build the C++ wrapper.
|
||||
. By default, both shared and static libraries are built. You can change this
|
||||
by adding one of these options to the "configure" command:
|
||||
|
||||
--disable-shared
|
||||
--disable-static
|
||||
|
||||
(See also "Shared libraries on Unix-like systems" below.)
|
||||
|
||||
. By default, only the 8-bit library is built. If you add --enable-pcre16 to
|
||||
the "configure" command, the 16-bit library is also built. If you want only
|
||||
the 16-bit library, use "./configure --enable-pcre16 --disable-pcre8".
|
||||
|
||||
. If you are building the 8-bit library and want to suppress the building of
|
||||
the C++ wrapper library, you can add --disable-cpp to the "configure"
|
||||
command. Otherwise, when "configure" is run without --disable-pcre8, it will
|
||||
try to find a C++ compiler and C++ header files, and if it succeeds, it will
|
||||
try to build the C++ wrapper.
|
||||
|
||||
. If you want to include support for just-in-time compiling, which can give
|
||||
large performance improvements on certain platforms, add --enable-jit to the
|
||||
"configure" command. This support is available only for certain hardware
|
||||
architectures. If you try to enable it on an unsupported architecture, there
|
||||
will be a compile time error.
|
||||
|
||||
. When JIT support is enabled, pcregrep automatically makes use of it, unless
|
||||
you add --disable-pcregrep-jit to the "configure" command.
|
||||
|
||||
. If you want to make use of the support for UTF-8 Unicode character strings in
|
||||
PCRE, you must add --enable-utf8 to the "configure" command. Without it, the
|
||||
code for handling UTF-8 is not included in the library. Even when included,
|
||||
it still has to be enabled by an option at run time. When PCRE is compiled
|
||||
with this option, its input can only either be ASCII or UTF-8, even when
|
||||
running on EBCDIC platforms. It is not possible to use both --enable-utf8 and
|
||||
--enable-ebcdic at the same time.
|
||||
the 8-bit library, or UTF-16 Unicode character strings in the 16-bit library,
|
||||
you must add --enable-utf to the "configure" command. Without it, the code
|
||||
for handling UTF-8 and UTF-16 is not included in the relevant library. Even
|
||||
when --enable-utf is included, the use of a UTF encoding still has to be
|
||||
enabled by an option at run time. When PCRE is compiled with this option, its
|
||||
input can only either be ASCII or UTF-8/16, even when running on EBCDIC
|
||||
platforms. It is not possible to use both --enable-utf and --enable-ebcdic at
|
||||
the same time.
|
||||
|
||||
. If, in addition to support for UTF-8 character strings, you want to include
|
||||
support for the \P, \p, and \X sequences that recognize Unicode character
|
||||
properties, you must add --enable-unicode-properties to the "configure"
|
||||
command. This adds about 30K to the size of the library (in the form of a
|
||||
property table); only the basic two-letter properties such as Lu are
|
||||
supported.
|
||||
. There are no separate options for enabling UTF-8 and UTF-16 independently
|
||||
because that would allow ridiculous settings such as requesting UTF-16
|
||||
support while building only the 8-bit library. However, the option
|
||||
--enable-utf8 is retained for backwards compatibility with earlier releases
|
||||
that did not support 16-bit character strings. It is synonymous with
|
||||
--enable-utf. It is not possible to configure one library with UTF support
|
||||
and the other without in the same configuration.
|
||||
|
||||
. If, in addition to support for UTF-8/16 character strings, you want to
|
||||
include support for the \P, \p, and \X sequences that recognize Unicode
|
||||
character properties, you must add --enable-unicode-properties to the
|
||||
"configure" command. This adds about 30K to the size of the library (in the
|
||||
form of a property table); only the basic two-letter properties such as Lu
|
||||
are supported.
|
||||
|
||||
. You can build PCRE to recognize either CR or LF or the sequence CRLF or any
|
||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||
@ -232,10 +277,11 @@ library. You can read more about them in the pcrebuild man page.
|
||||
sizes in the pcrestack man page.
|
||||
|
||||
. The default maximum compiled pattern size is around 64K. You can increase
|
||||
this by adding --with-link-size=3 to the "configure" command. You can
|
||||
increase it even more by setting --with-link-size=4, but this is unlikely
|
||||
ever to be necessary. Increasing the internal link size will reduce
|
||||
performance.
|
||||
this by adding --with-link-size=3 to the "configure" command. In the 8-bit
|
||||
library, PCRE then uses three bytes instead of two for offsets to different
|
||||
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
||||
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
||||
offsets. Increasing the internal link size reduces performance.
|
||||
|
||||
. You can build PCRE so that its internal match() function that is called from
|
||||
pcre_exec() does not call itself recursively. Instead, it uses memory blocks
|
||||
@ -247,9 +293,10 @@ library. You can read more about them in the pcrebuild man page.
|
||||
|
||||
on the "configure" command. PCRE runs more slowly in this mode, but it may be
|
||||
necessary in environments with limited stack sizes. This applies only to the
|
||||
pcre_exec() function; it does not apply to pcre_dfa_exec(), which does not
|
||||
use deeply nested recursion. There is a discussion about stack sizes in the
|
||||
pcrestack man page.
|
||||
normal execution of the pcre_exec() function; if JIT support is being
|
||||
successfully used, it is not relevant. Equally, it does not apply to
|
||||
pcre_dfa_exec(), which does not use deeply nested recursion. There is a
|
||||
discussion about stack sizes in the pcrestack man page.
|
||||
|
||||
. For speed, PCRE uses four tables for manipulating and identifying characters
|
||||
whose code point values are less than 256. By default, it uses a set of
|
||||
@ -269,27 +316,37 @@ library. You can read more about them in the pcrebuild man page.
|
||||
|
||||
This automatically implies --enable-rebuild-chartables (see above). However,
|
||||
when PCRE is built this way, it always operates in EBCDIC. It cannot support
|
||||
both EBCDIC and UTF-8.
|
||||
both EBCDIC and UTF-8/16.
|
||||
|
||||
. It is possible to compile pcregrep to use libz and/or libbz2, in order to
|
||||
read .gz and .bz2 files (respectively), by specifying one or both of
|
||||
. The pcregrep program currently supports only 8-bit data files, and so
|
||||
requires the 8-bit PCRE library. It is possible to compile pcregrep to use
|
||||
libz and/or libbz2, in order to read .gz and .bz2 files (respectively), by
|
||||
specifying one or both of
|
||||
|
||||
--enable-pcregrep-libz
|
||||
--enable-pcregrep-libbz2
|
||||
|
||||
Of course, the relevant libraries must be installed on your system.
|
||||
|
||||
. It is possible to compile pcretest so that it links with the libreadline
|
||||
library, by specifying
|
||||
. The default size of internal buffer used by pcregrep can be set by, for
|
||||
example:
|
||||
|
||||
--enable-pcretest-libreadline
|
||||
--with-pcregrep-bufsize=50K
|
||||
|
||||
The default value is 20K.
|
||||
|
||||
. It is possible to compile pcretest so that it links with the libreadline
|
||||
or libedit libraries, by specifying, respectively,
|
||||
|
||||
--enable-pcretest-libreadline or --enable-pcretest-libedit
|
||||
|
||||
If this is done, when pcretest's input is from a terminal, it reads it using
|
||||
the readline() function. This provides line-editing and history facilities.
|
||||
Note that libreadline is GPL-licenced, so if you distribute a binary of
|
||||
pcretest linked in this way, there may be licensing issues.
|
||||
pcretest linked in this way, there may be licensing issues. These can be
|
||||
avoided by linking with libedit (which has a BSD licence) instead.
|
||||
|
||||
Setting this option causes the -lreadline option to be added to the pcretest
|
||||
Enabling libreadline causes the -lreadline option to be added to the pcretest
|
||||
build. In many operating environments with a system-installed readline
|
||||
library this is sufficient. However, in some environments (e.g. if an
|
||||
unmodified distribution version of readline is in use), it may be necessary
|
||||
@ -302,37 +359,42 @@ library. You can read more about them in the pcrebuild man page.
|
||||
|
||||
The "configure" script builds the following files for the basic C library:
|
||||
|
||||
. Makefile is the makefile that builds the library
|
||||
. config.h contains build-time configuration options for the library
|
||||
. pcre.h is the public PCRE header file
|
||||
. pcre-config is a script that shows the settings of "configure" options
|
||||
. libpcre.pc is data for the pkg-config command
|
||||
. libtool is a script that builds shared and/or static libraries
|
||||
. RunTest is a script for running tests on the basic C library
|
||||
. RunGrepTest is a script for running tests on the pcregrep command
|
||||
. Makefile the makefile that builds the library
|
||||
. config.h build-time configuration options for the library
|
||||
. pcre.h the public PCRE header file
|
||||
. pcre-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre.pc ) data for the pkg-config command
|
||||
. libpcre16.pc )
|
||||
. libpcreposix.pc )
|
||||
. libtool script that builds shared and/or static libraries
|
||||
|
||||
Versions of config.h and pcre.h are distributed in the PCRE tarballs under the
|
||||
names config.h.generic and pcre.h.generic. These are provided for those who
|
||||
have to build PCRE without using "configure" or CMake. If you use "configure"
|
||||
or CMake, the .generic versions are not used.
|
||||
|
||||
If a C++ compiler is found, the following files are also built:
|
||||
When building the 8-bit library, if a C++ compiler is found, the following
|
||||
files are also built:
|
||||
|
||||
. libpcrecpp.pc is data for the pkg-config command
|
||||
. pcrecpparg.h is a header file for programs that call PCRE via the C++ wrapper
|
||||
. pcre_stringpiece.h is the header for the C++ "stringpiece" functions
|
||||
. libpcrecpp.pc data for the pkg-config command
|
||||
. pcrecpparg.h header file for calling PCRE via the C++ wrapper
|
||||
. pcre_stringpiece.h header for the C++ "stringpiece" functions
|
||||
|
||||
The "configure" script also creates config.status, which is an executable
|
||||
script that can be run to recreate the configuration, and config.log, which
|
||||
contains compiler output from tests that "configure" runs.
|
||||
|
||||
Once "configure" has run, you can run "make". It builds two libraries, called
|
||||
libpcre and libpcreposix, a test program called pcretest, and the pcregrep
|
||||
command. If a C++ compiler was found on your system, "make" also builds the C++
|
||||
wrapper library, which is called libpcrecpp, and some test programs called
|
||||
pcrecpp_unittest, pcre_scanner_unittest, and pcre_stringpiece_unittest.
|
||||
Building the C++ wrapper can be disabled by adding --disable-cpp to the
|
||||
"configure" command.
|
||||
Once "configure" has run, you can run "make". This builds either or both of the
|
||||
libraries libpcre and libpcre16, and a test program called pcretest. If you
|
||||
enabled JIT support with --enable-jit, a test program called pcre_jit_test is
|
||||
built as well.
|
||||
|
||||
If the 8-bit library is built, libpcreposix and the pcregrep command are also
|
||||
built, and if a C++ compiler was found on your system, and you did not disable
|
||||
it with --disable-cpp, "make" builds the C++ wrapper library, which is called
|
||||
libpcrecpp, as well as some test programs called pcrecpp_unittest,
|
||||
pcre_scanner_unittest, and pcre_stringpiece_unittest.
|
||||
|
||||
The command "make check" runs all the appropriate tests. Details of the PCRE
|
||||
tests are given below in a separate section of this document.
|
||||
@ -343,16 +405,19 @@ system. The following are installed (file names are all relative to the
|
||||
|
||||
Commands (bin):
|
||||
pcretest
|
||||
pcregrep
|
||||
pcregrep (if 8-bit support is enabled)
|
||||
pcre-config
|
||||
|
||||
Libraries (lib):
|
||||
libpcre
|
||||
libpcreposix
|
||||
libpcrecpp (if C++ support is enabled)
|
||||
libpcre16 (if 16-bit support is enabled)
|
||||
libpcre (if 8-bit support is enabled)
|
||||
libpcreposix (if 8-bit support is enabled)
|
||||
libpcrecpp (if 8-bit and C++ support is enabled)
|
||||
|
||||
Configuration information (lib/pkgconfig):
|
||||
libpcre16.pc
|
||||
libpcre.pc
|
||||
libpcreposix.pc
|
||||
libpcrecpp.pc (if C++ support is enabled)
|
||||
|
||||
Header files (include):
|
||||
@ -366,6 +431,7 @@ system. The following are installed (file names are all relative to the
|
||||
Man pages (share/man/man{1,3}):
|
||||
pcregrep.1
|
||||
pcretest.1
|
||||
pcre-config.1
|
||||
pcre.3
|
||||
pcre*.3 (lots more pages, all starting "pcre")
|
||||
|
||||
@ -380,17 +446,18 @@ system. The following are installed (file names are all relative to the
|
||||
LICENCE
|
||||
NEWS
|
||||
README
|
||||
pcre.txt (a concatenation of the man(3) pages)
|
||||
pcretest.txt the pcretest man page
|
||||
pcregrep.txt the pcregrep man page
|
||||
pcre.txt (a concatenation of the man(3) pages)
|
||||
pcretest.txt the pcretest man page
|
||||
pcregrep.txt the pcregrep man page
|
||||
pcre-config.txt the pcre-config man page
|
||||
|
||||
If you want to remove PCRE from your system, you can run "make uninstall".
|
||||
This removes all the files that "make install" installed. However, it does not
|
||||
remove any directories, because these are often shared with other programs.
|
||||
|
||||
|
||||
Retrieving configuration information on Unix-like systems
|
||||
---------------------------------------------------------
|
||||
Retrieving configuration information
|
||||
------------------------------------
|
||||
|
||||
Running "make install" installs the command pcre-config, which can be used to
|
||||
recall information about the PCRE configuration and installation. For example:
|
||||
@ -415,8 +482,8 @@ The data is held in *.pc files that are installed in a directory called
|
||||
<prefix>/lib/pkgconfig.
|
||||
|
||||
|
||||
Shared libraries on Unix-like systems
|
||||
-------------------------------------
|
||||
Shared libraries
|
||||
----------------
|
||||
|
||||
The default distribution builds PCRE as shared libraries and static libraries,
|
||||
as long as the operating system supports shared libraries. Shared library
|
||||
@ -441,8 +508,8 @@ Then run "make" in the usual way. Similarly, you can use --disable-static to
|
||||
build only shared libraries.
|
||||
|
||||
|
||||
Cross-compiling on Unix-like systems
|
||||
------------------------------------
|
||||
Cross-compiling using autotools
|
||||
-------------------------------
|
||||
|
||||
You can specify CC and CFLAGS in the normal way to the "configure" command, in
|
||||
order to cross-compile PCRE for some other host. However, you should NOT
|
||||
@ -514,30 +581,49 @@ script creates the .txt and HTML forms of the documentation from the man pages.
|
||||
Testing PCRE
|
||||
------------
|
||||
|
||||
To test the basic PCRE library on a Unix system, run the RunTest script that is
|
||||
created by the configuring process. There is also a script called RunGrepTest
|
||||
that tests the options of the pcregrep command. If the C++ wrapper library is
|
||||
built, three test programs called pcrecpp_unittest, pcre_scanner_unittest, and
|
||||
pcre_stringpiece_unittest are also built.
|
||||
To test the basic PCRE library on a Unix-like system, run the RunTest script.
|
||||
There is another script called RunGrepTest that tests the options of the
|
||||
pcregrep command. If the C++ wrapper library is built, three test programs
|
||||
called pcrecpp_unittest, pcre_scanner_unittest, and pcre_stringpiece_unittest
|
||||
are also built. When JIT support is enabled, another test program called
|
||||
pcre_jit_test is built.
|
||||
|
||||
Both the scripts and all the program tests are run if you obey "make check" or
|
||||
"make test". For other systems, see the instructions in NON-UNIX-USE.
|
||||
"make test". For other environments, see the instructions in
|
||||
NON-AUTOTOOLS-BUILD.
|
||||
|
||||
The RunTest script runs the pcretest test program (which is documented in its
|
||||
own man page) on each of the testinput files in the testdata directory in
|
||||
turn, and compares the output with the contents of the corresponding testoutput
|
||||
files. A file called testtry is used to hold the main output from pcretest
|
||||
(testsavedregex is also used as a working file). To run pcretest on just one of
|
||||
the test files, give its number as an argument to RunTest, for example:
|
||||
own man page) on each of the relevant testinput files in the testdata
|
||||
directory, and compares the output with the contents of the corresponding
|
||||
testoutput files. Some tests are relevant only when certain build-time options
|
||||
were selected. For example, the tests for UTF-8/16 support are run only if
|
||||
--enable-utf was used. RunTest outputs a comment when it skips a test.
|
||||
|
||||
RunTest 2
|
||||
Many of the tests that are not skipped are run up to three times. The second
|
||||
run forces pcre_study() to be called for all patterns except for a few in some
|
||||
tests that are marked "never study" (see the pcretest program for how this is
|
||||
done). If JIT support is available, the non-DFA tests are run a third time,
|
||||
this time with a forced pcre_study() with the PCRE_STUDY_JIT_COMPILE option.
|
||||
|
||||
The first test file can also be fed directly into the perltest.pl script to
|
||||
check that Perl gives the same results. The only difference you should see is
|
||||
in the first few lines, where the Perl version is given instead of the PCRE
|
||||
version.
|
||||
When both 8-bit and 16-bit support is enabled, the entire set of tests is run
|
||||
twice, once for each library. If you want to run just one set of tests, call
|
||||
RunTest with either the -8 or -16 option.
|
||||
|
||||
The second set of tests check pcre_fullinfo(), pcre_info(), pcre_study(),
|
||||
RunTest uses a file called testtry to hold the main output from pcretest.
|
||||
Other files whose names begin with "test" are used as working files in some
|
||||
tests. To run pcretest on just one or more specific test files, give their
|
||||
numbers as arguments to RunTest, for example:
|
||||
|
||||
RunTest 2 7 11
|
||||
|
||||
You can also call RunTest with the single argument "list" to cause it to output
|
||||
a list of tests.
|
||||
|
||||
The first test file can be fed directly into the perltest.pl script to check
|
||||
that Perl gives the same results. The only difference you should see is in the
|
||||
first few lines, where the Perl version is given instead of the PCRE version.
|
||||
|
||||
The second set of tests check pcre_fullinfo(), pcre_study(),
|
||||
pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
|
||||
detection, and run-time flags that are specific to PCRE, as well as the POSIX
|
||||
wrapper API. It also uses the debugging flags to check some of the internals of
|
||||
@ -572,33 +658,32 @@ RunTest.bat. The version of RunTest.bat included with PCRE 7.4 and above uses
|
||||
Windows versions of test 2. More info on using RunTest.bat is included in the
|
||||
document entitled NON-UNIX-USE.]
|
||||
|
||||
The fourth test checks the UTF-8 support. It is not run automatically unless
|
||||
PCRE is built with UTF-8 support. To do this you must set --enable-utf8 when
|
||||
running "configure". This file can be also fed directly to the perltest.pl
|
||||
script, provided you are running Perl 5.8 or higher.
|
||||
The fourth and fifth tests check the UTF-8/16 support and error handling and
|
||||
internal UTF features of PCRE that are not relevant to Perl, respectively. The
|
||||
sixth and seventh tests do the same for Unicode character properties support.
|
||||
|
||||
The fifth test checks error handling with UTF-8 encoding, and internal UTF-8
|
||||
features of PCRE that are not relevant to Perl.
|
||||
The eighth, ninth, and tenth tests check the pcre_dfa_exec() alternative
|
||||
matching function, in non-UTF-8/16 mode, UTF-8/16 mode, and UTF-8/16 mode with
|
||||
Unicode property support, respectively.
|
||||
|
||||
The sixth test (which is Perl-5.10 compatible) checks the support for Unicode
|
||||
character properties. It is not run automatically unless PCRE is built with
|
||||
Unicode property support. To do this you must set --enable-unicode-properties
|
||||
when running "configure".
|
||||
The eleventh test checks some internal offsets and code size features; it is
|
||||
run only when the default "link size" of 2 is set (in other cases the sizes
|
||||
change) and when Unicode property support is enabled.
|
||||
|
||||
The seventh, eighth, and ninth tests check the pcre_dfa_exec() alternative
|
||||
matching function, in non-UTF-8 mode, UTF-8 mode, and UTF-8 mode with Unicode
|
||||
property support, respectively. The eighth and ninth tests are not run
|
||||
automatically unless PCRE is built with the relevant support.
|
||||
The twelfth test is run only when JIT support is available, and the thirteenth
|
||||
test is run only when JIT support is not available. They test some JIT-specific
|
||||
features such as information output from pcretest about JIT compilation.
|
||||
|
||||
The tenth test checks some internal offsets and code size features; it is run
|
||||
only when the default "link size" of 2 is set (in other cases the sizes
|
||||
change).
|
||||
The fourteenth, fifteenth, and sixteenth tests are run only in 8-bit mode, and
|
||||
the seventeenth, eighteenth, and nineteenth tests are run only in 16-bit mode.
|
||||
These are tests that generate different output in the two modes. They are for
|
||||
general cases, UTF-8/16 support, and Unicode property support, respectively.
|
||||
|
||||
The eleventh test checks out features that are new in Perl 5.10, and the
|
||||
twelfth test checks a number of internals and non-Perl features concerned with
|
||||
Unicode property support. It is not run automatically unless PCRE is built with
|
||||
Unicode property support. To do this you must set --enable-unicode-properties
|
||||
when running "configure".
|
||||
The twentieth test is run only in 16-bit mode. It tests some specific 16-bit
|
||||
features of the DFA matching engine.
|
||||
|
||||
The twenty-first and twenty-second tests are run only in 16-bit mode, when the
|
||||
link size is set to 2. They test reloading pre-compiled patterns.
|
||||
|
||||
|
||||
Character tables
|
||||
@ -658,7 +743,9 @@ will cause PCRE to malfunction.
|
||||
File manifest
|
||||
-------------
|
||||
|
||||
The distribution should contain the following files:
|
||||
The distribution should contain the files listed below. Where a file name is
|
||||
given as pcre[16]_xxx it means that there are two files, one with the name
|
||||
pcre_xxx and the other with the name pcre16_xxx.
|
||||
|
||||
(A) Source files of the PCRE library functions and their headers:
|
||||
|
||||
@ -667,33 +754,40 @@ The distribution should contain the following files:
|
||||
|
||||
pcre_chartables.c.dist a default set of character tables that assume ASCII
|
||||
coding; used, unless --enable-rebuild-chartables is
|
||||
specified, by copying to pcre_chartables.c
|
||||
specified, by copying to pcre[16]_chartables.c
|
||||
|
||||
pcreposix.c )
|
||||
pcre_compile.c )
|
||||
pcre_config.c )
|
||||
pcre_dfa_exec.c )
|
||||
pcre_exec.c )
|
||||
pcre_fullinfo.c )
|
||||
pcre_get.c ) sources for the functions in the library,
|
||||
pcre_globals.c ) and some internal functions that they use
|
||||
pcre_info.c )
|
||||
pcre_maketables.c )
|
||||
pcre_newline.c )
|
||||
pcre[16]_byte_order.c )
|
||||
pcre[16]_compile.c )
|
||||
pcre[16]_config.c )
|
||||
pcre[16]_dfa_exec.c )
|
||||
pcre[16]_exec.c )
|
||||
pcre[16]_fullinfo.c )
|
||||
pcre[16]_get.c ) sources for the functions in the library,
|
||||
pcre[16]_globals.c ) and some internal functions that they use
|
||||
pcre[16]_jit_compile.c )
|
||||
pcre[16]_maketables.c )
|
||||
pcre[16]_newline.c )
|
||||
pcre[16]_refcount.c )
|
||||
pcre[16]_string_utils.c )
|
||||
pcre[16]_study.c )
|
||||
pcre[16]_tables.c )
|
||||
pcre[16]_ucd.c )
|
||||
pcre[16]_version.c )
|
||||
pcre[16]_xclass.c )
|
||||
pcre_ord2utf8.c )
|
||||
pcre_refcount.c )
|
||||
pcre_study.c )
|
||||
pcre_tables.c )
|
||||
pcre_try_flipped.c )
|
||||
pcre_ucd.c )
|
||||
pcre_valid_utf8.c )
|
||||
pcre_version.c )
|
||||
pcre_xclass.c )
|
||||
pcre_printint.src ) debugging function that is #included in pcretest,
|
||||
pcre16_ord2utf16.c )
|
||||
pcre16_utf16_utils.c )
|
||||
pcre16_valid_utf16.c )
|
||||
|
||||
pcre[16]_printint.c ) debugging function that is used by pcretest,
|
||||
) and can also be #included in pcre_compile()
|
||||
|
||||
pcre.h.in template for pcre.h when built by "configure"
|
||||
pcreposix.h header for the external POSIX wrapper API
|
||||
pcre_internal.h header for internal use
|
||||
sljit/* 16 files that make up the JIT compiler
|
||||
ucp.h header for Unicode property handling
|
||||
|
||||
config.h.in template for config.h, which is built by "configure"
|
||||
@ -730,7 +824,8 @@ The distribution should contain the following files:
|
||||
Makefile.am ) the automake input that was used to create
|
||||
) Makefile.in
|
||||
NEWS important changes in this release
|
||||
NON-UNIX-USE notes on building PCRE on non-Unix systems
|
||||
NON-UNIX-USE the previous name for NON-AUTOTOOLS-BUILD
|
||||
NON-AUTOTOOLS-BUILD notes on building PCRE without using autotools
|
||||
PrepareRelease script to make preparations for "make dist"
|
||||
README this file
|
||||
RunTest a Unix shell script for running tests
|
||||
@ -751,6 +846,7 @@ The distribution should contain the following files:
|
||||
doc/pcretest.txt plain text documentation of test program
|
||||
doc/perltest.txt plain text documentation of Perl test program
|
||||
install-sh a shell script for installing files
|
||||
libpcre16.pc.in template for libpcre16.pc for pkg-config
|
||||
libpcre.pc.in template for libpcre.pc for pkg-config
|
||||
libpcreposix.pc.in template for libpcreposix.pc for pkg-config
|
||||
libpcrecpp.pc.in template for libpcrecpp.pc for pkg-config
|
||||
@ -760,17 +856,20 @@ The distribution should contain the following files:
|
||||
mkinstalldirs script for making install directories
|
||||
perltest.pl Perl test program
|
||||
pcre-config.in source of script which retains PCRE information
|
||||
pcre_jit_test.c test program for the JIT compiler
|
||||
pcrecpp_unittest.cc )
|
||||
pcre_scanner_unittest.cc ) test programs for the C++ wrapper
|
||||
pcre_stringpiece_unittest.cc )
|
||||
testdata/testinput* test data for main library tests
|
||||
testdata/testoutput* expected test results
|
||||
testdata/grep* input and output for pcregrep tests
|
||||
testdata/* other supporting test files
|
||||
|
||||
(D) Auxiliary files for cmake support
|
||||
|
||||
cmake/COPYING-CMAKE-SCRIPTS
|
||||
cmake/FindPackageHandleStandardArgs.cmake
|
||||
cmake/FindEditline.cmake
|
||||
cmake/FindReadline.cmake
|
||||
CMakeLists.txt
|
||||
config-cmake.h.in
|
||||
@ -796,4 +895,4 @@ The distribution should contain the following files:
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 19 January 2010
|
||||
Last updated: 18 June 2012
|
||||
|
@ -282,7 +282,7 @@ them both to 0; an emulation function will be used. */
|
||||
#define PACKAGE_NAME "PCRE"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE 8.12"
|
||||
#define PACKAGE_STRING "PCRE 8.31"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre"
|
||||
@ -291,7 +291,7 @@ them both to 0; an emulation function will be used. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "8.12"
|
||||
#define PACKAGE_VERSION "8.31"
|
||||
|
||||
|
||||
/* If you are compiling for a system other than a Unix-like system or
|
||||
@ -347,7 +347,7 @@ them both to 0; an emulation function will be used. */
|
||||
|
||||
/* Version number of package */
|
||||
#ifndef VERSION
|
||||
#define VERSION "8.12"
|
||||
#define VERSION "8.31"
|
||||
#endif
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -112,7 +112,7 @@ fprintf(f,
|
||||
"#endif\n\n"
|
||||
"#include \"pcre_internal.h\"\n\n");
|
||||
fprintf(f,
|
||||
"const unsigned char _pcre_default_tables[] = {\n\n"
|
||||
"const pcre_uint8 PRIV(default_tables)[] = {\n\n"
|
||||
"/* This table is a lower casing table. */\n\n");
|
||||
|
||||
fprintf(f, " ");
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,7 @@
|
||||
/* This is the public header file for the PCRE library, to be #included by
|
||||
applications that call the PCRE functions.
|
||||
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE_MAJOR 8
|
||||
#define PCRE_MINOR 12
|
||||
#define PCRE_MINOR 31
|
||||
#define PCRE_PRERELEASE
|
||||
#define PCRE_DATE 2011-01-15
|
||||
#define PCRE_DATE 2012-07-06
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE, the appropriate
|
||||
@ -98,28 +98,37 @@ extern "C" {
|
||||
/* Options. Some are compile-time only, some are run-time only, and some are
|
||||
both, so we keep them all distinct. However, almost all the bits in the options
|
||||
word are now used. In the long run, we may have to re-use some of the
|
||||
compile-time only bits for runtime options, or vice versa. */
|
||||
compile-time only bits for runtime options, or vice versa. In the comments
|
||||
below, "compile", "exec", and "DFA exec" mean that the option is permitted to
|
||||
be set for those functions; "used in" means that an option may be set only for
|
||||
compile, but is subsequently referenced in exec and/or DFA exec. Any of the
|
||||
compile-time options may be inspected during studying (and therefore JIT
|
||||
compiling). */
|
||||
|
||||
#define PCRE_CASELESS 0x00000001 /* Compile */
|
||||
#define PCRE_MULTILINE 0x00000002 /* Compile */
|
||||
#define PCRE_DOTALL 0x00000004 /* Compile */
|
||||
#define PCRE_EXTENDED 0x00000008 /* Compile */
|
||||
#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */
|
||||
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */
|
||||
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile, used in exec, DFA exec */
|
||||
#define PCRE_EXTRA 0x00000040 /* Compile */
|
||||
#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */
|
||||
#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
|
||||
#define PCRE_UNGREEDY 0x00000200 /* Compile */
|
||||
#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
|
||||
#define PCRE_UTF8 0x00000800 /* Compile */
|
||||
/* The next two are also used in exec and DFA exec */
|
||||
#define PCRE_UTF8 0x00000800 /* Compile (same as PCRE_UTF16) */
|
||||
#define PCRE_UTF16 0x00000800 /* Compile (same as PCRE_UTF8) */
|
||||
#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
|
||||
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
|
||||
/* The next two are also used in exec and DFA exec */
|
||||
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile (same as PCRE_NO_UTF16_CHECK) */
|
||||
#define PCRE_NO_UTF16_CHECK 0x00002000 /* Compile (same as PCRE_NO_UTF8_CHECK) */
|
||||
#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
|
||||
#define PCRE_PARTIAL_SOFT 0x00008000 /* Exec, DFA exec */
|
||||
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
|
||||
#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */
|
||||
#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */
|
||||
#define PCRE_FIRSTLINE 0x00040000 /* Compile */
|
||||
#define PCRE_FIRSTLINE 0x00040000 /* Compile, used in exec, DFA exec */
|
||||
#define PCRE_DUPNAMES 0x00080000 /* Compile */
|
||||
#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */
|
||||
@ -128,41 +137,82 @@ compile-time only bits for runtime options, or vice versa. */
|
||||
#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */
|
||||
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile, used in exec */
|
||||
#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */
|
||||
#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */
|
||||
#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */
|
||||
#define PCRE_UCP 0x20000000 /* Compile */
|
||||
#define PCRE_UCP 0x20000000 /* Compile, used in exec, DFA exec */
|
||||
|
||||
/* Exec-time and get/set-time error codes */
|
||||
|
||||
#define PCRE_ERROR_NOMATCH (-1)
|
||||
#define PCRE_ERROR_NULL (-2)
|
||||
#define PCRE_ERROR_BADOPTION (-3)
|
||||
#define PCRE_ERROR_BADMAGIC (-4)
|
||||
#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
|
||||
#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
|
||||
#define PCRE_ERROR_NOMEMORY (-6)
|
||||
#define PCRE_ERROR_NOSUBSTRING (-7)
|
||||
#define PCRE_ERROR_MATCHLIMIT (-8)
|
||||
#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
|
||||
#define PCRE_ERROR_BADUTF8 (-10)
|
||||
#define PCRE_ERROR_BADUTF8_OFFSET (-11)
|
||||
#define PCRE_ERROR_PARTIAL (-12)
|
||||
#define PCRE_ERROR_BADPARTIAL (-13)
|
||||
#define PCRE_ERROR_INTERNAL (-14)
|
||||
#define PCRE_ERROR_BADCOUNT (-15)
|
||||
#define PCRE_ERROR_DFA_UITEM (-16)
|
||||
#define PCRE_ERROR_DFA_UCOND (-17)
|
||||
#define PCRE_ERROR_DFA_UMLIMIT (-18)
|
||||
#define PCRE_ERROR_DFA_WSSIZE (-19)
|
||||
#define PCRE_ERROR_DFA_RECURSE (-20)
|
||||
#define PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
|
||||
#define PCRE_ERROR_BADNEWLINE (-23)
|
||||
#define PCRE_ERROR_BADOFFSET (-24)
|
||||
#define PCRE_ERROR_SHORTUTF8 (-25)
|
||||
#define PCRE_ERROR_NOMATCH (-1)
|
||||
#define PCRE_ERROR_NULL (-2)
|
||||
#define PCRE_ERROR_BADOPTION (-3)
|
||||
#define PCRE_ERROR_BADMAGIC (-4)
|
||||
#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
|
||||
#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
|
||||
#define PCRE_ERROR_NOMEMORY (-6)
|
||||
#define PCRE_ERROR_NOSUBSTRING (-7)
|
||||
#define PCRE_ERROR_MATCHLIMIT (-8)
|
||||
#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
|
||||
#define PCRE_ERROR_BADUTF8 (-10) /* Same for 8/16 */
|
||||
#define PCRE_ERROR_BADUTF16 (-10) /* Same for 8/16 */
|
||||
#define PCRE_ERROR_BADUTF8_OFFSET (-11) /* Same for 8/16 */
|
||||
#define PCRE_ERROR_BADUTF16_OFFSET (-11) /* Same for 8/16 */
|
||||
#define PCRE_ERROR_PARTIAL (-12)
|
||||
#define PCRE_ERROR_BADPARTIAL (-13)
|
||||
#define PCRE_ERROR_INTERNAL (-14)
|
||||
#define PCRE_ERROR_BADCOUNT (-15)
|
||||
#define PCRE_ERROR_DFA_UITEM (-16)
|
||||
#define PCRE_ERROR_DFA_UCOND (-17)
|
||||
#define PCRE_ERROR_DFA_UMLIMIT (-18)
|
||||
#define PCRE_ERROR_DFA_WSSIZE (-19)
|
||||
#define PCRE_ERROR_DFA_RECURSE (-20)
|
||||
#define PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
|
||||
#define PCRE_ERROR_BADNEWLINE (-23)
|
||||
#define PCRE_ERROR_BADOFFSET (-24)
|
||||
#define PCRE_ERROR_SHORTUTF8 (-25)
|
||||
#define PCRE_ERROR_SHORTUTF16 (-25) /* Same for 8/16 */
|
||||
#define PCRE_ERROR_RECURSELOOP (-26)
|
||||
#define PCRE_ERROR_JIT_STACKLIMIT (-27)
|
||||
#define PCRE_ERROR_BADMODE (-28)
|
||||
#define PCRE_ERROR_BADENDIANNESS (-29)
|
||||
#define PCRE_ERROR_DFA_BADRESTART (-30)
|
||||
|
||||
/* Specific error codes for UTF-8 validity checks */
|
||||
|
||||
#define PCRE_UTF8_ERR0 0
|
||||
#define PCRE_UTF8_ERR1 1
|
||||
#define PCRE_UTF8_ERR2 2
|
||||
#define PCRE_UTF8_ERR3 3
|
||||
#define PCRE_UTF8_ERR4 4
|
||||
#define PCRE_UTF8_ERR5 5
|
||||
#define PCRE_UTF8_ERR6 6
|
||||
#define PCRE_UTF8_ERR7 7
|
||||
#define PCRE_UTF8_ERR8 8
|
||||
#define PCRE_UTF8_ERR9 9
|
||||
#define PCRE_UTF8_ERR10 10
|
||||
#define PCRE_UTF8_ERR11 11
|
||||
#define PCRE_UTF8_ERR12 12
|
||||
#define PCRE_UTF8_ERR13 13
|
||||
#define PCRE_UTF8_ERR14 14
|
||||
#define PCRE_UTF8_ERR15 15
|
||||
#define PCRE_UTF8_ERR16 16
|
||||
#define PCRE_UTF8_ERR17 17
|
||||
#define PCRE_UTF8_ERR18 18
|
||||
#define PCRE_UTF8_ERR19 19
|
||||
#define PCRE_UTF8_ERR20 20
|
||||
#define PCRE_UTF8_ERR21 21
|
||||
|
||||
/* Specific error codes for UTF-16 validity checks */
|
||||
|
||||
#define PCRE_UTF16_ERR0 0
|
||||
#define PCRE_UTF16_ERR1 1
|
||||
#define PCRE_UTF16_ERR2 2
|
||||
#define PCRE_UTF16_ERR3 3
|
||||
#define PCRE_UTF16_ERR4 4
|
||||
|
||||
/* Request types for pcre_fullinfo() */
|
||||
|
||||
@ -183,6 +233,9 @@ compile-time only bits for runtime options, or vice versa. */
|
||||
#define PCRE_INFO_JCHANGED 13
|
||||
#define PCRE_INFO_HASCRORLF 14
|
||||
#define PCRE_INFO_MINLENGTH 15
|
||||
#define PCRE_INFO_JIT 16
|
||||
#define PCRE_INFO_JITSIZE 17
|
||||
#define PCRE_INFO_MAXLOOKBEHIND 18
|
||||
|
||||
/* Request types for pcre_config(). Do not re-arrange, in order to remain
|
||||
compatible. */
|
||||
@ -196,8 +249,18 @@ compatible. */
|
||||
#define PCRE_CONFIG_UNICODE_PROPERTIES 6
|
||||
#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
|
||||
#define PCRE_CONFIG_BSR 8
|
||||
#define PCRE_CONFIG_JIT 9
|
||||
#define PCRE_CONFIG_UTF16 10
|
||||
#define PCRE_CONFIG_JITTARGET 11
|
||||
|
||||
/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine
|
||||
/* Request types for pcre_study(). Do not re-arrange, in order to remain
|
||||
compatible. */
|
||||
|
||||
#define PCRE_STUDY_JIT_COMPILE 0x0001
|
||||
#define PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE 0x0002
|
||||
#define PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE 0x0004
|
||||
|
||||
/* Bit flags for the pcre[16]_extra structure. Do not re-arrange or redefine
|
||||
these bits, just add new ones on the end, in order to remain compatible. */
|
||||
|
||||
#define PCRE_EXTRA_STUDY_DATA 0x0001
|
||||
@ -206,12 +269,33 @@ these bits, just add new ones on the end, in order to remain compatible. */
|
||||
#define PCRE_EXTRA_TABLES 0x0008
|
||||
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
|
||||
#define PCRE_EXTRA_MARK 0x0020
|
||||
#define PCRE_EXTRA_EXECUTABLE_JIT 0x0040
|
||||
|
||||
/* Types */
|
||||
|
||||
struct real_pcre; /* declaration; the definition is private */
|
||||
typedef struct real_pcre pcre;
|
||||
|
||||
struct real_pcre16; /* declaration; the definition is private */
|
||||
typedef struct real_pcre16 pcre16;
|
||||
|
||||
struct real_pcre_jit_stack; /* declaration; the definition is private */
|
||||
typedef struct real_pcre_jit_stack pcre_jit_stack;
|
||||
|
||||
struct real_pcre16_jit_stack; /* declaration; the definition is private */
|
||||
typedef struct real_pcre16_jit_stack pcre16_jit_stack;
|
||||
|
||||
/* If PCRE is compiled with 16 bit character support, PCRE_UCHAR16 must contain
|
||||
a 16 bit wide unsigned data type. Otherwise it can be a dummy data type since
|
||||
pcre16 functions are not implemented. There is a check for this in pcre_internal.h. */
|
||||
#ifndef PCRE_UCHAR16
|
||||
#define PCRE_UCHAR16 unsigned short
|
||||
#endif
|
||||
|
||||
#ifndef PCRE_SPTR16
|
||||
#define PCRE_SPTR16 const PCRE_UCHAR16 *
|
||||
#endif
|
||||
|
||||
/* When PCRE is compiled as a C++ library, the subject pointer type can be
|
||||
replaced with a custom type. For conventional use, the public interface is a
|
||||
const char *. */
|
||||
@ -232,8 +316,22 @@ typedef struct pcre_extra {
|
||||
const unsigned char *tables; /* Pointer to character tables */
|
||||
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
|
||||
unsigned char **mark; /* For passing back a mark pointer */
|
||||
void *executable_jit; /* Contains a pointer to a compiled jit code */
|
||||
} pcre_extra;
|
||||
|
||||
/* Same structure as above, but with 16 bit char pointers. */
|
||||
|
||||
typedef struct pcre16_extra {
|
||||
unsigned long int flags; /* Bits for which fields are set */
|
||||
void *study_data; /* Opaque data from pcre_study() */
|
||||
unsigned long int match_limit; /* Maximum number of calls to match() */
|
||||
void *callout_data; /* Data passed back in callouts */
|
||||
const unsigned char *tables; /* Pointer to character tables */
|
||||
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
|
||||
PCRE_UCHAR16 **mark; /* For passing back a mark pointer */
|
||||
void *executable_jit; /* Contains a pointer to a compiled jit code */
|
||||
} pcre16_extra;
|
||||
|
||||
/* The structure for passing out data via the pcre_callout_function. We use a
|
||||
structure so that new fields can be added on the end in future versions,
|
||||
without changing the API of the function, thereby allowing old clients to work
|
||||
@ -254,9 +352,33 @@ typedef struct pcre_callout_block {
|
||||
/* ------------------- Added for Version 1 -------------------------- */
|
||||
int pattern_position; /* Offset to next item in the pattern */
|
||||
int next_item_length; /* Length of next item in the pattern */
|
||||
/* ------------------- Added for Version 2 -------------------------- */
|
||||
const unsigned char *mark; /* Pointer to current mark or NULL */
|
||||
/* ------------------------------------------------------------------ */
|
||||
} pcre_callout_block;
|
||||
|
||||
/* Same structure as above, but with 16 bit char pointers. */
|
||||
|
||||
typedef struct pcre16_callout_block {
|
||||
int version; /* Identifies version of block */
|
||||
/* ------------------------ Version 0 ------------------------------- */
|
||||
int callout_number; /* Number compiled into pattern */
|
||||
int *offset_vector; /* The offset vector */
|
||||
PCRE_SPTR16 subject; /* The subject being matched */
|
||||
int subject_length; /* The length of the subject */
|
||||
int start_match; /* Offset to start of this match attempt */
|
||||
int current_position; /* Where we currently are in the subject */
|
||||
int capture_top; /* Max current capture */
|
||||
int capture_last; /* Most recently closed capture */
|
||||
void *callout_data; /* Data passed in with the call */
|
||||
/* ------------------- Added for Version 1 -------------------------- */
|
||||
int pattern_position; /* Offset to next item in the pattern */
|
||||
int next_item_length; /* Length of next item in the pattern */
|
||||
/* ------------------- Added for Version 2 -------------------------- */
|
||||
const PCRE_UCHAR16 *mark; /* Pointer to current mark or NULL */
|
||||
/* ------------------------------------------------------------------ */
|
||||
} pcre16_callout_block;
|
||||
|
||||
/* Indirection for store get and free functions. These can be set to
|
||||
alternative malloc/free functions if required. Special ones are used in the
|
||||
non-recursive case for "frames". There is also an optional callout function
|
||||
@ -269,47 +391,114 @@ PCRE_EXP_DECL void (*pcre_free)(void *);
|
||||
PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t);
|
||||
PCRE_EXP_DECL void (*pcre_stack_free)(void *);
|
||||
PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *);
|
||||
|
||||
PCRE_EXP_DECL void *(*pcre16_malloc)(size_t);
|
||||
PCRE_EXP_DECL void (*pcre16_free)(void *);
|
||||
PCRE_EXP_DECL void *(*pcre16_stack_malloc)(size_t);
|
||||
PCRE_EXP_DECL void (*pcre16_stack_free)(void *);
|
||||
PCRE_EXP_DECL int (*pcre16_callout)(pcre16_callout_block *);
|
||||
#else /* VPCOMPAT */
|
||||
PCRE_EXP_DECL void *pcre_malloc(size_t);
|
||||
PCRE_EXP_DECL void pcre_free(void *);
|
||||
PCRE_EXP_DECL void *pcre_stack_malloc(size_t);
|
||||
PCRE_EXP_DECL void pcre_stack_free(void *);
|
||||
PCRE_EXP_DECL int pcre_callout(pcre_callout_block *);
|
||||
|
||||
PCRE_EXP_DECL void *pcre16_malloc(size_t);
|
||||
PCRE_EXP_DECL void pcre16_free(void *);
|
||||
PCRE_EXP_DECL void *pcre16_stack_malloc(size_t);
|
||||
PCRE_EXP_DECL void pcre16_stack_free(void *);
|
||||
PCRE_EXP_DECL int pcre16_callout(pcre16_callout_block *);
|
||||
#endif /* VPCOMPAT */
|
||||
|
||||
/* User-defined callback that provides a stack just before the match starts. */
|
||||
|
||||
typedef pcre_jit_stack *(*pcre_jit_callback)(void *);
|
||||
typedef pcre16_jit_stack *(*pcre16_jit_callback)(void *);
|
||||
|
||||
/* Exported PCRE functions */
|
||||
|
||||
PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
|
||||
const unsigned char *);
|
||||
PCRE_EXP_DECL pcre16 *pcre16_compile(PCRE_SPTR16, int, const char **, int *,
|
||||
const unsigned char *);
|
||||
PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **,
|
||||
int *, const unsigned char *);
|
||||
PCRE_EXP_DECL pcre16 *pcre16_compile2(PCRE_SPTR16, int, int *, const char **,
|
||||
int *, const unsigned char *);
|
||||
PCRE_EXP_DECL int pcre_config(int, void *);
|
||||
PCRE_EXP_DECL int pcre16_config(int, void *);
|
||||
PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *,
|
||||
int *, int, const char *, char *, int);
|
||||
PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *,
|
||||
int);
|
||||
PCRE_EXP_DECL int pcre16_copy_named_substring(const pcre16 *, PCRE_SPTR16,
|
||||
int *, int, PCRE_SPTR16, PCRE_UCHAR16 *, int);
|
||||
PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int,
|
||||
char *, int);
|
||||
PCRE_EXP_DECL int pcre16_copy_substring(PCRE_SPTR16, int *, int, int,
|
||||
PCRE_UCHAR16 *, int);
|
||||
PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *,
|
||||
const char *, int, int, int, int *, int , int *, int);
|
||||
PCRE_EXP_DECL int pcre16_dfa_exec(const pcre16 *, const pcre16_extra *,
|
||||
PCRE_SPTR16, int, int, int, int *, int , int *, int);
|
||||
PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR,
|
||||
int, int, int, int *, int);
|
||||
PCRE_EXP_DECL int pcre16_exec(const pcre16 *, const pcre16_extra *,
|
||||
PCRE_SPTR16, int, int, int, int *, int);
|
||||
PCRE_EXP_DECL void pcre_free_substring(const char *);
|
||||
PCRE_EXP_DECL void pcre16_free_substring(PCRE_SPTR16);
|
||||
PCRE_EXP_DECL void pcre_free_substring_list(const char **);
|
||||
PCRE_EXP_DECL void pcre16_free_substring_list(PCRE_SPTR16 *);
|
||||
PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int,
|
||||
void *);
|
||||
PCRE_EXP_DECL int pcre16_fullinfo(const pcre16 *, const pcre16_extra *, int,
|
||||
void *);
|
||||
PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *,
|
||||
int *, int, const char *, const char **);
|
||||
PCRE_EXP_DECL int pcre16_get_named_substring(const pcre16 *, PCRE_SPTR16,
|
||||
int *, int, PCRE_SPTR16, PCRE_SPTR16 *);
|
||||
PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *);
|
||||
PCRE_EXP_DECL int pcre16_get_stringnumber(const pcre16 *, PCRE_SPTR16);
|
||||
PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *,
|
||||
char **, char **);
|
||||
PCRE_EXP_DECL int pcre16_get_stringtable_entries(const pcre16 *, PCRE_SPTR16,
|
||||
PCRE_UCHAR16 **, PCRE_UCHAR16 **);
|
||||
PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int,
|
||||
const char **);
|
||||
PCRE_EXP_DECL int pcre16_get_substring(PCRE_SPTR16, int *, int, int,
|
||||
PCRE_SPTR16 *);
|
||||
PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
|
||||
const char ***);
|
||||
PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
|
||||
PCRE_EXP_DECL int pcre16_get_substring_list(PCRE_SPTR16, int *, int,
|
||||
PCRE_SPTR16 **);
|
||||
PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
|
||||
PCRE_EXP_DECL const unsigned char *pcre16_maketables(void);
|
||||
PCRE_EXP_DECL int pcre_refcount(pcre *, int);
|
||||
PCRE_EXP_DECL int pcre16_refcount(pcre16 *, int);
|
||||
PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
|
||||
PCRE_EXP_DECL pcre16_extra *pcre16_study(const pcre16 *, int, const char **);
|
||||
PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
|
||||
PCRE_EXP_DECL void pcre16_free_study(pcre16_extra *);
|
||||
PCRE_EXP_DECL const char *pcre_version(void);
|
||||
PCRE_EXP_DECL const char *pcre16_version(void);
|
||||
|
||||
/* Utility functions for byte order swaps. */
|
||||
PCRE_EXP_DECL int pcre_pattern_to_host_byte_order(pcre *, pcre_extra *,
|
||||
const unsigned char *);
|
||||
PCRE_EXP_DECL int pcre16_pattern_to_host_byte_order(pcre16 *, pcre16_extra *,
|
||||
const unsigned char *);
|
||||
PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *,
|
||||
PCRE_SPTR16, int, int *, int);
|
||||
|
||||
/* JIT compiler related functions. */
|
||||
|
||||
PCRE_EXP_DECL pcre_jit_stack *pcre_jit_stack_alloc(int, int);
|
||||
PCRE_EXP_DECL pcre16_jit_stack *pcre16_jit_stack_alloc(int, int);
|
||||
PCRE_EXP_DECL void pcre_jit_stack_free(pcre_jit_stack *);
|
||||
PCRE_EXP_DECL void pcre16_jit_stack_free(pcre16_jit_stack *);
|
||||
PCRE_EXP_DECL void pcre_assign_jit_stack(pcre_extra *,
|
||||
pcre_jit_callback, void *);
|
||||
PCRE_EXP_DECL void pcre16_assign_jit_stack(pcre16_extra *,
|
||||
pcre16_jit_callback, void *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -43,6 +43,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "config.h"
|
||||
|
||||
/* Keep the original link size. */
|
||||
static int real_link_size = LINK_SIZE;
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@ -60,18 +63,41 @@ Arguments:
|
||||
Returns: 0 if data returned, negative on error
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_config(int what, void *where)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_config(int what, void *where)
|
||||
#endif
|
||||
{
|
||||
switch (what)
|
||||
{
|
||||
case PCRE_CONFIG_UTF8:
|
||||
#ifdef SUPPORT_UTF8
|
||||
#if defined COMPILE_PCRE16
|
||||
*((int *)where) = 0;
|
||||
return PCRE_ERROR_BADOPTION;
|
||||
#else
|
||||
#if defined SUPPORT_UTF
|
||||
*((int *)where) = 1;
|
||||
#else
|
||||
*((int *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
#endif
|
||||
|
||||
case PCRE_CONFIG_UTF16:
|
||||
#if defined COMPILE_PCRE8
|
||||
*((int *)where) = 0;
|
||||
return PCRE_ERROR_BADOPTION;
|
||||
#else
|
||||
#if defined SUPPORT_UTF
|
||||
*((int *)where) = 1;
|
||||
#else
|
||||
*((int *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
#endif
|
||||
|
||||
case PCRE_CONFIG_UNICODE_PROPERTIES:
|
||||
#ifdef SUPPORT_UCP
|
||||
@ -81,6 +107,22 @@ switch (what)
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_JIT:
|
||||
#ifdef SUPPORT_JIT
|
||||
*((int *)where) = 1;
|
||||
#else
|
||||
*((int *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_JITTARGET:
|
||||
#ifdef SUPPORT_JIT
|
||||
*((const char **)where) = PRIV(jit_get_target)();
|
||||
#else
|
||||
*((const char **)where) = NULL;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_NEWLINE:
|
||||
*((int *)where) = NEWLINE;
|
||||
break;
|
||||
@ -94,7 +136,7 @@ switch (what)
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_LINK_SIZE:
|
||||
*((int *)where) = LINK_SIZE;
|
||||
*((int *)where) = real_link_size;
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -63,13 +63,17 @@ Arguments:
|
||||
Returns: 0 if data returned, negative on error
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
|
||||
void *where)
|
||||
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data,
|
||||
int what, void *where)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_fullinfo(const pcre16 *argument_re, const pcre16_extra *extra_data,
|
||||
int what, void *where)
|
||||
#endif
|
||||
{
|
||||
real_pcre internal_re;
|
||||
pcre_study_data internal_study;
|
||||
const real_pcre *re = (const real_pcre *)argument_re;
|
||||
const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
|
||||
const pcre_study_data *study = NULL;
|
||||
|
||||
if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
|
||||
@ -77,12 +81,18 @@ if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
|
||||
if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
|
||||
study = (const pcre_study_data *)extra_data->study_data;
|
||||
|
||||
/* Check that the first field in the block is the magic number. If it is not,
|
||||
return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
|
||||
REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
|
||||
means that the pattern was probably compiled with a different endianness. */
|
||||
|
||||
if (re->magic_number != MAGIC_NUMBER)
|
||||
{
|
||||
re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
|
||||
if (re == NULL) return PCRE_ERROR_BADMAGIC;
|
||||
if (study != NULL) study = &internal_study;
|
||||
}
|
||||
return re->magic_number == REVERSED_MAGIC_NUMBER?
|
||||
PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
|
||||
|
||||
/* Check that this pattern was compiled in the correct bit mode */
|
||||
|
||||
if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
|
||||
|
||||
switch (what)
|
||||
{
|
||||
@ -98,6 +108,18 @@ switch (what)
|
||||
*((size_t *)where) = (study == NULL)? 0 : study->size;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_JITSIZE:
|
||||
#ifdef SUPPORT_JIT
|
||||
*((size_t *)where) =
|
||||
(extra_data != NULL &&
|
||||
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
|
||||
extra_data->executable_jit != NULL)?
|
||||
PRIV(jit_get_size)(extra_data->executable_jit) : 0;
|
||||
#else
|
||||
*((size_t *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE_INFO_CAPTURECOUNT:
|
||||
*((int *)where) = re->top_bracket;
|
||||
break;
|
||||
@ -108,7 +130,7 @@ switch (what)
|
||||
|
||||
case PCRE_INFO_FIRSTBYTE:
|
||||
*((int *)where) =
|
||||
((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
|
||||
((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
|
||||
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
|
||||
break;
|
||||
|
||||
@ -116,7 +138,7 @@ switch (what)
|
||||
block, not the internal copy (with flipped integer fields). */
|
||||
|
||||
case PCRE_INFO_FIRSTTABLE:
|
||||
*((const uschar **)where) =
|
||||
*((const pcre_uint8 **)where) =
|
||||
(study != NULL && (study->flags & PCRE_STUDY_MAPPED) != 0)?
|
||||
((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
|
||||
break;
|
||||
@ -124,12 +146,18 @@ switch (what)
|
||||
case PCRE_INFO_MINLENGTH:
|
||||
*((int *)where) =
|
||||
(study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0)?
|
||||
study->minlength : -1;
|
||||
(int)(study->minlength) : -1;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_JIT:
|
||||
*((int *)where) = extra_data != NULL &&
|
||||
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
|
||||
extra_data->executable_jit != NULL;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_LASTLITERAL:
|
||||
*((int *)where) =
|
||||
((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
|
||||
((re->flags & PCRE_REQCHSET) != 0)? re->req_char : -1;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_NAMEENTRYSIZE:
|
||||
@ -141,11 +169,11 @@ switch (what)
|
||||
break;
|
||||
|
||||
case PCRE_INFO_NAMETABLE:
|
||||
*((const uschar **)where) = (const uschar *)re + re->name_table_offset;
|
||||
*((const pcre_uchar **)where) = (const pcre_uchar *)re + re->name_table_offset;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_DEFAULT_TABLES:
|
||||
*((const uschar **)where) = (const uschar *)(_pcre_default_tables);
|
||||
*((const pcre_uint8 **)where) = (const pcre_uint8 *)(PRIV(default_tables));
|
||||
break;
|
||||
|
||||
/* From release 8.00 this will always return TRUE because NOPARTIAL is
|
||||
@ -163,6 +191,10 @@ switch (what)
|
||||
*((int *)where) = (re->flags & PCRE_HASCRORLF) != 0;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_MAXLOOKBEHIND:
|
||||
*((int *)where) = re->max_lookbehind;
|
||||
break;
|
||||
|
||||
default: return PCRE_ERROR_BADOPTION;
|
||||
}
|
||||
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -63,14 +63,20 @@ Returns: the number of the named parentheses, or a negative number
|
||||
(PCRE_ERROR_NOSUBSTRING) if not found
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_get_stringnumber(const pcre *code, const char *stringname)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_get_stringnumber(const pcre16 *code, PCRE_SPTR16 stringname)
|
||||
#endif
|
||||
{
|
||||
int rc;
|
||||
int entrysize;
|
||||
int top, bot;
|
||||
uschar *nametable;
|
||||
pcre_uchar *nametable;
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
|
||||
return rc;
|
||||
if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
|
||||
@ -79,14 +85,26 @@ if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
|
||||
return rc;
|
||||
if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
|
||||
return rc;
|
||||
#endif
|
||||
#ifdef COMPILE_PCRE16
|
||||
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
|
||||
return rc;
|
||||
if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
|
||||
|
||||
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
|
||||
return rc;
|
||||
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
|
||||
return rc;
|
||||
#endif
|
||||
|
||||
bot = 0;
|
||||
while (top > bot)
|
||||
{
|
||||
int mid = (top + bot) / 2;
|
||||
uschar *entry = nametable + entrysize*mid;
|
||||
int c = strcmp(stringname, (char *)(entry + 2));
|
||||
if (c == 0) return (entry[0] << 8) + entry[1];
|
||||
pcre_uchar *entry = nametable + entrysize*mid;
|
||||
int c = STRCMP_UC_UC((pcre_uchar *)stringname,
|
||||
(pcre_uchar *)(entry + IMM2_SIZE));
|
||||
if (c == 0) return GET2(entry, 0);
|
||||
if (c > 0) bot = mid + 1; else top = mid;
|
||||
}
|
||||
|
||||
@ -112,15 +130,22 @@ Returns: the length of each entry, or a negative number
|
||||
(PCRE_ERROR_NOSUBSTRING) if not found
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_get_stringtable_entries(const pcre *code, const char *stringname,
|
||||
char **firstptr, char **lastptr)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_get_stringtable_entries(const pcre16 *code, PCRE_SPTR16 stringname,
|
||||
PCRE_UCHAR16 **firstptr, PCRE_UCHAR16 **lastptr)
|
||||
#endif
|
||||
{
|
||||
int rc;
|
||||
int entrysize;
|
||||
int top, bot;
|
||||
uschar *nametable, *lastentry;
|
||||
pcre_uchar *nametable, *lastentry;
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
|
||||
return rc;
|
||||
if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
|
||||
@ -129,30 +154,49 @@ if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
|
||||
return rc;
|
||||
if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
|
||||
return rc;
|
||||
#endif
|
||||
#ifdef COMPILE_PCRE16
|
||||
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
|
||||
return rc;
|
||||
if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
|
||||
|
||||
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
|
||||
return rc;
|
||||
if ((rc = pcre16_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
|
||||
return rc;
|
||||
#endif
|
||||
|
||||
lastentry = nametable + entrysize * (top - 1);
|
||||
bot = 0;
|
||||
while (top > bot)
|
||||
{
|
||||
int mid = (top + bot) / 2;
|
||||
uschar *entry = nametable + entrysize*mid;
|
||||
int c = strcmp(stringname, (char *)(entry + 2));
|
||||
pcre_uchar *entry = nametable + entrysize*mid;
|
||||
int c = STRCMP_UC_UC((pcre_uchar *)stringname,
|
||||
(pcre_uchar *)(entry + IMM2_SIZE));
|
||||
if (c == 0)
|
||||
{
|
||||
uschar *first = entry;
|
||||
uschar *last = entry;
|
||||
pcre_uchar *first = entry;
|
||||
pcre_uchar *last = entry;
|
||||
while (first > nametable)
|
||||
{
|
||||
if (strcmp(stringname, (char *)(first - entrysize + 2)) != 0) break;
|
||||
if (STRCMP_UC_UC((pcre_uchar *)stringname,
|
||||
(pcre_uchar *)(first - entrysize + IMM2_SIZE)) != 0) break;
|
||||
first -= entrysize;
|
||||
}
|
||||
while (last < lastentry)
|
||||
{
|
||||
if (strcmp(stringname, (char *)(last + entrysize + 2)) != 0) break;
|
||||
if (STRCMP_UC_UC((pcre_uchar *)stringname,
|
||||
(pcre_uchar *)(last + entrysize + IMM2_SIZE)) != 0) break;
|
||||
last += entrysize;
|
||||
}
|
||||
#ifdef COMPILE_PCRE8
|
||||
*firstptr = (char *)first;
|
||||
*lastptr = (char *)last;
|
||||
#else
|
||||
*firstptr = (PCRE_UCHAR16 *)first;
|
||||
*lastptr = (PCRE_UCHAR16 *)last;
|
||||
#endif
|
||||
return entrysize;
|
||||
}
|
||||
if (c > 0) bot = mid + 1; else top = mid;
|
||||
@ -180,23 +224,39 @@ Returns: the number of the first that is set,
|
||||
or a negative number on error
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
static int
|
||||
get_first_set(const pcre *code, const char *stringname, int *ovector)
|
||||
#else
|
||||
static int
|
||||
get_first_set(const pcre16 *code, PCRE_SPTR16 stringname, int *ovector)
|
||||
#endif
|
||||
{
|
||||
const real_pcre *re = (const real_pcre *)code;
|
||||
const REAL_PCRE *re = (const REAL_PCRE *)code;
|
||||
int entrysize;
|
||||
pcre_uchar *entry;
|
||||
#ifdef COMPILE_PCRE8
|
||||
char *first, *last;
|
||||
uschar *entry;
|
||||
#else
|
||||
PCRE_UCHAR16 *first, *last;
|
||||
#endif
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0)
|
||||
return pcre_get_stringnumber(code, stringname);
|
||||
entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last);
|
||||
#else
|
||||
if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0)
|
||||
return pcre16_get_stringnumber(code, stringname);
|
||||
entrysize = pcre16_get_stringtable_entries(code, stringname, &first, &last);
|
||||
#endif
|
||||
if (entrysize <= 0) return entrysize;
|
||||
for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize)
|
||||
for (entry = (pcre_uchar *)first; entry <= (pcre_uchar *)last; entry += entrysize)
|
||||
{
|
||||
int n = (entry[0] << 8) + entry[1];
|
||||
int n = GET2(entry, 0);
|
||||
if (ovector[n*2] >= 0) return n;
|
||||
}
|
||||
return (first[0] << 8) + first[1];
|
||||
return GET2(entry, 0);
|
||||
}
|
||||
|
||||
|
||||
@ -229,9 +289,15 @@ Returns: if successful:
|
||||
PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_copy_substring(const char *subject, int *ovector, int stringcount,
|
||||
int stringnumber, char *buffer, int size)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_copy_substring(PCRE_SPTR16 subject, int *ovector, int stringcount,
|
||||
int stringnumber, PCRE_UCHAR16 *buffer, int size)
|
||||
#endif
|
||||
{
|
||||
int yield;
|
||||
if (stringnumber < 0 || stringnumber >= stringcount)
|
||||
@ -239,7 +305,7 @@ if (stringnumber < 0 || stringnumber >= stringcount)
|
||||
stringnumber *= 2;
|
||||
yield = ovector[stringnumber+1] - ovector[stringnumber];
|
||||
if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
|
||||
memcpy(buffer, subject + ovector[stringnumber], yield);
|
||||
memcpy(buffer, subject + ovector[stringnumber], IN_UCHARS(yield));
|
||||
buffer[yield] = 0;
|
||||
return yield;
|
||||
}
|
||||
@ -274,13 +340,25 @@ Returns: if successful:
|
||||
PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector,
|
||||
int stringcount, const char *stringname, char *buffer, int size)
|
||||
pcre_copy_named_substring(const pcre *code, const char *subject,
|
||||
int *ovector, int stringcount, const char *stringname,
|
||||
char *buffer, int size)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_copy_named_substring(const pcre16 *code, PCRE_SPTR16 subject,
|
||||
int *ovector, int stringcount, PCRE_SPTR16 stringname,
|
||||
PCRE_UCHAR16 *buffer, int size)
|
||||
#endif
|
||||
{
|
||||
int n = get_first_set(code, stringname, ovector);
|
||||
if (n <= 0) return n;
|
||||
#ifdef COMPILE_PCRE8
|
||||
return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size);
|
||||
#else
|
||||
return pcre16_copy_substring(subject, ovector, stringcount, n, buffer, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -306,29 +384,39 @@ Returns: if successful: 0
|
||||
PCRE_ERROR_NOMEMORY (-6) failed to get store
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
|
||||
const char ***listptr)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_get_substring_list(PCRE_SPTR16 subject, int *ovector, int stringcount,
|
||||
PCRE_SPTR16 **listptr)
|
||||
#endif
|
||||
{
|
||||
int i;
|
||||
int size = sizeof(char *);
|
||||
int size = sizeof(pcre_uchar *);
|
||||
int double_count = stringcount * 2;
|
||||
char **stringlist;
|
||||
char *p;
|
||||
pcre_uchar **stringlist;
|
||||
pcre_uchar *p;
|
||||
|
||||
for (i = 0; i < double_count; i += 2)
|
||||
size += sizeof(char *) + ovector[i+1] - ovector[i] + 1;
|
||||
size += sizeof(pcre_uchar *) + IN_UCHARS(ovector[i+1] - ovector[i] + 1);
|
||||
|
||||
stringlist = (char **)(pcre_malloc)(size);
|
||||
stringlist = (pcre_uchar **)(PUBL(malloc))(size);
|
||||
if (stringlist == NULL) return PCRE_ERROR_NOMEMORY;
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
*listptr = (const char **)stringlist;
|
||||
p = (char *)(stringlist + stringcount + 1);
|
||||
#else
|
||||
*listptr = (PCRE_SPTR16 *)stringlist;
|
||||
#endif
|
||||
p = (pcre_uchar *)(stringlist + stringcount + 1);
|
||||
|
||||
for (i = 0; i < double_count; i += 2)
|
||||
{
|
||||
int len = ovector[i+1] - ovector[i];
|
||||
memcpy(p, subject + ovector[i], len);
|
||||
memcpy(p, subject + ovector[i], IN_UCHARS(len));
|
||||
*stringlist++ = p;
|
||||
p += len;
|
||||
*p++ = 0;
|
||||
@ -345,16 +433,22 @@ return 0;
|
||||
*************************************************/
|
||||
|
||||
/* This function exists for the benefit of people calling PCRE from non-C
|
||||
programs that can call its functions, but not free() or (pcre_free)() directly.
|
||||
programs that can call its functions, but not free() or (PUBL(free))()
|
||||
directly.
|
||||
|
||||
Argument: the result of a previous pcre_get_substring_list()
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
|
||||
pcre_free_substring_list(const char **pointer)
|
||||
#else
|
||||
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
|
||||
pcre16_free_substring_list(PCRE_SPTR16 *pointer)
|
||||
#endif
|
||||
{
|
||||
(pcre_free)((void *)pointer);
|
||||
(PUBL(free))((void *)pointer);
|
||||
}
|
||||
|
||||
|
||||
@ -384,21 +478,31 @@ Returns: if successful:
|
||||
PCRE_ERROR_NOSUBSTRING (-7) substring not present
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_get_substring(const char *subject, int *ovector, int stringcount,
|
||||
int stringnumber, const char **stringptr)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_get_substring(PCRE_SPTR16 subject, int *ovector, int stringcount,
|
||||
int stringnumber, PCRE_SPTR16 *stringptr)
|
||||
#endif
|
||||
{
|
||||
int yield;
|
||||
char *substring;
|
||||
pcre_uchar *substring;
|
||||
if (stringnumber < 0 || stringnumber >= stringcount)
|
||||
return PCRE_ERROR_NOSUBSTRING;
|
||||
stringnumber *= 2;
|
||||
yield = ovector[stringnumber+1] - ovector[stringnumber];
|
||||
substring = (char *)(pcre_malloc)(yield + 1);
|
||||
substring = (pcre_uchar *)(PUBL(malloc))(IN_UCHARS(yield + 1));
|
||||
if (substring == NULL) return PCRE_ERROR_NOMEMORY;
|
||||
memcpy(substring, subject + ovector[stringnumber], yield);
|
||||
memcpy(substring, subject + ovector[stringnumber], IN_UCHARS(yield));
|
||||
substring[yield] = 0;
|
||||
*stringptr = substring;
|
||||
#ifdef COMPILE_PCRE8
|
||||
*stringptr = (const char *)substring;
|
||||
#else
|
||||
*stringptr = (PCRE_SPTR16)substring;
|
||||
#endif
|
||||
return yield;
|
||||
}
|
||||
|
||||
@ -431,13 +535,25 @@ Returns: if successful:
|
||||
PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
|
||||
int stringcount, const char *stringname, const char **stringptr)
|
||||
pcre_get_named_substring(const pcre *code, const char *subject,
|
||||
int *ovector, int stringcount, const char *stringname,
|
||||
const char **stringptr)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_get_named_substring(const pcre16 *code, PCRE_SPTR16 subject,
|
||||
int *ovector, int stringcount, PCRE_SPTR16 stringname,
|
||||
PCRE_SPTR16 *stringptr)
|
||||
#endif
|
||||
{
|
||||
int n = get_first_set(code, stringname, ovector);
|
||||
if (n <= 0) return n;
|
||||
#ifdef COMPILE_PCRE8
|
||||
return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
|
||||
#else
|
||||
return pcre16_get_substring(subject, ovector, stringcount, n, stringptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -448,16 +564,22 @@ return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
|
||||
*************************************************/
|
||||
|
||||
/* This function exists for the benefit of people calling PCRE from non-C
|
||||
programs that can call its functions, but not free() or (pcre_free)() directly.
|
||||
programs that can call its functions, but not free() or (PUBL(free))()
|
||||
directly.
|
||||
|
||||
Argument: the result of a previous pcre_get_substring()
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
|
||||
pcre_free_substring(const char *pointer)
|
||||
#else
|
||||
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
|
||||
pcre16_free_substring(PCRE_SPTR16 pointer)
|
||||
#endif
|
||||
{
|
||||
(pcre_free)((void *)pointer);
|
||||
(PUBL(free))((void *)pointer);
|
||||
}
|
||||
|
||||
/* End of pcre_get.c */
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -65,18 +65,18 @@ static void LocalPcreFree(void* aPtr)
|
||||
{
|
||||
free(aPtr);
|
||||
}
|
||||
PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = LocalPcreMalloc;
|
||||
PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = LocalPcreFree;
|
||||
PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = LocalPcreMalloc;
|
||||
PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = LocalPcreFree;
|
||||
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
|
||||
PCRE_EXP_DATA_DEFN void *(*PUBL(malloc))(size_t) = LocalPcreMalloc;
|
||||
PCRE_EXP_DATA_DEFN void (*PUBL(free))(void *) = LocalPcreFree;
|
||||
PCRE_EXP_DATA_DEFN void *(*PUBL(stack_malloc))(size_t) = LocalPcreMalloc;
|
||||
PCRE_EXP_DATA_DEFN void (*PUBL(stack_free))(void *) = LocalPcreFree;
|
||||
PCRE_EXP_DATA_DEFN int (*PUBL(callout))(PUBL(callout_block) *) = NULL;
|
||||
|
||||
#elif !defined VPCOMPAT
|
||||
PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = malloc;
|
||||
PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = free;
|
||||
PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = malloc;
|
||||
PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = free;
|
||||
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
|
||||
PCRE_EXP_DATA_DEFN void *(*PUBL(malloc))(size_t) = malloc;
|
||||
PCRE_EXP_DATA_DEFN void (*PUBL(free))(void *) = free;
|
||||
PCRE_EXP_DATA_DEFN void *(*PUBL(stack_malloc))(size_t) = malloc;
|
||||
PCRE_EXP_DATA_DEFN void (*PUBL(stack_free))(void *) = free;
|
||||
PCRE_EXP_DATA_DEFN int (*PUBL(callout))(PUBL(callout_block) *) = NULL;
|
||||
#endif
|
||||
|
||||
/* End of pcre_globals.c */
|
||||
|
@ -1,91 +0,0 @@
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains the external function pcre_info(), which gives some
|
||||
information about a compiled pattern. However, use of this function is now
|
||||
deprecated, as it has been superseded by pcre_fullinfo(). */
|
||||
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
/*************************************************
|
||||
* (Obsolete) Return info about compiled pattern *
|
||||
*************************************************/
|
||||
|
||||
/* This is the original "info" function. It picks potentially useful data out
|
||||
of the private structure, but its interface was too rigid. It remains for
|
||||
backwards compatibility. The public options are passed back in an int - though
|
||||
the re->options field has been expanded to a long int, all the public options are
|
||||
at the low end of it, and so even on 16-bit systems this will still be OK.
|
||||
Therefore, I haven't changed the API for pcre_info().
|
||||
|
||||
Arguments:
|
||||
argument_re points to compiled code
|
||||
optptr where to pass back the options
|
||||
first_byte where to pass back the first character,
|
||||
or -1 if multiline and all branches start ^,
|
||||
or -2 otherwise
|
||||
|
||||
Returns: number of capturing subpatterns
|
||||
or negative values on error
|
||||
*/
|
||||
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
|
||||
{
|
||||
real_pcre internal_re;
|
||||
const real_pcre *re = (const real_pcre *)argument_re;
|
||||
if (re == NULL) return PCRE_ERROR_NULL;
|
||||
if (re->magic_number != MAGIC_NUMBER)
|
||||
{
|
||||
re = _pcre_try_flipped(re, &internal_re, NULL, NULL);
|
||||
if (re == NULL) return PCRE_ERROR_BADMAGIC;
|
||||
}
|
||||
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
|
||||
if (first_byte != NULL)
|
||||
*first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
|
||||
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
|
||||
return re->top_bracket;
|
||||
}
|
||||
|
||||
/* End of pcre_info.c */
|
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -57,21 +57,26 @@ compilation of dftables.c, in which case the macro DFTABLES is defined. */
|
||||
/* This function builds a set of character tables for use by PCRE and returns
|
||||
a pointer to them. They are built using the ctype functions, and consequently
|
||||
their contents will depend upon the current locale setting. When compiled as
|
||||
part of the library, the store is obtained via pcre_malloc(), but when compiled
|
||||
inside dftables, use malloc().
|
||||
part of the library, the store is obtained via PUBL(malloc)(), but when
|
||||
compiled inside dftables, use malloc().
|
||||
|
||||
Arguments: none
|
||||
Returns: pointer to the contiguous block of data
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
const unsigned char *
|
||||
pcre_maketables(void)
|
||||
#else
|
||||
const unsigned char *
|
||||
pcre16_maketables(void)
|
||||
#endif
|
||||
{
|
||||
unsigned char *yield, *p;
|
||||
int i;
|
||||
|
||||
#ifndef DFTABLES
|
||||
yield = (unsigned char*)(pcre_malloc)(tables_length);
|
||||
yield = (unsigned char*)(PUBL(malloc))(tables_length);
|
||||
#else
|
||||
yield = (unsigned char*)malloc(tables_length);
|
||||
#endif
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -65,16 +65,25 @@ Arguments:
|
||||
type the newline type
|
||||
endptr pointer to the end of the string
|
||||
lenptr where to return the length
|
||||
utf8 TRUE if in utf8 mode
|
||||
utf TRUE if in utf mode
|
||||
|
||||
Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
_pcre_is_newline(USPTR ptr, int type, USPTR endptr, int *lenptr, BOOL utf8)
|
||||
PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr,
|
||||
BOOL utf)
|
||||
{
|
||||
int c;
|
||||
if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
|
||||
(void)utf;
|
||||
#ifdef SUPPORT_UTF
|
||||
if (utf)
|
||||
{
|
||||
GETCHAR(c, ptr);
|
||||
}
|
||||
else
|
||||
#endif /* SUPPORT_UTF */
|
||||
c = *ptr;
|
||||
|
||||
if (type == NLTYPE_ANYCRLF) switch(c)
|
||||
{
|
||||
@ -93,9 +102,15 @@ else switch(c)
|
||||
case 0x000c: *lenptr = 1; return TRUE; /* FF */
|
||||
case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
|
||||
return TRUE; /* CR */
|
||||
case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
|
||||
#ifdef COMPILE_PCRE8
|
||||
case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: *lenptr = 3; return TRUE; /* PS */
|
||||
#else
|
||||
case 0x0085: /* NEL */
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: *lenptr = 1; return TRUE; /* PS */
|
||||
#endif /* COMPILE_PCRE8 */
|
||||
default: return FALSE;
|
||||
}
|
||||
}
|
||||
@ -114,26 +129,27 @@ Arguments:
|
||||
type the newline type
|
||||
startptr pointer to the start of the string
|
||||
lenptr where to return the length
|
||||
utf8 TRUE if in utf8 mode
|
||||
utf TRUE if in utf mode
|
||||
|
||||
Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
_pcre_was_newline(USPTR ptr, int type, USPTR startptr, int *lenptr, BOOL utf8)
|
||||
PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr,
|
||||
BOOL utf)
|
||||
{
|
||||
int c;
|
||||
(void)utf;
|
||||
ptr--;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8)
|
||||
#ifdef SUPPORT_UTF
|
||||
if (utf)
|
||||
{
|
||||
BACKCHAR(ptr);
|
||||
GETCHAR(c, ptr);
|
||||
}
|
||||
else c = *ptr;
|
||||
#else /* no UTF-8 support */
|
||||
c = *ptr;
|
||||
#endif /* SUPPORT_UTF8 */
|
||||
else
|
||||
#endif /* SUPPORT_UTF */
|
||||
c = *ptr;
|
||||
|
||||
if (type == NLTYPE_ANYCRLF) switch(c)
|
||||
{
|
||||
@ -150,9 +166,15 @@ else switch(c)
|
||||
case 0x000b: /* VT */
|
||||
case 0x000c: /* FF */
|
||||
case 0x000d: *lenptr = 1; return TRUE; /* CR */
|
||||
case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
|
||||
#ifdef COMPILE_PCRE8
|
||||
case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: *lenptr = 3; return TRUE; /* PS */
|
||||
#else
|
||||
case 0x0085: /* NEL */
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: *lenptr = 1; return TRUE; /* PS */
|
||||
#endif /* COMPILE_PCRE8 */
|
||||
default: return FALSE;
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -50,35 +50,45 @@ character value into a UTF8 string. */
|
||||
* Convert character value to UTF-8 *
|
||||
*************************************************/
|
||||
|
||||
/* This function takes an integer value in the range 0 - 0x7fffffff
|
||||
and encodes it as a UTF-8 character in 0 to 6 bytes.
|
||||
/* This function takes an integer value in the range 0 - 0x10ffff
|
||||
and encodes it as a UTF-8 character in 1 to 4 pcre_uchars.
|
||||
|
||||
Arguments:
|
||||
cvalue the character value
|
||||
buffer pointer to buffer for result - at least 6 bytes long
|
||||
buffer pointer to buffer for result - at least 6 pcre_uchars long
|
||||
|
||||
Returns: number of characters placed in the buffer
|
||||
*/
|
||||
|
||||
int
|
||||
_pcre_ord2utf8(int cvalue, uschar *buffer)
|
||||
PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
#ifdef SUPPORT_UTF
|
||||
|
||||
register int i, j;
|
||||
for (i = 0; i < _pcre_utf8_table1_size; i++)
|
||||
if (cvalue <= _pcre_utf8_table1[i]) break;
|
||||
|
||||
/* Checking invalid cvalue character, encoded as invalid UTF-16 character.
|
||||
Should never happen in practice. */
|
||||
if ((cvalue & 0xf800) == 0xd800 || cvalue >= 0x110000)
|
||||
cvalue = 0xfffe;
|
||||
|
||||
for (i = 0; i < PRIV(utf8_table1_size); i++)
|
||||
if ((int)cvalue <= PRIV(utf8_table1)[i]) break;
|
||||
buffer += i;
|
||||
for (j = i; j > 0; j--)
|
||||
{
|
||||
*buffer-- = 0x80 | (cvalue & 0x3f);
|
||||
cvalue >>= 6;
|
||||
}
|
||||
*buffer = _pcre_utf8_table2[i] | cvalue;
|
||||
*buffer = PRIV(utf8_table2)[i] | cvalue;
|
||||
return i + 1;
|
||||
|
||||
#else
|
||||
|
||||
(void)(cvalue); /* Keep compiler happy; this function won't ever be */
|
||||
(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
|
||||
(void)(buffer); /* called when SUPPORT_UTF is not defined. */
|
||||
return 0;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -66,11 +66,18 @@ Returns: the (possibly updated) count value (a non-negative number), or
|
||||
a negative error number
|
||||
*/
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_refcount(pcre *argument_re, int adjust)
|
||||
#else
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre16_refcount(pcre16 *argument_re, int adjust)
|
||||
#endif
|
||||
{
|
||||
real_pcre *re = (real_pcre *)argument_re;
|
||||
REAL_PCRE *re = (REAL_PCRE *)argument_re;
|
||||
if (re == NULL) return PCRE_ERROR_NULL;
|
||||
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
|
||||
if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
|
||||
re->ref_count = (-adjust > re->ref_count)? 0 :
|
||||
(adjust + re->ref_count > 65535)? 65535 :
|
||||
re->ref_count + adjust;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -37,6 +37,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef PCRE_INCLUDED
|
||||
|
||||
/* This module contains some fixed tables that are used by more than one of the
|
||||
PCRE code modules. The tables are also #included by the pcretest program, which
|
||||
@ -48,11 +49,12 @@ clashes with the library. */
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
#endif /* PCRE_INCLUDED */
|
||||
|
||||
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
|
||||
the definition is next to the definition of the opcodes in pcre_internal.h. */
|
||||
|
||||
const uschar _pcre_OP_lengths[] = { OP_LENGTHS };
|
||||
const pcre_uint8 PRIV(OP_lengths)[] = { OP_LENGTHS };
|
||||
|
||||
|
||||
|
||||
@ -63,31 +65,38 @@ const uschar _pcre_OP_lengths[] = { OP_LENGTHS };
|
||||
/* These are the breakpoints for different numbers of bytes in a UTF-8
|
||||
character. */
|
||||
|
||||
#ifdef SUPPORT_UTF8
|
||||
#if (defined SUPPORT_UTF && defined COMPILE_PCRE8) \
|
||||
|| (defined PCRE_INCLUDED && defined SUPPORT_PCRE16)
|
||||
|
||||
const int _pcre_utf8_table1[] =
|
||||
/* These tables are also required by pcretest in 16 bit mode. */
|
||||
|
||||
const int PRIV(utf8_table1)[] =
|
||||
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
|
||||
|
||||
const int _pcre_utf8_table1_size = sizeof(_pcre_utf8_table1)/sizeof(int);
|
||||
const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int);
|
||||
|
||||
/* These are the indicator bits and the mask for the data bits to set in the
|
||||
first byte of a character, indexed by the number of additional bytes. */
|
||||
|
||||
const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||
const int PRIV(utf8_table2)[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||
|
||||
/* Table of the number of extra bytes, indexed by the first byte masked with
|
||||
0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
|
||||
|
||||
const uschar _pcre_utf8_table4[] = {
|
||||
const pcre_uint8 PRIV(utf8_table4)[] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
|
||||
|
||||
#endif /* (SUPPORT_UTF && COMPILE_PCRE8) || (PCRE_INCLUDED && SUPPORT_PCRE16)*/
|
||||
|
||||
#ifdef SUPPORT_UTF
|
||||
|
||||
/* Table to translate from particular type value to the general value. */
|
||||
|
||||
const int _pcre_ucp_gentype[] = {
|
||||
const int PRIV(ucp_gentype)[] = {
|
||||
ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
|
||||
ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
|
||||
ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
|
||||
@ -98,6 +107,21 @@ const int _pcre_ucp_gentype[] = {
|
||||
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
|
||||
};
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
/* This table reverses PRIV(ucp_gentype). We can save the cost
|
||||
of a memory load. */
|
||||
|
||||
const int PRIV(ucp_typerange)[] = {
|
||||
ucp_Cc, ucp_Cs,
|
||||
ucp_Ll, ucp_Lu,
|
||||
ucp_Mc, ucp_Mn,
|
||||
ucp_Nd, ucp_No,
|
||||
ucp_Pc, ucp_Ps,
|
||||
ucp_Sc, ucp_So,
|
||||
ucp_Zl, ucp_Zs,
|
||||
};
|
||||
#endif /* SUPPORT_JIT */
|
||||
|
||||
/* The pcre_utt[] table below translates Unicode property names into type and
|
||||
code values. It is searched by binary chop, so must be in collating sequence of
|
||||
name. Originally, the table contained pointers to the name strings in the first
|
||||
@ -108,7 +132,7 @@ table itself. Maintenance is more error-prone, but frequent changes to this
|
||||
data are unlikely.
|
||||
|
||||
July 2008: There is now a script called maint/GenerateUtt.py that can be used
|
||||
to generate this data instead of maintaining it entirely by hand.
|
||||
to generate this data automatically instead of maintaining it by hand.
|
||||
|
||||
The script was updated in March 2009 to generate a new EBCDIC-compliant
|
||||
version. Like all other character and string literals that are compared against
|
||||
@ -121,8 +145,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
|
||||
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
|
||||
#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
|
||||
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
|
||||
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
|
||||
#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
|
||||
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
|
||||
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
|
||||
@ -131,6 +157,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Carian0 STR_C STR_a STR_r STR_i STR_a STR_n "\0"
|
||||
#define STRING_Cc0 STR_C STR_c "\0"
|
||||
#define STRING_Cf0 STR_C STR_f "\0"
|
||||
#define STRING_Chakma0 STR_C STR_h STR_a STR_k STR_m STR_a "\0"
|
||||
#define STRING_Cham0 STR_C STR_h STR_a STR_m "\0"
|
||||
#define STRING_Cherokee0 STR_C STR_h STR_e STR_r STR_o STR_k STR_e STR_e "\0"
|
||||
#define STRING_Cn0 STR_C STR_n "\0"
|
||||
@ -184,9 +211,13 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
|
||||
#define STRING_M0 STR_M "\0"
|
||||
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
|
||||
#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
|
||||
#define STRING_Mc0 STR_M STR_c "\0"
|
||||
#define STRING_Me0 STR_M STR_e "\0"
|
||||
#define STRING_Meetei_Mayek0 STR_M STR_e STR_e STR_t STR_e STR_i STR_UNDERSCORE STR_M STR_a STR_y STR_e STR_k "\0"
|
||||
#define STRING_Meroitic_Cursive0 STR_M STR_e STR_r STR_o STR_i STR_t STR_i STR_c STR_UNDERSCORE STR_C STR_u STR_r STR_s STR_i STR_v STR_e "\0"
|
||||
#define STRING_Meroitic_Hieroglyphs0 STR_M STR_e STR_r STR_o STR_i STR_t STR_i STR_c STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
|
||||
#define STRING_Miao0 STR_M STR_i STR_a STR_o "\0"
|
||||
#define STRING_Mn0 STR_M STR_n "\0"
|
||||
#define STRING_Mongolian0 STR_M STR_o STR_n STR_g STR_o STR_l STR_i STR_a STR_n "\0"
|
||||
#define STRING_Myanmar0 STR_M STR_y STR_a STR_n STR_m STR_a STR_r "\0"
|
||||
@ -220,11 +251,13 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Samaritan0 STR_S STR_a STR_m STR_a STR_r STR_i STR_t STR_a STR_n "\0"
|
||||
#define STRING_Saurashtra0 STR_S STR_a STR_u STR_r STR_a STR_s STR_h STR_t STR_r STR_a "\0"
|
||||
#define STRING_Sc0 STR_S STR_c "\0"
|
||||
#define STRING_Sharada0 STR_S STR_h STR_a STR_r STR_a STR_d STR_a "\0"
|
||||
#define STRING_Shavian0 STR_S STR_h STR_a STR_v STR_i STR_a STR_n "\0"
|
||||
#define STRING_Sinhala0 STR_S STR_i STR_n STR_h STR_a STR_l STR_a "\0"
|
||||
#define STRING_Sk0 STR_S STR_k "\0"
|
||||
#define STRING_Sm0 STR_S STR_m "\0"
|
||||
#define STRING_So0 STR_S STR_o "\0"
|
||||
#define STRING_Sora_Sompeng0 STR_S STR_o STR_r STR_a STR_UNDERSCORE STR_S STR_o STR_m STR_p STR_e STR_n STR_g "\0"
|
||||
#define STRING_Sundanese0 STR_S STR_u STR_n STR_d STR_a STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Syloti_Nagri0 STR_S STR_y STR_l STR_o STR_t STR_i STR_UNDERSCORE STR_N STR_a STR_g STR_r STR_i "\0"
|
||||
#define STRING_Syriac0 STR_S STR_y STR_r STR_i STR_a STR_c "\0"
|
||||
@ -233,6 +266,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Tai_Le0 STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_e "\0"
|
||||
#define STRING_Tai_Tham0 STR_T STR_a STR_i STR_UNDERSCORE STR_T STR_h STR_a STR_m "\0"
|
||||
#define STRING_Tai_Viet0 STR_T STR_a STR_i STR_UNDERSCORE STR_V STR_i STR_e STR_t "\0"
|
||||
#define STRING_Takri0 STR_T STR_a STR_k STR_r STR_i "\0"
|
||||
#define STRING_Tamil0 STR_T STR_a STR_m STR_i STR_l "\0"
|
||||
#define STRING_Telugu0 STR_T STR_e STR_l STR_u STR_g STR_u "\0"
|
||||
#define STRING_Thaana0 STR_T STR_h STR_a STR_a STR_n STR_a "\0"
|
||||
@ -251,15 +285,17 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Zp0 STR_Z STR_p "\0"
|
||||
#define STRING_Zs0 STR_Z STR_s "\0"
|
||||
|
||||
const char _pcre_utt_names[] =
|
||||
const char PRIV(utt_names)[] =
|
||||
STRING_Any0
|
||||
STRING_Arabic0
|
||||
STRING_Armenian0
|
||||
STRING_Avestan0
|
||||
STRING_Balinese0
|
||||
STRING_Bamum0
|
||||
STRING_Batak0
|
||||
STRING_Bengali0
|
||||
STRING_Bopomofo0
|
||||
STRING_Brahmi0
|
||||
STRING_Braille0
|
||||
STRING_Buginese0
|
||||
STRING_Buhid0
|
||||
@ -268,6 +304,7 @@ const char _pcre_utt_names[] =
|
||||
STRING_Carian0
|
||||
STRING_Cc0
|
||||
STRING_Cf0
|
||||
STRING_Chakma0
|
||||
STRING_Cham0
|
||||
STRING_Cherokee0
|
||||
STRING_Cn0
|
||||
@ -321,9 +358,13 @@ const char _pcre_utt_names[] =
|
||||
STRING_Lydian0
|
||||
STRING_M0
|
||||
STRING_Malayalam0
|
||||
STRING_Mandaic0
|
||||
STRING_Mc0
|
||||
STRING_Me0
|
||||
STRING_Meetei_Mayek0
|
||||
STRING_Meroitic_Cursive0
|
||||
STRING_Meroitic_Hieroglyphs0
|
||||
STRING_Miao0
|
||||
STRING_Mn0
|
||||
STRING_Mongolian0
|
||||
STRING_Myanmar0
|
||||
@ -357,11 +398,13 @@ const char _pcre_utt_names[] =
|
||||
STRING_Samaritan0
|
||||
STRING_Saurashtra0
|
||||
STRING_Sc0
|
||||
STRING_Sharada0
|
||||
STRING_Shavian0
|
||||
STRING_Sinhala0
|
||||
STRING_Sk0
|
||||
STRING_Sm0
|
||||
STRING_So0
|
||||
STRING_Sora_Sompeng0
|
||||
STRING_Sundanese0
|
||||
STRING_Syloti_Nagri0
|
||||
STRING_Syriac0
|
||||
@ -370,6 +413,7 @@ const char _pcre_utt_names[] =
|
||||
STRING_Tai_Le0
|
||||
STRING_Tai_Tham0
|
||||
STRING_Tai_Viet0
|
||||
STRING_Takri0
|
||||
STRING_Tamil0
|
||||
STRING_Telugu0
|
||||
STRING_Thaana0
|
||||
@ -388,146 +432,156 @@ const char _pcre_utt_names[] =
|
||||
STRING_Zp0
|
||||
STRING_Zs0;
|
||||
|
||||
const ucp_type_table _pcre_utt[] = {
|
||||
const ucp_type_table PRIV(utt)[] = {
|
||||
{ 0, PT_ANY, 0 },
|
||||
{ 4, PT_SC, ucp_Arabic },
|
||||
{ 11, PT_SC, ucp_Armenian },
|
||||
{ 20, PT_SC, ucp_Avestan },
|
||||
{ 28, PT_SC, ucp_Balinese },
|
||||
{ 37, PT_SC, ucp_Bamum },
|
||||
{ 43, PT_SC, ucp_Bengali },
|
||||
{ 51, PT_SC, ucp_Bopomofo },
|
||||
{ 60, PT_SC, ucp_Braille },
|
||||
{ 68, PT_SC, ucp_Buginese },
|
||||
{ 77, PT_SC, ucp_Buhid },
|
||||
{ 83, PT_GC, ucp_C },
|
||||
{ 85, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 105, PT_SC, ucp_Carian },
|
||||
{ 112, PT_PC, ucp_Cc },
|
||||
{ 115, PT_PC, ucp_Cf },
|
||||
{ 118, PT_SC, ucp_Cham },
|
||||
{ 123, PT_SC, ucp_Cherokee },
|
||||
{ 132, PT_PC, ucp_Cn },
|
||||
{ 135, PT_PC, ucp_Co },
|
||||
{ 138, PT_SC, ucp_Common },
|
||||
{ 145, PT_SC, ucp_Coptic },
|
||||
{ 152, PT_PC, ucp_Cs },
|
||||
{ 155, PT_SC, ucp_Cuneiform },
|
||||
{ 165, PT_SC, ucp_Cypriot },
|
||||
{ 173, PT_SC, ucp_Cyrillic },
|
||||
{ 182, PT_SC, ucp_Deseret },
|
||||
{ 190, PT_SC, ucp_Devanagari },
|
||||
{ 201, PT_SC, ucp_Egyptian_Hieroglyphs },
|
||||
{ 222, PT_SC, ucp_Ethiopic },
|
||||
{ 231, PT_SC, ucp_Georgian },
|
||||
{ 240, PT_SC, ucp_Glagolitic },
|
||||
{ 251, PT_SC, ucp_Gothic },
|
||||
{ 258, PT_SC, ucp_Greek },
|
||||
{ 264, PT_SC, ucp_Gujarati },
|
||||
{ 273, PT_SC, ucp_Gurmukhi },
|
||||
{ 282, PT_SC, ucp_Han },
|
||||
{ 286, PT_SC, ucp_Hangul },
|
||||
{ 293, PT_SC, ucp_Hanunoo },
|
||||
{ 301, PT_SC, ucp_Hebrew },
|
||||
{ 308, PT_SC, ucp_Hiragana },
|
||||
{ 317, PT_SC, ucp_Imperial_Aramaic },
|
||||
{ 334, PT_SC, ucp_Inherited },
|
||||
{ 344, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 366, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 389, PT_SC, ucp_Javanese },
|
||||
{ 398, PT_SC, ucp_Kaithi },
|
||||
{ 405, PT_SC, ucp_Kannada },
|
||||
{ 413, PT_SC, ucp_Katakana },
|
||||
{ 422, PT_SC, ucp_Kayah_Li },
|
||||
{ 431, PT_SC, ucp_Kharoshthi },
|
||||
{ 442, PT_SC, ucp_Khmer },
|
||||
{ 448, PT_GC, ucp_L },
|
||||
{ 450, PT_LAMP, 0 },
|
||||
{ 453, PT_SC, ucp_Lao },
|
||||
{ 457, PT_SC, ucp_Latin },
|
||||
{ 463, PT_SC, ucp_Lepcha },
|
||||
{ 470, PT_SC, ucp_Limbu },
|
||||
{ 476, PT_SC, ucp_Linear_B },
|
||||
{ 485, PT_SC, ucp_Lisu },
|
||||
{ 490, PT_PC, ucp_Ll },
|
||||
{ 493, PT_PC, ucp_Lm },
|
||||
{ 496, PT_PC, ucp_Lo },
|
||||
{ 499, PT_PC, ucp_Lt },
|
||||
{ 502, PT_PC, ucp_Lu },
|
||||
{ 505, PT_SC, ucp_Lycian },
|
||||
{ 512, PT_SC, ucp_Lydian },
|
||||
{ 519, PT_GC, ucp_M },
|
||||
{ 521, PT_SC, ucp_Malayalam },
|
||||
{ 531, PT_PC, ucp_Mc },
|
||||
{ 534, PT_PC, ucp_Me },
|
||||
{ 537, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 550, PT_PC, ucp_Mn },
|
||||
{ 553, PT_SC, ucp_Mongolian },
|
||||
{ 563, PT_SC, ucp_Myanmar },
|
||||
{ 571, PT_GC, ucp_N },
|
||||
{ 573, PT_PC, ucp_Nd },
|
||||
{ 576, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 588, PT_SC, ucp_Nko },
|
||||
{ 592, PT_PC, ucp_Nl },
|
||||
{ 595, PT_PC, ucp_No },
|
||||
{ 598, PT_SC, ucp_Ogham },
|
||||
{ 604, PT_SC, ucp_Ol_Chiki },
|
||||
{ 613, PT_SC, ucp_Old_Italic },
|
||||
{ 624, PT_SC, ucp_Old_Persian },
|
||||
{ 636, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 654, PT_SC, ucp_Old_Turkic },
|
||||
{ 665, PT_SC, ucp_Oriya },
|
||||
{ 671, PT_SC, ucp_Osmanya },
|
||||
{ 679, PT_GC, ucp_P },
|
||||
{ 681, PT_PC, ucp_Pc },
|
||||
{ 684, PT_PC, ucp_Pd },
|
||||
{ 687, PT_PC, ucp_Pe },
|
||||
{ 690, PT_PC, ucp_Pf },
|
||||
{ 693, PT_SC, ucp_Phags_Pa },
|
||||
{ 702, PT_SC, ucp_Phoenician },
|
||||
{ 713, PT_PC, ucp_Pi },
|
||||
{ 716, PT_PC, ucp_Po },
|
||||
{ 719, PT_PC, ucp_Ps },
|
||||
{ 722, PT_SC, ucp_Rejang },
|
||||
{ 729, PT_SC, ucp_Runic },
|
||||
{ 735, PT_GC, ucp_S },
|
||||
{ 737, PT_SC, ucp_Samaritan },
|
||||
{ 747, PT_SC, ucp_Saurashtra },
|
||||
{ 758, PT_PC, ucp_Sc },
|
||||
{ 761, PT_SC, ucp_Shavian },
|
||||
{ 769, PT_SC, ucp_Sinhala },
|
||||
{ 777, PT_PC, ucp_Sk },
|
||||
{ 780, PT_PC, ucp_Sm },
|
||||
{ 783, PT_PC, ucp_So },
|
||||
{ 786, PT_SC, ucp_Sundanese },
|
||||
{ 796, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 809, PT_SC, ucp_Syriac },
|
||||
{ 816, PT_SC, ucp_Tagalog },
|
||||
{ 824, PT_SC, ucp_Tagbanwa },
|
||||
{ 833, PT_SC, ucp_Tai_Le },
|
||||
{ 840, PT_SC, ucp_Tai_Tham },
|
||||
{ 849, PT_SC, ucp_Tai_Viet },
|
||||
{ 858, PT_SC, ucp_Tamil },
|
||||
{ 864, PT_SC, ucp_Telugu },
|
||||
{ 871, PT_SC, ucp_Thaana },
|
||||
{ 878, PT_SC, ucp_Thai },
|
||||
{ 883, PT_SC, ucp_Tibetan },
|
||||
{ 891, PT_SC, ucp_Tifinagh },
|
||||
{ 900, PT_SC, ucp_Ugaritic },
|
||||
{ 909, PT_SC, ucp_Vai },
|
||||
{ 913, PT_ALNUM, 0 },
|
||||
{ 917, PT_PXSPACE, 0 },
|
||||
{ 921, PT_SPACE, 0 },
|
||||
{ 925, PT_WORD, 0 },
|
||||
{ 929, PT_SC, ucp_Yi },
|
||||
{ 932, PT_GC, ucp_Z },
|
||||
{ 934, PT_PC, ucp_Zl },
|
||||
{ 937, PT_PC, ucp_Zp },
|
||||
{ 940, PT_PC, ucp_Zs }
|
||||
{ 43, PT_SC, ucp_Batak },
|
||||
{ 49, PT_SC, ucp_Bengali },
|
||||
{ 57, PT_SC, ucp_Bopomofo },
|
||||
{ 66, PT_SC, ucp_Brahmi },
|
||||
{ 73, PT_SC, ucp_Braille },
|
||||
{ 81, PT_SC, ucp_Buginese },
|
||||
{ 90, PT_SC, ucp_Buhid },
|
||||
{ 96, PT_GC, ucp_C },
|
||||
{ 98, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 118, PT_SC, ucp_Carian },
|
||||
{ 125, PT_PC, ucp_Cc },
|
||||
{ 128, PT_PC, ucp_Cf },
|
||||
{ 131, PT_SC, ucp_Chakma },
|
||||
{ 138, PT_SC, ucp_Cham },
|
||||
{ 143, PT_SC, ucp_Cherokee },
|
||||
{ 152, PT_PC, ucp_Cn },
|
||||
{ 155, PT_PC, ucp_Co },
|
||||
{ 158, PT_SC, ucp_Common },
|
||||
{ 165, PT_SC, ucp_Coptic },
|
||||
{ 172, PT_PC, ucp_Cs },
|
||||
{ 175, PT_SC, ucp_Cuneiform },
|
||||
{ 185, PT_SC, ucp_Cypriot },
|
||||
{ 193, PT_SC, ucp_Cyrillic },
|
||||
{ 202, PT_SC, ucp_Deseret },
|
||||
{ 210, PT_SC, ucp_Devanagari },
|
||||
{ 221, PT_SC, ucp_Egyptian_Hieroglyphs },
|
||||
{ 242, PT_SC, ucp_Ethiopic },
|
||||
{ 251, PT_SC, ucp_Georgian },
|
||||
{ 260, PT_SC, ucp_Glagolitic },
|
||||
{ 271, PT_SC, ucp_Gothic },
|
||||
{ 278, PT_SC, ucp_Greek },
|
||||
{ 284, PT_SC, ucp_Gujarati },
|
||||
{ 293, PT_SC, ucp_Gurmukhi },
|
||||
{ 302, PT_SC, ucp_Han },
|
||||
{ 306, PT_SC, ucp_Hangul },
|
||||
{ 313, PT_SC, ucp_Hanunoo },
|
||||
{ 321, PT_SC, ucp_Hebrew },
|
||||
{ 328, PT_SC, ucp_Hiragana },
|
||||
{ 337, PT_SC, ucp_Imperial_Aramaic },
|
||||
{ 354, PT_SC, ucp_Inherited },
|
||||
{ 364, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 386, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 409, PT_SC, ucp_Javanese },
|
||||
{ 418, PT_SC, ucp_Kaithi },
|
||||
{ 425, PT_SC, ucp_Kannada },
|
||||
{ 433, PT_SC, ucp_Katakana },
|
||||
{ 442, PT_SC, ucp_Kayah_Li },
|
||||
{ 451, PT_SC, ucp_Kharoshthi },
|
||||
{ 462, PT_SC, ucp_Khmer },
|
||||
{ 468, PT_GC, ucp_L },
|
||||
{ 470, PT_LAMP, 0 },
|
||||
{ 473, PT_SC, ucp_Lao },
|
||||
{ 477, PT_SC, ucp_Latin },
|
||||
{ 483, PT_SC, ucp_Lepcha },
|
||||
{ 490, PT_SC, ucp_Limbu },
|
||||
{ 496, PT_SC, ucp_Linear_B },
|
||||
{ 505, PT_SC, ucp_Lisu },
|
||||
{ 510, PT_PC, ucp_Ll },
|
||||
{ 513, PT_PC, ucp_Lm },
|
||||
{ 516, PT_PC, ucp_Lo },
|
||||
{ 519, PT_PC, ucp_Lt },
|
||||
{ 522, PT_PC, ucp_Lu },
|
||||
{ 525, PT_SC, ucp_Lycian },
|
||||
{ 532, PT_SC, ucp_Lydian },
|
||||
{ 539, PT_GC, ucp_M },
|
||||
{ 541, PT_SC, ucp_Malayalam },
|
||||
{ 551, PT_SC, ucp_Mandaic },
|
||||
{ 559, PT_PC, ucp_Mc },
|
||||
{ 562, PT_PC, ucp_Me },
|
||||
{ 565, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 578, PT_SC, ucp_Meroitic_Cursive },
|
||||
{ 595, PT_SC, ucp_Meroitic_Hieroglyphs },
|
||||
{ 616, PT_SC, ucp_Miao },
|
||||
{ 621, PT_PC, ucp_Mn },
|
||||
{ 624, PT_SC, ucp_Mongolian },
|
||||
{ 634, PT_SC, ucp_Myanmar },
|
||||
{ 642, PT_GC, ucp_N },
|
||||
{ 644, PT_PC, ucp_Nd },
|
||||
{ 647, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 659, PT_SC, ucp_Nko },
|
||||
{ 663, PT_PC, ucp_Nl },
|
||||
{ 666, PT_PC, ucp_No },
|
||||
{ 669, PT_SC, ucp_Ogham },
|
||||
{ 675, PT_SC, ucp_Ol_Chiki },
|
||||
{ 684, PT_SC, ucp_Old_Italic },
|
||||
{ 695, PT_SC, ucp_Old_Persian },
|
||||
{ 707, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 725, PT_SC, ucp_Old_Turkic },
|
||||
{ 736, PT_SC, ucp_Oriya },
|
||||
{ 742, PT_SC, ucp_Osmanya },
|
||||
{ 750, PT_GC, ucp_P },
|
||||
{ 752, PT_PC, ucp_Pc },
|
||||
{ 755, PT_PC, ucp_Pd },
|
||||
{ 758, PT_PC, ucp_Pe },
|
||||
{ 761, PT_PC, ucp_Pf },
|
||||
{ 764, PT_SC, ucp_Phags_Pa },
|
||||
{ 773, PT_SC, ucp_Phoenician },
|
||||
{ 784, PT_PC, ucp_Pi },
|
||||
{ 787, PT_PC, ucp_Po },
|
||||
{ 790, PT_PC, ucp_Ps },
|
||||
{ 793, PT_SC, ucp_Rejang },
|
||||
{ 800, PT_SC, ucp_Runic },
|
||||
{ 806, PT_GC, ucp_S },
|
||||
{ 808, PT_SC, ucp_Samaritan },
|
||||
{ 818, PT_SC, ucp_Saurashtra },
|
||||
{ 829, PT_PC, ucp_Sc },
|
||||
{ 832, PT_SC, ucp_Sharada },
|
||||
{ 840, PT_SC, ucp_Shavian },
|
||||
{ 848, PT_SC, ucp_Sinhala },
|
||||
{ 856, PT_PC, ucp_Sk },
|
||||
{ 859, PT_PC, ucp_Sm },
|
||||
{ 862, PT_PC, ucp_So },
|
||||
{ 865, PT_SC, ucp_Sora_Sompeng },
|
||||
{ 878, PT_SC, ucp_Sundanese },
|
||||
{ 888, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 901, PT_SC, ucp_Syriac },
|
||||
{ 908, PT_SC, ucp_Tagalog },
|
||||
{ 916, PT_SC, ucp_Tagbanwa },
|
||||
{ 925, PT_SC, ucp_Tai_Le },
|
||||
{ 932, PT_SC, ucp_Tai_Tham },
|
||||
{ 941, PT_SC, ucp_Tai_Viet },
|
||||
{ 950, PT_SC, ucp_Takri },
|
||||
{ 956, PT_SC, ucp_Tamil },
|
||||
{ 962, PT_SC, ucp_Telugu },
|
||||
{ 969, PT_SC, ucp_Thaana },
|
||||
{ 976, PT_SC, ucp_Thai },
|
||||
{ 981, PT_SC, ucp_Tibetan },
|
||||
{ 989, PT_SC, ucp_Tifinagh },
|
||||
{ 998, PT_SC, ucp_Ugaritic },
|
||||
{ 1007, PT_SC, ucp_Vai },
|
||||
{ 1011, PT_ALNUM, 0 },
|
||||
{ 1015, PT_PXSPACE, 0 },
|
||||
{ 1019, PT_SPACE, 0 },
|
||||
{ 1023, PT_WORD, 0 },
|
||||
{ 1027, PT_SC, ucp_Yi },
|
||||
{ 1030, PT_GC, ucp_Z },
|
||||
{ 1032, PT_PC, ucp_Zl },
|
||||
{ 1035, PT_PC, ucp_Zp },
|
||||
{ 1038, PT_PC, ucp_Zs }
|
||||
};
|
||||
|
||||
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
|
||||
const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
||||
|
||||
#endif /* SUPPORT_UTF8 */
|
||||
#endif /* SUPPORT_UTF */
|
||||
|
||||
/* End of pcre_tables.c */
|
||||
|
@ -1,137 +0,0 @@
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains an internal function that tests a compiled pattern to
|
||||
see if it was compiled with the opposite endianness. If so, it uses an
|
||||
auxiliary local function to flip the appropriate bytes. */
|
||||
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Flip bytes in an integer *
|
||||
*************************************************/
|
||||
|
||||
/* This function is called when the magic number in a regex doesn't match, in
|
||||
order to flip its bytes to see if we are dealing with a pattern that was
|
||||
compiled on a host of different endianness. If so, this function is used to
|
||||
flip other byte values.
|
||||
|
||||
Arguments:
|
||||
value the number to flip
|
||||
n the number of bytes to flip (assumed to be 2 or 4)
|
||||
|
||||
Returns: the flipped value
|
||||
*/
|
||||
|
||||
static unsigned long int
|
||||
byteflip(unsigned long int value, int n)
|
||||
{
|
||||
if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
|
||||
return ((value & 0x000000ff) << 24) |
|
||||
((value & 0x0000ff00) << 8) |
|
||||
((value & 0x00ff0000) >> 8) |
|
||||
((value & 0xff000000) >> 24);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Test for a byte-flipped compiled regex *
|
||||
*************************************************/
|
||||
|
||||
/* This function is called from pcre_exec(), pcre_dfa_exec(), and also from
|
||||
pcre_fullinfo(). Its job is to test whether the regex is byte-flipped - that
|
||||
is, it was compiled on a system of opposite endianness. The function is called
|
||||
only when the native MAGIC_NUMBER test fails. If the regex is indeed flipped,
|
||||
we flip all the relevant values into a different data block, and return it.
|
||||
|
||||
Arguments:
|
||||
re points to the regex
|
||||
study points to study data, or NULL
|
||||
internal_re points to a new regex block
|
||||
internal_study points to a new study block
|
||||
|
||||
Returns: the new block if it is indeed a byte-flipped regex
|
||||
NULL if it is not
|
||||
*/
|
||||
|
||||
real_pcre *
|
||||
_pcre_try_flipped(const real_pcre *re, real_pcre *internal_re,
|
||||
const pcre_study_data *study, pcre_study_data *internal_study)
|
||||
{
|
||||
if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
|
||||
return NULL;
|
||||
|
||||
*internal_re = *re; /* To copy other fields */
|
||||
internal_re->size = byteflip(re->size, sizeof(re->size));
|
||||
internal_re->options = byteflip(re->options, sizeof(re->options));
|
||||
internal_re->flags = (pcre_uint16)byteflip(re->flags, sizeof(re->flags));
|
||||
internal_re->top_bracket =
|
||||
(pcre_uint16)byteflip(re->top_bracket, sizeof(re->top_bracket));
|
||||
internal_re->top_backref =
|
||||
(pcre_uint16)byteflip(re->top_backref, sizeof(re->top_backref));
|
||||
internal_re->first_byte =
|
||||
(pcre_uint16)byteflip(re->first_byte, sizeof(re->first_byte));
|
||||
internal_re->req_byte =
|
||||
(pcre_uint16)byteflip(re->req_byte, sizeof(re->req_byte));
|
||||
internal_re->name_table_offset =
|
||||
(pcre_uint16)byteflip(re->name_table_offset, sizeof(re->name_table_offset));
|
||||
internal_re->name_entry_size =
|
||||
(pcre_uint16)byteflip(re->name_entry_size, sizeof(re->name_entry_size));
|
||||
internal_re->name_count =
|
||||
(pcre_uint16)byteflip(re->name_count, sizeof(re->name_count));
|
||||
|
||||
if (study != NULL)
|
||||
{
|
||||
*internal_study = *study; /* To copy other fields */
|
||||
internal_study->size = byteflip(study->size, sizeof(study->size));
|
||||
internal_study->flags = byteflip(study->flags, sizeof(study->flags));
|
||||
internal_study->minlength = byteflip(study->minlength,
|
||||
sizeof(study->minlength));
|
||||
}
|
||||
|
||||
return internal_re;
|
||||
}
|
||||
|
||||
/* End of pcre_try_flipped.c */
|
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -52,127 +52,246 @@ strings. */
|
||||
*************************************************/
|
||||
|
||||
/* This function is called (optionally) at the start of compile or match, to
|
||||
validate that a supposed UTF-8 string is actually valid. The early check means
|
||||
check that a supposed UTF-8 string is actually valid. The early check means
|
||||
that subsequent code can assume it is dealing with a valid string. The check
|
||||
can be turned off for maximum performance, but the consequences of supplying
|
||||
an invalid string are then undefined.
|
||||
can be turned off for maximum performance, but the consequences of supplying an
|
||||
invalid string are then undefined.
|
||||
|
||||
Originally, this function checked according to RFC 2279, allowing for values in
|
||||
the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
|
||||
the canonical format. Once somebody had pointed out RFC 3629 to me (it
|
||||
obsoletes 2279), additional restrictions were applied. The values are now
|
||||
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
|
||||
subrange 0xd000 to 0xdfff is excluded.
|
||||
subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
|
||||
characters is still checked.
|
||||
|
||||
From release 8.13 more information about the details of the error are passed
|
||||
back in the returned value:
|
||||
|
||||
PCRE_UTF8_ERR0 No error
|
||||
PCRE_UTF8_ERR1 Missing 1 byte at the end of the string
|
||||
PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string
|
||||
PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string
|
||||
PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string
|
||||
PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string
|
||||
PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629
|
||||
PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629
|
||||
PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
|
||||
PCRE_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
|
||||
PCRE_UTF8_ERR15 Overlong 2-byte sequence
|
||||
PCRE_UTF8_ERR16 Overlong 3-byte sequence
|
||||
PCRE_UTF8_ERR17 Overlong 4-byte sequence
|
||||
PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
|
||||
PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
|
||||
PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
|
||||
PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
|
||||
|
||||
Arguments:
|
||||
string points to the string
|
||||
length length of string, or -1 if the string is zero-terminated
|
||||
errp pointer to an error position offset variable
|
||||
|
||||
Returns: < 0 if the string is a valid UTF-8 string
|
||||
>= 0 otherwise; the value is the offset of the bad byte
|
||||
|
||||
Bad bytes can be:
|
||||
|
||||
. An isolated byte whose most significant bits are 0x80, because this
|
||||
can only correctly appear within a UTF-8 character;
|
||||
|
||||
. A byte whose most significant bits are 0xc0, but whose other bits indicate
|
||||
that there are more than 3 additional bytes (i.e. an RFC 2279 starting
|
||||
byte, which is no longer valid under RFC 3629);
|
||||
|
||||
.
|
||||
|
||||
The returned offset may also be equal to the length of the string; this means
|
||||
that one or more bytes is missing from the final UTF-8 character.
|
||||
Returns: = 0 if the string is a valid UTF-8 string
|
||||
> 0 otherwise, setting the offset of the bad character
|
||||
*/
|
||||
|
||||
int
|
||||
_pcre_valid_utf8(USPTR string, int length)
|
||||
PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
register USPTR p;
|
||||
#ifdef SUPPORT_UTF
|
||||
register PCRE_PUCHAR p;
|
||||
|
||||
if (length < 0)
|
||||
{
|
||||
for (p = string; *p != 0; p++);
|
||||
length = p - string;
|
||||
length = (int)(p - string);
|
||||
}
|
||||
|
||||
for (p = string; length-- > 0; p++)
|
||||
{
|
||||
register int ab;
|
||||
register int c = *p;
|
||||
if (c < 128) continue;
|
||||
if (c < 0xc0) return p - string;
|
||||
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
||||
if (ab > 3) return p - string; /* Too many for RFC 3629 */
|
||||
if (length < ab) return p + 1 + length - string; /* Missing bytes */
|
||||
length -= ab;
|
||||
register int ab, c, d;
|
||||
|
||||
c = *p;
|
||||
if (c < 128) continue; /* ASCII character */
|
||||
|
||||
if (c < 0xc0) /* Isolated 10xx xxxx byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string);
|
||||
return PCRE_UTF8_ERR20;
|
||||
}
|
||||
|
||||
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
|
||||
{
|
||||
*erroroffset = (int)(p - string);
|
||||
return PCRE_UTF8_ERR21;
|
||||
}
|
||||
|
||||
ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */
|
||||
if (length < ab)
|
||||
{
|
||||
*erroroffset = (int)(p - string); /* Missing bytes */
|
||||
return ab - length; /* Codes ERR1 to ERR5 */
|
||||
}
|
||||
length -= ab; /* Length remaining */
|
||||
|
||||
/* Check top bits in the second byte */
|
||||
if ((*(++p) & 0xc0) != 0x80) return p - string;
|
||||
|
||||
/* Check for overlong sequences for each different length, and for the
|
||||
excluded range 0xd000 to 0xdfff. */
|
||||
if (((d = *(++p)) & 0xc0) != 0x80)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
return PCRE_UTF8_ERR6;
|
||||
}
|
||||
|
||||
/* For each length, check that the remaining bytes start with the 0x80 bit
|
||||
set and not the 0x40 bit. Then check for an overlong sequence, and for the
|
||||
excluded range 0xd800 to 0xdfff. */
|
||||
|
||||
switch (ab)
|
||||
{
|
||||
/* Check for xx00 000x (overlong sequence) */
|
||||
/* 2-byte character. No further bytes to check for 0x80. Check first byte
|
||||
for xx00 000x (overlong sequence). */
|
||||
|
||||
case 1:
|
||||
if ((c & 0x3e) == 0) return p - string;
|
||||
continue; /* We know there aren't any more bytes to check */
|
||||
case 1: if ((c & 0x3e) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
return PCRE_UTF8_ERR15;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Check for 1110 0000, xx0x xxxx (overlong sequence) or
|
||||
1110 1101, 1010 xxxx (0xd000 - 0xdfff) */
|
||||
/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
|
||||
for 1110 0000, xx0x xxxx (overlong sequence) or
|
||||
1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
|
||||
|
||||
case 2:
|
||||
if ((c == 0xe0 && (*p & 0x20) == 0) ||
|
||||
(c == 0xed && *p >= 0xa0))
|
||||
return p - string;
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR7;
|
||||
}
|
||||
if (c == 0xe0 && (d & 0x20) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR16;
|
||||
}
|
||||
if (c == 0xed && d >= 0xa0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR14;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Check for 1111 0000, xx00 xxxx (overlong sequence) or
|
||||
greater than 0x0010ffff (f4 8f bf bf) */
|
||||
/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
|
||||
bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
|
||||
character greater than 0x0010ffff (f4 8f bf bf) */
|
||||
|
||||
case 3:
|
||||
if ((c == 0xf0 && (*p & 0x30) == 0) ||
|
||||
(c > 0xf4 ) ||
|
||||
(c == 0xf4 && *p > 0x8f))
|
||||
return p - string;
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR8;
|
||||
}
|
||||
if (c == 0xf0 && (d & 0x30) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR17;
|
||||
}
|
||||
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR13;
|
||||
}
|
||||
break;
|
||||
|
||||
#if 0
|
||||
/* These cases can no longer occur, as we restrict to a maximum of four
|
||||
bytes nowadays. Leave the code here in case we ever want to add an option
|
||||
for longer sequences. */
|
||||
/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
|
||||
rejected by the length test below. However, we do the appropriate tests
|
||||
here so that overlong sequences get diagnosed, and also in case there is
|
||||
ever an option for handling these larger code points. */
|
||||
|
||||
/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
|
||||
1111 1000, xx00 0xxx */
|
||||
|
||||
/* Check for 1111 1000, xx00 0xxx */
|
||||
case 4:
|
||||
if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE_UTF8_ERR9;
|
||||
}
|
||||
if (c == 0xf8 && (d & 0x38) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE_UTF8_ERR18;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
|
||||
/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
|
||||
1111 1100, xx00 00xx. */
|
||||
|
||||
case 5:
|
||||
if (c == 0xfe || c == 0xff ||
|
||||
(c == 0xfc && (*p & 0x3c) == 0)) return p - string;
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE_UTF8_ERR9;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
return PCRE_UTF8_ERR10;
|
||||
}
|
||||
if (c == 0xfc && (d & 0x3c) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
return PCRE_UTF8_ERR19;
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
/* Check for valid bytes after the 2nd, if any; all must start 10 */
|
||||
while (--ab > 0)
|
||||
/* Character is valid under RFC 2279, but 5-byte and 6-byte characters are
|
||||
excluded by RFC 3629. The pointer p is currently at the last byte of the
|
||||
character. */
|
||||
|
||||
if (ab > 3)
|
||||
{
|
||||
if ((*(++p) & 0xc0) != 0x80) return p - string;
|
||||
*erroroffset = (int)(p - string) - ab;
|
||||
return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
#else /* SUPPORT_UTF */
|
||||
(void)(string); /* Keep picky compilers happy */
|
||||
(void)(length);
|
||||
#endif
|
||||
|
||||
return -1;
|
||||
return PCRE_UTF8_ERR0; /* This indicates success */
|
||||
}
|
||||
|
||||
/* End of pcre_valid_utf8.c */
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -77,8 +77,13 @@ I could find no way of detecting that a macro is defined as an empty string at
|
||||
pre-processor time. This hack uses a standard trick for avoiding calling
|
||||
the STRING macro with an empty argument when doing the test. */
|
||||
|
||||
#ifdef COMPILE_PCRE8
|
||||
PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
|
||||
pcre_version(void)
|
||||
#else
|
||||
PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
|
||||
pcre16_version(void)
|
||||
#endif
|
||||
{
|
||||
return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
|
||||
XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -62,39 +62,63 @@ Returns: TRUE if character matches, else FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
_pcre_xclass(int c, const uschar *data)
|
||||
PRIV(xclass)(int c, const pcre_uchar *data, BOOL utf)
|
||||
{
|
||||
int t;
|
||||
BOOL negated = (*data & XCL_NOT) != 0;
|
||||
|
||||
(void)utf;
|
||||
#ifdef COMPILE_PCRE8
|
||||
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
|
||||
utf = TRUE;
|
||||
#endif
|
||||
|
||||
/* Character values < 256 are matched against a bitmap, if one is present. If
|
||||
not, we still carry on, because there may be ranges that start below 256 in the
|
||||
additional data. */
|
||||
|
||||
if (c < 256)
|
||||
{
|
||||
if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
|
||||
return !negated; /* char found */
|
||||
if ((*data & XCL_MAP) != 0 &&
|
||||
(((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
|
||||
return !negated; /* char found */
|
||||
}
|
||||
|
||||
/* First skip the bit map if present. Then match against the list of Unicode
|
||||
properties or large chars or ranges that end with a large char. We won't ever
|
||||
encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
|
||||
|
||||
if ((*data++ & XCL_MAP) != 0) data += 32;
|
||||
if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);
|
||||
|
||||
while ((t = *data++) != XCL_END)
|
||||
{
|
||||
int x, y;
|
||||
if (t == XCL_SINGLE)
|
||||
{
|
||||
GETCHARINC(x, data);
|
||||
#ifdef SUPPORT_UTF
|
||||
if (utf)
|
||||
{
|
||||
GETCHARINC(x, data); /* macro generates multiple statements */
|
||||
}
|
||||
else
|
||||
#endif
|
||||
x = *data++;
|
||||
if (c == x) return !negated;
|
||||
}
|
||||
else if (t == XCL_RANGE)
|
||||
{
|
||||
GETCHARINC(x, data);
|
||||
GETCHARINC(y, data);
|
||||
#ifdef SUPPORT_UTF
|
||||
if (utf)
|
||||
{
|
||||
GETCHARINC(x, data); /* macro generates multiple statements */
|
||||
GETCHARINC(y, data); /* macro generates multiple statements */
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
x = *data++;
|
||||
y = *data++;
|
||||
}
|
||||
if (c >= x && c <= y) return !negated;
|
||||
}
|
||||
|
||||
@ -115,7 +139,7 @@ while ((t = *data++) != XCL_END)
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP))
|
||||
if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
@ -128,28 +152,28 @@ while ((t = *data++) != XCL_END)
|
||||
break;
|
||||
|
||||
case PT_ALNUM:
|
||||
if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[prop->chartype] == ucp_N) == (t == XCL_PROP))
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_SPACE: /* Perl space */
|
||||
if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
|
||||
== (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_PXSPACE: /* POSIX space */
|
||||
if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
||||
c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
|
||||
== (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
@ -248,7 +248,7 @@ if (namecount <= 0) printf("No named substrings\n"); else
|
||||
* more than one byte. *
|
||||
* *
|
||||
* However, there is a complication concerned with newlines. When the *
|
||||
* newline convention is such that CRLF is a valid newline, we want must *
|
||||
* newline convention is such that CRLF is a valid newline, we must *
|
||||
* advance by two characters rather than one. The newline convention can *
|
||||
* be set in the regex by (*CR), etc.; if not, we must find the default. *
|
||||
*************************************************************************/
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -150,6 +150,16 @@ static const int eint[] = {
|
||||
REG_BADPAT, /* (*MARK) must have an argument */
|
||||
REG_INVARG, /* this version of PCRE is not compiled with PCRE_UCP support */
|
||||
REG_BADPAT, /* \c must be followed by an ASCII character */
|
||||
REG_BADPAT, /* \k is not followed by a braced, angle-bracketed, or quoted name */
|
||||
/* 70 */
|
||||
REG_BADPAT, /* internal error: unknown opcode in find_fixedlength() */
|
||||
REG_BADPAT, /* \N is not supported in a class */
|
||||
REG_BADPAT, /* too many forward references */
|
||||
REG_BADPAT, /* disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) */
|
||||
REG_BADPAT, /* invalid UTF-16 string (should not occur) */
|
||||
/* 75 */
|
||||
REG_BADPAT, /* overlong MARK name */
|
||||
REG_BADPAT /* character value in \u.... sequence is too large */
|
||||
};
|
||||
|
||||
/* Table of texts corresponding to POSIX error codes */
|
||||
@ -220,7 +230,7 @@ return length + addlength;
|
||||
PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION
|
||||
regfree(regex_t *preg)
|
||||
{
|
||||
(pcre_free)(preg->re_pcre);
|
||||
(PUBL(free))(preg->re_pcre);
|
||||
}
|
||||
|
||||
|
||||
@ -265,11 +275,12 @@ should not happen, but we all make mistakes), return REG_BADPAT. */
|
||||
|
||||
if (preg->re_pcre == NULL)
|
||||
{
|
||||
return (errorcode < sizeof(eint)/sizeof(const int))?
|
||||
return (errorcode < (int)(sizeof(eint)/sizeof(const int)))?
|
||||
eint[errorcode] : REG_BADPAT;
|
||||
}
|
||||
|
||||
preg->re_nsub = pcre_info((const pcre *)preg->re_pcre, NULL, NULL);
|
||||
(void)pcre_fullinfo((const pcre *)preg->re_pcre, NULL, PCRE_INFO_CAPTURECOUNT,
|
||||
&(preg->re_nsub));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -395,6 +406,7 @@ switch(rc)
|
||||
case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE;
|
||||
case PCRE_ERROR_BADUTF8: return REG_INVARG;
|
||||
case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG;
|
||||
case PCRE_ERROR_BADMODE: return REG_INVARG;
|
||||
default: return REG_ASSERT;
|
||||
}
|
||||
}
|
||||
|
@ -9,7 +9,7 @@
|
||||
Compatible Regular Expression library. It defines the things POSIX says should
|
||||
be there. I hope.
|
||||
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2012 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
2
ext/pcre/pcrelib/testdata/grepinput
vendored
2
ext/pcre/pcrelib/testdata/grepinput
vendored
@ -602,6 +602,8 @@ ABOVE theatre
|
||||
AB.VE
|
||||
AB.VE the turtle
|
||||
|
||||
010203040506
|
||||
|
||||
PUT NEW DATA ABOVE THIS LINE.
|
||||
=============================
|
||||
|
||||
|
3
ext/pcre/pcrelib/testdata/grepinput8
vendored
3
ext/pcre/pcrelib/testdata/grepinput8
vendored
@ -1,6 +1,5 @@
|
||||
X one
|
||||
X twoX threeX four
|
||||
X five
|
||||
X twoX threeX four
X five
|
||||
X six
|
||||
X seven
X eight
X nine
X ten
|
||||
|
||||
|
99
ext/pcre/pcrelib/testdata/grepoutput
vendored
99
ext/pcre/pcrelib/testdata/grepoutput
vendored
@ -10,7 +10,7 @@ RC=0
|
||||
7:PATTERN at the start of a line.
|
||||
8:In the middle of a line, PATTERN appears.
|
||||
10:This pattern is in lower case.
|
||||
608:Check up on PATTERN near the end.
|
||||
610:Check up on PATTERN near the end.
|
||||
RC=0
|
||||
---------------------------- Test 4 ------------------------------
|
||||
4
|
||||
@ -19,7 +19,7 @@ RC=0
|
||||
./testdata/grepinput:7:PATTERN at the start of a line.
|
||||
./testdata/grepinput:8:In the middle of a line, PATTERN appears.
|
||||
./testdata/grepinput:10:This pattern is in lower case.
|
||||
./testdata/grepinput:608:Check up on PATTERN near the end.
|
||||
./testdata/grepinput:610:Check up on PATTERN near the end.
|
||||
./testdata/grepinputx:3:Here is the pattern again.
|
||||
./testdata/grepinputx:5:Pattern
|
||||
./testdata/grepinputx:42:This line contains pattern not on a line by itself.
|
||||
@ -28,7 +28,7 @@ RC=0
|
||||
7:PATTERN at the start of a line.
|
||||
8:In the middle of a line, PATTERN appears.
|
||||
10:This pattern is in lower case.
|
||||
608:Check up on PATTERN near the end.
|
||||
610:Check up on PATTERN near the end.
|
||||
3:Here is the pattern again.
|
||||
5:Pattern
|
||||
42:This line contains pattern not on a line by itself.
|
||||
@ -323,10 +323,10 @@ RC=0
|
||||
./testdata/grepinput-9-
|
||||
./testdata/grepinput:10:This pattern is in lower case.
|
||||
--
|
||||
./testdata/grepinput-605-PUT NEW DATA ABOVE THIS LINE.
|
||||
./testdata/grepinput-606-=============================
|
||||
./testdata/grepinput-607-
|
||||
./testdata/grepinput:608:Check up on PATTERN near the end.
|
||||
./testdata/grepinput-607-PUT NEW DATA ABOVE THIS LINE.
|
||||
./testdata/grepinput-608-=============================
|
||||
./testdata/grepinput-609-
|
||||
./testdata/grepinput:610:Check up on PATTERN near the end.
|
||||
--
|
||||
./testdata/grepinputx-1-This is a second file of input for the pcregrep tests.
|
||||
./testdata/grepinputx-2-
|
||||
@ -348,8 +348,8 @@ RC=0
|
||||
./testdata/grepinput-12-Here follows a whole lot of stuff that makes the file over 24K long.
|
||||
./testdata/grepinput-13-
|
||||
--
|
||||
./testdata/grepinput:608:Check up on PATTERN near the end.
|
||||
./testdata/grepinput-609-This is the last line of this file.
|
||||
./testdata/grepinput:610:Check up on PATTERN near the end.
|
||||
./testdata/grepinput-611-This is the last line of this file.
|
||||
--
|
||||
./testdata/grepinputx:3:Here is the pattern again.
|
||||
./testdata/grepinputx-4-
|
||||
@ -380,6 +380,7 @@ RC=0
|
||||
---------------------------- Test 37 -----------------------------
|
||||
aaaaa0
|
||||
aaaaa2
|
||||
010203040506
|
||||
RC=0
|
||||
======== STDERR ========
|
||||
pcregrep: pcre_exec() gave error -8 while matching this text:
|
||||
@ -390,7 +391,7 @@ pcregrep: pcre_exec() gave error -8 while matching this text:
|
||||
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
|
||||
pcregrep: Error -8 or -21 means that a resource limit was exceeded.
|
||||
pcregrep: Error -8, -21 or -27 means that a resource limit was exceeded.
|
||||
pcregrep: Check your regex for nested unlimited loops.
|
||||
---------------------------- Test 38 ------------------------------
|
||||
This line contains a binary zero here > |