Upgraded bundled PCRE to version 8.02.

2024-11-24 10:24:11 +08:00 · 2010-03-29 11:58:06 +00:00 · 2010-03-29 11:58:06 +00:00 · 6e92347ddf
commit 6e92347ddf
parent 71ec12cc78
19 changed files with 3612 additions and 3044 deletions
--- a/2
+++ b/2
@ -1,6 +1,8 @@
 PHP                                                                        NEWS
 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 ?? ??? 201?, PHP 5.3.99
+- Upgraded bundled PCRE to version 8.02. (Ilia)
+
 - Added Tokyo Cabinet abstract DB support to ext/dba. (Michael Maclean)
 - Added Jenkins's one-at-a-time hash support to ext/hash. (Martin Jansen)
 - Added FNV-1 hash support to ext/hash. (Michael Maclean)
--- a/ext/pcre/pcrelib/ChangeLog
+++ b/ext/pcre/pcrelib/ChangeLog
@ -1,6 +1,71 @@
 ChangeLog for PCRE
 ------------------

+Version 8.02 19-Mar-2010
+------------------------
+
+1.  The Unicode data tables have been updated to Unicode 5.2.0.
+
+2.  Added the option --libs-cpp to pcre-config, but only when C++ support is
+    configured.
+
+3.  Updated the licensing terms in the pcregexp.pas file, as agreed with the
+    original author of that file, following a query about its status.
+
+4.  On systems that do not have stdint.h (e.g. Solaris), check for and include
+    inttypes.h instead. This fixes a bug that was introduced by change 8.01/8.
+
+5.  A pattern such as (?&t)*+(?(DEFINE)(?<t>.)) which has a possessive
+    quantifier applied to a forward-referencing subroutine call, could compile
+    incorrect code or give the error "internal error: previously-checked
+    referenced subpattern not found".
+
+6.  Both MS Visual Studio and Symbian OS have problems with initializing
+    variables to point to external functions. For these systems, therefore,
+    pcre_malloc etc. are now initialized to local functions that call the
+    relevant global functions.
+
+7.  There were two entries missing in the vectors called coptable and poptable
+    in pcre_dfa_exec.c. This could lead to memory accesses outside the vectors.
+    I've fixed the data, and added a kludgy way of testing at compile time that
+    the lengths are correct (equal to the number of opcodes).
+
+8.  Following on from 7, I added a similar kludge to check the length of the
+    eint vector in pcreposix.c.
+
+9.  Error texts for pcre_compile() are held as one long string to avoid too
+    much relocation at load time. To find a text, the string is searched,
+    counting zeros. There was no check for running off the end of the string,
+    which could happen if a new error number was added without updating the
+    string.
+
+10. \K gave a compile-time error if it appeared in a lookbehind assertion.
+
+11. \K was not working if it appeared in an atomic group or in a group that
+    was called as a "subroutine", or in an assertion. Perl 5.11 documents that
+    \K is "not well defined" if used in an assertion. PCRE now accepts it if
+    the assertion is positive, but not if it is negative.
+
+12. Change 11 fortuitously reduced the size of the stack frame used in the
+    "match()" function of pcre_exec.c by one pointer. Forthcoming
+    implementation of support for (*MARK) will need an extra pointer on the
+    stack; I have reserved it now, so that the stack frame size does not
+    decrease.
+
+13. A pattern such as (?P<L1>(?P<L2>0)|(?P>L2)(?P>L1)) in which the only other
+    item in a branch that calls a recursion is a subroutine call - as in the
+    second branch in the above example - was incorrectly given the compile-
+    time error "recursive call could loop indefinitely" because pcre_compile()
+    was not correctly checking the subroutine for matching a non-empty string.
+
+14. The checks for overrunning compiling workspace could trigger after an
+    overrun had occurred. This is a "should never occur" error, but it can be
+    triggered by pathological patterns such as hundreds of nested parentheses.
+    The checks now trigger 100 bytes before the end of the workspace.
+
+15. Fix typo in configure.ac: "srtoq" should be "strtoq".
+
+
 Version 8.01 19-Jan-2010
 ------------------------

--- a/ext/pcre/pcrelib/NEWS
+++ b/ext/pcre/pcrelib/NEWS
@ -1,6 +1,12 @@
 News about PCRE releases
 ------------------------

+Release 8.02 19-Mar-2010
+------------------------
+
+Another bug-fix release.
+
+
 Release 8.01 19-Jan-2010
 ------------------------

--- a/ext/pcre/pcrelib/config.h
+++ b/ext/pcre/pcrelib/config.h
@ -148,12 +148,12 @@ them both to 0; an emulation function will be used. */
 /* #undef HAVE_STRTOIMAX */

 /* Define to 1 if you have `strtoll'. */
-#ifndef HAVE_STRTOLL
-#define HAVE_STRTOLL 1
-#endif
+/* #undef HAVE_STRTOLL */

 /* Define to 1 if you have `strtoq'. */
-/* #undef HAVE_STRTOQ */
+#ifndef HAVE_STRTOQ
+#define HAVE_STRTOQ 1
+#endif

 /* Define to 1 if you have the <sys/stat.h> header file. */
 #ifndef HAVE_SYS_STAT_H
@ -271,13 +271,13 @@ them both to 0; an emulation function will be used. */
 #define PACKAGE_NAME "PCRE"

 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "PCRE 8.01"
+#define PACKAGE_STRING "PCRE 8.02"

 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "pcre"

 /* Define to the version of this package. */
-#define PACKAGE_VERSION "8.01"
+#define PACKAGE_VERSION "8.02"


 /* If you are compiling for a system other than a Unix-like system or
@ -333,7 +333,7 @@ them both to 0; an emulation function will be used. */

 /* Version number of package */
 #ifndef VERSION
-#define VERSION "8.01"
+#define VERSION "8.02"
 #endif

 /* Define to empty if `const' does not conform to ANSI C. */
--- a/ext/pcre/pcrelib/doc/pcre.txt
+++ b/ext/pcre/pcrelib/doc/pcre.txt
--- a/ext/pcre/pcrelib/pcre.h
+++ b/ext/pcre/pcrelib/pcre.h
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
 /* The current PCRE version information. */

 #define PCRE_MAJOR          8
-#define PCRE_MINOR          01
+#define PCRE_MINOR          02
 #define PCRE_PRERELEASE     
-#define PCRE_DATE           2010-01-19
+#define PCRE_DATE           2010-03-19

 /* When an application links to a PCRE DLL in Windows, the symbols that are
 imported have to be identified as such. When building PCRE, the appropriate
--- a/ext/pcre/pcrelib/pcre_compile.c
+++ b/ext/pcre/pcrelib/pcre_compile.c
@ -90,6 +90,11 @@ is 4 there is plenty of room. */

 #define COMPILE_WORK_SIZE (4096)

+/* The overrun tests check for a slightly smaller size so that they detect the
+overrun before it actually does run off the end of the data block. */
+
+#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
+

 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 are simple data values; negative values are for special things like \d and so
@ -261,7 +266,11 @@ the number of relocations needed when a shared library is loaded dynamically,
 it is now one long string. We cannot use a table of offsets, because the
 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 simply count through to the one we want - this isn't a performance issue
-because these strings are used only when there is a compilation error. */
+because these strings are used only when there is a compilation error.
+
+Each substring ends with \0 to insert a null character. This includes the final
+substring, so that the whole string ends with \0\0, which can be detected when
+counting through. */

 static const char error_texts[] =
  "no error\0"
@ -342,8 +351,7 @@ static const char error_texts[] =
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
-  "different names for subpatterns of the same number are not allowed";
-
+  "different names for subpatterns of the same number are not allowed\0";

 /* Table to identify digits and hex digits. This is used when compiling
 patterns. Note that the tables in chartables are dependent on the locale, and
@ -501,7 +509,11 @@ static const char *
 find_error_text(int n)
 {
 const char *s = error_texts;
-for (; n > 0; n--) while (*s++ != 0) {};
+for (; n > 0; n--)
+  {
+  while (*s++ != 0) {};
+  if (*s == 0) return "Error text not found (please report)";
+  }
 return s;
 }

@ -1441,6 +1453,7 @@ for (;;)
    case OP_CALLOUT:
    case OP_SOD:
    case OP_SOM:
+    case OP_SET_SOM:
    case OP_EOD:
    case OP_EODN:
    case OP_CIRC:
@ -1775,12 +1788,14 @@ Arguments:
  code        points to start of search
  endcode     points to where to stop
  utf8        TRUE if in UTF8 mode
+  cd          contains pointers to tables etc.

 Returns:      TRUE if what is matched could be empty
 */

 static BOOL
-could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
+could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
+  compile_data *cd)
 {
 register int c;
 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
@ -1811,6 +1826,28 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
    continue;
    }

+  /* For a recursion/subroutine call, if its end has been reached, which
+  implies a subroutine call, we can scan it. */
+
+  if (c == OP_RECURSE)
+    {
+    BOOL empty_branch = FALSE;
+    const uschar *scode = cd->start_code + GET(code, 1);
+    if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
+    do
+      {
+      if (could_be_empty_branch(scode, endcode, utf8, cd))
+        {
+        empty_branch = TRUE;
+        break;
+        }
+      scode += GET(scode, 1);
+      }
+    while (*scode == OP_ALT);
+    if (!empty_branch) return FALSE;  /* All branches are non-empty */
+    continue;
+    }
+
  /* For other groups, scan the branches. */

  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
@ -1829,7 +1866,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
      empty_branch = FALSE;
      do
        {
-        if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
+        if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
          empty_branch = TRUE;
        code += GET(code, 1);
        }
@ -1963,6 +2000,11 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
    if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
    break;
 #endif
+
+    /* None of the remaining opcodes are required to match a character. */
+
+    default:
+    break;
    }
  }

@ -1985,17 +2027,18 @@ Arguments:
  endcode     points to where to stop (current RECURSE item)
  bcptr       points to the chain of current (unclosed) branch starts
  utf8        TRUE if in UTF-8 mode
+  cd          pointers to tables etc

 Returns:      TRUE if what is matched could be empty
 */

 static BOOL
 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
-  BOOL utf8)
+  BOOL utf8, compile_data *cd)
 {
 while (bcptr != NULL && bcptr->current_branch >= code)
  {
-  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8))
+  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
    return FALSE;
  bcptr = bcptr->outer;
  }
@ -2720,7 +2763,7 @@ for (;; ptr++)
 #ifdef PCRE_DEBUG
    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
 #endif
-    if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
+    if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
      {
      *errorcodeptr = ERR52;
      goto FAILED;
@ -2769,7 +2812,7 @@ for (;; ptr++)
  /* In the real compile phase, just check the workspace used by the forward
  reference list. */

-  else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
+  else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
    {
    *errorcodeptr = ERR52;
    goto FAILED;
@ -4353,7 +4396,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
          uschar *scode = bracode;
          do
            {
-            if (could_be_empty_branch(scode, ketcode, utf8))
+            if (could_be_empty_branch(scode, ketcode, utf8, cd))
              {
              *bracode += OP_SBRA - OP_BRA;
              break;
@ -4428,7 +4471,12 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;

+        /* Because we are moving code along, we must ensure that any
+        pending recursive references are updated. */
+
        default:
+        *code = OP_END;
+        adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
        memmove(tempcode + 1+LINK_SIZE, tempcode, len);
        code += 1 + LINK_SIZE;
        len += 1 + LINK_SIZE;
@ -5147,6 +5195,11 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
                *errorcodeptr = ERR15;
                goto FAILED;
                }
+
+              /* Fudge the value of "called" so that when it is inserted as an
+              offset below, what is actually inserted is the reference number
+              of the group. */
+
              called = cd->start_code + recno;
              PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
              }
@ -5156,7 +5209,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
            recursion that could loop for ever, and diagnose that case. */

            else if (GET(called, 1) == 0 &&
-                     could_be_empty(called, code, bcptr, utf8))
+                     could_be_empty(called, code, bcptr, utf8, cd))
              {
              *errorcodeptr = ERR40;
              goto FAILED;
@ -6802,7 +6855,6 @@ if (reqbyte >= 0 &&
 case when building a production library. */

 #ifdef PCRE_DEBUG
-
 printf("Length = %d top_bracket = %d top_backref = %d\n",
  length, re->top_bracket, re->top_backref);

--- a/ext/pcre/pcrelib/pcre_exec.c
+++ b/ext/pcre/pcrelib/pcre_exec.c
@ -247,7 +247,7 @@ enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,

 /* These versions of the macros use the stack, as normal. There are debugging
 versions and production versions. Note that the "rw" argument of RMATCH isn't
-actuall used in this definition. */
+actually used in this definition. */

 #ifndef NO_RECURSE
 #define REGISTER register
@ -256,7 +256,7 @@ actuall used in this definition. */
 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
-  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
+  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
 #define RRETURN(ra) \
@ -266,7 +266,7 @@ actuall used in this definition. */
  }
 #else
 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
-  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
+  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
 #define RRETURN(ra) return ra
 #endif

@ -286,6 +286,7 @@ argument of match(), which never changes. */
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
+  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
@ -323,6 +324,7 @@ typedef struct heapframe {
  USPTR Xeptr;
  const uschar *Xecode;
  USPTR Xmstart;
+  USPTR Xmarkptr;
  int Xoffset_top;
  long int Xims;
  eptrblock *Xeptrb;
@ -430,6 +432,7 @@ Arguments:
   ecode       pointer to current position in compiled code
   mstart      pointer to the current match start position (can be modified
                 by encountering \K)
+   markptr     pointer to the most recent MARK name, or NULL
   offset_top  current top pointer
   md          pointer to "static" info for the match
   ims         current /i, /m, and /s options
@ -448,9 +451,9 @@ Returns:       MATCH_MATCH if matched            )  these values are >= 0
 */

 static int
-match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
-  int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
-  int flags, unsigned int rdepth)
+match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR
+  markptr, int offset_top, match_data *md, unsigned long int ims,
+  eptrblock *eptrb, int flags, unsigned int rdepth)
 {
 /* These variables do not need to be preserved over recursion in this function,
 so they can be ordinary variables in all cases. Mark some of them with
@ -478,6 +481,7 @@ frame->Xprevframe = NULL;            /* Marks the top level */
 frame->Xeptr = eptr;
 frame->Xecode = ecode;
 frame->Xmstart = mstart;
+frame->Xmarkptr = markptr;
 frame->Xoffset_top = offset_top;
 frame->Xims = ims;
 frame->Xeptrb = eptrb;
@ -493,6 +497,7 @@ HEAP_RECURSE:
 #define eptr               frame->Xeptr
 #define ecode              frame->Xecode
 #define mstart             frame->Xmstart
+#define markptr            frame->Xmarkptr
 #define offset_top         frame->Xoffset_top
 #define ims                frame->Xims
 #define eptrb              frame->Xeptrb
@ -1068,7 +1073,6 @@ for (;;)
      memmove(md->offset_vector, rec->offset_save,
        rec->saved_max * sizeof(int));
      offset_top = rec->save_offset_top;
-      mstart = rec->save_start;
      ims = original_ims;
      ecode = rec->after_call;
      break;
@ -1112,7 +1116,11 @@ for (;;)
      {
      RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
        RM4);
-      if (rrc == MATCH_MATCH) break;
+      if (rrc == MATCH_MATCH)
+        {
+        mstart = md->start_match_ptr;   /* In case \K reset it */
+        break;
+        }
      if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
      ecode += GET(ecode, 1);
      }
@ -1265,9 +1273,7 @@ for (;;)

      memcpy(new_recursive.offset_save, md->offset_vector,
            new_recursive.saved_max * sizeof(int));
-      new_recursive.save_start = mstart;
      new_recursive.save_offset_top = offset_top;
-      mstart = eptr;

      /* OK, now we can do the recursion. For each top-level alternative we
      restore the offset and recursion data. */
@ -1314,7 +1320,8 @@ for (;;)
    a move back into the brackets. Friedl calls these "atomic" subpatterns.
    Check the alternative branches in turn - the matching won't pass the KET
    for this kind of subpattern. If any one branch matches, we carry on as at
-    the end of a normal bracket, leaving the subject pointer. */
+    the end of a normal bracket, leaving the subject pointer, but resetting
+    the start-of-match value in case it was changed by \K. */

    case OP_ONCE:
    prev = ecode;
@ -1323,7 +1330,11 @@ for (;;)
    do
      {
      RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
-      if (rrc == MATCH_MATCH) break;
+      if (rrc == MATCH_MATCH)
+        {
+        mstart = md->start_match_ptr;
+        break;
+        }
      if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
      ecode += GET(ecode,1);
      }
@ -1442,9 +1453,10 @@ for (;;)
      }
    else saved_eptr = NULL;

-    /* If we are at the end of an assertion group, stop matching and return
-    MATCH_MATCH, but record the current high water mark for use by positive
-    assertions. Do this also for the "once" (atomic) groups. */
+    /* If we are at the end of an assertion group or an atomic group, stop
+    matching and return MATCH_MATCH, but record the current high water mark for
+    use by positive assertions. We also need to record the match start in case
+    it was changed by \K. */

    if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
        *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
@ -1452,6 +1464,7 @@ for (;;)
      {
      md->end_match_ptr = eptr;      /* For ONCE */
      md->end_offset_top = offset_top;
+      md->start_match_ptr = mstart;
      RRETURN(MATCH_MATCH);
      }

@ -1488,7 +1501,6 @@ for (;;)
        recursion_info *rec = md->recursive;
        DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
        md->recursive = rec->prevrec;
-        mstart = rec->save_start;
        memcpy(md->offset_vector, rec->offset_save,
          rec->saved_max * sizeof(int));
        offset_top = rec->save_offset_top;
@ -5649,7 +5661,8 @@ for(;;)
  md->start_match_ptr = start_match;
  md->start_used_ptr = start_match;
  md->match_call_count = 0;
-  rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
+  rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
+    0, 0);
  if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;

  switch(rc)
--- a/ext/pcre/pcrelib/pcre_globals.c
+++ b/ext/pcre/pcrelib/pcre_globals.c
@ -43,14 +43,35 @@ PCRE is thread-clean and doesn't use any global variables in the normal sense.
 However, it calls memory allocation and freeing functions via the four
 indirections below, and it can optionally do callouts, using the fifth
 indirection. These values can be changed by the caller, but are shared between
-all threads. However, when compiling for Virtual Pascal, things are done
-differently, and global variables are not used (see pcre.in). */
+all threads.
+
+For MS Visual Studio and Symbian OS, there are problems in initializing these
+variables to non-local functions. In these cases, therefore, an indirection via
+a local function is used.
+
+Also, when compiling for Virtual Pascal, things are done differently, and
+global variables are not used. */

 #include "config.h"

 #include "pcre_internal.h"

-#ifndef VPCOMPAT
+#if defined _MSC_VER || defined  __SYMBIAN32__
+static void* LocalPcreMalloc(size_t aSize)
+  {
+  return malloc(aSize);
+  }
+static void LocalPcreFree(void* aPtr)
+  {
+  free(aPtr);
+  }
+PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = LocalPcreMalloc;
+PCRE_EXP_DATA_DEFN void  (*pcre_free)(void *) = LocalPcreFree;
+PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = LocalPcreMalloc;
+PCRE_EXP_DATA_DEFN void  (*pcre_stack_free)(void *) = LocalPcreFree;
+PCRE_EXP_DATA_DEFN int   (*pcre_callout)(pcre_callout_block *) = NULL;
+
+#elif !defined VPCOMPAT
 PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = malloc;
 PCRE_EXP_DATA_DEFN void  (*pcre_free)(void *) = free;
 PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = malloc;
--- a/ext/pcre/pcrelib/pcre_internal.h
+++ b/ext/pcre/pcrelib/pcre_internal.h
@ -188,15 +188,14 @@ preprocessor time in standard C environments. */
 large integers. If a 64-bit integer type is available, we can use that.
 Otherwise we have to cast to double, which of course requires floating point
 arithmetic. Handle this by defining a macro for the appropriate type. If
-stdint.h is available, include it; it may define INT64_MAX. The macro int64_t
-may be set by "configure". */
+stdint.h is available, include it; it may define INT64_MAX. Systems that do not
+have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
+by "configure". */

 #if HAVE_STDINT_H
-# ifdef PHP_WIN32
-#  include "win32/php_stdint.h"
-# else
-#  include <stdint.h>
-# endif
+#include <stdint.h>
+#elif HAVE_INTTYPES_H
+#include <inttypes.h>
 #endif

 #if defined INT64_MAX || defined int64_t
@ -1392,7 +1391,13 @@ enum {

  /* This is used to skip a subpattern with a {0} quantifier */

-  OP_SKIPZERO        /* 114 */
+  OP_SKIPZERO,       /* 114 */
+
+  /* This is not an opcode, but is used to check that tables indexed by opcode
+  are the correct length, in order to catch updating errors - there have been
+  some in the past. */
+
+  OP_TABLE_LENGTH
 };

 /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
@ -1440,8 +1445,9 @@ in UTF-8 mode. The code that uses this table must know about such things. */
  1, 1, 1, 1, 1,                 /* \A, \G, \K, \B, \b                     */ \
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */ \
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */ \
-  3, 3, 1,                       /* NOTPROP, PROP, EXTUNI                  */ \
+  3, 3,                          /* \P, \p                                 */ \
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */ \
+  1,                             /* \X                                     */ \
  1, 1, 2, 1, 1,                 /* \Z, \z, Opt, ^, $                      */ \
  2,                             /* Char  - the minimum length             */ \
  2,                             /* Charnc  - the minimum length           */ \
@ -1496,8 +1502,9 @@ condition. */

 #define RREF_ANY  0xffff

-/* Error code numbers. They are given names so that they can more easily be
-tracked. */
+/* Compile time error code numbers. They are given names so that they can more
+easily be tracked. When a new number is added, the table called eint in
+pcreposix.c must be updated. */

 enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
       ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
@ -1505,7 +1512,7 @@ enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
       ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
       ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
       ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
-       ERR60, ERR61, ERR62, ERR63, ERR64, ERR65 };
+       ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };

 /* The real format of the start of the pcre block; the index of names and the
 code vector run on as long as necessary after the end. We store an explicit
@ -1610,7 +1617,6 @@ typedef struct recursion_info {
  struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
  int group_num;                /* Number of group that was called */
  const uschar *after_call;     /* "Return value": points after the call in the expr */
-  USPTR save_start;             /* Old value of mstart */
  int *offset_save;             /* Pointer to start of saved offsets */
  int saved_max;                /* Number of saved offsets */
  int save_offset_top;          /* Current value of offset_top */
--- a/ext/pcre/pcrelib/pcre_printint.src
+++ b/ext/pcre/pcrelib/pcre_printint.src
@ -190,6 +190,25 @@ for(;;)

  switch(*code)
    {
+/* ========================================================================== */
+      /* These cases are never obeyed. This is a fudge that causes a compile-
+      time error if the vectors OP_names or _pcre_OP_lengths, which are indexed
+      by opcode, are not the correct length. It seems to be the only way to do
+      such a check at compile time, as the sizeof() operator does not work in
+      the C preprocessor. We do this while compiling pcretest, because that
+      #includes pcre_tables.c, which holds _pcre_OP_lengths. We can't do this
+      when building pcre_compile.c with PCRE_DEBUG set, because it doesn't then
+      know the size of _pcre_OP_lengths. */
+
+#ifdef COMPILING_PCRETEST
+      case OP_TABLE_LENGTH:
+      case OP_TABLE_LENGTH +
+        ((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
+        (sizeof(_pcre_OP_lengths) == OP_TABLE_LENGTH)):
+      break;
+#endif
+/* ========================================================================== */
+
    case OP_END:
    fprintf(f, "    %s\n", OP_names[*code]);
    fprintf(f, "------------------------------------------------------------------\n");
--- a/ext/pcre/pcrelib/pcre_tables.c
+++ b/ext/pcre/pcrelib/pcre_tables.c
@ -118,7 +118,9 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Any0 STR_A STR_n STR_y "\0"
 #define STRING_Arabic0 STR_A STR_r STR_a STR_b STR_i STR_c "\0"
 #define STRING_Armenian0 STR_A STR_r STR_m STR_e STR_n STR_i STR_a STR_n "\0"
+#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
 #define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
+#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
 #define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
 #define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
 #define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
@ -141,6 +143,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Cyrillic0 STR_C STR_y STR_r STR_i STR_l STR_l STR_i STR_c "\0"
 #define STRING_Deseret0 STR_D STR_e STR_s STR_e STR_r STR_e STR_t "\0"
 #define STRING_Devanagari0 STR_D STR_e STR_v STR_a STR_n STR_a STR_g STR_a STR_r STR_i "\0"
+#define STRING_Egyptian_Hieroglyphs0 STR_E STR_g STR_y STR_p STR_t STR_i STR_a STR_n STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
 #define STRING_Ethiopic0 STR_E STR_t STR_h STR_i STR_o STR_p STR_i STR_c "\0"
 #define STRING_Georgian0 STR_G STR_e STR_o STR_r STR_g STR_i STR_a STR_n "\0"
 #define STRING_Glagolitic0 STR_G STR_l STR_a STR_g STR_o STR_l STR_i STR_t STR_i STR_c "\0"
@ -153,7 +156,12 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Hanunoo0 STR_H STR_a STR_n STR_u STR_n STR_o STR_o "\0"
 #define STRING_Hebrew0 STR_H STR_e STR_b STR_r STR_e STR_w "\0"
 #define STRING_Hiragana0 STR_H STR_i STR_r STR_a STR_g STR_a STR_n STR_a "\0"
+#define STRING_Imperial_Aramaic0 STR_I STR_m STR_p STR_e STR_r STR_i STR_a STR_l STR_UNDERSCORE STR_A STR_r STR_a STR_m STR_a STR_i STR_c "\0"
 #define STRING_Inherited0 STR_I STR_n STR_h STR_e STR_r STR_i STR_t STR_e STR_d "\0"
+#define STRING_Inscriptional_Pahlavi0 STR_I STR_n STR_s STR_c STR_r STR_i STR_p STR_t STR_i STR_o STR_n STR_a STR_l STR_UNDERSCORE STR_P STR_a STR_h STR_l STR_a STR_v STR_i "\0"
+#define STRING_Inscriptional_Parthian0 STR_I STR_n STR_s STR_c STR_r STR_i STR_p STR_t STR_i STR_o STR_n STR_a STR_l STR_UNDERSCORE STR_P STR_a STR_r STR_t STR_h STR_i STR_a STR_n "\0"
+#define STRING_Javanese0 STR_J STR_a STR_v STR_a STR_n STR_e STR_s STR_e "\0"
+#define STRING_Kaithi0 STR_K STR_a STR_i STR_t STR_h STR_i "\0"
 #define STRING_Kannada0 STR_K STR_a STR_n STR_n STR_a STR_d STR_a "\0"
 #define STRING_Katakana0 STR_K STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0"
 #define STRING_Kayah_Li0 STR_K STR_a STR_y STR_a STR_h STR_UNDERSCORE STR_L STR_i "\0"
@ -166,6 +174,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Lepcha0 STR_L STR_e STR_p STR_c STR_h STR_a "\0"
 #define STRING_Limbu0 STR_L STR_i STR_m STR_b STR_u "\0"
 #define STRING_Linear_B0 STR_L STR_i STR_n STR_e STR_a STR_r STR_UNDERSCORE STR_B "\0"
+#define STRING_Lisu0 STR_L STR_i STR_s STR_u "\0"
 #define STRING_Ll0 STR_L STR_l "\0"
 #define STRING_Lm0 STR_L STR_m "\0"
 #define STRING_Lo0 STR_L STR_o "\0"
@ -177,6 +186,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
 #define STRING_Mc0 STR_M STR_c "\0"
 #define STRING_Me0 STR_M STR_e "\0"
+#define STRING_Meetei_Mayek0 STR_M STR_e STR_e STR_t STR_e STR_i STR_UNDERSCORE STR_M STR_a STR_y STR_e STR_k "\0"
 #define STRING_Mn0 STR_M STR_n "\0"
 #define STRING_Mongolian0 STR_M STR_o STR_n STR_g STR_o STR_l STR_i STR_a STR_n "\0"
 #define STRING_Myanmar0 STR_M STR_y STR_a STR_n STR_m STR_a STR_r "\0"
@ -190,6 +200,8 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Ol_Chiki0 STR_O STR_l STR_UNDERSCORE STR_C STR_h STR_i STR_k STR_i "\0"
 #define STRING_Old_Italic0 STR_O STR_l STR_d STR_UNDERSCORE STR_I STR_t STR_a STR_l STR_i STR_c "\0"
 #define STRING_Old_Persian0 STR_O STR_l STR_d STR_UNDERSCORE STR_P STR_e STR_r STR_s STR_i STR_a STR_n "\0"
+#define STRING_Old_South_Arabian0 STR_O STR_l STR_d STR_UNDERSCORE STR_S STR_o STR_u STR_t STR_h STR_UNDERSCORE STR_A STR_r STR_a STR_b STR_i STR_a STR_n "\0"
+#define STRING_Old_Turkic0 STR_O STR_l STR_d STR_UNDERSCORE STR_T STR_u STR_r STR_k STR_i STR_c "\0"
 #define STRING_Oriya0 STR_O STR_r STR_i STR_y STR_a "\0"
 #define STRING_Osmanya0 STR_O STR_s STR_m STR_a STR_n STR_y STR_a "\0"
 #define STRING_P0 STR_P "\0"
@ -205,6 +217,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Rejang0 STR_R STR_e STR_j STR_a STR_n STR_g "\0"
 #define STRING_Runic0 STR_R STR_u STR_n STR_i STR_c "\0"
 #define STRING_S0 STR_S "\0"
+#define STRING_Samaritan0 STR_S STR_a STR_m STR_a STR_r STR_i STR_t STR_a STR_n "\0"
 #define STRING_Saurashtra0 STR_S STR_a STR_u STR_r STR_a STR_s STR_h STR_t STR_r STR_a "\0"
 #define STRING_Sc0 STR_S STR_c "\0"
 #define STRING_Shavian0 STR_S STR_h STR_a STR_v STR_i STR_a STR_n "\0"
@ -218,6 +231,8 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Tagalog0 STR_T STR_a STR_g STR_a STR_l STR_o STR_g "\0"
 #define STRING_Tagbanwa0 STR_T STR_a STR_g STR_b STR_a STR_n STR_w STR_a "\0"
 #define STRING_Tai_Le0 STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_e "\0"
+#define STRING_Tai_Tham0 STR_T STR_a STR_i STR_UNDERSCORE STR_T STR_h STR_a STR_m "\0"
+#define STRING_Tai_Viet0 STR_T STR_a STR_i STR_UNDERSCORE STR_V STR_i STR_e STR_t "\0"
 #define STRING_Tamil0 STR_T STR_a STR_m STR_i STR_l "\0"
 #define STRING_Telugu0 STR_T STR_e STR_l STR_u STR_g STR_u "\0"
 #define STRING_Thaana0 STR_T STR_h STR_a STR_a STR_n STR_a "\0"
@ -236,7 +251,9 @@ const char _pcre_utt_names[] =
  STRING_Any0
  STRING_Arabic0
  STRING_Armenian0
+  STRING_Avestan0
  STRING_Balinese0
+  STRING_Bamum0
  STRING_Bengali0
  STRING_Bopomofo0
  STRING_Braille0
@ -259,6 +276,7 @@ const char _pcre_utt_names[] =
  STRING_Cyrillic0
  STRING_Deseret0
  STRING_Devanagari0
+  STRING_Egyptian_Hieroglyphs0
  STRING_Ethiopic0
  STRING_Georgian0
  STRING_Glagolitic0
@ -271,7 +289,12 @@ const char _pcre_utt_names[] =
  STRING_Hanunoo0
  STRING_Hebrew0
  STRING_Hiragana0
+  STRING_Imperial_Aramaic0
  STRING_Inherited0
+  STRING_Inscriptional_Pahlavi0
+  STRING_Inscriptional_Parthian0
+  STRING_Javanese0
+  STRING_Kaithi0
  STRING_Kannada0
  STRING_Katakana0
  STRING_Kayah_Li0
@ -284,6 +307,7 @@ const char _pcre_utt_names[] =
  STRING_Lepcha0
  STRING_Limbu0
  STRING_Linear_B0
+  STRING_Lisu0
  STRING_Ll0
  STRING_Lm0
  STRING_Lo0
@ -295,6 +319,7 @@ const char _pcre_utt_names[] =
  STRING_Malayalam0
  STRING_Mc0
  STRING_Me0
+  STRING_Meetei_Mayek0
  STRING_Mn0
  STRING_Mongolian0
  STRING_Myanmar0
@ -308,6 +333,8 @@ const char _pcre_utt_names[] =
  STRING_Ol_Chiki0
  STRING_Old_Italic0
  STRING_Old_Persian0
+  STRING_Old_South_Arabian0
+  STRING_Old_Turkic0
  STRING_Oriya0
  STRING_Osmanya0
  STRING_P0
@ -323,6 +350,7 @@ const char _pcre_utt_names[] =
  STRING_Rejang0
  STRING_Runic0
  STRING_S0
+  STRING_Samaritan0
  STRING_Saurashtra0
  STRING_Sc0
  STRING_Shavian0
@ -336,6 +364,8 @@ const char _pcre_utt_names[] =
  STRING_Tagalog0
  STRING_Tagbanwa0
  STRING_Tai_Le0
+  STRING_Tai_Tham0
+  STRING_Tai_Viet0
  STRING_Tamil0
  STRING_Telugu0
  STRING_Thaana0
@ -354,119 +384,134 @@ const ucp_type_table _pcre_utt[] = {
  {   0, PT_ANY, 0 },
  {   4, PT_SC, ucp_Arabic },
  {  11, PT_SC, ucp_Armenian },
-  {  20, PT_SC, ucp_Balinese },
-  {  29, PT_SC, ucp_Bengali },
-  {  37, PT_SC, ucp_Bopomofo },
-  {  46, PT_SC, ucp_Braille },
-  {  54, PT_SC, ucp_Buginese },
-  {  63, PT_SC, ucp_Buhid },
-  {  69, PT_GC, ucp_C },
-  {  71, PT_SC, ucp_Canadian_Aboriginal },
-  {  91, PT_SC, ucp_Carian },
-  {  98, PT_PC, ucp_Cc },
-  { 101, PT_PC, ucp_Cf },
-  { 104, PT_SC, ucp_Cham },
-  { 109, PT_SC, ucp_Cherokee },
-  { 118, PT_PC, ucp_Cn },
-  { 121, PT_PC, ucp_Co },
-  { 124, PT_SC, ucp_Common },
-  { 131, PT_SC, ucp_Coptic },
-  { 138, PT_PC, ucp_Cs },
-  { 141, PT_SC, ucp_Cuneiform },
-  { 151, PT_SC, ucp_Cypriot },
-  { 159, PT_SC, ucp_Cyrillic },
-  { 168, PT_SC, ucp_Deseret },
-  { 176, PT_SC, ucp_Devanagari },
-  { 187, PT_SC, ucp_Ethiopic },
-  { 196, PT_SC, ucp_Georgian },
-  { 205, PT_SC, ucp_Glagolitic },
-  { 216, PT_SC, ucp_Gothic },
-  { 223, PT_SC, ucp_Greek },
-  { 229, PT_SC, ucp_Gujarati },
-  { 238, PT_SC, ucp_Gurmukhi },
-  { 247, PT_SC, ucp_Han },
-  { 251, PT_SC, ucp_Hangul },
-  { 258, PT_SC, ucp_Hanunoo },
-  { 266, PT_SC, ucp_Hebrew },
-  { 273, PT_SC, ucp_Hiragana },
-  { 282, PT_SC, ucp_Inherited },
-  { 292, PT_SC, ucp_Kannada },
-  { 300, PT_SC, ucp_Katakana },
-  { 309, PT_SC, ucp_Kayah_Li },
-  { 318, PT_SC, ucp_Kharoshthi },
-  { 329, PT_SC, ucp_Khmer },
-  { 335, PT_GC, ucp_L },
-  { 337, PT_LAMP, 0 },
-  { 340, PT_SC, ucp_Lao },
-  { 344, PT_SC, ucp_Latin },
-  { 350, PT_SC, ucp_Lepcha },
-  { 357, PT_SC, ucp_Limbu },
-  { 363, PT_SC, ucp_Linear_B },
-  { 372, PT_PC, ucp_Ll },
-  { 375, PT_PC, ucp_Lm },
-  { 378, PT_PC, ucp_Lo },
-  { 381, PT_PC, ucp_Lt },
-  { 384, PT_PC, ucp_Lu },
-  { 387, PT_SC, ucp_Lycian },
-  { 394, PT_SC, ucp_Lydian },
-  { 401, PT_GC, ucp_M },
-  { 403, PT_SC, ucp_Malayalam },
-  { 413, PT_PC, ucp_Mc },
-  { 416, PT_PC, ucp_Me },
-  { 419, PT_PC, ucp_Mn },
-  { 422, PT_SC, ucp_Mongolian },
-  { 432, PT_SC, ucp_Myanmar },
-  { 440, PT_GC, ucp_N },
-  { 442, PT_PC, ucp_Nd },
-  { 445, PT_SC, ucp_New_Tai_Lue },
-  { 457, PT_SC, ucp_Nko },
-  { 461, PT_PC, ucp_Nl },
-  { 464, PT_PC, ucp_No },
-  { 467, PT_SC, ucp_Ogham },
-  { 473, PT_SC, ucp_Ol_Chiki },
-  { 482, PT_SC, ucp_Old_Italic },
-  { 493, PT_SC, ucp_Old_Persian },
-  { 505, PT_SC, ucp_Oriya },
-  { 511, PT_SC, ucp_Osmanya },
-  { 519, PT_GC, ucp_P },
-  { 521, PT_PC, ucp_Pc },
-  { 524, PT_PC, ucp_Pd },
-  { 527, PT_PC, ucp_Pe },
-  { 530, PT_PC, ucp_Pf },
-  { 533, PT_SC, ucp_Phags_Pa },
-  { 542, PT_SC, ucp_Phoenician },
-  { 553, PT_PC, ucp_Pi },
-  { 556, PT_PC, ucp_Po },
-  { 559, PT_PC, ucp_Ps },
-  { 562, PT_SC, ucp_Rejang },
-  { 569, PT_SC, ucp_Runic },
-  { 575, PT_GC, ucp_S },
-  { 577, PT_SC, ucp_Saurashtra },
-  { 588, PT_PC, ucp_Sc },
-  { 591, PT_SC, ucp_Shavian },
-  { 599, PT_SC, ucp_Sinhala },
-  { 607, PT_PC, ucp_Sk },
-  { 610, PT_PC, ucp_Sm },
-  { 613, PT_PC, ucp_So },
-  { 616, PT_SC, ucp_Sundanese },
-  { 626, PT_SC, ucp_Syloti_Nagri },
-  { 639, PT_SC, ucp_Syriac },
-  { 646, PT_SC, ucp_Tagalog },
-  { 654, PT_SC, ucp_Tagbanwa },
-  { 663, PT_SC, ucp_Tai_Le },
-  { 670, PT_SC, ucp_Tamil },
-  { 676, PT_SC, ucp_Telugu },
-  { 683, PT_SC, ucp_Thaana },
-  { 690, PT_SC, ucp_Thai },
-  { 695, PT_SC, ucp_Tibetan },
-  { 703, PT_SC, ucp_Tifinagh },
-  { 712, PT_SC, ucp_Ugaritic },
-  { 721, PT_SC, ucp_Vai },
-  { 725, PT_SC, ucp_Yi },
-  { 728, PT_GC, ucp_Z },
-  { 730, PT_PC, ucp_Zl },
-  { 733, PT_PC, ucp_Zp },
-  { 736, PT_PC, ucp_Zs }
+  {  20, PT_SC, ucp_Avestan },
+  {  28, PT_SC, ucp_Balinese },
+  {  37, PT_SC, ucp_Bamum },
+  {  43, PT_SC, ucp_Bengali },
+  {  51, PT_SC, ucp_Bopomofo },
+  {  60, PT_SC, ucp_Braille },
+  {  68, PT_SC, ucp_Buginese },
+  {  77, PT_SC, ucp_Buhid },
+  {  83, PT_GC, ucp_C },
+  {  85, PT_SC, ucp_Canadian_Aboriginal },
+  { 105, PT_SC, ucp_Carian },
+  { 112, PT_PC, ucp_Cc },
+  { 115, PT_PC, ucp_Cf },
+  { 118, PT_SC, ucp_Cham },
+  { 123, PT_SC, ucp_Cherokee },
+  { 132, PT_PC, ucp_Cn },
+  { 135, PT_PC, ucp_Co },
+  { 138, PT_SC, ucp_Common },
+  { 145, PT_SC, ucp_Coptic },
+  { 152, PT_PC, ucp_Cs },
+  { 155, PT_SC, ucp_Cuneiform },
+  { 165, PT_SC, ucp_Cypriot },
+  { 173, PT_SC, ucp_Cyrillic },
+  { 182, PT_SC, ucp_Deseret },
+  { 190, PT_SC, ucp_Devanagari },
+  { 201, PT_SC, ucp_Egyptian_Hieroglyphs },
+  { 222, PT_SC, ucp_Ethiopic },
+  { 231, PT_SC, ucp_Georgian },
+  { 240, PT_SC, ucp_Glagolitic },
+  { 251, PT_SC, ucp_Gothic },
+  { 258, PT_SC, ucp_Greek },
+  { 264, PT_SC, ucp_Gujarati },
+  { 273, PT_SC, ucp_Gurmukhi },
+  { 282, PT_SC, ucp_Han },
+  { 286, PT_SC, ucp_Hangul },
+  { 293, PT_SC, ucp_Hanunoo },
+  { 301, PT_SC, ucp_Hebrew },
+  { 308, PT_SC, ucp_Hiragana },
+  { 317, PT_SC, ucp_Imperial_Aramaic },
+  { 334, PT_SC, ucp_Inherited },
+  { 344, PT_SC, ucp_Inscriptional_Pahlavi },
+  { 366, PT_SC, ucp_Inscriptional_Parthian },
+  { 389, PT_SC, ucp_Javanese },
+  { 398, PT_SC, ucp_Kaithi },
+  { 405, PT_SC, ucp_Kannada },
+  { 413, PT_SC, ucp_Katakana },
+  { 422, PT_SC, ucp_Kayah_Li },
+  { 431, PT_SC, ucp_Kharoshthi },
+  { 442, PT_SC, ucp_Khmer },
+  { 448, PT_GC, ucp_L },
+  { 450, PT_LAMP, 0 },
+  { 453, PT_SC, ucp_Lao },
+  { 457, PT_SC, ucp_Latin },
+  { 463, PT_SC, ucp_Lepcha },
+  { 470, PT_SC, ucp_Limbu },
+  { 476, PT_SC, ucp_Linear_B },
+  { 485, PT_SC, ucp_Lisu },
+  { 490, PT_PC, ucp_Ll },
+  { 493, PT_PC, ucp_Lm },
+  { 496, PT_PC, ucp_Lo },
+  { 499, PT_PC, ucp_Lt },
+  { 502, PT_PC, ucp_Lu },
+  { 505, PT_SC, ucp_Lycian },
+  { 512, PT_SC, ucp_Lydian },
+  { 519, PT_GC, ucp_M },
+  { 521, PT_SC, ucp_Malayalam },
+  { 531, PT_PC, ucp_Mc },
+  { 534, PT_PC, ucp_Me },
+  { 537, PT_SC, ucp_Meetei_Mayek },
+  { 550, PT_PC, ucp_Mn },
+  { 553, PT_SC, ucp_Mongolian },
+  { 563, PT_SC, ucp_Myanmar },
+  { 571, PT_GC, ucp_N },
+  { 573, PT_PC, ucp_Nd },
+  { 576, PT_SC, ucp_New_Tai_Lue },
+  { 588, PT_SC, ucp_Nko },
+  { 592, PT_PC, ucp_Nl },
+  { 595, PT_PC, ucp_No },
+  { 598, PT_SC, ucp_Ogham },
+  { 604, PT_SC, ucp_Ol_Chiki },
+  { 613, PT_SC, ucp_Old_Italic },
+  { 624, PT_SC, ucp_Old_Persian },
+  { 636, PT_SC, ucp_Old_South_Arabian },
+  { 654, PT_SC, ucp_Old_Turkic },
+  { 665, PT_SC, ucp_Oriya },
+  { 671, PT_SC, ucp_Osmanya },
+  { 679, PT_GC, ucp_P },
+  { 681, PT_PC, ucp_Pc },
+  { 684, PT_PC, ucp_Pd },
+  { 687, PT_PC, ucp_Pe },
+  { 690, PT_PC, ucp_Pf },
+  { 693, PT_SC, ucp_Phags_Pa },
+  { 702, PT_SC, ucp_Phoenician },
+  { 713, PT_PC, ucp_Pi },
+  { 716, PT_PC, ucp_Po },
+  { 719, PT_PC, ucp_Ps },
+  { 722, PT_SC, ucp_Rejang },
+  { 729, PT_SC, ucp_Runic },
+  { 735, PT_GC, ucp_S },
+  { 737, PT_SC, ucp_Samaritan },
+  { 747, PT_SC, ucp_Saurashtra },
+  { 758, PT_PC, ucp_Sc },
+  { 761, PT_SC, ucp_Shavian },
+  { 769, PT_SC, ucp_Sinhala },
+  { 777, PT_PC, ucp_Sk },
+  { 780, PT_PC, ucp_Sm },
+  { 783, PT_PC, ucp_So },
+  { 786, PT_SC, ucp_Sundanese },
+  { 796, PT_SC, ucp_Syloti_Nagri },
+  { 809, PT_SC, ucp_Syriac },
+  { 816, PT_SC, ucp_Tagalog },
+  { 824, PT_SC, ucp_Tagbanwa },
+  { 833, PT_SC, ucp_Tai_Le },
+  { 840, PT_SC, ucp_Tai_Tham },
+  { 849, PT_SC, ucp_Tai_Viet },
+  { 858, PT_SC, ucp_Tamil },
+  { 864, PT_SC, ucp_Telugu },
+  { 871, PT_SC, ucp_Thaana },
+  { 878, PT_SC, ucp_Thai },
+  { 883, PT_SC, ucp_Tibetan },
+  { 891, PT_SC, ucp_Tifinagh },
+  { 900, PT_SC, ucp_Ugaritic },
+  { 909, PT_SC, ucp_Vai },
+  { 913, PT_SC, ucp_Yi },
+  { 916, PT_GC, ucp_Z },
+  { 918, PT_PC, ucp_Zl },
+  { 921, PT_PC, ucp_Zp },
+  { 924, PT_PC, ucp_Zs }
 };

 const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
--- a/ext/pcre/pcrelib/pcre_ucd.c
+++ b/ext/pcre/pcrelib/pcre_ucd.c
--- a/ext/pcre/pcrelib/pcreposix.c
+++ b/ext/pcre/pcrelib/pcreposix.c
@ -342,6 +342,8 @@ rc = pcre_exec((const pcre *)preg->re_pcre, NULL, string + so, (eo - so),

 if (rc == 0) rc = nmatch;    /* All captured slots were filled in */

+/* Successful match */
+
 if (rc >= 0)
  {
  size_t i;
@ -358,22 +360,33 @@ if (rc >= 0)
  return 0;
  }

-else
+/* Unsuccessful match */
+
+if (allocated_ovector) free(ovector);
+switch(rc)
  {
-  if (allocated_ovector) free(ovector);
-  switch(rc)
-    {
-    case PCRE_ERROR_NOMATCH: return REG_NOMATCH;
-    case PCRE_ERROR_NULL: return REG_INVARG;
-    case PCRE_ERROR_BADOPTION: return REG_INVARG;
-    case PCRE_ERROR_BADMAGIC: return REG_INVARG;
-    case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT;
-    case PCRE_ERROR_NOMEMORY: return REG_ESPACE;
-    case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE;
-    case PCRE_ERROR_BADUTF8: return REG_INVARG;
-    case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG;
-    default: return REG_ASSERT;
-    }
+/* ========================================================================== */
+  /* These cases are never obeyed. This is a fudge that causes a compile-time
+  error if the vector eint, which is indexed by compile-time error number, is
+  not the correct length. It seems to be the only way to do such a check at
+  compile time, as the sizeof() operator does not work in the C preprocessor.
+  As all the PCRE_ERROR_xxx values are negative, we can use 0 and 1. */
+
+  case 0:
+  case (sizeof(eint)/sizeof(int) == ERRCOUNT):
+  return REG_ASSERT;
+/* ========================================================================== */
+
+  case PCRE_ERROR_NOMATCH: return REG_NOMATCH;
+  case PCRE_ERROR_NULL: return REG_INVARG;
+  case PCRE_ERROR_BADOPTION: return REG_INVARG;
+  case PCRE_ERROR_BADMAGIC: return REG_INVARG;
+  case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT;
+  case PCRE_ERROR_NOMEMORY: return REG_ESPACE;
+  case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE;
+  case PCRE_ERROR_BADUTF8: return REG_INVARG;
+  case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG;
+  default: return REG_ASSERT;
  }
 }

--- a/ext/pcre/pcrelib/testdata/testinput2
+++ b/ext/pcre/pcrelib/testdata/testinput2
@ -3203,5 +3203,33 @@ a random value. /Ix

 /^(ab(c\1)d|x){2}$/BZ
    xabcxd
+    
+/^(?&t)*+(?(DEFINE)(?<t>.))$/BZ
+
+/^(?&t)*(?(DEFINE)(?<t>.))$/BZ
+
+/ -- The first four of these are not in the Perl 5.10 test because Perl 
+     documents that the use of \K in assertions is "not well defined". The
+     last is here because Perl gives the match as "b" rather than "ab". I
+     believe this to be a Perl bug. --/  
+      
+/(?=a\Kb)ab/
+    ab 
+
+/(?!a\Kb)ac/
+    ac 
+    
+/^abc(?<=b\Kc)d/
+    abcd
+
+/^abc(?<!b\Kq)d/
+    abcd
+
+/(?>a\Kb)z|(ab)/
+    ab 
+
+/----------------------/
+
+/(?P<L1>(?P<L2>0|)|(?P>L2)(?P>L1))/

 /-- End of testinput2 --/
--- a/ext/pcre/pcrelib/testdata/testinput6
+++ b/ext/pcre/pcrelib/testdata/testinput6
@ -370,13 +370,6 @@
    \x{3b1}
    \x{ff5a}   
    
-/^\X/8
-    A
-    A\x{300}BC 
-    A\x{300}\x{301}\x{302}BC 
-    *** Failers
-    \x{300}  
-
 /^[\X]/8
    X123
    *** Failers
@ -756,4 +749,7 @@
 /[\p{Lu}\x20]+/
    \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59

+/\p{Avestan}\p{Bamum}\p{Egyptian_Hieroglyphs}\p{Imperial_Aramaic}\p{Inscriptional_Pahlavi}\p{Inscriptional_Parthian}\p{Javanese}\p{Kaithi}\p{Lisu}\p{Meetei_Mayek}\p{Old_South_Arabian}\p{Old_Turkic}\p{Samaritan}\p{Tai_Tham}\p{Tai_Viet}/8
+    \x{10b00}\x{a6ef}\x{13007}\x{10857}\x{10b78}\x{10b58}\x{a980}\x{110c1}\x{a4ff}\x{abc0}\x{10a7d}\x{10c48}\x{0800}\x{1aad}\x{aac0}
+
 /-- End of testinput6 --/
--- a/ext/pcre/pcrelib/testdata/testoutput2
+++ b/ext/pcre/pcrelib/testdata/testoutput2
@ -10596,5 +10596,76 @@ No match
 0: xabcxd
 1: abcxd
 2: cx
+    
+/^(?&t)*+(?(DEFINE)(?<t>.))$/BZ
+------------------------------------------------------------------
+        Bra
+        ^
+        Once
+        Brazero
+        Once
+        Recurse
+        KetRmax
+        Ket
+        Cond
+        Cond def
+        CBra 1
+        Any
+        Ket
+        Ket
+        $
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(?&t)*(?(DEFINE)(?<t>.))$/BZ
+------------------------------------------------------------------
+        Bra
+        ^
+        Brazero
+        Once
+        Recurse
+        KetRmax
+        Cond
+        Cond def
+        CBra 1
+        Any
+        Ket
+        Ket
+        $
+        Ket
+        End
+------------------------------------------------------------------
+
+/ -- The first four of these are not in the Perl 5.10 test because Perl 
+     documents that the use of \K in assertions is "not well defined". The
+     last is here because Perl gives the match as "b" rather than "ab". I
+     believe this to be a Perl bug. --/  
+      
+/(?=a\Kb)ab/
+    ab 
+ 0: b
+
+/(?!a\Kb)ac/
+    ac 
+ 0: ac
+    
+/^abc(?<=b\Kc)d/
+    abcd
+ 0: cd
+
+/^abc(?<!b\Kq)d/
+    abcd
+ 0: abcd
+
+/(?>a\Kb)z|(ab)/
+    ab 
+ 0: ab
+ 1: ab
+
+/----------------------/
+
+/(?P<L1>(?P<L2>0|)|(?P>L2)(?P>L1))/
+Failed: recursive call could loop indefinitely at offset 31

 /-- End of testinput2 --/
--- a/ext/pcre/pcrelib/testdata/testoutput6
+++ b/ext/pcre/pcrelib/testdata/testoutput6
@ -618,18 +618,6 @@ No match
    \x{ff5a}   
 0: \x{ff5a}
    
-/^\X/8
-    A
- 0: A
-    A\x{300}BC 
- 0: A\x{300}
-    A\x{300}\x{301}\x{302}BC 
- 0: A\x{300}\x{301}\x{302}
-    *** Failers
- 0: *
-    \x{300}  
-No match
-
 /^[\X]/8
    X123
 0: X
@ -1293,4 +1281,8 @@ No match
    \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
 0: A P\xc2T\xc9 TODAY

+/\p{Avestan}\p{Bamum}\p{Egyptian_Hieroglyphs}\p{Imperial_Aramaic}\p{Inscriptional_Pahlavi}\p{Inscriptional_Parthian}\p{Javanese}\p{Kaithi}\p{Lisu}\p{Meetei_Mayek}\p{Old_South_Arabian}\p{Old_Turkic}\p{Samaritan}\p{Tai_Tham}\p{Tai_Viet}/8
+    \x{10b00}\x{a6ef}\x{13007}\x{10857}\x{10b78}\x{10b58}\x{a980}\x{110c1}\x{a4ff}\x{abc0}\x{10a7d}\x{10c48}\x{0800}\x{1aad}\x{aac0}
+ 0: \x{10b00}\x{a6ef}\x{13007}\x{10857}\x{10b78}\x{10b58}\x{a980}\x{110c1}\x{a4ff}\x{abc0}\x{10a7d}\x{10c48}\x{800}\x{1aad}\x{aac0}
+
 /-- End of testinput6 --/
--- a/ext/pcre/pcrelib/ucp.h
+++ b/ext/pcre/pcrelib/ucp.h
@ -137,7 +137,23 @@ enum {
  ucp_Rejang,
  ucp_Saurashtra,
  ucp_Sundanese,
-  ucp_Vai
+  ucp_Vai,
+  /* New for Unicode 5.2: */
+  ucp_Avestan,
+  ucp_Bamum,
+  ucp_Egyptian_Hieroglyphs,
+  ucp_Imperial_Aramaic,
+  ucp_Inscriptional_Pahlavi,
+  ucp_Inscriptional_Parthian,
+  ucp_Javanese,
+  ucp_Kaithi,
+  ucp_Lisu,
+  ucp_Meetei_Mayek,
+  ucp_Old_South_Arabian,
+  ucp_Old_Turkic,
+  ucp_Samaritan,
+  ucp_Tai_Tham,
+  ucp_Tai_Viet
 };

 #endif