diff options
author | ale <ale@FreeBSD.org> | 2007-03-13 15:11:54 +0800 |
---|---|---|
committer | ale <ale@FreeBSD.org> | 2007-03-13 15:11:54 +0800 |
commit | 9d853b59063f04a934165c5e9d39a63bf0c93216 (patch) | |
tree | 2372f9aeef7da050bc0b4b43840a2aa286b1c364 /devel | |
parent | b099aea38548ca791d25beb2b5ccf108d5b5987c (diff) | |
download | freebsd-ports-gnome-9d853b59063f04a934165c5e9d39a63bf0c93216.tar.gz freebsd-ports-gnome-9d853b59063f04a934165c5e9d39a63bf0c93216.tar.zst freebsd-ports-gnome-9d853b59063f04a934165c5e9d39a63bf0c93216.zip |
Update bundled pcrelib to 7.0, in sync with php4 and next php5 version.
Hopefully this will fix all the known pcre issues.
Diffstat (limited to 'devel')
-rw-r--r-- | devel/php5-pcre/Makefile | 2 | ||||
-rw-r--r-- | devel/php5-pcre/files/patch-pcre-7.0 | 10262 |
2 files changed, 10263 insertions, 1 deletions
diff --git a/devel/php5-pcre/Makefile b/devel/php5-pcre/Makefile index ce9907acf7e6..5c28e9ff5dbc 100644 --- a/devel/php5-pcre/Makefile +++ b/devel/php5-pcre/Makefile @@ -5,7 +5,7 @@ # $FreeBSD$ # -PORTREVISION= 4 +PORTREVISION= 5 CATEGORIES= devel diff --git a/devel/php5-pcre/files/patch-pcre-7.0 b/devel/php5-pcre/files/patch-pcre-7.0 new file mode 100644 index 000000000000..724ae26614a7 --- /dev/null +++ b/devel/php5-pcre/files/patch-pcre-7.0 @@ -0,0 +1,10262 @@ +diff -ruN ../pcre.orig/config.m4 ./config.m4 +--- ../pcre.orig/config.m4 Mon Dec 4 19:01:53 2006 ++++ ./config.m4 Fri Feb 9 22:31:18 2007 +@@ -13,7 +13,7 @@ + + if test "$PHP_PCRE_REGEX" != "no"; then + if test "$PHP_PCRE_REGEX" = "yes"; then +- PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -I@ext_srcdir@/pcrelib) ++ PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 
-DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -I@ext_srcdir@/pcrelib) + PHP_ADD_BUILD_DIR($ext_builddir/pcrelib) + PHP_INSTALL_HEADERS([ext/pcre], [php_pcre.h pcrelib/]) + AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ]) +diff -ruN ../pcre.orig/pcrelib/dftables.c ./pcrelib/dftables.c +--- ../pcre.orig/pcrelib/dftables.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/dftables.c Fri Feb 9 22:31:19 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -86,7 +86,16 @@ + fprintf(f, + "This file contains the default tables for characters with codes less than\n" + "128 (ASCII characters). These tables are used when no external tables are\n" +- "passed to PCRE. */\n\n" ++ "passed to PCRE.\n\n"); ++fprintf(f, ++ "The following #include is present because without it gcc 4.x may remove\n" ++ "the array definition from the final binary if PCRE is built into a static\n" ++ "library and dead code stripping is activated. This leads to link errors.\n" ++ "Pulling in the header ensures that the array gets flagged as \"someone\n" ++ "outside this compilation unit might reference this\" and so it will always\n" ++ "be supplied to the linker. */\n\n" ++ "#include \"pcre_internal.h\"\n\n"); ++fprintf(f, + "const unsigned char _pcre_default_tables[] = {\n\n" + "/* This table is a lower casing table. 
*/\n\n"); + +diff -ruN ../pcre.orig/pcrelib/pcre.h ./pcrelib/pcre.h +--- ../pcre.orig/pcrelib/pcre.h Wed Jan 3 19:32:27 2007 ++++ ./pcrelib/pcre.h Fri Feb 9 22:31:19 2007 +@@ -5,7 +5,7 @@ + /* This is the public header file for the PCRE library, to be #included by + applications that call the PCRE functions. + +- Copyright (c) 1997-2005 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -38,7 +38,7 @@ + + #ifndef _PCRE_H + #define _PCRE_H +- ++ + #include "php_compat.h" + + /* The current PCRE version information. */ +@@ -54,10 +54,10 @@ + cannot run ./configure. As it now stands, this file need not be edited in that + circumstance. */ + +-#define PCRE_MAJOR 6 +-#define PCRE_MINOR 7 ++#define PCRE_MAJOR 7 ++#define PCRE_MINOR 0 + #define PCRE_PRERELEASE +-#define PCRE_DATE 04-Jul-2006 ++#define PCRE_DATE 18-Dec-2006 + + /* Win32 uses DLL by default; it needs special stuff for exported functions + when building PCRE. 
*/ +@@ -120,6 +120,7 @@ + #define PCRE_NEWLINE_CR 0x00100000 + #define PCRE_NEWLINE_LF 0x00200000 + #define PCRE_NEWLINE_CRLF 0x00300000 ++#define PCRE_NEWLINE_ANY 0x00400000 + + /* Exec-time and get/set-time error codes */ + +@@ -127,7 +128,8 @@ + #define PCRE_ERROR_NULL (-2) + #define PCRE_ERROR_BADOPTION (-3) + #define PCRE_ERROR_BADMAGIC (-4) +-#define PCRE_ERROR_UNKNOWN_NODE (-5) ++#define PCRE_ERROR_UNKNOWN_OPCODE (-5) ++#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */ + #define PCRE_ERROR_NOMEMORY (-6) + #define PCRE_ERROR_NOSUBSTRING (-7) + #define PCRE_ERROR_MATCHLIMIT (-8) +@@ -144,6 +146,8 @@ + #define PCRE_ERROR_DFA_WSSIZE (-19) + #define PCRE_ERROR_DFA_RECURSE (-20) + #define PCRE_ERROR_RECURSIONLIMIT (-21) ++#define PCRE_ERROR_NULLWSLIMIT (-22) ++#define PCRE_ERROR_BADNEWLINE (-23) + + /* Request types for pcre_fullinfo() */ + +diff -ruN ../pcre.orig/pcrelib/pcre_compile.c ./pcrelib/pcre_compile.c +--- ../pcre.orig/pcrelib/pcre_compile.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_compile.c Fri Feb 9 22:31:19 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -42,7 +42,11 @@ + supporting internal functions that are not used by other modules. 
*/ + + +-#define NLBLOCK cd /* The block containing newline information */ ++#define NLBLOCK cd /* Block containing newline information */ ++#define PSSTART start_pattern /* Field containing processed string start */ ++#define PSEND end_pattern /* Field containing processed string end */ ++ ++ + #include "pcre_internal.h" + + +@@ -54,18 +58,23 @@ + #endif + + +- + /************************************************* + * Code parameters and static tables * + *************************************************/ + +-/* Maximum number of items on the nested bracket stacks at compile time. This +-applies to the nesting of all kinds of parentheses. It does not limit +-un-nested, non-capturing parentheses. This number can be made bigger if +-necessary - it is used to dimension one int and one unsigned char vector at +-compile time. */ ++/* This value specifies the size of stack workspace that is used during the ++first pre-compile phase that determines how much memory is required. The regex ++is partly compiled into this space, but the compiled parts are discarded as ++soon as they can be, so that hopefully there will never be an overrun. The code ++does, however, check for an overrun. The largest amount I've seen used is 218, ++so this number is very generous. ++ ++The same workspace is used during the second, actual compile phase for ++remembering forward references to groups so that they can be filled in at the ++end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE ++is 4 there is plenty of room. */ + +-#define BRASTACK_SIZE 200 ++#define COMPILE_WORK_SIZE (4096) + + + /* Table for handling escaped characters in the range '0'-'z'. Positive returns +@@ -79,10 +88,10 @@ + 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? 
*/ + '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ + 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ +--ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ ++-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ + -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ + '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ +- 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ ++ 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ + -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ + 0, 0, -ESC_z /* x - z */ + }; +@@ -98,7 +107,7 @@ + /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', + /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, + /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, +-/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, ++/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, + /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, + /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, + /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, +@@ -107,7 +116,7 @@ + /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, + /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, + /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, +-/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0, ++/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, + /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, + /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, + /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, +@@ -156,8 +165,13 @@ + }; + + ++#define STRING(a) # a ++#define XSTRING(s) STRING(s) ++ + /* The texts of compile-time error messages. These are "char *" because they +-are passed to the outside world. */ ++are passed to the outside world. Do not ever re-use any error number, because ++they are documented. Always add a new error instead. Messages marked DEAD below ++are no longer used. 
*/ + + static const char *error_texts[] = { + "no error", +@@ -172,7 +186,7 @@ + "range out of order in character class", + "nothing to repeat", + /* 10 */ +- "operand of unlimited repeat could match the empty string", ++ "operand of unlimited repeat could match the empty string", /** DEAD **/ + "internal error: unexpected repeat", + "unrecognized character after (?", + "POSIX named classes are supported only within a class", +@@ -182,7 +196,7 @@ + "erroffset passed as NULL", + "unknown option bit(s) set", + "missing ) after comment", +- "parentheses nested too deeply", ++ "parentheses nested too deeply", /** DEAD **/ + /* 20 */ + "regular expression too large", + "failed to get memory", +@@ -199,7 +213,7 @@ + "unknown POSIX class name", + "POSIX collating elements are not supported", + "this version of PCRE is not compiled with PCRE_UTF8 support", +- "spare error", ++ "spare error", /** DEAD **/ + "character value in \\x{...} sequence is too large", + /* 35 */ + "invalid condition (?(0)", +@@ -210,18 +224,25 @@ + /* 40 */ + "recursive call could loop indefinitely", + "unrecognized character after (?P", +- "syntax error after (?P", ++ "syntax error in subpattern name (missing terminator)", + "two named subpatterns have the same name", + "invalid UTF-8 string", + /* 45 */ + "support for \\P, \\p, and \\X has not been compiled", + "malformed \\P or \\p sequence", + "unknown property name after \\P or \\p", +- "subpattern name is too long (maximum 32 characters)", +- "too many named subpatterns (maximum 10,000)", ++ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)", ++ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")", + /* 50 */ + "repeated subpattern is too long", +- "octal value is greater than \\377 (not in UTF-8 mode)" ++ "octal value is greater than \\377 (not in UTF-8 mode)", ++ "internal error: overran compiling workspace", ++ "internal error: previously-checked referenced subpattern not found", ++ "DEFINE group 
contains more than one branch", ++ /* 55 */ ++ "repeating a DEFINE group is not allowed", ++ "inconsistent NEWLINE options", ++ "\\g is not followed by an (optionally braced) non-zero number" + }; + + +@@ -352,8 +373,8 @@ + /* Definition to allow mutual recursion */ + + static BOOL +- compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int, +- int *, int *, branch_chain *, compile_data *); ++ compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *, ++ int *, branch_chain *, compile_data *, int *); + + + +@@ -363,9 +384,11 @@ + + /* This function is called when a \ has been encountered. It either returns a + positive value for a simple escape such as \n, or a negative value which +-encodes one of the more complicated things such as \d. When UTF-8 is enabled, +-a positive value greater than 255 may be returned. On entry, ptr is pointing at +-the \. On exit, it is on the final character of the escape sequence. ++encodes one of the more complicated things such as \d. A backreference to group ++n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When ++UTF-8 is enabled, a positive value greater than 255 may be returned. On entry, ++ptr is pointing at the \. On exit, it is on the final character of the escape ++sequence. + + Arguments: + ptrptr points to the pattern position pointer +@@ -412,6 +435,8 @@ + else + { + const uschar *oldptr; ++ BOOL braced, negated; ++ + switch (c) + { + /* A number of Perl escapes are not handled by PCRE. We give an explicit +@@ -425,6 +450,48 @@ + *errorcodeptr = ERR37; + break; + ++ /* \g must be followed by a number, either plain or braced. If positive, it ++ is an absolute backreference. If negative, it is a relative backreference. ++ This is a Perl 5.10 feature. 
*/ ++ ++ case 'g': ++ if (ptr[1] == '{') ++ { ++ braced = TRUE; ++ ptr++; ++ } ++ else braced = FALSE; ++ ++ if (ptr[1] == '-') ++ { ++ negated = TRUE; ++ ptr++; ++ } ++ else negated = FALSE; ++ ++ c = 0; ++ while ((digitab[ptr[1]] & ctype_digit) != 0) ++ c = c * 10 + *(++ptr) - '0'; ++ ++ if (c == 0 || (braced && *(++ptr) != '}')) ++ { ++ *errorcodeptr = ERR57; ++ return 0; ++ } ++ ++ if (negated) ++ { ++ if (c > bracount) ++ { ++ *errorcodeptr = ERR15; ++ return 0; ++ } ++ c = bracount - (c - 1); ++ } ++ ++ c = -(ESC_REF + c); ++ break; ++ + /* The handling of escape sequences consisting of a string of digits + starting with one that is not zero is not straightforward. By experiment, + the way Perl works seems to be as follows: +@@ -532,7 +599,9 @@ + } + break; + +- /* Other special escapes not starting with a digit are straightforward */ ++ /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. ++ This coding is ASCII-specific, but then the whole concept of \cx is ++ ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ + + case 'c': + c = *(++ptr); +@@ -542,10 +611,6 @@ + return 0; + } + +- /* A letter is upper-cased; then the 0x40 bit is flipped. This coding +- is ASCII-specific, but then the whole concept of \cx is ASCII-specific. +- (However, an EBCDIC equivalent has now been added.) */ +- + #if !EBCDIC /* ASCII coding */ + if (c >= 'a' && c <= 'z') c -= 32; + c ^= 0x40; +@@ -772,42 +837,111 @@ + + + /************************************************* +-* Find forward referenced named subpattern * ++* Find forward referenced subpattern * + *************************************************/ + +-/* This function scans along a pattern looking for capturing subpatterns, and +-counting them. If it finds a named pattern that matches the name it is given, +-it returns its number. This is used for forward references to named +-subpatterns. 
We know that if (?P< is encountered, the name will be terminated +-by '>' because that is checked in the first pass. ++/* This function scans along a pattern's text looking for capturing ++subpatterns, and counting them. If it finds a named pattern that matches the ++name it is given, it returns its number. Alternatively, if the name is NULL, it ++returns when it reaches a given numbered subpattern. This is used for forward ++references to subpatterns. We know that if (?P< is encountered, the name will ++be terminated by '>' because that is checked in the first pass. + + Arguments: +- pointer current position in the pattern +- count current count of capturing parens +- name name to seek +- namelen name length ++ ptr current position in the pattern ++ count current count of capturing parens so far encountered ++ name name to seek, or NULL if seeking a numbered subpattern ++ lorn name length, or subpattern number if name is NULL ++ xmode TRUE if we are in /x mode + + Returns: the number of the named subpattern, or -1 if not found + */ + + static int +-find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen) ++find_parens(const uschar *ptr, int count, const uschar *name, int lorn, ++ BOOL xmode) + { + const uschar *thisname; ++ + for (; *ptr != 0; ptr++) + { +- if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; } ++ int term; ++ ++ /* Skip over backslashed characters and also entire \Q...\E */ ++ ++ if (*ptr == '\\') ++ { ++ if (*(++ptr) == 0) return -1; ++ if (*ptr == 'Q') for (;;) ++ { ++ while (*(++ptr) != 0 && *ptr != '\\'); ++ if (*ptr == 0) return -1; ++ if (*(++ptr) == 'E') break; ++ } ++ continue; ++ } ++ ++ /* Skip over character classes */ ++ ++ if (*ptr == '[') ++ { ++ while (*(++ptr) != ']') ++ { ++ if (*ptr == '\\') ++ { ++ if (*(++ptr) == 0) return -1; ++ if (*ptr == 'Q') for (;;) ++ { ++ while (*(++ptr) != 0 && *ptr != '\\'); ++ if (*ptr == 0) return -1; ++ if (*(++ptr) == 'E') break; ++ } ++ continue; ++ } ++ } ++ continue; ++ 
} ++ ++ /* Skip comments in /x mode */ ++ ++ if (xmode && *ptr == '#') ++ { ++ while (*(++ptr) != 0 && *ptr != '\n'); ++ if (*ptr == 0) return -1; ++ continue; ++ } ++ ++ /* An opening parens must now be a real metacharacter */ ++ + if (*ptr != '(') continue; +- if (ptr[1] != '?') { count++; continue; } +- if (ptr[2] == '(') { ptr += 2; continue; } +- if (ptr[2] != 'P' || ptr[3] != '<') continue; ++ if (ptr[1] != '?') ++ { ++ count++; ++ if (name == NULL && count == lorn) return count; ++ continue; ++ } ++ ++ ptr += 2; ++ if (*ptr == 'P') ptr++; /* Allow optional P */ ++ ++ /* We have to disambiguate (?<! and (?<= from (?<name> */ ++ ++ if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') && ++ *ptr != '\'') ++ continue; ++ + count++; +- ptr += 4; ++ ++ if (name == NULL && count == lorn) return count; ++ term = *ptr++; ++ if (term == '<') term = '>'; + thisname = ptr; +- while (*ptr != '>') ptr++; +- if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0) ++ while (*ptr != term) ptr++; ++ if (name != NULL && lorn == ptr - thisname && ++ strncmp((const char *)name, (const char *)thisname, lorn) == 0) + return count; + } ++ + return -1; + } + +@@ -862,7 +996,8 @@ + + case OP_CALLOUT: + case OP_CREF: +- case OP_BRANUMBER: ++ case OP_RREF: ++ case OP_DEF: + code += _pcre_OP_lengths[*code]; + break; + +@@ -907,14 +1042,14 @@ + { + int d; + register int op = *cc; +- if (op >= OP_BRA) op = OP_BRA; + + switch (op) + { ++ case OP_CBRA: + case OP_BRA: + case OP_ONCE: + case OP_COND: +- d = find_fixedlength(cc, options); ++ d = find_fixedlength(cc + ((op == OP_CBRA)? 
2:0), options); + if (d < 0) return d; + branchlength += d; + do cc += GET(cc, 1); while (*cc == OP_ALT); +@@ -949,8 +1084,9 @@ + /* Skip over things that don't match chars */ + + case OP_REVERSE: +- case OP_BRANUMBER: + case OP_CREF: ++ case OP_RREF: ++ case OP_DEF: + case OP_OPT: + case OP_CALLOUT: + case OP_SOD: +@@ -1094,21 +1230,18 @@ + + if (c == OP_XCLASS) code += GET(code, 1); + +- /* Handle bracketed group */ ++ /* Handle capturing bracket */ + +- else if (c > OP_BRA) ++ else if (c == OP_CBRA) + { +- int n = c - OP_BRA; +- if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE); ++ int n = GET2(code, 1+LINK_SIZE); + if (n == number) return (uschar *)code; +- code += _pcre_OP_lengths[OP_BRA]; ++ code += _pcre_OP_lengths[c]; + } + +- /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes +- that are followed by a character may be followed by a multi-byte character. +- The length in the table is a minimum, so we have to scan along to skip the +- extra bytes. All opcodes are less than 128, so we can use relatively +- efficient code. */ ++ /* In UTF-8 mode, opcodes that are followed by a character may be followed by ++ a multi-byte character. The length in the table is a minimum, so we have to ++ arrange to skip the extra bytes. */ + + else + { +@@ -1120,13 +1253,17 @@ + case OP_EXACT: + case OP_UPTO: + case OP_MINUPTO: ++ case OP_POSUPTO: + case OP_STAR: + case OP_MINSTAR: ++ case OP_POSSTAR: + case OP_PLUS: + case OP_MINPLUS: ++ case OP_POSPLUS: + case OP_QUERY: + case OP_MINQUERY: +- while ((*code & 0xc0) == 0x80) code++; ++ case OP_POSQUERY: ++ if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; + break; + } + } +@@ -1164,18 +1301,10 @@ + + if (c == OP_XCLASS) code += GET(code, 1); + +- /* All bracketed groups have the same length. */ +- +- else if (c > OP_BRA) +- { +- code += _pcre_OP_lengths[OP_BRA]; +- } +- + /* Otherwise, we get the item's length from the table. 
In UTF-8 mode, opcodes + that are followed by a character may be followed by a multi-byte character. +- The length in the table is a minimum, so we have to scan along to skip the +- extra bytes. All opcodes are less than 128, so we can use relatively +- efficient code. */ ++ The length in the table is a minimum, so we have to arrange to skip the extra ++ bytes. */ + + else + { +@@ -1187,13 +1316,17 @@ + case OP_EXACT: + case OP_UPTO: + case OP_MINUPTO: ++ case OP_POSUPTO: + case OP_STAR: + case OP_MINSTAR: ++ case OP_POSSTAR: + case OP_PLUS: + case OP_MINPLUS: ++ case OP_POSPLUS: + case OP_QUERY: + case OP_MINQUERY: +- while ((*code & 0xc0) == 0x80) code++; ++ case OP_POSQUERY: ++ if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; + break; + } + } +@@ -1207,10 +1340,11 @@ + *************************************************/ + + /* This function scans through a branch of a compiled pattern to see whether it +-can match the empty string or not. It is called only from could_be_empty() +-below. Note that first_significant_code() skips over assertions. If we hit an +-unclosed bracket, we return "empty" - this means we've struck an inner bracket +-whose current branch will already have been scanned. ++can match the empty string or not. It is called from could_be_empty() ++below and from compile_branch() when checking for an unlimited repeat of a ++group that can match nothing. Note that first_significant_code() skips over ++assertions. If we hit an unclosed bracket, we return "empty" - this means we've ++struck an inner bracket whose current branch will already have been scanned. 
+ + Arguments: + code points to start of search +@@ -1224,7 +1358,7 @@ + could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8) + { + register int c; +-for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE); ++for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); + code < endcode; + code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) + { +@@ -1232,7 +1366,7 @@ + + c = *code; + +- if (c >= OP_BRA) ++ if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE) + { + BOOL empty_branch; + if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ +@@ -1248,11 +1382,18 @@ + } + while (*code == OP_ALT); + if (!empty_branch) return FALSE; /* All branches are non-empty */ +- code += 1 + LINK_SIZE; +- c = *code; ++ ++ /* Move past the KET and fudge things so that the increment in the "for" ++ above has no effect. */ ++ ++ c = OP_END; ++ code += 1 + LINK_SIZE - _pcre_OP_lengths[c]; ++ continue; + } + +- else switch (c) ++ /* Handle the other opcodes */ ++ ++ switch (c) + { + /* Check for quantifiers after a class */ + +@@ -1308,12 +1449,15 @@ + case OP_NOT: + case OP_PLUS: + case OP_MINPLUS: ++ case OP_POSPLUS: + case OP_EXACT: + case OP_NOTPLUS: + case OP_NOTMINPLUS: ++ case OP_NOTPOSPLUS: + case OP_NOTEXACT: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: ++ case OP_TYPEPOSPLUS: + case OP_TYPEEXACT: + return FALSE; + +@@ -1325,16 +1469,19 @@ + case OP_ALT: + return TRUE; + +- /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be +- followed by a multibyte character */ ++ /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, ++ MINUPTO, and POSUPTO may be followed by a multibyte character */ + + #ifdef SUPPORT_UTF8 + case OP_STAR: + case OP_MINSTAR: ++ case OP_POSSTAR: + case OP_QUERY: + case OP_MINQUERY: ++ case OP_POSQUERY: + case OP_UPTO: + case OP_MINUPTO: ++ case OP_POSUPTO: + if (utf8) while ((code[2] & 0xc0) == 0x80) code++; + break; + #endif +@@ 
-1452,26 +1599,57 @@ + optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before + it, after it has been compiled. This means that any OP_RECURSE items within it + that refer to the group itself or any contained groups have to have their +-offsets adjusted. That is the job of this function. Before it is called, the +-partially compiled regex must be temporarily terminated with OP_END. ++offsets adjusted. That one of the jobs of this function. Before it is called, ++the partially compiled regex must be temporarily terminated with OP_END. ++ ++This function has been extended with the possibility of forward references for ++recursions and subroutine calls. It must also check the list of such references ++for the group we are dealing with. If it finds that one of the recursions in ++the current group is on this list, it adjusts the offset in the list, not the ++value in the reference (which is a group number). + + Arguments: + group points to the start of the group + adjust the amount by which the group is to be moved + utf8 TRUE in UTF-8 mode + cd contains pointers to tables etc. ++ save_hwm the hwm forward reference pointer at the start of the group + + Returns: nothing + */ + + static void +-adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd) ++adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, ++ uschar *save_hwm) + { + uschar *ptr = group; + while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) + { +- int offset = GET(ptr, 1); +- if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); ++ int offset; ++ uschar *hc; ++ ++ /* See if this recursion is on the forward reference list. If so, adjust the ++ reference. */ ++ ++ for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) ++ { ++ offset = GET(hc, 0); ++ if (cd->start_code + offset == ptr + 1) ++ { ++ PUT(hc, 0, offset + adjust); ++ break; ++ } ++ } ++ ++ /* Otherwise, adjust the recursion offset if it's after the start of this ++ group. 
*/ ++ ++ if (hc >= cd->hwm) ++ { ++ offset = GET(ptr, 1); ++ if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); ++ } ++ + ptr += 1 + LINK_SIZE; + } + } +@@ -1550,12 +1728,13 @@ + */ + + static BOOL +-get_othercase_range(int *cptr, int d, int *ocptr, int *odptr) ++get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, ++ unsigned int *odptr) + { +-int c, othercase, next; ++unsigned int c, othercase, next; + + for (c = *cptr; c <= d; c++) +- { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; } ++ { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; } + + if (c > d) return FALSE; + +@@ -1576,17 +1755,249 @@ + #endif /* SUPPORT_UCP */ + + ++ ++/************************************************* ++* Check if auto-possessifying is possible * ++*************************************************/ ++ ++/* This function is called for unlimited repeats of certain items, to see ++whether the next thing could possibly match the repeated item. If not, it makes ++sense to automatically possessify the repeated item. ++ ++Arguments: ++ op_code the repeated op code ++ this data for this item, depends on the opcode ++ utf8 TRUE in UTF-8 mode ++ utf8_char used for utf8 character bytes, NULL if not relevant ++ ptr next character in pattern ++ options options bits ++ cd contains pointers to tables etc. ++ ++Returns: TRUE if possessifying is wanted ++*/ ++ ++static BOOL ++check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, ++ const uschar *ptr, int options, compile_data *cd) ++{ ++int next; ++ ++/* Skip whitespace and comments in extended mode */ ++ ++if ((options & PCRE_EXTENDED) != 0) ++ { ++ for (;;) ++ { ++ while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; ++ if (*ptr == '#') ++ { ++ while (*(++ptr) != 0) ++ if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } ++ } ++ else break; ++ } ++ } ++ ++/* If the next item is one that we can handle, get its value. 
A non-negative ++value is a character, a negative value is an escape value. */ ++ ++if (*ptr == '\\') ++ { ++ int temperrorcode = 0; ++ next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); ++ if (temperrorcode != 0) return FALSE; ++ ptr++; /* Point after the escape sequence */ ++ } ++ ++else if ((cd->ctypes[*ptr] & ctype_meta) == 0) ++ { ++#ifdef SUPPORT_UTF8 ++ if (utf8) { GETCHARINC(next, ptr); } else ++#endif ++ next = *ptr++; ++ } ++ ++else return FALSE; ++ ++/* Skip whitespace and comments in extended mode */ ++ ++if ((options & PCRE_EXTENDED) != 0) ++ { ++ for (;;) ++ { ++ while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; ++ if (*ptr == '#') ++ { ++ while (*(++ptr) != 0) ++ if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } ++ } ++ else break; ++ } ++ } ++ ++/* If the next thing is itself optional, we have to give up. */ ++ ++if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) ++ return FALSE; ++ ++/* Now compare the next item with the previous opcode. If the previous is a ++positive single character match, "item" either contains the character or, if ++"item" is greater than 127 in utf8 mode, the character's bytes are in ++utf8_char. */ ++ ++ ++/* Handle cases when the next item is a character. */ ++ ++if (next >= 0) switch(op_code) ++ { ++ case OP_CHAR: ++#ifdef SUPPORT_UTF8 ++ if (utf8 && item > 127) { GETCHAR(item, utf8_char); } ++#endif ++ return item != next; ++ ++ /* For CHARNC (caseless character) we must check the other case. If we have ++ Unicode property support, we can use it to test the other case of ++ high-valued characters. 
*/ ++ ++ case OP_CHARNC: ++#ifdef SUPPORT_UTF8 ++ if (utf8 && item > 127) { GETCHAR(item, utf8_char); } ++#endif ++ if (item == next) return FALSE; ++#ifdef SUPPORT_UTF8 ++ if (utf8) ++ { ++ unsigned int othercase; ++ if (next < 128) othercase = cd->fcc[next]; else ++#ifdef SUPPORT_UCP ++ othercase = _pcre_ucp_othercase((unsigned int)next); ++#else ++ othercase = NOTACHAR; ++#endif ++ return (unsigned int)item != othercase; ++ } ++ else ++#endif /* SUPPORT_UTF8 */ ++ return (item != cd->fcc[next]); /* Non-UTF-8 mode */ ++ ++ /* For OP_NOT, "item" must be a single-byte character. */ ++ ++ case OP_NOT: ++ if (next < 0) return FALSE; /* Not a character */ ++ if (item == next) return TRUE; ++ if ((options & PCRE_CASELESS) == 0) return FALSE; ++#ifdef SUPPORT_UTF8 ++ if (utf8) ++ { ++ unsigned int othercase; ++ if (next < 128) othercase = cd->fcc[next]; else ++#ifdef SUPPORT_UCP ++ othercase = _pcre_ucp_othercase(next); ++#else ++ othercase = NOTACHAR; ++#endif ++ return (unsigned int)item == othercase; ++ } ++ else ++#endif /* SUPPORT_UTF8 */ ++ return (item == cd->fcc[next]); /* Non-UTF-8 mode */ ++ ++ case OP_DIGIT: ++ return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; ++ ++ case OP_NOT_DIGIT: ++ return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; ++ ++ case OP_WHITESPACE: ++ return next > 127 || (cd->ctypes[next] & ctype_space) == 0; ++ ++ case OP_NOT_WHITESPACE: ++ return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; ++ ++ case OP_WORDCHAR: ++ return next > 127 || (cd->ctypes[next] & ctype_word) == 0; ++ ++ case OP_NOT_WORDCHAR: ++ return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; ++ ++ default: ++ return FALSE; ++ } ++ ++ ++/* Handle the case when the next item is \d, \s, etc. 
*/ ++ ++switch(op_code) ++ { ++ case OP_CHAR: ++ case OP_CHARNC: ++#ifdef SUPPORT_UTF8 ++ if (utf8 && item > 127) { GETCHAR(item, utf8_char); } ++#endif ++ switch(-next) ++ { ++ case ESC_d: ++ return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; ++ ++ case ESC_D: ++ return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; ++ ++ case ESC_s: ++ return item > 127 || (cd->ctypes[item] & ctype_space) == 0; ++ ++ case ESC_S: ++ return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; ++ ++ case ESC_w: ++ return item > 127 || (cd->ctypes[item] & ctype_word) == 0; ++ ++ case ESC_W: ++ return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; ++ ++ default: ++ return FALSE; ++ } ++ ++ case OP_DIGIT: ++ return next == -ESC_D || next == -ESC_s || next == -ESC_W; ++ ++ case OP_NOT_DIGIT: ++ return next == -ESC_d; ++ ++ case OP_WHITESPACE: ++ return next == -ESC_S || next == -ESC_d || next == -ESC_w; ++ ++ case OP_NOT_WHITESPACE: ++ return next == -ESC_s; ++ ++ case OP_WORDCHAR: ++ return next == -ESC_W || next == -ESC_s; ++ ++ case OP_NOT_WORDCHAR: ++ return next == -ESC_w || next == -ESC_d; ++ ++ default: ++ return FALSE; ++ } ++ ++/* Control does not reach here */ ++} ++ ++ ++ + /************************************************* + * Compile one branch * + *************************************************/ + +-/* Scan the pattern, compiling it into the code vector. If the options are ++/* Scan the pattern, compiling it into a vector. If the options are + changed during the branch, the pointer is used to change the external options +-bits. ++bits. This function is used during the pre-compile phase when we are trying ++to find out the amount of memory needed, as well as during the real compile ++phase. The value of lengthptr distinguishes the two phases. 
+ + Arguments: + optionsptr pointer to the option bits +- brackets points to number of extracting brackets used + codeptr points to the pointer to the current code point + ptrptr points to the current pattern pointer + errorcodeptr points to error code variable +@@ -1594,15 +2005,17 @@ + reqbyteptr set to the last literal character required, else < 0 + bcptr points to current branch chain + cd contains pointers to tables etc. ++ lengthptr NULL during the real compile phase ++ points to length accumulator during pre-compile phase + + Returns: TRUE on success + FALSE, with *errorcodeptr set non-zero on error + */ + + static BOOL +-compile_branch(int *optionsptr, int *brackets, uschar **codeptr, +- const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr, +- int *reqbyteptr, branch_chain *bcptr, compile_data *cd) ++compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, ++ int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, ++ compile_data *cd, int *lengthptr) + { + int repeat_type, op_type; + int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ +@@ -1613,8 +2026,11 @@ + int req_caseopt, reqvary, tempreqvary; + int options = *optionsptr; + int after_manual_callout = 0; ++int length_prevgroup = 0; + register int c; + register uschar *code = *codeptr; ++uschar *last_code = code; ++uschar *orig_code = code; + uschar *tempcode; + BOOL inescq = FALSE; + BOOL groupsetfirstbyte = FALSE; +@@ -1622,6 +2038,7 @@ + const uschar *tempptr; + uschar *previous = NULL; + uschar *previous_callout = NULL; ++uschar *save_hwm = NULL; + uschar classbits[32]; + + #ifdef SUPPORT_UTF8 +@@ -1631,6 +2048,11 @@ + uschar utf8_char[6]; + #else + BOOL utf8 = FALSE; ++uschar *utf8_char = NULL; ++#endif ++ ++#ifdef DEBUG ++if (lengthptr != NULL) DPRINTF((">> start branch\n")); + #endif + + /* Set up the default and non-default settings for greediness */ +@@ -1664,6 +2086,7 @@ + BOOL negate_class; + BOOL possessive_quantifier; + BOOL 
is_quantifier; ++ BOOL is_recurse; + int class_charcount; + int class_lastchar; + int newoptions; +@@ -1671,13 +2094,68 @@ + int skipbytes; + int subreqbyte; + int subfirstbyte; ++ int terminator; + int mclength; + uschar mcbuffer[8]; + +- /* Next byte in the pattern */ ++ /* Get next byte in the pattern */ + + c = *ptr; + ++ /* If we are in the pre-compile phase, accumulate the length used for the ++ previous cycle of this loop. */ ++ ++ if (lengthptr != NULL) ++ { ++#ifdef DEBUG ++ if (code > cd->hwm) cd->hwm = code; /* High water info */ ++#endif ++ if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */ ++ { ++ *errorcodeptr = ERR52; ++ goto FAILED; ++ } ++ ++ /* There is at least one situation where code goes backwards: this is the ++ case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, ++ the class is simply eliminated. However, it is created first, so we have to ++ allow memory for it. Therefore, don't ever reduce the length at this point. ++ */ ++ ++ if (code < last_code) code = last_code; ++ *lengthptr += code - last_code; ++ DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); ++ ++ /* If "previous" is set and it is not at the start of the work space, move ++ it back to there, in order to avoid filling up the work space. Otherwise, ++ if "previous" is NULL, reset the current code pointer to the start. */ ++ ++ if (previous != NULL) ++ { ++ if (previous > orig_code) ++ { ++ memmove(orig_code, previous, code - previous); ++ code -= previous - orig_code; ++ previous = orig_code; ++ } ++ } ++ else code = orig_code; ++ ++ /* Remember where this code item starts so we can pick up the length ++ next time round. */ ++ ++ last_code = code; ++ } ++ ++ /* In the real compile phase, just check the workspace used by the forward ++ reference list. 
*/ ++ ++ else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) ++ { ++ *errorcodeptr = ERR52; ++ goto FAILED; ++ } ++ + /* If in \Q...\E, check for the end; if not, we have a literal */ + + if (inescq && c != 0) +@@ -1692,7 +2170,8 @@ + { + if (previous_callout != NULL) + { +- complete_callout(previous_callout, ptr, cd); ++ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ ++ complete_callout(previous_callout, ptr, cd); + previous_callout = NULL; + } + if ((options & PCRE_AUTO_CALLOUT) != 0) +@@ -1713,7 +2192,8 @@ + if (!is_quantifier && previous_callout != NULL && + after_manual_callout-- <= 0) + { +- complete_callout(previous_callout, ptr, cd); ++ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ ++ complete_callout(previous_callout, ptr, cd); + previous_callout = NULL; + } + +@@ -1724,12 +2204,12 @@ + if ((cd->ctypes[c] & ctype_space) != 0) continue; + if (c == '#') + { +- while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; +- if (*ptr != 0) ++ while (*(++ptr) != 0) + { +- ptr += cd->nllen - 1; +- continue; ++ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } + } ++ if (*ptr != 0) continue; ++ + /* Else fall through to handle end of string */ + c = 0; + } +@@ -1745,17 +2225,23 @@ + + switch(c) + { +- /* The branch terminates at end of string, |, or ). */ +- +- case 0: +- case '|': ++ /* ===================================================================*/ ++ case 0: /* The branch terminates at string end */ ++ case '|': /* or | or ) */ + case ')': + *firstbyteptr = firstbyte; + *reqbyteptr = reqbyte; + *codeptr = code; + *ptrptr = ptr; ++ if (lengthptr != NULL) ++ { ++ *lengthptr += code - last_code; /* To include callout length */ ++ DPRINTF((">> end branch\n")); ++ } + return TRUE; + ++ ++ /* ===================================================================*/ + /* Handle single-character metacharacters. In multiline mode, ^ disables + the setting of any following char as a first character. 
*/ + +@@ -1784,6 +2270,8 @@ + *code++ = OP_ANY; + break; + ++ ++ /* ===================================================================*/ + /* Character classes. If the included characters are all < 256, we build a + 32-byte bitmap of the permitted characters, except in the special case + where there is only one such character. For negated classes, we build the +@@ -1822,32 +2310,32 @@ + } + + /* Keep a count of chars with values < 256 so that we can optimize the case +- of just a single character (as long as it's < 256). For higher valued UTF-8 +- characters, we don't yet do any optimization. */ ++ of just a single character (as long as it's < 256). However, For higher ++ valued UTF-8 characters, we don't yet do any optimization. */ + + class_charcount = 0; + class_lastchar = -1; + ++ /* Initialize the 32-char bit map to all zeros. We build the map in a ++ temporary bit of memory, in case the class contains only 1 character (less ++ than 256), because in that case the compiled code doesn't use the bit map. ++ */ ++ ++ memset(classbits, 0, 32 * sizeof(uschar)); ++ + #ifdef SUPPORT_UTF8 + class_utf8 = FALSE; /* No chars >= 256 */ +- class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */ ++ class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ + #endif + +- /* Initialize the 32-char bit map to all zeros. We have to build the +- map in a temporary bit of store, in case the class contains only 1 +- character (< 256), because in that case the compiled code doesn't use the +- bit map. */ +- +- memset(classbits, 0, 32 * sizeof(uschar)); +- + /* Process characters until ] is reached. By writing this as a "do" it +- means that an initial ] is taken as a data character. The first pass +- through the regex checked the overall syntax, so we don't need to be very +- strict here. At the start of the loop, c contains the first byte of the +- character. */ ++ means that an initial ] is taken as a data character. 
At the start of the ++ loop, c contains the first byte of the character. */ + +- do ++ if (c != 0) do + { ++ const uschar *oldptr; ++ + #ifdef SUPPORT_UTF8 + if (utf8 && c > 127) + { /* Braces are required because the */ +@@ -1859,13 +2347,13 @@ + + if (inescq) + { +- if (c == '\\' && ptr[1] == 'E') ++ if (c == '\\' && ptr[1] == 'E') /* If we are at \E */ + { +- inescq = FALSE; +- ptr++; +- continue; ++ inescq = FALSE; /* Reset literal state */ ++ ptr++; /* Skip the 'E' */ ++ continue; /* Carry on with next */ + } +- else goto LONE_SINGLE_CHARACTER; ++ goto CHECK_RANGE; /* Could be range if \E follows */ + } + + /* Handle POSIX class names. Perl allows a negation extension of the +@@ -1956,19 +2444,20 @@ + } + + /* Backslash may introduce a single character, or it may introduce one +- of the specials, which just set a flag. Escaped items are checked for +- validity in the pre-compiling pass. The sequence \b is a special case. +- Inside a class (and only there) it is treated as backspace. Elsewhere +- it marks a word boundary. Other escapes have preset maps ready to +- or into the one we are building. We assume they have more than one ++ of the specials, which just set a flag. The sequence \b is a special ++ case. Inside a class (and only there) it is treated as backspace. ++ Elsewhere it marks a word boundary. Other escapes have preset maps ready ++ to or into the one we are building. We assume they have more than one + character in them, so set class_charcount bigger than one. 
*/ + + if (c == '\\') + { +- c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE); ++ c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); ++ if (*errorcodeptr != 0) goto FAILED; + + if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */ + else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ ++ else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ + else if (-c == ESC_Q) /* Handle start of quoted string */ + { + if (ptr[1] == '\\' && ptr[2] == 'E') +@@ -1983,7 +2472,10 @@ + { + register const uschar *cbits = cd->cbits; + class_charcount += 2; /* Greater than 1 is what matters */ +- switch (-c) ++ ++ /* Save time by not doing this in the pre-compile phase. */ ++ ++ if (lengthptr == NULL) switch (-c) + { + case ESC_d: + for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; +@@ -2011,52 +2503,91 @@ + classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ + continue; + +-#ifdef SUPPORT_UCP +- case ESC_p: +- case ESC_P: +- { +- BOOL negated; +- int pdata; +- int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); +- if (ptype < 0) goto FAILED; +- class_utf8 = TRUE; +- *class_utf8data++ = ((-c == ESC_p) != negated)? +- XCL_PROP : XCL_NOTPROP; +- *class_utf8data++ = ptype; +- *class_utf8data++ = pdata; +- class_charcount -= 2; /* Not a < 256 character */ +- } ++ case ESC_E: /* Perl ignores an orphan \E */ + continue; +-#endif +- +- /* Unrecognized escapes are faulted if PCRE is running in its +- strict mode. By default, for compatibility with Perl, they are +- treated as literals. */ + +- default: +- if ((options & PCRE_EXTRA) != 0) +- { +- *errorcodeptr = ERR7; +- goto FAILED; +- } +- c = *ptr; /* The final character */ +- class_charcount -= 2; /* Undo the default count from above */ ++ default: /* Not recognized; fall through */ ++ break; /* Need "default" setting to stop compiler warning. */ + } +- } + +- /* Fall through if we have a single character (c >= 0). This may be +- > 256 in UTF-8 mode. 
*/ ++ /* In the pre-compile phase, just do the recognition. */ + +- } /* End of backslash handling */ ++ else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || ++ c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; ++ ++ /* We need to deal with \P and \p in both phases. */ ++ ++#ifdef SUPPORT_UCP ++ if (-c == ESC_p || -c == ESC_P) ++ { ++ BOOL negated; ++ int pdata; ++ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); ++ if (ptype < 0) goto FAILED; ++ class_utf8 = TRUE; ++ *class_utf8data++ = ((-c == ESC_p) != negated)? ++ XCL_PROP : XCL_NOTPROP; ++ *class_utf8data++ = ptype; ++ *class_utf8data++ = pdata; ++ class_charcount -= 2; /* Not a < 256 character */ ++ continue; ++ } ++#endif ++ /* Unrecognized escapes are faulted if PCRE is running in its ++ strict mode. By default, for compatibility with Perl, they are ++ treated as literals. */ ++ ++ if ((options & PCRE_EXTRA) != 0) ++ { ++ *errorcodeptr = ERR7; ++ goto FAILED; ++ } ++ ++ class_charcount -= 2; /* Undo the default count from above */ ++ c = *ptr; /* Get the final character and fall through */ ++ } ++ ++ /* Fall through if we have a single character (c >= 0). This may be ++ greater than 256 in UTF-8 mode. */ ++ ++ } /* End of backslash handling */ + + /* A single character may be followed by '-' to form a range. However, + Perl does not permit ']' to be the end of the range. A '-' character +- here is treated as a literal. */ ++ at the end is treated as a literal. Perl ignores orphaned \E sequences ++ entirely. The code for handling \Q and \E is messy. */ ++ ++ CHECK_RANGE: ++ while (ptr[1] == '\\' && ptr[2] == 'E') ++ { ++ inescq = FALSE; ++ ptr += 2; ++ } ++ ++ oldptr = ptr; + +- if (ptr[1] == '-' && ptr[2] != ']') ++ if (!inescq && ptr[1] == '-') + { + int d; + ptr += 2; ++ while (*ptr == '\\' && ptr[1] == 'E') ptr += 2; ++ ++ /* If we hit \Q (not followed by \E) at this point, go into escaped ++ mode. 
*/ ++ ++ while (*ptr == '\\' && ptr[1] == 'Q') ++ { ++ ptr += 2; ++ if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } ++ inescq = TRUE; ++ break; ++ } ++ ++ if (*ptr == 0 || (!inescq && *ptr == ']')) ++ { ++ ptr = oldptr; ++ goto LONE_SINGLE_CHARACTER; ++ } + + #ifdef SUPPORT_UTF8 + if (utf8) +@@ -2071,27 +2602,34 @@ + not any of the other escapes. Perl 5.6 treats a hyphen as a literal + in such circumstances. */ + +- if (d == '\\') ++ if (!inescq && d == '\\') + { +- const uschar *oldptr = ptr; +- d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE); ++ d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); ++ if (*errorcodeptr != 0) goto FAILED; + +- /* \b is backslash; \X is literal X; any other special means the '-' +- was literal */ ++ /* \b is backslash; \X is literal X; \R is literal R; any other ++ special means the '-' was literal */ + + if (d < 0) + { + if (d == -ESC_b) d = '\b'; +- else if (d == -ESC_X) d = 'X'; else ++ else if (d == -ESC_X) d = 'X'; ++ else if (d == -ESC_R) d = 'R'; else + { +- ptr = oldptr - 2; ++ ptr = oldptr; + goto LONE_SINGLE_CHARACTER; /* A few lines below */ + } + } + } + +- /* The check that the two values are in the correct order happens in +- the pre-pass. Optimize one-character ranges */ ++ /* Check that the two values are in the correct order. Optimize ++ one-character ranges */ ++ ++ if (d < c) ++ { ++ *errorcodeptr = ERR8; ++ goto FAILED; ++ } + + if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ + +@@ -2112,9 +2650,9 @@ + #ifdef SUPPORT_UCP + if ((options & PCRE_CASELESS) != 0) + { +- int occ, ocd; +- int cc = c; +- int origd = d; ++ unsigned int occ, ocd; ++ unsigned int cc = c; ++ unsigned int origd = d; + while (get_othercase_range(&cc, origd, &occ, &ocd)) + { + if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */ +@@ -2172,7 +2710,12 @@ + ranges that lie entirely within 0-127 when there is UCP support; else + for partial ranges without UCP support. 
*/ + +- for (; c <= d; c++) ++ class_charcount += d - c + 1; ++ class_lastchar = d; ++ ++ /* We can save a bit of time by skipping this in the pre-compile. */ ++ ++ if (lengthptr == NULL) for (; c <= d; c++) + { + classbits[c/8] |= (1 << (c&7)); + if ((options & PCRE_CASELESS) != 0) +@@ -2180,8 +2723,6 @@ + int uc = cd->fcc[c]; /* flip case */ + classbits[uc/8] |= (1 << (uc&7)); + } +- class_charcount++; /* in case a one-char range */ +- class_lastchar = c; + } + + continue; /* Go get the next char in the class */ +@@ -2205,8 +2746,8 @@ + #ifdef SUPPORT_UCP + if ((options & PCRE_CASELESS) != 0) + { +- int othercase; +- if ((othercase = _pcre_ucp_othercase(c)) >= 0) ++ unsigned int othercase; ++ if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) + { + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); +@@ -2231,10 +2772,15 @@ + } + } + +- /* Loop until ']' reached; the check for end of string happens inside the +- loop. This "while" is the end of the "do" above. */ ++ /* Loop until ']' reached. This "while" is the end of the "do" above. */ + +- while ((c = *(++ptr)) != ']' || inescq); ++ while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); ++ ++ if (c == 0) /* Missing terminating ']' */ ++ { ++ *errorcodeptr = ERR6; ++ goto FAILED; ++ } + + /* If class_charcount is 1, we saw precisely one character whose value is + less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we +@@ -2298,7 +2844,7 @@ + + /* If there are characters with values > 255, we have to compile an + extended class, with its own opcode. If there are no characters < 256, +- we can omit the bitmap. */ ++ we can omit the bitmap in the actual compiled code. */ + + #ifdef SUPPORT_UTF8 + if (class_utf8) +@@ -2308,24 +2854,17 @@ + code += LINK_SIZE; + *code = negate_class? 
XCL_NOT : 0; + +- /* If the map is required, install it, and move on to the end of +- the extra data */ ++ /* If the map is required, move up the extra data to make room for it; ++ otherwise just move the code pointer to the end of the extra data. */ + + if (class_charcount > 0) + { + *code++ |= XCL_MAP; ++ memmove(code + 32, code, class_utf8data - code); + memcpy(code, classbits, 32); +- code = class_utf8data; +- } +- +- /* If the map is not required, slide down the extra data. */ +- +- else +- { +- int len = class_utf8data - (code + 33); +- memmove(code + 1, code + 33, len); +- code += len + 1; ++ code = class_utf8data + 32; + } ++ else code = class_utf8data; + + /* Now fill in the complete length of the item */ + +@@ -2342,7 +2881,8 @@ + if (negate_class) + { + *code++ = OP_NCLASS; +- for (c = 0; c < 32; c++) code[c] = ~classbits[c]; ++ if (lengthptr == NULL) /* Save time in the pre-compile phase */ ++ for (c = 0; c < 32; c++) code[c] = ~classbits[c]; + } + else + { +@@ -2352,6 +2892,8 @@ + code += 32; + break; + ++ ++ /* ===================================================================*/ + /* Various kinds of repeat; '{' is not necessarily a quantifier, but this + has been tested above. */ + +@@ -2419,20 +2961,6 @@ + } + else repeat_type = greedy_default; + +- /* If previous was a recursion, we need to wrap it inside brackets so that +- it can be replicated if necessary. */ +- +- if (*previous == OP_RECURSE) +- { +- memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); +- code += 1 + LINK_SIZE; +- *previous = OP_BRA; +- PUT(previous, 1, code - previous); +- *code = OP_KET; +- PUT(code, 1, code - previous); +- code += 1 + LINK_SIZE; +- } +- + /* If previous was a character match, abolish the item and generate a + repeat item instead. 
If a char item has a minumum of more than one, ensure + that it is set in reqbyte - it might not be if a sequence such as x{3} is +@@ -2466,18 +2994,40 @@ + if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; + } + ++ /* If the repetition is unlimited, it pays to see if the next thing on ++ the line is something that cannot possibly match this character. If so, ++ automatically possessifying this item gains some performance in the case ++ where the match fails. */ ++ ++ if (!possessive_quantifier && ++ repeat_max < 0 && ++ check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, ++ options, cd)) ++ { ++ repeat_type = 0; /* Force greedy */ ++ possessive_quantifier = TRUE; ++ } ++ + goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ + } + + /* If previous was a single negated character ([^a] or similar), we use + one of the special opcodes, replacing it. The code is shared with single- + character repeats by setting opt_type to add a suitable offset into +- repeat_type. OP_NOT is currently used only for single-byte chars. */ ++ repeat_type. We can also test for auto-possessification. OP_NOT is ++ currently used only for single-byte chars. 
*/ + + else if (*previous == OP_NOT) + { + op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ + c = previous[1]; ++ if (!possessive_quantifier && ++ repeat_max < 0 && ++ check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) ++ { ++ repeat_type = 0; /* Force greedy */ ++ possessive_quantifier = TRUE; ++ } + goto OUTPUT_SINGLE_REPEAT; + } + +@@ -2495,6 +3045,14 @@ + op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ + c = *previous; + ++ if (!possessive_quantifier && ++ repeat_max < 0 && ++ check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) ++ { ++ repeat_type = 0; /* Force greedy */ ++ possessive_quantifier = TRUE; ++ } ++ + OUTPUT_SINGLE_REPEAT: + if (*previous == OP_PROP || *previous == OP_NOTPROP) + { +@@ -2535,7 +3093,7 @@ + } + + /* A repeat minimum of 1 is optimized into some special cases. If the +- maximum is unlimited, we use OP_PLUS. Otherwise, the original item it ++ maximum is unlimited, we use OP_PLUS. Otherwise, the original item is + left in place and, if the maximum is greater than 1, we use OP_UPTO with + one less than the maximum. */ + +@@ -2588,7 +3146,8 @@ + } + + /* Else insert an UPTO if the max is greater than the min, again +- preceded by the character, for the previously inserted code. */ ++ preceded by the character, for the previously inserted code. If the ++ UPTO is just for 1 instance, we can use QUERY instead. */ + + else if (repeat_max != repeat_min) + { +@@ -2607,8 +3166,16 @@ + *code++ = prop_value; + } + repeat_max -= repeat_min; +- *code++ = OP_UPTO + repeat_type; +- PUT2INC(code, 0, repeat_max); ++ ++ if (repeat_max == 1) ++ { ++ *code++ = OP_QUERY + repeat_type; ++ } ++ else ++ { ++ *code++ = OP_UPTO + repeat_type; ++ PUT2INC(code, 0, repeat_max); ++ } + } + } + +@@ -2675,14 +3242,30 @@ + /* If previous was a bracket group, we may have to replicate it in certain + cases. 
*/ + +- else if (*previous >= OP_BRA || *previous == OP_ONCE || +- *previous == OP_COND) ++ else if (*previous == OP_BRA || *previous == OP_CBRA || ++ *previous == OP_ONCE || *previous == OP_COND) + { + register int i; + int ketoffset = 0; + int len = code - previous; + uschar *bralink = NULL; + ++ /* Repeating a DEFINE group is pointless */ ++ ++ if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) ++ { ++ *errorcodeptr = ERR55; ++ goto FAILED; ++ } ++ ++ /* This is a paranoid check to stop integer overflow later on */ ++ ++ if (len > MAX_DUPLENGTH) ++ { ++ *errorcodeptr = ERR50; ++ goto FAILED; ++ } ++ + /* If the maximum repeat count is unlimited, find the end of the bracket + by scanning through from the start, and compute the offset back to it + from the current code pointer. There may be an OP_OPT setting following +@@ -2717,13 +3300,14 @@ + /* If the maximum is 1 or unlimited, we just have to stick in the + BRAZERO and do no more at this point. However, we do need to adjust + any OP_RECURSE calls inside the group that refer to the group itself or +- any internal group, because the offset is from the start of the whole +- regex. Temporarily terminate the pattern while doing this. */ ++ any internal or forward referenced group, because the offset is from ++ the start of the whole regex. Temporarily terminate the pattern while ++ doing this. 
*/ + + if (repeat_max <= 1) + { + *code = OP_END; +- adjust_recurse(previous, 1, utf8, cd); ++ adjust_recurse(previous, 1, utf8, cd, save_hwm); + memmove(previous+1, previous, len); + code++; + *previous++ = OP_BRAZERO + repeat_type; +@@ -2741,7 +3325,7 @@ + { + int offset; + *code = OP_END; +- adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd); ++ adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); + memmove(previous + 2 + LINK_SIZE, previous, len); + code += 2 + LINK_SIZE; + *previous++ = OP_BRAZERO + repeat_type; +@@ -2761,19 +3345,41 @@ + /* If the minimum is greater than zero, replicate the group as many + times as necessary, and adjust the maximum to the number of subsequent + copies that we need. If we set a first char from the group, and didn't +- set a required char, copy the latter from the former. */ ++ set a required char, copy the latter from the former. If there are any ++ forward reference subroutine calls in the group, there will be entries on ++ the workspace list; replicate these with an appropriate increment. */ + + else + { + if (repeat_min > 1) + { +- if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; +- for (i = 1; i < repeat_min; i++) ++ /* In the pre-compile phase, we don't actually do the replication. We ++ just adjust the length as if we had. 
*/ ++ ++ if (lengthptr != NULL) ++ *lengthptr += (repeat_min - 1)*length_prevgroup; ++ ++ /* This is compiling for real */ ++ ++ else + { +- memcpy(code, previous, len); +- code += len; ++ if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; ++ for (i = 1; i < repeat_min; i++) ++ { ++ uschar *hc; ++ uschar *this_hwm = cd->hwm; ++ memcpy(code, previous, len); ++ for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) ++ { ++ PUT(cd->hwm, 0, GET(hc, 0) + len); ++ cd->hwm += LINK_SIZE; ++ } ++ save_hwm = this_hwm; ++ code += len; ++ } + } + } ++ + if (repeat_max > 0) repeat_max -= repeat_min; + } + +@@ -2781,12 +3387,27 @@ + the maximum is limited, it replicates the group in a nested fashion, + remembering the bracket starts on a stack. In the case of a zero minimum, + the first one was set up above. In all cases the repeat_max now specifies +- the number of additional copies needed. */ ++ the number of additional copies needed. Again, we must remember to ++ replicate entries on the forward reference list. */ + + if (repeat_max >= 0) + { +- for (i = repeat_max - 1; i >= 0; i--) ++ /* In the pre-compile phase, we don't actually do the replication. We ++ just adjust the length as if we had. For each repetition we must add 1 ++ to the length for BRAZERO and for all but the last repetition we must ++ add 2 + 2*LINKSIZE to allow for the nesting that occurs. */ ++ ++ if (lengthptr != NULL && repeat_max > 0) ++ *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - ++ 2 - 2*LINK_SIZE; /* Last one doesn't nest */ ++ ++ /* This is compiling for real */ ++ ++ else for (i = repeat_max - 1; i >= 0; i--) + { ++ uschar *hc; ++ uschar *this_hwm = cd->hwm; ++ + *code++ = OP_BRAZERO + repeat_type; + + /* All but the final copy start a new nesting, maintaining the +@@ -2802,6 +3423,12 @@ + } + + memcpy(code, previous, len); ++ for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) ++ { ++ PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 
2+LINK_SIZE : 1)); ++ cd->hwm += LINK_SIZE; ++ } ++ save_hwm = this_hwm; + code += len; + } + +@@ -2824,9 +3451,34 @@ + /* If the maximum is unlimited, set a repeater in the final copy. We + can't just offset backwards from the current code point, because we + don't know if there's been an options resetting after the ket. The +- correct offset was computed above. */ ++ correct offset was computed above. ++ ++ Then, when we are doing the actual compile phase, check to see whether ++ this group is a non-atomic one that could match an empty string. If so, ++ convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so ++ that runtime checking can be done. [This check is also applied to ++ atomic groups at runtime, but in a different way.] */ + +- else code[-ketoffset] = OP_KETRMAX + repeat_type; ++ else ++ { ++ uschar *ketcode = code - ketoffset; ++ uschar *bracode = ketcode - GET(ketcode, 1); ++ *ketcode = OP_KETRMAX + repeat_type; ++ if (lengthptr == NULL && *bracode != OP_ONCE) ++ { ++ uschar *scode = bracode; ++ do ++ { ++ if (could_be_empty_branch(scode, ketcode, utf8)) ++ { ++ *bracode += OP_SBRA - OP_BRA; ++ break; ++ } ++ scode += GET(scode, 1); ++ } ++ while (*scode == OP_ALT); ++ } ++ } + } + + /* Else there's some kind of shambles */ +@@ -2837,22 +3489,53 @@ + goto FAILED; + } + +- /* If the character following a repeat is '+', we wrap the entire repeated +- item inside OP_ONCE brackets. This is just syntactic sugar, taken from +- Sun's Java package. The repeated item starts at tempcode, not at previous, +- which might be the first part of a string whose (former) last char we +- repeated. However, we don't support '+' after a greediness '?'. */ ++ /* If the character following a repeat is '+', or if certain optimization ++ tests above succeeded, possessive_quantifier is TRUE. For some of the ++ simpler opcodes, there is an special alternative opcode for this. For ++ anything else, we wrap the entire repeated item inside OP_ONCE brackets. 
++ The '+' notation is just syntactic sugar, taken from Sun's Java package, ++ but the special opcodes can optimize it a bit. The repeated item starts at ++ tempcode, not at previous, which might be the first part of a string whose ++ (former) last char we repeated. ++ ++ Possessifying an 'exact' quantifier has no effect, so we can ignore it. But ++ an 'upto' may follow. We skip over an 'exact' item, and then test the ++ length of what remains before proceeding. */ + + if (possessive_quantifier) + { +- int len = code - tempcode; +- memmove(tempcode + 1+LINK_SIZE, tempcode, len); +- code += 1 + LINK_SIZE; +- len += 1 + LINK_SIZE; +- tempcode[0] = OP_ONCE; +- *code++ = OP_KET; +- PUTINC(code, 0, len); +- PUT(tempcode, 1, len); ++ int len; ++ if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || ++ *tempcode == OP_NOTEXACT) ++ tempcode += _pcre_OP_lengths[*tempcode]; ++ len = code - tempcode; ++ if (len > 0) switch (*tempcode) ++ { ++ case OP_STAR: *tempcode = OP_POSSTAR; break; ++ case OP_PLUS: *tempcode = OP_POSPLUS; break; ++ case OP_QUERY: *tempcode = OP_POSQUERY; break; ++ case OP_UPTO: *tempcode = OP_POSUPTO; break; ++ ++ case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break; ++ case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break; ++ case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break; ++ case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break; ++ ++ case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break; ++ case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break; ++ case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; ++ case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; ++ ++ default: ++ memmove(tempcode + 1+LINK_SIZE, tempcode, len); ++ code += 1 + LINK_SIZE; ++ len += 1 + LINK_SIZE; ++ tempcode[0] = OP_ONCE; ++ *code++ = OP_KET; ++ PUTINC(code, 0, len); ++ PUT(tempcode, 1, len); ++ break; ++ } + } + + /* In all case we no longer have a previous item. 
We also set the +@@ -2865,162 +3548,275 @@ + break; + + +- /* Start of nested bracket sub-expression, or comment or lookahead or +- lookbehind or option setting or condition. First deal with special things +- that can come after a bracket; all are introduced by ?, and the appearance +- of any of them means that this is not a referencing group. They were +- checked for validity in the first pass over the string, so we don't have to +- check for syntax errors here. */ ++ /* ===================================================================*/ ++ /* Start of nested parenthesized sub-expression, or comment or lookahead or ++ lookbehind or option setting or condition or all the other extended ++ parenthesis forms. First deal with the specials; all are introduced by ?, ++ and the appearance of any of them means that this is not a capturing ++ group. */ + + case '(': + newoptions = options; + skipbytes = 0; ++ bravalue = OP_CBRA; ++ save_hwm = cd->hwm; + + if (*(++ptr) == '?') + { +- int set, unset; ++ int i, set, unset, namelen; + int *optset; ++ const uschar *name; ++ uschar *slot; + + switch (*(++ptr)) + { + case '#': /* Comment; skip to ket */ + ptr++; +- while (*ptr != ')') ptr++; ++ while (*ptr != 0 && *ptr != ')') ptr++; ++ if (*ptr == 0) ++ { ++ *errorcodeptr = ERR18; ++ goto FAILED; ++ } + continue; + +- case ':': /* Non-extracting bracket */ ++ ++ /* ------------------------------------------------------------ */ ++ case ':': /* Non-capturing bracket */ + bravalue = OP_BRA; + ptr++; + break; + ++ ++ /* ------------------------------------------------------------ */ + case '(': + bravalue = OP_COND; /* Conditional group */ + +- /* A condition can be a number, referring to a numbered group, a name, +- referring to a named group, 'R', referring to recursion, or an +- assertion. There are two unfortunate ambiguities, caused by history. +- (a) 'R' can be the recursive thing or the name 'R', and (b) a number +- could be a name that consists of digits. 
In both cases, we look for a +- name first; if not found, we try the other cases. If the first +- character after (?( is a word character, we know the rest up to ) will +- also be word characters because the syntax was checked in the first +- pass. */ +- +- if ((cd->ctypes[ptr[1]] & ctype_word) != 0) +- { +- int i, namelen; +- int condref = 0; +- const uschar *name; +- uschar *slot = cd->name_table; ++ /* A condition can be an assertion, a number (referring to a numbered ++ group), a name (referring to a named group), or 'R', referring to ++ recursion. R<digits> and R&name are also permitted for recursion tests. ++ ++ There are several syntaxes for testing a named group: (?(name)) is used ++ by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')). ++ ++ There are two unfortunate ambiguities, caused by history. (a) 'R' can ++ be the recursive thing or the name 'R' (and similarly for 'R' followed ++ by digits), and (b) a number could be a name that consists of digits. ++ In both cases, we look for a name first; if not found, we try the other ++ cases. */ ++ ++ /* For conditions that are assertions, check the syntax, and then exit ++ the switch. This will take control down to where bracketed groups, ++ including assertions, are processed. */ + +- /* This is needed for all successful cases. */ ++ if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<')) ++ break; + +- skipbytes = 3; ++ /* Most other conditions use OP_CREF (a couple change to OP_RREF ++ below), and all need to skip 3 bytes at the start of the group. */ + +- /* Read the name, but also get it as a number if it's all digits */ ++ code[1+LINK_SIZE] = OP_CREF; ++ skipbytes = 3; + +- name = ++ptr; +- while (*ptr != ')') +- { +- if (condref >= 0) +- condref = ((digitab[*ptr] & ctype_digit) != 0)? +- condref * 10 + *ptr - '0' : -1; +- ptr++; +- } +- namelen = ptr - name; ++ /* Check for a test for recursion in a named group. 
*/ ++ ++ if (ptr[1] == 'R' && ptr[2] == '&') ++ { ++ terminator = -1; ++ ptr += 2; ++ code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ ++ } ++ ++ /* Check for a test for a named group's having been set, using the Perl ++ syntax (?(<name>) or (?('name') */ ++ ++ else if (ptr[1] == '<') ++ { ++ terminator = '>'; + ptr++; ++ } ++ else if (ptr[1] == '\'') ++ { ++ terminator = '\''; ++ ptr++; ++ } ++ else terminator = 0; + +- for (i = 0; i < cd->names_found; i++) +- { +- if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; +- slot += cd->name_entry_size; +- } ++ /* We now expect to read a name; any thing else is an error */ + +- /* Found a previous named subpattern */ ++ if ((cd->ctypes[ptr[1]] & ctype_word) == 0) ++ { ++ ptr += 1; /* To get the right offset */ ++ *errorcodeptr = ERR28; ++ goto FAILED; ++ } + +- if (i < cd->names_found) +- { +- condref = GET2(slot, 0); +- code[1+LINK_SIZE] = OP_CREF; +- PUT2(code, 2+LINK_SIZE, condref); +- } ++ /* Read the name, but also get it as a number if it's all digits */ + +- /* Search the pattern for a forward reference */ ++ recno = 0; ++ name = ++ptr; ++ while ((cd->ctypes[*ptr] & ctype_word) != 0) ++ { ++ if (recno >= 0) ++ recno = ((digitab[*ptr] & ctype_digit) != 0)? ++ recno * 10 + *ptr - '0' : -1; ++ ptr++; ++ } ++ namelen = ptr - name; + +- else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0) +- { +- code[1+LINK_SIZE] = OP_CREF; +- PUT2(code, 2+LINK_SIZE, i); +- } ++ if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')') ++ { ++ ptr--; /* Error offset */ ++ *errorcodeptr = ERR26; ++ goto FAILED; ++ } + +- /* Check for 'R' for recursion */ ++ /* Do no further checking in the pre-compile phase. */ + +- else if (namelen == 1 && *name == 'R') +- { +- code[1+LINK_SIZE] = OP_CREF; +- PUT2(code, 2+LINK_SIZE, CREF_RECURSE); +- } ++ if (lengthptr != NULL) break; + +- /* Check for a subpattern number */ ++ /* In the real compile we do the work of looking for the actual ++ reference. 
*/ + +- else if (condref > 0) +- { +- code[1+LINK_SIZE] = OP_CREF; +- PUT2(code, 2+LINK_SIZE, condref); +- } ++ slot = cd->name_table; ++ for (i = 0; i < cd->names_found; i++) ++ { ++ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; ++ slot += cd->name_entry_size; ++ } + +- /* Either an unidentified subpattern, or a reference to (?(0) */ ++ /* Found a previous named subpattern */ + +- else ++ if (i < cd->names_found) ++ { ++ recno = GET2(slot, 0); ++ PUT2(code, 2+LINK_SIZE, recno); ++ } ++ ++ /* Search the pattern for a forward reference */ ++ ++ else if ((i = find_parens(ptr, cd->bracount, name, namelen, ++ (options & PCRE_EXTENDED) != 0)) > 0) ++ { ++ PUT2(code, 2+LINK_SIZE, i); ++ } ++ ++ /* If terminator == 0 it means that the name followed directly after ++ the opening parenthesis [e.g. (?(abc)...] and in this case there are ++ some further alternatives to try. For the cases where terminator != 0 ++ [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have ++ now checked all the possibilities, so give an error. */ ++ ++ else if (terminator != 0) ++ { ++ *errorcodeptr = ERR15; ++ goto FAILED; ++ } ++ ++ /* Check for (?(R) for recursion. Allow digits after R to specify a ++ specific group number. */ ++ ++ else if (*name == 'R') ++ { ++ recno = 0; ++ for (i = 1; i < namelen; i++) + { +- *errorcodeptr = (condref == 0)? ERR35: ERR15; +- goto FAILED; ++ if ((digitab[name[i]] & ctype_digit) == 0) ++ { ++ *errorcodeptr = ERR15; ++ goto FAILED; ++ } ++ recno = recno * 10 + name[i] - '0'; + } ++ if (recno == 0) recno = RREF_ANY; ++ code[1+LINK_SIZE] = OP_RREF; /* Change test type */ ++ PUT2(code, 2+LINK_SIZE, recno); ++ } ++ ++ /* Similarly, check for the (?(DEFINE) "condition", which is always ++ false. */ ++ ++ else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0) ++ { ++ code[1+LINK_SIZE] = OP_DEF; ++ skipbytes = 1; ++ } ++ ++ /* Check for the "name" actually being a subpattern number. 
*/ ++ ++ else if (recno > 0) ++ { ++ PUT2(code, 2+LINK_SIZE, recno); + } + +- /* For conditions that are assertions, we just fall through, having +- set bravalue above. */ ++ /* Either an unidentified subpattern, or a reference to (?(0) */ + ++ else ++ { ++ *errorcodeptr = (recno == 0)? ERR35: ERR15; ++ goto FAILED; ++ } + break; + ++ ++ /* ------------------------------------------------------------ */ + case '=': /* Positive lookahead */ + bravalue = OP_ASSERT; + ptr++; + break; + ++ ++ /* ------------------------------------------------------------ */ + case '!': /* Negative lookahead */ + bravalue = OP_ASSERT_NOT; + ptr++; + break; + +- case '<': /* Lookbehinds */ +- switch (*(++ptr)) ++ ++ /* ------------------------------------------------------------ */ ++ case '<': /* Lookbehind or named define */ ++ switch (ptr[1]) + { + case '=': /* Positive lookbehind */ + bravalue = OP_ASSERTBACK; +- ptr++; ++ ptr += 2; + break; + + case '!': /* Negative lookbehind */ + bravalue = OP_ASSERTBACK_NOT; +- ptr++; ++ ptr += 2; + break; ++ ++ default: /* Could be name define, else bad */ ++ if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME; ++ ptr++; /* Correct offset for error */ ++ *errorcodeptr = ERR24; ++ goto FAILED; + } + break; + ++ ++ /* ------------------------------------------------------------ */ + case '>': /* One-time brackets */ + bravalue = OP_ONCE; + ptr++; + break; + ++ ++ /* ------------------------------------------------------------ */ + case 'C': /* Callout - may be followed by digits; */ + previous_callout = code; /* Save for later completion */ + after_manual_callout = 1; /* Skip one item before completing */ +- *code++ = OP_CALLOUT; /* Already checked that the terminating */ +- { /* closing parenthesis is present. 
*/ ++ *code++ = OP_CALLOUT; ++ { + int n = 0; + while ((digitab[*(++ptr)] & ctype_digit) != 0) + n = n * 10 + *ptr - '0'; ++ if (*ptr != ')') ++ { ++ *errorcodeptr = ERR39; ++ goto FAILED; ++ } + if (n > 255) + { + *errorcodeptr = ERR38; +@@ -3034,134 +3830,232 @@ + previous = NULL; + continue; + +- case 'P': /* Named subpattern handling */ +- if (*(++ptr) == '<') /* Definition */ ++ ++ /* ------------------------------------------------------------ */ ++ case 'P': /* Python-style named subpattern handling */ ++ if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */ ++ { ++ is_recurse = *ptr == '>'; ++ terminator = ')'; ++ goto NAMED_REF_OR_RECURSE; ++ } ++ else if (*ptr != '<') /* Test for Python-style definition */ ++ { ++ *errorcodeptr = ERR41; ++ goto FAILED; ++ } ++ /* Fall through to handle (?P< as (?< is handled */ ++ ++ ++ /* ------------------------------------------------------------ */ ++ DEFINE_NAME: /* Come here from (?< handling */ ++ case '\'': + { +- int i, namelen; +- uschar *slot = cd->name_table; +- const uschar *name; /* Don't amalgamate; some compilers */ +- name = ++ptr; /* grumble at autoincrement in declaration */ ++ terminator = (*ptr == '<')? '>' : '\''; ++ name = ++ptr; ++ ++ while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; ++ namelen = ptr - name; + +- while (*ptr++ != '>'); +- namelen = ptr - name - 1; ++ /* In the pre-compile phase, just do a syntax check. 
*/ + +- for (i = 0; i < cd->names_found; i++) ++ if (lengthptr != NULL) ++ { ++ if (*ptr != terminator) ++ { ++ *errorcodeptr = ERR42; ++ goto FAILED; ++ } ++ if (cd->names_found >= MAX_NAME_COUNT) ++ { ++ *errorcodeptr = ERR49; ++ goto FAILED; ++ } ++ if (namelen + 3 > cd->name_entry_size) ++ { ++ cd->name_entry_size = namelen + 3; ++ if (namelen > MAX_NAME_SIZE) ++ { ++ *errorcodeptr = ERR48; ++ goto FAILED; ++ } ++ } ++ } ++ ++ /* In the real compile, create the entry in the table */ ++ ++ else + { +- int crc = memcmp(name, slot+2, namelen); +- if (crc == 0) ++ slot = cd->name_table; ++ for (i = 0; i < cd->names_found; i++) + { +- if (slot[2+namelen] == 0) ++ int crc = memcmp(name, slot+2, namelen); ++ if (crc == 0) + { +- if ((options & PCRE_DUPNAMES) == 0) ++ if (slot[2+namelen] == 0) + { +- *errorcodeptr = ERR43; +- goto FAILED; ++ if ((options & PCRE_DUPNAMES) == 0) ++ { ++ *errorcodeptr = ERR43; ++ goto FAILED; ++ } + } ++ else crc = -1; /* Current name is substring */ + } +- else crc = -1; /* Current name is substring */ +- } +- if (crc < 0) +- { +- memmove(slot + cd->name_entry_size, slot, +- (cd->names_found - i) * cd->name_entry_size); +- break; ++ if (crc < 0) ++ { ++ memmove(slot + cd->name_entry_size, slot, ++ (cd->names_found - i) * cd->name_entry_size); ++ break; ++ } ++ slot += cd->name_entry_size; + } +- slot += cd->name_entry_size; +- } + +- PUT2(slot, 0, *brackets + 1); +- memcpy(slot + 2, name, namelen); +- slot[2+namelen] = 0; +- cd->names_found++; +- goto NUMBERED_GROUP; ++ PUT2(slot, 0, cd->bracount + 1); ++ memcpy(slot + 2, name, namelen); ++ slot[2+namelen] = 0; ++ } + } + +- if (*ptr == '=' || *ptr == '>') /* Reference or recursion */ +- { +- int i, namelen; +- int type = *ptr++; +- const uschar *name = ptr; +- uschar *slot = cd->name_table; ++ /* In both cases, count the number of names we've encountered. 
*/ + +- while (*ptr != ')') ptr++; +- namelen = ptr - name; ++ ptr++; /* Move past > or ' */ ++ cd->names_found++; ++ goto NUMBERED_GROUP; + +- for (i = 0; i < cd->names_found; i++) ++ ++ /* ------------------------------------------------------------ */ ++ case '&': /* Perl recursion/subroutine syntax */ ++ terminator = ')'; ++ is_recurse = TRUE; ++ /* Fall through */ ++ ++ /* We come here from the Python syntax above that handles both ++ references (?P=name) and recursion (?P>name), as well as falling ++ through from the Perl recursion syntax (?&name). */ ++ ++ NAMED_REF_OR_RECURSE: ++ name = ++ptr; ++ while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; ++ namelen = ptr - name; ++ ++ /* In the pre-compile phase, do a syntax check and set a dummy ++ reference number. */ ++ ++ if (lengthptr != NULL) ++ { ++ if (*ptr != terminator) + { +- if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; +- slot += cd->name_entry_size; ++ *errorcodeptr = ERR42; ++ goto FAILED; + } +- +- if (i < cd->names_found) /* Back reference */ ++ if (namelen > MAX_NAME_SIZE) ++ { ++ *errorcodeptr = ERR48; ++ goto FAILED; ++ } ++ recno = 0; ++ } ++ ++ /* In the real compile, seek the name in the table */ ++ ++ else ++ { ++ slot = cd->name_table; ++ for (i = 0; i < cd->names_found; i++) ++ { ++ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; ++ slot += cd->name_entry_size; ++ } ++ ++ if (i < cd->names_found) /* Back reference */ + { + recno = GET2(slot, 0); + } + else if ((recno = /* Forward back reference */ +- find_named_parens(ptr, *brackets, name, namelen)) <= 0) ++ find_parens(ptr, cd->bracount, name, namelen, ++ (options & PCRE_EXTENDED) != 0)) <= 0) + { + *errorcodeptr = ERR15; + goto FAILED; + } ++ } + +- if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ +- +- /* Back reference */ ++ /* In both phases, we can now go to the code that handles numerical ++ recursion or backreferences.
*/ + +- previous = code; +- *code++ = OP_REF; +- PUT2INC(code, 0, recno); +- cd->backref_map |= (recno < 32)? (1 << recno) : 1; +- if (recno > cd->top_backref) cd->top_backref = recno; +- continue; +- } ++ if (is_recurse) goto HANDLE_RECURSION; ++ else goto HANDLE_REFERENCE; + +- /* Should never happen */ +- break; + +- case 'R': /* Pattern recursion */ ++ /* ------------------------------------------------------------ */ ++ case 'R': /* Recursion */ + ptr++; /* Same as (?0) */ + /* Fall through */ + +- /* Recursion or "subroutine" call */ + +- case '0': case '1': case '2': case '3': case '4': +- case '5': case '6': case '7': case '8': case '9': ++ /* ------------------------------------------------------------ */ ++ case '0': case '1': case '2': case '3': case '4': /* Recursion or */ ++ case '5': case '6': case '7': case '8': case '9': /* subroutine */ + { + const uschar *called; + recno = 0; + while((digitab[*ptr] & ctype_digit) != 0) + recno = recno * 10 + *ptr++ - '0'; ++ if (*ptr != ')') ++ { ++ *errorcodeptr = ERR29; ++ goto FAILED; ++ } + + /* Come here from code above that handles a named recursion */ + + HANDLE_RECURSION: + + previous = code; ++ called = cd->start_code; + +- /* Find the bracket that is being referenced. Temporarily end the +- regex in case it doesn't exist. */ ++ /* When we are actually compiling, find the bracket that is being ++ referenced. Temporarily end the regex in case it doesn't exist before ++ this point. If we end up with a forward reference, first check that ++ the bracket does occur later so we can give the error (and position) ++ now. Then remember this forward reference in the workspace so it can ++ be filled in at the end. */ + +- *code = OP_END; +- called = (recno == 0)? 
cd->start_code : +- find_bracket(cd->start_code, utf8, recno); +- if (called == NULL) ++ if (lengthptr == NULL) + { +- *errorcodeptr = ERR15; +- goto FAILED; +- } ++ *code = OP_END; ++ if (recno != 0) called = find_bracket(cd->start_code, utf8, recno); + +- /* If the subpattern is still open, this is a recursive call. We +- check to see if this is a left recursion that could loop for ever, +- and diagnose that case. */ ++ /* Forward reference */ + +- if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8)) +- { +- *errorcodeptr = ERR40; +- goto FAILED; ++ if (called == NULL) ++ { ++ if (find_parens(ptr, cd->bracount, NULL, recno, ++ (options & PCRE_EXTENDED) != 0) < 0) ++ { ++ *errorcodeptr = ERR15; ++ goto FAILED; ++ } ++ called = cd->start_code + recno; ++ PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); ++ } ++ ++ /* If not a forward reference, and the subpattern is still open, ++ this is a recursive call. We check to see if this is a left ++ recursion that could loop for ever, and diagnose that case. */ ++ ++ else if (GET(called, 1) == 0 && ++ could_be_empty(called, code, bcptr, utf8)) ++ { ++ *errorcodeptr = ERR40; ++ goto FAILED; ++ } + } + + /* Insert the recursion/subroutine item, automatically wrapped inside +- "once" brackets. */ ++ "once" brackets. Set up a "previous group" length so that a ++ subsequent quantifier will work. */ + + *code = OP_ONCE; + PUT(code, 1, 2 + 2*LINK_SIZE); +@@ -3174,12 +4068,18 @@ + *code = OP_KET; + PUT(code, 1, 2 + 2*LINK_SIZE); + code += 1 + LINK_SIZE; ++ ++ length_prevgroup = 3 + 3*LINK_SIZE; + } ++ ++ /* Can't determine a first byte now */ ++ ++ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + continue; + +- /* Character after (? 
not specially recognized */ + +- default: /* Option setting */ ++ /* ------------------------------------------------------------ */ ++ default: /* Other characters: check option setting */ + set = unset = 0; + optset = &set; + +@@ -3189,13 +4089,21 @@ + { + case '-': optset = &unset; break; + ++ case 'J': /* Record that it changed in the external options */ ++ *optset |= PCRE_DUPNAMES; ++ cd->external_options |= PCRE_JCHANGED; ++ break; ++ + case 'i': *optset |= PCRE_CASELESS; break; +- case 'J': *optset |= PCRE_DUPNAMES; break; + case 'm': *optset |= PCRE_MULTILINE; break; + case 's': *optset |= PCRE_DOTALL; break; + case 'x': *optset |= PCRE_EXTENDED; break; + case 'U': *optset |= PCRE_UNGREEDY; break; + case 'X': *optset |= PCRE_EXTRA; break; ++ ++ default: *errorcodeptr = ERR12; ++ ptr--; /* Correct the offset */ ++ goto FAILED; + } + } + +@@ -3204,32 +4112,54 @@ + newoptions = (options | set) & (~unset); + + /* If the options ended with ')' this is not the start of a nested +- group with option changes, so the options change at this level. Compile +- code to change the ims options if this setting actually changes any of +- them. We also pass the new setting back so that it can be put at the +- start of any following branches, and when this group ends (if we are in +- a group), a resetting item can be compiled. +- +- Note that if this item is right at the start of the pattern, the +- options will have been abstracted and made global, so there will be no +- change to compile. */ ++ group with option changes, so the options change at this level. If this ++ item is right at the start of the pattern, the options can be ++ abstracted and made external in the pre-compile phase, and ignored in ++ the compile phase. This can be helpful when matching -- for instance in ++ caseless checking of required bytes. ++ ++ If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are ++ definitely *not* at the start of the pattern because something has been ++ compiled. 
In the pre-compile phase, however, the code pointer can have ++ that value after the start, because it gets reset as code is discarded ++ during the pre-compile. However, this can happen only at top level - if ++ we are within parentheses, the starting BRA will still be present. At ++ any parenthesis level, the length value can be used to test if anything ++ has been compiled at that level. Thus, a test for both these conditions ++ is necessary to ensure we correctly detect the start of the pattern in ++ both phases. ++ ++ If we are not at the pattern start, compile code to change the ims ++ options if this setting actually changes any of them. We also pass the ++ new setting back so that it can be put at the start of any following ++ branches, and when this group ends (if we are in a group), a resetting ++ item can be compiled. */ + + if (*ptr == ')') + { +- if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) ++ if (code == cd->start_code + 1 + LINK_SIZE && ++ (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) + { +- *code++ = OP_OPT; +- *code++ = newoptions & PCRE_IMS; ++ cd->external_options = newoptions; ++ options = newoptions; + } ++ else ++ { ++ if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) ++ { ++ *code++ = OP_OPT; ++ *code++ = newoptions & PCRE_IMS; ++ } + +- /* Change options at this level, and pass them back for use +- in subsequent branches. Reset the greedy defaults and the case +- value for firstbyte and reqbyte. */ +- +- *optionsptr = options = newoptions; +- greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); +- greedy_non_default = greedy_default ^ 1; +- req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; ++ /* Change options at this level, and pass them back for use ++ in subsequent branches. Reset the greedy defaults and the case ++ value for firstbyte and reqbyte. 
*/ ++ ++ *optionsptr = options = newoptions; ++ greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); ++ greedy_non_default = greedy_default ^ 1; ++ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; ++ } + + previous = NULL; /* This item can't be repeated */ + continue; /* It is complete */ +@@ -3242,58 +4172,56 @@ + + bravalue = OP_BRA; + ptr++; +- } +- } ++ } /* End of switch for character following (? */ ++ } /* End of (? handling */ + +- /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become +- non-capturing and behave like (?:...) brackets */ ++ /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, ++ all unadorned brackets become non-capturing and behave like (?:...) ++ brackets. */ + + else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) + { + bravalue = OP_BRA; + } + +- /* Else we have a referencing group; adjust the opcode. If the bracket +- number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and +- arrange for the true number to follow later, in an OP_BRANUMBER item. */ ++ /* Else we have a capturing group. */ + + else + { + NUMBERED_GROUP: +- if (++(*brackets) > EXTRACT_BASIC_MAX) +- { +- bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1; +- code[1+LINK_SIZE] = OP_BRANUMBER; +- PUT2(code, 2+LINK_SIZE, *brackets); +- skipbytes = 3; +- } +- else bravalue = OP_BRA + *brackets; ++ cd->bracount += 1; ++ PUT2(code, 1+LINK_SIZE, cd->bracount); ++ skipbytes = 2; + } + +- /* Process nested bracketed re. Assertions may not be repeated, but other +- kinds can be. We copy code into a non-register variable in order to be able +- to pass its address because some compilers complain otherwise. Pass in a +- new setting for the ims options if they have changed. */ ++ /* Process nested bracketed regex. Assertions may not be repeated, but ++ other kinds can be. All their opcodes are >= OP_ONCE. 
We copy code into a ++ non-register variable in order to be able to pass its address because some ++ compilers complain otherwise. Pass in a new setting for the ims options if ++ they have changed. */ + + previous = (bravalue >= OP_ONCE)? code : NULL; + *code = bravalue; + tempcode = code; + tempreqvary = cd->req_varyopt; /* Save value before bracket */ ++ length_prevgroup = 0; /* Initialize for pre-compile phase */ + + if (!compile_regex( + newoptions, /* The complete new option state */ + options & PCRE_IMS, /* The previous ims option state */ +- brackets, /* Extracting bracket count */ + &tempcode, /* Where to put code (updated) */ + &ptr, /* Input pointer (updated) */ + errorcodeptr, /* Where to put an error message */ + (bravalue == OP_ASSERTBACK || + bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ +- skipbytes, /* Skip over OP_COND/OP_BRANUMBER */ ++ skipbytes, /* Skip over bracket number */ + &subfirstbyte, /* For possible first char */ + &subreqbyte, /* For possible last char */ + bcptr, /* Current branch chain */ +- cd)) /* Tables block */ ++ cd, /* Tables block */ ++ (lengthptr == NULL)? NULL : /* Actual compile phase */ ++ &length_prevgroup /* Pre-compile phase */ ++ )) + goto FAILED; + + /* At the end of compiling, code is still pointing to the start of the +@@ -3302,9 +4230,9 @@ + is on the bracket. */ + + /* If this is a conditional bracket, check that there are no more than +- two branches in the group. */ ++ two branches in the group, or just one if it's a DEFINE group. */ + +- else if (bravalue == OP_COND) ++ if (bravalue == OP_COND) + { + uschar *tc = code; + int condcount = 0; +@@ -3315,29 +4243,77 @@ + } + while (*tc != OP_KET); + +- if (condcount > 2) ++ /* A DEFINE group is never obeyed inline (the "condition" is always ++ false). It must have only one branch. 
*/ ++ ++ if (code[LINK_SIZE+1] == OP_DEF) + { +- *errorcodeptr = ERR27; +- goto FAILED; ++ if (condcount > 1) ++ { ++ *errorcodeptr = ERR54; ++ goto FAILED; ++ } ++ bravalue = OP_DEF; /* Just a flag to suppress char handling below */ ++ } ++ ++ /* A "normal" conditional group. If there is just one branch, we must not ++ make use of its firstbyte or reqbyte, because this is equivalent to an ++ empty second branch. */ ++ ++ else ++ { ++ if (condcount > 2) ++ { ++ *errorcodeptr = ERR27; ++ goto FAILED; ++ } ++ if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; + } ++ } ++ ++ /* Error if hit end of pattern */ + +- /* If there is just one branch, we must not make use of its firstbyte or +- reqbyte, because this is equivalent to an empty second branch. */ ++ if (*ptr != ')') ++ { ++ *errorcodeptr = ERR14; ++ goto FAILED; ++ } + +- if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; ++ /* In the pre-compile phase, update the length by the length of the nested ++ group, less the brackets at either end. Then reduce the compiled code to ++ just the brackets so that it doesn't use much memory if it is duplicated by ++ a quantifier. */ ++ ++ if (lengthptr != NULL) ++ { ++ *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; ++ code++; ++ PUTINC(code, 0, 1 + LINK_SIZE); ++ *code++ = OP_KET; ++ PUTINC(code, 0, 1 + LINK_SIZE); + } + +- /* Handle updating of the required and first characters. Update for normal +- brackets of all kinds, and conditions with two branches (see code above). +- If the bracket is followed by a quantifier with zero repeat, we have to +- back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the +- main loop so that they can be accessed for the back off. */ ++ /* Otherwise update the main code pointer to the end of the group. */ ++ ++ else code = tempcode; ++ ++ /* For a DEFINE group, required and first character settings are not ++ relevant. 
*/ ++ ++ if (bravalue == OP_DEF) break; ++ ++ /* Handle updating of the required and first characters for other types of ++ group. Update for normal brackets of all kinds, and conditions with two ++ branches (see code above). If the bracket is followed by a quantifier with ++ zero repeat, we have to back off. Hence the definition of zeroreqbyte and ++ zerofirstbyte outside the main loop so that they can be accessed for the ++ back off. */ + + zeroreqbyte = reqbyte; + zerofirstbyte = firstbyte; + groupsetfirstbyte = FALSE; + +- if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND) ++ if (bravalue >= OP_ONCE) + { + /* If we have not yet set a firstbyte in this branch, take it from the + subpattern, remembering that it was set here so that a repeat of more +@@ -3378,35 +4354,22 @@ + firstbyte, looking for an asserted first char. */ + + else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; ++ break; /* End of processing '(' */ + +- /* Now update the main code pointer to the end of the group. */ +- +- code = tempcode; +- +- /* Error if hit end of pattern */ +- +- if (*ptr != ')') +- { +- *errorcodeptr = ERR14; +- goto FAILED; +- } +- break; +- +- /* Check \ for being a real metacharacter; if not, fall through and handle +- it as a data character at the start of a string. Escape items are checked +- for validity in the pre-compiling pass. */ +- +- case '\\': +- tempptr = ptr; +- c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE); + +- /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values ++ /* ===================================================================*/ ++ /* Handle metasequences introduced by \. For ones like \d, the ESC_ values + are arranged to be the negation of the corresponding OP_values. For the + back references, the values are ESC_REF plus the reference number. Only + back references and those types that consume a character may be repeated. 
+ We can test for values between ESC_b and ESC_Z for the latter; this may + have to change if any new ones are ever created. */ + ++ case '\\': ++ tempptr = ptr; ++ c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); ++ if (*errorcodeptr != 0) goto FAILED; ++ + if (c < 0) + { + if (-c == ESC_Q) /* Handle start of quoted string */ +@@ -3416,6 +4379,8 @@ + continue; + } + ++ if (-c == ESC_E) continue; /* Perl ignores an orphan \E */ ++ + /* For metasequences that actually match a character, we disable the + setting of a first character if it hasn't already been set. */ + +@@ -3427,18 +4392,33 @@ + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + +- /* Back references are handled specially */ ++ /* \k<name> or \k'name' is a back reference by name (Perl syntax) */ ++ ++ if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'')) ++ { ++ is_recurse = FALSE; ++ terminator = (*(++ptr) == '<')? '>' : '\''; ++ goto NAMED_REF_OR_RECURSE; ++ } ++ ++ /* Back references are handled specially; must disable firstbyte if ++ not set to cope with cases like (?=(\w+))\1: which would otherwise set ++ ':' later. */ + + if (-c >= ESC_REF) + { +- int number = -c - ESC_REF; ++ recno = -c - ESC_REF; ++ ++ HANDLE_REFERENCE: /* Come here from named backref handling */ ++ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + previous = code; + *code++ = OP_REF; +- PUT2INC(code, 0, number); ++ PUT2INC(code, 0, recno); ++ cd->backref_map |= (recno < 32)? (1 << recno) : 1; ++ if (recno > cd->top_backref) cd->top_backref = recno; + } + +- /* So are Unicode property matches, if supported. We know that get_ucp +- won't fail because it was tested in the pre-pass. */ ++ /* So are Unicode property matches, if supported. */ + + #ifdef SUPPORT_UCP + else if (-c == ESC_P || -c == ESC_p) +@@ -3446,15 +4426,26 @@ + BOOL negated; + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); ++ if (ptype < 0) goto FAILED; + previous = code; + *code++ = ((-c == ESC_p) != negated)? 
OP_PROP : OP_NOTPROP; + *code++ = ptype; + *code++ = pdata; + } ++#else ++ ++ /* If Unicode properties are not supported, \X, \P, and \p are not ++ allowed. */ ++ ++ else if (-c == ESC_X || -c == ESC_P || -c == ESC_p) ++ { ++ *errorcodeptr = ERR45; ++ goto FAILED; ++ } + #endif + +- /* For the rest, we can obtain the OP value by negating the escape +- value */ ++ /* For the rest (including \X when Unicode properties are supported), we ++ can obtain the OP value by negating the escape value. */ + + else + { +@@ -3478,9 +4469,10 @@ + mcbuffer[0] = c; + mclength = 1; + } +- + goto ONE_CHAR; + ++ ++ /* ===================================================================*/ + /* Handle a literal character. It is guaranteed not to be whitespace or # + when the extended flag is set. If we are in UTF-8 mode, it may be a + multi-byte literal character. */ +@@ -3491,7 +4483,7 @@ + mcbuffer[0] = c; + + #ifdef SUPPORT_UTF8 +- if (utf8 && (c & 0xc0) == 0xc0) ++ if (utf8 && c >= 0xc0) + { + while ((ptr[1] & 0xc0) == 0x80) + mcbuffer[mclength++] = *(++ptr); +@@ -3542,6 +4534,7 @@ + } + } /* end of big loop */ + ++ + /* Control never reaches here by falling through, only by a goto for all the + error states. Pass back the position in the pattern so that it can be displayed + to the user for diagnosing the error. */ +@@ -3558,35 +4551,40 @@ + * Compile sequence of alternatives * + *************************************************/ + +-/* On entry, ptr is pointing past the bracket character, but on return +-it points to the closing bracket, or vertical bar, or end of string. +-The code variable is pointing at the byte into which the BRA operator has been +-stored. If the ims options are changed at the start (for a (?ims: group) or +-during any branch, we need to insert an OP_OPT item at the start of every +-following branch to ensure they get set correctly at run time, and also pass +-the new options into every subsequent branch compile. 
++/* On entry, ptr is pointing past the bracket character, but on return it ++points to the closing bracket, or vertical bar, or end of string. The code ++variable is pointing at the byte into which the BRA operator has been stored. ++If the ims options are changed at the start (for a (?ims: group) or during any ++branch, we need to insert an OP_OPT item at the start of every following branch ++to ensure they get set correctly at run time, and also pass the new options ++into every subsequent branch compile. ++ ++This function is used during the pre-compile phase when we are trying to find ++out the amount of memory needed, as well as during the real compile phase. The ++value of lengthptr distinguishes the two phases. + + Argument: + options option bits, including any changes for this subpattern + oldims previous settings of ims option bits +- brackets -> int containing the number of extracting brackets used + codeptr -> the address of the current code pointer + ptrptr -> the address of the current pattern pointer + errorcodeptr -> pointer to error code variable + lookbehind TRUE if this is a lookbehind assertion +- skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER) ++ skipbytes skip this many bytes at start (for brackets and OP_COND) + firstbyteptr place to put the first required character, or a negative number + reqbyteptr place to put the last required character, or a negative number + bcptr pointer to the chain of currently open branches + cd points to the data block with tables pointers etc. 
++ lengthptr NULL during the real compile phase ++ points to length accumulator during pre-compile phase + +-Returns: TRUE on success ++Returns: TRUE on success + */ + + static BOOL +-compile_regex(int options, int oldims, int *brackets, uschar **codeptr, +- const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes, +- int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd) ++compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr, ++ int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr, ++ int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr) + { + const uschar *ptr = *ptrptr; + uschar *code = *codeptr; +@@ -3595,6 +4593,7 @@ + uschar *reverse_count = NULL; + int firstbyte, reqbyte; + int branchfirstbyte, branchreqbyte; ++int length; + branch_chain bc; + + bc.outer = bcptr; +@@ -3602,6 +4601,20 @@ + + firstbyte = reqbyte = REQ_UNSET; + ++/* Accumulate the length for use in the pre-compile phase. Start with the ++length of the BRA and KET and any extra bytes that are required at the ++beginning. We accumulate in a local variable to save frequent testing of ++lengthptr for NULL. We cannot do this by looking at the value of code at the ++start and end of each alternative, because compiled items are discarded during ++the pre-compile phase so that the work space is not exceeded. */ ++ ++length = 2 + 2*LINK_SIZE + skipbytes; ++ ++/* WARNING: If the above line is changed for any reason, you must also change ++the code that abstracts option settings at the start of the pattern and makes ++them global. It tests the value of length for (2 + 2*LINK_SIZE) in the ++pre-compile phase to find out whether anything has yet been compiled or not.
*/ ++ + /* Offset is set zero to mark that this bracket is still open */ + + PUT(code, 1, 0); +@@ -3617,6 +4630,7 @@ + { + *code++ = OP_OPT; + *code++ = options & PCRE_IMS; ++ length += 2; + } + + /* Set up dummy OP_REVERSE if lookbehind assertion */ +@@ -3626,73 +4640,80 @@ + *code++ = OP_REVERSE; + reverse_count = code; + PUTINC(code, 0, 0); ++ length += 1 + LINK_SIZE; + } + +- /* Now compile the branch */ ++ /* Now compile the branch; in the pre-compile phase its length gets added ++ into the length. */ + +- if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr, +- &branchfirstbyte, &branchreqbyte, &bc, cd)) ++ if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, ++ &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length)) + { + *ptrptr = ptr; + return FALSE; + } + +- /* If this is the first branch, the firstbyte and reqbyte values for the +- branch become the values for the regex. */ ++ /* In the real compile phase, there is some post-processing to be done. */ + +- if (*last_branch != OP_ALT) ++ if (lengthptr == NULL) + { +- firstbyte = branchfirstbyte; +- reqbyte = branchreqbyte; +- } ++ /* If this is the first branch, the firstbyte and reqbyte values for the ++ branch become the values for the regex. */ + +- /* If this is not the first branch, the first char and reqbyte have to +- match the values from all the previous branches, except that if the previous +- value for reqbyte didn't have REQ_VARY set, it can still match, and we set +- REQ_VARY for the regex. */ ++ if (*last_branch != OP_ALT) ++ { ++ firstbyte = branchfirstbyte; ++ reqbyte = branchreqbyte; ++ } + +- else +- { +- /* If we previously had a firstbyte, but it doesn't match the new branch, +- we have to abandon the firstbyte for the regex, but if there was previously +- no reqbyte, it takes on the value of the old firstbyte. 
*/ ++ /* If this is not the first branch, the first char and reqbyte have to ++ match the values from all the previous branches, except that if the ++ previous value for reqbyte didn't have REQ_VARY set, it can still match, ++ and we set REQ_VARY for the regex. */ + +- if (firstbyte >= 0 && firstbyte != branchfirstbyte) ++ else + { +- if (reqbyte < 0) reqbyte = firstbyte; +- firstbyte = REQ_NONE; +- } ++ /* If we previously had a firstbyte, but it doesn't match the new branch, ++ we have to abandon the firstbyte for the regex, but if there was ++ previously no reqbyte, it takes on the value of the old firstbyte. */ ++ ++ if (firstbyte >= 0 && firstbyte != branchfirstbyte) ++ { ++ if (reqbyte < 0) reqbyte = firstbyte; ++ firstbyte = REQ_NONE; ++ } + +- /* If we (now or from before) have no firstbyte, a firstbyte from the +- branch becomes a reqbyte if there isn't a branch reqbyte. */ ++ /* If we (now or from before) have no firstbyte, a firstbyte from the ++ branch becomes a reqbyte if there isn't a branch reqbyte. */ + +- if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) +- branchreqbyte = branchfirstbyte; ++ if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) ++ branchreqbyte = branchfirstbyte; + +- /* Now ensure that the reqbytes match */ ++ /* Now ensure that the reqbytes match */ + +- if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) +- reqbyte = REQ_NONE; +- else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ +- } ++ if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) ++ reqbyte = REQ_NONE; ++ else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ ++ } + +- /* If lookbehind, check that this branch matches a fixed-length string, +- and put the length into the OP_REVERSE item. Temporarily mark the end of +- the branch with OP_END. */ ++ /* If lookbehind, check that this branch matches a fixed-length string, and ++ put the length into the OP_REVERSE item. Temporarily mark the end of the ++ branch with OP_END. 
*/ + +- if (lookbehind) +- { +- int length; +- *code = OP_END; +- length = find_fixedlength(last_branch, options); +- DPRINTF(("fixed length = %d\n", length)); +- if (length < 0) ++ if (lookbehind) + { +- *errorcodeptr = (length == -2)? ERR36 : ERR25; +- *ptrptr = ptr; +- return FALSE; ++ int fixed_length; ++ *code = OP_END; ++ fixed_length = find_fixedlength(last_branch, options); ++ DPRINTF(("fixed length = %d\n", fixed_length)); ++ if (fixed_length < 0) ++ { ++ *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; ++ *ptrptr = ptr; ++ return FALSE; ++ } ++ PUT(reverse_count, 0, fixed_length); + } +- PUT(reverse_count, 0, length); + } + + /* Reached end of expression, either ')' or end of pattern. Go back through +@@ -3706,15 +4727,15 @@ + + if (*ptr != '|') + { +- int length = code - last_branch; ++ int branch_length = code - last_branch; + do + { + int prev_length = GET(last_branch, 1); +- PUT(last_branch, 1, length); +- length = prev_length; +- last_branch -= length; ++ PUT(last_branch, 1, branch_length); ++ branch_length = prev_length; ++ last_branch -= branch_length; + } +- while (length > 0); ++ while (branch_length > 0); + + /* Fill in the ket */ + +@@ -3728,6 +4749,7 @@ + { + *code++ = OP_OPT; + *code++ = oldims; ++ length += 2; + } + + /* Set values to pass back */ +@@ -3736,6 +4758,7 @@ + *ptrptr = ptr; + *firstbyteptr = firstbyte; + *reqbyteptr = reqbyte; ++ if (lengthptr != NULL) *lengthptr += length; + return TRUE; + } + +@@ -3749,6 +4772,7 @@ + bc.current = last_branch = code; + code += 1 + LINK_SIZE; + ptr++; ++ length += 1 + LINK_SIZE; + } + /* Control never reaches here */ + } +@@ -3799,24 +4823,29 @@ + unsigned int backref_map) + { + do { +- const uschar *scode = +- first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE); ++ const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], ++ options, PCRE_MULTILINE, FALSE); + register int op = *scode; + ++ /* Non-capturing brackets */ ++ ++ if (op == OP_BRA) ++ { ++ 
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; ++ } ++ + /* Capturing brackets */ + +- if (op > OP_BRA) ++ else if (op == OP_CBRA) + { +- int new_map; +- op -= OP_BRA; +- if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE); +- new_map = bracket_map | ((op < 32)? (1 << op) : 1); ++ int n = GET2(scode, 1+LINK_SIZE); ++ int new_map = bracket_map | ((n < 32)? (1 << n) : 1); + if (!is_anchored(scode, options, new_map, backref_map)) return FALSE; + } + + /* Other brackets */ + +- else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) ++ else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) + { + if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; + } +@@ -3824,7 +4853,8 @@ + /* .* is not anchored unless DOTALL is set and it isn't in brackets that + are or may be referenced. */ + +- else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) && ++ else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || ++ op == OP_TYPEPOSSTAR) && + (*options & PCRE_DOTALL) != 0) + { + if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; +@@ -3869,30 +4899,35 @@ + unsigned int backref_map) + { + do { +- const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0, +- FALSE); ++ const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], ++ NULL, 0, FALSE); + register int op = *scode; + ++ /* Non-capturing brackets */ ++ ++ if (op == OP_BRA) ++ { ++ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; ++ } ++ + /* Capturing brackets */ + +- if (op > OP_BRA) ++ else if (op == OP_CBRA) + { +- int new_map; +- op -= OP_BRA; +- if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE); +- new_map = bracket_map | ((op < 32)? (1 << op) : 1); ++ int n = GET2(scode, 1+LINK_SIZE); ++ int new_map = bracket_map | ((n < 32)? 
(1 << n) : 1); + if (!is_startline(scode, new_map, backref_map)) return FALSE; + } + + /* Other brackets */ + +- else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) ++ else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) + { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } + + /* .* means "start at start or after \n" if it isn't in brackets that + may be referenced. */ + +- else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) ++ else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) + { + if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; + } +@@ -3941,14 +4976,13 @@ + first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE); + register int op = *scode; + +- if (op >= OP_BRA) op = OP_BRA; +- + switch(op) + { + default: + return -1; + + case OP_BRA: ++ case OP_CBRA: + case OP_ASSERT: + case OP_ONCE: + case OP_COND: +@@ -3964,6 +4998,7 @@ + case OP_CHARNC: + case OP_PLUS: + case OP_MINPLUS: ++ case OP_POSPLUS: + if (!inassert) return -1; + if (c < 0) + { +@@ -4012,37 +5047,36 @@ + } + + +- + PCRE_DATA_SCOPE pcre * + pcre_compile2(const char *pattern, int options, int *errorcodeptr, + const char **errorptr, int *erroroffset, const unsigned char *tables) + { + real_pcre *re; +-int length = 1 + LINK_SIZE; /* For initial BRA plus length */ +-int c, firstbyte, reqbyte, newline; +-int bracount = 0; +-int branch_extra = 0; +-int branch_newextra; +-int item_count = -1; +-int name_count = 0; +-int max_name_size = 0; +-int lastitemlength = 0; ++int length = 1; /* For final END opcode */ ++int firstbyte, reqbyte, newline; + int errorcode = 0; + #ifdef SUPPORT_UTF8 + BOOL utf8; +-BOOL class_utf8; + #endif +-BOOL inescq = FALSE; +-BOOL capturing; +-unsigned int brastackptr = 0; + size_t size; + uschar *code; + const uschar *codestart; + const uschar *ptr; + compile_data compile_block; + compile_data *cd = &compile_block; +-int brastack[BRASTACK_SIZE]; +-uschar 
bralenstack[BRASTACK_SIZE]; ++ ++/* This space is used for "compiling" into during the first phase, when we are ++computing the amount of memory that is needed. Compiled items are thrown away ++as soon as possible, so that a fairly large buffer should be sufficient for ++this purpose. The same space is used in the second phase for remembering where ++to fill in forward references to subpatterns. */ ++ ++uschar cworkspace[COMPILE_WORK_SIZE]; ++ ++ ++/* Set this early so that early errors get offset 0. */ ++ ++ptr = (const uschar *)pattern; + + /* We can't pass back an error message if errorptr is NULL; I guess the best we + can do is just return NULL, but we can set a code value if there is a code +@@ -4075,7 +5109,7 @@ + (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) + { + errorcode = ERR44; +- goto PCRE_EARLY_ERROR_RETURN; ++ goto PCRE_UTF8_ERROR_RETURN; + } + #else + if ((options & PCRE_UTF8) != 0) +@@ -4099,34 +5133,43 @@ + cd->cbits = tables + cbits_offset; + cd->ctypes = tables + ctypes_offset; + +-/* Handle different types of newline. The two bits give four cases. The current +-code allows for one- or two-byte sequences. */ ++/* Handle different types of newline. The three bits give seven cases. The ++current code allows for fixed one- or two-byte sequences, plus "any". 
*/ + +-switch (options & PCRE_NEWLINE_CRLF) ++switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY)) + { +- default: newline = NEWLINE; break; /* Compile-time default */ ++ case 0: newline = NEWLINE; break; /* Compile-time default */ + case PCRE_NEWLINE_CR: newline = '\r'; break; + case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR+ + PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; ++ case PCRE_NEWLINE_ANY: newline = -1; break; ++ default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; + } + +-if (newline > 255) ++if (newline < 0) + { +- cd->nllen = 2; +- cd->nl[0] = (newline >> 8) & 255; +- cd->nl[1] = newline & 255; ++ cd->nltype = NLTYPE_ANY; + } + else + { +- cd->nllen = 1; +- cd->nl[0] = newline; ++ cd->nltype = NLTYPE_FIXED; ++ if (newline > 255) ++ { ++ cd->nllen = 2; ++ cd->nl[0] = (newline >> 8) & 255; ++ cd->nl[1] = newline & 255; ++ } ++ else ++ { ++ cd->nllen = 1; ++ cd->nl[0] = newline; ++ } + } + +-/* Maximum back reference and backref bitmap. This is updated for numeric +-references during the first pass, but for named references during the actual +-compile pass. The bitmap records up to 31 back references to help in deciding +-whether (.*) can be treated as anchored or not. */ ++/* Maximum back reference and backref bitmap. The bitmap records up to 31 back ++references to help in deciding whether (.*) can be treated as anchored or not. ++*/ + + cd->top_backref = 0; + cd->backref_map = 0; +@@ -4136,1041 +5179,151 @@ + DPRINTF(("------------------------------------------------------------------\n")); + DPRINTF(("%s\n", pattern)); + +-/* The first thing to do is to make a pass over the pattern to compute the +-amount of store required to hold the compiled code. This does not have to be +-perfect as long as errors are overestimates. At the same time we can detect any +-flag settings right at the start, and extract them. 
Make an attempt to correct +-for any counted white space if an "extended" flag setting appears late in the +-pattern. We can't be so clever for #-comments. */ +- +-ptr = (const uschar *)(pattern - 1); +-while ((c = *(++ptr)) != 0) +- { +- int min, max; +- int class_optcount; +- int bracket_length; +- int duplength; ++/* Pretend to compile the pattern while actually just accumulating the length ++of memory required. This behaviour is triggered by passing a non-NULL final ++argument to compile_regex(). We pass a block of workspace (cworkspace) for it ++to compile parts of the pattern into; the compiled code is discarded when it is ++no longer needed, so hopefully this workspace will never overflow, though there ++is a test for its doing so. */ + +- /* If we are inside a \Q...\E sequence, all chars are literal */ ++cd->bracount = 0; ++cd->names_found = 0; ++cd->name_entry_size = 0; ++cd->name_table = NULL; ++cd->start_workspace = cworkspace; ++cd->start_code = cworkspace; ++cd->hwm = cworkspace; ++cd->start_pattern = (const uschar *)pattern; ++cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); ++cd->req_varyopt = 0; ++cd->nopartial = FALSE; ++cd->external_options = options; + +- if (inescq) +- { +- if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE; +- goto NORMAL_CHAR; +- } ++/* Now do the pre-compile. On error, errorcode will be set non-zero, so we ++don't need to look at the result of the function here. The initial options have ++been put into the cd block so that they can be changed if an option setting is ++found within the regex right at the beginning. Bringing initial option settings ++outside can help speed up starting point checks. 
*/ + +- /* Otherwise, first check for ignored whitespace and comments */ ++code = cworkspace; ++*code = OP_BRA; ++(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, ++ &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length); ++if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; + +- if ((options & PCRE_EXTENDED) != 0) +- { +- if ((cd->ctypes[c] & ctype_space) != 0) continue; +- if (c == '#') +- { +- while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; +- if (*ptr != 0) +- { +- ptr += cd->nllen - 1; +- continue; +- } +- break; /* End loop at end of pattern */ +- } +- } ++DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, ++ cd->hwm - cworkspace)); + +- item_count++; /* Is zero for the first non-comment item */ ++if (length > MAX_PATTERN_SIZE) ++ { ++ errorcode = ERR20; ++ goto PCRE_EARLY_ERROR_RETURN; ++ } + +- /* Allow space for auto callout before every item except quantifiers. */ ++/* Compute the size of data block needed and get it, either from malloc or ++externally provided function. Integer overflow should no longer be possible ++because nowadays we limit the maximum value of cd->names_found and ++cd->name_entry_size. */ + +- if ((options & PCRE_AUTO_CALLOUT) != 0 && +- c != '*' && c != '+' && c != '?' && +- (c != '{' || !is_counted_repeat(ptr + 1))) +- length += 2 + 2*LINK_SIZE; ++size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3); ++re = (real_pcre *)(pcre_malloc)(size); + +- switch(c) +- { +- /* A backslashed item may be an escaped data character or it may be a +- character type. */ ++if (re == NULL) ++ { ++ errorcode = ERR21; ++ goto PCRE_EARLY_ERROR_RETURN; ++ } + +- case '\\': +- c = check_escape(&ptr, &errorcode, bracount, options, FALSE); +- if (errorcode != 0) goto PCRE_ERROR_RETURN; ++/* Put in the magic number, and save the sizes, initial options, and character ++table pointer. NULL is used for the default character tables. 
The nullpad field ++is at the end; it's there to help in the case when a regex compiled on a system ++with 4-byte pointers is run on another with 8-byte pointers. */ + +- lastitemlength = 1; /* Default length of last item for repeats */ ++re->magic_number = MAGIC_NUMBER; ++re->size = size; ++re->options = cd->external_options; ++re->dummy1 = 0; ++re->first_byte = 0; ++re->req_byte = 0; ++re->name_table_offset = sizeof(real_pcre); ++re->name_entry_size = cd->name_entry_size; ++re->name_count = cd->names_found; ++re->ref_count = 0; ++re->tables = (tables == _pcre_default_tables)? NULL : tables; ++re->nullpad = NULL; + +- if (c >= 0) /* Data character */ +- { +- length += 2; /* For a one-byte character */ ++/* The starting points of the name/number translation table and of the code are ++passed around in the compile data block. The start/end pattern and initial ++options are already set from the pre-compile phase, as is the name_entry_size ++field. Reset the bracket count and the names_found field. Also reset the hwm ++field; this time it's used for remembering forward references to subpatterns. ++*/ + +-#ifdef SUPPORT_UTF8 +- if (utf8 && c > 127) +- { +- int i; +- for (i = 0; i < _pcre_utf8_table1_size; i++) +- if (c <= _pcre_utf8_table1[i]) break; +- length += i; +- lastitemlength += i; +- } +-#endif ++cd->bracount = 0; ++cd->names_found = 0; ++cd->name_table = (uschar *)re + re->name_table_offset; ++codestart = cd->name_table + re->name_entry_size * re->name_count; ++cd->start_code = codestart; ++cd->hwm = cworkspace; ++cd->req_varyopt = 0; ++cd->nopartial = FALSE; + +- continue; +- } ++/* Set up a starting, non-extracting bracket, then compile the expression. On ++error, errorcode will be set non-zero, so we don't need to look at the result ++of the function here. 
*/ + +- /* If \Q, enter "literal" mode */ ++ptr = (const uschar *)pattern; ++code = (uschar *)codestart; ++*code = OP_BRA; ++(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, ++ &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); ++re->top_bracket = cd->bracount; ++re->top_backref = cd->top_backref; + +- if (-c == ESC_Q) +- { +- inescq = TRUE; +- continue; +- } ++if (cd->nopartial) re->options |= PCRE_NOPARTIAL; + +- /* \X is supported only if Unicode property support is compiled */ ++/* If not reached end of pattern on success, there's an excess bracket. */ + +-#ifndef SUPPORT_UCP +- if (-c == ESC_X) +- { +- errorcode = ERR45; +- goto PCRE_ERROR_RETURN; +- } +-#endif ++if (errorcode == 0 && *ptr != 0) errorcode = ERR22; + +- /* \P and \p are for Unicode properties, but only when the support has +- been compiled. Each item needs 3 bytes. */ ++/* Fill in the terminating state and check for disastrous overflow, but ++if debugging, leave the test till after things are printed out. */ + +- else if (-c == ESC_P || -c == ESC_p) +- { +-#ifdef SUPPORT_UCP +- BOOL negated; +- BOOL pdata; +- length += 3; +- lastitemlength = 3; +- if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0) +- goto PCRE_ERROR_RETURN; +- continue; +-#else +- errorcode = ERR45; +- goto PCRE_ERROR_RETURN; ++*code++ = OP_END; ++ ++#ifndef DEBUG ++if (code - codestart > length) errorcode = ERR23; + #endif +- } + +- /* Other escapes need one byte */ ++/* Fill in any forward references that are required. 
*/ + +- length++; ++while (errorcode == 0 && cd->hwm > cworkspace) ++ { ++ int offset, recno; ++ const uschar *groupptr; ++ cd->hwm -= LINK_SIZE; ++ offset = GET(cd->hwm, 0); ++ recno = GET(codestart, offset); ++ groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno); ++ if (groupptr == NULL) errorcode = ERR53; ++ else PUT(((uschar *)codestart), offset, groupptr - codestart); ++ } + +- /* A back reference needs an additional 2 bytes, plus either one or 5 +- bytes for a repeat. We also need to keep the value of the highest +- back reference. */ ++/* Give an error if there's back reference to a non-existent capturing ++subpattern. */ + +- if (c <= -ESC_REF) +- { +- int refnum = -c - ESC_REF; +- cd->backref_map |= (refnum < 32)? (1 << refnum) : 1; +- if (refnum > cd->top_backref) +- cd->top_backref = refnum; +- length += 2; /* For single back reference */ +- if (ptr[1] == '{' && is_counted_repeat(ptr+2)) +- { +- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); +- if (errorcode != 0) goto PCRE_ERROR_RETURN; +- if ((min == 0 && (max == 1 || max == -1)) || +- (min == 1 && max == -1)) +- length++; +- else length += 5; +- if (ptr[1] == '?') ptr++; +- } +- } +- continue; +- +- case '^': /* Single-byte metacharacters */ +- case '.': +- case '$': +- length++; +- lastitemlength = 1; +- continue; +- +- case '*': /* These repeats won't be after brackets; */ +- case '+': /* those are handled separately */ +- case '?': +- length++; +- goto POSESSIVE; /* A few lines below */ +- +- /* This covers the cases of braced repeats after a single char, metachar, +- class, or back reference. 
*/ +- +- case '{': +- if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR; +- ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode); +- if (errorcode != 0) goto PCRE_ERROR_RETURN; +- +- /* These special cases just insert one extra opcode */ +- +- if ((min == 0 && (max == 1 || max == -1)) || +- (min == 1 && max == -1)) +- length++; +- +- /* These cases might insert additional copies of a preceding character. */ +- +- else +- { +- if (min != 1) +- { +- length -= lastitemlength; /* Uncount the original char or metachar */ +- if (min > 0) length += 3 + lastitemlength; +- } +- length += lastitemlength + ((max > 0)? 3 : 1); +- } +- +- if (ptr[1] == '?') ptr++; /* Needs no extra length */ +- +- POSESSIVE: /* Test for possessive quantifier */ +- if (ptr[1] == '+') +- { +- ptr++; +- length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */ +- } +- continue; +- +- /* An alternation contains an offset to the next branch or ket. If any ims +- options changed in the previous branch(es), and/or if we are in a +- lookbehind assertion, extra space will be needed at the start of the +- branch. This is handled by branch_extra. */ +- +- case '|': +- length += 1 + LINK_SIZE + branch_extra; +- continue; +- +- /* A character class uses 33 characters provided that all the character +- values are less than 256. Otherwise, it uses a bit map for low valued +- characters, and individual items for others. Don't worry about character +- types that aren't allowed in classes - they'll get picked up during the +- compile. A character class that contains only one single-byte character +- uses 2 or 3 bytes, depending on whether it is negated or not. Notice this +- where we can. (In UTF-8 mode we can do this only for chars < 128.) 
*/ +- +- case '[': +- if (*(++ptr) == '^') +- { +- class_optcount = 10; /* Greater than one */ +- ptr++; +- } +- else class_optcount = 0; +- +-#ifdef SUPPORT_UTF8 +- class_utf8 = FALSE; +-#endif +- +- /* Written as a "do" so that an initial ']' is taken as data */ +- +- if (*ptr != 0) do +- { +- /* Inside \Q...\E everything is literal except \E */ +- +- if (inescq) +- { +- if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER; +- inescq = FALSE; +- ptr += 1; +- continue; +- } +- +- /* Outside \Q...\E, check for escapes */ +- +- if (*ptr == '\\') +- { +- c = check_escape(&ptr, &errorcode, bracount, options, TRUE); +- if (errorcode != 0) goto PCRE_ERROR_RETURN; +- +- /* \b is backspace inside a class; \X is literal */ +- +- if (-c == ESC_b) c = '\b'; +- else if (-c == ESC_X) c = 'X'; +- +- /* \Q enters quoting mode */ +- +- else if (-c == ESC_Q) +- { +- inescq = TRUE; +- continue; +- } +- +- /* Handle escapes that turn into characters */ +- +- if (c >= 0) goto NON_SPECIAL_CHARACTER; +- +- /* Escapes that are meta-things. The normal ones just affect the +- bit map, but Unicode properties require an XCLASS extended item. */ +- +- else +- { +- class_optcount = 10; /* \d, \s etc; make sure > 1 */ +-#ifdef SUPPORT_UTF8 +- if (-c == ESC_p || -c == ESC_P) +- { +- if (!class_utf8) +- { +- class_utf8 = TRUE; +- length += LINK_SIZE + 2; +- } +- length += 3; +- } +-#endif +- } +- } +- +- /* Check the syntax for POSIX stuff. The bits we actually handle are +- checked during the real compile phase. */ +- +- else if (*ptr == '[' && +- (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && +- check_posix_syntax(ptr, &ptr, cd)) +- { +- ptr++; +- class_optcount = 10; /* Make sure > 1 */ +- } +- +- /* Anything else increments the possible optimization count. We have to +- detect ranges here so that we can compute the number of extra ranges for +- caseless wide characters when UCP support is available. 
If there are wide +- characters, we are going to have to use an XCLASS, even for single +- characters. */ +- +- else +- { +- int d; +- +- GET_ONE_CHARACTER: +- +-#ifdef SUPPORT_UTF8 +- if (utf8) +- { +- int extra = 0; +- GETCHARLEN(c, ptr, extra); +- ptr += extra; +- } +- else c = *ptr; +-#else +- c = *ptr; +-#endif +- +- /* Come here from handling \ above when it escapes to a char value */ +- +- NON_SPECIAL_CHARACTER: +- class_optcount++; +- +- d = -1; +- if (ptr[1] == '-') +- { +- uschar const *hyptr = ptr++; +- if (ptr[1] == '\\') +- { +- ptr++; +- d = check_escape(&ptr, &errorcode, bracount, options, TRUE); +- if (errorcode != 0) goto PCRE_ERROR_RETURN; +- if (-d == ESC_b) d = '\b'; /* backspace */ +- else if (-d == ESC_X) d = 'X'; /* literal X in a class */ +- } +- else if (ptr[1] != 0 && ptr[1] != ']') +- { +- ptr++; +-#ifdef SUPPORT_UTF8 +- if (utf8) +- { +- int extra = 0; +- GETCHARLEN(d, ptr, extra); +- ptr += extra; +- } +- else +-#endif +- d = *ptr; +- } +- if (d < 0) ptr = hyptr; /* go back to hyphen as data */ +- } +- +- /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or > +- 127 for caseless matching, we will need to use an XCLASS. */ +- +- if (d >= 0) +- { +- class_optcount = 10; /* Ensure > 1 */ +- if (d < c) +- { +- errorcode = ERR8; +- goto PCRE_ERROR_RETURN; +- } +- +-#ifdef SUPPORT_UTF8 +- if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) +- { +- uschar buffer[6]; +- if (!class_utf8) /* Allow for XCLASS overhead */ +- { +- class_utf8 = TRUE; +- length += LINK_SIZE + 2; +- } +- +-#ifdef SUPPORT_UCP +- /* If we have UCP support, find out how many extra ranges are +- needed to map the other case of characters within this range. We +- have to mimic the range optimization here, because extending the +- range upwards might push d over a boundary that makes is use +- another byte in the UTF-8 representation. 
*/ +- +- if ((options & PCRE_CASELESS) != 0) +- { +- int occ, ocd; +- int cc = c; +- int origd = d; +- while (get_othercase_range(&cc, origd, &occ, &ocd)) +- { +- if (occ >= c && ocd <= d) continue; /* Skip embedded */ +- +- if (occ < c && ocd >= c - 1) /* Extend the basic range */ +- { /* if there is overlap, */ +- c = occ; /* noting that if occ < c */ +- continue; /* we can't have ocd > d */ +- } /* because a subrange is */ +- if (ocd > d && occ <= d + 1) /* always shorter than */ +- { /* the basic range. */ +- d = ocd; +- continue; +- } +- +- /* An extra item is needed */ +- +- length += 1 + _pcre_ord2utf8(occ, buffer) + +- ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer)); +- } +- } +-#endif /* SUPPORT_UCP */ +- +- /* The length of the (possibly extended) range */ +- +- length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer); +- } +-#endif /* SUPPORT_UTF8 */ +- +- } +- +- /* We have a single character. There is nothing to be done unless we +- are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must +- allow for an XCL_SINGLE item, doubled for caselessness if there is UCP +- support. */ +- +- else +- { +-#ifdef SUPPORT_UTF8 +- if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) +- { +- uschar buffer[6]; +- class_optcount = 10; /* Ensure > 1 */ +- if (!class_utf8) /* Allow for XCLASS overhead */ +- { +- class_utf8 = TRUE; +- length += LINK_SIZE + 2; +- } +-#ifdef SUPPORT_UCP +- length += (((options & PCRE_CASELESS) != 0)? 2 : 1) * +- (1 + _pcre_ord2utf8(c, buffer)); +-#else /* SUPPORT_UCP */ +- length += 1 + _pcre_ord2utf8(c, buffer); +-#endif /* SUPPORT_UCP */ +- } +-#endif /* SUPPORT_UTF8 */ +- } +- } +- } +- while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */ +- +- if (*ptr == 0) /* Missing terminating ']' */ +- { +- errorcode = ERR6; +- goto PCRE_ERROR_RETURN; +- } +- +- /* We can optimize when there was only one optimizable character. 
Repeats +- for positive and negated single one-byte chars are handled by the general +- code. Here, we handle repeats for the class opcodes. */ +- +- if (class_optcount == 1) length += 3; else +- { +- length += 33; +- +- /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier, +- we also need extra for wrapping the whole thing in a sub-pattern. */ +- +- if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2)) +- { +- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); +- if (errorcode != 0) goto PCRE_ERROR_RETURN; +- if ((min == 0 && (max == 1 || max == -1)) || +- (min == 1 && max == -1)) +- length++; +- else length += 5; +- if (ptr[1] == '+') +- { +- ptr++; +- length += 2 + 2*LINK_SIZE; +- } +- else if (ptr[1] == '?') ptr++; +- } +- } +- continue; +- +- /* Brackets may be genuine groups or special things */ +- +- case '(': +- branch_newextra = 0; +- bracket_length = 1 + LINK_SIZE; +- capturing = FALSE; +- +- /* Handle special forms of bracket, which all start (? */ +- +- if (ptr[1] == '?') +- { +- int set, unset; +- int *optset; +- +- switch (c = ptr[2]) +- { +- /* Skip over comments entirely */ +- case '#': +- ptr += 3; +- while (*ptr != 0 && *ptr != ')') ptr++; +- if (*ptr == 0) +- { +- errorcode = ERR18; +- goto PCRE_ERROR_RETURN; +- } +- continue; +- +- /* Non-referencing groups and lookaheads just move the pointer on, and +- then behave like a non-special bracket, except that they don't increment +- the count of extracting brackets. Ditto for the "once only" bracket, +- which is in Perl from version 5.005. 
*/ +- +- case ':': +- case '=': +- case '!': +- case '>': +- ptr += 2; +- break; +- +- /* Named subpatterns are an extension copied from Python */ +- +- case 'P': +- ptr += 3; +- +- /* Handle the definition of a named subpattern */ +- +- if (*ptr == '<') +- { +- const uschar *p; /* Don't amalgamate; some compilers */ +- p = ++ptr; /* grumble at autoincrement in declaration */ +- while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; +- if (*ptr != '>') +- { +- errorcode = ERR42; +- goto PCRE_ERROR_RETURN; +- } +- name_count++; +- if (name_count > MAX_NAME_COUNT) +- { +- errorcode = ERR49; +- goto PCRE_ERROR_RETURN; +- } +- if (ptr - p > max_name_size) +- { +- max_name_size = (ptr - p); +- if (max_name_size > MAX_NAME_SIZE) +- { +- errorcode = ERR48; +- goto PCRE_ERROR_RETURN; +- } +- } +- capturing = TRUE; /* Named parentheses are always capturing */ +- break; /* Go handle capturing parentheses */ +- } +- +- /* Handle back references and recursive calls to named subpatterns */ +- +- if (*ptr == '=' || *ptr == '>') +- { +- length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */ +- while ((cd->ctypes[*(++ptr)] & ctype_word) != 0); +- if (*ptr != ')') +- { +- errorcode = ERR42; +- goto PCRE_ERROR_RETURN; +- } +- goto RECURSE_CHECK_QUANTIFIED; +- } +- +- /* Unknown character after (?P */ +- +- errorcode = ERR41; +- goto PCRE_ERROR_RETURN; +- +- /* (?R) specifies a recursive call to the regex, which is an extension +- to provide the facility which can be obtained by (?p{perl-code}) in +- Perl 5.6. In Perl 5.8 this has become (??{perl-code}). +- +- From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to +- the appropriate numbered brackets. This includes both recursive and +- non-recursive calls. (?R) is now synonymous with (?0). 
*/ +- +- case 'R': +- ptr++; +- +- case '0': case '1': case '2': case '3': case '4': +- case '5': case '6': case '7': case '8': case '9': +- ptr += 2; +- if (c != 'R') +- while ((digitab[*(++ptr)] & ctype_digit) != 0); +- if (*ptr != ')') +- { +- errorcode = ERR29; +- goto PCRE_ERROR_RETURN; +- } +- length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */ +- +- /* If this item is quantified, it will get wrapped inside brackets so +- as to use the code for quantified brackets. We jump down and use the +- code that handles this for real brackets. Come here from code for +- named recursions/subroutines. */ +- +- RECURSE_CHECK_QUANTIFIED: +- if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') +- { +- length += 2 + 2 * LINK_SIZE; /* to make bracketed */ +- duplength = 5 + 3 * LINK_SIZE; +- goto HANDLE_QUANTIFIED_BRACKETS; +- } +- continue; +- +- /* (?C) is an extension which provides "callout" - to provide a bit of +- the functionality of the Perl (?{...}) feature. An optional number may +- follow (default is zero). */ +- +- case 'C': +- ptr += 2; +- while ((digitab[*(++ptr)] & ctype_digit) != 0); +- if (*ptr != ')') +- { +- errorcode = ERR39; +- goto PCRE_ERROR_RETURN; +- } +- length += 2 + 2*LINK_SIZE; +- continue; +- +- /* Lookbehinds are in Perl from version 5.005 */ +- +- case '<': +- ptr += 3; +- if (*ptr == '=' || *ptr == '!') +- { +- branch_newextra = 1 + LINK_SIZE; +- length += 1 + LINK_SIZE; /* For the first branch */ +- break; +- } +- errorcode = ERR24; +- goto PCRE_ERROR_RETURN; +- +- /* Conditionals are in Perl from version 5.005. The bracket must either +- be followed by a number (for bracket reference) or by an assertion +- group. PCRE extends this by allowing a name to reference a named group; +- unfortunately, previously 'R' was implemented for a recursion test. +- When this is compiled, we look for the named group 'R' first. At this +- point we just do a basic syntax check. 
*/ +- +- case '(': +- if ((cd->ctypes[ptr[3]] & ctype_word) != 0) +- { +- ptr += 4; +- length += 3; +- while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; +- if (*ptr != ')') +- { +- errorcode = ERR26; +- goto PCRE_ERROR_RETURN; +- } +- } +- else /* An assertion must follow */ +- { +- ptr++; /* Can treat like ':' as far as spacing is concerned */ +- if (ptr[2] != '?' || +- (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') ) +- { +- ptr += 2; /* To get right offset in message */ +- errorcode = ERR28; +- goto PCRE_ERROR_RETURN; +- } +- } +- break; +- +- /* Else loop checking valid options until ) is met. Anything else is an +- error. If we are without any brackets, i.e. at top level, the settings +- act as if specified in the options, so massage the options immediately. +- This is for backward compatibility with Perl 5.004. */ +- +- default: +- set = unset = 0; +- optset = &set; +- ptr += 2; +- +- for (;; ptr++) +- { +- c = *ptr; +- switch (c) +- { +- case 'i': +- *optset |= PCRE_CASELESS; +- continue; +- +- case 'J': +- *optset |= PCRE_DUPNAMES; +- options |= PCRE_JCHANGED; /* Record that it changed */ +- continue; +- +- case 'm': +- *optset |= PCRE_MULTILINE; +- continue; +- +- case 's': +- *optset |= PCRE_DOTALL; +- continue; +- +- case 'x': +- *optset |= PCRE_EXTENDED; +- continue; +- +- case 'X': +- *optset |= PCRE_EXTRA; +- continue; +- +- case 'U': +- *optset |= PCRE_UNGREEDY; +- continue; +- +- case '-': +- optset = &unset; +- continue; +- +- /* A termination by ')' indicates an options-setting-only item; if +- this is at the very start of the pattern (indicated by item_count +- being zero), we use it to set the global options. This is helpful +- when analyzing the pattern for first characters, etc. Otherwise +- nothing is done here and it is handled during the compiling +- process. +- +- We allow for more than one options setting at the start. If such +- settings do not change the existing options, nothing is compiled. 
+- However, we must leave space just in case something is compiled. +- This can happen for pathological sequences such as (?i)(?-i) +- because the global options will end up with -i set. The space is +- small and not significant. (Before I did this there was a reported +- bug with (?i)(?-i) in a machine-generated pattern.) +- +- [Historical note: Up to Perl 5.8, options settings at top level +- were always global settings, wherever they appeared in the pattern. +- That is, they were equivalent to an external setting. From 5.8 +- onwards, they apply only to what follows (which is what you might +- expect).] */ +- +- case ')': +- if (item_count == 0) +- { +- options = (options | set) & (~unset); +- set = unset = 0; /* To save length */ +- item_count--; /* To allow for several */ +- length += 2; +- } +- +- /* Fall through */ +- +- /* A termination by ':' indicates the start of a nested group with +- the given options set. This is again handled at compile time, but +- we must allow for compiled space if any of the ims options are +- set. We also have to allow for resetting space at the end of +- the group, which is why 4 is added to the length and not just 2. +- If there are several changes of options within the same group, this +- will lead to an over-estimate on the length, but this shouldn't +- matter very much. We also have to allow for resetting options at +- the start of any alternations, which we do by setting +- branch_newextra to 2. */ +- +- case ':': +- if (((set|unset) & PCRE_IMS) != 0) +- { +- length += 4; +- branch_newextra = 2; +- } +- goto END_OPTIONS; +- +- /* Unrecognized option character */ +- +- default: +- errorcode = ERR12; +- goto PCRE_ERROR_RETURN; +- } +- } +- +- /* If we hit a closing bracket, that's it - this is a freestanding +- option-setting. We need to ensure that branch_extra is updated if +- necessary. The only values branch_newextra can have here are 0 or 2. 
+- If the value is 2, then branch_extra must either be 2 or 5, depending +- on whether this is a lookbehind group or not. */ +- +- END_OPTIONS: +- if (c == ')') +- { +- if (branch_newextra == 2 && +- (branch_extra == 0 || branch_extra == 1+LINK_SIZE)) +- branch_extra += branch_newextra; +- continue; +- } +- +- /* If options were terminated by ':' control comes here. This is a +- non-capturing group with an options change. There is nothing more that +- needs to be done because "capturing" is already set FALSE by default; +- we can just fall through. */ +- +- } +- } +- +- /* Ordinary parentheses, not followed by '?', are capturing unless +- PCRE_NO_AUTO_CAPTURE is set. */ +- +- else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0; +- +- /* Capturing brackets must be counted so we can process escapes in a +- Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need +- an additional 3 bytes of memory per capturing bracket. */ +- +- if (capturing) +- { +- bracount++; +- if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3; +- } +- +- /* Save length for computing whole length at end if there's a repeat that +- requires duplication of the group. Also save the current value of +- branch_extra, and start the new group with the new value. If non-zero, this +- will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */ +- +- if (brastackptr >= sizeof(brastack)/sizeof(int)) +- { +- errorcode = ERR19; +- goto PCRE_ERROR_RETURN; +- } +- +- bralenstack[brastackptr] = branch_extra; +- branch_extra = branch_newextra; +- +- brastack[brastackptr++] = length; +- length += bracket_length; +- continue; +- +- /* Handle ket. Look for subsequent max/min; for certain sets of values we +- have to replicate this bracket up to that many times. If brastackptr is +- 0 this is an unmatched bracket which will generate an error, but take care +- not to try to access brastack[-1] when computing the length and restoring +- the branch_extra value. 
*/ +- +- case ')': +- length += 1 + LINK_SIZE; +- if (brastackptr > 0) +- { +- duplength = length - brastack[--brastackptr]; +- branch_extra = bralenstack[brastackptr]; +- /* This is a paranoid check to stop integer overflow later on */ +- if (duplength > MAX_DUPLENGTH) +- { +- errorcode = ERR50; +- goto PCRE_ERROR_RETURN; +- } +- } +- else duplength = 0; +- +- /* The following code is also used when a recursion such as (?3) is +- followed by a quantifier, because in that case, it has to be wrapped inside +- brackets so that the quantifier works. The value of duplength must be +- set before arrival. */ +- +- HANDLE_QUANTIFIED_BRACKETS: +- +- /* Leave ptr at the final char; for read_repeat_counts this happens +- automatically; for the others we need an increment. */ +- +- if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2)) +- { +- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode); +- if (errorcode != 0) goto PCRE_ERROR_RETURN; +- } +- else if (c == '*') { min = 0; max = -1; ptr++; } +- else if (c == '+') { min = 1; max = -1; ptr++; } +- else if (c == '?') { min = 0; max = 1; ptr++; } +- else { min = 1; max = 1; } +- +- /* If the minimum is zero, we have to allow for an OP_BRAZERO before the +- group, and if the maximum is greater than zero, we have to replicate +- maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting +- bracket set. */ +- +- if (min == 0) +- { +- length++; +- if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE); +- } +- +- /* When the minimum is greater than zero, we have to replicate up to +- minval-1 times, with no additions required in the copies. Then, if there +- is a limited maximum we have to replicate up to maxval-1 times allowing +- for a BRAZERO item before each optional copy and nesting brackets for all +- but one of the optional copies. 
*/ +- +- else +- { +- length += (min - 1) * duplength; +- if (max > min) /* Need this test as max=-1 means no limit */ +- length += (max - min) * (duplength + 3 + 2*LINK_SIZE) +- - (2 + 2*LINK_SIZE); +- } +- +- /* Allow space for once brackets for "possessive quantifier" */ +- +- if (ptr[1] == '+') +- { +- ptr++; +- length += 2 + 2*LINK_SIZE; +- } +- continue; +- +- /* Non-special character. It won't be space or # in extended mode, so it is +- always a genuine character. If we are in a \Q...\E sequence, check for the +- end; if not, we have a literal. */ +- +- default: +- NORMAL_CHAR: +- +- if (inescq && c == '\\' && ptr[1] == 'E') +- { +- inescq = FALSE; +- ptr++; +- continue; +- } +- +- length += 2; /* For a one-byte character */ +- lastitemlength = 1; /* Default length of last item for repeats */ +- +- /* In UTF-8 mode, check for additional bytes. */ +- +-#ifdef SUPPORT_UTF8 +- if (utf8 && (c & 0xc0) == 0xc0) +- { +- while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */ +- { /* because the end is marked */ +- lastitemlength++; /* by a zero byte. */ +- length++; +- ptr++; +- } +- } +-#endif +- +- continue; +- } +- } +- +-length += 2 + LINK_SIZE; /* For final KET and END */ +- +-if ((options & PCRE_AUTO_CALLOUT) != 0) +- length += 2 + 2*LINK_SIZE; /* For final callout */ +- +-if (length > MAX_PATTERN_SIZE) +- { +- errorcode = ERR20; +- goto PCRE_EARLY_ERROR_RETURN; +- } +- +-/* Compute the size of data block needed and get it, either from malloc or +-externally provided function. Integer overflow should no longer be possible +-because nowadays we limit the maximum value of name_count and max_name size. */ +- +-size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); +-re = (real_pcre *)(pcre_malloc)(size); +- +-if (re == NULL) +- { +- errorcode = ERR21; +- goto PCRE_EARLY_ERROR_RETURN; +- } +- +-/* Put in the magic number, and save the sizes, options, and character table +-pointer. NULL is used for the default character tables. 
The nullpad field is at +-the end; it's there to help in the case when a regex compiled on a system with +-4-byte pointers is run on another with 8-byte pointers. */ +- +-re->magic_number = MAGIC_NUMBER; +-re->size = size; +-re->options = options; +-re->dummy1 = 0; +-re->name_table_offset = sizeof(real_pcre); +-re->name_entry_size = max_name_size + 3; +-re->name_count = name_count; +-re->ref_count = 0; +-re->tables = (tables == _pcre_default_tables)? NULL : tables; +-re->nullpad = NULL; +- +-/* The starting points of the name/number translation table and of the code are +-passed around in the compile data block. */ +- +-cd->names_found = 0; +-cd->name_entry_size = max_name_size + 3; +-cd->name_table = (uschar *)re + re->name_table_offset; +-codestart = cd->name_table + re->name_entry_size * re->name_count; +-cd->start_code = codestart; +-cd->start_pattern = (const uschar *)pattern; +-cd->req_varyopt = 0; +-cd->nopartial = FALSE; +- +-/* Set up a starting, non-extracting bracket, then compile the expression. On +-error, errorcode will be set non-zero, so we don't need to look at the result +-of the function here. */ +- +-ptr = (const uschar *)pattern; +-code = (uschar *)codestart; +-*code = OP_BRA; +-bracount = 0; +-(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, +- &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd); +-re->top_bracket = bracount; +-re->top_backref = cd->top_backref; +- +-if (cd->nopartial) re->options |= PCRE_NOPARTIAL; +- +-/* If not reached end of pattern on success, there's an excess bracket. */ +- +-if (errorcode == 0 && *ptr != 0) errorcode = ERR22; +- +-/* Fill in the terminating state and check for disastrous overflow, but +-if debugging, leave the test till after things are printed out. */ +- +-*code++ = OP_END; +- +-#ifndef DEBUG +-if (code - codestart > length) errorcode = ERR23; +-#endif +- +-/* Give an error if there's back reference to a non-existent capturing +-subpattern. 
*/ +- +-if (re->top_backref > re->top_bracket) errorcode = ERR15; ++if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; + + /* Failed to compile, or error while post-processing */ + + if (errorcode != 0) + { + (pcre_free)(re); +- PCRE_ERROR_RETURN: +- *erroroffset = ptr - (const uschar *)pattern; + PCRE_EARLY_ERROR_RETURN: ++ *erroroffset = ptr - (const uschar *)pattern; ++#ifdef SUPPORT_UTF8 ++ PCRE_UTF8_ERROR_RETURN: ++#endif + *errorptr = error_texts[errorcode]; + if (errorcodeptr != NULL) *errorcodeptr = errorcode; + return NULL; +@@ -5180,15 +5333,15 @@ + the pattern is anchored by virtue of ^ characters or \A or anything else (such + as starting with .* when DOTALL is set). + +-Otherwise, if we know what the first character has to be, save it, because that ++Otherwise, if we know what the first byte has to be, save it, because that + speeds up unanchored matches no end. If not, see if we can set the + PCRE_STARTLINE flag. This is helpful for multiline matches when all branches + start with ^. and also when all branches start with .* for non-DOTALL matches. + */ + +-if ((options & PCRE_ANCHORED) == 0) ++if ((re->options & PCRE_ANCHORED) == 0) + { +- int temp_options = options; ++ int temp_options = re->options; /* May get changed during these scans */ + if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) + re->options |= PCRE_ANCHORED; + else +@@ -5273,7 +5426,7 @@ + if (errorcodeptr != NULL) *errorcodeptr = ERR23; + return NULL; + } +-#endif ++#endif /* DEBUG */ + + return (pcre *)re; + } +diff -ruN ../pcre.orig/pcrelib/pcre_exec.c ./pcrelib/pcre_exec.c +--- ../pcre.orig/pcrelib/pcre_exec.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_exec.c Fri Feb 9 22:31:19 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -42,25 +42,22 @@ + pattern matching using an NFA algorithm, trying to mimic Perl as closely as + possible. There are also some static supporting functions. */ + +-#define NLBLOCK md /* The block containing newline information */ ++#define NLBLOCK md /* Block containing newline information */ ++#define PSSTART start_subject /* Field containing processed string start */ ++#define PSEND end_subject /* Field containing processed string end */ ++ + #include "pcre_internal.h" + ++/* The chain of eptrblocks for tail recursions uses memory in stack workspace, ++obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */ + +-/* Structure for building a chain of data that actually lives on the +-stack, for holding the values of the subject pointer at the start of each +-subpattern, so as to detect when an empty string has been matched by a +-subpattern - to break infinite loops. When NO_RECURSE is set, these blocks +-are on the heap, not on the stack. */ +- +-typedef struct eptrblock { +- struct eptrblock *epb_prev; +- USPTR epb_saved_eptr; +-} eptrblock; ++#define EPTR_WORK_SIZE (1000) + + /* Flag bits for the match() function */ + +-#define match_condassert 0x01 /* Called to check a condition assertion */ +-#define match_isgroup 0x02 /* Set if start of bracketed group */ ++#define match_condassert 0x01 /* Called to check a condition assertion */ ++#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ ++#define match_tail_recursed 0x04 /* Tail recursive call */ + + /* Non-error returns from the match() function. Error returns are externally + defined PCRE_ERROR_xxx codes, which are all negative. 
*/ +@@ -101,7 +98,7 @@ + static void + pchars(const uschar *p, int length, BOOL is_subject, match_data *md) + { +-int c; ++unsigned int c; + if (is_subject && length > md->end_subject - p) length = md->end_subject - p; + while (length-- > 0) + if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); +@@ -291,7 +288,6 @@ + + BOOL Xcur_is_word; + BOOL Xcondition; +- BOOL Xminimize; + BOOL Xprev_is_word; + + unsigned long int Xoriginal_ims; +@@ -303,11 +299,10 @@ + int Xprop_category; + int Xprop_chartype; + int Xprop_script; +- int *Xprop_test_variable; + #endif + + int Xctype; +- int Xfc; ++ unsigned int Xfc; + int Xfi; + int Xlength; + int Xmax; +@@ -340,10 +335,7 @@ + * Match from current position * + *************************************************/ + +-/* On entry ecode points to the first opcode, and eptr to the first character +-in the subject string, while eptrb holds the value of eptr at the start of the +-last bracketed group - used for breaking infinite loops matching zero-length +-strings. This function is called recursively in many circumstances. Whenever it ++/* This function is called recursively in many circumstances. Whenever it + returns a negative (error) response, the outer incarnation must also return the + same response. + +@@ -353,8 +345,8 @@ + made performance worse. 
+ + Arguments: +- eptr pointer in subject +- ecode position in code ++ eptr pointer to current character in subject ++ ecode pointer to current position in compiled code + offset_top current top pointer + md pointer to "static" info for the match + ims current /i, /m, and /s options +@@ -362,7 +354,9 @@ + brackets - for testing for empty matches + flags can contain + match_condassert - this is an assertion condition +- match_isgroup - this is the start of a bracketed group ++ match_cbegroup - this is the start of an unlimited repeat ++ group that can match an empty string ++ match_tail_recursed - this is a tail_recursed group + rdepth the recursion depth + + Returns: MATCH_MATCH if matched ) these values are >= 0 +@@ -377,14 +371,16 @@ + int flags, unsigned int rdepth) + { + /* These variables do not need to be preserved over recursion in this function, +-so they can be ordinary variables in all cases. Mark them with "register" +-because they are used a lot in loops. */ ++so they can be ordinary variables in all cases. Mark some of them with ++"register" because they are used a lot in loops. */ + + register int rrc; /* Returns from recursive calls */ + register int i; /* Used for loops not involving calls to RMATCH() */ +-register unsigned int c; /* Character values not kept over RMATCH() calls */ ++register unsigned int c; /* Character values not kept over RMATCH() calls */ + register BOOL utf8; /* Local copy of UTF-8 flag for speed */ + ++BOOL minimize, possessive; /* Quantifier options */ ++ + /* When recursion is not being used, all "local" variables that have to be + preserved over calls to RMATCH() are part of a "frame" which is obtained from + heap storage. 
Set up the top-level frame here; others are obtained from the +@@ -434,7 +430,6 @@ + + #define cur_is_word frame->Xcur_is_word + #define condition frame->Xcondition +-#define minimize frame->Xminimize + #define prev_is_word frame->Xprev_is_word + + #define original_ims frame->Xoriginal_ims +@@ -446,7 +441,6 @@ + #define prop_category frame->Xprop_category + #define prop_chartype frame->Xprop_chartype + #define prop_script frame->Xprop_script +-#define prop_test_variable frame->Xprop_test_variable + #endif + + #define ctype frame->Xctype +@@ -470,7 +464,7 @@ + get preserved during recursion in the normal way. In this environment, fi and + i, and fc and c, can be the same variables. */ + +-#else ++#else /* NO_RECURSE not defined */ + #define fi i + #define fc c + +@@ -489,7 +483,6 @@ + /* that do not have to be preserved over */ + BOOL cur_is_word; /* a recursive call to RMATCH(). */ + BOOL condition; +-BOOL minimize; + BOOL prev_is_word; + + unsigned long int original_ims; +@@ -501,7 +494,6 @@ + int prop_category; + int prop_chartype; + int prop_script; +-int *prop_test_variable; + #endif + + int ctype; +@@ -516,7 +508,7 @@ + int stacksave[REC_STACK_SAVE_MAX]; + + eptrblock newptrb; +-#endif ++#endif /* NO_RECURSE */ + + /* These statements are here to stop the compiler complaining about unitialized + variables. */ +@@ -524,9 +516,9 @@ + #ifdef SUPPORT_UCP + prop_value = 0; + prop_fail_result = 0; +-prop_test_variable = NULL; + #endif + ++ + /* This label is used for tail recursion, which is used in a few cases even + when NO_RECURSE is not defined, in order to reduce the amount of stack that is + used. Thanks to Ian Taylor for noticing this possibility and sending the +@@ -556,24 +548,34 @@ + utf8 = FALSE; + #endif + +-/* At the start of a bracketed group, add the current subject pointer to the +-stack of such pointers, to be re-instated at the end of the group when we hit +-the closing ket. 
When match() is called in other circumstances, we don't add to +-this stack. */ ++/* At the start of a group with an unlimited repeat that may match an empty ++string, the match_cbegroup flag is set. When this is the case, add the current ++subject pointer to the chain of such remembered pointers, to be checked when we ++hit the closing ket, in order to break infinite loops that match no characters. ++When match() is called in other circumstances, don't add to the chain. If this ++is a tail recursion, use a block from the workspace, as the one on the stack is ++already used. */ + +-if ((flags & match_isgroup) != 0) ++if ((flags & match_cbegroup) != 0) + { +- newptrb.epb_prev = eptrb; +- newptrb.epb_saved_eptr = eptr; +- eptrb = &newptrb; ++ eptrblock *p; ++ if ((flags & match_tail_recursed) != 0) ++ { ++ if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT); ++ p = md->eptrchain + md->eptrn++; ++ } ++ else p = &newptrb; ++ p->epb_saved_eptr = eptr; ++ p->epb_prev = eptrb; ++ eptrb = p; + } + +-/* Now start processing the operations. */ ++/* Now start processing the opcodes. */ + + for (;;) + { ++ minimize = possessive = FALSE; + op = *ecode; +- minimize = FALSE; + + /* For partial matching, remember if we ever hit the end of the subject after + matching at least one subject character. */ +@@ -583,33 +585,30 @@ + eptr > md->start_match) + md->hitend = TRUE; + +- /* Opening capturing bracket. If there is space in the offset vector, save +- the current subject position in the working slot at the top of the vector. We +- mustn't change the current values of the data slot, because they may be set +- from a previous iteration of this group, and be referred to by a reference +- inside the group. +- +- If the bracket fails to match, we need to restore this value and also the +- values of the final offsets, in case they were set by a previous iteration of +- the same bracket. 
+- +- If there isn't enough space in the offset vector, treat this as if it were a +- non-capturing bracket. Don't worry about setting the flag for the error case +- here; that is handled in the code for KET. */ +- +- if (op > OP_BRA) ++ switch(op) + { +- number = op - OP_BRA; +- +- /* For extended extraction brackets (large number), we have to fish out the +- number from a dummy opcode at the start. */ +- +- if (number > EXTRACT_BASIC_MAX) +- number = GET2(ecode, 2+LINK_SIZE); ++ /* Handle a capturing bracket. If there is space in the offset vector, save ++ the current subject position in the working slot at the top of the vector. ++ We mustn't change the current values of the data slot, because they may be ++ set from a previous iteration of this group, and be referred to by a ++ reference inside the group. ++ ++ If the bracket fails to match, we need to restore this value and also the ++ values of the final offsets, in case they were set by a previous iteration ++ of the same bracket. ++ ++ If there isn't enough space in the offset vector, treat this as if it were ++ a non-capturing bracket. Don't worry about setting the flag for the error ++ case here; that is handled in the code for KET. */ ++ ++ case OP_CBRA: ++ case OP_SCBRA: ++ number = GET2(ecode, 1+LINK_SIZE); + offset = number << 1; + + #ifdef DEBUG +- printf("start bracket %d subject=", number); ++ printf("start bracket %d\n", number); ++ printf("subject="); + pchars(eptr, 16, TRUE, md); + printf("\n"); + #endif +@@ -624,10 +623,11 @@ + DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); + md->offset_vector[md->offset_end - number] = eptr - md->start_subject; + ++ flags = (op == OP_SCBRA)? 
match_cbegroup : 0; + do + { +- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, +- match_isgroup); ++ RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ++ ims, eptrb, flags); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + md->capture_last = save_capture_last; + ecode += GET(ecode, 1); +@@ -643,39 +643,35 @@ + RRETURN(MATCH_NOMATCH); + } + +- /* Insufficient room for saving captured contents */ +- +- else op = OP_BRA; +- } +- +- /* Other types of node can be handled by a switch */ ++ /* Insufficient room for saving captured contents. Treat as a non-capturing ++ bracket. */ + +- switch(op) +- { +- case OP_BRA: /* Non-capturing bracket: optimized */ +- DPRINTF(("start bracket 0\n")); +- +- /* Loop for all the alternatives */ ++ DPRINTF(("insufficient capture room: treat as non-capturing\n")); + ++ /* Non-capturing bracket. Loop for all the alternatives. When we get to the ++ final alternative within the brackets, we would return the result of a ++ recursive call to match() whatever happened. We can reduce stack usage by ++ turning this into a tail recursion. */ ++ ++ case OP_BRA: ++ case OP_SBRA: ++ DPRINTF(("start non-capturing bracket\n")); ++ flags = (op >= OP_SBRA)? match_cbegroup : 0; + for (;;) + { +- /* When we get to the final alternative within the brackets, we would +- return the result of a recursive call to match() whatever happened. We +- can reduce stack usage by turning this into a tail recursion. */ +- + if (ecode[GET(ecode, 1)] != OP_ALT) +- { +- ecode += 1 + LINK_SIZE; +- flags = match_isgroup; +- DPRINTF(("bracket 0 tail recursion\n")); +- goto TAIL_RECURSE; +- } ++ { ++ ecode += _pcre_OP_lengths[*ecode]; ++ flags |= match_tail_recursed; ++ DPRINTF(("bracket 0 tail recursion\n")); ++ goto TAIL_RECURSE; ++ } + + /* For non-final alternatives, continue the loop for a NOMATCH result; + otherwise return. 
*/ + +- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, +- match_isgroup); ++ RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, ++ eptrb, flags); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode, 1); + } +@@ -688,54 +684,72 @@ + obeyed, we can use tail recursion to avoid using another stack frame. */ + + case OP_COND: +- if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */ ++ case OP_SCOND: ++ if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ ++ { ++ offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ ++ condition = md->recursive != NULL && ++ (offset == RREF_ANY || offset == md->recursive->group_num); ++ ecode += condition? 3 : GET(ecode, 1); ++ } ++ ++ else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ + { + offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ +- condition = (offset == CREF_RECURSE * 2)? +- (md->recursive != NULL) : +- (offset < offset_top && md->offset_vector[offset] >= 0); +- ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1)); +- flags = match_isgroup; +- goto TAIL_RECURSE; ++ condition = offset < offset_top && md->offset_vector[offset] >= 0; ++ ecode += condition? 3 : GET(ecode, 1); ++ } ++ ++ else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ ++ { ++ condition = FALSE; ++ ecode += GET(ecode, 1); + } + + /* The condition is an assertion. Call match() to evaluate it - setting +- the final argument TRUE causes it to stop at the end of an assertion. */ ++ the final argument match_condassert causes it to stop at the end of an ++ assertion. 
*/ + + else + { + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, +- match_condassert | match_isgroup); ++ match_condassert); + if (rrc == MATCH_MATCH) + { +- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2); ++ condition = TRUE; ++ ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); + while (*ecode == OP_ALT) ecode += GET(ecode, 1); + } + else if (rrc != MATCH_NOMATCH) + { + RRETURN(rrc); /* Need braces because of following else */ + } +- else ecode += GET(ecode, 1); ++ else ++ { ++ condition = FALSE; ++ ecode += GET(ecode, 1); ++ } ++ } + +- /* We are now at the branch that is to be obeyed. As there is only one, +- we can use tail recursion to avoid using another stack frame. */ ++ /* We are now at the branch that is to be obeyed. As there is only one, ++ we can use tail recursion to avoid using another stack frame. If the second ++ alternative doesn't exist, we can just plough on. */ + ++ if (condition || *ecode == OP_ALT) ++ { + ecode += 1 + LINK_SIZE; +- flags = match_isgroup; ++ flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0); + goto TAIL_RECURSE; + } +- /* Control never reaches here */ +- +- /* Skip over conditional reference or large extraction number data if +- encountered. */ +- +- case OP_CREF: +- case OP_BRANUMBER: +- ecode += 3; ++ else ++ { ++ ecode += 1 + LINK_SIZE; ++ } + break; + +- /* End of the pattern. If we are in a recursion, we should restore the +- offsets appropriately and continue from after the call. */ ++ ++ /* End of the pattern. If we are in a top-level recursion, we should ++ restore the offsets appropriately and continue from after the call. 
*/ + + case OP_END: + if (md->recursive != NULL && md->recursive->group_num == 0) +@@ -777,8 +791,7 @@ + case OP_ASSERTBACK: + do + { +- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, +- match_isgroup); ++ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0); + if (rrc == MATCH_MATCH) break; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode, 1); +@@ -804,8 +817,7 @@ + case OP_ASSERTBACK_NOT: + do + { +- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, +- match_isgroup); ++ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0); + if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode,1); +@@ -826,8 +838,8 @@ + #ifdef SUPPORT_UTF8 + if (utf8) + { +- c = GET(ecode,1); +- for (i = 0; i < c; i++) ++ i = GET(ecode, 1); ++ while (i-- > 0) + { + eptr--; + if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); +@@ -840,7 +852,7 @@ + /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ + + { +- eptr -= GET(ecode,1); ++ eptr -= GET(ecode, 1); + if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); + } + +@@ -897,13 +909,8 @@ + case OP_RECURSE: + { + callpat = md->start_code + GET(ecode, 1); +- new_recursive.group_num = *callpat - OP_BRA; +- +- /* For extended extraction brackets (large number), we have to fish out +- the number from a dummy opcode at the start. */ +- +- if (new_recursive.group_num > EXTRACT_BASIC_MAX) +- new_recursive.group_num = GET2(callpat, 2+LINK_SIZE); ++ new_recursive.group_num = (callpat == md->start_code)? 0 : ++ GET2(callpat, 1 + LINK_SIZE); + + /* Add to "recursing stack" */ + +@@ -936,10 +943,11 @@ + restore the offset and recursion data. */ + + DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); ++ flags = (*callpat >= OP_SBRA)? 
match_cbegroup : 0; + do + { +- RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, +- eptrb, match_isgroup); ++ RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, ++ md, ims, eptrb, flags); + if (rrc == MATCH_MATCH) + { + DPRINTF(("Recursion matched\n")); +@@ -983,7 +991,7 @@ + do + { + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, +- eptrb, match_isgroup); ++ eptrb, 0); + if (rrc == MATCH_MATCH) break; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode,1); +@@ -997,7 +1005,7 @@ + /* Continue as from after the assertion, updating the offsets high water + mark, since extracts may have been taken. */ + +- do ecode += GET(ecode,1); while (*ecode == OP_ALT); ++ do ecode += GET(ecode, 1); while (*ecode == OP_ALT); + + offset_top = md->end_offset_top; + eptr = md->end_match_ptr; +@@ -1031,15 +1039,15 @@ + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode = prev; +- flags = match_isgroup; ++ flags = match_tail_recursed; + goto TAIL_RECURSE; + } + else /* OP_KETRMAX */ + { +- RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); ++ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += 1 + LINK_SIZE; +- flags = 0; ++ flags = match_tail_recursed; + goto TAIL_RECURSE; + } + /* Control never gets here */ +@@ -1060,38 +1068,44 @@ + case OP_BRAZERO: + { + next = ecode+1; +- RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup); ++ RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + do next += GET(next,1); while (*next == OP_ALT); +- ecode = next + 1+LINK_SIZE; ++ ecode = next + 1 + LINK_SIZE; + } + break; + + case OP_BRAMINZERO: + { + next = ecode+1; +- do next += GET(next,1); while (*next == OP_ALT); +- RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, +- match_isgroup); ++ do next += 
GET(next, 1); while (*next == OP_ALT); ++ RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode++; + } + break; + +- /* End of a group, repeated or non-repeating. If we are at the end of +- an assertion "group", stop matching and return MATCH_MATCH, but record the +- current high water mark for use by positive assertions. Do this also +- for the "once" (not-backup up) groups. */ ++ /* End of a group, repeated or non-repeating. */ + + case OP_KET: + case OP_KETRMIN: + case OP_KETRMAX: + prev = ecode - GET(ecode, 1); +- saved_eptr = eptrb->epb_saved_eptr; + +- /* Back up the stack of bracket start pointers. */ ++ /* If this was a group that remembered the subject start, in order to break ++ infinite repeats of empty string matches, retrieve the subject start from ++ the chain. Otherwise, set it NULL. */ ++ ++ if (*prev >= OP_SBRA) ++ { ++ saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ ++ eptrb = eptrb->epb_prev; /* Backup to previous group */ ++ } ++ else saved_eptr = NULL; + +- eptrb = eptrb->epb_prev; ++ /* If we are at the end of an assertion group, stop matching and return ++ MATCH_MATCH, but record the current high water mark for use by positive ++ assertions. Do this also for the "once" (atomic) groups. */ + + if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || + *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || +@@ -1102,18 +1116,15 @@ + RRETURN(MATCH_MATCH); + } + +- /* In all other cases except a conditional group we have to check the +- group number back at the start and if necessary complete handling an +- extraction by setting the offsets and bumping the high water mark. */ ++ /* For capturing groups we have to check the group number back at the start ++ and if necessary complete handling an extraction by setting the offsets and ++ bumping the high water mark. Note that whole-pattern recursion is coded as ++ a recurse into group 0, so it won't be picked up here. 
Instead, we catch it ++ when the OP_END is reached. Other recursion is handled here. */ + +- if (*prev != OP_COND) ++ if (*prev == OP_CBRA || *prev == OP_SCBRA) + { +- number = *prev - OP_BRA; +- +- /* For extended extraction brackets (large number), we have to fish out +- the number from a dummy opcode at the start. */ +- +- if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE); ++ number = GET2(prev, 1+LINK_SIZE); + offset = number << 1; + + #ifdef DEBUG +@@ -1121,42 +1132,34 @@ + printf("\n"); + #endif + +- /* Test for a numbered group. This includes groups called as a result +- of recursion. Note that whole-pattern recursion is coded as a recurse +- into group 0, so it won't be picked up here. Instead, we catch it when +- the OP_END is reached. */ +- +- if (number > 0) ++ md->capture_last = number; ++ if (offset >= md->offset_max) md->offset_overflow = TRUE; else + { +- md->capture_last = number; +- if (offset >= md->offset_max) md->offset_overflow = TRUE; else +- { +- md->offset_vector[offset] = +- md->offset_vector[md->offset_end - number]; +- md->offset_vector[offset+1] = eptr - md->start_subject; +- if (offset_top <= offset) offset_top = offset + 2; +- } +- +- /* Handle a recursively called group. Restore the offsets +- appropriately and continue from after the call. */ +- +- if (md->recursive != NULL && md->recursive->group_num == number) +- { +- recursion_info *rec = md->recursive; +- DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); +- md->recursive = rec->prevrec; +- md->start_match = rec->save_start; +- memcpy(md->offset_vector, rec->offset_save, +- rec->saved_max * sizeof(int)); +- ecode = rec->after_call; +- ims = original_ims; +- break; +- } ++ md->offset_vector[offset] = ++ md->offset_vector[md->offset_end - number]; ++ md->offset_vector[offset+1] = eptr - md->start_subject; ++ if (offset_top <= offset) offset_top = offset + 2; ++ } ++ ++ /* Handle a recursively called group. 
Restore the offsets ++ appropriately and continue from after the call. */ ++ ++ if (md->recursive != NULL && md->recursive->group_num == number) ++ { ++ recursion_info *rec = md->recursive; ++ DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); ++ md->recursive = rec->prevrec; ++ md->start_match = rec->save_start; ++ memcpy(md->offset_vector, rec->offset_save, ++ rec->saved_max * sizeof(int)); ++ ecode = rec->after_call; ++ ims = original_ims; ++ break; + } + } + +- /* Reset the value of the ims flags, in case they got changed during +- the group. */ ++ /* For both capturing and non-capturing groups, reset the value of the ims ++ flags, in case they got changed during the group. */ + + ims = original_ims; + DPRINTF(("ims reset to %02lx\n", ims)); +@@ -1177,20 +1180,22 @@ + preceding bracket, in the appropriate order. In the second case, we can use + tail recursion to avoid using another stack frame. */ + ++ flags = (*prev >= OP_SBRA)? match_cbegroup : 0; ++ + if (*ecode == OP_KETRMIN) + { + RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode = prev; +- flags = match_isgroup; ++ flags |= match_tail_recursed; + goto TAIL_RECURSE; + } + else /* OP_KETRMAX */ + { +- RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); ++ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += 1 + LINK_SIZE; +- flags = 0; ++ flags = match_tail_recursed; + goto TAIL_RECURSE; + } + /* Control never gets here */ +@@ -1202,9 +1207,7 @@ + if ((ims & PCRE_MULTILINE) != 0) + { + if (eptr != md->start_subject && +- (eptr == md->end_subject || +- eptr < md->start_subject + md->nllen || +- !IS_NEWLINE(eptr - md->nllen))) ++ (eptr == md->end_subject || !WAS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + ecode++; + break; +@@ -1244,7 +1247,7 @@ + if (!md->endonly) + { + if (eptr != md->end_subject && +- (eptr != md->end_subject - md->nllen || 
!IS_NEWLINE(eptr))) ++ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) + RRETURN(MATCH_NOMATCH); + ecode++; + break; +@@ -1263,7 +1266,7 @@ + + case OP_EODN: + if (eptr != md->end_subject && +- (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) ++ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) + RRETURN(MATCH_NOMATCH); + ecode++; + break; +@@ -1319,8 +1322,7 @@ + case OP_ANY: + if ((ims & PCRE_DOTALL) == 0) + { +- if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) +- RRETURN(MATCH_NOMATCH); ++ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + } + if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (utf8) +@@ -1414,6 +1416,26 @@ + ecode++; + break; + ++ case OP_ANYNL: ++ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); ++ GETCHARINCTEST(c, eptr); ++ switch(c) ++ { ++ default: RRETURN(MATCH_NOMATCH); ++ case 0x000d: ++ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; ++ break; ++ case 0x000a: ++ case 0x000b: ++ case 0x000c: ++ case 0x0085: ++ case 0x2028: ++ case 0x2029: ++ break; ++ } ++ ecode++; ++ break; ++ + #ifdef SUPPORT_UCP + /* Check the next character by Unicode property. We will get here only + if the support is in the binary; otherwise a compile-time error occurs. */ +@@ -1456,7 +1478,6 @@ + + default: + RRETURN(PCRE_ERROR_INTERNAL); +- break; + } + + ecode += 3; +@@ -1926,7 +1947,7 @@ + + else + { +- int dc; ++ unsigned int dc; + GETCHARINC(dc, eptr); + ecode += length; + +@@ -1953,13 +1974,17 @@ + } + break; + +- /* Match a single character repeatedly; different opcodes share code. */ ++ /* Match a single character repeatedly. 
*/ + + case OP_EXACT: + min = max = GET2(ecode, 1); + ecode += 3; + goto REPEATCHAR; + ++ case OP_POSUPTO: ++ possessive = TRUE; ++ /* Fall through */ ++ + case OP_UPTO: + case OP_MINUPTO: + min = 0; +@@ -1968,6 +1993,27 @@ + ecode += 3; + goto REPEATCHAR; + ++ case OP_POSSTAR: ++ possessive = TRUE; ++ min = 0; ++ max = INT_MAX; ++ ecode++; ++ goto REPEATCHAR; ++ ++ case OP_POSPLUS: ++ possessive = TRUE; ++ min = 1; ++ max = INT_MAX; ++ ecode++; ++ goto REPEATCHAR; ++ ++ case OP_POSQUERY: ++ possessive = TRUE; ++ min = 0; ++ max = 1; ++ ecode++; ++ goto REPEATCHAR; ++ + case OP_STAR: + case OP_MINSTAR: + case OP_PLUS: +@@ -2003,10 +2049,9 @@ + uschar occhars[8]; + + #ifdef SUPPORT_UCP +- int othercase; ++ unsigned int othercase; + if ((ims & PCRE_CASELESS) != 0 && +- (othercase = _pcre_ucp_othercase(fc)) >= 0 && +- othercase >= 0) ++ (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) + oclength = _pcre_ord2utf8(othercase, occhars); + #endif /* SUPPORT_UCP */ + +@@ -2042,7 +2087,8 @@ + } + /* Control never gets here */ + } +- else ++ ++ else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) +@@ -2056,6 +2102,8 @@ + eptr += oclength; + } + } ++ ++ if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -2110,7 +2158,7 @@ + } + /* Control never gets here */ + } +- else ++ else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) +@@ -2118,6 +2166,7 @@ + if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; + eptr++; + } ++ if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -2146,7 +2195,7 @@ + } + /* Control never gets here */ + } +- else ++ else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) +@@ -2154,6 +2203,7 @@ + if (eptr >= md->end_subject || fc != *eptr) break; + eptr++; + } ++ if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -2206,6 
+2256,34 @@ + ecode += 3; + goto REPEATNOTCHAR; + ++ case OP_NOTPOSSTAR: ++ possessive = TRUE; ++ min = 0; ++ max = INT_MAX; ++ ecode++; ++ goto REPEATNOTCHAR; ++ ++ case OP_NOTPOSPLUS: ++ possessive = TRUE; ++ min = 1; ++ max = INT_MAX; ++ ecode++; ++ goto REPEATNOTCHAR; ++ ++ case OP_NOTPOSQUERY: ++ possessive = TRUE; ++ min = 0; ++ max = 1; ++ ecode++; ++ goto REPEATNOTCHAR; ++ ++ case OP_NOTPOSUPTO: ++ possessive = TRUE; ++ min = 0; ++ max = GET2(ecode, 1); ++ ecode += 3; ++ goto REPEATNOTCHAR; ++ + case OP_NOTSTAR: + case OP_NOTMINSTAR: + case OP_NOTPLUS: +@@ -2245,7 +2323,7 @@ + /* UTF-8 mode */ + if (utf8) + { +- register int d; ++ register unsigned int d; + for (i = 1; i <= min; i++) + { + GETCHARINC(d, eptr); +@@ -2270,7 +2348,7 @@ + /* UTF-8 mode */ + if (utf8) + { +- register int d; ++ register unsigned int d; + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -2306,7 +2384,7 @@ + /* UTF-8 mode */ + if (utf8) + { +- register int d; ++ register unsigned int d; + for (i = min; i < max; i++) + { + int len = 1; +@@ -2316,7 +2394,8 @@ + if (fc == d) break; + eptr += len; + } +- for(;;) ++ if (possessive) continue; ++ for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); +@@ -2333,6 +2412,7 @@ + if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; + eptr++; + } ++ if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -2354,7 +2434,7 @@ + /* UTF-8 mode */ + if (utf8) + { +- register int d; ++ register unsigned int d; + for (i = 1; i <= min; i++) + { + GETCHARINC(d, eptr); +@@ -2377,7 +2457,7 @@ + /* UTF-8 mode */ + if (utf8) + { +- register int d; ++ register unsigned int d; + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -2412,7 +2492,7 @@ + /* UTF-8 mode */ + if (utf8) + { +- register int d; ++ register unsigned int d; + for (i = min; i < max; i++) + { + 
int len = 1; +@@ -2421,6 +2501,7 @@ + if (fc == d) break; + eptr += len; + } ++ if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -2438,6 +2519,7 @@ + if (eptr >= md->end_subject || fc == *eptr) break; + eptr++; + } ++ if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -2469,6 +2551,34 @@ + ecode += 3; + goto REPEATTYPE; + ++ case OP_TYPEPOSSTAR: ++ possessive = TRUE; ++ min = 0; ++ max = INT_MAX; ++ ecode++; ++ goto REPEATTYPE; ++ ++ case OP_TYPEPOSPLUS: ++ possessive = TRUE; ++ min = 1; ++ max = INT_MAX; ++ ecode++; ++ goto REPEATTYPE; ++ ++ case OP_TYPEPOSQUERY: ++ possessive = TRUE; ++ min = 0; ++ max = 1; ++ ecode++; ++ goto REPEATTYPE; ++ ++ case OP_TYPEPOSUPTO: ++ possessive = TRUE; ++ min = 0; ++ max = GET2(ecode, 1); ++ ecode += 3; ++ goto REPEATTYPE; ++ + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: +@@ -2571,7 +2681,6 @@ + + default: + RRETURN(PCRE_ERROR_INTERNAL); +- break; + } + } + +@@ -2611,9 +2720,7 @@ + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject || +- ((ims & PCRE_DOTALL) == 0 && +- eptr <= md->end_subject - md->nllen && +- IS_NEWLINE(eptr))) ++ ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; +@@ -2624,6 +2731,28 @@ + eptr += min; + break; + ++ case OP_ANYNL: ++ for (i = 1; i <= min; i++) ++ { ++ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); ++ GETCHARINC(c, eptr); ++ switch(c) ++ { ++ default: RRETURN(MATCH_NOMATCH); ++ case 0x000d: ++ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; ++ break; ++ case 0x000a: ++ case 0x000b: ++ case 0x000c: ++ case 0x0085: ++ case 0x2028: ++ case 0x2029: ++ break; ++ } ++ } ++ break; ++ + case OP_NOT_DIGIT: + for (i = 1; i <= min; i++) + { +@@ -2692,7 +2821,8 @@ + #endif /* SUPPORT_UTF8 */ + + /* Code for the non-UTF-8 case for minimum matching of 
operators other +- than OP_PROP and OP_NOTPROP. */ ++ than OP_PROP and OP_NOTPROP. We can assume that there are the minimum ++ number of bytes present, as this was tested above. */ + + switch(ctype) + { +@@ -2701,8 +2831,7 @@ + { + for (i = 1; i <= min; i++) + { +- if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) +- RRETURN(MATCH_NOMATCH); ++ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + eptr++; + } + } +@@ -2713,6 +2842,28 @@ + eptr += min; + break; + ++ /* Because of the CRLF case, we can't assume the minimum number of ++ bytes are present in this case. */ ++ ++ case OP_ANYNL: ++ for (i = 1; i <= min; i++) ++ { ++ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); ++ switch(*eptr++) ++ { ++ default: RRETURN(MATCH_NOMATCH); ++ case 0x000d: ++ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; ++ break; ++ case 0x000a: ++ case 0x000b: ++ case 0x000c: ++ case 0x0085: ++ break; ++ } ++ } ++ break; ++ + case OP_NOT_DIGIT: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); +@@ -2774,7 +2925,7 @@ + GETCHARINC(c, eptr); + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + } +- break; ++ /* Control never gets here */ + + case PT_LAMP: + for (fi = min;; fi++) +@@ -2789,7 +2940,7 @@ + prop_chartype == ucp_Lt) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } +- break; ++ /* Control never gets here */ + + case PT_GC: + for (fi = min;; fi++) +@@ -2802,7 +2953,7 @@ + if ((prop_category == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } +- break; ++ /* Control never gets here */ + + case PT_PC: + for (fi = min;; fi++) +@@ -2815,7 +2966,7 @@ + if ((prop_chartype == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } +- break; ++ /* Control never gets here */ + + case PT_SC: + for (fi = min;; fi++) +@@ -2828,11 +2979,10 @@ + if ((prop_script == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } +- break; ++ /* Control never gets here */ + + default: + 
RRETURN(PCRE_ERROR_INTERNAL); +- break; + } + } + +@@ -2876,7 +3026,7 @@ + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && +- eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) ++ IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + + GETCHARINC(c, eptr); +@@ -2888,6 +3038,23 @@ + case OP_ANYBYTE: + break; + ++ case OP_ANYNL: ++ switch(c) ++ { ++ default: RRETURN(MATCH_NOMATCH); ++ case 0x000d: ++ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; ++ break; ++ case 0x000a: ++ case 0x000b: ++ case 0x000c: ++ case 0x0085: ++ case 0x2028: ++ case 0x2029: ++ break; ++ } ++ break; ++ + case OP_NOT_DIGIT: + if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); +@@ -2932,8 +3099,7 @@ + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || +- ((ims & PCRE_DOTALL) == 0 && +- eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) ++ ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + + c = *eptr++; +@@ -2945,6 +3111,21 @@ + case OP_ANYBYTE: + break; + ++ case OP_ANYNL: ++ switch(c) ++ { ++ default: RRETURN(MATCH_NOMATCH); ++ case 0x000d: ++ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; ++ break; ++ case 0x000a: ++ case 0x000b: ++ case 0x000c: ++ case 0x0085: ++ break; ++ } ++ break; ++ + case OP_NOT_DIGIT: + if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + break; +@@ -2977,7 +3158,7 @@ + /* Control never gets here */ + } + +- /* If maximizing it is worth using inline code for speed, doing the type ++ /* If maximizing, it is worth using inline code for speed, doing the type + test once at the start (i.e. keep it out of the loop). Again, keep the + UTF-8 and UCP stuff separate. 
*/ + +@@ -3058,6 +3239,7 @@ + + /* eptr is now past the end of the maximum run */ + ++ if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -3093,6 +3275,7 @@ + + /* eptr is now past the end of the maximum run */ + ++ if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -3135,9 +3318,7 @@ + { + for (i = min; i < max; i++) + { +- if (eptr >= md->end_subject || +- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) +- break; ++ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } +@@ -3161,9 +3342,7 @@ + { + for (i = min; i < max; i++) + { +- if (eptr >= md->end_subject || +- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) +- break; ++ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + } + break; +@@ -3171,7 +3350,8 @@ + else + { + c = max - min; +- if (c > md->end_subject - eptr) c = md->end_subject - eptr; ++ if (c > (unsigned int)(md->end_subject - eptr)) ++ c = md->end_subject - eptr; + eptr += c; + } + } +@@ -3181,10 +3361,32 @@ + + case OP_ANYBYTE: + c = max - min; +- if (c > md->end_subject - eptr) c = md->end_subject - eptr; ++ if (c > (unsigned int)(md->end_subject - eptr)) ++ c = md->end_subject - eptr; + eptr += c; + break; + ++ case OP_ANYNL: ++ for (i = min; i < max; i++) ++ { ++ int len = 1; ++ if (eptr >= md->end_subject) break; ++ GETCHARLEN(c, eptr, len); ++ if (c == 0x000d) ++ { ++ if (++eptr >= md->end_subject) break; ++ if (*eptr == 0x000a) eptr++; ++ } ++ else ++ { ++ if (c != 0x000a && c != 0x000b && c != 0x000c && ++ c != 0x0085 && c != 0x2028 && c != 0x2029) ++ break; ++ eptr += len; ++ } ++ } ++ break; ++ + case OP_NOT_DIGIT: + for (i = min; i < max; i++) + { +@@ -3257,6 +3459,7 @@ + + /* eptr is now past the end of the maximum run */ + ++ if (possessive) continue; + for(;;) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, 
eptrb, 0); +@@ -3277,9 +3480,7 @@ + { + for (i = min; i < max; i++) + { +- if (eptr >= md->end_subject || +- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) +- break; ++ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + } + break; +@@ -3288,10 +3489,30 @@ + + case OP_ANYBYTE: + c = max - min; +- if (c > md->end_subject - eptr) c = md->end_subject - eptr; ++ if (c > (unsigned int)(md->end_subject - eptr)) ++ c = md->end_subject - eptr; + eptr += c; + break; + ++ case OP_ANYNL: ++ for (i = min; i < max; i++) ++ { ++ if (eptr >= md->end_subject) break; ++ c = *eptr; ++ if (c == 0x000d) ++ { ++ if (++eptr >= md->end_subject) break; ++ if (*eptr == 0x000a) eptr++; ++ } ++ else ++ { ++ if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085) ++ break; ++ eptr++; ++ } ++ } ++ break; ++ + case OP_NOT_DIGIT: + for (i = min; i < max; i++) + { +@@ -3352,6 +3573,7 @@ + + /* eptr is now past the end of the maximum run */ + ++ if (possessive) continue; + while (eptr >= pp) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); +@@ -3366,14 +3588,12 @@ + } + /* Control never gets here */ + +- /* There's been some horrible disaster. Since all codes > OP_BRA are +- for capturing brackets, and there shouldn't be any gaps between 0 and +- OP_BRA, arrival here can only mean there is something seriously wrong +- in the code above or the OP_xxx definitions. */ ++ /* There's been some horrible disaster. Arrival here can only mean there is ++ something seriously wrong in the code above or the OP_xxx definitions. 
*/ + + default: + DPRINTF(("Unknown opcode %d\n", *ecode)); +- RRETURN(PCRE_ERROR_UNKNOWN_NODE); ++ RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); + } + + /* Do not stick any code in here without much thought; it is assumed +@@ -3411,7 +3631,6 @@ + + #undef cur_is_word + #undef condition +-#undef minimize + #undef prev_is_word + + #undef original_ims +@@ -3484,6 +3703,7 @@ + BOOL firstline; + BOOL first_byte_caseless = FALSE; + BOOL req_byte_caseless = FALSE; ++BOOL utf8; + match_data match_block; + match_data *md = &match_block; + const uschar *tables; +@@ -3491,6 +3711,7 @@ + USPTR start_match = (USPTR)subject + start_offset; + USPTR end_subject; + USPTR req_byte_ptr = start_match - 1; ++eptrblock eptrchain[EPTR_WORK_SIZE]; + + pcre_study_data internal_study; + const pcre_study_data *study; +@@ -3567,7 +3788,7 @@ + end_subject = md->end_subject; + + md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; +-md->utf8 = (re->options & PCRE_UTF8) != 0; ++utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; + + md->notbol = (options & PCRE_NOTBOL) != 0; + md->noteol = (options & PCRE_NOTEOL) != 0; +@@ -3576,6 +3797,7 @@ + md->hitend = FALSE; + + md->recursive = NULL; /* No recursion at top level */ ++md->eptrchain = eptrchain; /* Make workspace generally available */ + + md->lcc = tables + lcc_offset; + md->ctypes = tables + ctypes_offset; +@@ -3583,26 +3805,36 @@ + /* Handle different types of newline. The two bits give four cases. If nothing + is set at run time, whatever was used at compile time applies. */ + +-switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & +- PCRE_NEWLINE_CRLF) ++switch ((((options & PCRE_NEWLINE_BITS) == 0)? 
re->options : options) & ++ PCRE_NEWLINE_BITS) + { +- default: newline = NEWLINE; break; /* Compile-time default */ ++ case 0: newline = NEWLINE; break; /* Compile-time default */ + case PCRE_NEWLINE_CR: newline = '\r'; break; + case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR+ + PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; ++ case PCRE_NEWLINE_ANY: newline = -1; break; ++ default: return PCRE_ERROR_BADNEWLINE; + } + +-if (newline > 255) ++if (newline < 0) + { +- md->nllen = 2; +- md->nl[0] = (newline >> 8) & 255; +- md->nl[1] = newline & 255; ++ md->nltype = NLTYPE_ANY; + } + else + { +- md->nllen = 1; +- md->nl[0] = newline; ++ md->nltype = NLTYPE_FIXED; ++ if (newline > 255) ++ { ++ md->nllen = 2; ++ md->nl[0] = (newline >> 8) & 255; ++ md->nl[1] = newline & 255; ++ } ++ else ++ { ++ md->nllen = 1; ++ md->nl[0] = newline; ++ } + } + + /* Partial matching is supported only for a restricted set of regexes at the +@@ -3615,7 +3847,7 @@ + back the character offset. */ + + #ifdef SUPPORT_UTF8 +-if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) ++if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) + { + if (_pcre_valid_utf8((uschar *)subject, length) >= 0) + return PCRE_ERROR_BADUTF8; +@@ -3707,10 +3939,13 @@ + req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ + } + ++ ++/* ==========================================================================*/ ++ + /* Loop for handling unanchored repeated matching attempts; for anchored regexs + the loop runs just once. */ + +-do ++for(;;) + { + USPTR save_end_subject = end_subject; + +@@ -3725,14 +3960,14 @@ + + /* Advance to a unique first char if possible. If firstline is TRUE, the + start of the match is constrained to the first line of a multiline string. +- Implement this by temporarily adjusting end_subject so that we stop scanning +- at a newline. If the match fails at the newline, later code breaks this loop. +- */ ++ That is, the match must be before or at the first newline. 
Implement this by ++ temporarily adjusting end_subject so that we stop scanning at a newline. If ++ the match fails at the newline, later code breaks this loop. */ + + if (firstline) + { + USPTR t = start_match; +- while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; ++ while (t < md->end_subject && !IS_NEWLINE(t)) t++; + end_subject = t; + } + +@@ -3753,11 +3988,9 @@ + + else if (startline) + { +- if (start_match >= md->start_subject + md->nllen + +- start_offset) ++ if (start_match > md->start_subject + start_offset) + { +- while (start_match <= end_subject && +- !IS_NEWLINE(start_match - md->nllen)) ++ while (start_match <= end_subject && !WAS_NEWLINE(start_match)) + start_match++; + } + } +@@ -3793,8 +4026,8 @@ + + HOWEVER: when the subject string is very, very long, searching to its end can + take a long time, and give bad performance on quite ordinary patterns. This +- showed up when somebody was matching /^C/ on a 32-megabyte string... so we +- don't do this when the string is sufficiently long. ++ showed up when somebody was matching something like /^\d+C/ on a 32-megabyte ++ string... so we don't do this when the string is sufficiently long. + + ALSO: this processing is disabled when partial matching is requested. + */ +@@ -3826,9 +4059,14 @@ + } + } + +- /* If we can't find the required character, break the matching loop */ ++ /* If we can't find the required character, break the matching loop, ++ forcing a match failure. */ + +- if (p >= end_subject) break; ++ if (p >= end_subject) ++ { ++ rc = MATCH_NOMATCH; ++ break; ++ } + + /* If we have found the required character, save the point where we + found it, so that we don't search again next time round the loop if +@@ -3838,49 +4076,70 @@ + } + } + +- /* When a match occurs, substrings will be set for all internal extractions; +- we just need to set up the whole thing as substring 0 before returning. If +- there were too many extractions, set the return code to zero. 
In the case +- where we had to get some local store to hold offsets for backreferences, copy +- those back references that we can. In this case there need not be overflow +- if certain parts of the pattern were not used. */ ++ /* OK, we can now run the match. */ + + md->start_match = start_match; + md->match_call_count = 0; ++ md->eptrn = 0; /* Next free eptrchain slot */ ++ rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0); + +- rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0); ++ /* Any return other than MATCH_NOMATCH breaks the loop. */ + +- /* When the result is no match, if the subject's first character was a +- newline and the PCRE_FIRSTLINE option is set, break (which will return +- PCRE_ERROR_NOMATCH). The option requests that a match occur before the first +- newline in the subject. Otherwise, advance the pointer to the next character +- and continue - but the continuation will actually happen only when the +- pattern is not anchored. */ ++ if (rc != MATCH_NOMATCH) break; + +- if (rc == MATCH_NOMATCH) +- { +- if (firstline && +- start_match <= md->end_subject - md->nllen && +- IS_NEWLINE(start_match)) +- break; +- start_match++; ++ /* If PCRE_FIRSTLINE is set, the match must happen before or at the first ++ newline in the subject (though it may continue over the newline). Therefore, ++ if we have just failed to match, starting at a newline, do not continue. */ ++ ++ if (firstline && IS_NEWLINE(start_match)) break; ++ ++ /* Advance the match position by one character. 
*/ ++ ++ start_match++; + #ifdef SUPPORT_UTF8 +- if (md->utf8) +- while(start_match < end_subject && (*start_match & 0xc0) == 0x80) +- start_match++; ++ if (utf8) ++ while(start_match < end_subject && (*start_match & 0xc0) == 0x80) ++ start_match++; + #endif +- continue; +- } + +- if (rc != MATCH_MATCH) +- { +- DPRINTF((">>>> error: returning %d\n", rc)); +- return rc; +- } ++ /* Break the loop if the pattern is anchored or if we have passed the end of ++ the subject. */ ++ ++ if (anchored || start_match > end_subject) break; ++ ++ /* If we have just passed a CR and the newline option is CRLF or ANY, and we ++ are now at a LF, advance the match position by one more character. */ ++ ++ if (start_match[-1] == '\r' && ++ (md->nltype == NLTYPE_ANY || md->nllen == 2) && ++ start_match < end_subject && ++ *start_match == '\n') ++ start_match++; ++ ++ } /* End of for(;;) "bumpalong" loop */ ++ ++/* ==========================================================================*/ ++ ++/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping ++conditions is true: + +- /* We have a match! Copy the offset information from temporary store if +- necessary */ ++(1) The pattern is anchored; + ++(2) We are past the end of the subject; ++ ++(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because ++ this option requests that a match occur at or before the first newline in ++ the subject. ++ ++When we have a match and the offset vector is big enough to deal with any ++backreferences, captured substring offsets will already be set up. In the case ++where we had to get some local store to hold offsets for backreference ++processing, copy those that we can. In this case there need not be overflow if ++certain parts of the pattern were not used, even though there are more ++capturing parentheses than vector slots. 
*/ ++ ++if (rc == MATCH_MATCH) ++ { + if (using_temporary_offsets) + { + if (offsetcount >= 4) +@@ -3889,15 +4148,18 @@ + (offsetcount - 2) * sizeof(int)); + DPRINTF(("Copied offsets from temporary memory\n")); + } +- if (md->end_offset_top > offsetcount) +- md->offset_overflow = TRUE; +- ++ if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; + DPRINTF(("Freeing temporary memory\n")); + (pcre_free)(md->offset_vector); + } + ++ /* Set the return code to the number of captured strings, or 0 if there are ++ too many to fit into the vector. */ ++ + rc = md->offset_overflow? 0 : md->end_offset_top/2; + ++ /* If there is space, set up the whole thing as substring 0. */ ++ + if (offsetcount < 2) rc = 0; else + { + offsets[0] = start_match - md->start_subject; +@@ -3908,9 +4170,8 @@ + return rc; + } + +-/* This "while" is the end of the "do" above */ +- +-while (!anchored && start_match <= end_subject); ++/* Control gets here if there has been an error, or if the overall match ++attempt has failed at all permitted starting positions. */ + + if (using_temporary_offsets) + { +@@ -3918,7 +4179,12 @@ + (pcre_free)(md->offset_vector); + } + +-if (md->partial && md->hitend) ++if (rc != MATCH_NOMATCH) ++ { ++ DPRINTF((">>>> error: returning %d\n", rc)); ++ return rc; ++ } ++else if (md->partial && md->hitend) + { + DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); + return PCRE_ERROR_PARTIAL; +diff -ruN ../pcre.orig/pcrelib/pcre_globals.c ./pcrelib/pcre_globals.c +--- ../pcre.orig/pcrelib/pcre_globals.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_globals.c Fri Feb 9 22:31:19 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -51,6 +51,18 @@ + + + #ifndef VPCOMPAT ++ ++/************************************************************************** ++This code used to be here for use when compiling as a C++ library. However, ++according to Dair Grant it is not needed: " ++ ++ Including 'extern "C"' in the declaration generates an "initialized and ++ declared `extern'" warning from gcc 4.0.1. Since we include pcre_internal.h, ++ which includes pcre.h, which declares these prototypes within an extern "C" {} ++ block, we shouldn't need the prefix here. ++ ++So, from Release 7.0 I have cut this out. ++ + #ifdef __cplusplus + extern "C" void *(*pcre_malloc)(size_t) = malloc; + extern "C" void (*pcre_free)(void *) = free; +@@ -58,12 +70,13 @@ + extern "C" void (*pcre_stack_free)(void *) = free; + extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL; + #else ++**************************************************************************/ ++ + void *(*pcre_malloc)(size_t) = malloc; + void (*pcre_free)(void *) = free; + void *(*pcre_stack_malloc)(size_t) = malloc; + void (*pcre_stack_free)(void *) = free; + int (*pcre_callout)(pcre_callout_block *) = NULL; +-#endif + #endif + + /* End of pcre_globals.c */ +diff -ruN ../pcre.orig/pcrelib/pcre_internal.h ./pcrelib/pcre_internal.h +--- ../pcre.orig/pcrelib/pcre_internal.h Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_internal.h Fri Feb 9 22:31:20 2007 +@@ -7,7 +7,7 @@ + and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -54,12 +54,16 @@ + /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef + inline, and there are *still* stupid compilers about that don't like indented + pre-processor statements, or at least there were when I first wrote this. After +-all, it had only been about 10 years then... */ ++all, it had only been about 10 years then... + ++It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so ++be absolutely sure we get our version. */ ++ ++#undef DPRINTF + #ifdef DEBUG + #define DPRINTF(p) printf p + #else +-#define DPRINTF(p) /*nothing*/ ++#define DPRINTF(p) /* Nothing */ + #endif + + +@@ -118,13 +122,48 @@ + + typedef unsigned char uschar; + +-/* PCRE is able to support 3 different kinds of newline (CR, LF, CRLF). The +-following macro is used to package up testing for newlines. NLBLOCK is defined +-in the various modules to indicate in which datablock the parameters exist. */ ++/* This is an unsigned int value that no character can ever have. UTF-8 ++characters only go up to 0x7fffffff (though Unicode doesn't go beyond ++0x0010ffff). */ ++ ++#define NOTACHAR 0xffffffff ++ ++/* PCRE is able to support several different kinds of newline (CR, LF, CRLF, ++and "all" at present). The following macros are used to package up testing for ++newlines. NLBLOCK, PSSTART, and PSEND are defined in the various modules to ++indicate in which datablock the parameters exist, and what the start/end of ++string field names are. 
*/ ++ ++#define NLTYPE_FIXED 0 /* Newline is a fixed length string */ ++#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ ++ ++/* This macro checks for a newline at the given position */ + + #define IS_NEWLINE(p) \ +- ((p)[0] == NLBLOCK->nl[0] && \ +- (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1])) ++ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ ++ ((p) < NLBLOCK->PSEND && \ ++ _pcre_is_newline((p), NLBLOCK->PSEND, &(NLBLOCK->nllen), utf8) \ ++ ) \ ++ : \ ++ ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ ++ (p)[0] == NLBLOCK->nl[0] && \ ++ (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \ ++ ) \ ++ ) ++ ++/* This macro checks for a newline immediately preceding the given position */ ++ ++#define WAS_NEWLINE(p) \ ++ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ ++ ((p) > NLBLOCK->PSSTART && \ ++ _pcre_was_newline((p), NLBLOCK->PSSTART, &(NLBLOCK->nllen), utf8) \ ++ ) \ ++ : \ ++ ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ ++ (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ ++ (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \ ++ ) \ ++ ) + + /* When PCRE is compiled as a C++ library, the subject pointer can be replaced + with a custom type. 
This makes it possible, for example, to allow pcre_exec() +@@ -282,7 +321,7 @@ + + #define GETCHAR(c, eptr) \ + c = *eptr; \ +- if ((c & 0xc0) == 0xc0) \ ++ if (c >= 0xc0) \ + { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ +@@ -300,7 +339,7 @@ + + #define GETCHARTEST(c, eptr) \ + c = *eptr; \ +- if (utf8 && (c & 0xc0) == 0xc0) \ ++ if (utf8 && c >= 0xc0) \ + { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ +@@ -318,7 +357,7 @@ + + #define GETCHARINC(c, eptr) \ + c = *eptr++; \ +- if ((c & 0xc0) == 0xc0) \ ++ if (c >= 0xc0) \ + { \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcss = 6*gcaa; \ +@@ -334,7 +373,7 @@ + + #define GETCHARINCTEST(c, eptr) \ + c = *eptr++; \ +- if (utf8 && (c & 0xc0) == 0xc0) \ ++ if (utf8 && c >= 0xc0) \ + { \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcss = 6*gcaa; \ +@@ -351,7 +390,7 @@ + + #define GETCHARLEN(c, eptr, len) \ + c = *eptr; \ +- if ((c & 0xc0) == 0xc0) \ ++ if (c >= 0xc0) \ + { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ +@@ -404,20 +443,21 @@ + /* Masks for identifying the public options that are permitted at compile + time, run time, or study time, respectively. 
*/ + ++#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY) ++ + #define PUBLIC_OPTIONS \ + (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ + PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ + PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ +- PCRE_DUPNAMES|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF) ++ PCRE_DUPNAMES|PCRE_NEWLINE_BITS) + + #define PUBLIC_EXEC_OPTIONS \ + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ +- PCRE_PARTIAL|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF) ++ PCRE_PARTIAL|PCRE_NEWLINE_BITS) + + #define PUBLIC_DFA_EXEC_OPTIONS \ + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ +- PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_CR| \ +- PCRE_NEWLINE_LF) ++ PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS) + + #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ + +@@ -449,9 +489,7 @@ + #define FALSE 0 + #define TRUE 1 + +-/* Escape items that are just an encoding of a particular data value. Note that +-ESC_n is defined as yet another macro, which is set in config.h to either \n +-(the default) or \r (which some people want). */ ++/* Escape items that are just an encoding of a particular data value. */ + + #ifndef ESC_e + #define ESC_e 27 +@@ -462,7 +500,7 @@ + #endif + + #ifndef ESC_n +-#define ESC_n NEWLINE ++#define ESC_n '\n' + #endif + + #ifndef ESC_r +@@ -501,21 +539,28 @@ + their negation. Also, they must appear in the same order as in the opcode + definitions below, up to ESC_z. There's a dummy for OP_ANY because it + corresponds to "." rather than an escape sequence. The final one must be +-ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two +-tests in the code for an escape greater than ESC_b and less than ESC_Z to +-detect the types that may be repeated. These are the types that consume +-characters. 
If any new escapes are put in between that don't consume a ++ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). ++There are two tests in the code for an escape greater than ESC_b and less than ++ESC_Z to detect the types that may be repeated. These are the types that ++consume characters. If any new escapes are put in between that don't consume a + character, that code will have to change. */ + + enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, +- ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E, +- ESC_Q, ESC_REF }; ++ ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_X, ESC_Z, ESC_z, ++ ESC_E, ESC_Q, ESC_k, ESC_REF }; ++ + + /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets + that extract substrings. Starting from 1 (i.e. after OP_END), the values up to + OP_EOD must correspond in order to the list of escapes immediately above. +-Note that whenever this list is updated, the two macro definitions that follow +-must also be updated to match. */ ++ ++To keep stored, compiled patterns compatible, new opcodes should be added ++immediately before OP_BRA, where (since release 7.0) a gap is left for this ++purpose. ++ ++*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions ++that follow must also be updated to match. There is also a table called ++"coptable" in pcre_dfa_exec.c that must be updated. */ + + enum { + OP_END, /* 0 End of pattern */ +@@ -536,110 +581,122 @@ + OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */ + OP_NOTPROP, /* 13 \P (not Unicode property) */ + OP_PROP, /* 14 \p (Unicode property) */ +- OP_EXTUNI, /* 15 \X (extended Unicode sequence */ +- OP_EODN, /* 16 End of data or \n at end of data: \Z. 
*/ +- OP_EOD, /* 17 End of data: \z */ +- +- OP_OPT, /* 18 Set runtime options */ +- OP_CIRC, /* 19 Start of line - varies with multiline switch */ +- OP_DOLL, /* 20 End of line - varies with multiline switch */ +- OP_CHAR, /* 21 Match one character, casefully */ +- OP_CHARNC, /* 22 Match one character, caselessly */ +- OP_NOT, /* 23 Match one character, not the following one */ +- +- OP_STAR, /* 24 The maximizing and minimizing versions of */ +- OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */ +- OP_PLUS, /* 26 the minimizing one second. */ +- OP_MINPLUS, /* 27 This first set applies to single characters */ +- OP_QUERY, /* 28 */ +- OP_MINQUERY, /* 29 */ +- OP_UPTO, /* 30 From 0 to n matches */ +- OP_MINUPTO, /* 31 */ +- OP_EXACT, /* 32 Exactly n matches */ +- +- OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */ +- OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */ +- OP_NOTPLUS, /* 35 the minimizing one second. */ +- OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */ +- OP_NOTQUERY, /* 37 */ +- OP_NOTMINQUERY, /* 38 */ +- OP_NOTUPTO, /* 39 From 0 to n matches */ +- OP_NOTMINUPTO, /* 40 */ +- OP_NOTEXACT, /* 41 Exactly n matches */ +- +- OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */ +- OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */ +- OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */ +- OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */ +- OP_TYPEQUERY, /* 46 This set applies to character types such as \d */ +- OP_TYPEMINQUERY, /* 47 */ +- OP_TYPEUPTO, /* 48 From 0 to n matches */ +- OP_TYPEMINUPTO, /* 49 */ +- OP_TYPEEXACT, /* 50 Exactly n matches */ +- +- OP_CRSTAR, /* 51 The maximizing and minimizing versions of */ +- OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */ +- OP_CRPLUS, /* 53 the minimizing one second. These codes must */ +- OP_CRMINPLUS, /* 54 be in exactly the same order as those above. 
*/ +- OP_CRQUERY, /* 55 These are for character classes and back refs */ +- OP_CRMINQUERY, /* 56 */ +- OP_CRRANGE, /* 57 These are different to the three sets above. */ +- OP_CRMINRANGE, /* 58 */ ++ OP_ANYNL, /* 15 \R (any newline sequence) */ ++ OP_EXTUNI, /* 16 \X (extended Unicode sequence) */ ++ OP_EODN, /* 17 End of data or \n at end of data: \Z. */ ++ OP_EOD, /* 18 End of data: \z */ ++ ++ OP_OPT, /* 19 Set runtime options */ ++ OP_CIRC, /* 20 Start of line - varies with multiline switch */ ++ OP_DOLL, /* 21 End of line - varies with multiline switch */ ++ OP_CHAR, /* 22 Match one character, casefully */ ++ OP_CHARNC, /* 23 Match one character, caselessly */ ++ OP_NOT, /* 24 Match one character, not the following one */ ++ ++ OP_STAR, /* 25 The maximizing and minimizing versions of */ ++ OP_MINSTAR, /* 26 these six opcodes must come in pairs, with */ ++ OP_PLUS, /* 27 the minimizing one second. */ ++ OP_MINPLUS, /* 28 This first set applies to single characters.*/ ++ OP_QUERY, /* 29 */ ++ OP_MINQUERY, /* 30 */ ++ ++ OP_UPTO, /* 31 From 0 to n matches */ ++ OP_MINUPTO, /* 32 */ ++ OP_EXACT, /* 33 Exactly n matches */ ++ ++ OP_POSSTAR, /* 34 Possessified star */ ++ OP_POSPLUS, /* 35 Possessified plus */ ++ OP_POSQUERY, /* 36 Possessified query */ ++ OP_POSUPTO, /* 37 Possessified upto */ ++ ++ OP_NOTSTAR, /* 38 The maximizing and minimizing versions of */ ++ OP_NOTMINSTAR, /* 39 these six opcodes must come in pairs, with */ ++ OP_NOTPLUS, /* 40 the minimizing one second. They must be in */ ++ OP_NOTMINPLUS, /* 41 exactly the same order as those above. */ ++ OP_NOTQUERY, /* 42 This set applies to "not" single characters. 
*/ ++ OP_NOTMINQUERY, /* 43 */ ++ ++ OP_NOTUPTO, /* 44 From 0 to n matches */ ++ OP_NOTMINUPTO, /* 45 */ ++ OP_NOTEXACT, /* 46 Exactly n matches */ ++ ++ OP_NOTPOSSTAR, /* 47 Possessified versions */ ++ OP_NOTPOSPLUS, /* 48 */ ++ OP_NOTPOSQUERY, /* 49 */ ++ OP_NOTPOSUPTO, /* 50 */ ++ ++ OP_TYPESTAR, /* 51 The maximizing and minimizing versions of */ ++ OP_TYPEMINSTAR, /* 52 these six opcodes must come in pairs, with */ ++ OP_TYPEPLUS, /* 53 the minimizing one second. These codes must */ ++ OP_TYPEMINPLUS, /* 54 be in exactly the same order as those above. */ ++ OP_TYPEQUERY, /* 55 This set applies to character types such as \d */ ++ OP_TYPEMINQUERY, /* 56 */ ++ ++ OP_TYPEUPTO, /* 57 From 0 to n matches */ ++ OP_TYPEMINUPTO, /* 58 */ ++ OP_TYPEEXACT, /* 59 Exactly n matches */ ++ ++ OP_TYPEPOSSTAR, /* 60 Possessified versions */ ++ OP_TYPEPOSPLUS, /* 61 */ ++ OP_TYPEPOSQUERY, /* 62 */ ++ OP_TYPEPOSUPTO, /* 63 */ ++ ++ OP_CRSTAR, /* 64 The maximizing and minimizing versions of */ ++ OP_CRMINSTAR, /* 65 all these opcodes must come in pairs, with */ ++ OP_CRPLUS, /* 66 the minimizing one second. These codes must */ ++ OP_CRMINPLUS, /* 67 be in exactly the same order as those above. */ ++ OP_CRQUERY, /* 68 These are for character classes and back refs */ ++ OP_CRMINQUERY, /* 69 */ ++ OP_CRRANGE, /* 70 These are different to the three sets above. */ ++ OP_CRMINRANGE, /* 71 */ + +- OP_CLASS, /* 59 Match a character class, chars < 256 only */ +- OP_NCLASS, /* 60 Same, but the bitmap was created from a negative ++ OP_CLASS, /* 72 Match a character class, chars < 256 only */ ++ OP_NCLASS, /* 73 Same, but the bitmap was created from a negative + class - the difference is relevant only when a UTF-8 + character > 255 is encountered. */ + +- OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the ++ OP_XCLASS, /* 74 Extended class for handling UTF-8 chars within the + class. This does both positive and negative. 
*/ + +- OP_REF, /* 62 Match a back reference */ +- OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */ +- OP_CALLOUT, /* 64 Call out to external function if provided */ +- +- OP_ALT, /* 65 Start of alternation */ +- OP_KET, /* 66 End of group that doesn't have an unbounded repeat */ +- OP_KETRMAX, /* 67 These two must remain together and in this */ +- OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */ +- +- /* The assertions must come before ONCE and COND */ +- +- OP_ASSERT, /* 69 Positive lookahead */ +- OP_ASSERT_NOT, /* 70 Negative lookahead */ +- OP_ASSERTBACK, /* 71 Positive lookbehind */ +- OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */ +- OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */ +- +- /* ONCE and COND must come after the assertions, with ONCE first, as there's +- a test for >= ONCE for a subpattern that isn't an assertion. */ +- +- OP_ONCE, /* 74 Once matched, don't back up into the subpattern */ +- OP_COND, /* 75 Conditional group */ +- OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */ +- +- OP_BRAZERO, /* 77 These two must remain together and in this */ +- OP_BRAMINZERO, /* 78 order. */ +- +- OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater +- than can fit into an opcode. */ +- +- OP_BRA /* 80 This and greater values are used for brackets that +- extract substrings up to EXTRACT_BASIC_MAX. After +- that, use is made of OP_BRANUMBER. */ +-}; +- +-/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and +-study.c that all opcodes are less than 128 in value. This makes handling UTF-8 +-character sequences easier. */ +- +-/* The highest extraction number before we have to start using additional +-bytes. (Originally PCRE didn't have support for extraction counts highter than +-this number.) The value is limited by the number of opcodes left after OP_BRA, +-i.e. 255 - OP_BRA. 
We actually set it a bit lower to leave room for additional +-opcodes. */ ++ OP_REF, /* 75 Match a back reference */ ++ OP_RECURSE, /* 76 Match a numbered subpattern (possibly recursive) */ ++ OP_CALLOUT, /* 77 Call out to external function if provided */ ++ ++ OP_ALT, /* 78 Start of alternation */ ++ OP_KET, /* 79 End of group that doesn't have an unbounded repeat */ ++ OP_KETRMAX, /* 80 These two must remain together and in this */ ++ OP_KETRMIN, /* 81 order. They are for groups the repeat for ever. */ ++ ++ /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ ++ ++ OP_ASSERT, /* 82 Positive lookahead */ ++ OP_ASSERT_NOT, /* 83 Negative lookahead */ ++ OP_ASSERTBACK, /* 84 Positive lookbehind */ ++ OP_ASSERTBACK_NOT, /* 85 Negative lookbehind */ ++ OP_REVERSE, /* 86 Move pointer back - used in lookbehind assertions */ ++ ++ /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, ++ as there's a test for >= ONCE for a subpattern that isn't an assertion. */ ++ ++ OP_ONCE, /* 87 Atomic group */ ++ OP_BRA, /* 88 Start of non-capturing bracket */ ++ OP_CBRA, /* 89 Start of capturing bracket */ ++ OP_COND, /* 90 Conditional group */ ++ ++ /* These three must follow the previous three, in the same order. There's a ++ check for >= SBRA to distinguish the two sets. */ ++ ++ OP_SBRA, /* 91 Start of non-capturing bracket, check empty */ ++ OP_SCBRA, /* 92 Start of capturing bracket, check empty */ ++ OP_SCOND, /* 93 Conditional group, check empty */ ++ ++ OP_CREF, /* 94 Used to hold a capture number as condition */ ++ OP_RREF, /* 95 Used to hold a recursion number as condition */ ++ OP_DEF, /* 96 The DEFINE condition */ + +-#define EXTRACT_BASIC_MAX 100 ++ OP_BRAZERO, /* 97 These two must remain together and in this */ ++ OP_BRAMINZERO /* 98 order. */ ++}; + + + /* This macro defines textual names for all the opcodes. 
These are used only +@@ -648,17 +705,21 @@ + #define OP_NAME_LIST \ + "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \ + "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ +- "notprop", "prop", "extuni", \ ++ "notprop", "prop", "anynl", "extuni", \ + "\\Z", "\\z", \ + "Opt", "^", "$", "char", "charnc", "not", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ ++ "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ ++ "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ ++ "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", \ + "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ + "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ +- "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\ +- "Brazero", "Braminzero", "Branumber", "Bra" ++ "AssertB", "AssertB not", "Reverse", \ ++ "Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", \ ++ "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero" + + + /* This macro defines the length of fixed length operations in the compiled +@@ -674,7 +735,7 @@ + 1, /* End */ \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \ + 1, 1, /* Any, Anybyte */ \ +- 3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \ ++ 3, 3, 1, 1, /* NOTPROP, PROP, ANYNL, EXTUNI */ \ + 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ + 2, /* Char - the minimum length */ \ + 2, /* Charnc - the minimum length */ \ +@@ -682,12 +743,15 @@ + /* Positive single-char repeats ** These are */ \ + 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ + 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \ ++ 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ + /* Negative single-char repeats - only for chars < 256 */ \ + 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ + 4, 4, 4, /* NOT upto, minupto, exact */ \ ++ 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \ + /* Positive type repeats */ \ + 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? 
*/ \ + 4, 4, 4, /* Type upto, minupto, exact */ \ ++ 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ + /* Character class & ref repeats */ \ + 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ + 5, 5, /* CRRANGE, CRMINRANGE */ \ +@@ -706,17 +770,22 @@ + 1+LINK_SIZE, /* Assert behind */ \ + 1+LINK_SIZE, /* Assert behind not */ \ + 1+LINK_SIZE, /* Reverse */ \ +- 1+LINK_SIZE, /* Once */ \ ++ 1+LINK_SIZE, /* ONCE */ \ ++ 1+LINK_SIZE, /* BRA */ \ ++ 3+LINK_SIZE, /* CBRA */ \ + 1+LINK_SIZE, /* COND */ \ ++ 1+LINK_SIZE, /* SBRA */ \ ++ 3+LINK_SIZE, /* SCBRA */ \ ++ 1+LINK_SIZE, /* SCOND */ \ + 3, /* CREF */ \ ++ 3, /* RREF */ \ ++ 1, /* DEF */ \ + 1, 1, /* BRAZERO, BRAMINZERO */ \ +- 3, /* BRANUMBER */ \ +- 1+LINK_SIZE /* BRA */ \ + + +-/* A magic value for OP_CREF to indicate the "in recursion" condition. */ ++/* A magic value for OP_RREF to indicate the "any recursion" condition. */ + +-#define CREF_RECURSE 0xffff ++#define RREF_ANY 0xffff + + /* Error code numbers. They are given names so that they can more easily be + tracked. */ +@@ -726,7 +795,7 @@ + ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, + ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, + ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, +- ERR50, ERR51 }; ++ ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57 }; + + /* The real format of the start of the pcre block; the index of names and the + code vector run on as long as necessary after the end. 
We store an explicit +@@ -781,17 +850,23 @@ + const uschar *fcc; /* Points to case-flipping table */ + const uschar *cbits; /* Points to character type table */ + const uschar *ctypes; /* Points to table of type maps */ ++ const uschar *start_workspace;/* The start of working space */ + const uschar *start_code; /* The start of the compiled code */ + const uschar *start_pattern; /* The start of the pattern */ ++ const uschar *end_pattern; /* The end of the pattern */ ++ uschar *hwm; /* High watermark of workspace */ + uschar *name_table; /* The name/number table */ + int names_found; /* Number of entries so far */ + int name_entry_size; /* Size of each entry */ ++ int bracount; /* Count of capturing parens */ + int top_backref; /* Maximum back reference */ + unsigned int backref_map; /* Bitmap of low back refs */ ++ int external_options; /* External (initial) options */ + int req_varyopt; /* "After variable item" flag for reqbyte */ + BOOL nopartial; /* Set TRUE if partial won't work */ +- int nllen; /* 1 or 2 for newline string length */ +- uschar nl[4]; /* Newline string */ ++ int nltype; /* Newline type */ ++ int nllen; /* Newline string length */ ++ uschar nl[4]; /* Newline string when fixed length */ + } compile_data; + + /* Structure for maintaining a chain of pointers to the currently incomplete +@@ -824,6 +899,16 @@ + + struct heapframe; + ++/* Structure for building a chain of data for holding the values of the subject ++pointer at the start of each subpattern, so as to detect when an empty string ++has been matched by a subpattern - to break infinite loops. */ ++ ++typedef struct eptrblock { ++ struct eptrblock *epb_prev; ++ USPTR epb_saved_eptr; ++} eptrblock; ++ ++ + /* Structure for passing "static" information around between the functions + doing traditional NFA matching, so that they are thread-safe. 
*/ + +@@ -834,8 +919,9 @@ + int *offset_vector; /* Offset vector */ + int offset_end; /* One past the end */ + int offset_max; /* The maximum usable for return data */ +- int nllen; /* 1 or 2 for newline string length */ +- uschar nl[4]; /* Newline string */ ++ int nltype; /* Newline type */ ++ int nllen; /* Newline string length */ ++ uschar nl[4]; /* Newline string when fixed */ + const uschar *lcc; /* Points to lower casing table */ + const uschar *ctypes; /* Points to table of type maps */ + BOOL offset_overflow; /* Set if too many extractions */ +@@ -854,6 +940,8 @@ + int end_offset_top; /* Highwater mark at end of match */ + int capture_last; /* Most recent capture number */ + int start_offset; /* The start offset value */ ++ eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ ++ int eptrn; /* Next free eptrblock */ + recursion_info *recursive; /* Linked list of recursion data */ + void *callout_data; /* To pass back to callouts */ + struct heapframe *thisframe; /* Used only when compiling for no recursion */ +@@ -869,8 +957,9 @@ + const uschar *tables; /* Character tables */ + int moptions; /* Match options */ + int poptions; /* Pattern options */ +- int nllen; /* 1 or 2 for newline string length */ +- uschar nl[4]; /* Newline string */ ++ int nltype; /* Newline type */ ++ int nllen; /* Newline string length */ ++ uschar nl[4]; /* Newline string when fixed */ + void *callout_data; /* To pass back to callouts */ + } dfa_match_data; + +@@ -941,13 +1030,17 @@ + one of the exported public functions. They have to be "external" in the C + sense, but are not part of the PCRE public API. 
*/ + +-extern int _pcre_ord2utf8(int, uschar *); +-extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *, +- const pcre_study_data *, pcre_study_data *); +-extern int _pcre_ucp_findprop(const unsigned int, int *, int *); +-extern int _pcre_ucp_othercase(const int); +-extern int _pcre_valid_utf8(const uschar *, int); +-extern BOOL _pcre_xclass(int, const uschar *); ++extern BOOL _pcre_is_newline(const uschar *, const uschar *, int *, ++ BOOL); ++extern int _pcre_ord2utf8(int, uschar *); ++extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, ++ const pcre_study_data *, pcre_study_data *); ++extern int _pcre_ucp_findprop(const unsigned int, int *, int *); ++extern unsigned int _pcre_ucp_othercase(const unsigned int); ++extern int _pcre_valid_utf8(const uschar *, int); ++extern BOOL _pcre_was_newline(const uschar *, const uschar *, int *, ++ BOOL); ++extern BOOL _pcre_xclass(int, const uschar *); + + #endif + +diff -ruN ../pcre.orig/pcrelib/pcre_maketables.c ./pcrelib/pcre_maketables.c +--- ../pcre.orig/pcrelib/pcre_maketables.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_maketables.c Fri Feb 9 22:31:20 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -130,7 +130,7 @@ + meta-character, which in this sense is any character that terminates a run + of data characters. 
*/ + +- if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; ++ if (strchr("\\*+?{^.$|()[", i) != 0) x += ctype_meta; + *p++ = x; + } + +diff -ruN ../pcre.orig/pcrelib/pcre_newline.c ./pcrelib/pcre_newline.c +--- ../pcre.orig/pcrelib/pcre_newline.c Thu Jan 1 01:00:00 1970 ++++ ./pcrelib/pcre_newline.c Fri Feb 9 20:48:47 2007 +@@ -0,0 +1,135 @@ ++/************************************************* ++* Perl-Compatible Regular Expressions * ++*************************************************/ ++ ++/* PCRE is a library of functions to support regular expressions whose syntax ++and semantics are as close as possible to those of the Perl 5 language. ++ ++ Written by Philip Hazel ++ Copyright (c) 1997-2006 University of Cambridge ++ ++----------------------------------------------------------------------------- ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ ++ * Redistributions of source code must retain the above copyright notice, ++ this list of conditions and the following disclaimer. ++ ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ ++ * Neither the name of the University of Cambridge nor the names of its ++ contributors may be used to endorse or promote products derived from ++ this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++POSSIBILITY OF SUCH DAMAGE. ++----------------------------------------------------------------------------- ++*/ ++ ++ ++/* This module contains internal functions for testing newlines when more than ++one kind of newline is to be recognized. When a newline is found, its length is ++returned. In principle, we could implement several newline "types", each ++referring to a different set of newline characters. At present, PCRE supports ++only NLTYPE_FIXED, which gets handled without these functions, and NLTYPE_ALL, ++so for now the type isn't passed into the functions. It can easily be added ++later if required. The full list of Unicode newline characters is taken from ++http://unicode.org/unicode/reports/tr18/. */ ++ ++ ++#include "pcre_internal.h" ++ ++ ++ ++/************************************************* ++* Check for newline at given position * ++*************************************************/ ++ ++/* It is guaranteed that the initial value of ptr is less than the end of the ++string that is being processed. 
++ ++Arguments: ++ ptr pointer to possible newline ++ endptr pointer to the end of the string ++ lenptr where to return the length ++ utf8 TRUE if in utf8 mode ++ ++Returns: TRUE or FALSE ++*/ ++ ++BOOL ++_pcre_is_newline(const uschar *ptr, const uschar *endptr, int *lenptr, ++ BOOL utf8) ++{ ++int c; ++if (utf8) { GETCHAR(c, ptr); } else c = *ptr; ++switch(c) ++ { ++ case 0x000a: /* LF */ ++ case 0x000b: /* VT */ ++ case 0x000c: *lenptr = 1; return TRUE; /* FF */ ++ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; ++ return TRUE; /* CR */ ++ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ ++ case 0x2028: /* LS */ ++ case 0x2029: *lenptr = 3; return TRUE; /* PS */ ++ default: return FALSE; ++ } ++} ++ ++ ++ ++/************************************************* ++* Check for newline at previous position * ++*************************************************/ ++ ++/* It is guaranteed that the initial value of ptr is greater than the start of ++the string that is being processed. ++ ++Arguments: ++ ptr pointer to possible newline ++ startptr pointer to the start of the string ++ lenptr where to return the length ++ utf8 TRUE if in utf8 mode ++ ++Returns: TRUE or FALSE ++*/ ++ ++BOOL ++_pcre_was_newline(const uschar *ptr, const uschar *startptr, int *lenptr, ++ BOOL utf8) ++{ ++int c; ++ptr--; ++if (utf8) ++ { ++ BACKCHAR(ptr); ++ GETCHAR(c, ptr); ++ } ++else c = *ptr; ++switch(c) ++ { ++ case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; ++ return TRUE; /* LF */ ++ case 0x000b: /* VT */ ++ case 0x000c: /* FF */ ++ case 0x000d: *lenptr = 1; return TRUE; /* CR */ ++ case 0x0085: *lenptr = utf8? 
2 : 1; return TRUE; /* NEL */ ++ case 0x2028: /* LS */ ++ case 0x2029: *lenptr = 3; return TRUE; /* PS */ ++ default: return FALSE; ++ } ++} ++ ++/* End of pcre_newline.c */ +diff -ruN ../pcre.orig/pcrelib/pcre_printint.src ./pcrelib/pcre_printint.src +--- ../pcre.orig/pcrelib/pcre_printint.src Wed Aug 30 22:00:22 2006 ++++ ./pcrelib/pcre_printint.src Fri Feb 9 22:31:20 2007 +@@ -49,9 +49,19 @@ + compiled regex for debugging purposes. */ + + ++/* Macro that decides whether a character should be output as a literal or in ++hexadecimal. We don't use isprint() because that can vary from system to system ++(even without the use of locales) and we want the output always to be the same, ++for testing purposes. This macro is used in pcretest as well as in this file. */ ++ ++#define PRINTABLE(c) ((c) >= 32 && (c) < 127) ++ ++/* The table of operator names. */ ++ + static const char *OP_names[] = { OP_NAME_LIST }; + + ++ + /************************************************* + * Print single- or multi-byte character * + *************************************************/ +@@ -63,7 +73,7 @@ + + if (!utf8 || (c & 0xc0) != 0xc0) + { +- if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); ++ if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); + return 0; + } + else +@@ -160,16 +170,6 @@ + + fprintf(f, "%3d ", (int)(code - codestart)); + +- if (*code >= OP_BRA) +- { +- if (*code - OP_BRA > EXTRACT_BASIC_MAX) +- fprintf(f, "%3d Bra extra\n", GET(code, 1)); +- else +- fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA); +- code += _pcre_OP_lengths[OP_BRA]; +- continue; +- } +- + switch(*code) + { + case OP_END: +@@ -203,6 +203,14 @@ + fprintf(f, "\n"); + continue; + ++ case OP_CBRA: ++ case OP_SCBRA: ++ fprintf(f, "%3d %s %d", GET(code, 1), OP_names[*code], ++ GET2(code, 1+LINK_SIZE)); ++ break; ++ ++ case OP_BRA: ++ case OP_SBRA: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_ALT: +@@ -213,33 +221,45 @@ + case OP_ASSERTBACK_NOT: + case OP_ONCE: 
+ case OP_COND: ++ case OP_SCOND: + case OP_REVERSE: + fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]); + break; + +- case OP_BRANUMBER: +- printf("%3d %s", GET2(code, 1), OP_names[*code]); ++ case OP_CREF: ++ fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); + break; + +- case OP_CREF: +- if (GET2(code, 1) == CREF_RECURSE) +- fprintf(f, " Cond recurse"); ++ case OP_RREF: ++ c = GET2(code, 1); ++ if (c == RREF_ANY) ++ fprintf(f, " Cond recurse any"); + else +- fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); ++ fprintf(f, " Cond recurse %d", c); ++ break; ++ ++ case OP_DEF: ++ fprintf(f, " Cond def"); + break; + + case OP_STAR: + case OP_MINSTAR: ++ case OP_POSSTAR: + case OP_PLUS: + case OP_MINPLUS: ++ case OP_POSPLUS: + case OP_QUERY: + case OP_MINQUERY: ++ case OP_POSQUERY: + case OP_TYPESTAR: + case OP_TYPEMINSTAR: ++ case OP_TYPEPOSSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: ++ case OP_TYPEPOSPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: ++ case OP_TYPEPOSQUERY: + fprintf(f, " "); + if (*code >= OP_TYPESTAR) + { +@@ -257,17 +277,20 @@ + case OP_EXACT: + case OP_UPTO: + case OP_MINUPTO: ++ case OP_POSUPTO: + fprintf(f, " "); + extra = print_char(f, code+3, utf8); + fprintf(f, "{"); +- if (*code != OP_EXACT) fprintf(f, ","); ++ if (*code != OP_EXACT) fprintf(f, "0,"); + fprintf(f, "%d}", GET2(code,1)); + if (*code == OP_MINUPTO) fprintf(f, "?"); ++ else if (*code == OP_POSUPTO) fprintf(f, "+"); + break; + + case OP_TYPEEXACT: + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: ++ case OP_TYPEPOSUPTO: + fprintf(f, " %s", OP_names[code[3]]); + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) + { +@@ -278,20 +301,26 @@ + if (*code != OP_TYPEEXACT) fprintf(f, "0,"); + fprintf(f, "%d}", GET2(code,1)); + if (*code == OP_TYPEMINUPTO) fprintf(f, "?"); ++ else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+"); + break; + + case OP_NOT: +- if (isprint(c = code[1])) fprintf(f, " [^%c]", c); ++ c = code[1]; ++ if (PRINTABLE(c)) fprintf(f, " [^%c]", c); + else 
fprintf(f, " [^\\x%02x]", c); + break; + + case OP_NOTSTAR: + case OP_NOTMINSTAR: ++ case OP_NOTPOSSTAR: + case OP_NOTPLUS: + case OP_NOTMINPLUS: ++ case OP_NOTPOSPLUS: + case OP_NOTQUERY: + case OP_NOTMINQUERY: +- if (isprint(c = code[1])) fprintf(f, " [^%c]", c); ++ case OP_NOTPOSQUERY: ++ c = code[1]; ++ if (PRINTABLE(c)) fprintf(f, " [^%c]", c); + else fprintf(f, " [^\\x%02x]", c); + fprintf(f, "%s", OP_names[*code]); + break; +@@ -299,11 +328,14 @@ + case OP_NOTEXACT: + case OP_NOTUPTO: + case OP_NOTMINUPTO: +- if (isprint(c = code[3])) fprintf(f, " [^%c]{", c); ++ case OP_NOTPOSUPTO: ++ c = code[3]; ++ if (PRINTABLE(c)) fprintf(f, " [^%c]{", c); + else fprintf(f, " [^\\x%02x]{", c); + if (*code != OP_NOTEXACT) fprintf(f, "0,"); + fprintf(f, "%d}", GET2(code,1)); + if (*code == OP_NOTMINUPTO) fprintf(f, "?"); ++ else if (*code == OP_NOTPOSUPTO) fprintf(f, "+"); + break; + + case OP_RECURSE: +@@ -363,12 +395,14 @@ + for (j = i+1; j < 256; j++) + if ((ccode[j/8] & (1 << (j&7))) == 0) break; + if (i == '-' || i == ']') fprintf(f, "\\"); +- if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i); ++ if (PRINTABLE(i)) fprintf(f, "%c", i); ++ else fprintf(f, "\\x%02x", i); + if (--j > i) + { + if (j != i + 1) fprintf(f, "-"); + if (j == '-' || j == ']') fprintf(f, "\\"); +- if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j); ++ if (PRINTABLE(j)) fprintf(f, "%c", j); ++ else fprintf(f, "\\x%02x", j); + } + i = j; + } +diff -ruN ../pcre.orig/pcrelib/pcre_scanner.cc ./pcrelib/pcre_scanner.cc +--- ../pcre.orig/pcrelib/pcre_scanner.cc Mon Mar 6 22:45:57 2006 ++++ ./pcrelib/pcre_scanner.cc Fri Feb 9 22:31:20 2007 +@@ -43,6 +43,7 @@ + input_(data_), + skip_(NULL), + should_skip_(false), ++ skip_repeat_(false), + save_comments_(false), + comments_(NULL), + comments_offset_(0) { +@@ -53,6 +54,7 @@ + input_(data_), + skip_(NULL), + should_skip_(false), ++ skip_repeat_(false), + save_comments_(false), + comments_(NULL), + comments_offset_(0) { +@@ 
-63,15 +65,31 @@ + delete comments_; + } + ++void Scanner::SetSkipExpression(const char* re) { ++ delete skip_; ++ if (re != NULL) { ++ skip_ = new RE(re); ++ should_skip_ = true; ++ skip_repeat_ = true; ++ ConsumeSkip(); ++ } else { ++ skip_ = NULL; ++ should_skip_ = false; ++ skip_repeat_ = false; ++ } ++} ++ + void Scanner::Skip(const char* re) { + delete skip_; + if (re != NULL) { + skip_ = new RE(re); + should_skip_ = true; ++ skip_repeat_ = false; + ConsumeSkip(); + } else { + skip_ = NULL; + should_skip_ = false; ++ skip_repeat_ = false; + } + } + +@@ -118,19 +136,22 @@ + + // helper function to consume *skip_ and honour save_comments_ + void Scanner::ConsumeSkip() { ++ const char* start_data = input_.data(); ++ while (skip_->Consume(&input_)) { ++ if (!skip_repeat_) { ++ // Only one skip allowed. ++ break; ++ } ++ } + if (save_comments_) { +- if (NULL == comments_) { ++ if (comments_ == NULL) { + comments_ = new vector<StringPiece>; + } +- const char *start_data = input_.data(); +- skip_->Consume(&input_); + // already pointing one past end, so no need to +1 + int length = input_.data() - start_data; + if (length > 0) { + comments_->push_back(StringPiece(start_data, length)); + } +- } else { +- skip_->Consume(&input_); + } + } + +diff -ruN ../pcre.orig/pcrelib/pcre_scanner.h ./pcrelib/pcre_scanner.h +--- ../pcre.orig/pcrelib/pcre_scanner.h Tue Aug 9 01:59:00 2005 ++++ ./pcrelib/pcre_scanner.h Fri Feb 9 22:31:20 2007 +@@ -36,7 +36,7 @@ + // Scanner scanner(input); + // string var; + // int number; +-// scanner.Skip("\\s+"); // Skip any white space we encounter ++// scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter + // while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) { + // ...; + // } +@@ -90,10 +90,16 @@ + // skipped. For example, a programming language scanner would use + // a skip RE that matches white space and comments. 
+ // +- // scanner.Skip("(\\s|//.*|/[*](.|\n)*?[*]/)*"); ++ // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/"); ++ // ++ // Skipping repeats as long as it succeeds. We used to let people do ++ // this by writing "(...)*" in the regular expression, but that added ++ // up to lots of recursive calls within the pcre library, so now we ++ // control repetition explicitly via the function call API. + // + // You can pass NULL for "re" if you do not want any data to be skipped. +- void Skip(const char* re); ++ void Skip(const char* re); // DEPRECATED; does *not* repeat ++ void SetSkipExpression(const char* re); + + // Temporarily pause "skip"ing. This + // Skip("Foo"); code ; DisableSkip(); code; EnableSkip() +@@ -109,12 +115,13 @@ + /***** Special wrappers around SetSkip() for some common idioms *****/ + + // Arranges to skip whitespace, C comments, C++ comments. +- // The overall RE is a repeated disjunction of the following REs: ++ // The overall RE is a disjunction of the following REs: + // \\s whitespace + // //.*\n C++ comment + // /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x) ++ // We get repetition via the semantics of SetSkipExpression, not by using * + void SkipCXXComments() { +- Skip("((\\s|//.*\n|/[*](.|\n)*?[*]/)*)"); ++ SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/"); + } + + void set_save_comments(bool comments) { +@@ -143,6 +150,7 @@ + StringPiece input_; // Unprocessed input + RE* skip_; // If non-NULL, RE for skipping input + bool should_skip_; // If true, use skip_ ++ bool skip_repeat_; // If true, repeat skip_ as long as it works + bool save_comments_; // If true, aggregate the skip expression + + // the skipped comments +diff -ruN ../pcre.orig/pcrelib/pcre_scanner_unittest.cc ./pcrelib/pcre_scanner_unittest.cc +--- ../pcre.orig/pcrelib/pcre_scanner_unittest.cc Mon Mar 6 22:45:57 2006 ++++ ./pcrelib/pcre_scanner_unittest.cc Fri Feb 9 22:31:20 2007 +@@ -33,10 +33,13 @@ + // functionality. 
+ + #include <stdio.h> ++#include <string> + #include <vector> + #include <pcre_stringpiece.h> + #include <pcre_scanner.h> + ++#define FLAGS_unittest_stack_size 49152 ++ + // Dies with a fatal error if the two values are not equal. + #define CHECK_EQ(a, b) do { \ + if ( (a) != (b) ) { \ +@@ -116,8 +119,31 @@ + comments.resize(0); + } + ++static void TestBigComment() { ++ string input; ++ for (int i = 0; i < 1024; ++i) { ++ char buf[1024]; ++ snprintf(buf, sizeof(buf), " # Comment %d\n", i); ++ input += buf; ++ } ++ input += "name = value;\n"; ++ ++ Scanner s(input.c_str()); ++ s.SetSkipExpression("\\s+|#.*\n"); ++ ++ string name; ++ string value; ++ s.Consume("(\\w+) = (\\w+);", &name, &value); ++ CHECK_EQ(name, "name"); ++ CHECK_EQ(value, "value"); ++} ++ ++// TODO: also test scanner and big-comment in a thread with a ++// small stack size ++ + int main(int argc, char** argv) { + TestScanner(); ++ TestBigComment(); + + // Done + printf("OK\n"); +diff -ruN ../pcre.orig/pcrelib/pcre_study.c ./pcrelib/pcre_study.c +--- ../pcre.orig/pcrelib/pcre_study.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_study.c Fri Feb 9 22:31:20 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -45,6 +45,11 @@ + #include "pcre_internal.h" + + ++/* Returns from set_start_bits() */ ++ ++enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; ++ ++ + /************************************************* + * Set a bit and maybe its alternate case * + *************************************************/ +@@ -72,12 +77,16 @@ + + + /************************************************* +-* Create bitmap of starting chars * ++* Create bitmap of starting bytes * + *************************************************/ + +-/* This function scans a compiled unanchored expression and attempts to build a +-bitmap of the set of initial characters. If it can't, it returns FALSE. As time +-goes by, we may be able to get more clever at doing this. ++/* This function scans a compiled unanchored expression recursively and ++attempts to build a bitmap of the set of possible starting bytes. As time goes ++by, we may be able to get more clever at doing this. The SSB_CONTINUE return is ++useful for parenthesized groups in patterns such as (a*)b where the group ++provides some optional starting bytes but scanning must continue at the outer ++level to find at least one mandatory byte. At the outermost level, this ++function fails unless the result is SSB_DONE. 
+ + Arguments: + code points to an expression +@@ -86,14 +95,17 @@ + utf8 TRUE if in UTF-8 mode + cd the block with char table pointers + +-Returns: TRUE if table built, FALSE otherwise ++Returns: SSB_FAIL => Failed to find any starting bytes ++ SSB_DONE => Found mandatory starting bytes ++ SSB_CONTINUE => Found optional starting bytes + */ + +-static BOOL ++static int + set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, + BOOL utf8, compile_data *cd) + { + register int c; ++int yield = SSB_DONE; + + #if 0 + /* ========================================================================= */ +@@ -114,36 +126,60 @@ + + do + { +- const uschar *tcode = code + 1 + LINK_SIZE; ++ const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE; + BOOL try_next = TRUE; + +- while (try_next) ++ while (try_next) /* Loop for items in this branch */ + { +- /* If a branch starts with a bracket or a positive lookahead assertion, +- recurse to set bits from within them. That's all for this branch. */ +- +- if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) ++ int rc; ++ switch(*tcode) + { +- if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) +- return FALSE; +- try_next = FALSE; +- } ++ /* Fail if we reach something we don't understand */ + +- else switch(*tcode) +- { + default: +- return FALSE; ++ return SSB_FAIL; + +- /* Skip over callout */ ++ /* If we hit a bracket or a positive lookahead assertion, recurse to set ++ bits from within the subpattern. If it can't find anything, we have to ++ give up. If it finds some mandatory character(s), we are done for this ++ branch. Otherwise, carry on scanning after the subpattern. 
*/ ++ ++ case OP_BRA: ++ case OP_SBRA: ++ case OP_CBRA: ++ case OP_SCBRA: ++ case OP_ONCE: ++ case OP_ASSERT: ++ rc = set_start_bits(tcode, start_bits, caseless, utf8, cd); ++ if (rc == SSB_FAIL) return SSB_FAIL; ++ if (rc == SSB_DONE) try_next = FALSE; else ++ { ++ do tcode += GET(tcode, 1); while (*tcode == OP_ALT); ++ tcode += 1 + LINK_SIZE; ++ } ++ break; + +- case OP_CALLOUT: +- tcode += 2 + 2*LINK_SIZE; ++ /* If we hit ALT or KET, it means we haven't found anything mandatory in ++ this branch, though we might have found something optional. For ALT, we ++ continue with the next alternative, but we have to arrange that the final ++ result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, ++ return SSB_CONTINUE: if this is the top level, that indicates failure, ++ but after a nested subpattern, it causes scanning to continue. */ ++ ++ case OP_ALT: ++ yield = SSB_CONTINUE; ++ try_next = FALSE; + break; + +- /* Skip over extended extraction bracket number */ ++ case OP_KET: ++ case OP_KETRMAX: ++ case OP_KETRMIN: ++ return SSB_CONTINUE; + +- case OP_BRANUMBER: +- tcode += 3; ++ /* Skip over callout */ ++ ++ case OP_CALLOUT: ++ tcode += 2 + 2*LINK_SIZE; + break; + + /* Skip over lookbehind and negative lookahead assertions */ +@@ -152,7 +188,7 @@ + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do tcode += GET(tcode, 1); while (*tcode == OP_ALT); +- tcode += 1+LINK_SIZE; ++ tcode += 1 + LINK_SIZE; + break; + + /* Skip over an option setting, changing the caseless flag */ +@@ -166,27 +202,30 @@ + + case OP_BRAZERO: + case OP_BRAMINZERO: +- if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) +- return FALSE; ++ if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL) ++ return SSB_FAIL; + /* ========================================================================= + See the comment at the head of this function concerning the next line, + which was an old fudge for the benefit of OS/2. 
+ dummy = 1; + ========================================================================= */ + do tcode += GET(tcode,1); while (*tcode == OP_ALT); +- tcode += 1+LINK_SIZE; ++ tcode += 1 + LINK_SIZE; + break; + + /* Single-char * or ? sets the bit and tries the next item */ + + case OP_STAR: + case OP_MINSTAR: ++ case OP_POSSTAR: + case OP_QUERY: + case OP_MINQUERY: ++ case OP_POSQUERY: + set_bit(start_bits, tcode[1], caseless, cd); + tcode += 2; + #ifdef SUPPORT_UTF8 +- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; ++ if (utf8 && tcode[-1] >= 0xc0) ++ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; + #endif + break; + +@@ -194,10 +233,12 @@ + + case OP_UPTO: + case OP_MINUPTO: ++ case OP_POSUPTO: + set_bit(start_bits, tcode[3], caseless, cd); + tcode += 4; + #ifdef SUPPORT_UTF8 +- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; ++ if (utf8 && tcode[-1] >= 0xc0) ++ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; + #endif + break; + +@@ -210,6 +251,7 @@ + case OP_CHARNC: + case OP_PLUS: + case OP_MINPLUS: ++ case OP_POSPLUS: + set_bit(start_bits, tcode[1], caseless, cd); + try_next = FALSE; + break; +@@ -283,16 +325,19 @@ + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: ++ case OP_TYPEPOSUPTO: + tcode += 2; /* Fall through */ + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: ++ case OP_TYPEPOSSTAR: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: ++ case OP_TYPEPOSQUERY: + switch(tcode[1]) + { + case OP_ANY: +- return FALSE; ++ return SSB_FAIL; + + case OP_NOT_DIGIT: + for (c = 0; c < 32; c++) +@@ -418,7 +463,7 @@ + code += GET(code, 1); /* Advance to next branch */ + } + while (*code == OP_ALT); +-return TRUE; ++return yield; + } + + +@@ -492,8 +537,8 @@ + /* See if we can find a fixed set of initial characters for the pattern. 
*/ + + memset(start_bits, 0, 32 * sizeof(uschar)); +-if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, +- (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; ++if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, ++ (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; + + /* Get a pcre_extra block and a pcre_study_data block. The study data is put in + the latter, which is pointed to by the former, which may also get additional +diff -ruN ../pcre.orig/pcrelib/pcre_tables.c ./pcrelib/pcre_tables.c +--- ../pcre.orig/pcrelib/pcre_tables.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_tables.c Fri Feb 9 22:31:20 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -72,9 +72,8 @@ + const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; + const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; + +-/* Table of the number of extra characters, indexed by the first character +-masked with 0x3f. The highest number for a valid UTF-8 character is in fact +-0x3d. */ ++/* Table of the number of extra bytes, indexed by the first byte masked with ++0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. 
*/ + + const uschar _pcre_utf8_table4[] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +@@ -89,6 +88,7 @@ + { "Any", PT_ANY, 0 }, + { "Arabic", PT_SC, ucp_Arabic }, + { "Armenian", PT_SC, ucp_Armenian }, ++ { "Balinese", PT_SC, ucp_Balinese }, + { "Bengali", PT_SC, ucp_Bengali }, + { "Bopomofo", PT_SC, ucp_Bopomofo }, + { "Braille", PT_SC, ucp_Braille }, +@@ -104,6 +104,7 @@ + { "Common", PT_SC, ucp_Common }, + { "Coptic", PT_SC, ucp_Coptic }, + { "Cs", PT_PC, ucp_Cs }, ++ { "Cuneiform", PT_SC, ucp_Cuneiform }, + { "Cypriot", PT_SC, ucp_Cypriot }, + { "Cyrillic", PT_SC, ucp_Cyrillic }, + { "Deseret", PT_SC, ucp_Deseret }, +@@ -146,6 +147,7 @@ + { "N", PT_GC, ucp_N }, + { "Nd", PT_PC, ucp_Nd }, + { "New_Tai_Lue", PT_SC, ucp_New_Tai_Lue }, ++ { "Nko", PT_SC, ucp_Nko }, + { "Nl", PT_PC, ucp_Nl }, + { "No", PT_PC, ucp_No }, + { "Ogham", PT_SC, ucp_Ogham }, +@@ -158,6 +160,8 @@ + { "Pd", PT_PC, ucp_Pd }, + { "Pe", PT_PC, ucp_Pe }, + { "Pf", PT_PC, ucp_Pf }, ++ { "Phags_Pa", PT_SC, ucp_Phags_Pa }, ++ { "Phoenician", PT_SC, ucp_Phoenician }, + { "Pi", PT_PC, ucp_Pi }, + { "Po", PT_PC, ucp_Po }, + { "Ps", PT_PC, ucp_Ps }, +diff -ruN ../pcre.orig/pcrelib/pcre_ucp_searchfuncs.c ./pcrelib/pcre_ucp_searchfuncs.c +--- ../pcre.orig/pcrelib/pcre_ucp_searchfuncs.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_ucp_searchfuncs.c Fri Feb 9 22:31:20 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -131,11 +131,11 @@ + Arguments: + c the character value + +-Returns: the other case or -1 if none ++Returns: the other case or NOTACHAR if none + */ + +-int +-_pcre_ucp_othercase(const int c) ++unsigned int ++_pcre_ucp_othercase(const unsigned int c) + { + int bot = 0; + int top = sizeof(ucp_table)/sizeof(cnode); +@@ -161,14 +161,14 @@ + } + } + +-/* Found an entry in the table. Return -1 for a range entry. Otherwise return +-the other case if there is one, else -1. */ ++/* Found an entry in the table. Return NOTACHAR for a range entry. Otherwise ++return the other case if there is one, else NOTACHAR. */ + +-if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return -1; ++if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return NOTACHAR; + + offset = ucp_table[mid].f1 & f1_casemask; + if ((offset & f1_caseneg) != 0) offset |= f1_caseneg; +-return (offset == 0)? -1 : c + offset; ++return (offset == 0)? NOTACHAR : c + offset; + } + + +diff -ruN ../pcre.orig/pcrelib/pcre_valid_utf8.c ./pcrelib/pcre_valid_utf8.c +--- ../pcre.orig/pcrelib/pcre_valid_utf8.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_valid_utf8.c Fri Feb 9 22:31:20 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -79,7 +79,7 @@ + register int ab; + register int c = *p; + if (c < 128) continue; +- if ((c & 0xc0) != 0xc0) return p - string; ++ if (c < 0xc0) return p - string; + ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ + if (length < ab) return p - string; + length -= ab; +diff -ruN ../pcre.orig/pcrelib/pcre_version.c ./pcrelib/pcre_version.c +--- ../pcre.orig/pcrelib/pcre_version.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcre_version.c Fri Feb 9 22:31:20 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -49,16 +49,38 @@ + * Return version string * + *************************************************/ + ++/* These macros are the standard way of turning unquoted text into C strings. ++They allow macros like PCRE_MAJOR to be defined without quotes, which is ++convenient for user programs that want to test its value. */ ++ + #define STRING(a) # a + #define XSTRING(s) STRING(s) + ++/* A problem turned up with PCRE_PRERELEASE, which is defined empty for ++production releases. Originally, it was used naively in this code: ++ ++ return XSTRING(PCRE_MAJOR) ++ "." XSTRING(PCRE_MINOR) ++ XSTRING(PCRE_PRERELEASE) ++ " " XSTRING(PCRE_DATE); ++ ++However, when PCRE_PRERELEASE is empty, this leads to an attempted expansion of ++STRING(). 
The C standard states: "If (before argument substitution) any ++argument consists of no preprocessing tokens, the behavior is undefined." It ++turns out that gcc treats this case as a single empty string - which is what we ++really want - but Visual C grumbles about the lack of an argument for the ++macro. Unfortunately, both are within their rights. To cope with both ways of ++handling this, I had to resort to some messy hackery that does a test at run time. ++I could find no way of detecting that a macro is defined as an empty string at ++pre-processor time. This hack uses a standard trick for avoiding calling ++the STRING macro with an empty argument when doing the test. */ ++ + PCRE_DATA_SCOPE const char * + pcre_version(void) + { +-return XSTRING(PCRE_MAJOR) +- "." XSTRING(PCRE_MINOR) +- XSTRING(PCRE_PRERELEASE) +- " " XSTRING(PCRE_DATE); ++return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)? ++ XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) : ++ XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE); + } + + /* End of pcre_version.c */ +diff -ruN ../pcre.orig/pcrelib/pcrecpp.cc ./pcrelib/pcrecpp.cc +--- ../pcre.orig/pcrelib/pcrecpp.cc Wed Aug 30 22:00:22 2006 ++++ ./pcrelib/pcrecpp.cc Fri Feb 9 22:31:20 2007 +@@ -61,7 +61,7 @@ + // If the user doesn't ask for any options, we just use this one + static RE_Options default_options; + +-void RE::Init(const char* pat, const RE_Options* options) { ++void RE::Init(const string& pat, const RE_Options* options) { + pattern_ = pat; + if (options == NULL) { + options_ = default_options; +@@ -78,7 +78,7 @@ + // conservative in that it may treat some "simple" patterns + // as "complex" (e.g., if the vertical bar is in a character + // class or is escaped). But it seems good enough.
+- if (strchr(pat, '|') == NULL) { ++ if (strchr(pat.c_str(), '|') == NULL) { + // Simple pattern: we can use position-based checks to perform + // fully anchored matches + re_full_ = re_partial_; +@@ -89,12 +89,18 @@ + } + } + +-RE::~RE() { ++void RE::Cleanup() { + if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_); + if (re_partial_ != NULL) (*pcre_free)(re_partial_); + if (error_ != &empty_string) delete error_; + } + ++ ++RE::~RE() { ++ Cleanup(); ++} ++ ++ + pcre* RE::Compile(Anchor anchor) { + // First, convert RE_Options into pcre options + int pcre_options = 0; +@@ -424,6 +430,34 @@ + return Rewrite(out, rewrite, text, vec, matches); + } + ++/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) { ++ string result; ++ ++ // Escape any ascii character not in [A-Za-z_0-9]. ++ // ++ // Note that it's legal to escape a character even if it has no ++ // special meaning in a regular expression -- so this function does ++ // that. (This also makes it identical to the perl function of the ++ // same name; see `perldoc -f quotemeta`.) ++ for (int ii = 0; ii < unquoted.size(); ++ii) { ++ // Note that using 'isalnum' here raises the benchmark time from ++ // 32ns to 58ns: ++ if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && ++ (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && ++ (unquoted[ii] < '0' || unquoted[ii] > '9') && ++ unquoted[ii] != '_' && ++ // If this is part of a UTF8 or Latin1 character, we need ++ // to copy this byte without escaping. Experimentally this is ++ // what works correctly with the regexp library.
++ !(unquoted[ii] & 128)) { ++ result += '\\'; ++ } ++ result += unquoted[ii]; ++ } ++ ++ return result; ++} ++ + /***** Actual matching and rewriting code *****/ + + int RE::TryMatch(const StringPiece& text, +@@ -809,14 +843,14 @@ + return parse_##name##_radix(str, n, dest, 0); \ + } + +-DEFINE_INTEGER_PARSERS(short); +-DEFINE_INTEGER_PARSERS(ushort); +-DEFINE_INTEGER_PARSERS(int); +-DEFINE_INTEGER_PARSERS(uint); +-DEFINE_INTEGER_PARSERS(long); +-DEFINE_INTEGER_PARSERS(ulong); +-DEFINE_INTEGER_PARSERS(longlong); +-DEFINE_INTEGER_PARSERS(ulonglong); ++DEFINE_INTEGER_PARSERS(short) /* */ ++DEFINE_INTEGER_PARSERS(ushort) /* */ ++DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */ ++DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */ ++DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */ ++DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */ ++DEFINE_INTEGER_PARSERS(longlong) /* */ ++DEFINE_INTEGER_PARSERS(ulonglong) /* */ + + #undef DEFINE_INTEGER_PARSERS + +diff -ruN ../pcre.orig/pcrelib/pcrecpp.h ./pcrelib/pcrecpp.h +--- ../pcre.orig/pcrelib/pcrecpp.h Mon Mar 6 22:45:57 2006 ++++ ./pcrelib/pcrecpp.h Fri Feb 9 22:31:20 2007 +@@ -112,6 +112,12 @@ + // T (where "bool T::ParseFrom(const char*, int)" exists) + // NULL (the corresponding matched sub-pattern is not copied) + // ++// CAVEAT: An optional sub-pattern that does not exist in the matched ++// string is assigned the empty string. Therefore, the following will ++// return false (because the empty string is not a valid number): ++// int number; ++// pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number); ++// + // ----------------------------------------------------------------------- + // DO_MATCH + // +@@ -488,8 +494,25 @@ + // pass in a string or a "const char*" wherever an "RE" is expected. 
+ RE(const char* pat) { Init(pat, NULL); } + RE(const char *pat, const RE_Options& option) { Init(pat, &option); } +- RE(const string& pat) { Init(pat.c_str(), NULL); } +- RE(const string& pat, const RE_Options& option) { Init(pat.c_str(), &option); } ++ RE(const string& pat) { Init(pat, NULL); } ++ RE(const string& pat, const RE_Options& option) { Init(pat, &option); } ++ ++ // Copy constructor & assignment - note that these are expensive ++ // because they recompile the expression. ++ RE(const RE& re) { Init(re.pattern_, &re.options_); } ++ const RE& operator=(const RE& re) { ++ if (this != &re) { ++ Cleanup(); ++ ++ // This is the code that originally came from Google ++ // Init(re.pattern_.c_str(), &re.options_); ++ ++ // This is the replacement from Ari Pollak ++ Init(re.pattern_, &re.options_); ++ } ++ return *this; ++ } ++ + + ~RE(); + +@@ -589,6 +612,15 @@ + const StringPiece &text, + string *out) const; + ++ // Escapes all potentially meaningful regexp characters in ++ // 'unquoted'. The returned string, used as a regular expression, ++ // will exactly match the original string. For example, ++ // 1.5-2.0? ++ // may become: ++ // 1\.5\-2\.0\? ++ static string QuoteMeta(const StringPiece& unquoted); ++ ++ + /***** Generic matching interface *****/ + + // Type of match (TODO: Should be restructured as part of RE_Options) +@@ -611,7 +643,8 @@ + + private: + +- void Init(const char* pattern, const RE_Options* options); ++ void Init(const string& pattern, const RE_Options* options); ++ void Cleanup(); + + // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with + // pairs of integers for the beginning and end positions of matched +@@ -655,11 +688,6 @@ + pcre* re_full_; // For full matches + pcre* re_partial_; // For partial matches + const string* error_; // Error indicator (or points to empty string) +- +- // Don't allow the default copy or assignment constructors -- +- // they're expensive and too easy to do by accident. 
+- RE(const RE&); +- void operator=(const RE&); + }; + + } // namespace pcrecpp +diff -ruN ../pcre.orig/pcrelib/pcrecpp_unittest.cc ./pcrelib/pcrecpp_unittest.cc +--- ../pcre.orig/pcrelib/pcrecpp_unittest.cc Wed Aug 30 22:00:22 2006 ++++ ./pcrelib/pcrecpp_unittest.cc Fri Feb 9 22:31:20 2007 +@@ -1,4 +1,6 @@ +-// Copyright (c) 2005, Google Inc. ++// -*- coding: utf-8 -*- ++// ++// Copyright (c) 2005 - 2006, Google Inc. + // All rights reserved. + // + // Redistribution and use in source and binary forms, with or without +@@ -445,6 +447,80 @@ + CHECK(re4.FullMatch(text_bad) == false); + } + ++// A meta-quoted string, interpreted as a pattern, should always match ++// the original unquoted string. ++static void TestQuoteMeta(string unquoted, RE_Options options = RE_Options()) { ++ string quoted = RE::QuoteMeta(unquoted); ++ RE re(quoted, options); ++ CHECK(re.FullMatch(unquoted)); ++} ++ ++// A string containing meaningful regexp characters, which is then meta- ++// quoted, should not generally match a string the unquoted string does. ++static void NegativeTestQuoteMeta(string unquoted, string should_not_match, ++ RE_Options options = RE_Options()) { ++ string quoted = RE::QuoteMeta(unquoted); ++ RE re(quoted, options); ++ CHECK(!re.FullMatch(should_not_match)); ++} ++ ++// Tests that quoted meta characters match their original strings, ++// and that a few things that shouldn't match indeed do not. 
++static void TestQuotaMetaSimple() { ++ TestQuoteMeta("foo"); ++ TestQuoteMeta("foo.bar"); ++ TestQuoteMeta("foo\\.bar"); ++ TestQuoteMeta("[1-9]"); ++ TestQuoteMeta("1.5-2.0?"); ++ TestQuoteMeta("\\d"); ++ TestQuoteMeta("Who doesn't like ice cream?"); ++ TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); ++ TestQuoteMeta("((?!)xxx).*yyy"); ++ TestQuoteMeta("(["); ++} ++ ++static void TestQuoteMetaSimpleNegative() { ++ NegativeTestQuoteMeta("foo", "bar"); ++ NegativeTestQuoteMeta("...", "bar"); ++ NegativeTestQuoteMeta("\\.", "."); ++ NegativeTestQuoteMeta("\\.", ".."); ++ NegativeTestQuoteMeta("(a)", "a"); ++ NegativeTestQuoteMeta("(a|b)", "a"); ++ NegativeTestQuoteMeta("(a|b)", "(a)"); ++ NegativeTestQuoteMeta("(a|b)", "a|b"); ++ NegativeTestQuoteMeta("[0-9]", "0"); ++ NegativeTestQuoteMeta("[0-9]", "0-9"); ++ NegativeTestQuoteMeta("[0-9]", "[9]"); ++ NegativeTestQuoteMeta("((?!)xxx)", "xxx"); ++} ++ ++static void TestQuoteMetaLatin1() { ++ TestQuoteMeta("3\xb2 = 9"); ++} ++ ++static void TestQuoteMetaUtf8() { ++#ifdef SUPPORT_UTF8 ++ TestQuoteMeta("Pl\xc3\xa1\x63ido Domingo", pcrecpp::UTF8()); ++ TestQuoteMeta("xyz", pcrecpp::UTF8()); // No fancy utf8 ++ TestQuoteMeta("\xc2\xb0", pcrecpp::UTF8()); // 2-byte utf8 (degree symbol) ++ TestQuoteMeta("27\xc2\xb0 degrees", pcrecpp::UTF8()); // As a middle character ++ TestQuoteMeta("\xe2\x80\xb3", pcrecpp::UTF8()); // 3-byte utf8 (double prime) ++ TestQuoteMeta("\xf0\x9d\x85\x9f", pcrecpp::UTF8()); // 4-byte utf8 (music note) ++ TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, but should still work ++ NegativeTestQuoteMeta("27\xc2\xb0", // 2-byte utf (degree symbol) ++ "27\\\xc2\\\xb0", ++ pcrecpp::UTF8()); ++#endif ++} ++ ++static void TestQuoteMetaAll() { ++ printf("Testing QuoteMeta\n"); ++ TestQuotaMetaSimple(); ++ TestQuoteMetaSimpleNegative(); ++ TestQuoteMetaLatin1(); ++ TestQuoteMetaUtf8(); ++} ++ + // + // Options tests contributed by + // Giuseppe Maxia, CTO, Stardata s.r.l. 
+@@ -667,6 +743,35 @@ + Test_all_options(); + } + ++static void TestConstructors() { ++ printf("Testing constructors\n"); ++ ++ RE_Options options; ++ options.set_dotall(true); ++ const char *str = "HELLO\n" "cruel\n" "world"; ++ ++ RE orig("HELLO.*world", options); ++ CHECK(orig.FullMatch(str)); ++ ++ RE copy1(orig); ++ CHECK(copy1.FullMatch(str)); ++ ++ RE copy2("not a match"); ++ CHECK(!copy2.FullMatch(str)); ++ copy2 = copy1; ++ CHECK(copy2.FullMatch(str)); ++ copy2 = orig; ++ CHECK(copy2.FullMatch(str)); ++ ++ // Make sure when we assign to ourselves, nothing bad happens ++ orig = orig; ++ copy1 = copy1; ++ copy2 = copy2; ++ CHECK(orig.FullMatch(str)); ++ CHECK(copy1.FullMatch(str)); ++ CHECK(copy2.FullMatch(str)); ++} ++ + int main(int argc, char** argv) { + // Treat any flag as --help + if (argc > 1 && argv[1][0] == '-') { +@@ -985,11 +1090,14 @@ + CHECK(RE("h.*o").PartialMatch("hello!")); + CHECK(RE("((((((((((((((((((((x))))))))))))))))))))").PartialMatch("x")); + ++ /***** other tests *****/ ++ + RadixTests(); + TestReplace(); + TestExtract(); + TestConsume(); + TestFindAndConsume(); ++ TestQuoteMetaAll(); + TestMatchNumberPeculiarity(); + + // Check the pattern() accessor +@@ -1108,6 +1216,9 @@ + if (getenv("VERBOSE_TEST") != NULL) + VERBOSE_TEST = true; + TestOptions(); ++ ++ // Test the constructors ++ TestConstructors(); + + // Done + printf("OK\n"); +diff -ruN ../pcre.orig/pcrelib/pcregrep.c ./pcrelib/pcregrep.c +--- ../pcre.orig/pcrelib/pcregrep.c Wed Jan 3 21:08:37 2007 ++++ ./pcrelib/pcregrep.c Tue Feb 27 04:31:14 2007 +@@ -6,7 +6,7 @@ + its pattern matching. On a Unix or Win32 system it can recurse into + directories. 
+ +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -56,7 +56,7 @@ + + typedef int BOOL; + +-#define VERSION "4.3 01-Jun-2006" ++#define VERSION "4.4 29-Nov-2006" + #define MAX_PATTERN_COUNT 100 + + #if BUFSIZ > 8192 +@@ -65,7 +65,6 @@ + #define MBUFTHIRD 8192 + #endif + +- + /* Values for the "filenames" variable, which specifies options for file name + output. The order is important; it is assumed that a file name is wanted for + all values greater than FN_DEFAULT. */ +@@ -83,6 +82,10 @@ + #define PO_LINE_MATCH 0x0002 + #define PO_FIXED_STRINGS 0x0004 + ++/* Line ending types */ ++ ++enum { EL_LF, EL_CR, EL_CRLF, EL_ANY }; ++ + + + /************************************************* +@@ -100,8 +103,7 @@ + static const char *jfriedl_postfix = ""; + #endif + +-static int endlinebyte = '\n'; /* Last byte of endline sequence */ +-static int endlineextra = 0; /* Extra bytes for endline sequence */ ++static int endlinetype; + + static char *colour_string = (char *)"1;31"; + static char *colour_option = NULL; +@@ -142,6 +144,7 @@ + static BOOL only_matching = FALSE; + static BOOL quiet = FALSE; + static BOOL silent = FALSE; ++static BOOL utf8 = FALSE; + + /* Structure for options and list of them */ + +@@ -219,6 +222,16 @@ + static const char *suffix[] = { + "", "\\b", ")$", ")$", "\\E", "\\E\\b", "\\E)$", "\\E)$" }; + ++/* UTF-8 tables - used only when the newline setting is "all". 
*/ ++ ++const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; ++ ++const char utf8_table4[] = { ++ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, ++ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, ++ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, ++ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; ++ + + + /************************************************* +@@ -471,6 +484,216 @@ + + + /************************************************* ++* Find end of line * ++*************************************************/ ++ ++/* The length of the endline sequence that is found is set via lenptr. This may ++be zero at the very end of the file if there is no line-ending sequence there. ++ ++Arguments: ++ p current position in line ++ endptr end of available data ++ lenptr where to put the length of the eol sequence ++ ++Returns: pointer to the last byte of the line ++*/ ++ ++static char * ++end_of_line(char *p, char *endptr, int *lenptr) ++{ ++switch(endlinetype) ++ { ++ default: /* Just in case */ ++ case EL_LF: ++ while (p < endptr && *p != '\n') p++; ++ if (p < endptr) ++ { ++ *lenptr = 1; ++ return p + 1; ++ } ++ *lenptr = 0; ++ return endptr; ++ ++ case EL_CR: ++ while (p < endptr && *p != '\r') p++; ++ if (p < endptr) ++ { ++ *lenptr = 1; ++ return p + 1; ++ } ++ *lenptr = 0; ++ return endptr; ++ ++ case EL_CRLF: ++ for (;;) ++ { ++ while (p < endptr && *p != '\r') p++; ++ if (++p >= endptr) ++ { ++ *lenptr = 0; ++ return endptr; ++ } ++ if (*p == '\n') ++ { ++ *lenptr = 2; ++ return p + 1; ++ } ++ } ++ break; ++ ++ case EL_ANY: ++ while (p < endptr) ++ { ++ int extra = 0; ++ register int c = *((unsigned char *)p); ++ ++ if (utf8 && c >= 0xc0) ++ { ++ int gcii, gcss; ++ extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */ ++ gcss = 6*extra; ++ c = (c & utf8_table3[extra]) << gcss; ++ for (gcii = 1; gcii <= extra; gcii++) ++ { ++ gcss -= 6; ++ c |= (p[gcii] & 0x3f) << gcss; ++ } ++ } ++ ++ p += 1 + extra; ++ ++ switch (c) ++ { ++ case 0x0a: /* LF */ ++ case 0x0b: /* VT */ ++ case 0x0c: /* FF */ ++ *lenptr = 
1; ++ return p; ++ ++ case 0x0d: /* CR */ ++ if (p < endptr && *p == 0x0a) ++ { ++ *lenptr = 2; ++ p++; ++ } ++ else *lenptr = 1; ++ return p; ++ ++ case 0x85: /* NEL */ ++ *lenptr = utf8? 2 : 1; ++ return p; ++ ++ case 0x2028: /* LS */ ++ case 0x2029: /* PS */ ++ *lenptr = 3; ++ return p; ++ ++ default: ++ break; ++ } ++ } /* End of loop for ANY case */ ++ ++ *lenptr = 0; /* Must have hit the end */ ++ return endptr; ++ } /* End of overall switch */ ++} ++ ++ ++ ++/************************************************* ++* Find start of previous line * ++*************************************************/ ++ ++/* This is called when looking back for before lines to print. ++ ++Arguments: ++ p start of the subsequent line ++ startptr start of available data ++ ++Returns: pointer to the start of the previous line ++*/ ++ ++static char * ++previous_line(char *p, char *startptr) ++{ ++switch(endlinetype) ++ { ++ default: /* Just in case */ ++ case EL_LF: ++ p--; ++ while (p > startptr && p[-1] != '\n') p--; ++ return p; ++ ++ case EL_CR: ++ p--; ++ while (p > startptr && p[-1] != '\n') p--; ++ return p; ++ ++ case EL_CRLF: ++ for (;;) ++ { ++ p -= 2; ++ while (p > startptr && p[-1] != '\n') p--; ++ if (p <= startptr + 1 || p[-2] == '\r') return p; ++ } ++ return p; /* But control should never get here */ ++ ++ case EL_ANY: ++ if (*(--p) == '\n' && p > startptr && p[-1] == '\r') p--; ++ if (utf8) while ((*p & 0xc0) == 0x80) p--; ++ ++ while (p > startptr) ++ { ++ register int c; ++ char *pp = p - 1; ++ ++ if (utf8) ++ { ++ int extra = 0; ++ while ((*pp & 0xc0) == 0x80) pp--; ++ c = *((unsigned char *)pp); ++ if (c >= 0xc0) ++ { ++ int gcii, gcss; ++ extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */ ++ gcss = 6*extra; ++ c = (c & utf8_table3[extra]) << gcss; ++ for (gcii = 1; gcii <= extra; gcii++) ++ { ++ gcss -= 6; ++ c |= (pp[gcii] & 0x3f) << gcss; ++ } ++ } ++ } ++ else c = *((unsigned char *)pp); ++ ++ switch (c) ++ { ++ case 0x0a: /* LF */ ++ case 0x0b: /* 
VT */ ++ case 0x0c: /* FF */ ++ case 0x0d: /* CR */ ++ case 0x85: /* NEL */ ++ case 0x2028: /* LS */ ++ case 0x2029: /* PS */ ++ return p; ++ ++ default: ++ break; ++ } ++ ++ p = pp; /* Back one character */ ++ } /* End of loop for ANY case */ ++ ++ return startptr; /* Hit start of data */ ++ } /* End of overall switch */ ++} ++ ++ ++ ++ ++ ++/************************************************* + * Print the previous "after" lines * + *************************************************/ + +@@ -495,13 +718,13 @@ + int count = 0; + while (lastmatchrestart < endptr && count++ < after_context) + { ++ int ellength; + char *pp = lastmatchrestart; + if (printname != NULL) fprintf(stdout, "%s-", printname); + if (number) fprintf(stdout, "%d-", lastmatchnumber++); +- while (*pp != endlinebyte) pp++; +- fwrite(lastmatchrestart, 1, pp - lastmatchrestart + (1 + endlineextra), +- stdout); +- lastmatchrestart = pp + 1; ++ pp = end_of_line(pp, endptr, &ellength); ++ fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout); ++ lastmatchrestart = pp; + } + hyphenpending = TRUE; + } +@@ -558,7 +781,7 @@ + + while (ptr < endptr) + { +- int i; ++ int i, endlinelength; + int mrc = 0; + BOOL match = FALSE; + char *t = ptr; +@@ -571,11 +794,10 @@ + line. In multiline mode the PCRE_FIRSTLINE option is used for compiling, so + that any match is constrained to be in the first line. */ + +- linelength = 0; +- while (t < endptr && *t++ != endlinebyte) linelength++; ++ t = end_of_line(t, endptr, &endlinelength); ++ linelength = t - ptr - endlinelength; + length = multiline? endptr - ptr : linelength; + +- + /* Extra processing for Jeffrey Friedl's debugging. 
*/ + + #ifdef JFRIEDL_DEBUG +@@ -706,13 +928,13 @@ + + if (after_context > 0 && lastmatchnumber > 0) + { ++ int ellength; + int linecount = 0; + char *p = lastmatchrestart; + + while (p < ptr && linecount < after_context) + { +- while (*p != endlinebyte) p++; +- p++; ++ p = end_of_line(p, ptr, &ellength); + linecount++; + } + +@@ -725,10 +947,9 @@ + char *pp = lastmatchrestart; + if (printname != NULL) fprintf(stdout, "%s-", printname); + if (number) fprintf(stdout, "%d-", lastmatchnumber++); +- while (*pp != endlinebyte) pp++; +- fwrite(lastmatchrestart, 1, pp - lastmatchrestart + +- (1 + endlineextra), stdout); +- lastmatchrestart = pp + 1; ++ pp = end_of_line(pp, endptr, &ellength); ++ fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout); ++ lastmatchrestart = pp; + } + if (lastmatchrestart != ptr) hyphenpending = TRUE; + } +@@ -754,8 +975,7 @@ + linecount < before_context) + { + linecount++; +- p--; +- while (p > buffer && p[-1] != endlinebyte) p--; ++ p = previous_line(p, buffer); + } + + if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted) +@@ -763,12 +983,13 @@ + + while (p < ptr) + { ++ int ellength; + char *pp = p; + if (printname != NULL) fprintf(stdout, "%s-", printname); + if (number) fprintf(stdout, "%d-", linenumber - linecount--); +- while (*pp != endlinebyte) pp++; +- fwrite(p, 1, pp - p + (1 + endlineextra), stdout); +- p = pp + 1; ++ pp = end_of_line(pp, endptr, &ellength); ++ fwrite(p, 1, pp - p, stdout); ++ p = pp; + } + } + +@@ -788,11 +1009,16 @@ + + if (multiline) + { ++ int ellength; + char *endmatch = ptr + offsets[1]; + t = ptr; +- while (t < endmatch) { if (*t++ == endlinebyte) linenumber++; } +- while (endmatch < endptr && *endmatch != endlinebyte) endmatch++; +- linelength = endmatch - ptr; ++ while (t < endmatch) ++ { ++ t = end_of_line(t, endptr, &ellength); ++ if (t <= endmatch) linenumber++; else break; ++ } ++ endmatch = end_of_line(endmatch, endptr, &ellength); ++ linelength = endmatch - ptr - ellength; + } + + 
/*** NOTE: Use only fwrite() to output the data line, so that binary +@@ -824,9 +1050,7 @@ + fprintf(stdout, "%c[00m", 0x1b); + fwrite(ptr + offsets[1], 1, linelength - offsets[1], stdout); + } +- else fwrite(ptr, 1, linelength, stdout); +- +- fprintf(stdout, "\n"); ++ else fwrite(ptr, 1, linelength + endlinelength, stdout); + } + + /* End of doing what has to be done for a match */ +@@ -836,13 +1060,13 @@ + /* Remember where the last match happened for after_context. We remember + where we are about to restart, and that line's number. */ + +- lastmatchrestart = ptr + linelength + 1; ++ lastmatchrestart = ptr + linelength + endlinelength; + lastmatchnumber = linenumber + 1; + } + + /* Advance to after the newline and increment the line number. */ + +- ptr += linelength + 1; ++ ptr += linelength + endlinelength; + linenumber++; + + /* If we haven't yet reached the end of the file (the buffer is full), and +@@ -964,8 +1188,7 @@ + while ((nextfile = readdirectory(dir)) != NULL) + { + int frc, blen; +- sprintf(buffer, "%.512s%c%.128s", pathname, sep, nextfile); +- blen = strlen(buffer); ++ blen = slprintf(buffer, sizeof(buffer), "%.512s%c%.128s", pathname, sep, nextfile); + + if (exclude_compiled != NULL && + pcre_exec(exclude_compiled, NULL, buffer, blen, 0, 0, NULL, 0) >= 0) +@@ -1057,7 +1280,7 @@ + { + int n; + char s[4]; +- if (op->one_char > 0) sprintf(s, "-%c,", op->one_char); else strcpy(s, " "); ++ if (op->one_char > 0) snprintf(s, sizeof(s), "-%c,", op->one_char); else strcpy(s, " "); + printf(" %s --%s%n", s, op->long_name, &n); + n = 30 - n; + if (n < 1) n = 1; +@@ -1098,7 +1321,7 @@ + case 'q': quiet = TRUE; break; + case 'r': dee_action = dee_RECURSE; break; + case 's': silent = TRUE; break; +- case 'u': options |= PCRE_UTF8; break; ++ case 'u': options |= PCRE_UTF8; utf8 = TRUE; break; + case 'v': invert = TRUE; break; + case 'w': process_options |= PO_WORD_MATCH; break; + case 'x': process_options |= PO_LINE_MATCH; break; +@@ -1131,7 +1354,7 @@ + { + 
static char buffer[8]; + char *p = buffer; +-sprintf(p, "%d", n); ++snprintf(p, sizeof(buffer), "%d", n); + while (*p != 0) p++; + switch (n%10) + { +@@ -1177,7 +1400,7 @@ + return FALSE; + } + +-sprintf(buffer, "%s%.*s%s", prefix[process_options], MBUFTHIRD, pattern, ++snprintf(buffer, sizeof(buffer), "%s%.*s%s", prefix[process_options], MBUFTHIRD, pattern, + suffix[process_options]); + pattern_list[pattern_count] = + pcre_compile(buffer, options, &error, &errptr, pcretables); +@@ -1231,14 +1454,16 @@ + { + if ((process_options & PO_FIXED_STRINGS) != 0) + { ++ char *eop = pattern + strlen(pattern); + char buffer[MBUFTHIRD]; + for(;;) + { +- char *p = strchr(pattern, endlinebyte); +- if (p == NULL) ++ int ellength; ++ char *p = end_of_line(pattern, eop, &ellength); ++ if (ellength == 0) + return compile_single_pattern(pattern, options, filename, count); +- sprintf(buffer, "%.*s", p - pattern - endlineextra, pattern); +- pattern = p + 1; ++ snprintf(buffer, sizeof(buffer), "%.*s", p - pattern - ellength, pattern); ++ pattern = p; + if (!compile_single_pattern(buffer, options, filename, count)) + return FALSE; + } +@@ -1267,7 +1492,9 @@ + const char *locale_from = "--locale"; + const char *error; + +-/* Set the default line ending value from the default in the PCRE library. */ ++/* Set the default line ending value from the default in the PCRE library; ++"lf", "cr", "crlf", and "any" are supported. Anything else is treated as "lf". 
++*/ + + (void)pcre_config(PCRE_CONFIG_NEWLINE, &i); + switch(i) +@@ -1275,6 +1502,7 @@ + default: newline = (char *)"lf"; break; + case '\r': newline = (char *)"cr"; break; + case ('\r' << 8) | '\n': newline = (char *)"crlf"; break; ++ case -1: newline = (char *)"any"; break; + } + + /* Process the options */ +@@ -1350,8 +1578,8 @@ + char buff1[24]; + char buff2[24]; + int baselen = opbra - op->long_name; +- sprintf(buff1, "%.*s", baselen, op->long_name); +- sprintf(buff2, "%s%.*s", buff1, strlen(op->long_name) - baselen - 2, ++ snprintf(buff1, sizeof(buff1), "%.*s", baselen, op->long_name); ++ snprintf(buff2, sizeof(buff2), "%s%.*s", buff1, strlen(op->long_name) - baselen - 2, + opbra + 1); + if (strcmp(arg, buff1) == 0 || strcmp(arg, buff2) == 0) + break; +@@ -1565,16 +1793,22 @@ + if (strcmp(newline, "cr") == 0 || strcmp(newline, "CR") == 0) + { + pcre_options |= PCRE_NEWLINE_CR; +- endlinebyte = '\r'; ++ endlinetype = EL_CR; + } + else if (strcmp(newline, "lf") == 0 || strcmp(newline, "LF") == 0) + { + pcre_options |= PCRE_NEWLINE_LF; ++ endlinetype = EL_LF; + } + else if (strcmp(newline, "crlf") == 0 || strcmp(newline, "CRLF") == 0) + { + pcre_options |= PCRE_NEWLINE_CRLF; +- endlineextra = 1; ++ endlinetype = EL_CRLF; ++ } ++else if (strcmp(newline, "any") == 0 || strcmp(newline, "ANY") == 0) ++ { ++ pcre_options |= PCRE_NEWLINE_ANY; ++ endlinetype = EL_ANY; + } + else + { +@@ -1700,7 +1934,7 @@ + if (error != NULL) + { + char s[16]; +- if (pattern_count == 1) s[0] = 0; else sprintf(s, " number %d", j); ++ if (pattern_count == 1) s[0] = 0; else snprintf(s, sizeof(s), " number %d", j); + fprintf(stderr, "pcregrep: Error while studying regex%s: %s\n", s, error); + return 2; + } +diff -ruN ../pcre.orig/pcrelib/pcreposix.c ./pcrelib/pcreposix.c +--- ../pcre.orig/pcrelib/pcreposix.c Mon Jan 1 10:36:04 2007 ++++ ./pcrelib/pcreposix.c Sat Feb 24 04:30:55 2007 +@@ -6,7 +6,7 @@ + and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel +- Copyright (c) 1997-2007 University of Cambridge ++ Copyright (c) 1997-2006 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -78,7 +78,7 @@ + REG_BADPAT, /* unrecognized character after (?< */ + REG_BADPAT, /* lookbehind assertion is not fixed length */ + REG_BADPAT, /* malformed number or name after (?( */ +- REG_BADPAT, /* conditional group containe more than two branches */ ++ REG_BADPAT, /* conditional group contains more than two branches */ + REG_BADPAT, /* assertion expected after (?( */ + REG_BADPAT, /* (?R or (?digits must be followed by ) */ + REG_ECTYPE, /* unknown POSIX class name */ +@@ -93,7 +93,7 @@ + REG_BADPAT, /* closing ) for (?C expected */ + REG_BADPAT, /* recursive call could loop indefinitely */ + REG_BADPAT, /* unrecognized character after (?P */ +- REG_BADPAT, /* syntax error after (?P */ ++ REG_BADPAT, /* syntax error in subpattern name (missing terminator) */ + REG_BADPAT, /* two named subpatterns have the same name */ + REG_BADPAT, /* invalid UTF-8 string */ + REG_BADPAT, /* support for \P, \p, and \X has not been compiled */ +@@ -102,7 +102,13 @@ + REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */ + REG_BADPAT, /* too many named subpatterns (maximum 10,000) */ + REG_BADPAT, /* repeated subpattern is too long */ +- REG_BADPAT /* octal value is greater than \377 (not in UTF-8 mode) */ ++ REG_BADPAT, /* octal value is greater than \377 (not in UTF-8 mode) */ ++ REG_BADPAT, /* internal error: overran compiling workspace */ ++ REG_BADPAT, /* internal error: previously-checked referenced subpattern not found */ ++ REG_BADPAT, /* DEFINE group contains more than one branch */ ++ REG_BADPAT, /* repeating a DEFINE group is not allowed */ ++ REG_INVARG, /* inconsistent NEWLINE options */ ++ REG_BADPAT /* \g is not followed followed by an (optionally braced) non-zero 
number */ + }; + + /* Table of texts corresponding to POSIX error codes */ +@@ -152,7 +158,7 @@ + if (errbuf_size > 0) + { + if (addlength > 0 && errbuf_size >= length + addlength) +- sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset); ++ snprintf(errbuf, errbuf_size, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset); + else + { + strncpy(errbuf, message, errbuf_size - 1); +diff -ruN ../pcre.orig/pcrelib/pcretest.c ./pcrelib/pcretest.c +--- ../pcre.orig/pcrelib/pcretest.c Wed Aug 30 22:00:22 2006 ++++ ./pcrelib/pcretest.c Fri Feb 9 22:31:20 2007 +@@ -44,10 +44,29 @@ + #include <locale.h> + #include <errno.h> + +-#ifndef _WIN32 +-#include <sys/resource.h> ++ ++/* A number of things vary for Windows builds. Originally, pcretest opened its ++input and output without "b"; then I was told that "b" was needed in some ++environments, so it was added for release 5.0 to both the input and output. (It ++makes no difference on Unix-like systems.) Later I was told that it is wrong ++for the input on Windows. I've now abstracted the modes into two macros that ++are set here, to make it easier to fiddle with them, and removed "b" from the ++input mode under Windows. */ ++ ++#if defined(_WIN32) || defined(WIN32) ++#include <io.h> /* For _setmode() */ ++#include <fcntl.h> /* For _O_BINARY */ ++#define INPUT_MODE "r" ++#define OUTPUT_MODE "wb" ++ ++#else ++#include <sys/time.h> /* These two includes are needed */ ++#include <sys/resource.h> /* for setrlimit(). */ ++#define INPUT_MODE "rb" ++#define OUTPUT_MODE "wb" + #endif + ++ + #define PCRE_SPY /* For Win32 build, import data, not export */ + + /* We include pcre_internal.h because we need the internal info for displaying +@@ -74,10 +93,18 @@ + + /* We also need the pcre_printint() function for printing out compiled + patterns. This function is in a separate file so that it can be included in +-pcre_compile.c when that module is compiled with debugging enabled. 
*/ ++pcre_compile.c when that module is compiled with debugging enabled. ++ ++The definition of the macro PRINTABLE, which determines whether to print an ++output character as-is or as a hex value when showing compiled patterns, is ++contained in this file. We uses it here also, in cases when the locale has not ++been explicitly changed, so as to get consistent output from systems that ++differ in their output from isprint() even in the "C" locale. */ + + #include "pcre_printint.src" + ++#define PRINTHEX(c) (locale_set? isprint(c) : PRINTABLE(c)) ++ + + /* It is possible to compile this test program without including support for + testing the POSIX interface, though this is not available via the standard +@@ -103,6 +130,8 @@ + #endif + #endif + ++/* This is the default loop count for timing. */ ++ + #define LOOPREPEAT 500000 + + /* Static variables */ +@@ -114,6 +143,7 @@ + static int callout_fail_count; + static int callout_fail_id; + static int first_callout; ++static int locale_set = 0; + static int show_malloc; + static int use_utf8; + static size_t gotten_store; +@@ -157,6 +187,7 @@ + for (;;) + { + int rlen = buffer_size - (here - buffer); ++ + if (rlen > 1000) + { + int dlen; +@@ -213,7 +244,7 @@ + + /* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess + around with conditional compilation, just do the job by hand. It is only used +-for unpicking the -o argument, so just keep it simple. ++for unpicking arguments, so just keep it simple. 
+ + Arguments: + str string to be converted +@@ -311,6 +342,8 @@ + Returns: number of characters placed in the buffer + */ + ++#if !defined NOUTF8 ++ + static int + ord2utf8(int cvalue, uschar *utf8bytes) + { +@@ -327,6 +360,8 @@ + return i + 1; + } + ++#endif ++ + + + /************************************************* +@@ -353,16 +388,19 @@ + { + length -= rc - 1; + p += rc; +- if (c < 256 && isprint(c)) ++ if (PRINTHEX(c)) + { + if (f != NULL) fprintf(f, "%c", c); + yield++; + } + else + { +- int n; +- if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n); +- yield += n; ++ int n = 4; ++ if (f != NULL) fprintf(f, "\\x{%02x}", c); ++ yield += (n <= 0x000000ff)? 2 : ++ (n <= 0x00000fff)? 3 : ++ (n <= 0x0000ffff)? 4 : ++ (n <= 0x000fffff)? 5 : 6; + } + continue; + } +@@ -371,7 +409,8 @@ + + /* Not UTF-8, or malformed UTF-8 */ + +- if (isprint(c = *(p++))) ++ c = *p++; ++ if (PRINTHEX(c)) + { + if (f != NULL) fprintf(f, "%c", c); + yield++; +@@ -614,7 +653,7 @@ + *************************************************/ + + /* This is used both at compile and run-time to check for <xxx> escapes, where +-xxx is LF, CR, or CRLF. Print a message and return 0 if there is no match. ++xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match. 
+ + Arguments: + p points after the leading '<' +@@ -629,6 +668,7 @@ + if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR; + if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF; + if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF; ++if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY; + fprintf(f, "Unknown newline type at: <%s\n", p); + return 0; + } +@@ -636,6 +676,38 @@ + + + /************************************************* ++* Usage function * ++*************************************************/ ++ ++static void ++usage(void) ++{ ++printf("Usage: pcretest [options] [<input> [<output>]]\n"); ++printf(" -b show compiled code (bytecode)\n"); ++printf(" -C show PCRE compile-time options and exit\n"); ++printf(" -d debug: show compiled code and information (-b and -i)\n"); ++#if !defined NODFA ++printf(" -dfa force DFA matching for all subjects\n"); ++#endif ++printf(" -help show usage information\n"); ++printf(" -i show information about compiled patterns\n" ++ " -m output memory used information\n" ++ " -o <n> set size of offsets vector to <n>\n"); ++#if !defined NOPOSIX ++printf(" -p use POSIX interface\n"); ++#endif ++printf(" -q quiet: do not output PCRE version number at start\n"); ++printf(" -S <n> set stack size to <n> megabytes\n"); ++printf(" -s output store (memory) used information\n" ++ " -t time compilation and execution\n"); ++printf(" -t <n> time compilation and execution, repeating <n> times\n"); ++printf(" -tm time execution (matching) only\n"); ++printf(" -tm <n> time execution (matching) only, repeating <n> times\n"); ++} ++ ++ ++ ++/************************************************* + * Main Program * + *************************************************/ + +@@ -650,6 +722,7 @@ + int study_options = 0; + int op = 1; + int timeit = 0; ++int timeitm = 0; + int showinfo = 0; + int showstore = 0; + int quiet = 0; +@@ -681,16 +754,19 @@ + dbuffer = (unsigned char *)malloc(buffer_size); + pbuffer = 
(unsigned char *)malloc(buffer_size); + +-/* The outfile variable is static so that new_malloc can use it. The _setmode() +-stuff is some magic that I don't understand, but which apparently does good +-things in Windows. It's related to line terminations. */ +- +-#if defined(_WIN32) || defined(WIN32) +-_setmode( _fileno( stdout ), 0x8000 ); +-#endif /* defined(_WIN32) || defined(WIN32) */ ++/* The outfile variable is static so that new_malloc can use it. */ + + outfile = stdout; + ++/* The following _setmode() stuff is some Windows magic that tells its runtime ++library to translate CRLF into a single LF character. At least, that's what ++I've been told: never having used Windows I take this all on trust. Originally ++it set 0x8000, but then I was advised that _O_BINARY was better. */ ++ ++#if defined(_WIN32) || defined(WIN32) ++_setmode( _fileno( stdout ), _O_BINARY ); ++#endif ++ + /* Scan options */ + + while (argc > 1 && argv[op][0] == '-') +@@ -699,8 +775,8 @@ + + if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0) + showstore = 1; +- else if (strcmp(argv[op], "-t") == 0) timeit = 1; + else if (strcmp(argv[op], "-q") == 0) quiet = 1; ++ else if (strcmp(argv[op], "-b") == 0) debug = 1; + else if (strcmp(argv[op], "-i") == 0) showinfo = 1; + else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1; + #if !defined NODFA +@@ -713,11 +789,25 @@ + op++; + argc--; + } ++ else if (strcmp(argv[op], "-t") == 0 || strcmp(argv[op], "-tm") == 0) ++ { ++ int both = argv[op][2] == 0; ++ int temp; ++ if (argc > 2 && (temp = get_value((unsigned char *)argv[op+1], &endptr), ++ *endptr == 0)) ++ { ++ timeitm = temp; ++ op++; ++ argc--; ++ } ++ else timeitm = LOOPREPEAT; ++ if (both) timeit = timeitm; ++ } + else if (strcmp(argv[op], "-S") == 0 && argc > 2 && + ((stack_size = get_value((unsigned char *)argv[op+1], &endptr)), + *endptr == 0)) + { +-#ifdef _WIN32 ++#if defined(_WIN32) || defined(WIN32) + printf("PCRE: -S not supported on this OS\n"); + exit(1); + 
#else +@@ -749,7 +839,8 @@ + printf(" %sUnicode properties support\n", rc? "" : "No "); + (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc); + printf(" Newline sequence is %s\n", (rc == '\r')? "CR" : +- (rc == '\n')? "LF" : "CRLF"); ++ (rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" : ++ (rc == -1)? "ANY" : "???"); + (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc); + printf(" Internal link size = %d\n", rc); + (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc); +@@ -762,24 +853,16 @@ + printf(" Match recursion uses %s\n", rc? "stack" : "heap"); + exit(0); + } ++ else if (strcmp(argv[op], "-help") == 0 || ++ strcmp(argv[op], "--help") == 0) ++ { ++ usage(); ++ goto EXIT; ++ } + else + { + printf("** Unknown or malformed option %s\n", argv[op]); +- printf("Usage: pcretest [options] [<input> [<output>]]\n"); +- printf(" -C show PCRE compile-time options and exit\n"); +- printf(" -d debug: show compiled code; implies -i\n"); +-#if !defined NODFA +- printf(" -dfa force DFA matching for all subjects\n"); +-#endif +- printf(" -i show information about compiled pattern\n" +- " -m output memory used information\n" +- " -o <n> set size of offsets vector to <n>\n"); +-#if !defined NOPOSIX +- printf(" -p use POSIX interface\n"); +-#endif +- printf(" -S <n> set stack size to <n> megabytes\n"); +- printf(" -s output store (memory) used information\n" +- " -t time compilation and execution\n"); ++ usage(); + yield = 1; + goto EXIT; + } +@@ -803,7 +886,7 @@ + + if (argc > 1) + { +- infile = fopen(argv[op], "rb"); ++ infile = fopen(argv[op], INPUT_MODE); + if (infile == NULL) + { + printf("** Failed to open %s\n", argv[op]); +@@ -814,7 +897,7 @@ + + if (argc > 2) + { +- outfile = fopen(argv[op+1], "wb"); ++ outfile = fopen(argv[op+1], OUTPUT_MODE); + if (outfile == NULL) + { + printf("** Failed to open %s\n", argv[op+1]); +@@ -859,7 +942,7 @@ + int do_showinfo = showinfo; + int do_showrest = 0; + int do_flip = 0; +- int erroroffset, len, delimiter; ++ int erroroffset, len, 
delimiter, poffset; + + use_utf8 = 0; + +@@ -969,6 +1052,7 @@ + } + + pp = p; ++ poffset = p - buffer; + + for(;;) + { +@@ -989,6 +1073,11 @@ + if (infile != stdin) fprintf(outfile, "%s", (char *)pp); + } + ++ /* The buffer may have moved while being extended; reset the start of data ++ pointer to the correct relative point in the buffer. */ ++ ++ p = buffer + poffset; ++ + /* If the first character after the delimiter is backslash, make + the pattern end with backslash. This is purely to provide a way + of testing for the error message when a pattern ends with backslash. */ +@@ -1020,6 +1109,7 @@ + + case '+': do_showrest = 1; break; + case 'A': options |= PCRE_ANCHORED; break; ++ case 'B': do_debug = 1; break; + case 'C': options |= PCRE_AUTO_CALLOUT; break; + case 'D': do_debug = do_showinfo = 1; break; + case 'E': options |= PCRE_DOLLAR_ENDONLY; break; +@@ -1042,14 +1132,16 @@ + + case 'L': + ppp = pp; +- /* The '\r' test here is so that it works on Windows */ +- while (*ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++; ++ /* The '\r' test here is so that it works on Windows. */ ++ /* The '0' test is just in case this is an unterminated line. 
*/ ++ while (*ppp != 0 && *ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++; + *ppp = 0; + if (setlocale(LC_CTYPE, (const char *)pp) == NULL) + { + fprintf(outfile, "** Failed to set locale \"%s\"\n", pp); + goto SKIP_DATA; + } ++ locale_set = 1; + tables = pcre_maketables(); + pp = ppp; + break; +@@ -1116,19 +1208,19 @@ + #endif /* !defined NOPOSIX */ + + { +- if (timeit) ++ if (timeit > 0) + { + register int i; + clock_t time_taken; + clock_t start_time = clock(); +- for (i = 0; i < LOOPREPEAT; i++) ++ for (i = 0; i < timeit; i++) + { + re = pcre_compile((char *)p, options, &error, &erroroffset, tables); + if (re != NULL) free(re); + } + time_taken = clock() - start_time; +- fprintf(outfile, "Compile time %.3f milliseconds\n", +- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) / ++ fprintf(outfile, "Compile time %.4f milliseconds\n", ++ (((double)time_taken * 1000.0) / (double)timeit) / + (double)CLOCKS_PER_SEC); + } + +@@ -1180,17 +1272,17 @@ + + if (do_study) + { +- if (timeit) ++ if (timeit > 0) + { + register int i; + clock_t time_taken; + clock_t start_time = clock(); +- for (i = 0; i < LOOPREPEAT; i++) ++ for (i = 0; i < timeit; i++) + extra = pcre_study(re, study_options, &error); + time_taken = clock() - start_time; + if (extra != NULL) free(extra); +- fprintf(outfile, " Study time %.3f milliseconds\n", +- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) / ++ fprintf(outfile, " Study time %.4f milliseconds\n", ++ (((double)time_taken * 1000.0) / (double)timeit) / + (double)CLOCKS_PER_SEC); + } + extra = pcre_study(re, study_options, &error); +@@ -1233,6 +1325,12 @@ + + SHOW_INFO: + ++ if (do_debug) ++ { ++ fprintf(outfile, "------------------------------------------------------------------\n"); ++ pcre_printint(re, outfile); ++ } ++ + if (do_showinfo) + { + unsigned long int get_options, all_options; +@@ -1243,12 +1341,6 @@ + int nameentrysize, namecount; + const uschar *nametable; + +- if (do_debug) +- { +- fprintf(outfile, 
"------------------------------------------------------------------\n"); +- pcre_printint(re, outfile); +- } +- + new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options); + new_info(re, NULL, PCRE_INFO_SIZE, &size); + new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count); +@@ -1327,7 +1419,7 @@ + ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "", + ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : ""); + +- switch (get_options & PCRE_NEWLINE_CRLF) ++ switch (get_options & PCRE_NEWLINE_BITS) + { + case PCRE_NEWLINE_CR: + fprintf(outfile, "Forced newline sequence: CR\n"); +@@ -1341,6 +1433,10 @@ + fprintf(outfile, "Forced newline sequence: CRLF\n"); + break; + ++ case PCRE_NEWLINE_ANY: ++ fprintf(outfile, "Forced newline sequence: ANY\n"); ++ break; ++ + default: + break; + } +@@ -1358,7 +1454,7 @@ + int ch = first_char & 255; + const char *caseless = ((first_char & REQ_CASELESS) == 0)? + "" : " (caseless)"; +- if (isprint(ch)) ++ if (PRINTHEX(ch)) + fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless); + else + fprintf(outfile, "First char = %d%s\n", ch, caseless); +@@ -1373,7 +1469,7 @@ + int ch = need_char & 255; + const char *caseless = ((need_char & REQ_CASELESS) == 0)? 
+ "" : " (caseless)"; +- if (isprint(ch)) ++ if (PRINTHEX(ch)) + fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless); + else + fprintf(outfile, "Need char = %d%s\n", ch, caseless); +@@ -1409,7 +1505,7 @@ + fprintf(outfile, "\n "); + c = 2; + } +- if (isprint(i) && i != ' ') ++ if (PRINTHEX(i) && i != ' ') + { + fprintf(outfile, "%c ", i); + c += 2; +@@ -1468,6 +1564,7 @@ + strerror(errno)); + } + else fprintf(outfile, "Study data written to %s\n", to_file); ++ + } + } + fclose(f); +@@ -1866,7 +1963,7 @@ + + for (;; gmatched++) /* Loop for /g or /G */ + { +- if (timeit) ++ if (timeitm > 0) + { + register int i; + clock_t time_taken; +@@ -1876,7 +1973,7 @@ + if (all_use_dfa || use_dfa) + { + int workspace[1000]; +- for (i = 0; i < LOOPREPEAT; i++) ++ for (i = 0; i < timeitm; i++) + count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset, + options | g_notempty, use_offsets, use_size_offsets, workspace, + sizeof(workspace)/sizeof(int)); +@@ -1884,13 +1981,13 @@ + else + #endif + +- for (i = 0; i < LOOPREPEAT; i++) ++ for (i = 0; i < timeitm; i++) + count = pcre_exec(re, extra, (char *)bptr, len, + start_offset, options | g_notempty, use_offsets, use_size_offsets); + + time_taken = clock() - start_time; +- fprintf(outfile, "Execute time %.3f milliseconds\n", +- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) / ++ fprintf(outfile, "Execute time %.4f milliseconds\n", ++ (((double)time_taken * 1000.0) / (double)timeitm) / + (double)CLOCKS_PER_SEC); + } + +@@ -1966,7 +2063,28 @@ + + if (count >= 0) + { +- int i; ++ int i, maxcount; ++ ++#if !defined NODFA ++ if (all_use_dfa || use_dfa) maxcount = use_size_offsets/2; else ++#endif ++ maxcount = use_size_offsets/3; ++ ++ /* This is a check against a lunatic return value. 
*/ ++ ++ if (count > maxcount) ++ { ++ fprintf(outfile, ++ "** PCRE error: returned count %d is too big for offset size %d\n", ++ count, use_size_offsets); ++ count = use_size_offsets/3; ++ if (do_g || do_G) ++ { ++ fprintf(outfile, "** /%c loop abandoned\n", do_g? 'g' : 'G'); ++ do_g = do_G = FALSE; /* Break g/G loop */ ++ } ++ } ++ + for (i = 0; i < count * 2; i += 2) + { + if (use_offsets[i] < 0) +@@ -2165,6 +2283,7 @@ + { + new_free((void *)tables); + setlocale(LC_CTYPE, "C"); ++ locale_set = 0; + } + } + +diff -ruN ../pcre.orig/pcrelib/ucp.h ./pcrelib/ucp.h +--- ../pcre.orig/pcrelib/ucp.h Mon Mar 6 22:45:57 2006 ++++ ./pcrelib/ucp.h Fri Feb 9 22:31:20 2007 +@@ -6,7 +6,9 @@ + #define _UCP_H + + /* This file contains definitions of the property values that are returned by +-the function _pcre_ucp_findprop(). */ ++the function _pcre_ucp_findprop(). New values that are added for new releases ++of Unicode should always be at the end of each enum, for backwards ++compatibility. */ + + /* These are the general character categories. */ + +@@ -118,7 +120,12 @@ + ucp_Tibetan, + ucp_Tifinagh, + ucp_Ugaritic, +- ucp_Yi ++ ucp_Yi, ++ ucp_Balinese, /* New for Unicode 5.0.0 */ ++ ucp_Cuneiform, /* New for Unicode 5.0.0 */ ++ ucp_Nko, /* New for Unicode 5.0.0 */ ++ ucp_Phags_Pa, /* New for Unicode 5.0.0 */ ++ ucp_Phoenician /* New for Unicode 5.0.0 */ + }; + + #endif +diff -ruN ../pcre.orig/pcrelib/ucpinternal.h ./pcrelib/ucpinternal.h +--- ../pcre.orig/pcrelib/ucpinternal.h Mon Mar 6 22:45:57 2006 ++++ ./pcrelib/ucpinternal.h Fri Feb 9 22:31:20 2007 +@@ -2,6 +2,9 @@ + * Unicode Property Table handler * + *************************************************/ + ++#ifndef _UCPINTERNAL_H ++#define _UCPINTERNAL_H ++ + /* Internal header file defining the layout of the bits in each pair of 32-bit + words that form a data item in the table. */ + +@@ -83,5 +86,7 @@ + (7) Otherwise, set the bottom to one element past the current point and goto + (2). 
+ */ ++ ++#endif /* _UCPINTERNAL_H */ + + /* End of ucpinternal.h */ +diff -ruN ../pcre.orig/pcrelib/ucptable.c ./pcrelib/ucptable.c +--- ../pcre.orig/pcrelib/ucptable.c Mon Mar 6 22:45:57 2006 ++++ ./pcrelib/ucptable.c Fri Feb 9 22:31:20 2007 +@@ -1,5 +1,6 @@ + /* This source module is automatically generated from the Unicode +-property table. See ucpinternal.h for a description of the layout. */ ++property table. See ucpinternal.h for a description of the layout. ++This version was made from the Unicode 5.0.0 tables. */ + + static cnode ucp_table[] = { + { 0x09800000, 0x0000001f }, +@@ -298,7 +299,7 @@ + { 0x2100017d, 0x24000001 }, + { 0x2100017e, 0x1400ffff }, + { 0x2100017f, 0x1400fed4 }, +- { 0x21000180, 0x14000000 }, ++ { 0x21000180, 0x140000c3 }, + { 0x21000181, 0x240000d2 }, + { 0x21000182, 0x24000001 }, + { 0x21000183, 0x1400ffff }, +@@ -475,13 +476,27 @@ + { 0x21000232, 0x24000001 }, + { 0x21000233, 0x1400ffff }, + { 0x21800234, 0x14000005 }, +- { 0x2100023a, 0x24000000 }, ++ { 0x2100023a, 0x24002a2b }, + { 0x2100023b, 0x24000001 }, + { 0x2100023c, 0x1400ffff }, + { 0x2100023d, 0x2400ff5d }, +- { 0x2100023e, 0x24000000 }, ++ { 0x2100023e, 0x24002a28 }, + { 0x2180023f, 0x14000001 }, +- { 0x21000241, 0x24000053 }, ++ { 0x21000241, 0x24000001 }, ++ { 0x21000242, 0x1400ffff }, ++ { 0x21000243, 0x2400ff3d }, ++ { 0x21000244, 0x24000045 }, ++ { 0x21000245, 0x24000047 }, ++ { 0x21000246, 0x24000001 }, ++ { 0x21000247, 0x1400ffff }, ++ { 0x21000248, 0x24000001 }, ++ { 0x21000249, 0x1400ffff }, ++ { 0x2100024a, 0x24000001 }, ++ { 0x2100024b, 0x1400ffff }, ++ { 0x2100024c, 0x24000001 }, ++ { 0x2100024d, 0x1400ffff }, ++ { 0x2100024e, 0x24000001 }, ++ { 0x2100024f, 0x1400ffff }, + { 0x21800250, 0x14000002 }, + { 0x21000253, 0x1400ff2e }, + { 0x21000254, 0x1400ff32 }, +@@ -499,25 +514,30 @@ + { 0x21800264, 0x14000003 }, + { 0x21000268, 0x1400ff2f }, + { 0x21000269, 0x1400ff2d }, +- { 0x2180026a, 0x14000004 }, ++ { 0x2100026a, 0x14000000 }, ++ { 0x2100026b, 0x140029f7 
}, ++ { 0x2180026c, 0x14000002 }, + { 0x2100026f, 0x1400ff2d }, + { 0x21800270, 0x14000001 }, + { 0x21000272, 0x1400ff2b }, + { 0x21800273, 0x14000001 }, + { 0x21000275, 0x1400ff2a }, +- { 0x21800276, 0x14000009 }, ++ { 0x21800276, 0x14000006 }, ++ { 0x2100027d, 0x140029e7 }, ++ { 0x2180027e, 0x14000001 }, + { 0x21000280, 0x1400ff26 }, + { 0x21800281, 0x14000001 }, + { 0x21000283, 0x1400ff26 }, + { 0x21800284, 0x14000003 }, + { 0x21000288, 0x1400ff26 }, +- { 0x21000289, 0x14000000 }, ++ { 0x21000289, 0x1400ffbb }, + { 0x2100028a, 0x1400ff27 }, + { 0x2100028b, 0x1400ff27 }, +- { 0x2180028c, 0x14000005 }, ++ { 0x2100028c, 0x1400ffb9 }, ++ { 0x2180028d, 0x14000004 }, + { 0x21000292, 0x1400ff25 }, + { 0x21000293, 0x14000000 }, +- { 0x21000294, 0x1400ffad }, ++ { 0x21000294, 0x1c000000 }, + { 0x21800295, 0x1400001a }, + { 0x218002b0, 0x18000011 }, + { 0x098002c2, 0x60000003 }, +@@ -532,6 +552,9 @@ + { 0x1b800346, 0x30000029 }, + { 0x13800374, 0x60000001 }, + { 0x1300037a, 0x18000000 }, ++ { 0x1300037b, 0x14000082 }, ++ { 0x1300037c, 0x14000082 }, ++ { 0x1300037d, 0x14000082 }, + { 0x0900037e, 0x54000000 }, + { 0x13800384, 0x60000001 }, + { 0x13000386, 0x24000026 }, +@@ -647,7 +670,9 @@ + { 0x130003fa, 0x24000001 }, + { 0x130003fb, 0x1400ffff }, + { 0x130003fc, 0x14000000 }, +- { 0x138003fd, 0x24000002 }, ++ { 0x130003fd, 0x2400ff7e }, ++ { 0x130003fe, 0x2400ff7e }, ++ { 0x130003ff, 0x2400ff7e }, + { 0x0c000400, 0x24000050 }, + { 0x0c000401, 0x24000050 }, + { 0x0c000402, 0x24000050 }, +@@ -835,7 +860,7 @@ + { 0x0c0004bd, 0x1400ffff }, + { 0x0c0004be, 0x24000001 }, + { 0x0c0004bf, 0x1400ffff }, +- { 0x0c0004c0, 0x24000000 }, ++ { 0x0c0004c0, 0x2400000f }, + { 0x0c0004c1, 0x24000001 }, + { 0x0c0004c2, 0x1400ffff }, + { 0x0c0004c3, 0x24000001 }, +@@ -850,6 +875,7 @@ + { 0x0c0004cc, 0x1400ffff }, + { 0x0c0004cd, 0x24000001 }, + { 0x0c0004ce, 0x1400ffff }, ++ { 0x0c0004cf, 0x1400fff1 }, + { 0x0c0004d0, 0x24000001 }, + { 0x0c0004d1, 0x1400ffff }, + { 0x0c0004d2, 0x24000001 }, 
+@@ -892,6 +918,12 @@ + { 0x0c0004f7, 0x1400ffff }, + { 0x0c0004f8, 0x24000001 }, + { 0x0c0004f9, 0x1400ffff }, ++ { 0x0c0004fa, 0x24000001 }, ++ { 0x0c0004fb, 0x1400ffff }, ++ { 0x0c0004fc, 0x24000001 }, ++ { 0x0c0004fd, 0x1400ffff }, ++ { 0x0c0004fe, 0x24000001 }, ++ { 0x0c0004ff, 0x1400ffff }, + { 0x0c000500, 0x24000001 }, + { 0x0c000501, 0x1400ffff }, + { 0x0c000502, 0x24000001 }, +@@ -908,6 +940,10 @@ + { 0x0c00050d, 0x1400ffff }, + { 0x0c00050e, 0x24000001 }, + { 0x0c00050f, 0x1400ffff }, ++ { 0x0c000510, 0x24000001 }, ++ { 0x0c000511, 0x1400ffff }, ++ { 0x0c000512, 0x24000001 }, ++ { 0x0c000513, 0x1400ffff }, + { 0x01000531, 0x24000030 }, + { 0x01000532, 0x24000030 }, + { 0x01000533, 0x24000030 }, +@@ -989,8 +1025,7 @@ + { 0x01000587, 0x14000000 }, + { 0x09000589, 0x54000000 }, + { 0x0100058a, 0x44000000 }, +- { 0x19800591, 0x30000028 }, +- { 0x198005bb, 0x30000002 }, ++ { 0x19800591, 0x3000002c }, + { 0x190005be, 0x54000000 }, + { 0x190005bf, 0x30000000 }, + { 0x190005c0, 0x54000000 }, +@@ -1043,6 +1078,13 @@ + { 0x37800780, 0x1c000025 }, + { 0x378007a6, 0x3000000a }, + { 0x370007b1, 0x1c000000 }, ++ { 0x3f8007c0, 0x34000009 }, ++ { 0x3f8007ca, 0x1c000020 }, ++ { 0x3f8007eb, 0x30000008 }, ++ { 0x3f8007f4, 0x18000001 }, ++ { 0x3f0007f6, 0x68000000 }, ++ { 0x3f8007f7, 0x54000002 }, ++ { 0x3f0007fa, 0x18000000 }, + { 0x0e800901, 0x30000001 }, + { 0x0e000903, 0x28000000 }, + { 0x0e800904, 0x1c000035 }, +@@ -1059,7 +1101,7 @@ + { 0x09800964, 0x54000001 }, + { 0x0e800966, 0x34000009 }, + { 0x09000970, 0x54000000 }, +- { 0x0e00097d, 0x1c000000 }, ++ { 0x0e80097b, 0x1c000004 }, + { 0x02000981, 0x30000000 }, + { 0x02800982, 0x28000001 }, + { 0x02800985, 0x1c000007 }, +@@ -1203,7 +1245,9 @@ + { 0x1c800cd5, 0x28000001 }, + { 0x1c000cde, 0x1c000000 }, + { 0x1c800ce0, 0x1c000001 }, ++ { 0x1c800ce2, 0x30000001 }, + { 0x1c800ce6, 0x34000009 }, ++ { 0x1c800cf1, 0x68000001 }, + { 0x24800d02, 0x28000001 }, + { 0x24800d05, 0x1c000007 }, + { 0x24800d0e, 0x1c000002 }, +@@ 
-1452,13 +1496,33 @@ + { 0x05801a17, 0x30000001 }, + { 0x05801a19, 0x28000002 }, + { 0x05801a1e, 0x54000001 }, ++ { 0x3d801b00, 0x30000003 }, ++ { 0x3d001b04, 0x28000000 }, ++ { 0x3d801b05, 0x1c00002e }, ++ { 0x3d001b34, 0x30000000 }, ++ { 0x3d001b35, 0x28000000 }, ++ { 0x3d801b36, 0x30000004 }, ++ { 0x3d001b3b, 0x28000000 }, ++ { 0x3d001b3c, 0x30000000 }, ++ { 0x3d801b3d, 0x28000004 }, ++ { 0x3d001b42, 0x30000000 }, ++ { 0x3d801b43, 0x28000001 }, ++ { 0x3d801b45, 0x1c000006 }, ++ { 0x3d801b50, 0x34000009 }, ++ { 0x3d801b5a, 0x54000006 }, ++ { 0x3d801b61, 0x68000009 }, ++ { 0x3d801b6b, 0x30000008 }, ++ { 0x3d801b74, 0x68000008 }, + { 0x21801d00, 0x1400002b }, + { 0x21801d2c, 0x18000035 }, + { 0x21801d62, 0x14000015 }, + { 0x0c001d78, 0x18000000 }, +- { 0x21801d79, 0x14000021 }, ++ { 0x21801d79, 0x14000003 }, ++ { 0x21001d7d, 0x14000ee6 }, ++ { 0x21801d7e, 0x1400001c }, + { 0x21801d9b, 0x18000024 }, +- { 0x1b801dc0, 0x30000003 }, ++ { 0x1b801dc0, 0x3000000a }, ++ { 0x1b801dfe, 0x30000001 }, + { 0x21001e00, 0x24000001 }, + { 0x21001e01, 0x1400ffff }, + { 0x21001e02, 0x24000001 }, +@@ -1967,7 +2031,7 @@ + { 0x1b8020dd, 0x2c000003 }, + { 0x1b0020e1, 0x30000000 }, + { 0x1b8020e2, 0x2c000002 }, +- { 0x1b8020e5, 0x30000006 }, ++ { 0x1b8020e5, 0x3000000a }, + { 0x09802100, 0x68000001 }, + { 0x09002102, 0x24000000 }, + { 0x09802103, 0x68000003 }, +@@ -1995,7 +2059,7 @@ + { 0x0900212e, 0x68000000 }, + { 0x0900212f, 0x14000000 }, + { 0x09802130, 0x24000001 }, +- { 0x09002132, 0x68000000 }, ++ { 0x21002132, 0x2400001c }, + { 0x09002133, 0x24000000 }, + { 0x09002134, 0x14000000 }, + { 0x09802135, 0x1c000003 }, +@@ -2008,7 +2072,8 @@ + { 0x09802146, 0x14000003 }, + { 0x0900214a, 0x68000000 }, + { 0x0900214b, 0x64000000 }, +- { 0x0900214c, 0x68000000 }, ++ { 0x0980214c, 0x68000001 }, ++ { 0x2100214e, 0x1400ffe4 }, + { 0x09802153, 0x3c00000c }, + { 0x09002160, 0x38000010 }, + { 0x09002161, 0x38000010 }, +@@ -2042,7 +2107,9 @@ + { 0x0900217d, 0x3800fff0 }, + { 0x0900217e, 
0x3800fff0 }, + { 0x0900217f, 0x3800fff0 }, +- { 0x09802180, 0x38000003 }, ++ { 0x09802180, 0x38000002 }, ++ { 0x09002183, 0x24000001 }, ++ { 0x21002184, 0x1400ffff }, + { 0x09802190, 0x64000004 }, + { 0x09802195, 0x68000004 }, + { 0x0980219a, 0x64000001 }, +@@ -2073,10 +2140,9 @@ + { 0x0900237c, 0x64000000 }, + { 0x0980237d, 0x6800001d }, + { 0x0980239b, 0x64000018 }, +- { 0x090023b4, 0x58000000 }, +- { 0x090023b5, 0x48000000 }, +- { 0x090023b6, 0x54000000 }, +- { 0x098023b7, 0x68000024 }, ++ { 0x098023b4, 0x68000027 }, ++ { 0x098023dc, 0x64000005 }, ++ { 0x098023e2, 0x68000005 }, + { 0x09802400, 0x68000026 }, + { 0x09802440, 0x6800000a }, + { 0x09802460, 0x3c00003b }, +@@ -2143,7 +2209,7 @@ + { 0x09802600, 0x6800006e }, + { 0x0900266f, 0x64000000 }, + { 0x09802670, 0x6800002c }, +- { 0x098026a0, 0x68000011 }, ++ { 0x098026a0, 0x68000012 }, + { 0x09802701, 0x68000003 }, + { 0x09802706, 0x68000003 }, + { 0x0980270c, 0x6800001b }, +@@ -2174,6 +2240,7 @@ + { 0x098027c0, 0x64000004 }, + { 0x090027c5, 0x58000000 }, + { 0x090027c6, 0x48000000 }, ++ { 0x098027c7, 0x64000003 }, + { 0x098027d0, 0x64000015 }, + { 0x090027e6, 0x58000000 }, + { 0x090027e7, 0x48000000 }, +@@ -2215,7 +2282,8 @@ + { 0x090029fc, 0x58000000 }, + { 0x090029fd, 0x48000000 }, + { 0x098029fe, 0x64000101 }, +- { 0x09802b00, 0x68000013 }, ++ { 0x09802b00, 0x6800001a }, ++ { 0x09802b20, 0x68000003 }, + { 0x11002c00, 0x24000030 }, + { 0x11002c01, 0x24000030 }, + { 0x11002c02, 0x24000030 }, +@@ -2310,6 +2378,23 @@ + { 0x11002c5c, 0x1400ffd0 }, + { 0x11002c5d, 0x1400ffd0 }, + { 0x11002c5e, 0x1400ffd0 }, ++ { 0x21002c60, 0x24000001 }, ++ { 0x21002c61, 0x1400ffff }, ++ { 0x21002c62, 0x2400d609 }, ++ { 0x21002c63, 0x2400f11a }, ++ { 0x21002c64, 0x2400d619 }, ++ { 0x21002c65, 0x1400d5d5 }, ++ { 0x21002c66, 0x1400d5d8 }, ++ { 0x21002c67, 0x24000001 }, ++ { 0x21002c68, 0x1400ffff }, ++ { 0x21002c69, 0x24000001 }, ++ { 0x21002c6a, 0x1400ffff }, ++ { 0x21002c6b, 0x24000001 }, ++ { 0x21002c6c, 0x1400ffff }, ++ { 
0x21002c74, 0x14000000 }, ++ { 0x21002c75, 0x24000001 }, ++ { 0x21002c76, 0x1400ffff }, ++ { 0x21002c77, 0x14000000 }, + { 0x0a002c80, 0x24000001 }, + { 0x0a002c81, 0x1400ffff }, + { 0x0a002c82, 0x24000001 }, +@@ -2559,6 +2644,8 @@ + { 0x3c80a016, 0x1c000476 }, + { 0x3c80a490, 0x68000036 }, + { 0x0980a700, 0x60000016 }, ++ { 0x0980a717, 0x18000003 }, ++ { 0x0980a720, 0x60000001 }, + { 0x3080a800, 0x1c000001 }, + { 0x3000a802, 0x28000000 }, + { 0x3080a803, 0x1c000002 }, +@@ -2570,6 +2657,8 @@ + { 0x3080a825, 0x30000001 }, + { 0x3000a827, 0x28000000 }, + { 0x3080a828, 0x68000003 }, ++ { 0x4080a840, 0x1c000033 }, ++ { 0x4080a874, 0x54000003 }, + { 0x1780ac00, 0x1c002ba3 }, + { 0x0980d800, 0x1000037f }, + { 0x0980db80, 0x1000007f }, +@@ -2765,13 +2854,15 @@ + { 0x1301018a, 0x3c000000 }, + { 0x29810300, 0x1c00001e }, + { 0x29810320, 0x3c000003 }, +- { 0x12810330, 0x1c000019 }, ++ { 0x12810330, 0x1c000010 }, ++ { 0x12010341, 0x38000000 }, ++ { 0x12810342, 0x1c000007 }, + { 0x1201034a, 0x38000000 }, + { 0x3b810380, 0x1c00001d }, + { 0x3b01039f, 0x54000000 }, + { 0x2a8103a0, 0x1c000023 }, + { 0x2a8103c8, 0x1c000007 }, +- { 0x2a0103d0, 0x68000000 }, ++ { 0x2a0103d0, 0x54000000 }, + { 0x2a8103d1, 0x38000004 }, + { 0x0d010400, 0x24000028 }, + { 0x0d010401, 0x24000028 }, +@@ -2861,6 +2952,9 @@ + { 0x0b810837, 0x1c000001 }, + { 0x0b01083c, 0x1c000000 }, + { 0x0b01083f, 0x1c000000 }, ++ { 0x41810900, 0x1c000015 }, ++ { 0x41810916, 0x3c000003 }, ++ { 0x4101091f, 0x54000000 }, + { 0x1e010a00, 0x1c000000 }, + { 0x1e810a01, 0x30000002 }, + { 0x1e810a05, 0x30000001 }, +@@ -2872,6 +2966,9 @@ + { 0x1e010a3f, 0x30000000 }, + { 0x1e810a40, 0x3c000007 }, + { 0x1e810a50, 0x54000008 }, ++ { 0x3e812000, 0x1c00036e }, ++ { 0x3e812400, 0x38000062 }, ++ { 0x3e812470, 0x54000003 }, + { 0x0981d000, 0x680000f5 }, + { 0x0981d100, 0x68000026 }, + { 0x0981d12a, 0x6800003a }, +@@ -2890,6 +2987,7 @@ + { 0x1381d242, 0x30000002 }, + { 0x1301d245, 0x68000000 }, + { 0x0981d300, 0x68000056 }, ++ { 
0x0981d360, 0x3c000011 }, + { 0x0981d400, 0x24000019 }, + { 0x0981d41a, 0x14000019 }, + { 0x0981d434, 0x24000019 }, +@@ -2957,6 +3055,8 @@ + { 0x0981d7aa, 0x14000018 }, + { 0x0901d7c3, 0x64000000 }, + { 0x0981d7c4, 0x14000005 }, ++ { 0x0901d7ca, 0x24000000 }, ++ { 0x0901d7cb, 0x14000000 }, + { 0x0981d7ce, 0x34000031 }, + { 0x16820000, 0x1c00a6d6 }, + { 0x1682f800, 0x1c00021d }, |