diff options
author | tobez <tobez@FreeBSD.org> | 2007-11-07 06:08:03 +0800 |
---|---|---|
committer | tobez <tobez@FreeBSD.org> | 2007-11-07 06:08:03 +0800 |
commit | ab5bb7da2bbd0e5843c22a4cda7255b347372d4b (patch) | |
tree | a69b16826a048016dc96fbfed88f933a88ab9dde | |
parent | 9305299f1997039e57df2f6a7f0101844ff291ce (diff) | |
download | freebsd-ports-gnome-ab5bb7da2bbd0e5843c22a4cda7255b347372d4b.tar.gz freebsd-ports-gnome-ab5bb7da2bbd0e5843c22a4cda7255b347372d4b.tar.zst freebsd-ports-gnome-ab5bb7da2bbd0e5843c22a4cda7255b347372d4b.zip |
Fix a possible buffer overflow with ASCII regexes that really are
Unicode regexes.
Obtained from: perl5-porters (Nicholas Clark), with modifications
Approved by: portmgr (marcus)
-rw-r--r-- | lang/perl5.10/Makefile | 1 | ||||
-rw-r--r-- | lang/perl5.10/files/patch-utf-regcomp | 95 | ||||
-rw-r--r-- | lang/perl5.12/Makefile | 1 | ||||
-rw-r--r-- | lang/perl5.12/files/patch-utf-regcomp | 95 | ||||
-rw-r--r-- | lang/perl5.14/Makefile | 1 | ||||
-rw-r--r-- | lang/perl5.14/files/patch-utf-regcomp | 95 | ||||
-rw-r--r-- | lang/perl5.16/Makefile | 1 | ||||
-rw-r--r-- | lang/perl5.16/files/patch-utf-regcomp | 95 | ||||
-rw-r--r-- | lang/perl5.8/Makefile | 1 | ||||
-rw-r--r-- | lang/perl5.8/files/patch-utf-regcomp | 95 |
10 files changed, 480 insertions, 0 deletions
diff --git a/lang/perl5.10/Makefile b/lang/perl5.10/Makefile index 13c8a8237609..3c607d2a42a9 100644 --- a/lang/perl5.10/Makefile +++ b/lang/perl5.10/Makefile @@ -7,6 +7,7 @@ PORTNAME= perl PORTVERSION= ${PERL_VER} +PORTREVISION= 1 CATEGORIES= lang devel perl5 MASTER_SITES= ${MASTER_SITE_PERL_CPAN} \ ${MASTER_SITE_LOCAL:S/$/:local/} \ diff --git a/lang/perl5.10/files/patch-utf-regcomp b/lang/perl5.10/files/patch-utf-regcomp new file mode 100644 index 000000000000..44c9a55de49a --- /dev/null +++ b/lang/perl5.10/files/patch-utf-regcomp @@ -0,0 +1,95 @@ +--- regcomp.c.orig 2006-01-08 21:59:27.000000000 +0100 ++++ regcomp.c 2007-11-06 22:48:26.000000000 +0100 +@@ -135,7 +135,10 @@ typedef struct RExC_state_t { + I32 extralen; + I32 seen_zerolen; + I32 seen_evals; +- I32 utf8; ++ I32 utf8; /* whether the pattern is utf8 or not */ ++ I32 orig_utf8; /* whether the pattern was originally in utf8 */ ++ /* XXX use this for future optimisation of case ++ * where pattern must be upgraded to utf8. */ + #if ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ + #define RExC_starttry (pRExC_state->starttry) +@@ -161,6 +164,7 @@ typedef struct RExC_state_t { + #define RExC_seen_zerolen (pRExC_state->seen_zerolen) + #define RExC_seen_evals (pRExC_state->seen_evals) + #define RExC_utf8 (pRExC_state->utf8) ++#define RExC_orig_utf8 (pRExC_state->orig_utf8) + + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') + #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ +@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + if (exp == NULL) + FAIL("NULL regexp argument"); + +- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; ++ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + +- RExC_precomp = exp; + DEBUG_r({ + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], +- (int)(xend - exp), RExC_precomp, PL_colors[1]); ++ (int)(xend - exp), exp, PL_colors[1]); + }); ++redo_first_pass: ++ RExC_precomp = exp; + RExC_flags = pm->op_pmflags; + RExC_sawback = 0; + +@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + RExC_precomp = Nullch; + return(NULL); + } ++ if (RExC_utf8 && !RExC_orig_utf8) { ++ /* It's possible to write a regexp in ascii that represents unicode ++ codepoints outside of the byte range, such as via \x{100}. If we ++ detect such a sequence we have to convert the entire pattern to utf8 ++ and then recompile, as our sizing calculation will have been based ++ on 1 byte == 1 character, but we will need to use utf8 to encode ++ at least some part of the pattern, and therefore must convert the whole ++ thing. ++ XXX: somehow figure out how to make this less expensive... ++ -- dmq */ ++ STRLEN len = xend-exp; ++ DEBUG_r(PerlIO_printf(Perl_debug_log, ++ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); ++ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); ++ xend = exp + len; ++ RExC_orig_utf8 = RExC_utf8; ++ SAVEFREEPV(exp); ++ goto redo_first_pass; ++ } + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); + + /* Small enough for pointer-storage convention? +--- t/op/pat.t.orig 2006-01-07 13:53:32.000000000 +0100 ++++ t/op/pat.t 2007-11-06 21:52:30.000000000 +0100 +@@ -6,7 +6,7 @@ + + $| = 1; + +-print "1..1187\n"; ++print "1..1189\n"; + + BEGIN { + chdir 't' if -d 't'; +@@ -3395,5 +3395,14 @@ ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) + "# assigning to original string should not corrupt match vars"); + } + +-# last test 1187 ++{ ++ use warnings; ++ my @w; ++ local $SIG{__WARN__}=sub{push @w,"@_"}; ++ my $c=qq(\x{DF}); ++ ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8"); ++ ok(@w==0, "ASCII pattern that really is utf8"); ++} ++ ++# last test 1189 + diff --git a/lang/perl5.12/Makefile b/lang/perl5.12/Makefile index 13c8a8237609..3c607d2a42a9 100644 --- a/lang/perl5.12/Makefile +++ b/lang/perl5.12/Makefile @@ -7,6 +7,7 @@ PORTNAME= perl PORTVERSION= ${PERL_VER} +PORTREVISION= 1 CATEGORIES= lang devel perl5 MASTER_SITES= ${MASTER_SITE_PERL_CPAN} \ ${MASTER_SITE_LOCAL:S/$/:local/} \ diff --git a/lang/perl5.12/files/patch-utf-regcomp b/lang/perl5.12/files/patch-utf-regcomp new file mode 100644 index 000000000000..44c9a55de49a --- /dev/null +++ b/lang/perl5.12/files/patch-utf-regcomp @@ -0,0 +1,95 @@ +--- regcomp.c.orig 2006-01-08 21:59:27.000000000 +0100 ++++ regcomp.c 2007-11-06 22:48:26.000000000 +0100 +@@ -135,7 +135,10 @@ typedef struct RExC_state_t { + I32 extralen; + I32 seen_zerolen; + I32 seen_evals; +- I32 utf8; ++ I32 utf8; /* whether the pattern is utf8 or not */ ++ I32 orig_utf8; /* whether the pattern was originally in utf8 */ ++ /* XXX use this for future optimisation of case ++ * where pattern must be upgraded to utf8. */ + #if ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ + #define RExC_starttry (pRExC_state->starttry) +@@ -161,6 +164,7 @@ typedef struct RExC_state_t { + #define RExC_seen_zerolen (pRExC_state->seen_zerolen) + #define RExC_seen_evals (pRExC_state->seen_evals) + #define RExC_utf8 (pRExC_state->utf8) ++#define RExC_orig_utf8 (pRExC_state->orig_utf8) + + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') + #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ +@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + if (exp == NULL) + FAIL("NULL regexp argument"); + +- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; ++ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + +- RExC_precomp = exp; + DEBUG_r({ + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], +- (int)(xend - exp), RExC_precomp, PL_colors[1]); ++ (int)(xend - exp), exp, PL_colors[1]); + }); ++redo_first_pass: ++ RExC_precomp = exp; + RExC_flags = pm->op_pmflags; + RExC_sawback = 0; + +@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + RExC_precomp = Nullch; + return(NULL); + } ++ if (RExC_utf8 && !RExC_orig_utf8) { ++ /* It's possible to write a regexp in ascii that represents unicode ++ codepoints outside of the byte range, such as via \x{100}. If we ++ detect such a sequence we have to convert the entire pattern to utf8 ++ and then recompile, as our sizing calculation will have been based ++ on 1 byte == 1 character, but we will need to use utf8 to encode ++ at least some part of the pattern, and therefore must convert the whole ++ thing. ++ XXX: somehow figure out how to make this less expensive... ++ -- dmq */ ++ STRLEN len = xend-exp; ++ DEBUG_r(PerlIO_printf(Perl_debug_log, ++ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); ++ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); ++ xend = exp + len; ++ RExC_orig_utf8 = RExC_utf8; ++ SAVEFREEPV(exp); ++ goto redo_first_pass; ++ } + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); + + /* Small enough for pointer-storage convention? +--- t/op/pat.t.orig 2006-01-07 13:53:32.000000000 +0100 ++++ t/op/pat.t 2007-11-06 21:52:30.000000000 +0100 +@@ -6,7 +6,7 @@ + + $| = 1; + +-print "1..1187\n"; ++print "1..1189\n"; + + BEGIN { + chdir 't' if -d 't'; +@@ -3395,5 +3395,14 @@ ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) + "# assigning to original string should not corrupt match vars"); + } + +-# last test 1187 ++{ ++ use warnings; ++ my @w; ++ local $SIG{__WARN__}=sub{push @w,"@_"}; ++ my $c=qq(\x{DF}); ++ ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8"); ++ ok(@w==0, "ASCII pattern that really is utf8"); ++} ++ ++# last test 1189 + diff --git a/lang/perl5.14/Makefile b/lang/perl5.14/Makefile index 13c8a8237609..3c607d2a42a9 100644 --- a/lang/perl5.14/Makefile +++ b/lang/perl5.14/Makefile @@ -7,6 +7,7 @@ PORTNAME= perl PORTVERSION= ${PERL_VER} +PORTREVISION= 1 CATEGORIES= lang devel perl5 MASTER_SITES= ${MASTER_SITE_PERL_CPAN} \ ${MASTER_SITE_LOCAL:S/$/:local/} \ diff --git a/lang/perl5.14/files/patch-utf-regcomp b/lang/perl5.14/files/patch-utf-regcomp new file mode 100644 index 000000000000..44c9a55de49a --- /dev/null +++ b/lang/perl5.14/files/patch-utf-regcomp @@ -0,0 +1,95 @@ +--- regcomp.c.orig 2006-01-08 21:59:27.000000000 +0100 ++++ regcomp.c 2007-11-06 22:48:26.000000000 +0100 +@@ -135,7 +135,10 @@ typedef struct RExC_state_t { + I32 extralen; + I32 seen_zerolen; + I32 seen_evals; +- I32 utf8; ++ I32 utf8; /* whether the pattern is utf8 or not */ ++ I32 orig_utf8; /* whether the pattern was originally in utf8 */ ++ /* XXX use this for future optimisation of case ++ * where pattern must be upgraded to utf8. */ + #if ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ + #define RExC_starttry (pRExC_state->starttry) +@@ -161,6 +164,7 @@ typedef struct RExC_state_t { + #define RExC_seen_zerolen (pRExC_state->seen_zerolen) + #define RExC_seen_evals (pRExC_state->seen_evals) + #define RExC_utf8 (pRExC_state->utf8) ++#define RExC_orig_utf8 (pRExC_state->orig_utf8) + + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') + #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ +@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + if (exp == NULL) + FAIL("NULL regexp argument"); + +- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; ++ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + +- RExC_precomp = exp; + DEBUG_r({ + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], +- (int)(xend - exp), RExC_precomp, PL_colors[1]); ++ (int)(xend - exp), exp, PL_colors[1]); + }); ++redo_first_pass: ++ RExC_precomp = exp; + RExC_flags = pm->op_pmflags; + RExC_sawback = 0; + +@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + RExC_precomp = Nullch; + return(NULL); + } ++ if (RExC_utf8 && !RExC_orig_utf8) { ++ /* It's possible to write a regexp in ascii that represents unicode ++ codepoints outside of the byte range, such as via \x{100}. If we ++ detect such a sequence we have to convert the entire pattern to utf8 ++ and then recompile, as our sizing calculation will have been based ++ on 1 byte == 1 character, but we will need to use utf8 to encode ++ at least some part of the pattern, and therefore must convert the whole ++ thing. ++ XXX: somehow figure out how to make this less expensive... ++ -- dmq */ ++ STRLEN len = xend-exp; ++ DEBUG_r(PerlIO_printf(Perl_debug_log, ++ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); ++ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); ++ xend = exp + len; ++ RExC_orig_utf8 = RExC_utf8; ++ SAVEFREEPV(exp); ++ goto redo_first_pass; ++ } + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); + + /* Small enough for pointer-storage convention? +--- t/op/pat.t.orig 2006-01-07 13:53:32.000000000 +0100 ++++ t/op/pat.t 2007-11-06 21:52:30.000000000 +0100 +@@ -6,7 +6,7 @@ + + $| = 1; + +-print "1..1187\n"; ++print "1..1189\n"; + + BEGIN { + chdir 't' if -d 't'; +@@ -3395,5 +3395,14 @@ ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) + "# assigning to original string should not corrupt match vars"); + } + +-# last test 1187 ++{ ++ use warnings; ++ my @w; ++ local $SIG{__WARN__}=sub{push @w,"@_"}; ++ my $c=qq(\x{DF}); ++ ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8"); ++ ok(@w==0, "ASCII pattern that really is utf8"); ++} ++ ++# last test 1189 + diff --git a/lang/perl5.16/Makefile b/lang/perl5.16/Makefile index 13c8a8237609..3c607d2a42a9 100644 --- a/lang/perl5.16/Makefile +++ b/lang/perl5.16/Makefile @@ -7,6 +7,7 @@ PORTNAME= perl PORTVERSION= ${PERL_VER} +PORTREVISION= 1 CATEGORIES= lang devel perl5 MASTER_SITES= ${MASTER_SITE_PERL_CPAN} \ ${MASTER_SITE_LOCAL:S/$/:local/} \ diff --git a/lang/perl5.16/files/patch-utf-regcomp b/lang/perl5.16/files/patch-utf-regcomp new file mode 100644 index 000000000000..44c9a55de49a --- /dev/null +++ b/lang/perl5.16/files/patch-utf-regcomp @@ -0,0 +1,95 @@ +--- regcomp.c.orig 2006-01-08 21:59:27.000000000 +0100 ++++ regcomp.c 2007-11-06 22:48:26.000000000 +0100 +@@ -135,7 +135,10 @@ typedef struct RExC_state_t { + I32 extralen; + I32 seen_zerolen; + I32 seen_evals; +- I32 utf8; ++ I32 utf8; /* whether the pattern is utf8 or not */ ++ I32 orig_utf8; /* whether the pattern was originally in utf8 */ ++ /* XXX use this for future optimisation of case ++ * where pattern must be upgraded to utf8. */ + #if ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ + #define RExC_starttry (pRExC_state->starttry) +@@ -161,6 +164,7 @@ typedef struct RExC_state_t { + #define RExC_seen_zerolen (pRExC_state->seen_zerolen) + #define RExC_seen_evals (pRExC_state->seen_evals) + #define RExC_utf8 (pRExC_state->utf8) ++#define RExC_orig_utf8 (pRExC_state->orig_utf8) + + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') + #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ +@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + if (exp == NULL) + FAIL("NULL regexp argument"); + +- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; ++ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + +- RExC_precomp = exp; + DEBUG_r({ + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], +- (int)(xend - exp), RExC_precomp, PL_colors[1]); ++ (int)(xend - exp), exp, PL_colors[1]); + }); ++redo_first_pass: ++ RExC_precomp = exp; + RExC_flags = pm->op_pmflags; + RExC_sawback = 0; + +@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + RExC_precomp = Nullch; + return(NULL); + } ++ if (RExC_utf8 && !RExC_orig_utf8) { ++ /* It's possible to write a regexp in ascii that represents unicode ++ codepoints outside of the byte range, such as via \x{100}. If we ++ detect such a sequence we have to convert the entire pattern to utf8 ++ and then recompile, as our sizing calculation will have been based ++ on 1 byte == 1 character, but we will need to use utf8 to encode ++ at least some part of the pattern, and therefore must convert the whole ++ thing. ++ XXX: somehow figure out how to make this less expensive... ++ -- dmq */ ++ STRLEN len = xend-exp; ++ DEBUG_r(PerlIO_printf(Perl_debug_log, ++ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); ++ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); ++ xend = exp + len; ++ RExC_orig_utf8 = RExC_utf8; ++ SAVEFREEPV(exp); ++ goto redo_first_pass; ++ } + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); + + /* Small enough for pointer-storage convention? +--- t/op/pat.t.orig 2006-01-07 13:53:32.000000000 +0100 ++++ t/op/pat.t 2007-11-06 21:52:30.000000000 +0100 +@@ -6,7 +6,7 @@ + + $| = 1; + +-print "1..1187\n"; ++print "1..1189\n"; + + BEGIN { + chdir 't' if -d 't'; +@@ -3395,5 +3395,14 @@ ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) + "# assigning to original string should not corrupt match vars"); + } + +-# last test 1187 ++{ ++ use warnings; ++ my @w; ++ local $SIG{__WARN__}=sub{push @w,"@_"}; ++ my $c=qq(\x{DF}); ++ ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8"); ++ ok(@w==0, "ASCII pattern that really is utf8"); ++} ++ ++# last test 1189 + diff --git a/lang/perl5.8/Makefile b/lang/perl5.8/Makefile index 13c8a8237609..3c607d2a42a9 100644 --- a/lang/perl5.8/Makefile +++ b/lang/perl5.8/Makefile @@ -7,6 +7,7 @@ PORTNAME= perl PORTVERSION= ${PERL_VER} +PORTREVISION= 1 CATEGORIES= lang devel perl5 MASTER_SITES= ${MASTER_SITE_PERL_CPAN} \ ${MASTER_SITE_LOCAL:S/$/:local/} \ diff --git a/lang/perl5.8/files/patch-utf-regcomp b/lang/perl5.8/files/patch-utf-regcomp new file mode 100644 index 000000000000..44c9a55de49a --- /dev/null +++ b/lang/perl5.8/files/patch-utf-regcomp @@ -0,0 +1,95 @@ +--- regcomp.c.orig 2006-01-08 21:59:27.000000000 +0100 ++++ regcomp.c 2007-11-06 22:48:26.000000000 +0100 +@@ -135,7 +135,10 @@ typedef struct RExC_state_t { + I32 extralen; + I32 seen_zerolen; + I32 seen_evals; +- I32 utf8; ++ I32 utf8; /* whether the pattern is utf8 or not */ ++ I32 orig_utf8; /* whether the pattern was originally in utf8 */ ++ /* XXX use this for future optimisation of case ++ * where pattern must be upgraded to utf8. */ + #if ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ + #define RExC_starttry (pRExC_state->starttry) +@@ -161,6 +164,7 @@ typedef struct RExC_state_t { + #define RExC_seen_zerolen (pRExC_state->seen_zerolen) + #define RExC_seen_evals (pRExC_state->seen_evals) + #define RExC_utf8 (pRExC_state->utf8) ++#define RExC_orig_utf8 (pRExC_state->orig_utf8) + + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') + #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ +@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + if (exp == NULL) + FAIL("NULL regexp argument"); + +- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; ++ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + +- RExC_precomp = exp; + DEBUG_r({ + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], +- (int)(xend - exp), RExC_precomp, PL_colors[1]); ++ (int)(xend - exp), exp, PL_colors[1]); + }); ++redo_first_pass: ++ RExC_precomp = exp; + RExC_flags = pm->op_pmflags; + RExC_sawback = 0; + +@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen + RExC_precomp = Nullch; + return(NULL); + } ++ if (RExC_utf8 && !RExC_orig_utf8) { ++ /* It's possible to write a regexp in ascii that represents unicode ++ codepoints outside of the byte range, such as via \x{100}. If we ++ detect such a sequence we have to convert the entire pattern to utf8 ++ and then recompile, as our sizing calculation will have been based ++ on 1 byte == 1 character, but we will need to use utf8 to encode ++ at least some part of the pattern, and therefore must convert the whole ++ thing. ++ XXX: somehow figure out how to make this less expensive... ++ -- dmq */ ++ STRLEN len = xend-exp; ++ DEBUG_r(PerlIO_printf(Perl_debug_log, ++ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); ++ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); ++ xend = exp + len; ++ RExC_orig_utf8 = RExC_utf8; ++ SAVEFREEPV(exp); ++ goto redo_first_pass; ++ } + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); + + /* Small enough for pointer-storage convention? +--- t/op/pat.t.orig 2006-01-07 13:53:32.000000000 +0100 ++++ t/op/pat.t 2007-11-06 21:52:30.000000000 +0100 +@@ -6,7 +6,7 @@ + + $| = 1; + +-print "1..1187\n"; ++print "1..1189\n"; + + BEGIN { + chdir 't' if -d 't'; +@@ -3395,5 +3395,14 @@ ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) + "# assigning to original string should not corrupt match vars"); + } + +-# last test 1187 ++{ ++ use warnings; ++ my @w; ++ local $SIG{__WARN__}=sub{push @w,"@_"}; ++ my $c=qq(\x{DF}); ++ ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8"); ++ ok(@w==0, "ASCII pattern that really is utf8"); ++} ++ ++# last test 1189 + |