1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
--- regcomp.c.orig 2006-01-08 21:59:27.000000000 +0100
+++ regcomp.c 2007-11-06 22:48:26.000000000 +0100
@@ -135,7 +135,10 @@ typedef struct RExC_state_t {
I32 extralen;
I32 seen_zerolen;
I32 seen_evals;
- I32 utf8;
+ I32 utf8; /* whether the pattern is utf8 or not */
+ I32 orig_utf8; /* whether the pattern was originally in utf8 */
+ /* XXX use this for future optimisation of case
+ * where pattern must be upgraded to utf8. */
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
@@ -161,6 +164,7 @@ typedef struct RExC_state_t {
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
+#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
if (exp == NULL)
FAIL("NULL regexp argument");
- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
- RExC_precomp = exp;
DEBUG_r({
if (!PL_colorset) reginitcolors();
PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
PL_colors[4],PL_colors[5],PL_colors[0],
- (int)(xend - exp), RExC_precomp, PL_colors[1]);
+ (int)(xend - exp), exp, PL_colors[1]);
});
+redo_first_pass:
+ RExC_precomp = exp;
RExC_flags = pm->op_pmflags;
RExC_sawback = 0;
@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
RExC_precomp = Nullch;
return(NULL);
}
+ if (RExC_utf8 && !RExC_orig_utf8) {
+ /* It's possible to write a regexp in ascii that represents unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ XXX: somehow figure out how to make this less expensive...
+ -- dmq */
+ STRLEN len = xend-exp;
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
+ xend = exp + len;
+ RExC_orig_utf8 = RExC_utf8;
+ SAVEFREEPV(exp);
+ goto redo_first_pass;
+ }
DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
/* Small enough for pointer-storage convention?
--- t/op/pat.t.orig 2006-01-07 13:53:32.000000000 +0100
+++ t/op/pat.t 2007-11-06 21:52:30.000000000 +0100
@@ -6,7 +6,7 @@
$| = 1;
-print "1..1187\n";
+print "1..1189\n";
BEGIN {
chdir 't' if -d 't';
@@ -3395,5 +3395,14 @@ ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i)
"# assigning to original string should not corrupt match vars");
}
-# last test 1187
+{
+ use warnings;
+ my @w;
+ local $SIG{__WARN__}=sub{push @w,"@_"};
+ my $c=qq(\x{DF});
+ ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8");
+ ok(@w==0, "ASCII pattern that really is utf8");
+}
+
+# last test 1189
|