Revision: 528
http://vcs.pcre.org/viewvc?view=rev&revision=528
Author: ph10
Date: 2010-05-29 17:40:22 +0100 (Sat, 29 May 2010)
Log Message:
-----------
Add knowledge of \R to auto-possessify feature.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_compile.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2010-05-29 15:50:39 UTC (rev 527)
+++ code/trunk/ChangeLog 2010-05-29 16:40:22 UTC (rev 528)
@@ -57,6 +57,8 @@
14. pcre_study() now recognizes \h, \v, and \R when constructing a bit map of
possible starting bytes for non-anchored patterns.
+
+15. The "auto-possessify" feature of pcre_compile() now recognizes \R.
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2010-05-29 15:50:39 UTC (rev 527)
+++ code/trunk/pcre_compile.c 2010-05-29 16:40:22 UTC (rev 528)
@@ -2544,6 +2544,9 @@
else
#endif /* SUPPORT_UTF8 */
return (item == cd->fcc[next]); /* Non-UTF-8 mode */
+
+ /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
+ When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
case OP_DIGIT:
return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
@@ -2586,11 +2589,12 @@
case 0x202f:
case 0x205f:
case 0x3000:
- return op_code != OP_HSPACE;
+ return op_code == OP_NOT_HSPACE;
default:
- return op_code == OP_HSPACE;
+ return op_code != OP_NOT_HSPACE;
}
+ case OP_ANYNL:
case OP_VSPACE:
case OP_NOT_VSPACE:
switch(next)
@@ -2602,9 +2606,9 @@
case 0x85:
case 0x2028:
case 0x2029:
- return op_code != OP_VSPACE;
+ return op_code == OP_NOT_VSPACE;
default:
- return op_code == OP_VSPACE;
+ return op_code != OP_NOT_VSPACE;
}
default:
@@ -2612,7 +2616,10 @@
}
-/* Handle the case when the next item is \d, \s, etc. */
+/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
+is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
+generated only when PCRE_UCP is *not* set, that is, when only ASCII
+characteristics are recognized. */
switch(op_code)
{
@@ -2691,32 +2698,35 @@
case OP_DIGIT:
return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
- next == -ESC_h || next == -ESC_v;
+ next == -ESC_h || next == -ESC_v || next == -ESC_R;
case OP_NOT_DIGIT:
return next == -ESC_d;
case OP_WHITESPACE:
- return next == -ESC_S || next == -ESC_d || next == -ESC_w;
+ return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
case OP_NOT_WHITESPACE:
return next == -ESC_s || next == -ESC_h || next == -ESC_v;
case OP_HSPACE:
- return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
+ return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
+ next == -ESC_w || next == -ESC_v || next == -ESC_R;
case OP_NOT_HSPACE:
return next == -ESC_h;
/* Can't have \S in here because VT matches \S (Perl anomaly) */
+ case OP_ANYNL:
case OP_VSPACE:
return next == -ESC_V || next == -ESC_d || next == -ESC_w;
case OP_NOT_VSPACE:
- return next == -ESC_v;
+ return next == -ESC_v || next == -ESC_R;
case OP_WORDCHAR:
- return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
+ return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
+ next == -ESC_v || next == -ESC_R;
case OP_NOT_WORDCHAR:
return next == -ESC_w || next == -ESC_d;
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2010-05-29 15:50:39 UTC (rev 527)
+++ code/trunk/testdata/testinput2 2010-05-29 16:40:22 UTC (rev 528)
@@ -3481,4 +3481,14 @@
** Failers
A\r\nB
+/\R+b/BZ
+
+/\R+\n/BZ
+
+/\R+\d/BZ
+
+/\d*\R/BZ
+
+/\s*\R/BZ
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2010-05-29 15:50:39 UTC (rev 527)
+++ code/trunk/testdata/testoutput2 2010-05-29 16:40:22 UTC (rev 528)
@@ -11081,4 +11081,49 @@
A\r\nB
No match
+/\R+b/BZ
+------------------------------------------------------------------
+ Bra
+ \R++
+ b
+ Ket
+ End
+------------------------------------------------------------------
+
+/\R+\n/BZ
+------------------------------------------------------------------
+ Bra
+ \R+
+ \x0a
+ Ket
+ End
+------------------------------------------------------------------
+
+/\R+\d/BZ
+------------------------------------------------------------------
+ Bra
+ \R++
+ \d
+ Ket
+ End
+------------------------------------------------------------------
+
+/\d*\R/BZ
+------------------------------------------------------------------
+ Bra
+ \d*+
+ \R
+ Ket
+ End
+------------------------------------------------------------------
+
+/\s*\R/BZ
+------------------------------------------------------------------
+ Bra
+ \s*+
+ \R
+ Ket
+ End
+------------------------------------------------------------------
+
/-- End of testinput2 --/