Revision: 1380
http://vcs.pcre.org/viewvc?view=rev&revision=1380
Author: ph10
Date: 2013-10-15 17:49:12 +0100 (Tue, 15 Oct 2013)
Log Message:
-----------
Explicit possessive quantifiers now use the new opcodes. Fixed an infelicity
with EXACT in caseless mode.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_compile.c
code/trunk/pcre_internal.h
code/trunk/testdata/testinput2
code/trunk/testdata/testinput7
code/trunk/testdata/testoutput11-16
code/trunk/testdata/testoutput11-32
code/trunk/testdata/testoutput11-8
code/trunk/testdata/testoutput17
code/trunk/testdata/testoutput2
code/trunk/testdata/testoutput5
code/trunk/testdata/testoutput7
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/ChangeLog 2013-10-15 16:49:12 UTC (rev 1380)
@@ -135,6 +135,13 @@
27. Add JIT support for the 64 bit TileGX architecture.
Patch by Jiong Wang (Tilera Corporation).
+
+28. Possessive quantifiers for classes (both explicit and automatically
+ generated) now use special opcodes instead of wrapping in ONCE brackets.
+
+29. Whereas an item such as A{4}+ ignored the possessivenes of the quantifier
+ (because it's meaningless), this was not happening when PCRE_CASELESS was
+ set. Not wrong, but inefficient.
Version 8.33 28-May-2013
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/pcre_compile.c 2013-10-15 16:49:12 UTC (rev 1380)
@@ -777,8 +777,66 @@
{ ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
};
+/* This table is used when converting repeating opcodes into possessified
+versions as a result of an explicit possessive quantifier such as ++. A zero
+value means there is no possessified version - in those cases the item in
+question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
+because all relevant opcodes are less than that. */
+static const pcre_uint8 opcode_possessify[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
+ 0, /* NOTI */
+ OP_POSSTAR, 0, /* STAR, MINSTAR */
+ OP_POSPLUS, 0, /* PLUS, MINPLUS */
+ OP_POSQUERY, 0, /* QUERY, MINQUERY */
+ OP_POSUPTO, 0, /* UPTO, MINUPTO */
+ 0, /* EXACT */
+ 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
+
+ OP_POSSTARI, 0, /* STARI, MINSTARI */
+ OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
+ OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
+ OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
+ 0, /* EXACTI */
+ 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
+
+ OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
+ OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
+ OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
+ OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
+ 0, /* NOTEXACT */
+ 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
+
+ OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
+ OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
+ OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
+ OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
+ 0, /* NOTEXACTI */
+ 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
+
+ OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
+ OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
+ OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
+ OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
+ 0, /* TYPEEXACT */
+ 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
+
+ OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
+ OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
+ OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
+ OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
+ 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
+
+ 0, 0, 0, /* CLASS, NCLASS, XCLASS */
+ 0, 0, /* REF, REFI */
+ 0, 0, /* DNREF, DNREFI */
+ 0, 0 /* RECURSE, CALLOUT */
+};
+
+
+
/*************************************************
* Find an error text *
*************************************************/
@@ -2722,10 +2780,10 @@
HSPACE_CASES:
VSPACE_CASES:
return negated;
-
+
default:
return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
- }
+ }
break; /* Control never reaches here */
case PT_WORD:
@@ -3399,7 +3457,7 @@
for (;;)
{
c = *code;
-
+
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
{
c -= get_repeat_base(c) - OP_STAR;
@@ -3461,7 +3519,7 @@
/* end must not be NULL. */
end = get_chr_property_list(code, utf, cd->fcc, list);
- list[1] = d == OP_CRSTAR || d == OP_CRPLUS || d == OP_CRQUERY ||
+ list[1] = d == OP_CRSTAR || d == OP_CRPLUS || d == OP_CRQUERY ||
d == OP_CRRANGE;
if (compare_opcodes(end, utf, cd, list, end))
@@ -5910,43 +5968,105 @@
goto FAILED;
}
- /* If the character following a repeat is '+', or if certain optimization
- tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
- there are special alternative opcodes for this case. For anything else, we
- wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
- notation is just syntactic sugar, taken from Sun's Java package, but the
- special opcodes can optimize it.
+ /* If the character following a repeat is '+', possessive_quantifier is
+ TRUE. For some opcodes, there are special alternative opcodes for this
+ case. For anything else, we wrap the entire repeated item inside OP_ONCE
+ brackets. Logically, the '+' notation is just syntactic sugar, taken from
+ Sun's Java package, but the special opcodes can optimize it.
Some (but not all) possessively repeated subpatterns have already been
completely handled in the code just above. For them, possessive_quantifier
- is always FALSE at this stage.
+ is always FALSE at this stage. Note that the repeated item starts at
+ tempcode, not at previous, which might be the first part of a string whose
+ (former) last char we repeated. */
- Note that the repeated item starts at tempcode, not at previous, which
- might be the first part of a string whose (former) last char we repeated.
-
- Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
- an 'upto' may follow. We skip over an 'exact' item, and then test the
- length of what remains before proceeding. */
-
if (possessive_quantifier)
{
int len;
- if (*tempcode == OP_TYPEEXACT)
+ /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
+ However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
+ {5,}, or {5,10}). We skip over an EXACT item; if the length of what
+ remains is greater than zero, there's a further opcode that can be
+ handled. If not, do nothing, leaving the EXACT alone. */
+
+ switch(*tempcode)
+ {
+ case OP_TYPEEXACT:
tempcode += PRIV(OP_lengths)[*tempcode] +
((tempcode[1 + IMM2_SIZE] == OP_PROP
|| tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
+ break;
- else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
- {
+ /* CHAR opcodes are used for exacts whose count is 1. */
+
+ case OP_CHAR:
+ case OP_CHARI:
+ case OP_NOT:
+ case OP_NOTI:
+ case OP_EXACT:
+ case OP_EXACTI:
+ case OP_NOTEXACT:
+ case OP_NOTEXACTI:
tempcode += PRIV(OP_lengths)[*tempcode];
#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(tempcode[-1]))
tempcode += GET_EXTRALEN(tempcode[-1]);
#endif
+ break;
+
+ /* For the class opcodes, the repeat operator appears at the end;
+ adjust tempcode to point to it. */
+
+ case OP_CLASS:
+ case OP_NCLASS:
+ tempcode += 1 + 32/sizeof(pcre_uchar);
+ break;
+
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ case OP_XCLASS:
+ tempcode += GET(tempcode, 1);
+ break;
+#endif
}
+ /* If tempcode is equal to code (which points to the end of the repeated
+ item), it means we have skipped an EXACT item but there is no following
+ QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
+ all other cases, tempcode will be pointing to the repeat opcode, and will
+ be less than code, so the value of len will be greater than 0. */
+
len = (int)(code - tempcode);
+ if (len > 0)
+ {
+ unsigned int repcode = *tempcode;
+
+ /* There is a table for possessifying opcodes, all of which are less
+ than OP_CALLOUT. A zero entry means there is no possessified version.
+ */
+
+ if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
+ *tempcode = opcode_possessify[repcode];
+
+ /* For opcode without a special possessified version, wrap the item in
+ ONCE brackets. Because we are moving code along, we must ensure that any
+ pending recursive references are updated. */
+
+ else
+ {
+ *code = OP_END;
+ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
+ memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
+ code += 1 + LINK_SIZE;
+ len += 1 + LINK_SIZE;
+ tempcode[0] = OP_ONCE;
+ *code++ = OP_KET;
+ PUTINC(code, 0, len);
+ PUT(tempcode, 1, len);
+ }
+ }
+
+#ifdef NEVER
if (len > 0) switch (*tempcode)
{
case OP_STAR: *tempcode = OP_POSSTAR; break;
@@ -5974,6 +6094,11 @@
case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
+ case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
+ case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
+ case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
+ case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
+
/* Because we are moving code along, we must ensure that any
pending recursive references are updated. */
@@ -5989,6 +6114,7 @@
PUT(tempcode, 1, len);
break;
}
+#endif
}
/* In all case we no longer have a previous item. We also set the
@@ -9044,3 +9170,4 @@
}
/* End of pcre_compile.c */
+
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/pcre_internal.h 2013-10-15 16:49:12 UTC (rev 1380)
@@ -1893,8 +1893,8 @@
ESC_E, ESC_Q, ESC_g, ESC_k,
ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu };
-/* Opcode table */
+/********************** Opcode definitions ******************/
/****** NOTE NOTE NOTE ******
@@ -1903,9 +1903,10 @@
OP_DOLLM must not be changed without adjusting the table called autoposstab in
pcre_compile.c
-Whenever this list is updated, the two macro definitions that follow must also
-be updated to match. There are also tables called "coptable" and "poptable" in
-pcre_dfa_exec.c that must be updated.
+Whenever this list is updated, the two macro definitions that follow must be
+updated to match. The possessification table called "opcode_possessify" in
+pcre_compile.c must also be updated, and also the tables called "coptable"
+and "poptable" in pcre_dfa_exec.c.
****** NOTE NOTE NOTE ******/
@@ -2170,7 +2171,8 @@
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
-called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
+called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in
+pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testinput2 2013-10-15 16:49:12 UTC (rev 1380)
@@ -829,8 +829,14 @@
/x++/DZ
-/x{1,3}+/DZ
+/x{1,3}+/BZO
+/x{1,3}+/BZOi
+
+/[^x]{1,3}+/BZO
+
+/[^x]{1,3}+/BZOi
+
/(x)*+/DZ
/^(\w++|\s++)*$/I
@@ -3941,4 +3947,12 @@
/^A\x{/
+/[ab]++/BZO
+
+/[^ab]*+/BZO
+
+/a{4}+/BZO
+
+/a{4}+/BZOi
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testinput7
===================================================================
--- code/trunk/testdata/testinput7 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testinput7 2013-10-15 16:49:12 UTC (rev 1380)
@@ -815,4 +815,8 @@
/\w+/8CWBZ
abcd
+/[\p{N}]?+/BZO
+
+/[\p{L}ab]{2,3}+/BZO
+
/-- End of testinput7 --/
Modified: code/trunk/testdata/testoutput11-16
===================================================================
--- code/trunk/testdata/testoutput11-16 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput11-16 2013-10-15 16:49:12 UTC (rev 1380)
@@ -100,15 +100,13 @@
------------------------------------------------------------------
/x{1,3}+/BM
-Memory allocation (code space): 28
+Memory allocation (code space): 20
------------------------------------------------------------------
- 0 11 Bra
- 2 7 Once
- 4 x
- 6 x{0,2}+
- 9 7 Ket
- 11 11 Ket
- 13 End
+ 0 7 Bra
+ 2 x
+ 4 x{0,2}+
+ 7 7 Ket
+ 9 End
------------------------------------------------------------------
/(x)*+/BM
Modified: code/trunk/testdata/testoutput11-32
===================================================================
--- code/trunk/testdata/testoutput11-32 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput11-32 2013-10-15 16:49:12 UTC (rev 1380)
@@ -100,15 +100,13 @@
------------------------------------------------------------------
/x{1,3}+/BM
-Memory allocation (code space): 56
+Memory allocation (code space): 40
------------------------------------------------------------------
- 0 11 Bra
- 2 7 Once
- 4 x
- 6 x{0,2}+
- 9 7 Ket
- 11 11 Ket
- 13 End
+ 0 7 Bra
+ 2 x
+ 4 x{0,2}+
+ 7 7 Ket
+ 9 End
------------------------------------------------------------------
/(x)*+/BM
Modified: code/trunk/testdata/testoutput11-8
===================================================================
--- code/trunk/testdata/testoutput11-8 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput11-8 2013-10-15 16:49:12 UTC (rev 1380)
@@ -100,15 +100,13 @@
------------------------------------------------------------------
/x{1,3}+/BM
-Memory allocation (code space): 19
+Memory allocation (code space): 13
------------------------------------------------------------------
- 0 15 Bra
- 3 9 Once
- 6 x
- 8 x{0,2}+
- 12 9 Ket
- 15 15 Ket
- 18 End
+ 0 9 Bra
+ 3 x
+ 5 x{0,2}+
+ 9 9 Ket
+ 12 End
------------------------------------------------------------------
/(x)*+/BM
Modified: code/trunk/testdata/testoutput17
===================================================================
--- code/trunk/testdata/testoutput17 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput17 2013-10-15 16:49:12 UTC (rev 1380)
@@ -440,11 +440,9 @@
/i [^\x{8000}]*
/i [^\x{7fff}]{2}
/i [^\x{7fff}]{0,7}?
- Once
/i [^\x{100}]{5}
/i [^\x{100}]?+
Ket
- Ket
End
------------------------------------------------------------------
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput2 2013-10-15 16:49:12 UTC (rev 1380)
@@ -2893,21 +2893,42 @@
First char = 'x'
No need char
-/x{1,3}+/DZ
+/x{1,3}+/BZO
------------------------------------------------------------------
Bra
- Once
x
x{0,2}+
Ket
+ End
+------------------------------------------------------------------
+
+/x{1,3}+/BZOi
+------------------------------------------------------------------
+ Bra
+ /i x
+ /i x{0,2}+
Ket
End
------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'x'
-No need char
+/[^x]{1,3}+/BZO
+------------------------------------------------------------------
+ Bra
+ [^x]
+ [^x]{0,2}+
+ Ket
+ End
+------------------------------------------------------------------
+
+/[^x]{1,3}+/BZOi
+------------------------------------------------------------------
+ Bra
+ /i [^x]
+ /i [^x]{0,2}+
+ Ket
+ End
+------------------------------------------------------------------
+
/(x)*+/DZ
------------------------------------------------------------------
Bra
@@ -4590,10 +4611,8 @@
/[ab]{1}+/DZ
------------------------------------------------------------------
Bra
- Once
[ab]{1,1}+
Ket
- Ket
End
------------------------------------------------------------------
Capturing subpattern count = 0
@@ -13764,4 +13783,36 @@
/^A\x{/
Failed: non-hex character in \x{} (closing brace missing?) at offset 5
+/[ab]++/BZO
+------------------------------------------------------------------
+ Bra
+ [ab]++
+ Ket
+ End
+------------------------------------------------------------------
+
+/[^ab]*+/BZO
+------------------------------------------------------------------
+ Bra
+ [\x00-`c-\xff] (neg)*+
+ Ket
+ End
+------------------------------------------------------------------
+
+/a{4}+/BZO
+------------------------------------------------------------------
+ Bra
+ a{4}
+ Ket
+ End
+------------------------------------------------------------------
+
+/a{4}+/BZOi
+------------------------------------------------------------------
+ Bra
+ /i a{4}
+ Ket
+ End
+------------------------------------------------------------------
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput5 2013-10-15 16:49:12 UTC (rev 1380)
@@ -1834,11 +1834,9 @@
/i [^\x{8000}]*
/i [^\x{7fff}]{2}
/i [^\x{7fff}]{0,7}?
- Once
/i [^\x{fffff}]{5}
/i [^\x{fffff}]?+
Ket
- Ket
End
------------------------------------------------------------------
Modified: code/trunk/testdata/testoutput7
===================================================================
--- code/trunk/testdata/testoutput7 2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput7 2013-10-15 16:49:12 UTC (rev 1380)
@@ -2136,4 +2136,20 @@
+3 ^ ^
0: abcd
+/[\p{N}]?+/BZO
+------------------------------------------------------------------
+ Bra
+ [\p{N}]?+
+ Ket
+ End
+------------------------------------------------------------------
+
+/[\p{L}ab]{2,3}+/BZO
+------------------------------------------------------------------
+ Bra
+ [ab\p{L}]{2,3}+
+ Ket
+ End
+------------------------------------------------------------------
+
/-- End of testinput7 --/