[Pcre-svn] [1380] code/trunk: Explicit possessive quantifier…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1380] code/trunk: Explicit possessive quantifiers now use the new opcodes.
Revision: 1380
          http://vcs.pcre.org/viewvc?view=rev&revision=1380
Author:   ph10
Date:     2013-10-15 17:49:12 +0100 (Tue, 15 Oct 2013)


Log Message:
-----------
Explicit possessive quantifiers now use the new opcodes. Fixed an infelicity
with EXACT in caseless mode.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_compile.c
    code/trunk/pcre_internal.h
    code/trunk/testdata/testinput2
    code/trunk/testdata/testinput7
    code/trunk/testdata/testoutput11-16
    code/trunk/testdata/testoutput11-32
    code/trunk/testdata/testoutput11-8
    code/trunk/testdata/testoutput17
    code/trunk/testdata/testoutput2
    code/trunk/testdata/testoutput5
    code/trunk/testdata/testoutput7


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/ChangeLog    2013-10-15 16:49:12 UTC (rev 1380)
@@ -135,6 +135,13 @@


 27. Add JIT support for the 64 bit TileGX architecture.
     Patch by Jiong Wang (Tilera Corporation).
+    
+28. Possessive quantifiers for classes (both explicit and automatically
+    generated) now use special opcodes instead of wrapping in ONCE brackets.
+    
+29. Whereas an item such as A{4}+ ignored the possessivenes of the quantifier 
+    (because it's meaningless), this was not happening when PCRE_CASELESS was 
+    set. Not wrong, but inefficient. 



Version 8.33 28-May-2013

Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/pcre_compile.c    2013-10-15 16:49:12 UTC (rev 1380)
@@ -777,8 +777,66 @@
   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
 };


+/* This table is used when converting repeating opcodes into possessified
+versions as a result of an explicit possessive quantifier such as ++. A zero
+value means there is no possessified version - in those cases the item in
+question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
+because all relevant opcodes are less than that. */

+static const pcre_uint8 opcode_possessify[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */

+  0,                       /* NOTI */
+  OP_POSSTAR, 0,           /* STAR, MINSTAR */
+  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
+  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
+  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
+  0,                       /* EXACT */
+  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
+
+  OP_POSSTARI, 0,          /* STARI, MINSTARI */
+  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
+  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
+  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
+  0,                       /* EXACTI */
+  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
+
+  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
+  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
+  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
+  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
+  0,                       /* NOTEXACT */
+  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
+
+  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
+  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
+  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
+  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
+  0,                       /* NOTEXACTI */
+  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
+
+  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
+  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
+  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
+  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
+  0,                       /* TYPEEXACT */
+  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
+
+  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
+  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
+  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
+  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
+  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
+
+  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
+  0, 0,                    /* REF, REFI */
+  0, 0,                    /* DNREF, DNREFI */
+  0, 0                     /* RECURSE, CALLOUT */
+};
+
+
+
 /*************************************************
 *            Find an error text                  *
 *************************************************/
@@ -2722,10 +2780,10 @@
     HSPACE_CASES:
     VSPACE_CASES:
     return negated;
-    
+
     default:
     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
-    }       
+    }
   break;  /* Control never reaches here */


   case PT_WORD:
@@ -3399,7 +3457,7 @@
 for (;;)
   {
   c = *code;
-  
+
   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
     {
     c -= get_repeat_base(c) - OP_STAR;
@@ -3461,7 +3519,7 @@
       /* end must not be NULL. */
       end = get_chr_property_list(code, utf, cd->fcc, list);


-      list[1] = d == OP_CRSTAR || d == OP_CRPLUS || d == OP_CRQUERY || 
+      list[1] = d == OP_CRSTAR || d == OP_CRPLUS || d == OP_CRQUERY ||
         d == OP_CRRANGE;


       if (compare_opcodes(end, utf, cd, list, end))
@@ -5910,43 +5968,105 @@
       goto FAILED;
       }


-    /* If the character following a repeat is '+', or if certain optimization
-    tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
-    there are special alternative opcodes for this case. For anything else, we
-    wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
-    notation is just syntactic sugar, taken from Sun's Java package, but the
-    special opcodes can optimize it.
+    /* If the character following a repeat is '+', possessive_quantifier is
+    TRUE. For some opcodes, there are special alternative opcodes for this
+    case. For anything else, we wrap the entire repeated item inside OP_ONCE
+    brackets. Logically, the '+' notation is just syntactic sugar, taken from
+    Sun's Java package, but the special opcodes can optimize it.


     Some (but not all) possessively repeated subpatterns have already been
     completely handled in the code just above. For them, possessive_quantifier
-    is always FALSE at this stage.
+    is always FALSE at this stage. Note that the repeated item starts at
+    tempcode, not at previous, which might be the first part of a string whose
+    (former) last char we repeated. */


-    Note that the repeated item starts at tempcode, not at previous, which
-    might be the first part of a string whose (former) last char we repeated.
-
-    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
-    an 'upto' may follow. We skip over an 'exact' item, and then test the
-    length of what remains before proceeding. */
-
     if (possessive_quantifier)
       {
       int len;


-      if (*tempcode == OP_TYPEEXACT)
+      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
+      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
+      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
+      remains is greater than zero, there's a further opcode that can be
+      handled. If not, do nothing, leaving the EXACT alone. */
+
+      switch(*tempcode)
+        {
+        case OP_TYPEEXACT:
         tempcode += PRIV(OP_lengths)[*tempcode] +
           ((tempcode[1 + IMM2_SIZE] == OP_PROP
           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
+        break;


-      else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
-        {
+        /* CHAR opcodes are used for exacts whose count is 1. */
+
+        case OP_CHAR:
+        case OP_CHARI:
+        case OP_NOT:
+        case OP_NOTI:
+        case OP_EXACT:
+        case OP_EXACTI:
+        case OP_NOTEXACT:
+        case OP_NOTEXACTI:
         tempcode += PRIV(OP_lengths)[*tempcode];
 #ifdef SUPPORT_UTF
         if (utf && HAS_EXTRALEN(tempcode[-1]))
           tempcode += GET_EXTRALEN(tempcode[-1]);
 #endif
+        break;
+
+        /* For the class opcodes, the repeat operator appears at the end;
+        adjust tempcode to point to it. */
+
+        case OP_CLASS:
+        case OP_NCLASS:
+        tempcode += 1 + 32/sizeof(pcre_uchar);
+        break;
+
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+        case OP_XCLASS:
+        tempcode += GET(tempcode, 1);
+        break;
+#endif
         }


+      /* If tempcode is equal to code (which points to the end of the repeated
+      item), it means we have skipped an EXACT item but there is no following
+      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
+      all other cases, tempcode will be pointing to the repeat opcode, and will
+      be less than code, so the value of len will be greater than 0. */
+
       len = (int)(code - tempcode);
+      if (len > 0)
+        { 
+        unsigned int repcode = *tempcode;
+        
+        /* There is a table for possessifying opcodes, all of which are less
+        than OP_CALLOUT. A zero entry means there is no possessified version.
+        */
+        
+        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
+          *tempcode = opcode_possessify[repcode];
+        
+        /* For opcode without a special possessified version, wrap the item in
+        ONCE brackets. Because we are moving code along, we must ensure that any
+        pending recursive references are updated. */
+        
+        else
+          {
+          *code = OP_END;
+          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
+          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
+          code += 1 + LINK_SIZE;
+          len += 1 + LINK_SIZE;
+          tempcode[0] = OP_ONCE;
+          *code++ = OP_KET;
+          PUTINC(code, 0, len);
+          PUT(tempcode, 1, len);
+          }
+        }   
+
+#ifdef NEVER
       if (len > 0) switch (*tempcode)
         {
         case OP_STAR:  *tempcode = OP_POSSTAR; break;
@@ -5974,6 +6094,11 @@
         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;


+        case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
+        case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
+        case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
+        case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
+
         /* Because we are moving code along, we must ensure that any
         pending recursive references are updated. */


@@ -5989,6 +6114,7 @@
         PUT(tempcode, 1, len);
         break;
         }
+#endif
       }


     /* In all case we no longer have a previous item. We also set the
@@ -9044,3 +9170,4 @@
 }


/* End of pcre_compile.c */
+

Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/pcre_internal.h    2013-10-15 16:49:12 UTC (rev 1380)
@@ -1893,8 +1893,8 @@
        ESC_E, ESC_Q, ESC_g, ESC_k,
        ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu };


-/* Opcode table */

+/********************** Opcode definitions ******************/

/****** NOTE NOTE NOTE ******

@@ -1903,9 +1903,10 @@
OP_DOLLM must not be changed without adjusting the table called autoposstab in
pcre_compile.c

-Whenever this list is updated, the two macro definitions that follow must also
-be updated to match. There are also tables called "coptable" and "poptable" in
-pcre_dfa_exec.c that must be updated.
+Whenever this list is updated, the two macro definitions that follow must be
+updated to match. The possessification table called "opcode_possessify" in
+pcre_compile.c must also be updated, and also the tables called "coptable"
+and "poptable" in pcre_dfa_exec.c.

****** NOTE NOTE NOTE ******/

@@ -2170,7 +2171,8 @@

/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
-called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
+called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in
+pcre_dfa_exec.c that must be updated. */


/* This macro defines textual names for all the opcodes. These are used only

Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testinput2    2013-10-15 16:49:12 UTC (rev 1380)
@@ -829,8 +829,14 @@


/x++/DZ

-/x{1,3}+/DZ
+/x{1,3}+/BZO

+/x{1,3}+/BZOi
+
+/[^x]{1,3}+/BZO
+
+/[^x]{1,3}+/BZOi
+
/(x)*+/DZ

/^(\w++|\s++)*$/I
@@ -3941,4 +3947,12 @@

/^A\x{/

+/[ab]++/BZO
+
+/[^ab]*+/BZO
+
+/a{4}+/BZO
+
+/a{4}+/BZOi
+
/-- End of testinput2 --/

Modified: code/trunk/testdata/testinput7
===================================================================
--- code/trunk/testdata/testinput7    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testinput7    2013-10-15 16:49:12 UTC (rev 1380)
@@ -815,4 +815,8 @@
 /\w+/8CWBZ
     abcd


+/[\p{N}]?+/BZO
+
+/[\p{L}ab]{2,3}+/BZO
+
/-- End of testinput7 --/

Modified: code/trunk/testdata/testoutput11-16
===================================================================
--- code/trunk/testdata/testoutput11-16    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput11-16    2013-10-15 16:49:12 UTC (rev 1380)
@@ -100,15 +100,13 @@
 ------------------------------------------------------------------


 /x{1,3}+/BM 
-Memory allocation (code space): 28
+Memory allocation (code space): 20
 ------------------------------------------------------------------
-  0  11 Bra
-  2   7 Once
-  4     x
-  6     x{0,2}+
-  9   7 Ket
- 11  11 Ket
- 13     End
+  0   7 Bra
+  2     x
+  4     x{0,2}+
+  7   7 Ket
+  9     End
 ------------------------------------------------------------------


/(x)*+/BM

Modified: code/trunk/testdata/testoutput11-32
===================================================================
--- code/trunk/testdata/testoutput11-32    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput11-32    2013-10-15 16:49:12 UTC (rev 1380)
@@ -100,15 +100,13 @@
 ------------------------------------------------------------------


 /x{1,3}+/BM 
-Memory allocation (code space): 56
+Memory allocation (code space): 40
 ------------------------------------------------------------------
-  0  11 Bra
-  2   7 Once
-  4     x
-  6     x{0,2}+
-  9   7 Ket
- 11  11 Ket
- 13     End
+  0   7 Bra
+  2     x
+  4     x{0,2}+
+  7   7 Ket
+  9     End
 ------------------------------------------------------------------


/(x)*+/BM

Modified: code/trunk/testdata/testoutput11-8
===================================================================
--- code/trunk/testdata/testoutput11-8    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput11-8    2013-10-15 16:49:12 UTC (rev 1380)
@@ -100,15 +100,13 @@
 ------------------------------------------------------------------


 /x{1,3}+/BM 
-Memory allocation (code space): 19
+Memory allocation (code space): 13
 ------------------------------------------------------------------
-  0  15 Bra
-  3   9 Once
-  6     x
-  8     x{0,2}+
- 12   9 Ket
- 15  15 Ket
- 18     End
+  0   9 Bra
+  3     x
+  5     x{0,2}+
+  9   9 Ket
+ 12     End
 ------------------------------------------------------------------


/(x)*+/BM

Modified: code/trunk/testdata/testoutput17
===================================================================
--- code/trunk/testdata/testoutput17    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput17    2013-10-15 16:49:12 UTC (rev 1380)
@@ -440,11 +440,9 @@
      /i [^\x{8000}]*
      /i [^\x{7fff}]{2}
      /i [^\x{7fff}]{0,7}?
-        Once
      /i [^\x{100}]{5}
      /i [^\x{100}]?+
         Ket
-        Ket
         End
 ------------------------------------------------------------------



Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput2    2013-10-15 16:49:12 UTC (rev 1380)
@@ -2893,21 +2893,42 @@
 First char = 'x'
 No need char


-/x{1,3}+/DZ
+/x{1,3}+/BZO
 ------------------------------------------------------------------
         Bra
-        Once
         x
         x{0,2}+
         Ket
+        End
+------------------------------------------------------------------
+
+/x{1,3}+/BZOi
+------------------------------------------------------------------
+        Bra
+     /i x
+     /i x{0,2}+
         Ket
         End
 ------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'x'
-No need char


+/[^x]{1,3}+/BZO
+------------------------------------------------------------------
+        Bra
+        [^x]
+        [^x]{0,2}+
+        Ket
+        End
+------------------------------------------------------------------
+
+/[^x]{1,3}+/BZOi
+------------------------------------------------------------------
+        Bra
+     /i [^x]
+     /i [^x]{0,2}+
+        Ket
+        End
+------------------------------------------------------------------
+
 /(x)*+/DZ
 ------------------------------------------------------------------
         Bra
@@ -4590,10 +4611,8 @@
 /[ab]{1}+/DZ
 ------------------------------------------------------------------
         Bra
-        Once
         [ab]{1,1}+
         Ket
-        Ket
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
@@ -13764,4 +13783,36 @@
 /^A\x{/
 Failed: non-hex character in \x{} (closing brace missing?) at offset 5


+/[ab]++/BZO
+------------------------------------------------------------------
+        Bra
+        [ab]++
+        Ket
+        End
+------------------------------------------------------------------
+
+/[^ab]*+/BZO
+------------------------------------------------------------------
+        Bra
+        [\x00-`c-\xff] (neg)*+
+        Ket
+        End
+------------------------------------------------------------------
+
+/a{4}+/BZO
+------------------------------------------------------------------
+        Bra
+        a{4}
+        Ket
+        End
+------------------------------------------------------------------
+
+/a{4}+/BZOi
+------------------------------------------------------------------
+        Bra
+     /i a{4}
+        Ket
+        End
+------------------------------------------------------------------
+
 /-- End of testinput2 --/


Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput5    2013-10-15 16:49:12 UTC (rev 1380)
@@ -1834,11 +1834,9 @@
      /i [^\x{8000}]*
      /i [^\x{7fff}]{2}
      /i [^\x{7fff}]{0,7}?
-        Once
      /i [^\x{fffff}]{5}
      /i [^\x{fffff}]?+
         Ket
-        Ket
         End
 ------------------------------------------------------------------



Modified: code/trunk/testdata/testoutput7
===================================================================
--- code/trunk/testdata/testoutput7    2013-10-14 13:54:07 UTC (rev 1379)
+++ code/trunk/testdata/testoutput7    2013-10-15 16:49:12 UTC (rev 1380)
@@ -2136,4 +2136,20 @@
  +3 ^   ^    
  0: abcd


+/[\p{N}]?+/BZO
+------------------------------------------------------------------
+        Bra
+        [\p{N}]?+
+        Ket
+        End
+------------------------------------------------------------------
+
+/[\p{L}ab]{2,3}+/BZO
+------------------------------------------------------------------
+        Bra
+        [ab\p{L}]{2,3}+
+        Ket
+        End
+------------------------------------------------------------------
+
 /-- End of testinput7 --/