[Pcre-svn] [723] code/trunk: Revert handling of atomic group…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [723] code/trunk: Revert handling of atomic groups that do not include captures to the old way of
Revision: 723
          http://vcs.pcre.org/viewvc?view=rev&revision=723
Author:   ph10
Date:     2011-10-08 16:55:23 +0100 (Sat, 08 Oct 2011)


Log Message:
-----------
Revert handling of atomic groups that do not include captures to the old way of
handling, thus reducing stack usage.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_compile.c
    code/trunk/pcre_dfa_exec.c
    code/trunk/pcre_exec.c
    code/trunk/pcre_internal.h
    code/trunk/pcre_printint.src
    code/trunk/pcre_study.c


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2011-10-07 19:18:55 UTC (rev 722)
+++ code/trunk/ChangeLog    2011-10-08 15:55:23 UTC (rev 723)
@@ -94,6 +94,11 @@


 17. If a user had set PCREGREP_COLO(U)R to something other than 1:31, the
     RunGrepTest script failed. 
+    
+18. Change 22 for version 13 caused atomic groups to use more stack. This is
+    inevitable for groups that contain captures, but it can lead to a lot of 
+    stack use in large patterns. The old behaviour has been restored for atomic 
+    groups that do not contain any capturing parentheses.  



Version 8.13 16-Aug-2011

Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2011-10-07 19:18:55 UTC (rev 722)
+++ code/trunk/pcre_compile.c    2011-10-08 15:55:23 UTC (rev 723)
@@ -1506,6 +1506,7 @@
     case OP_CBRA:
     case OP_BRA:
     case OP_ONCE:
+    case OP_ONCE_NC: 
     case OP_COND:
     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
     if (d < 0) return d;
@@ -2045,7 +2046,8 @@


   if (c == OP_BRA  || c == OP_BRAPOS ||
       c == OP_CBRA || c == OP_CBRAPOS ||
-      c == OP_ONCE || c == OP_COND)
+      c == OP_ONCE || c == OP_ONCE_NC ||
+      c == OP_COND)
     {
     BOOL empty_branch;
     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
@@ -3142,6 +3144,7 @@
   int subfirstbyte;
   int terminator;
   int mclength;
+  int tempbracount; 
   uschar mcbuffer[8];


   /* Get next byte in the pattern */
@@ -4840,8 +4843,10 @@
         uschar *ketcode = code - 1 - LINK_SIZE;
         uschar *bracode = ketcode - GET(ketcode, 1);


-        if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
-        if (*bracode == OP_ONCE)
+        if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) && 
+            possessive_quantifier) *bracode = OP_BRA;
+             
+        if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
           *ketcode = OP_KETRMAX + repeat_type;
         else
           {
@@ -5906,6 +5911,7 @@
     *code = bravalue;
     tempcode = code;
     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
+    tempbracount = cd->bracount;          /* Save value before bracket */
     length_prevgroup = 0;                 /* Initialize for pre-compile phase */


     if (!compile_regex(
@@ -5927,6 +5933,12 @@
            &length_prevgroup              /* Pre-compile phase */
          ))
       goto FAILED;
+      
+    /* If this was an atomic group and there are no capturing groups within it, 
+    generate OP_ONCE_NC instead of OP_ONCE. */ 
+      
+    if (bravalue == OP_ONCE && cd->bracount <= tempbracount) 
+      *code = OP_ONCE_NC;


     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
       cd->assert_depth -= 1;
@@ -6726,7 +6738,8 @@


    /* Other brackets */


-   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
+   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
+            op == OP_COND)
      {
      if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
      }
@@ -6830,7 +6843,7 @@


    /* Other brackets */


-   else if (op == OP_ASSERT || op == OP_ONCE)
+   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
      {
      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
      }
@@ -6900,6 +6913,7 @@
      case OP_SCBRAPOS:
      case OP_ASSERT:
      case OP_ONCE:
+     case OP_ONCE_NC: 
      case OP_COND:
      if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
        return -1;


Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c    2011-10-07 19:18:55 UTC (rev 722)
+++ code/trunk/pcre_dfa_exec.c    2011-10-08 15:55:23 UTC (rev 723)
@@ -164,7 +164,8 @@
   0,                             /* Assert not                             */
   0,                             /* Assert behind                          */
   0,                             /* Assert behind not                      */
-  0, 0, 0, 0, 0, 0,              /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, COND */
+  0, 0,                          /* ONCE, ONCE_NC                          */
+  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
   0, 0,                          /* CREF, NCREF                            */
   0, 0,                          /* RREF, NRREF                            */
@@ -232,7 +233,8 @@
   0,                             /* Assert not                             */
   0,                             /* Assert behind                          */
   0,                             /* Assert behind not                      */
-  0, 0, 0, 0, 0, 0,              /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, COND */
+  0, 0,                          /* ONCE, ONCE_NC                          */
+  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
   0, 0,                          /* CREF, NCREF                            */
   0, 0,                          /* RREF, NRREF                            */
@@ -2789,6 +2791,7 @@


       /*-----------------------------------------------------------------*/
       case OP_ONCE:
+      case OP_ONCE_NC: 
         {
         int local_offsets[2];
         int local_workspace[1000];


Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2011-10-07 19:18:55 UTC (rev 722)
+++ code/trunk/pcre_exec.c    2011-10-08 15:55:23 UTC (rev 723)
@@ -277,7 +277,7 @@
        RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
        RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
        RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
-       RM61,  RM62, RM63 };
+       RM61,  RM62, RM63, RM64, RM65, RM66 };


 /* These versions of the macros use the stack, as normal. There are debugging
 versions and production versions. Note that the "rw" argument of RMATCH isn't
@@ -793,7 +793,88 @@
     md->start_match_ptr = ecode;     
     md->mark = ecode + 2;
     RRETURN(MATCH_THEN);
+    
+    /* Handle an atomic group that does not contain any capturing parentheses.
+    This can be handled like an assertion. Prior to 8.13, all atomic groups 
+    were handled this way. In 8.13, the code was changed as below for ONCE, so 
+    that backups pass through the group and thereby reset captured values. 
+    However, this uses a lot more stack, so in 8.20, atomic groups that do not 
+    contain any captures generate OP_ONCE_NC, which can be handled in the old, 
+    less stack intensive way.


+    Check the alternative branches in turn - the matching won't pass the KET
+    for this kind of subpattern. If any one branch matches, we carry on as at
+    the end of a normal bracket, leaving the subject pointer, but resetting
+    the start-of-match value in case it was changed by \K. */
+
+    case OP_ONCE_NC:
+    prev = ecode;
+    saved_eptr = eptr;
+    do
+      {
+      RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
+      if (rrc == MATCH_MATCH)  /* Note: _not_ MATCH_ACCEPT */
+        {
+        mstart = md->start_match_ptr;
+        break;
+        }
+      if (rrc == MATCH_THEN)
+        {
+        next = ecode + GET(ecode,1);
+        if (md->start_match_ptr < next && 
+            (*ecode == OP_ALT || *next == OP_ALT))
+          rrc = MATCH_NOMATCH;
+        }    
+ 
+      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+      ecode += GET(ecode,1);
+      }
+    while (*ecode == OP_ALT);
+
+    /* If hit the end of the group (which could be repeated), fail */
+
+    if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
+
+    /* Continue as from after the group, updating the offsets high water
+    mark, since extracts may have been taken. */
+
+    do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
+
+    offset_top = md->end_offset_top;
+    eptr = md->end_match_ptr;
+
+    /* For a non-repeating ket, just continue at this level. This also
+    happens for a repeating ket if no characters were matched in the group.
+    This is the forcible breaking of infinite loops as implemented in Perl
+    5.005. */
+
+    if (*ecode == OP_KET || eptr == saved_eptr)
+      {
+      ecode += 1+LINK_SIZE;
+      break;
+      }
+
+    /* The repeating kets try the rest of the pattern or restart from the
+    preceding bracket, in the appropriate order. The second "call" of match()
+    uses tail recursion, to avoid using another stack frame. */
+
+    if (*ecode == OP_KETRMIN)
+      {
+      RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
+      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+      ecode = prev;
+      goto TAIL_RECURSE;
+      }
+    else  /* OP_KETRMAX */
+      {
+      md->match_function_type = MATCH_CBEGROUP; 
+      RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
+      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+      ecode += 1 + LINK_SIZE;
+      goto TAIL_RECURSE;
+      }
+    /* Control never gets here */
+
     /* Handle a capturing bracket, other than those that are possessive with an
     unlimited repeat. If there is space in the offset vector, save the current
     subject position in the working slot at the top of the vector. We mustn't
@@ -888,7 +969,7 @@
     /* VVVVVVVVVVVVVVVVVVVVVVVVV */


     /* Non-capturing or atomic group, except for possessive with unlimited
-    repeat. Loop for all the alternatives.
+    repeat and ONCE group with no captures. Loop for all the alternatives.


     When we get to the final alternative within the brackets, we used to return
     the result of a recursive call to match() whatever happened so it was
@@ -1745,14 +1826,15 @@
       }
     else saved_eptr = NULL;


-    /* If we are at the end of an assertion group, stop matching and return
-    MATCH_MATCH, but record the current high water mark for use by positive
-    assertions. We also need to record the match start in case it was changed
-    by \K. */
+    /* If we are at the end of an assertion group or a non-capturing atomic 
+    group, stop matching and return MATCH_MATCH, but record the current high
+    water mark for use by positive assertions. We also need to record the match
+    start in case it was changed by \K. */


-    if (*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT)
+    if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
+         *prev == OP_ONCE_NC) 
       {
-      md->end_match_ptr = eptr;      /* For ONCE */
+      md->end_match_ptr = eptr;      /* For ONCE_NC */
       md->end_offset_top = offset_top;
       md->start_match_ptr = mstart;
       MRRETURN(MATCH_MATCH);         /* Sets md->mark */
@@ -1820,11 +1902,11 @@
     /* For an ordinary non-repeating ket, just continue at this level. This
     also happens for a repeating ket if no characters were matched in the
     group. This is the forcible breaking of infinite loops as implemented in
-    Perl 5.005. For a non-repeating atomic group, establish a backup point by
-    processing the rest of the pattern at a lower level. If this results in a
-    NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
-    bypassing intermediate backup points, but resetting any captures that
-    happened along the way. */
+    Perl 5.005. For a non-repeating atomic group that includes captures,
+    establish a backup point by processing the rest of the pattern at a lower
+    level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
+    original OP_ONCE level, thereby bypassing intermediate backup points, but
+    resetting any captures that happened along the way. */


     if (*ecode == OP_KET || eptr == saved_eptr)
       {
@@ -5745,7 +5827,8 @@
   LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
   LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
   LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
-  LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
+  LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
+  LBL(65) LBL(66) 
 #ifdef SUPPORT_UTF8
   LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
   LBL(32) LBL(34) LBL(42) LBL(46)


Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2011-10-07 19:18:55 UTC (rev 722)
+++ code/trunk/pcre_internal.h    2011-10-08 15:55:23 UTC (rev 723)
@@ -1456,60 +1456,61 @@
   OP_ASSERTBACK,     /* 121 Positive lookbehind */
   OP_ASSERTBACK_NOT, /* 122 Negative lookbehind */


- /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately after the
- assertions, with ONCE first, as there's a test for >= ONCE for a subpattern
- that isn't an assertion. The POS versions must immediately follow the non-POS
- versions in each case. */
+ /* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
+ after the assertions, with ONCE first, as there's a test for >= ONCE for a
+ subpattern that isn't an assertion. The POS versions must immediately follow
+ the non-POS versions in each case. */

-  OP_ONCE,           /* 123 Atomic group */
-  OP_BRA,            /* 124 Start of non-capturing bracket */
-  OP_BRAPOS,         /* 125 Ditto, with unlimited, possessive repeat */
-  OP_CBRA,           /* 126 Start of capturing bracket */
-  OP_CBRAPOS,        /* 127 Ditto, with unlimited, possessive repeat */
-  OP_COND,           /* 128 Conditional group */
+  OP_ONCE,           /* 123 Atomic group, contains captures */
+  OP_ONCE_NC,        /* 124 Atomic group containing no captures */ 
+  OP_BRA,            /* 125 Start of non-capturing bracket */
+  OP_BRAPOS,         /* 126 Ditto, with unlimited, possessive repeat */
+  OP_CBRA,           /* 127 Start of capturing bracket */
+  OP_CBRAPOS,        /* 128 Ditto, with unlimited, possessive repeat */
+  OP_COND,           /* 129 Conditional group */


/* These five must follow the previous five, in the same order. There's a
check for >= SBRA to distinguish the two sets. */

-  OP_SBRA,           /* 129 Start of non-capturing bracket, check empty  */
-  OP_SBRAPOS,        /* 130 Ditto, with unlimited, possessive repeat */
-  OP_SCBRA,          /* 131 Start of capturing bracket, check empty */
-  OP_SCBRAPOS,       /* 132 Ditto, with unlimited, possessive repeat */
-  OP_SCOND,          /* 133 Conditional group, check empty */
+  OP_SBRA,           /* 130 Start of non-capturing bracket, check empty  */
+  OP_SBRAPOS,        /* 131 Ditto, with unlimited, possessive repeat */
+  OP_SCBRA,          /* 132 Start of capturing bracket, check empty */
+  OP_SCBRAPOS,       /* 133 Ditto, with unlimited, possessive repeat */
+  OP_SCOND,          /* 134 Conditional group, check empty */


/* The next two pairs must (respectively) be kept together. */

-  OP_CREF,           /* 134 Used to hold a capture number as condition */
-  OP_NCREF,          /* 135 Same, but generated by a name reference*/
-  OP_RREF,           /* 136 Used to hold a recursion number as condition */
-  OP_NRREF,          /* 137 Same, but generated by a name reference*/
-  OP_DEF,            /* 138 The DEFINE condition */
+  OP_CREF,           /* 135 Used to hold a capture number as condition */
+  OP_NCREF,          /* 136 Same, but generated by a name reference*/
+  OP_RREF,           /* 137 Used to hold a recursion number as condition */
+  OP_NRREF,          /* 138 Same, but generated by a name reference*/
+  OP_DEF,            /* 139 The DEFINE condition */


-  OP_BRAZERO,        /* 139 These two must remain together and in this */
-  OP_BRAMINZERO,     /* 140 order. */
-  OP_BRAPOSZERO,     /* 141 */
+  OP_BRAZERO,        /* 140 These two must remain together and in this */
+  OP_BRAMINZERO,     /* 141 order. */
+  OP_BRAPOSZERO,     /* 142 */


/* These are backtracking control verbs */

-  OP_MARK,           /* 142 always has an argument */
-  OP_PRUNE,          /* 143 */
-  OP_PRUNE_ARG,      /* 144 same, but with argument */
-  OP_SKIP,           /* 145 */
-  OP_SKIP_ARG,       /* 146 same, but with argument */
-  OP_THEN,           /* 147 */
-  OP_THEN_ARG,       /* 148 same, but with argument */
-  OP_COMMIT,         /* 149 */
+  OP_MARK,           /* 143 always has an argument */
+  OP_PRUNE,          /* 144 */
+  OP_PRUNE_ARG,      /* 145 same, but with argument */
+  OP_SKIP,           /* 146 */
+  OP_SKIP_ARG,       /* 147 same, but with argument */
+  OP_THEN,           /* 148 */
+  OP_THEN_ARG,       /* 149 same, but with argument */
+  OP_COMMIT,         /* 150 */


/* These are forced failure and success verbs */

-  OP_FAIL,           /* 150 */
-  OP_ACCEPT,         /* 151 */
-  OP_ASSERT_ACCEPT,  /* 152 Used inside assertions */
-  OP_CLOSE,          /* 153 Used before OP_ACCEPT to close open captures */
+  OP_FAIL,           /* 151 */
+  OP_ACCEPT,         /* 152 */
+  OP_ASSERT_ACCEPT,  /* 153 Used inside assertions */
+  OP_CLOSE,          /* 154 Used before OP_ACCEPT to close open captures */


/* This is used to skip a subpattern with a {0} quantifier */

-  OP_SKIPZERO,       /* 154 */
+  OP_SKIPZERO,       /* 155 */


   /* This is not an opcode, but is used to check that tables indexed by opcode
   are the correct length, in order to catch updating errors - there have been
@@ -1553,7 +1554,7 @@
   "Recurse", "Callout",                                           \
   "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos",                  \
   "Reverse", "Assert", "Assert not", "AssertB", "AssertB not",    \
-  "Once",                                                         \
+  "Once", "Once_NC",                                              \
   "Bra", "BraPos", "CBra", "CBraPos",                             \
   "Cond",                                                         \
   "SBra", "SBraPos", "SCBra", "SCBraPos",                         \
@@ -1627,6 +1628,7 @@
   1+LINK_SIZE,                   /* Assert behind                          */ \
   1+LINK_SIZE,                   /* Assert behind not                      */ \
   1+LINK_SIZE,                   /* ONCE                                   */ \
+  1+LINK_SIZE,                   /* ONCE_NC                                */ \
   1+LINK_SIZE,                   /* BRA                                    */ \
   1+LINK_SIZE,                   /* BRAPOS                                 */ \
   3+LINK_SIZE,                   /* CBRA                                   */ \


Modified: code/trunk/pcre_printint.src
===================================================================
--- code/trunk/pcre_printint.src    2011-10-07 19:18:55 UTC (rev 722)
+++ code/trunk/pcre_printint.src    2011-10-08 15:55:23 UTC (rev 723)
@@ -260,6 +260,7 @@
     case OP_ASSERTBACK:
     case OP_ASSERTBACK_NOT:
     case OP_ONCE:
+    case OP_ONCE_NC: 
     case OP_COND:
     case OP_SCOND:
     case OP_REVERSE:


Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c    2011-10-07 19:18:55 UTC (rev 722)
+++ code/trunk/pcre_study.c    2011-10-08 15:55:23 UTC (rev 723)
@@ -127,6 +127,7 @@
     case OP_BRAPOS:
     case OP_SBRAPOS:
     case OP_ONCE:
+    case OP_ONCE_NC:
     d = find_minlength(cc, startcode, options, recurse_depth);
     if (d < 0) return d;
     branchlength += d;
@@ -810,6 +811,7 @@
       case OP_CBRAPOS:
       case OP_SCBRAPOS:
       case OP_ONCE:
+      case OP_ONCE_NC:
       case OP_ASSERT:
       rc = set_start_bits(tcode, start_bits, utf8, cd);
       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;