[Pcre-svn] [916] code/trunk: Fix several partial matching bu…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [916] code/trunk: Fix several partial matching bugs for backrefs, \R, \X, and CRLF line endings.
Revision: 916
          http://vcs.pcre.org/viewvc?view=rev&revision=916
Author:   ph10
Date:     2012-02-15 09:50:53 +0000 (Wed, 15 Feb 2012)


Log Message:
-----------
Fix several partial matching bugs for backrefs, \R, \X, and CRLF line endings.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_dfa_exec.c
    code/trunk/pcre_exec.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput2
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2012-02-14 13:05:39 UTC (rev 915)
+++ code/trunk/ChangeLog    2012-02-15 09:50:53 UTC (rev 916)
@@ -14,7 +14,21 @@


4. Partial matching support is added to the JIT compiler.

-5.  Improved \X and back reference partial matching.
+5.  Fixed several bugs concerned with partial matching of items that consist
+    of more than one character:
+    
+    (a) /^(..)\1/ did not partially match "aba" because checking references was
+        done on an "all or nothing" basis. This also applied to repeated 
+        references.
+        
+    (b) \R did not give a hard partial match if \r was found at the end of the
+        subject.
+        
+    (c) \X did not give a hard partial match after matching one or more 
+        characters at the end of the subject.
+        
+    (d) When newline was set to CRLF, a pattern such as /a$/ did not give a 
+        partial match for the string "\r".



Version 8.30 04-February-2012

Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c    2012-02-14 13:05:39 UTC (rev 915)
+++ code/trunk/pcre_dfa_exec.c    2012-02-15 09:50:53 UTC (rev 916)
@@ -424,6 +424,8 @@
 BOOL utf = FALSE;
 #endif


+BOOL reset_could_continue = FALSE;
+
rlevel++;
offsetcount &= (-2);

@@ -571,8 +573,10 @@
int clen, dlen;
unsigned int c, d;
int forced_fail = 0;
- BOOL could_continue = FALSE;
-
+ BOOL partial_newline = FALSE;
+ BOOL could_continue = reset_could_continue;
+ reset_could_continue = FALSE;
+
/* Make the new state list into the active state list and empty the
new state list. */

@@ -641,7 +645,8 @@

     /* A negative offset is a special case meaning "hold off going to this
     (negated) state until the number of characters in the data field have
-    been skipped". */
+    been skipped". If the could_continue flag was passed over from a previous 
+    state, arrange for it to be passed on. */


     if (state_offset < 0)
       {
@@ -650,6 +655,7 @@
         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
         ADD_NEW_DATA(state_offset, current_state->count,
           current_state->data - 1);
+        if (could_continue) reset_could_continue = TRUE;
         continue;
         }
       else
@@ -916,6 +922,19 @@
                (ptr == end_subject - md->nllen)
             ))
           { ADD_ACTIVE(state_offset + 1, 0); }
+        else if (ptr + 1 >= md->end_subject &&
+                 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
+                 NLBLOCK->nltype == NLTYPE_FIXED &&
+                 NLBLOCK->nllen == 2 && 
+                 c == NLBLOCK->nl[0])
+          {
+          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+            {
+            reset_could_continue = TRUE;
+            ADD_NEW_DATA(-(state_offset + 1), 0, 1);  
+            }  
+          else could_continue = partial_newline = TRUE; 
+          } 
         }
       break;


@@ -928,6 +947,19 @@
         else if (clen == 0 ||
             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
           { ADD_ACTIVE(state_offset + 1, 0); }
+        else if (ptr + 1 >= md->end_subject &&
+                 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
+                 NLBLOCK->nltype == NLTYPE_FIXED &&
+                 NLBLOCK->nllen == 2 && 
+                 c == NLBLOCK->nl[0])
+          {
+          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+            {
+            reset_could_continue = TRUE;
+            ADD_NEW_DATA(-(state_offset + 1), 0, 1);  
+            }  
+          else could_continue = partial_newline = TRUE; 
+          } 
         }
       else if (IS_NEWLINE(ptr))
         { ADD_ACTIVE(state_offset + 1, 0); }
@@ -1824,6 +1856,8 @@
           ncount++;
           nptr += ndlen;
           }
+        if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) 
+            reset_could_continue = TRUE; 
         if (++count >= GET2(code, 1))
           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
         else
@@ -2037,6 +2071,8 @@
           ncount++;
           nptr += nclen;
           }
+        if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) 
+            reset_could_continue = TRUE; 
         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
         }
       break;
@@ -2062,14 +2098,20 @@
         break;


         case 0x000d:
-        if (ptr + 1 < end_subject && ptr[1] == 0x0a)
+        if (ptr + 1 >= end_subject) 
           {
+          ADD_NEW(state_offset + 1, 0); 
+          if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 
+            reset_could_continue = TRUE; 
+          }  
+        else if (ptr[1] == 0x0a)
+          {
           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
           }
         else
-          {
+          { 
           ADD_NEW(state_offset + 1, 0);
-          }
+          } 
         break;
         }
       break;
@@ -2942,7 +2984,7 @@


   The "could_continue" variable is true if a state could have continued but
   for the fact that the end of the subject was reached. */
-
+  
   if (new_count <= 0)
     {
     if (rlevel == 1 &&                               /* Top level, and */
@@ -2954,7 +2996,10 @@
         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
          match_count < 0)                            /* no matches */
         ) &&                                         /* And... */
-        ptr >= end_subject &&                  /* Reached end of subject */
+        (
+        ptr >= end_subject ||                  /* Reached end of subject or */
+        partial_newline                        /* a partial newline */
+        ) && 
         ptr > md->start_used_ptr)              /* Inspected non-empty string */
       {
       if (offsetcount >= 2)


Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2012-02-14 13:05:39 UTC (rev 915)
+++ code/trunk/pcre_exec.c    2012-02-15 09:50:53 UTC (rev 916)
@@ -140,14 +140,16 @@
   md          points to match data block
   caseless    TRUE if caseless


-Returns:      < 0 if not matched, otherwise the number of subject bytes matched
+Returns:      >= 0 the number of subject bytes matched
+              -1 no match
+              -2 partial match; always given if at end of subject
 */


static int
match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
BOOL caseless)
{
-int matched_length = length;
+PCRE_PUCHAR eptr_start = eptr;
register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];

#ifdef PCRE_DEBUG
@@ -163,7 +165,8 @@
printf("\n");
#endif

-/* Always fail if reference not set (and not JavaScript compatible). */
+/* Always fail if reference not set (and not JavaScript compatible - in that
+case the length is passed as zero). */

if (length < 0) return -1;

@@ -186,16 +189,14 @@
     reference, not along the subject (earlier code did this wrong). */


     PCRE_PUCHAR endptr = p + length;
-    PCRE_PUCHAR eptr_start = eptr;
     while (p < endptr)
       {
       int c, d;
-      if (eptr >= md->end_subject) return -((int)(eptr - eptr_start) + 1);
+      if (eptr >= md->end_subject) return -2;   /* Partial match */
       GETCHARINC(c, eptr);
       GETCHARINC(d, p);
       if (c != d && c != UCD_OTHERCASE(d)) return -1;
       }
-    matched_length = (int)(eptr - eptr_start);
     }
   else
 #endif
@@ -204,15 +205,9 @@
   /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
   is no UCP support. */
     {
-    if (eptr + length > md->end_subject)
-      {
-      if (md->partial == 0)
-        return -1;
-      length = (int)(md->end_subject - eptr);
-      matched_length = -(length + 1);
-      }
     while (length-- > 0)
       {
+      if (eptr >= md->end_subject) return -2;   /* Partial match */
       if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
       p++;
       eptr++;
@@ -225,17 +220,14 @@


 else
   {
-  if (eptr + length > md->end_subject)
+  while (length-- > 0) 
     {
-    if (md->partial == 0)
-      return -1;
-    length = (int)(md->end_subject - eptr);
-    matched_length = -(length + 1);
-    }
-  while (length-- > 0) if (*p++ != *eptr++) return -1;
+    if (eptr >= md->end_subject) return -2;   /* Partial match */
+    if (*p++ != *eptr++) return -1;
+    } 
   }


-return matched_length;
+return (int)(eptr - eptr_start);
}


@@ -2073,7 +2065,21 @@

     case OP_DOLLM:
     if (eptr < md->end_subject)
-      { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
+      { 
+      if (!IS_NEWLINE(eptr)) 
+        {
+        if (eptr + 1 >= md->end_subject &&
+            md->partial != 0 &&
+            NLBLOCK->nltype == NLTYPE_FIXED &&
+            NLBLOCK->nllen == 2 && 
+            *eptr == NLBLOCK->nl[0])
+          {  
+          md->hitend = TRUE;
+          if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
+          }
+        RRETURN(MATCH_NOMATCH); 
+        } 
+      }
     else
       {
       if (md->noteol) RRETURN(MATCH_NOMATCH);
@@ -2105,7 +2111,18 @@
     ASSERT_NL_OR_EOS:
     if (eptr < md->end_subject &&
         (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
+      {
+      if (eptr + 1 >= md->end_subject &&
+          md->partial != 0 &&
+          NLBLOCK->nltype == NLTYPE_FIXED &&
+          NLBLOCK->nllen == 2 && 
+          *eptr == NLBLOCK->nl[0])
+        {  
+        md->hitend = TRUE;
+        if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
+        }
       RRETURN(MATCH_NOMATCH);
+      } 


     /* Either at end of string or \n before end. */


@@ -2379,7 +2396,11 @@
       default: RRETURN(MATCH_NOMATCH);


       case 0x000d:
-      if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+      if (eptr >= md->end_subject)
+        {
+        SCHECK_PARTIAL();
+        }    
+      else if (*eptr == 0x0a) eptr++;
       break;


       case 0x000a:
@@ -2609,10 +2630,7 @@
       if (UCD_CATEGORY(c) != ucp_M) break;
       eptr += len;
       }
-    if (md->partial != 0 && eptr >= md->end_subject)
-      {
-      SCHECK_PARTIAL();
-      }
+    CHECK_PARTIAL();   
     ecode++;
     break;
 #endif
@@ -2678,7 +2696,7 @@
       default:               /* No repeat follows */
       if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
         {
-        eptr += -(length + 1);
+        if (length == -2) eptr = md->end_subject;   /* Partial match */ 
         CHECK_PARTIAL();
         RRETURN(MATCH_NOMATCH);
         }
@@ -2704,7 +2722,7 @@
       int slength;
       if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
         {
-        eptr += -(slength + 1);
+        if (slength == -2) eptr = md->end_subject;   /* Partial match */ 
         CHECK_PARTIAL();
         RRETURN(MATCH_NOMATCH);
         }
@@ -2728,7 +2746,7 @@
         if (fi >= max) RRETURN(MATCH_NOMATCH);
         if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
           {
-          eptr += -(slength + 1);
+          if (slength == -2) eptr = md->end_subject;   /* Partial match */ 
           CHECK_PARTIAL();
           RRETURN(MATCH_NOMATCH);
           }
@@ -2747,14 +2765,20 @@
         int slength;
         if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
           {
-          /* Restore the eptr after the check. */
-          eptr += -(slength + 1);
-          CHECK_PARTIAL();
-          eptr -= -(slength + 1);
+          /* Can't use CHECK_PARTIAL because we don't want to update eptr in 
+          the soft partial matching case. */ 
+           
+          if (slength == -2 && md->partial != 0 && 
+              md->end_subject > md->start_used_ptr)
+            {  
+            md->hitend = TRUE;
+            if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
+            }
           break;
           }
         eptr += slength;
         }
+         
       while (eptr >= pp)
         {
         RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
@@ -4188,11 +4212,8 @@
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
+          CHECK_PARTIAL();   
           }
-        if (md->partial != 0 && eptr >= md->end_subject)
-          {
-          SCHECK_PARTIAL();
-          }
         }


       else
@@ -4976,10 +4997,7 @@
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
-          if (md->partial != 0 && eptr >= md->end_subject)
-            {
-            SCHECK_PARTIAL();
-            }
+          CHECK_PARTIAL();   
           }
         }
       else
@@ -5523,10 +5541,7 @@
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
-          if (eptr >= md->end_subject)
-            {
-            SCHECK_PARTIAL();
-            }
+          CHECK_PARTIAL();   
           }


         /* eptr is now past the end of the maximum run */
@@ -6318,8 +6333,7 @@
 /* If the pattern was successfully studied with JIT support, run the JIT
 executable instead of the rest of this function. Most options must be set at
 compile time for the JIT code to be usable. Fall back to the normal code path if
-an unsupported flag is set. In particular, JIT does not support partial
-matching. */
+an unsupported flag is set. */


 #ifdef SUPPORT_JIT
 if (extra_data != NULL
@@ -6334,10 +6348,11 @@
     (const pcre_uchar *)subject, length, start_offset, options,
     ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
     ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
+     
   /* PCRE_ERROR_NULL means that the selected normal or partial matching
  mode is not compiled. In this case we simply fall back to the interpreter. */
-  if (rc != PCRE_ERROR_NULL)
-    return rc;
+   
+  if (rc != PCRE_ERROR_NULL) return rc;
   }
 #endif



Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2012-02-14 13:05:39 UTC (rev 915)
+++ code/trunk/testdata/testinput2    2012-02-15 09:50:53 UTC (rev 916)
@@ -3640,4 +3640,24 @@
     abaBabA\P
     abAbABaBx\P


+/^(..)\1/
+    aba\P
+
+/^(..)\1{2,3}x/
+    aba\P
+    ababa\P
+    ababa\P\P
+    abababx
+    ababababx  
+
+/^(..)\1{2,3}?x/
+    aba\P
+    ababa\P
+    ababa\P\P
+    abababx
+    ababababx  
+    
+/^(..)(\1{2,3})ab/
+    abababab
+
 /-- End of testinput2 --/


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2012-02-14 13:05:39 UTC (rev 915)
+++ code/trunk/testdata/testinput5    2012-02-15 09:50:53 UTC (rev 916)
@@ -732,4 +732,36 @@
     abaBabA\P
     abAbABaBx\P


+/^\X/8
+    A\P
+    A\P\P 
+    A\x{300}\x{301}\P
+    A\x{300}\x{301}\P\P  
+    A\x{301}\P
+    A\x{301}\P\P  
+    
+/^\X{2,3}/8
+    A\P
+    A\P\P 
+    AA\P
+    AA\P\P  
+    A\x{300}\x{301}\P
+    A\x{300}\x{301}\P\P  
+    A\x{300}\x{301}A\x{300}\x{301}\P
+    A\x{300}\x{301}A\x{300}\x{301}\P\P  
+
+/^\X{2}/8
+    AA\P
+    AA\P\P  
+    A\x{300}\x{301}A\x{300}\x{301}\P
+    A\x{300}\x{301}A\x{300}\x{301}\P\P  
+    
+/^\X+/8
+    AA\P
+    AA\P\P  
+
+/^\X+?Z/8
+    AA\P
+    AA\P\P 
+
 /-- End of testinput5 --/


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2012-02-14 13:05:39 UTC (rev 915)
+++ code/trunk/testdata/testoutput2    2012-02-15 09:50:53 UTC (rev 916)
@@ -12086,4 +12086,42 @@
  0: abAbABaBx
  1: ab


+/^(..)\1/
+    aba\P
+Partial match: aba
+
+/^(..)\1{2,3}x/
+    aba\P
+Partial match: aba
+    ababa\P
+Partial match: ababa
+    ababa\P\P
+Partial match: ababa
+    abababx
+ 0: abababx
+ 1: ab
+    ababababx  
+ 0: ababababx
+ 1: ab
+
+/^(..)\1{2,3}?x/
+    aba\P
+Partial match: aba
+    ababa\P
+Partial match: ababa
+    ababa\P\P
+Partial match: ababa
+    abababx
+ 0: abababx
+ 1: ab
+    ababababx  
+ 0: ababababx
+ 1: ab
+    
+/^(..)(\1{2,3})ab/
+    abababab
+ 0: abababab
+ 1: ab
+ 2: abab
+
 /-- End of testinput2 --/


Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2012-02-14 13:05:39 UTC (rev 915)
+++ code/trunk/testdata/testoutput5    2012-02-15 09:50:53 UTC (rev 916)
@@ -1726,4 +1726,58 @@
  0: abAbABaBx
  1: ab


+/^\X/8
+    A\P
+ 0: A
+    A\P\P 
+Partial match: A
+    A\x{300}\x{301}\P
+ 0: A\x{300}\x{301}
+    A\x{300}\x{301}\P\P  
+Partial match: A\x{300}\x{301}
+    A\x{301}\P
+ 0: A\x{301}
+    A\x{301}\P\P  
+Partial match: A\x{301}
+    
+/^\X{2,3}/8
+    A\P
+Partial match: A
+    A\P\P 
+Partial match: A
+    AA\P
+ 0: AA
+    AA\P\P  
+Partial match: AA
+    A\x{300}\x{301}\P
+Partial match: A\x{300}\x{301}
+    A\x{300}\x{301}\P\P  
+Partial match: A\x{300}\x{301}
+    A\x{300}\x{301}A\x{300}\x{301}\P
+ 0: A\x{300}\x{301}A\x{300}\x{301}
+    A\x{300}\x{301}A\x{300}\x{301}\P\P  
+Partial match: A\x{300}\x{301}A\x{300}\x{301}
+
+/^\X{2}/8
+    AA\P
+ 0: AA
+    AA\P\P  
+Partial match: AA
+    A\x{300}\x{301}A\x{300}\x{301}\P
+ 0: A\x{300}\x{301}A\x{300}\x{301}
+    A\x{300}\x{301}A\x{300}\x{301}\P\P  
+Partial match: A\x{300}\x{301}A\x{300}\x{301}
+    
+/^\X+/8
+    AA\P
+ 0: AA
+    AA\P\P  
+Partial match: AA
+
+/^\X+?Z/8
+    AA\P
+Partial match: AA
+    AA\P\P 
+Partial match: AA
+
 /-- End of testinput5 --/