[Pcre-svn] [979] code/trunk: Fix DFA bug (3 cases) when UTF …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [979] code/trunk: Fix DFA bug (3 cases) when UTF code was being obeyed in non-UTF mode.
Revision: 979
          http://vcs.pcre.org/viewvc?view=rev&revision=979
Author:   ph10
Date:     2012-06-17 20:08:41 +0100 (Sun, 17 Jun 2012)


Log Message:
-----------
Fix DFA bug (3 cases) when UTF code was being obeyed in non-UTF mode.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_dfa_exec.c
    code/trunk/testdata/testinput8
    code/trunk/testdata/testoutput8


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2012-06-17 16:55:07 UTC (rev 978)
+++ code/trunk/ChangeLog    2012-06-17 19:08:41 UTC (rev 979)
@@ -136,6 +136,15 @@
     the same checks as \x{...} characters in non-JavaScript mode. Specifically, 
     codepoints that are too big for the mode are faulted, and in a UTF mode, 
     disallowed codepoints are also faulted. 
+    
+39. If PCRE was compiled with UTF support, in three places in the DFA
+    matcher there was code that should only have been obeyed in UTF mode, but
+    was being obeyed unconditionally. In 8-bit mode this could cause incorrect
+    processing when bytes with values greater than 127 were present. In 16-bit
+    mode the bug would be provoked by values in the range 0xdc00 to 0xdfff. In
+    both cases the values are those that cannot be the first data item in a UTF
+    character. The three items that might have provoked this were recursions,
+    possessively repeated groups, and atomic groups.



Version 8.30 04-February-2012

Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c    2012-06-17 16:55:07 UTC (rev 978)
+++ code/trunk/pcre_dfa_exec.c    2012-06-17 19:08:41 UTC (rev 979)
@@ -38,7 +38,6 @@
 -----------------------------------------------------------------------------
 */


-
 /* This module contains the external function pcre_dfa_exec(), which is an
 alternative matching function that uses a sort of DFA algorithm (not a true
 FSM). This is NOT Perl-compatible, but it has advantages in certain
@@ -382,7 +381,8 @@
     next_new_state->count  = (y); \
     next_new_state->data   = (z); \
     next_new_state++; \
-    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
+    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
+      (x), (y), (z), __LINE__)); \
     } \
   else return PCRE_ERROR_DFA_WSSIZE


@@ -611,7 +611,7 @@

   if (ptr < end_subject)
     {
-    clen = 1;        /* Number of bytes in the character */
+    clen = 1;        /* Number of data items in the character */
 #ifdef SUPPORT_UTF
     if (utf) { GETCHARLEN(c, ptr, clen); } else
 #endif  /* SUPPORT_UTF */
@@ -789,7 +789,7 @@
             offsets[0] = (int)(current_subject - start_subject);
             offsets[1] = (int)(ptr - start_subject);
             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
-              offsets[1] - offsets[0], current_subject));
+              offsets[1] - offsets[0], (char *)current_subject));
             }
           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
             {
@@ -2797,9 +2797,12 @@
             {
             int charcount = local_offsets[rc+1] - local_offsets[rc];
 #ifdef SUPPORT_UTF
-            const pcre_uchar *p = start_subject + local_offsets[rc];
-            const pcre_uchar *pp = start_subject + local_offsets[rc+1];
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf)
+              { 
+              const pcre_uchar *p = start_subject + local_offsets[rc];
+              const pcre_uchar *pp = start_subject + local_offsets[rc+1];
+              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+              } 
 #endif
             if (charcount > 0)
               {
@@ -2898,7 +2901,7 @@
             const pcre_uchar *pp = local_ptr;
             charcount = (int)(pp - p);
 #ifdef SUPPORT_UTF
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
 #endif
             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
             }
@@ -2980,9 +2983,12 @@
           else
             {
 #ifdef SUPPORT_UTF
-            const pcre_uchar *p = start_subject + local_offsets[0];
-            const pcre_uchar *pp = start_subject + local_offsets[1];
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf)
+              { 
+              const pcre_uchar *p = start_subject + local_offsets[0];
+              const pcre_uchar *pp = start_subject + local_offsets[1];
+              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+              } 
 #endif
             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
             if (repeat_state_offset >= 0)


Modified: code/trunk/testdata/testinput8
===================================================================
--- code/trunk/testdata/testinput8    2012-06-17 16:55:07 UTC (rev 978)
+++ code/trunk/testdata/testinput8    2012-06-17 19:08:41 UTC (rev 979)
@@ -4787,4 +4787,15 @@
 /abcdef/
    abc\R


+/<H((?(?!<H|F>)(.)|(?R))++)*F>/
+    text <H more text <H texting more  hexA0-"\xA0"    hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.
+
+/^(?>.{4})abc|^\w\w.xabcd/
+    xxxxabcd
+    xx\xa0xabcd 
+
+/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/
+    xxxxxxxxabcd
+    xx\xa0xxxxxabcd 
+
 /-- End of testinput8 --/


Modified: code/trunk/testdata/testoutput8
===================================================================
--- code/trunk/testdata/testoutput8    2012-06-17 16:55:07 UTC (rev 978)
+++ code/trunk/testdata/testoutput8    2012-06-17 19:08:41 UTC (rev 979)
@@ -7996,4 +7996,24 @@
    abc\R
 Error -30 (invalid data in workspace for DFA restart)


+/<H((?(?!<H|F>)(.)|(?R))++)*F>/
+    text <H more text <H texting more  hexA0-"\xA0"    hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.
+ 0: <H more text <H texting more  hexA0-"\xa0"    hex above 7F-"\xbc" F> text xxxxx <H text F> text F>
+
+/^(?>.{4})abc|^\w\w.xabcd/
+    xxxxabcd
+ 0: xxxxabcd
+ 1: xxxxabc
+    xx\xa0xabcd 
+ 0: xx\xa0xabcd
+ 1: xx\xa0xabc
+
+/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/
+    xxxxxxxxabcd
+ 0: xxxxxxxxabcd
+ 1: xxxxxxxxabc
+    xx\xa0xxxxxabcd 
+ 0: xx\xa0xxxxxabcd
+ 1: xx\xa0xxxxxabc
+
 /-- End of testinput8 --/