[Pcre-svn] [606] code/trunk: Tidy the API for _pcre_valid_ut…

トップ ページ
このメッセージを削除
著者: Subversion repository
日付:  
To: pcre-svn
題目: [Pcre-svn] [606] code/trunk: Tidy the API for _pcre_valid_utf8() to a more suitable form for a future public
Revision: 606
          http://vcs.pcre.org/viewvc?view=rev&revision=606
Author:   ph10
Date:     2011-06-06 18:46:22 +0100 (Mon, 06 Jun 2011)


Log Message:
-----------
Tidy the API for _pcre_valid_utf8() to a more suitable form for a future public
release. Also make -s in pcretest force a study for every regex.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/doc/pcretest.1
    code/trunk/pcre_compile.c
    code/trunk/pcre_dfa_exec.c
    code/trunk/pcre_exec.c
    code/trunk/pcre_valid_utf8.c
    code/trunk/pcretest.c


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/ChangeLog    2011-06-06 17:46:22 UTC (rev 606)
@@ -74,6 +74,13 @@
     opcodes that mean there is no starting character; this means that when new 
     ones are added and accidentally left out of pcre_study(), testing should 
     pick them up.
+    
+14. The -s option of pcretest has been documented for ages as being an old 
+    synonym of -m (show memory usage). I have changed it to mean "force study 
+    for every regex", that is, assume /S for every regex. This is similar to -i 
+    and -d etc. It's slightly incompatible, but I'm hoping nobody is still 
+    using it. It makes it easier to run a collection of tests with study enabled, 
+    and thereby test pcre_study() more easily.  



Version 8.12 15-Jan-2011

Modified: code/trunk/doc/pcretest.1
===================================================================
--- code/trunk/doc/pcretest.1    2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/doc/pcretest.1    2011-06-06 17:46:22 UTC (rev 606)
@@ -56,8 +56,7 @@
 .TP 10
 \fB-m\fP
 Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding \fB/M\fP to each regular expression. For compatibility
-with earlier versions of pcretest, \fB-s\fP is a synonym for \fB-m\fP.
+equivalent to adding \fB/M\fP to each regular expression.
 .TP 10
 \fB-o\fP \fIosize\fP
 Set the number of elements in the output vector that is used when calling
@@ -79,6 +78,10 @@
 On Unix-like systems, set the size of the run-time stack to \fIsize\fP
 megabytes.
 .TP 10
+\fB-s\fP
+Behave as if each regex has the \fB/S\fP modifier; in other words, force each 
+regex to be studied.
+.TP 10
 \fB-t\fP
 Run each compile, study, and match many times with a timer, and output
 resulting time per compile or match (in milliseconds). Do not set \fB-m\fP with
@@ -789,6 +792,6 @@
 .rs
 .sp
 .nf
-Last updated: 07 May 2011
+Last updated: 06 June 2011
 Copyright (c) 1997-2011 University of Cambridge.
 .fi


Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcre_compile.c    2011-06-06 17:46:22 UTC (rev 606)
@@ -6976,13 +6976,12 @@


/* Can't support UTF8 unless PCRE has been compiled to include the code. The
return of an error code from _pcre_valid_utf8() is a new feature, introduced in
-release 8.13. The only use we make of it here is to adjust the offset value to
-the end of the string for a short string error, for compatibility with previous
-versions. */
+release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
+not used here. */

 #ifdef SUPPORT_UTF8
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
-     (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1, &errorcode)) >= 0)
+     (errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0)
   {
   errorcode = ERR44;
   goto PCRE_EARLY_ERROR_RETURN2;


Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c    2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcre_dfa_exec.c    2011-06-06 17:46:22 UTC (rev 606)
@@ -3123,23 +3123,21 @@
 #ifdef SUPPORT_UTF8
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
-  int errorcode; 
-  int tb = _pcre_valid_utf8((uschar *)subject, length, &errorcode);
-  if (tb >= 0)
+  int erroroffset; 
+  int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
+  if (errorcode != 0)
     {
     if (offsetcount >= 2)
       {
-      offsets[0] = tb;
+      offsets[0] = erroroffset;
       offsets[1] = errorcode;
       }    
     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
     }  
-  if (start_offset > 0 && start_offset < length)
-    {
-    tb = ((USPTR)subject)[start_offset] & 0xc0;
-    if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
-    }
+  if (start_offset > 0 && start_offset < length &&
+        (((USPTR)subject)[start_offset] & 0xc0) == 0x80) 
+    return PCRE_ERROR_BADUTF8_OFFSET;
   }
 #endif



Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcre_exec.c    2011-06-06 17:46:22 UTC (rev 606)
@@ -5999,28 +5999,29 @@
   return PCRE_ERROR_BADPARTIAL;


/* Check a UTF-8 string if required. Pass back the character offset and error
-code if a results vector is available. */
+code for an invalid string if a results vector is available. */

 #ifdef SUPPORT_UTF8
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
-  int errorcode; 
-  int tb = _pcre_valid_utf8((USPTR)subject, length, &errorcode);
-  if (tb >= 0)
+  int erroroffset; 
+  int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
+  if (errorcode != 0)
     {
     if (offsetcount >= 2)
       {
-      offsets[0] = tb;
+      offsets[0] = erroroffset;
       offsets[1] = errorcode;
       }    
     return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
-    }   
-  if (start_offset > 0 && start_offset < length)
-    {
-    tb = ((USPTR)subject)[start_offset] & 0xc0;
-    if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
-    }
+    } 
+
+  /* Check that a start_offset points to the start of a UTF-8 character. */
+        
+  if (start_offset > 0 && start_offset < length &&
+      (((USPTR)subject)[start_offset] & 0xc0) == 0x80) 
+    return PCRE_ERROR_BADUTF8_OFFSET;
   }
 #endif



Modified: code/trunk/pcre_valid_utf8.c
===================================================================
--- code/trunk/pcre_valid_utf8.c    2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcre_valid_utf8.c    2011-06-06 17:46:22 UTC (rev 606)
@@ -68,7 +68,7 @@
 characters is still checked.


From release 8.13 more information about the details of the error is passed
-back in the error code:
+back in the returned value:

 PCRE_UTF8_ERR0   No error
 PCRE_UTF8_ERR1   Missing 1 byte at the end of the string
@@ -96,14 +96,14 @@
 Arguments:
   string       points to the string
   length       length of string, or -1 if the string is zero-terminated
-  errp         pointer to an error code variable 
+  erroroffset  pointer to an error position offset variable 


-Returns:       < 0    if the string is a valid UTF-8 string
-               >= 0   otherwise; the value is the offset of the bad character
+Returns:       = 0    if the string is a valid UTF-8 string
+               > 0    otherwise, setting the offset of the bad character
 */


int
-_pcre_valid_utf8(USPTR string, int length, int *errorcode)
+_pcre_valid_utf8(USPTR string, int length, int *erroroffset)
{
#ifdef SUPPORT_UTF8
register USPTR p;
@@ -114,8 +114,6 @@
length = p - string;
}

-*errorcode = PCRE_UTF8_ERR0;
-
for (p = string; length-- > 0; p++)
{
register int ab, c, d;
@@ -125,21 +123,21 @@

   if (c < 0xc0)                         /* Isolated 10xx xxxx byte */
     {
-    *errorcode = PCRE_UTF8_ERR20; 
-    return p - string;
+    *erroroffset = p - string;
+    return PCRE_UTF8_ERR20; 
     } 


   if (c >= 0xfe)                        /* Invalid 0xfe or 0xff bytes */
     {
-    *errorcode = PCRE_UTF8_ERR21; 
-    return p - string;
+    *erroroffset = p - string;
+    return PCRE_UTF8_ERR21; 
     } 


   ab = _pcre_utf8_table4[c & 0x3f];     /* Number of additional bytes */
   if (length < ab) 
     {
-    *errorcode = ab - length;           /* Codes ERR1 to ERR5 */
-    return p - string;                  /* Missing bytes */
+    *erroroffset = p - string;          /* Missing bytes */
+    return ab - length;                 /* Codes ERR1 to ERR5 */
     } 
   length -= ab;                         /* Length remaining */


@@ -147,8 +145,8 @@

   if (((d = *(++p)) & 0xc0) != 0x80) 
     {
-    *errorcode = PCRE_UTF8_ERR6; 
-    return p - string - 1;
+    *erroroffset = p - string - 1;
+    return PCRE_UTF8_ERR6; 
     } 


/* For each length, check that the remaining bytes start with the 0x80 bit
@@ -162,8 +160,8 @@

     case 1: if ((c & 0x3e) == 0)  
       {
-      *errorcode = PCRE_UTF8_ERR15;   
-      return p - string - 1;  
+      *erroroffset = p - string - 1;  
+      return PCRE_UTF8_ERR15;   
       } 
     break; 


@@ -174,18 +172,18 @@
     case 2:
     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
       {
-      *errorcode = PCRE_UTF8_ERR7;
-      return p - string - 2;   
+      *erroroffset = p - string - 2;   
+      return PCRE_UTF8_ERR7;
       } 
     if (c == 0xe0 && (d & 0x20) == 0)
       {
-      *errorcode = PCRE_UTF8_ERR16; 
-      return p - string - 2;
+      *erroroffset = p - string - 2;
+      return PCRE_UTF8_ERR16; 
       } 
     if (c == 0xed && d >= 0xa0)
       {
-      *errorcode = PCRE_UTF8_ERR14;  
-      return p - string - 2;
+      *erroroffset = p - string - 2;
+      return PCRE_UTF8_ERR14;  
       } 
     break;


@@ -196,23 +194,23 @@
     case 3:
     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
       {
-      *errorcode = PCRE_UTF8_ERR7;
-      return p - string - 2;   
+      *erroroffset = p - string - 2;   
+      return PCRE_UTF8_ERR7;
       } 
     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
       {
-      *errorcode = PCRE_UTF8_ERR8;
-      return p - string - 3;   
+      *erroroffset = p - string - 3;   
+      return PCRE_UTF8_ERR8;
       } 
     if (c == 0xf0 && (d & 0x30) == 0)
       {
-      *errorcode = PCRE_UTF8_ERR17;  
-      return p - string - 3;
+      *erroroffset = p - string - 3;
+      return PCRE_UTF8_ERR17;  
       } 
     if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
       {
-      *errorcode = PCRE_UTF8_ERR13;  
-      return p - string - 3;
+      *erroroffset = p - string - 3;
+      return PCRE_UTF8_ERR13;  
       }
     break;


@@ -227,23 +225,23 @@
     case 4: 
     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
       {
-      *errorcode = PCRE_UTF8_ERR7;
-      return p - string - 2;   
+      *erroroffset = p - string - 2;   
+      return PCRE_UTF8_ERR7;
       } 
     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
       {
-      *errorcode = PCRE_UTF8_ERR8;
-      return p - string - 3;   
+      *erroroffset = p - string - 3;   
+      return PCRE_UTF8_ERR8;
       } 
     if ((*(++p) & 0xc0) != 0x80)     /* Fifth byte */
       {
-      *errorcode = PCRE_UTF8_ERR9;
-      return p - string - 4;   
+      *erroroffset = p - string - 4;   
+      return PCRE_UTF8_ERR9;
       } 
     if (c == 0xf8 && (d & 0x38) == 0) 
       {
-      *errorcode = PCRE_UTF8_ERR18; 
-      return p - string - 4;
+      *erroroffset = p - string - 4;
+      return PCRE_UTF8_ERR18; 
       } 
     break;


@@ -253,28 +251,28 @@
     case 5:
     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
       {
-      *errorcode = PCRE_UTF8_ERR7;
-      return p - string - 2;   
+      *erroroffset = p - string - 2;   
+      return PCRE_UTF8_ERR7;
       } 
     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
       {
-      *errorcode = PCRE_UTF8_ERR8;
-      return p - string - 3;   
+      *erroroffset = p - string - 3;   
+      return PCRE_UTF8_ERR8;
       } 
     if ((*(++p) & 0xc0) != 0x80)     /* Fifth byte */
       {
-      *errorcode = PCRE_UTF8_ERR9;
-      return p - string - 4;   
+      *erroroffset = p - string - 4;   
+      return PCRE_UTF8_ERR9;
       } 
     if ((*(++p) & 0xc0) != 0x80)     /* Sixth byte */
       {
-      *errorcode = PCRE_UTF8_ERR10;
-      return p - string - 5;   
+      *erroroffset = p - string - 5;   
+      return PCRE_UTF8_ERR10;
       } 
     if (c == 0xfc && (d & 0x3c) == 0) 
       {
-      *errorcode = PCRE_UTF8_ERR19; 
-      return p - string - 5;
+      *erroroffset = p - string - 5;
+      return PCRE_UTF8_ERR19; 
       } 
     break;
     }
@@ -285,8 +283,8 @@


   if (ab > 3) 
     {
-    *errorcode = (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12; 
-    return p - string - ab;
+    *erroroffset = p - string - ab;
+    return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12; 
     } 
   }


@@ -295,7 +293,7 @@
(void)(length);
#endif

-return -1; /* This indicates success */
+return PCRE_UTF8_ERR0; /* This indicates success */
}

/* End of pcre_valid_utf8.c */

Modified: code/trunk/pcretest.c
===================================================================
--- code/trunk/pcretest.c    2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcretest.c    2011-06-06 17:46:22 UTC (rev 606)
@@ -1186,7 +1186,7 @@
 #endif
 printf("  -q       quiet: do not output PCRE version number at start\n");
 printf("  -S <n>   set stack size to <n> megabytes\n");
-printf("  -s       output store (memory) used information\n"
+printf("  -s       force each pattern to be studied\n"
        "  -t       time compilation and execution\n");
 printf("  -t <n>   time compilation and execution, repeating <n> times\n");
 printf("  -tm      time execution (matching) only\n");
@@ -1214,6 +1214,7 @@
 int timeitm = 0;
 int showinfo = 0;
 int showstore = 0;
+int force_study = 0;
 int quiet = 0;
 int size_offsets = 45;
 int size_offsets_max;
@@ -1262,8 +1263,8 @@
   {
   unsigned char *endptr;


-  if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
-    showstore = 1;
+  if (strcmp(argv[op], "-m") == 0) showstore = 1;
+  else if (strcmp(argv[op], "-s") == 0) force_study = 1; 
   else if (strcmp(argv[op], "-q") == 0) quiet = 1;
   else if (strcmp(argv[op], "-b") == 0) debug = 1;
   else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
@@ -1807,10 +1808,10 @@
     true_size = ((real_pcre *)re)->size;
     regex_gotten_store = gotten_store;


-    /* If /S was present, study the regexp to generate additional info to
+    /* If -s or /S was present, study the regexp to generate additional info to
     help with the matching. */


-    if (do_study)
+    if (do_study || force_study)
       {
       if (timeit > 0)
         {
@@ -2050,7 +2051,7 @@
       so messes up the test suite. (And with the /F option, it might be
       flipped.) */


-      if (do_study)
+      if (do_study || force_study)
         {
         if (extra == NULL)
           fprintf(outfile, "Study returned NULL\n");