[Pcre-svn] [1090] code/trunk: pcre32: pcretest: Don't conver…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1090] code/trunk: pcre32: pcretest: Don't convert data line to UTF-8 first
Revision: 1090
          http://vcs.pcre.org/viewvc?view=rev&revision=1090
Author:   chpe
Date:     2012-10-16 16:55:48 +0100 (Tue, 16 Oct 2012)


Log Message:
-----------
pcre32: pcretest: Don't convert data line to UTF-8 first

While reading the data lines, directly put them into the 8, 16 or
32 bit buffers instead of first converting them into UTF-8 and only
afterwards converting that buffer to 16/32 bit. This is necessary so
that in 32-bit mode the \x{} escapes can use the full 32-bit range
(while the non-standard 5/6-byte UTF-8 sequences can only express
characters up to 31 bits).

Modified Paths:
--------------
    code/trunk/pcre_internal.h
    code/trunk/pcretest.c
    code/trunk/testdata/testinput7
    code/trunk/testdata/testoutput18-16
    code/trunk/testdata/testoutput18-32
    code/trunk/testdata/testoutput26
    code/trunk/testdata/testoutput7


Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2012-10-16 15:55:45 UTC (rev 1089)
+++ code/trunk/pcre_internal.h    2012-10-16 15:55:48 UTC (rev 1090)
@@ -586,31 +586,10 @@


#else /* SUPPORT_UTF */

-#if defined COMPILE_PCRE8
-
-/* These macros were originally written in the form of loops that used data
-from the tables whose names start with PRIV(utf8_table). They were rewritten by
-a user so as not to use loops, because in some environments this gives a
-significant performance advantage, and it seems never to do any harm. */
-
-/* Tells the biggest code point which can be encoded as a single character. */
-
-#define MAX_VALUE_FOR_SINGLE_CHAR 127
-
/* Tests whether the code point needs extra characters to decode. */

-#define HAS_EXTRALEN(c) ((c) >= 0xc0)
+#define HASUTF8EXTRALEN(c) ((c) >= 0xc0)

-/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
-Otherwise it has an undefined behaviour. */
-
-#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
-
-/* Returns TRUE, if the given character is not the first character
-of a UTF sequence. */
-
-#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80)
-
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer. */

@@ -633,20 +612,6 @@
           ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
     }


-/* Get the next UTF-8 character, not advancing the pointer. This is called when
-we know we are in UTF-8 mode. */
-
-#define GETCHAR(c, eptr) \
- c = *eptr; \
- if (c >= 0xc0) GETUTF8(c, eptr);
-
-/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
-pointer. */
-
-#define GETCHARTEST(c, eptr) \
- c = *eptr; \
- if (utf && c >= 0xc0) GETUTF8(c, eptr);
-
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. */

@@ -681,6 +646,45 @@
       } \
     }


+#if defined COMPILE_PCRE8
+
+/* These macros were originally written in the form of loops that used data
+from the tables whose names start with PRIV(utf8_table). They were rewritten by
+a user so as not to use loops, because in some environments this gives a
+significant performance advantage, and it seems never to do any harm. */
+
+/* Tells the biggest code point which can be encoded as a single character. */
+
+#define MAX_VALUE_FOR_SINGLE_CHAR 127
+
+/* Tests whether the code point needs extra characters to decode. */
+
+#define HAS_EXTRALEN(c) ((c) >= 0xc0)
+
+/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
+Otherwise it has an undefined behaviour. */
+
+#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
+
+/* Returns TRUE, if the given character is not the first character
+of a UTF sequence. */
+
+#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80)
+
+/* Get the next UTF-8 character, not advancing the pointer. This is called when
+we know we are in UTF-8 mode. */
+
+#define GETCHAR(c, eptr) \
+ c = *eptr; \
+ if (c >= 0xc0) GETUTF8(c, eptr);
+
+/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
+pointer. */
+
+#define GETCHARTEST(c, eptr) \
+ c = *eptr; \
+ if (utf && c >= 0xc0) GETUTF8(c, eptr);
+
/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */


Modified: code/trunk/pcretest.c
===================================================================
--- code/trunk/pcretest.c    2012-10-16 15:55:45 UTC (rev 1089)
+++ code/trunk/pcretest.c    2012-10-16 15:55:48 UTC (rev 1090)
@@ -831,7 +831,6 @@


static int buffer_size = 50000;
static pcre_uint8 *buffer = NULL;
-static pcre_uint8 *dbuffer = NULL;
static pcre_uint8 *pbuffer = NULL;

 /* Another buffer is needed translation to 16/32-bit character strings. It will
@@ -1666,10 +1665,9 @@
     {
     int new_buffer_size = 2*buffer_size;
     pcre_uint8 *new_buffer = (pcre_uint8 *)malloc(new_buffer_size);
-    pcre_uint8 *new_dbuffer = (pcre_uint8 *)malloc(new_buffer_size);
     pcre_uint8 *new_pbuffer = (pcre_uint8 *)malloc(new_buffer_size);


-    if (new_buffer == NULL || new_dbuffer == NULL || new_pbuffer == NULL)
+    if (new_buffer == NULL || new_pbuffer == NULL)
       {
       fprintf(stderr, "pcretest: malloc(%d) failed\n", new_buffer_size);
       exit(1);
@@ -1684,11 +1682,9 @@
     here = new_buffer + (here - buffer);


     free(buffer);
-    free(dbuffer);
     free(pbuffer);


     buffer = new_buffer;
-    dbuffer = new_dbuffer;
     pbuffer = new_pbuffer;
     }
   }
@@ -2719,6 +2715,8 @@
 int verify_jit = 0;
 int yield = 0;
 int stack_size;
+pcre_uint8 *dbuffer = NULL;
+size_t dbuffer_size = 1u << 14;


#if !defined NOPOSIX
int posix = 0;
@@ -2762,7 +2760,6 @@
and 32-bit buffers (buffer16, buffer32) are obtained only if needed. */

buffer = (pcre_uint8 *)malloc(buffer_size);
-dbuffer = (pcre_uint8 *)malloc(buffer_size);
pbuffer = (pcre_uint8 *)malloc(buffer_size);

/* The outfile variable is static so that new_malloc can use it. */
@@ -4060,7 +4057,15 @@

   for (;;)
     {
-    pcre_uint8 *q;
+#ifdef SUPPORT_PCRE8
+    pcre_uint8 *q8;
+#endif
+#ifdef SUPPORT_PCRE16
+    pcre_uint16 *q16;
+#endif
+#ifdef SUPPORT_PCRE32
+    pcre_uint32 *q32;
+#endif
     pcre_uint8 *bptr;
     int *use_offsets = offsets;
     int use_size_offsets = size_offsets;
@@ -4132,7 +4137,47 @@
     p = buffer;
     while (isspace(*p)) p++;


-    bptr = q = dbuffer;
+#ifndef NOUTF
+    /* Check that the data is well-formed UTF-8 if we're in UTF mode. To create
+       invalid input to pcre_exec, you must use \x?? or \x{} sequences. */
+    if (use_utf)
+      {
+      char *q;
+      pcre_uint32 c;
+      int n = 1;
+
+      for (q = p; n > 0 && *q; q += n) n = utf82ord(q, &c);
+      if (n <= 0)
+        {
+        fprintf(outfile, "**Failed: invalid UTF-8 string cannot be used as input in UTF mode\n");
+        goto NEXT_DATA;
+        }
+      }
+#endif
+
+    /* Allocate a buffer to hold the data line. len+1 is an upper bound on
+       the number of pcre_uchar units that will be needed. */
+    if (dbuffer == NULL || len >= dbuffer_size)
+      {
+      dbuffer_size *= 2;
+      dbuffer = (pcre_uint8 *)realloc(dbuffer, dbuffer_size * CHAR_SIZE);
+      if (dbuffer == NULL)
+        {
+        fprintf(stderr, "pcretest: malloc(%d) failed\n", dbuffer_size);
+        exit(1);
+        }
+      }
+
+#ifdef SUPPORT_PCRE8
+    q8 = (pcre_uint8 *) dbuffer;
+#endif
+#ifdef SUPPORT_PCRE16
+    q16 = (pcre_uint16 *) dbuffer;
+#endif
+#ifdef SUPPORT_PCRE32
+    q32 = (pcre_uint32 *) dbuffer;
+#endif
+
     while ((c = *p++) != 0)
       {
       int i = 0;
@@ -4145,11 +4190,9 @@


       if (c != '\\')
         {
-        if (use_utf)
-          {
-          *q++ = c;
-          continue;
-          }
+#ifndef NOUTF
+        if (use_utf && HASUTF8EXTRALEN(c)) { GETUTF8INC(c, p); }
+#endif
         }


       /* Handle backslash escapes */
@@ -4210,11 +4253,13 @@
           c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'a' - 10);
           p++;
           }
-        if (use_utf)
+#if !defined NOUTF && defined SUPPORT_PCRE8
+        if (use_utf && (pcre_mode == PCRE8_MODE))
           {
-          *q++ = c;
+          *q8++ = c;
           continue;
           }
+#endif
         break;


         case 0:   /* \ followed by EOF allows for an empty line */
@@ -4427,48 +4472,114 @@
       than 127       in UTF mode must have come from \x{...} or octal constructs
       because values from \x.. get this far only in non-UTF mode. */


-#if !defined NOUTF || defined SUPPORT_PCRE16 || defined SUPPORT_PCRE32
-      if (pcre_mode != PCRE8_MODE || use_utf)
+#ifdef SUPPORT_PCRE8
+      if (pcre_mode == PCRE8_MODE)
         {
-        pcre_uint8 buff8[8];
-        int ii, utn;
-        utn = ord2utf8(c, buff8);
-        for (ii = 0; ii < utn; ii++) *q++ = buff8[ii];
+#ifndef NOUTF
+        if (use_utf)
+          {
+          q8 += ord2utf8(c, q8);
+          }
+        else
+#endif
+          {
+          if (c > 0xffu)
+            {
+            fprintf(outfile, "** Character \\x{%x} is greater than 255 "
+              "and UTF-8 mode is not enabled.\n", c);
+            fprintf(outfile, "** Truncation will probably give the wrong "
+              "result.\n");
+            }
+
+          *q8++ = c;
+          }
         }
-      else
 #endif
+#ifdef SUPPORT_PCRE16
+      if (pcre_mode == PCRE16_MODE)
         {
-        if (c > 255)
+#ifndef NOUTF
+        if (use_utf)
           {
-          fprintf(outfile, "** Character \\x{%x} is greater than 255 "
-            "and UTF-8 mode is not enabled.\n", c);
-          fprintf(outfile, "** Truncation will probably give the wrong "
-            "result.\n");
+          if (c > 0x10ffffu)
+            {
+            fprintf(outfile, "**Failed: character value greater than 0x10ffff "
+              "cannot be converted to UTF-16\n");
+            goto NEXT_DATA;
+            }
+          else if (c >= 0x10000u)
+            {
+            c-= 0x10000u;
+            *q16++ = 0xD800 | (c >> 10);
+            *q16++ = 0xDC00 | (c & 0x3ff);
+            }
+          else
+            *q16++ = c;
           }
-        *q++ = c;
+        else 
+#endif
+          {
+          if (c > 0xffffu)
+            {
+            fprintf(outfile, "** Character value is greater than 0xffff "
+              "and UTF-16 mode is not enabled.\n", c);
+            fprintf(outfile, "** Truncation will probably give the wrong "
+              "result.\n");
+            }
+
+          *q16++ = c;
+          }
         }
+#endif
+#ifdef SUPPORT_PCRE32
+      if (pcre_mode == PCRE32_MODE)
+        {
+        *q32++ = c;
+        }
+#endif
+
       }


     /* Reached end of subject string */


-    *q = 0;
-    len = (int)(q - dbuffer);
+#ifdef SUPPORT_PCRE8
+    if (pcre_mode == PCRE8_MODE)
+    {
+      *q8 = 0;
+      len = (int)(q8 - (pcre_uint8 *)dbuffer);
+    }
+#endif
+#ifdef SUPPORT_PCRE16
+    if (pcre_mode == PCRE16_MODE)
+    {
+      *q16 = 0;
+      len = (int)(q16 - (pcre_uint16 *)dbuffer);
+    }
+#endif
+#ifdef SUPPORT_PCRE32
+    if (pcre_mode == PCRE32_MODE)
+    {
+      *q32 = 0;
+      len = (int)(q32 - (pcre_uint32 *)dbuffer);
+    }
+#endif


     /* Move the data to the end of the buffer so that a read over the end of
     the buffer will be seen by valgrind, even if it doesn't cause a crash. If
     we are using the POSIX interface, we must include the terminating zero. */


+    bptr = dbuffer;
+
 #if !defined NOPOSIX
     if (posix || do_posix)
       {
-      memmove(bptr + buffer_size - len - 1, bptr, len + 1);
-      bptr += buffer_size - len - 1;
+      memmove(bptr + dbuffer_size - len - 1, bptr, len + 1);
+      bptr += dbuffer_size - len - 1;
       }
     else
 #endif
       {
-      memmove(bptr + buffer_size - len, bptr, len);
-      bptr += buffer_size - len;
+      bptr = memmove(bptr + (dbuffer_size - len) * CHAR_SIZE, bptr, len * CHAR_SIZE);
       }


     if ((all_use_dfa || use_dfa) && find_match_limit)
@@ -4532,61 +4643,6 @@


     /* Handle matching via the native interface - repeats for /g and /G */


-#ifdef SUPPORT_PCRE16
-    if (pcre_mode == PCRE16_MODE)
-      {
-      len = to16(TRUE, bptr, REAL_PCRE_OPTIONS(re) & PCRE_UTF8, len);
-      switch(len)
-        {
-        case -1:
-        fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
-          "converted to UTF-16\n");
-        goto NEXT_DATA;
-
-        case -2:
-        fprintf(outfile, "**Failed: character value greater than 0x10ffff "
-          "cannot be converted to UTF-16\n");
-        goto NEXT_DATA;
-
-        case -3:
-        fprintf(outfile, "**Failed: character value greater than 0xffff "
-          "cannot be converted to 16-bit in non-UTF mode\n");
-        goto NEXT_DATA;
-
-        default:
-        break;
-        }
-      bptr = (pcre_uint8 *)buffer16;
-      }
-#endif
-
-#ifdef SUPPORT_PCRE32
-    if (pcre_mode == PCRE32_MODE)
-      {
-      len = to32(TRUE, bptr, REAL_PCRE_OPTIONS(re) & PCRE_UTF32, len);
-      switch(len)
-        {
-        case -1:
-        fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
-          "converted to UTF-32\n");
-        goto NEXT_DATA;
-
-        case -2:
-        fprintf(outfile, "**Failed: character value greater than 0x10ffff "
-          "cannot be converted to UTF-32\n");
-        goto NEXT_DATA;
-
-        case -3:
-        fprintf(outfile, "**Failed: character value is ill-formed UTF-32\n");
-        goto NEXT_DATA;
-
-        default:
-        break;
-        }
-      bptr = (pcre_uint8 *)buffer32;
-      }
-#endif
-
     /* Ensure that there is a JIT callback if we want to verify that JIT was
     actually used. If jit_stack == NULL, no stack has yet been assigned. */



Modified: code/trunk/testdata/testinput7
===================================================================
--- code/trunk/testdata/testinput7    2012-10-16 15:55:45 UTC (rev 1089)
+++ code/trunk/testdata/testinput7    2012-10-16 15:55:48 UTC (rev 1090)
@@ -89,7 +89,7 @@
 /(\p{Yi}{0,3}+\277)*/


 /\p{Zl}{2,3}+/8BZ
-    \xe2\x80\xa8\xe2\x80\xa8
+    


     \x{2028}\x{2028}\x{2028}


/\p{Zl}/8BZ

Modified: code/trunk/testdata/testoutput18-16
===================================================================
--- code/trunk/testdata/testoutput18-16    2012-10-16 15:55:45 UTC (rev 1089)
+++ code/trunk/testdata/testoutput18-16    2012-10-16 15:55:48 UTC (rev 1090)
@@ -6,7 +6,7 @@


 /abc/8
     \xC3]
-**Failed: invalid UTF-8 string cannot be converted to UTF-16
+**Failed: invalid UTF-8 string cannot be used as input in UTF mode


 /X(\C{3})/8
     X\x{11234}Y


Modified: code/trunk/testdata/testoutput18-32
===================================================================
--- code/trunk/testdata/testoutput18-32    2012-10-16 15:55:45 UTC (rev 1089)
+++ code/trunk/testdata/testoutput18-32    2012-10-16 15:55:48 UTC (rev 1090)
@@ -6,7 +6,7 @@


 /abc/8
     \xC3]
-**Failed: invalid UTF-8 string cannot be converted to UTF-32
+**Failed: invalid UTF-8 string cannot be used as input in UTF mode


 /X(\C{3})/8
     X\x{11234}Y
@@ -629,7 +629,7 @@
     \x{dfff}\?
 No match
     \x{110000}
-**Failed: character value greater than 0x10ffff cannot be converted to UTF-32
+Error -10 (bad UTF-32 string) offset=0 reason=3
     \x{d800}\x{1234}
 Error -10 (bad UTF-32 string) offset=0 reason=1
     \x{fffe}


Modified: code/trunk/testdata/testoutput26
===================================================================
--- code/trunk/testdata/testoutput26    2012-10-16 15:55:45 UTC (rev 1089)
+++ code/trunk/testdata/testoutput26    2012-10-16 15:55:48 UTC (rev 1090)
@@ -7,6 +7,6 @@


 /\C/8
     \x{110000}
-**Failed: character value greater than 0x10ffff cannot be converted to UTF-32
+Error -10 (bad UTF-32 string) offset=0 reason=3


/-- End of testinput26 --/

Modified: code/trunk/testdata/testoutput7
===================================================================
--- code/trunk/testdata/testoutput7    2012-10-16 15:55:45 UTC (rev 1089)
+++ code/trunk/testdata/testoutput7    2012-10-16 15:55:48 UTC (rev 1090)
@@ -233,7 +233,7 @@
         Ket
         End
 ------------------------------------------------------------------
-    \xe2\x80\xa8\xe2\x80\xa8
+    


  0: \x{2028}\x{2028}
     \x{2028}\x{2028}\x{2028}
  0: \x{2028}\x{2028}\x{2028}