[Pcre-svn] [647] code/trunk: Improve class handling for \H a…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [647] code/trunk: Improve class handling for \H and \V by ignoring caseless.
Revision: 647
          http://www.exim.org/viewvc/pcre2?view=rev&revision=647
Author:   ph10
Date:     2017-01-03 18:17:31 +0000 (Tue, 03 Jan 2017)
Log Message:
-----------
Improve class handling for \H and \V by ignoring caseless.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_compile.c
    code/trunk/src/pcre2_intmodedep.h
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2017-01-03 11:35:26 UTC (rev 646)
+++ code/trunk/ChangeLog    2017-01-03 18:17:31 UTC (rev 647)
@@ -309,7 +309,12 @@
 47. Detecting patterns that are too large inside the length-measuring loop
 saves processing ridiculously long patterns to their end.


+48. Ignore PCRE2_CASELESS when processing \h, \H, \v, and \V in classes as it
+just wastes time. In the UTF case it can also produce redundant entries in
+XCLASS lists caused by characters with multiple other cases and pairs of
+characters in the same "not-x" sublists.

+
Version 10.22 29-July-2016
--------------------------


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2017-01-03 11:35:26 UTC (rev 646)
+++ code/trunk/src/pcre2_compile.c    2017-01-03 18:17:31 UTC (rev 647)
@@ -117,7 +117,7 @@
 /* Function definitions to allow mutual recursion */


 static unsigned int
-  add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
+  add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
     const uint32_t *, unsigned int);


static int
@@ -4219,13 +4219,15 @@


 /*************************************************
-*        Add a character or range to a class     *
+* Add a character or range to a class (internal) *
 *************************************************/


/* This function packages up the logic of adding a character or range of
characters to a class. The character values in the arguments will be within the
-valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
-mutually recursive with the function immediately below.
+valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
+called only from within the "add to class" group of functions, some of which
+are recursive and mutually recursive. The external entry point is
+add_to_class().

 Arguments:
   classbits     the bit map for characters < 256
@@ -4240,8 +4242,8 @@
 */


 static unsigned int
-add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
-  compile_block *cb, uint32_t start, uint32_t end)
+add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, 
+  uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
 {
 uint32_t c;
 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
@@ -4267,12 +4269,12 @@
       {
       /* Handle a single character that has more than one other case. */


-      if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
+      if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
         PRIV(ucd_caseless_sets) + rc, oc);


       /* Do nothing if the other case range is within the original range. */


-      else if (oc >= start && od <= end) continue;
+      else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;


       /* Extend the original range if there is overlap, noting that if oc < c, we
       can't have od > end because a subrange is always shorter than the basic
@@ -4284,7 +4286,7 @@
         end = od;       /* Extend upwards */
         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
         }
-      else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
+      else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
       }
     }
   else
@@ -4299,12 +4301,14 @@
     }
   }


-/* Now handle the original range. Adjust the final value according to the bit
-length - this means that the same lists of (e.g.) horizontal spaces can be used
-in all cases. */
+/* Now handle the originally supplied range. Adjust the final value according
+to the bit length - this means that the same lists of (e.g.) horizontal spaces
+can be used in all cases. */

if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
end = MAX_NON_UTF_CHAR;
+
+if (start > cb->class_range_start && end < cb->class_range_end) return n8;

/* Use the bitmap for characters < 256. Otherwise use extra data.*/

@@ -4357,10 +4361,10 @@
     *uchardata++ = XCL_SINGLE;
     *uchardata++ = start;
     }
-#endif
+#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
   *uchardptr = uchardata;   /* Updata extra data pointer */
   }
-#else
+#else  /* SUPPORT_WIDE_CHARS */
   (void)uchardptr;          /* Avoid compiler warning */
 #endif /* SUPPORT_WIDE_CHARS */


@@ -4370,14 +4374,15 @@


 /*************************************************
-*        Add a list of characters to a class     *
+* Add a list of characters to a class (internal) *
 *************************************************/


/* This function is used for adding a list of case-equivalent characters to a
class, and also for adding a list of horizontal or vertical whitespace. If the
list is in order (which it should be), ranges of characters are detected and
-handled appropriately. This function is mutually recursive with the function
-above.
+handled appropriately. This function is called (sometimes recursively) only
+from within the "add to class" set of functions. The external entry point is
+add_list_to_class().

 Arguments:
   classbits     the bit map for characters < 256
@@ -4394,6 +4399,76 @@
 */


 static unsigned int
+add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, 
+  uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
+{
+unsigned int n8 = 0;
+while (p[0] < NOTACHAR)
+  {
+  unsigned int n = 0;
+  if (p[0] != except)
+    {
+    while(p[n+1] == p[0] + n + 1) n++;
+    n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
+    }
+  p += n + 1;
+  }
+return n8;
+}
+
+
+
+/*************************************************
+*   External entry point for add range to class  *
+*************************************************/
+
+/* This function sets the overall range so that the internal functions can try 
+to avoid duplication when handling case-independence.
+
+Arguments:
+  classbits     the bit map for characters < 256
+  uchardptr     points to the pointer for extra data
+  options       the options word
+  cb            compile data
+  start         start of range character
+  end           end of range character
+
+Returns:        the number of < 256 characters added
+                the pointer to extra data is updated
+*/
+
+static unsigned int
+add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
+  compile_block *cb, uint32_t start, uint32_t end)
+{
+cb->class_range_start = start;
+cb->class_range_end = end;
+return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
+}
+
+
+/*************************************************
+*   External entry point for add list to class   *
+*************************************************/
+
+/* This function sets the overall range so that the internal functions can try 
+to avoid duplication when handling case-independence.
+
+Arguments:
+  classbits     the bit map for characters < 256
+  uchardptr     points to the pointer for extra data
+  options       the options word
+  cb            contains pointers to tables etc.
+  p             points to row of 32-bit values, terminated by NOTACHAR
+  except        character to omit; this is used when adding lists of
+                  case-equivalent characters to avoid including the one we
+                  already know about
+
+Returns:        the number of < 256 characters added
+                the pointer to extra data is updated
+*/
+
+static unsigned int
 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
   compile_block *cb, const uint32_t *p, unsigned int except)
 {
@@ -4404,7 +4479,9 @@
   if (p[0] != except)
     {
     while(p[n+1] == p[0] + n + 1) n++;
-    n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]);
+    cb->class_range_start = p[0];
+    cb->class_range_end = p[n];  
+    n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
     }
   p += n + 1;
   }
@@ -5071,25 +5148,31 @@
           should_flip_negation = TRUE;
           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
           break;
+          
+          /* When adding the horizontal or vertical space lists to a class, or 
+          their complements, disable PCRE2_CASELESS, because it justs wastes 
+          time, and in the "not-x" UTF cases can create unwanted duplicates in 
+          the XCLASS list (provoked by characters that have more than one other 
+          case and by both cases being in the same "not-x" sublist). */


           case ESC_h:
-          (void)add_list_to_class(classbits, &class_uchardata, options, cb,
-            PRIV(hspace_list), NOTACHAR);
+          (void)add_list_to_class(classbits, &class_uchardata, 
+            options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
           break;


           case ESC_H:
-          (void)add_not_list_to_class(classbits, &class_uchardata, options,
-            cb, PRIV(hspace_list));
+          (void)add_not_list_to_class(classbits, &class_uchardata, 
+            options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
           break;


           case ESC_v:
-          (void)add_list_to_class(classbits, &class_uchardata, options, cb,
-            PRIV(vspace_list), NOTACHAR);
+          (void)add_list_to_class(classbits, &class_uchardata, 
+            options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
           break;


           case ESC_V:
-          (void)add_not_list_to_class(classbits, &class_uchardata, options,
-            cb, PRIV(vspace_list));
+          (void)add_not_list_to_class(classbits, &class_uchardata, 
+            options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
           break;


           case ESC_p:


Modified: code/trunk/src/pcre2_intmodedep.h
===================================================================
--- code/trunk/src/pcre2_intmodedep.h    2017-01-03 11:35:26 UTC (rev 646)
+++ code/trunk/src/pcre2_intmodedep.h    2017-01-03 18:17:31 UTC (rev 647)
@@ -719,6 +719,8 @@
   uint32_t backref_map;            /* Bitmap of low back refs */
   uint32_t nltype;                 /* Newline type */
   uint32_t nllen;                  /* Newline string length */
+  uint32_t class_range_start;      /* Overall class range start */
+  uint32_t class_range_end;        /* Overall class range end */  
   PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
   int  max_lookbehind;             /* Maximum lookbehind (characters) */
   int  parens_depth;               /* Depth of nested parentheses */


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2017-01-03 11:35:26 UTC (rev 646)
+++ code/trunk/testdata/testinput5    2017-01-03 18:17:31 UTC (rev 647)
@@ -1759,4 +1759,10 @@


/^(?<!(?=􃡜))/B,utf

+# Horizontal and vertical space lists ignore caseless
+
+/[\HH]/Bi,utf
+
+/[^\HH]/Bi,utf
+
# End of testinput5

Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2017-01-03 11:35:26 UTC (rev 646)
+++ code/trunk/testdata/testoutput5    2017-01-03 18:17:31 UTC (rev 647)
@@ -4216,4 +4216,22 @@
         End
 ------------------------------------------------------------------


+# Horizontal and vertical space lists ignore caseless
+
+/[\HH]/Bi,utf
+------------------------------------------------------------------
+        Bra
+        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
+        Ket
+        End
+------------------------------------------------------------------
+
+/[^\HH]/Bi,utf
+------------------------------------------------------------------
+        Bra
+        [^\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
+        Ket
+        End
+------------------------------------------------------------------
+
 # End of testinput5