[pcre-dev] Handle PCRE_JCHANGED more exactly

Αρχική Σελίδα
Delete this message
Συντάκτης: Stepan Kasal
Ημερομηνία:  
Προς: pcre-dev
Αντικείμενο: [pcre-dev] Handle PCRE_JCHANGED more exactly
Hello,
I noticed that PCRE_JCHANGED can be set even though the option 'J'
does not actually change. For example, in these cases:

/(?J)(?P<A>a)/     ... a global change at the beginning of the pattern
/x(?-J:(?P<A>a))/  ... J is unset al the time
/x(?J:(?P<A>a))/J  ... J is set al the time
/x(?J:y)z(?P<A>a)/ ... J does not affect any named subpattern


Actually, I think that ``J changed'' is not exactly the information
which is needed. Function get_first_set can perform its
optimization, unless a subpattern name was used more than once.
Let's call this bit PCRE_DUPNAMEUSED; it's easy to trace this info.

So I wrote a patch which replaces PCRE_JCHANGED by PCRE_DUPNAMEUSED.

But then I discovered that PCRE_INFO_JCHANGED is part of the
documented interface. That's unfortunate, and even the 7.4 changelog
shows that the definition of this bit differs from what would be
suitable internally (cf. first of my examples above).

So I discarded my patch, and replaced it by a much smaller one.
Attached please find it; it passes "make check".
Aren't my comments too cheeky?

Wouldn't it be possible to admit that PCRE_INFO_JCHANGED was a bad
idea and remove it from the documented interface? It's not that long
since it was published, after all.

(I'm willing to redo the patch, if necessary.)

Have a nice day,
    Stepan Kasal

2007-11-23 Stepan Kasal <skasal@???>

Introduce PCRE_DUPNAMEUSED which is set iff a duplicate subpattern
name occurs. This is exactly the bit needed in pcre_get.c .

Index: pcre.h.in
===================================================================
--- pcre.h.in    (revision 276)
+++ pcre.h.in    (working copy)
@@ -168,8 +168,9 @@
 #define PCRE_INFO_STUDYSIZE         10
 #define PCRE_INFO_DEFAULT_TABLES    11
 #define PCRE_INFO_OKPARTIAL         12
-#define PCRE_INFO_JCHANGED          13
+#define PCRE_INFO_JCHANGED          13  /* Not used by pcre itself, but unfortunately documented */
 #define PCRE_INFO_HASCRORLF         14
+#define PCRE_INFO_DUPNAMEUSED       15


 /* Request types for pcre_config(). Do not re-arrange, in order to remain
 compatible. */
Index: pcre_internal.h
===================================================================
--- pcre_internal.h    (revision 276)
+++ pcre_internal.h    (working copy)
@@ -489,8 +489,9 @@
 #define PCRE_FIRSTSET      0x0002  /* first_byte is set */
 #define PCRE_REQCHSET      0x0004  /* req_byte is set */
 #define PCRE_STARTLINE     0x0008  /* start after \n for multiline */
-#define PCRE_JCHANGED      0x0010  /* j option used in regex */
+#define PCRE_JCHANGED      0x0010  /* j option used in regex; useless, but documented */
 #define PCRE_HASCRORLF     0x0020  /* explicit \r or \n in pattern */
+#define PCRE_DUPNAMEUSED   0x0040  /* a duplicate subpattern name has occured */


/* Options for the "extra" block produced by pcre_study(). */

Index: pcre_compile.c
===================================================================
--- pcre_compile.c    (revision 276)
+++ pcre_compile.c    (working copy)
@@ -4482,7 +4482,9 @@
                 {
                 if (slot[2+namelen] == 0)
                   {
-                  if ((options & PCRE_DUPNAMES) == 0)
+                  if ((options & PCRE_DUPNAMES) != 0)
+                    cd->external_flags |= PCRE_DUPNAMEUSED;
+                  else
                     {
                     *errorcodeptr = ERR43;
                     goto FAILED;
@@ -4733,8 +4735,8 @@
             {
             case '-': optset = &unset; break;


-            case 'J':    /* Record that it changed in the external options */
-            *optset |= PCRE_DUPNAMES;
+            case 'J': *optset |= PCRE_DUPNAMES;
+            /* Remember that we have seen this, for PCRE_INFO_JCHANGED. */
             cd->external_flags |= PCRE_JCHANGED;
             break;


Index: pcre_get.c
===================================================================
--- pcre_get.c    (revision 276)
+++ pcre_get.c    (working copy)
@@ -189,7 +189,7 @@
 int entrysize;
 char *first, *last;
 uschar *entry;
-if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0)
+if ((re->flags & PCRE_DUPNAMEUSED) == 0)
   return pcre_get_stringnumber(code, stringname);
 entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last);
 if (entrysize <= 0) return entrysize;
Index: pcretest.c
===================================================================
--- pcretest.c    (revision 276)
+++ pcretest.c    (working copy)
@@ -1396,7 +1396,7 @@
 #if !defined NOINFOCHECK
       int old_first_char, old_options, old_count;
 #endif
-      int count, backrefmax, first_char, need_char, okpartial, jchanged,
+      int count, backrefmax, first_char, need_char, okpartial,
         hascrorlf;
       int nameentrysize, namecount;
       const uschar *nametable;
@@ -1411,7 +1411,6 @@
       new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
       new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
       new_info(re, NULL, PCRE_INFO_OKPARTIAL, &okpartial);
-      new_info(re, NULL, PCRE_INFO_JCHANGED, &jchanged);
       new_info(re, NULL, PCRE_INFO_HASCRORLF, &hascrorlf);


 #if !defined NOINFOCHECK
@@ -1478,8 +1477,6 @@
           ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
           ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");


-      if (jchanged) fprintf(outfile, "Duplicate name status changes\n");
-
       switch (get_options & PCRE_NEWLINE_BITS)
         {
         case PCRE_NEWLINE_CR:
Index: testdata/testoutput2
===================================================================
--- testdata/testoutput2    (revision 276)
+++ testdata/testoutput2    (working copy)
@@ -6166,7 +6166,6 @@
   A   2
   A   3
 Options: anchored dupnames
-Duplicate name status changes
 No first char
 No need char
     a1b\CA
@@ -6200,7 +6199,6 @@
   B   3
   C   4
 Options: anchored
-Duplicate name status changes
 No first char
 No need char
     a bc d\CA\CB\CC