[Pcre-svn] [744] code/trunk: Correctly supporting \x and \u …

Startseite
Nachricht löschen
Autor: Subversion repository
Datum:  
To: pcre-svn
Betreff: [Pcre-svn] [744] code/trunk: Correctly supporting \x and \u in JavaScript compatibility mode
Revision: 744
          http://vcs.pcre.org/viewvc?view=rev&revision=744
Author:   zherczeg
Date:     2011-11-13 16:31:38 +0000 (Sun, 13 Nov 2011)


Log Message:
-----------
Correctly supporting \x and \u in JavaScript compatibility mode

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_compile.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2011-11-08 09:59:38 UTC (rev 743)
+++ code/trunk/ChangeLog    2011-11-13 16:31:38 UTC (rev 744)
@@ -12,12 +12,15 @@
 3.  Fix cache-flush issue on PowerPC (It is still an experimental JIT port).
     PCRE_EXTRA_TABLES is not supported by JIT, and should be checked before
     calling _pcre_jit_exec. Some extra comments are added.
-    
-4.  Mark settings inside atomic groups that do not contain any capturing 
-    parentheses, for example, (?>a(*:m)), were not being passed out. This bug 
+
+4.  Mark settings inside atomic groups that do not contain any capturing
+    parentheses, for example, (?>a(*:m)), were not being passed out. This bug
     was introduced by change 18 for 8.20.


+5.  Support for \x and \u in JavaScript compatibility mode, based on the
+    ECMA-262 standard.


+
Version 8.20 21-Oct-2011
------------------------


Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2011-11-08 09:59:38 UTC (rev 743)
+++ code/trunk/pcre_compile.c    2011-11-13 16:31:38 UTC (rev 744)
@@ -676,9 +676,39 @@


     case CHAR_l:
     case CHAR_L:
+    *errorcodeptr = ERR37;
+    break;
+
     case CHAR_u:
+    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
+      {
+      /* In JavaScript, \u must be followed by four hexadecimal digits.
+      Otherwise it is treated as a literal lowercase letter u. */
+      if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
+           && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
+        {
+        int i;
+        c = 0;
+        for (i = 0; i < 4; ++i)
+          {
+          register int cc = *(++ptr);
+#ifndef EBCDIC  /* ASCII/UTF-8 coding */
+          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
+          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
+#else           /* EBCDIC coding */
+          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
+          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
+#endif
+          }
+        }
+      }
+    else
+      *errorcodeptr = ERR37;
+    break;
+
     case CHAR_U:
-    *errorcodeptr = ERR37;
+    /* In JavaScript, \U is an uppercase U letter. */
+    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
     break;


     /* In a character class, \g is just a literal "g". Outside a character
@@ -828,6 +858,29 @@
     treated as a data character. */


     case CHAR_x:
+    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
+      {
+      /* In JavaScript, \x must be followed by two hexadecimal digits.
+      Otherwise it is treated as a literal lowercase letter x. */
+      if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
+        {
+        int i;
+        c = 0;
+        for (i = 0; i < 2; ++i)
+          {
+          register int cc = *(++ptr);
+#ifndef EBCDIC  /* ASCII/UTF-8 coding */
+          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
+          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
+#else           /* EBCDIC coding */
+          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
+          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
+#endif
+          }
+        }
+      break;
+      }
+
     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
       {
       const uschar *pt = ptr + 2;


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2011-11-08 09:59:38 UTC (rev 743)
+++ code/trunk/testdata/testinput2    2011-11-13 16:31:38 UTC (rev 744)
@@ -3969,4 +3969,38 @@
 /^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
      \Maabbccddee


+/^a\x41z/<JS>
+    aAz
+    *** Failers
+    ax41z
+
+/^a[m\x41]z/<JS>
+    aAz
+
+/^a\x1z/<JS>
+    ax1z
+
+/^a\X41z/<JS>
+    aX41z
+    *** Failers
+    aAz
+
+/^a\u0041z/<JS>
+    aAz
+    *** Failers
+    au0041z
+
+/^a[m\u0041]z/<JS>
+    aAz
+
+/^a\u041z/<JS>
+    au041z
+    *** Failers
+    aAz
+
+/^a\U0041z/<JS>
+    aU0041z
+    *** Failers
+    aAz
+
 /-- End of testinput2 --/


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2011-11-08 09:59:38 UTC (rev 743)
+++ code/trunk/testdata/testoutput2    2011-11-13 16:31:38 UTC (rev 744)
@@ -12502,4 +12502,56 @@
  2: cc
  3: ee


+/^a\x41z/<JS>
+    aAz
+ 0: aAz
+    *** Failers
+No match
+    ax41z
+No match
+
+/^a[m\x41]z/<JS>
+    aAz
+ 0: aAz
+
+/^a\x1z/<JS>
+    ax1z
+ 0: ax1z
+
+/^a\X41z/<JS>
+    aX41z
+ 0: aX41z
+    *** Failers
+No match
+    aAz
+No match
+
+/^a\u0041z/<JS>
+    aAz
+ 0: aAz
+    *** Failers
+No match
+    au0041z
+No match
+
+/^a[m\u0041]z/<JS>
+    aAz
+ 0: aAz
+
+/^a\u041z/<JS>
+    au041z
+ 0: au041z
+    *** Failers
+No match
+    aAz
+No match
+
+/^a\U0041z/<JS>
+    aU0041z
+ 0: aU0041z
+    *** Failers
+No match
+    aAz
+No match
+
 /-- End of testinput2 --/