[Pcre-svn] [915] code/trunk: Improved \X and back reference partial matching

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [915] code/trunk: Improved \X and back reference partial matching
Revision: 915
          http://vcs.pcre.org/viewvc?view=rev&revision=915
Author:   zherczeg
Date:     2012-02-14 13:05:39 +0000 (Tue, 14 Feb 2012)


Log Message:
-----------
Improved \X and back reference partial matching

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_exec.c
    code/trunk/pcre_internal.h
    code/trunk/pcre_jit_compile.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testinput5
    code/trunk/testdata/testinput6
    code/trunk/testdata/testoutput2
    code/trunk/testdata/testoutput5
    code/trunk/testdata/testoutput6


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/ChangeLog    2012-02-14 13:05:39 UTC (rev 915)
@@ -14,7 +14,9 @@


4. Partial matching support is added to the JIT compiler.

+5. Improved \X and back reference partial matching.

+
Version 8.30 04-February-2012
-----------------------------


Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/pcre_exec.c    2012-02-14 13:05:39 UTC (rev 915)
@@ -147,7 +147,7 @@
 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
   BOOL caseless)
 {
-PCRE_PUCHAR eptr_start = eptr;
+int matched_length = length;
 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];


 #ifdef PCRE_DEBUG
@@ -186,14 +186,16 @@
     reference, not along the subject (earlier code did this wrong). */


     PCRE_PUCHAR endptr = p + length;
+    PCRE_PUCHAR eptr_start = eptr;
     while (p < endptr)
       {
       int c, d;
-      if (eptr >= md->end_subject) return -1;
+      if (eptr >= md->end_subject) return -((int)(eptr - eptr_start) + 1);
       GETCHARINC(c, eptr);
       GETCHARINC(d, p);
       if (c != d && c != UCD_OTHERCASE(d)) return -1;
       }
+    matched_length = (int)(eptr - eptr_start);
     }
   else
 #endif
@@ -202,7 +204,13 @@
   /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
   is no UCP support. */
     {
-    if (eptr + length > md->end_subject) return -1;
+    if (eptr + length > md->end_subject)
+      {
+      if (md->partial == 0)
+        return -1;
+      length = (int)(md->end_subject - eptr);
+      matched_length = -(length + 1);
+      }
     while (length-- > 0)
       {
       if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
@@ -217,11 +225,17 @@


 else
   {
-  if (eptr + length > md->end_subject) return -1;
+  if (eptr + length > md->end_subject)
+    {
+    if (md->partial == 0)
+      return -1;
+    length = (int)(md->end_subject - eptr);
+    matched_length = -(length + 1);
+    }
   while (length-- > 0) if (*p++ != *eptr++) return -1;
   }


-return (int)(eptr - eptr_start);
+return matched_length;
}


@@ -2595,6 +2609,10 @@
       if (UCD_CATEGORY(c) != ucp_M) break;
       eptr += len;
       }
+    if (md->partial != 0 && eptr >= md->end_subject)
+      {
+      SCHECK_PARTIAL();
+      }
     ecode++;
     break;
 #endif
@@ -2660,6 +2678,7 @@
       default:               /* No repeat follows */
       if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
         {
+        eptr += -(length + 1);
         CHECK_PARTIAL();
         RRETURN(MATCH_NOMATCH);
         }
@@ -2685,6 +2704,7 @@
       int slength;
       if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
         {
+        eptr += -(slength + 1);
         CHECK_PARTIAL();
         RRETURN(MATCH_NOMATCH);
         }
@@ -2708,6 +2728,7 @@
         if (fi >= max) RRETURN(MATCH_NOMATCH);
         if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
           {
+          eptr += -(slength + 1);
           CHECK_PARTIAL();
           RRETURN(MATCH_NOMATCH);
           }
@@ -2726,7 +2747,10 @@
         int slength;
         if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
           {
+          /* Restore the eptr after the check. */
+          eptr += -(slength + 1);
           CHECK_PARTIAL();
+          eptr -= -(slength + 1);
           break;
           }
         eptr += slength;
@@ -4165,6 +4189,10 @@
             eptr += len;
             }
           }
+        if (md->partial != 0 && eptr >= md->end_subject)
+          {
+          SCHECK_PARTIAL();
+          }
         }


       else
@@ -4948,6 +4976,10 @@
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
+          if (md->partial != 0 && eptr >= md->end_subject)
+            {
+            SCHECK_PARTIAL();
+            }
           }
         }
       else
@@ -5491,6 +5523,10 @@
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
+          if (eptr >= md->end_subject)
+            {
+            SCHECK_PARTIAL();
+            }
           }


         /* eptr is now past the end of the maximum run */


Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/pcre_internal.h    2012-02-14 13:05:39 UTC (rev 915)
@@ -1944,7 +1944,7 @@


 /* JIT compiling modes. The function list is indexed by them. */
 enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE,
-       JIT_NUMBER_OF_COMPILE_TYPES };
+       JIT_NUMBER_OF_COMPILE_MODES };


/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit

Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/pcre_jit_compile.c    2012-02-14 13:05:39 UTC (rev 915)
@@ -163,10 +163,10 @@
 } jit_arguments;


typedef struct executable_functions {
- void *executable_funcs[JIT_NUMBER_OF_COMPILE_TYPES];
+ void *executable_funcs[JIT_NUMBER_OF_COMPILE_MODES];
PUBL(jit_callback) callback;
void *userdata;
- sljit_uw executable_sizes[JIT_NUMBER_OF_COMPILE_TYPES];
+ sljit_uw executable_sizes[JIT_NUMBER_OF_COMPILE_MODES];
} executable_functions;

 typedef struct jump_list {
@@ -2590,10 +2590,10 @@
 while (src1 < end1)
   {
   if (src2 >= end2)
-    return 0;
+    return (pcre_uchar*)1;
   GETCHARINC(c1, src1);
   GETCHARINC(c2, src2);
-  if (c1 != c2 && c1 != UCD_OTHERCASE(c2)) return 0;
+  if (c1 != c2 && c1 != UCD_OTHERCASE(c2)) return NULL;
   }
 return src2;
 }
@@ -3288,6 +3288,12 @@


   OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
   JUMPHERE(jump[0]);
+  if (common->mode == JIT_PARTIAL_HARD_COMPILE)
+    {
+    jump[0] = CMP(SLJIT_C_LESS, STR_PTR, 0, STR_END, 0);
+    check_partial(common);
+    JUMPHERE(jump[0]);
+    }
   return cc;
 #endif


@@ -3727,6 +3733,8 @@
DEFINE_COMPILER;
int offset = GET2(cc, 1) << 1;
struct sljit_jump *jump = NULL;
+struct sljit_jump *partial;
+struct sljit_jump *nopartial;

 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), OVECTOR(offset));
 /* OVECTOR(1) contains the "string begin - 1" constant. */
@@ -3741,16 +3749,22 @@
   if (withchecks)
     jump = CMP(SLJIT_C_EQUAL, TMP1, 0, TMP2, 0);


-  if (common->mode != JIT_COMPILE)
-    fallback_at_str_end(common, fallbacks);
-
   /* Needed to save important temporary registers. */
   OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, STACK_TOP, 0);
   OP1(SLJIT_MOV, SLJIT_TEMPORARY_REG2, 0, ARGUMENTS, 0);
   OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_TEMPORARY_REG2), SLJIT_OFFSETOF(jit_arguments, ptr), STR_PTR, 0);
   sljit_emit_ijump(compiler, SLJIT_CALL3, SLJIT_IMM, SLJIT_FUNC_OFFSET(do_utf_caselesscmp));
   OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
-  add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
+  if (common->mode == JIT_COMPILE)
+    add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 1));
+  else
+    {
+    add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
+    nopartial = CMP(SLJIT_C_NOT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 1);
+    check_partial(common);
+    add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP));
+    JUMPHERE(nopartial);
+    }
   OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
   }
 else
@@ -3760,14 +3774,30 @@
   if (withchecks)
     jump = JUMP(SLJIT_C_ZERO);


-  if (common->mode != JIT_COMPILE)
-    fallback_at_str_end(common, fallbacks);
-
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+  partial = CMP(SLJIT_C_GREATER, STR_PTR, 0, STR_END, 0);
+  if (common->mode == JIT_COMPILE)
+    add_jump(compiler, fallbacks, partial);


-  add_jump(compiler, fallbacks, CMP(SLJIT_C_GREATER, STR_PTR, 0, STR_END, 0));
   add_jump(compiler, *cc == OP_REF ? &common->casefulcmp : &common->caselesscmp, JUMP(SLJIT_FAST_CALL));
   add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0));
+
+  if (common->mode != JIT_COMPILE)
+    {
+    nopartial = JUMP(SLJIT_JUMP);
+    JUMPHERE(partial);
+    /* TMP2 -= STR_END - STR_PTR */
+    OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, STR_PTR, 0);
+    OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, STR_END, 0);
+    partial = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, 0);
+    OP1(SLJIT_MOV, STR_PTR, 0, STR_END, 0);
+    add_jump(compiler, *cc == OP_REF ? &common->casefulcmp : &common->caselesscmp, JUMP(SLJIT_FAST_CALL));
+    add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0));
+    JUMPHERE(partial);
+    check_partial(common);
+    add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP));
+    JUMPHERE(nopartial);
+    }
   }


 if (jump != NULL)
@@ -7027,7 +7057,7 @@
 {
 int i;
 executable_functions *functions = (executable_functions *)executable_funcs;
-for (i = 0; i < JIT_NUMBER_OF_COMPILE_TYPES; i++)
+for (i = 0; i < JIT_NUMBER_OF_COMPILE_MODES; i++)
   {
   if (functions->executable_funcs[i] != NULL)
     sljit_free_code(functions->executable_funcs[i]);


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testinput2    2012-02-14 13:05:39 UTC (rev 915)
@@ -3601,4 +3601,43 @@
 /(?=a(*:x))(?=a(*:y)c|)/K+
     ab


+/(..)\1/
+    ab\P
+    aba\P
+    abab\P
+
+/(..)\1/i
+    ab\P
+    abA\P
+    aBAb\P
+
+/(..)\1{2,}/
+    ab\P
+    aba\P
+    abab\P
+    ababa\P
+    ababab\P
+    ababab\P\P
+    abababa\P
+    abababa\P\P
+
+/(..)\1{2,}/i
+    ab\P
+    aBa\P
+    aBAb\P
+    AbaBA\P
+    abABAb\P
+    aBAbaB\P\P
+    abABabA\P
+    abaBABa\P\P
+
+/(..)\1{2,}?x/i
+    ab\P
+    abA\P
+    aBAb\P
+    abaBA\P
+    abAbaB\P
+    abaBabA\P
+    abAbABaBx\P
+
 /-- End of testinput2 --/


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testinput5    2012-02-14 13:05:39 UTC (rev 915)
@@ -693,4 +693,43 @@
     \x{2027}\x{2030}\x{2028}\x{2029}
     \x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d


+/(..)\1/8
+    ab\P
+    aba\P
+    abab\P
+
+/(..)\1/8i
+    ab\P
+    abA\P
+    aBAb\P
+
+/(..)\1{2,}/8
+    ab\P
+    aba\P
+    abab\P
+    ababa\P
+    ababab\P
+    ababab\P\P
+    abababa\P
+    abababa\P\P
+
+/(..)\1{2,}/8i
+    ab\P
+    aBa\P
+    aBAb\P
+    AbaBA\P
+    abABAb\P
+    aBAbaB\P\P
+    abABabA\P
+    abaBABa\P\P
+
+/(..)\1{2,}?x/8i
+    ab\P
+    abA\P
+    aBAb\P
+    abaBA\P
+    abAbaB\P
+    abaBabA\P
+    abAbABaBx\P
+
 /-- End of testinput5 --/


Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testinput6    2012-02-14 13:05:39 UTC (rev 915)
@@ -816,4 +816,28 @@
     Ⱥ
     ⱥ


+/\X/
+    a\P
+    a\P\P
+
+/\Xa/
+    aa\P
+    aa\P\P
+
+/\X{2}/
+    aa\P
+    aa\P\P
+
+/\X+a/
+    a\P
+    aa\P
+    aa\P\P
+
+/\X+?a/
+    a\P
+    ab\P
+    aa\P
+    aa\P\P
+    aba\P
+
 /-- End of testinput6 --/


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testoutput2    2012-02-14 13:05:39 UTC (rev 915)
@@ -12011,4 +12011,79 @@
  0+ ab
 MK: x


+/(..)\1/
+    ab\P
+Partial match: ab
+    aba\P
+Partial match: aba
+    abab\P
+ 0: abab
+ 1: ab
+
+/(..)\1/i
+    ab\P
+Partial match: ab
+    abA\P
+Partial match: abA
+    aBAb\P
+ 0: aBAb
+ 1: aB
+
+/(..)\1{2,}/
+    ab\P
+Partial match: ab
+    aba\P
+Partial match: aba
+    abab\P
+Partial match: abab
+    ababa\P
+Partial match: ababa
+    ababab\P
+ 0: ababab
+ 1: ab
+    ababab\P\P
+Partial match: ababab
+    abababa\P
+ 0: ababab
+ 1: ab
+    abababa\P\P
+Partial match: abababa
+
+/(..)\1{2,}/i
+    ab\P
+Partial match: ab
+    aBa\P
+Partial match: aBa
+    aBAb\P
+Partial match: aBAb
+    AbaBA\P
+Partial match: AbaBA
+    abABAb\P
+ 0: abABAb
+ 1: ab
+    aBAbaB\P\P
+Partial match: aBAbaB
+    abABabA\P
+ 0: abABab
+ 1: ab
+    abaBABa\P\P
+Partial match: abaBABa
+
+/(..)\1{2,}?x/i
+    ab\P
+Partial match: ab
+    abA\P
+Partial match: abA
+    aBAb\P
+Partial match: aBAb
+    abaBA\P
+Partial match: abaBA
+    abAbaB\P
+Partial match: abAbaB
+    abaBabA\P
+Partial match: abaBabA
+    abAbABaBx\P
+ 0: abAbABaBx
+ 1: ab
+
 /-- End of testinput2 --/


Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testoutput5    2012-02-14 13:05:39 UTC (rev 915)
@@ -1651,4 +1651,79 @@
     \x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
  0: \x{85}\x{0a}\x{0b}\x{0c}\x{0d}


+/(..)\1/8
+    ab\P
+Partial match: ab
+    aba\P
+Partial match: aba
+    abab\P
+ 0: abab
+ 1: ab
+
+/(..)\1/8i
+    ab\P
+Partial match: ab
+    abA\P
+Partial match: abA
+    aBAb\P
+ 0: aBAb
+ 1: aB
+
+/(..)\1{2,}/8
+    ab\P
+Partial match: ab
+    aba\P
+Partial match: aba
+    abab\P
+Partial match: abab
+    ababa\P
+Partial match: ababa
+    ababab\P
+ 0: ababab
+ 1: ab
+    ababab\P\P
+Partial match: ababab
+    abababa\P
+ 0: ababab
+ 1: ab
+    abababa\P\P
+Partial match: abababa
+
+/(..)\1{2,}/8i
+    ab\P
+Partial match: ab
+    aBa\P
+Partial match: aBa
+    aBAb\P
+Partial match: aBAb
+    AbaBA\P
+Partial match: AbaBA
+    abABAb\P
+ 0: abABAb
+ 1: ab
+    aBAbaB\P\P
+Partial match: aBAbaB
+    abABabA\P
+ 0: abABab
+ 1: ab
+    abaBABa\P\P
+Partial match: abaBABa
+
+/(..)\1{2,}?x/8i
+    ab\P
+Partial match: ab
+    abA\P
+Partial match: abA
+    aBAb\P
+Partial match: aBAb
+    abaBA\P
+Partial match: abaBA
+    abAbaB\P
+Partial match: abAbaB
+    abaBabA\P
+Partial match: abaBabA
+    abAbABaBx\P
+ 0: abAbABaBx
+ 1: ab
+
 /-- End of testinput5 --/


Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6    2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testoutput6    2012-02-14 13:05:39 UTC (rev 915)
@@ -1375,4 +1375,42 @@
     ⱥ
  0: \x{2c65}


+/\X/
+    a\P
+ 0: a
+    a\P\P
+Partial match: a
+
+/\Xa/
+    aa\P
+ 0: aa
+    aa\P\P
+ 0: aa
+
+/\X{2}/
+    aa\P
+ 0: aa
+    aa\P\P
+Partial match: aa
+
+/\X+a/
+    a\P
+Partial match: a
+    aa\P
+ 0: aa
+    aa\P\P
+Partial match: aa
+
+/\X+?a/
+    a\P
+Partial match: a
+    ab\P
+Partial match: ab
+    aa\P
+ 0: aa
+    aa\P\P
+ 0: aa
+    aba\P
+ 0: aba
+
 /-- End of testinput6 --/