[Pcre-svn] [301] code/trunk: Improve matching speed of patte…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [301] code/trunk: Improve matching speed of patterns starting with + or * in JIT.
Revision: 301
          http://www.exim.org/viewvc/pcre2?view=rev&revision=301
Author:   zherczeg
Date:     2015-07-03 07:46:20 +0100 (Fri, 03 Jul 2015)
Log Message:
-----------
Improve matching speed of patterns starting with + or * in JIT.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_jit_compile.c


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2015-07-02 13:18:57 UTC (rev 300)
+++ code/trunk/ChangeLog    2015-07-03 06:46:20 UTC (rev 301)
@@ -1,6 +1,12 @@
 Change Log for PCRE2
 --------------------


+Version 10.21 xx-xxx-xxxx
+-------------------------
+
+1. Improve matching speed of patterns starting with + or * in JIT.
+
+
Version 10.20 30-June-2015
--------------------------


Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2015-07-02 13:18:57 UTC (rev 300)
+++ code/trunk/src/pcre2_jit_compile.c    2015-07-03 06:46:20 UTC (rev 301)
@@ -381,11 +381,15 @@
   int control_head_ptr;
   /* Points to the last matched capture block index. */
   int capture_last_ptr;
+  /* Fast forward skipping byte code pointer. */
+  PCRE2_SPTR fast_forward_bc_ptr;
+  /* Fast forward skipping local variable position. */
+  int fast_forward_ptr;


/* Flipped and lower case tables. */
const sljit_ub *fcc;
sljit_sw lcc;
- /* Mode can be PCRE_STUDY_JIT_COMPILE and others. */
+ /* Mode can be PCRE2_JIT_COMPLETE and others. */
int mode;
/* TRUE, when minlength is greater than 0. */
BOOL might_be_empty;
@@ -921,6 +925,120 @@
return TRUE;
}

+static void detect_fast_forward_skip(compiler_common *common, int *private_data_start)
+{
+PCRE2_SPTR cc = common->start;
+PCRE2_SPTR end;
+
+/* Skip not repeated brackets. */
+while (TRUE)
+  {
+  switch(*cc)
+    {
+    case OP_SOD:
+    case OP_SOM:
+    case OP_SET_SOM:
+    case OP_NOT_WORD_BOUNDARY:
+    case OP_WORD_BOUNDARY:
+    case OP_EODN:
+    case OP_EOD:
+    case OP_CIRC:
+    case OP_CIRCM:
+    case OP_DOLL:
+    case OP_DOLLM:
+    /* Zero width assertions. */
+    cc++;
+    continue;
+    }
+
+  if (*cc != OP_BRA && *cc != OP_CBRA)
+    break;
+
+  end = cc + GET(cc, 1);
+  if (*end != OP_KET || PRIVATE_DATA(end) != 0)
+    return;
+  if (*cc == OP_CBRA)
+    {
+    if (common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0)
+      return;
+    cc += IMM2_SIZE;
+    }
+  cc += 1 + LINK_SIZE;
+  }
+
+common->fast_forward_bc_ptr = cc;
+
+switch(*cc)
+  {
+  case OP_TYPESTAR:
+  case OP_TYPEMINSTAR:
+  case OP_TYPEPLUS:
+  case OP_TYPEMINPLUS:
+  case OP_TYPEPOSSTAR:
+  case OP_TYPEPOSPLUS:
+  if (cc[1] == OP_ANYNL || cc[1] == OP_EXTUNI)
+    break;
+  /* Fall through. */
+
+  case OP_STAR:
+  case OP_MINSTAR:
+  case OP_PLUS:
+  case OP_MINPLUS:
+  case OP_POSSTAR:
+  case OP_POSPLUS:
+
+  case OP_STARI:
+  case OP_MINSTARI:
+  case OP_PLUSI:
+  case OP_MINPLUSI:
+  case OP_POSSTARI:
+  case OP_POSPLUSI:
+
+  case OP_NOTSTAR:
+  case OP_NOTMINSTAR:
+  case OP_NOTPLUS:
+  case OP_NOTMINPLUS:
+  case OP_NOTPOSSTAR:
+  case OP_NOTPOSPLUS:
+
+  case OP_NOTSTARI:
+  case OP_NOTMINSTARI:
+  case OP_NOTPLUSI:
+  case OP_NOTMINPLUSI:
+  case OP_NOTPOSSTARI:
+  case OP_NOTPOSPLUSI:
+
+  common->fast_forward_ptr = *private_data_start;
+  *private_data_start += sizeof(sljit_sw);
+  return;
+
+  case OP_CLASS:
+  case OP_NCLASS:
+#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
+  case OP_XCLASS:
+  cc += (*cc == OP_XCLASS) ? GET(cc, 1) : (int)(1 + (32 / sizeof(PCRE2_UCHAR)));
+#else
+  cc += (int)(1 + (32 / sizeof(PCRE2_UCHAR)));
+#endif
+
+  switch(*cc)
+    {
+    case OP_CRSTAR:
+    case OP_CRMINSTAR:
+    case OP_CRPLUS:
+    case OP_CRMINPLUS:
+    case OP_CRPOSSTAR:
+    case OP_CRPOSPLUS:
+    common->fast_forward_ptr = *private_data_start;
+    *private_data_start += sizeof(sljit_sw);
+    return;
+    }
+  break;
+  }
+
+common->fast_forward_bc_ptr = NULL;
+}
+
 static int get_class_iterator_size(PCRE2_SPTR cc)
 {
 sljit_ui min;
@@ -8082,6 +8200,7 @@
 PCRE2_UCHAR opcode;
 PCRE2_UCHAR type;
 sljit_ui max = 0, exact;
+BOOL fast_forward = (cc == common->fast_forward_bc_ptr);
 BOOL charpos_enabled;
 PCRE2_UCHAR charpos_char;
 unsigned int charpos_othercasebit;
@@ -8114,6 +8233,7 @@
 /* Handle fixed part first. */
 if (exact > 1)
   {
+  SLJIT_ASSERT(!fast_forward);
   if (common->mode == PCRE2_JIT_COMPLETE
 #ifdef SUPPORT_UNICODE
       && !common->utf
@@ -8144,9 +8264,12 @@
   {
   case OP_STAR:
   case OP_UPTO:
+  SLJIT_ASSERT(!fast_forward || opcode == OP_STAR);
+
   if (type == OP_ANYNL || type == OP_EXTUNI)
     {
     SLJIT_ASSERT(private_data_ptr == 0);
+    SLJIT_ASSERT(!fast_forward);


     allocate_stack(common, 2);
     OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0);
@@ -8228,6 +8351,8 @@
         add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_ZERO));
         }
       compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks, FALSE);
+      if (fast_forward)
+        OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr, STR_PTR, 0);
       JUMPHERE(jump);


       detect_partial_match(common, &backtrack->topbacktracks);
@@ -8249,6 +8374,8 @@
       /* Search the last instance of charpos_char. */
       label = LABEL();
       compile_char1_matchingpath(common, type, cc, &no_match, FALSE);
+      if (fast_forward)
+        OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr, STR_PTR, 0);
       detect_partial_match(common, &no_match);
       OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
       if (charpos_othercasebit != 0)
@@ -8304,6 +8431,8 @@


       set_jumps(no_match, LABEL());
       OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
+      if (fast_forward)
+        OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr, STR_PTR, 0);
       }
 #endif
     else
@@ -8331,6 +8460,8 @@
       OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
       set_jumps(no_match, LABEL());
       OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
+      if (fast_forward)
+        OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr, STR_PTR, 0);
       }
     }
   BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
@@ -8341,9 +8472,12 @@
     allocate_stack(common, 1);
   OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
   BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL();
+  if (fast_forward)
+    OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr, STR_PTR, 0);
   break;


   case OP_MINUPTO:
+  SLJIT_ASSERT(!fast_forward);
   if (private_data_ptr == 0)
     allocate_stack(common, 2);
   OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
@@ -8353,6 +8487,7 @@


   case OP_QUERY:
   case OP_MINQUERY:
+  SLJIT_ASSERT(!fast_forward);
   if (private_data_ptr == 0)
     allocate_stack(common, 1);
   OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
@@ -8375,6 +8510,8 @@
     JUMPTO(SLJIT_JUMP, label);
     set_jumps(no_match, LABEL());
     OP1(SLJIT_MOV, STR_PTR, 0, tmp_base, tmp_offset);
+    if (fast_forward)
+      OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr, STR_PTR, 0);
     break;
     }
 #endif
@@ -8385,9 +8522,12 @@
   set_jumps(no_char1_match, LABEL());
   OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   set_jumps(no_match, LABEL());
+  if (fast_forward)
+    OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr, STR_PTR, 0);
   break;


   case OP_POSUPTO:
+  SLJIT_ASSERT(!fast_forward);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
   if (common->utf)
     {
@@ -8416,6 +8556,7 @@
   break;


case OP_POSQUERY:
+ SLJIT_ASSERT(!fast_forward);
OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0);
compile_char1_matchingpath(common, type, cc, &no_match, TRUE);
OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0);
@@ -10251,6 +10392,9 @@

private_data_size = common->cbra_ptr + (re->top_bracket + 1) * sizeof(sljit_sw);
set_private_data_ptrs(common, &private_data_size, ccend);
+if ((re->overall_options & PCRE2_ANCHORED) == 0 && (re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
+ detect_fast_forward_skip(common, &private_data_size);
+
if (private_data_size > SLJIT_MAX_LOCAL_SIZE)
{
SLJIT_FREE(common->private_data_ptrs, allocator_data);
@@ -10335,6 +10479,8 @@
OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_MEM1(SLJIT_SP), LIMIT_MATCH);
if (common->capture_last_ptr != 0)
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr, SLJIT_IMM, 0);
+if (common->fast_forward_ptr != 0)
+ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr, STR_PTR, 0);

if (common->start_ptr != OVECTOR(0))
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_ptr, STR_PTR, 0);
@@ -10419,7 +10565,10 @@
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end);
}

-OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr);
+if (common->fast_forward_ptr != 0)
+ OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), common->fast_forward_ptr);
+else
+ OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr);

if ((re->overall_options & PCRE2_ANCHORED) == 0)
{
@@ -10464,6 +10613,7 @@
JUMPTO(SLJIT_JUMP, empty_match_backtrack_label);
}

+common->fast_forward_bc_ptr = NULL;
common->currententry = common->entries;
common->local_exit = TRUE;
quit_label = common->quit_label;