[Pcre-svn] [1047] code/trunk: Case folding in JIT and removing unnecessary spaces

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1047] code/trunk: Case folding in JIT and removing unnecessary spaces
Revision: 1047
          http://vcs.pcre.org/viewvc?view=rev&revision=1047
Author:   zherczeg
Date:     2012-09-28 16:06:38 +0100 (Fri, 28 Sep 2012)


Log Message:
-----------
Case folding in JIT and removing unnecessary spaces

Modified Paths:
--------------
    code/trunk/pcre_compile.c
    code/trunk/pcre_exec.c
    code/trunk/pcre_jit_compile.c


Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2012-09-25 16:27:58 UTC (rev 1046)
+++ code/trunk/pcre_compile.c    2012-09-28 15:06:38 UTC (rev 1047)
@@ -2996,18 +2996,18 @@
           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
           c == CHAR_UNDERSCORE) == negated;


-#ifdef SUPPORT_UCP        
+#ifdef SUPPORT_UCP
   case PT_CLIST:
   p = PRIV(ucd_caseless_sets) + prop->caseset;
   for (;;)
     {
     if ((unsigned int)c < *p) return !negated;
     if ((unsigned int)c == *p++) return negated;
-    }     
+    }
   break;  /* Control never reaches here */
-#endif   
+#endif
   }
-   
+
 return FALSE;
 }
 #endif  /* SUPPORT_UCP */
@@ -3109,12 +3109,12 @@
 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
   STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
     return FALSE;
-    
+
 /* If the previous item is a character, get its value. */


 if (op_code == OP_CHAR || op_code == OP_CHARI || 
     op_code == OP_NOT || op_code == OP_NOTI)
-  { 
+  {
 #ifdef SUPPORT_UTF
   GETCHARTEST(c, previous);
 #else
@@ -3133,19 +3133,19 @@
     {
     int ocs = UCD_CASESET(next);
     if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, FALSE);
-    }  
+    }
 #endif


   switch(op_code)
     {
     case OP_CHAR:
     return c != next;
-  
+
     /* For CHARI (caseless character) we must check the other case. If we have
     Unicode property support, we can use it to test the other case of
     high-valued characters. We know that next can have only one other case, 
     because multi-other-case characters are dealt with above. */
-  
+
     case OP_CHARI:
     if (c == next) return FALSE;
 #ifdef SUPPORT_UTF
@@ -3184,39 +3184,39 @@
     else
 #endif  /* SUPPORT_UTF */
     return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Not UTF */
-  
+
     /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
-  
+
     case OP_DIGIT:
     return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
-  
+
     case OP_NOT_DIGIT:
     return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
-  
+
     case OP_WHITESPACE:
     return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
-  
+
     case OP_NOT_WHITESPACE:
     return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
-  
+
     case OP_WORDCHAR:
     return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
-  
+
     case OP_NOT_WORDCHAR:
     return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
-  
+
     case OP_HSPACE:
     case OP_NOT_HSPACE:
     switch(next)
       {
       HSPACE_CASES: 
       return op_code == OP_NOT_HSPACE;
-       
+
       default:
       return op_code != OP_NOT_HSPACE;
       }
-  
+
     case OP_ANYNL:
     case OP_VSPACE:
     case OP_NOT_VSPACE:
@@ -3224,23 +3224,23 @@
       {
       VSPACE_CASES: 
       return op_code == OP_NOT_VSPACE;
-       
+
       default:
       return op_code != OP_NOT_VSPACE;
       }
-  
+
 #ifdef SUPPORT_UCP
     case OP_PROP:
     return check_char_prop(next, previous[0], previous[1], FALSE);
-  
+
     case OP_NOTPROP:
     return check_char_prop(next, previous[0], previous[1], TRUE);
 #endif
-  
+
     default:
     return FALSE;
     }
-  }   
+  }


 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
@@ -3278,7 +3278,7 @@
       {
       HSPACE_CASES: 
       return -next != ESC_h;
-       
+
       default:
       return -next == ESC_h;
       }
@@ -3289,7 +3289,7 @@
       {
       VSPACE_CASES: 
       return -next != ESC_v;
-       
+
       default:
       return -next == ESC_v;
       }


Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2012-09-25 16:27:58 UTC (rev 1046)
+++ code/trunk/pcre_exec.c    2012-09-28 15:06:38 UTC (rev 1047)
@@ -200,12 +200,12 @@
       if (c != d && c != d + ur->other_case) 
         {
         const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset; 
-        for (;;)                                                                   
-          {                                                                        
-          if (c < *pp) return -1;                                                   
-          if (c == *pp++) break;     
+        for (;;)
+          {
+          if (c < *pp) return -1;
+          if (c == *pp++) break;
           }
-        } 
+        }
       }
     }
   else
@@ -2583,17 +2583,17 @@
              c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
           RRETURN(MATCH_NOMATCH);
         break;
-        
+
         case PT_CLIST:
         cp = PRIV(ucd_caseless_sets) + prop->caseset;
         for (;;)
           {
           if (c < *cp)
-            { if (op == OP_PROP) RRETURN(MATCH_NOMATCH); else break; }
-          if (c == *cp++)   
-            { if (op == OP_PROP) break; else RRETURN(MATCH_NOMATCH); }
-          }      
-        break; 
+            { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
+          if (c == *cp++)
+            { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
+          }
+        break;


         /* This should never occur */


@@ -4200,12 +4200,12 @@
             for (;;)
               {
               if (c < *cp) 
-                { if (prop_fail_result) break; else RRETURN(MATCH_NOMATCH); }
+                { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
               if (c == *cp++)
-                { if (prop_fail_result) RRETURN(MATCH_NOMATCH); else break; }
-              }    
+                { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
+              }
             }
-          break;   
+          break;


           /* This should not occur */


@@ -4935,11 +4935,11 @@
             cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
             for (;;)
               {
-              if (c < *cp) 
-                { if (prop_fail_result) break; else RRETURN(MATCH_NOMATCH); }
+              if (c < *cp)
+                { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
               if (c == *cp++)
-                { if (prop_fail_result) RRETURN(MATCH_NOMATCH); else break; }
-              }    
+                { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
+              }
             }
           /* Control never gets here */



Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c    2012-09-25 16:27:58 UTC (rev 1046)
+++ code/trunk/pcre_jit_compile.c    2012-09-28 15:06:38 UTC (rev 1047)
@@ -3481,9 +3481,11 @@
 static const pcre_uchar *SLJIT_CALL do_utf_caselesscmp(pcre_uchar *src1, jit_arguments *args, pcre_uchar *end1)
 {
 /* This function would be ineffective to do in JIT level. */
-int c1, c2;
+pcre_uint32 c1, c2;
 const pcre_uchar *src2 = args->uchar_ptr;
 const pcre_uchar *end2 = args->end;
+const ucd_record *ur;
+const pcre_uint32 *pp;


 while (src1 < end1)
   {
@@ -3491,7 +3493,16 @@
     return (pcre_uchar*)1;
   GETCHARINC(c1, src1);
   GETCHARINC(c2, src2);
-  if (c1 != c2 && c1 != UCD_OTHERCASE(c2)) return NULL;
+  ur = GET_UCD(c2);
+  if (c1 != c2 && c1 != c2 + ur->other_case)
+    {
+    pp = PRIV(ucd_caseless_sets) + ur->caseset;
+    for (;;)
+      {
+      if (c1 < *pp) return NULL;
+      if (c1 == *pp++) break;
+      }
+    }
   }
 return src2;
 }
@@ -3683,18 +3694,17 @@
 DEFINE_COMPILER;
 jump_list *found = NULL;
 jump_list **list = (*cc & XCL_NOT) == 0 ? &found : backtracks;
-unsigned int c;
-int compares;
+pcre_int32 c, charoffset;
+const pcre_uint32 *other_cases;
 struct sljit_jump *jump = NULL;
 pcre_uchar *ccbegin;
+int compares, invertcmp, numberofcmps;
 #ifdef SUPPORT_UCP
 BOOL needstype = FALSE, needsscript = FALSE, needschar = FALSE;
 BOOL charsaved = FALSE;
 int typereg = TMP1, scriptreg = TMP1;
 unsigned int typeoffset;
 #endif
-int invertcmp, numberofcmps;
-unsigned int charoffset;


 /* Although SUPPORT_UTF must be defined, we are
    not necessary in utf mode even in 8 bit mode. */
@@ -3792,6 +3802,10 @@
       needschar = TRUE;
       break;


+      case PT_CLIST:
+      needschar = TRUE;
+      break;
+
       default:
       SLJIT_ASSERT_STOP();
       break;
@@ -4001,6 +4015,20 @@
       COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_LESS_EQUAL);
       jump = JUMP(SLJIT_C_NOT_ZERO ^ invertcmp);
       break;
+
+      case PT_CLIST:
+      other_cases = PRIV(ucd_caseless_sets) + cc[1];
+
+      OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, *other_cases++ - charoffset);
+      COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
+
+      while (*other_cases < NOTACHAR)
+        {
+        OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, *other_cases++ - charoffset);
+        COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
+        }
+      jump = JUMP(SLJIT_C_NOT_ZERO ^ invertcmp);
+      break;
       }
     cc += 2;
     }