Revision: 1041
http://vcs.pcre.org/viewvc?view=rev&revision=1041
Author: ph10
Date: 2012-09-16 11:16:27 +0100 (Sun, 16 Sep 2012)
Log Message:
-----------
Turn case lists for horizontal and vertical white space into macros so they are
defined only once.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_compile.c
code/trunk/pcre_dfa_exec.c
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-09-16 06:52:27 UTC (rev 1040)
+++ code/trunk/ChangeLog 2012-09-16 10:16:27 UTC (rev 1041)
@@ -83,7 +83,10 @@
19. Improving the first n character searches.
+20. Turn case lists for horizontal and vertical white space into macros so that
+ they are defined only once.
+
Version 8.31 06-July-2012
-------------------------
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2012-09-16 06:52:27 UTC (rev 1040)
+++ code/trunk/pcre_compile.c 2012-09-16 10:16:27 UTC (rev 1041)
@@ -3168,28 +3168,9 @@
case OP_NOT_HSPACE:
switch(next)
{
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0:
- case 0x1680:
- case 0x180e:
- case 0x2000:
- case 0x2001:
- case 0x2002:
- case 0x2003:
- case 0x2004:
- case 0x2005:
- case 0x2006:
- case 0x2007:
- case 0x2008:
- case 0x2009:
- case 0x200A:
- case 0x202f:
- case 0x205f:
- case 0x3000:
-#endif /* Not EBCDIC */
+ HSPACE_CASES:
return op_code == OP_NOT_HSPACE;
+
default:
return op_code != OP_NOT_HSPACE;
}
@@ -3199,16 +3180,9 @@
case OP_NOT_VSPACE:
switch(next)
{
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028:
- case 0x2029:
-#endif
+ VSPACE_CASES:
return op_code == OP_NOT_VSPACE;
+
default:
return op_code != OP_NOT_VSPACE;
}
@@ -3265,28 +3239,9 @@
case ESC_H:
switch(c)
{
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0:
- case 0x1680:
- case 0x180e:
- case 0x2000:
- case 0x2001:
- case 0x2002:
- case 0x2003:
- case 0x2004:
- case 0x2005:
- case 0x2006:
- case 0x2007:
- case 0x2008:
- case 0x2009:
- case 0x200A:
- case 0x202f:
- case 0x205f:
- case 0x3000:
-#endif /* Not EBCDIC */
+ HSPACE_CASES:
return -next != ESC_h;
+
default:
return -next == ESC_h;
}
@@ -3295,16 +3250,9 @@
case ESC_V:
switch(c)
{
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028:
- case 0x2029:
-#endif /* Not EBCDIC */
+ VSPACE_CASES:
return -next != ESC_v;
+
default:
return -next == ESC_v;
}
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2012-09-16 06:52:27 UTC (rev 1040)
+++ code/trunk/pcre_dfa_exec.c 2012-09-16 10:16:27 UTC (rev 1041)
@@ -1448,15 +1448,7 @@
BOOL OK;
switch (c)
{
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028:
- case 0x2029:
-#endif /* Not EBCDIC */
+ VSPACE_CASES:
OK = TRUE;
break;
@@ -1489,27 +1481,7 @@
BOOL OK;
switch (c)
{
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
+ HSPACE_CASES:
OK = TRUE;
break;
@@ -1729,15 +1701,7 @@
BOOL OK;
switch (c)
{
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028:
- case 0x2029:
-#endif /* Not EBCDIC */
+ VSPACE_CASES:
OK = TRUE;
break;
@@ -1777,27 +1741,7 @@
BOOL OK;
switch (c)
{
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
+ HSPACE_CASES:
OK = TRUE;
break;
@@ -1999,15 +1943,7 @@
BOOL OK;
switch (c)
{
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028:
- case 0x2029:
-#endif /* Not EBCDIC */
+ VSPACE_CASES:
OK = TRUE;
break;
@@ -2043,27 +1979,7 @@
BOOL OK;
switch (c)
{
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
+ HSPACE_CASES:
OK = TRUE;
break;
@@ -2206,15 +2122,7 @@
case OP_NOT_VSPACE:
if (clen > 0) switch(c)
{
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028:
- case 0x2029:
-#endif /* Not EBCDIC */
+ VSPACE_CASES:
break;
default:
@@ -2227,19 +2135,12 @@
case OP_VSPACE:
if (clen > 0) switch(c)
{
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028:
- case 0x2029:
-#endif /* Not EBCDIC */
+ VSPACE_CASES:
ADD_NEW(state_offset + 1, 0);
break;
- default: break;
+ default:
+ break;
}
break;
@@ -2247,27 +2148,7 @@
case OP_NOT_HSPACE:
if (clen > 0) switch(c)
{
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
+ HSPACE_CASES:
break;
default:
@@ -2280,29 +2161,12 @@
case OP_HSPACE:
if (clen > 0) switch(c)
{
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
+ HSPACE_CASES:
ADD_NEW(state_offset + 1, 0);
break;
+
+ default:
+ break;
}
break;
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2012-09-16 06:52:27 UTC (rev 1040)
+++ code/trunk/pcre_exec.c 2012-09-16 10:16:27 UTC (rev 1041)
@@ -2429,10 +2429,10 @@
case CHAR_VT:
case CHAR_FF:
case CHAR_NEL:
-#ifndef EBCDIC
+#ifndef EBCDIC
case 0x2028:
case 0x2029:
-#endif /* Not EBCDIC */
+#endif /* Not EBCDIC */
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
@@ -2448,29 +2448,8 @@
GETCHARINCTEST(c, eptr);
switch(c)
{
+ HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
default: break;
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
- RRETURN(MATCH_NOMATCH);
}
ecode++;
break;
@@ -2484,29 +2463,8 @@
GETCHARINCTEST(c, eptr);
switch(c)
{
+ HSPACE_CASES: break; /* Byte and multibyte cases */
default: RRETURN(MATCH_NOMATCH);
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
- break;
}
ecode++;
break;
@@ -2520,17 +2478,8 @@
GETCHARINCTEST(c, eptr);
switch(c)
{
+ VSPACE_CASES: RRETURN(MATCH_NOMATCH);
default: break;
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
-#endif /* Not EBCDIC */
- RRETURN(MATCH_NOMATCH);
}
ecode++;
break;
@@ -2544,17 +2493,8 @@
GETCHARINCTEST(c, eptr);
switch(c)
{
+ VSPACE_CASES: break;
default: RRETURN(MATCH_NOMATCH);
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
-#endif /* Not EBCDIC */
- break;
}
ecode++;
break;
@@ -2652,19 +2592,19 @@
RRETURN(MATCH_NOMATCH);
}
else
- {
- int lgb, rgb;
+ {
+ int lgb, rgb;
GETCHARINCTEST(c, eptr);
- lgb = UCD_GRAPHBREAK(c);
+ lgb = UCD_GRAPHBREAK(c);
while (eptr < md->end_subject)
{
int len = 1;
if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
- rgb = UCD_GRAPHBREAK(c);
+ rgb = UCD_GRAPHBREAK(c);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- lgb = rgb;
+ lgb = rgb;
eptr += len;
- }
+ }
}
CHECK_PARTIAL();
ecode++;
@@ -4243,19 +4183,19 @@
RRETURN(MATCH_NOMATCH);
}
else
- {
- int lgb, rgb;
+ {
+ int lgb, rgb;
GETCHARINCTEST(c, eptr);
- lgb = UCD_GRAPHBREAK(c);
+ lgb = UCD_GRAPHBREAK(c);
while (eptr < md->end_subject)
{
int len = 1;
if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
- rgb = UCD_GRAPHBREAK(c);
+ rgb = UCD_GRAPHBREAK(c);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- lgb = rgb;
+ lgb = rgb;
eptr += len;
- }
+ }
}
CHECK_PARTIAL();
}
@@ -4333,10 +4273,10 @@
case CHAR_VT:
case CHAR_FF:
case CHAR_NEL:
-#ifndef EBCDIC
+#ifndef EBCDIC
case 0x2028:
case 0x2029:
-#endif /* Not EBCDIC */
+#endif /* Not EBCDIC */
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
@@ -4354,29 +4294,8 @@
GETCHARINC(c, eptr);
switch(c)
{
+ HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
default: break;
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
- RRETURN(MATCH_NOMATCH);
}
}
break;
@@ -4392,29 +4311,8 @@
GETCHARINC(c, eptr);
switch(c)
{
+ HSPACE_CASES: break; /* Byte and multibyte cases */
default: RRETURN(MATCH_NOMATCH);
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif
- break;
}
}
break;
@@ -4430,17 +4328,8 @@
GETCHARINC(c, eptr);
switch(c)
{
+ VSPACE_CASES: RRETURN(MATCH_NOMATCH);
default: break;
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
-#endif
- RRETURN(MATCH_NOMATCH);
}
}
break;
@@ -4456,17 +4345,8 @@
GETCHARINC(c, eptr);
switch(c)
{
+ VSPACE_CASES: break;
default: RRETURN(MATCH_NOMATCH);
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
-#endif
- break;
}
}
break;
@@ -4655,29 +4535,10 @@
switch(*eptr++)
{
default: break;
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
+ HSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* COMPILE_PCRE16 */
-#endif /* Not EBCDIC */
+ HSPACE_MULTIBYTE_CASES:
+#endif
RRETURN(MATCH_NOMATCH);
}
}
@@ -4694,29 +4555,10 @@
switch(*eptr++)
{
default: RRETURN(MATCH_NOMATCH);
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
+ HSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* COMPILE_PCRE16 */
-#endif /* Not EBCDIC */
+ HSPACE_MULTIBYTE_CASES:
+#endif
break;
}
}
@@ -4732,17 +4574,12 @@
}
switch(*eptr++)
{
- default: break;
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
+ VSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
+ VSPACE_MULTIBYTE_CASES:
#endif
RRETURN(MATCH_NOMATCH);
+ default: break;
}
}
break;
@@ -4758,14 +4595,9 @@
switch(*eptr++)
{
default: RRETURN(MATCH_NOMATCH);
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
+ VSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
+ VSPACE_MULTIBYTE_CASES:
#endif
break;
}
@@ -5066,19 +4898,19 @@
RRETURN(MATCH_NOMATCH);
}
else
- {
- int lgb, rgb;
+ {
+ int lgb, rgb;
GETCHARINCTEST(c, eptr);
- lgb = UCD_GRAPHBREAK(c);
+ lgb = UCD_GRAPHBREAK(c);
while (eptr < md->end_subject)
{
int len = 1;
if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
- rgb = UCD_GRAPHBREAK(c);
+ rgb = UCD_GRAPHBREAK(c);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- lgb = rgb;
+ lgb = rgb;
eptr += len;
- }
+ }
}
CHECK_PARTIAL();
}
@@ -5127,17 +4959,17 @@
case CHAR_CR:
if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
break;
-
+
case CHAR_LF:
break;
case CHAR_VT:
case CHAR_FF:
case CHAR_NEL:
-#ifndef EBCDIC
+#ifndef EBCDIC
case 0x2028:
case 0x2029:
-#endif /* Not EBCDIC */
+#endif /* Not EBCDIC */
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
@@ -5146,92 +4978,32 @@
case OP_NOT_HSPACE:
switch(c)
{
+ HSPACE_CASES: RRETURN(MATCH_NOMATCH);
default: break;
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
- RRETURN(MATCH_NOMATCH);
}
break;
case OP_HSPACE:
switch(c)
{
+ HSPACE_CASES: break;
default: RRETURN(MATCH_NOMATCH);
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
- break;
}
break;
case OP_NOT_VSPACE:
switch(c)
{
+ VSPACE_CASES: RRETURN(MATCH_NOMATCH);
default: break;
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
-#endif /* Not EBCDIC */
- RRETURN(MATCH_NOMATCH);
}
break;
case OP_VSPACE:
switch(c)
{
+ VSPACE_CASES: break;
default: RRETURN(MATCH_NOMATCH);
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
-#endif /* Not EBCDIC */
- break;
}
break;
@@ -5332,29 +5104,10 @@
switch(c)
{
default: break;
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
+ HSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* COMPILE_PCRE16 */
-#endif /* Not EBCDIC */
+ HSPACE_MULTIBYTE_CASES:
+#endif
RRETURN(MATCH_NOMATCH);
}
break;
@@ -5363,29 +5116,10 @@
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
+ HSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* COMPILE_PCRE16 */
-#endif /* Not EBCDIC */
+ HSPACE_MULTIBYTE_CASES:
+#endif
break;
}
break;
@@ -5394,14 +5128,9 @@
switch(c)
{
default: break;
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
+ VSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
+ VSPACE_MULTIBYTE_CASES:
#endif
RRETURN(MATCH_NOMATCH);
}
@@ -5411,14 +5140,9 @@
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
+ VSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
+ VSPACE_MULTIBYTE_CASES:
#endif
break;
}
@@ -5651,19 +5375,19 @@
break;
}
else
- {
- int lgb, rgb;
+ {
+ int lgb, rgb;
GETCHARINCTEST(c, eptr);
- lgb = UCD_GRAPHBREAK(c);
+ lgb = UCD_GRAPHBREAK(c);
while (eptr < md->end_subject)
{
int len = 1;
if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
- rgb = UCD_GRAPHBREAK(c);
+ rgb = UCD_GRAPHBREAK(c);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- lgb = rgb;
+ lgb = rgb;
eptr += len;
- }
+ }
}
CHECK_PARTIAL();
}
@@ -5802,10 +5526,10 @@
{
if (c != CHAR_LF &&
(md->bsr_anycrlf ||
- (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
-#ifndef EBCDIC
+ (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
+#ifndef EBCDIC
&& c != 0x2028 && c != 0x2029
-#endif /* Not EBCDIC */
+#endif /* Not EBCDIC */
)))
break;
eptr += len;
@@ -5827,30 +5551,8 @@
GETCHARLEN(c, eptr, len);
switch(c)
{
+ HSPACE_CASES: gotspace = TRUE; break;
default: gotspace = FALSE; break;
- case CHAR_HT:
- case CHAR_SPACE:
-#ifndef EBCDIC
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
-#endif /* Not EBCDIC */
- gotspace = TRUE;
- break;
}
if (gotspace == (ctype == OP_NOT_HSPACE)) break;
eptr += len;
@@ -5871,18 +5573,8 @@
GETCHARLEN(c, eptr, len);
switch(c)
{
+ VSPACE_CASES: gotspace = TRUE; break;
default: gotspace = FALSE; break;
- case CHAR_LF:
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_CR:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
-#endif /* Not EBCDIC */
- gotspace = TRUE;
- break;
}
if (gotspace == (ctype == OP_NOT_VSPACE)) break;
eptr += len;
@@ -6074,18 +5766,17 @@
SCHECK_PARTIAL();
break;
}
- c = *eptr;
- if (c == CHAR_HT || c == CHAR_SPACE
-#ifndef EBCDIC
- || c == 0xa0
+ switch(*eptr)
+ {
+ default: eptr++; break;
+ HSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
- || c == 0x202f || c == 0x205f || c == 0x3000
-#endif /* COMPILE_PCRE16 */
-#endif /* Not EBCDIC */
- ) break;
- eptr++;
+ HSPACE_MULTIBYTE_CASES:
+#endif
+ goto ENDLOOP00;
+ }
}
+ ENDLOOP00:
break;
case OP_HSPACE:
@@ -6096,18 +5787,17 @@
SCHECK_PARTIAL();
break;
}
- c = *eptr;
- if (c != CHAR_HT && c != CHAR_SPACE
-#ifndef EBCDIC
- && c != 0xa0
+ switch(*eptr)
+ {
+ default: goto ENDLOOP01;
+ HSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
- && c != 0x202f && c != 0x205f && c != 0x3000
-#endif /* COMPILE_PCRE16 */
-#endif /* Not EBCDIC */
- ) break;
- eptr++;
+ HSPACE_MULTIBYTE_CASES:
+#endif
+ eptr++; break;
+ }
}
+ ENDLOOP01:
break;
case OP_NOT_VSPACE:
@@ -6118,15 +5808,17 @@
SCHECK_PARTIAL();
break;
}
- c = *eptr;
- if (c == CHAR_LF || c == CHAR_VT || c == CHAR_FF ||
- c == CHAR_CR || c == CHAR_NEL
+ switch(*eptr)
+ {
+ default: eptr++; break;
+ VSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- || c == 0x2028 || c == 0x2029
+ VSPACE_MULTIBYTE_CASES:
#endif
- ) break;
- eptr++;
+ goto ENDLOOP02;
+ }
}
+ ENDLOOP02:
break;
case OP_VSPACE:
@@ -6137,15 +5829,17 @@
SCHECK_PARTIAL();
break;
}
- c = *eptr;
- if (c != CHAR_LF && c != CHAR_VT && c != CHAR_FF &&
- c != CHAR_CR && c != CHAR_NEL
+ switch(*eptr)
+ {
+ default: goto ENDLOOP03;
+ VSPACE_BYTE_CASES:
#ifdef COMPILE_PCRE16
- && c != 0x2028 && c != 0x2029
+ VSPACE_MULTIBYTE_CASES:
#endif
- ) break;
- eptr++;
+ eptr++; break;
+ }
}
+ ENDLOOP03:
break;
case OP_NOT_DIGIT:
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2012-09-16 06:52:27 UTC (rev 1040)
+++ code/trunk/pcre_internal.h 2012-09-16 10:16:27 UTC (rev 1041)
@@ -529,11 +529,11 @@
#define MAX_MARK ((1 << (sizeof(pcre_uchar)*8)) - 1)
/* When UTF encoding is being used, a character is no longer just a single
-character. The macros for character handling generate simple sequences when
-used in character-mode, and more complicated ones for UTF characters.
-GETCHARLENTEST and other macros are not used when UTF is not supported,
-so they are not defined. To make sure they can never even appear when
-UTF support is omitted, we don't even define them. */
+byte. The macros for character handling generate simple sequences when used in
+character-mode, and more complicated ones for UTF characters. GETCHARLENTEST
+and other macros are not used when UTF is not supported, so they are not
+defined. To make sure they can never even appear when UTF support is omitted,
+we don't even define them. */
#ifndef SUPPORT_UTF
@@ -832,6 +832,68 @@
#endif /* SUPPORT_UTF */
+/* Tests for Unicode horizontal and vertical whitespace characters must check a
+number of different values. Using a switch statement for this generates the
+fastest code (no loop, no memory access), and there are several places where
+this happens. In order to ensure that all the case lists remain in step, we use
+macros so that there is only one place where the lists are defined.
+
+NOTE: These values are also used explicitly in pcre_compile.c when processing
+\h, \H, \v and \V in a character class, so any changes here should be
+duplicated there as well. They also appear in pcre_jit_compile.c. */
+
+/* Each list macro below expands to a run of case labels that deliberately
+omits the colon after the last label; the colon is written at the point of use,
+for example "HSPACE_CASES:". The combined HSPACE_CASES and VSPACE_CASES lists
+are therefore defined separately for each environment: in EBCDIC the multibyte
+lists are empty, and joining an empty list onto the byte list with a colon
+would leave a stray ':' after the final label wherever the combined macro is
+used. */
+
+#ifndef EBCDIC
+#define HSPACE_MULTIBYTE_CASES \
+ case 0x1680: /* OGHAM SPACE MARK */ \
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \
+ case 0x2000: /* EN QUAD */ \
+ case 0x2001: /* EM QUAD */ \
+ case 0x2002: /* EN SPACE */ \
+ case 0x2003: /* EM SPACE */ \
+ case 0x2004: /* THREE-PER-EM SPACE */ \
+ case 0x2005: /* FOUR-PER-EM SPACE */ \
+ case 0x2006: /* SIX-PER-EM SPACE */ \
+ case 0x2007: /* FIGURE SPACE */ \
+ case 0x2008: /* PUNCTUATION SPACE */ \
+ case 0x2009: /* THIN SPACE */ \
+ case 0x200A: /* HAIR SPACE */ \
+ case 0x202f: /* NARROW NO-BREAK SPACE */ \
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \
+ case 0x3000 /* IDEOGRAPHIC SPACE */
+
+#define HSPACE_BYTE_CASES \
+ case CHAR_HT: \
+ case CHAR_SPACE: \
+ case 0xa0 /* NBSP */
+
+#define VSPACE_MULTIBYTE_CASES \
+ case 0x2028: /* LINE SEPARATOR */ \
+ case 0x2029 /* PARAGRAPH SEPARATOR */
+
+#define HSPACE_CASES \
+ HSPACE_BYTE_CASES: \
+ HSPACE_MULTIBYTE_CASES
+
+#define VSPACE_CASES \
+ VSPACE_BYTE_CASES: \
+ VSPACE_MULTIBYTE_CASES
+
+#else /* EBCDIC */
+
+/* The multibyte lists are kept as empty macros so that the names exist in
+every build. NOTE(review): because they are empty they cannot be followed
+directly by a colon; the uses in pcre_exec.c are inside COMPILE_PCRE16
+conditionals, so confirm that EBCDIC is never combined with the 16-bit
+library. */
+
+#define HSPACE_MULTIBYTE_CASES
+#define VSPACE_MULTIBYTE_CASES
+
+#define HSPACE_BYTE_CASES \
+ case CHAR_HT: \
+ case CHAR_SPACE
+
+/* With no multibyte characters, the combined lists are just the byte lists. */
+
+#define HSPACE_CASES HSPACE_BYTE_CASES
+#define VSPACE_CASES VSPACE_BYTE_CASES
+#endif /* EBCDIC */
+
+/* The single-byte vertical white space list is the same in both environments.
+It may follow the definitions of VSPACE_CASES above because a macro body is
+expanded only at the point where the macro is used. */
+
+#define VSPACE_BYTE_CASES \
+ case CHAR_LF: \
+ case CHAR_VT: \
+ case CHAR_FF: \
+ case CHAR_CR: \
+ case CHAR_NEL
+
/* In case there is no definition of offsetof() provided - though any proper
Standard C system should have one. */
@@ -946,15 +1008,15 @@
/* UTF-8 support is not enabled; use the platform-dependent character literals
so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF
-mode. Newline characters are problematic in EBCDIC. Though it has CR and LF
+mode. Newline characters are problematic in EBCDIC. Though it has CR and LF
characters, a common practice has been to use its NL (0x15) character as the
-line terminator in C-like processing environments. However, sometimes the LF
+line terminator in C-like processing environments. However, sometimes the LF
(0x25) character is used instead, according to this Unicode document:
http://unicode.org/standard/reports/tr13/tr13-5.html
-PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25
-instead. Whichever is *not* chosen is defined as NEL.
+PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25
+instead. Whichever is *not* chosen is defined as NEL.
In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the
same code point. */
@@ -983,7 +1045,7 @@
#else /* Not EBCDIC */
-/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for
+/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for
compatibility. NEL is the Unicode newline character; make sure it is
a positive value. */
@@ -2083,7 +2145,7 @@
int external_flags; /* External flag bits to be set */
int req_varyopt; /* "After variable item" flag for reqbyte */
BOOL had_accept; /* (*ACCEPT) encountered */
- BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
+ BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
BOOL check_lookbehind; /* Lookbehinds need later checking */
int nltype; /* Newline type */
int nllen; /* Newline string length */