Hi,
I ran a performance test with PCRE 8.12. I want to understand the
performance difference between the following two cases.
Case 1:
Match the data source against each of the following 4 regular expressions separately.
\btype\b\W*?\btext\b\W*?\bjavascript\b
\burl\b\W*?\bshell:
<input\b.*?\btype\b\W*?\bimage\b
\bonkeyup\b\W*?\=
Case 2:
Merge the 4 rules above into a single rule, joined with '|' as follows, and
match it against the same data source as in case 1.
\btype\b\W*?\btext\b\W*?\bjavascript\b|\burl\b\W*?\bshell:|<input\b.*?\btype\b\W*?\bimage\b|\bonkeyup\b\W*?\=
The following is my test results.
For case 1, it takes about 2.3 seconds to perform 100000 matching iterations.
For case 2, it takes about 24 seconds to do the same work as in case 1.
These results really surprised me. I had expected case 2 to
perform better. On the contrary, case 2 performs much
worse than case 1, and I don't know why. Could you
please explain the reason? Is there a problem with my test case?
My test code and data are attached.
Thanks a lot!
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "pcre.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
/* Capacity of the subject buffer in bytes: 1024 * 320 = 320 KiB. */
#define SRC_STR_LEN (1024 * 320)
/* File-scope buffer that main() fills from ./source_data and then passes to
 * pcre_exec() as the subject string. */
char src_str[SRC_STR_LEN];
int main()
{
pcre *big_exp;
pcre *small_exp[4];
char *big_pat = "\\btype\\b\\W*?\\btext\\b\\W*?\\bjavascript\\b|\\burl\\b\\W*?\\bshell:|<input\\b.*?\\btype\\b\\W*?\\bimage\\b|\bonkeyup\\b\\W*?\\=";
char *small_pat[] = {"\\btype\\b\\W*?\\btext\\b\\W*?\\bjavascript\\b", "\\burl\\b\\W*?\\bshell:", "<input\\b.*?\\btype\\b\\W*?\\bimage\\b", "\\bonkeyup\\b\\W*?\\=", NULL};
const char *err;
int err_offset, i = 0, ret = 0, fd, j, k;
fd = open("./source_data", O_RDONLY);
if (fd < 0) {
perror("open()");
}
ret = read(fd, src_str, sizeof(src_str));
printf("read %d bytes from file\n", ret);
close(fd);
printf("big pattern: %s\n", big_pat);
for (i = 0; small_pat[i] != NULL; i++) {
printf("small pattern[%d]: %s\n", i, small_pat[i]);
}
big_exp = pcre_compile(big_pat, 0, &err, &err_offset, NULL);
if (big_exp == NULL) {
printf("pcre_compile failed.\n");
return -1;
}
for (i = 0; i < 4; i++) {
small_exp[i] = pcre_compile(small_pat[i], 0, &err, &err_offset, NULL);
if (small_exp[i] == NULL) {
printf("pcre compile failed...\n");
return -1;
}
}
for (i = 0; i < 100000; i++) {
//#if 0
/* Case 1 */
for (k = 0; k < 4; k++) {
j = pcre_exec(small_exp[k], NULL, src_str, ret, 0, 0, 0, 0);
}
//#endif
#if 0
/* Case 2 */
j = pcre_exec(big_exp, NULL, src_str, ret, 0, 0, 0, 0);
#endif
}
pcre_free(big_exp);
return 0;
}