Revision: 1080
http://vcs.pcre.org/viewvc?view=rev&revision=1080
Author: chpe
Date: 2012-10-16 16:55:07 +0100 (Tue, 16 Oct 2012)
Log Message:
-----------
pcre32: fullinfo: Add variants of (FIRST|LAST)LITERAL that are 32-bit clean
Since for pcre32 the whole range of the output is already used up
for the character itself, return the special values separately.
Modified Paths:
--------------
code/trunk/doc/pcre_fullinfo.3
code/trunk/doc/pcreapi.3
code/trunk/pcre.h.in
code/trunk/pcre_fullinfo.c
code/trunk/pcretest.c
Modified: code/trunk/doc/pcre_fullinfo.3
===================================================================
--- code/trunk/doc/pcre_fullinfo.3 2012-10-16 15:55:04 UTC (rev 1079)
+++ code/trunk/doc/pcre_fullinfo.3 2012-10-16 15:55:07 UTC (rev 1080)
@@ -53,6 +53,16 @@
PCRE_INFO_OPTIONS Option bits used for compilation
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
+ PCRE_INFO_FIRSTLITERAL Literal first data character for a match (if any; else 0)
+ PCRE_INFO_FIRSTLITERALSET Returns
+ 1 if there is a first data character set, which can
+ then be retrieved using PCRE_INFO_FIRSTLITERAL,
+ 2 if the first character is at the start of the data
+ string or after a newline, and
+ 0 otherwise
+ PCRE_INFO_LASTLITERAL2 Literal last data character required (if any; else 0)
+ PCRE_INFO_LASTLITERAL2SET Returns 1 if the last data character is set (which can then
+ be retrieved using PCRE_INFO_LASTLITERAL2); 0 otherwise
.sp
The \fIwhere\fP argument must point to an integer variable, except for the
following \fIwhat\fP values:
@@ -64,6 +74,8 @@
PCRE_INFO_NAMETABLE const unsigned char * (8-bit library)
PCRE_INFO_OPTIONS unsigned long int
PCRE_INFO_SIZE size_t
+ PCRE_INFO_FIRSTLITERAL uint32_t
+ PCRE_INFO_LASTLITERAL2 uint32_t
.sp
The yield of the function is zero on success or:
.sp
Modified: code/trunk/doc/pcreapi.3
===================================================================
--- code/trunk/doc/pcreapi.3 2012-10-16 15:55:04 UTC (rev 1079)
+++ code/trunk/doc/pcreapi.3 2012-10-16 15:55:07 UTC (rev 1080)
@@ -1235,6 +1235,11 @@
-1 is returned, indicating that the pattern matches only at the start of a
subject string or after any newline within the string. Otherwise -2 is
returned. For anchored patterns, -2 is returned.
+.P
+Since for the 32-bit library using the non-UTF-32 mode, this function is unable
+to return the full 32-bit range of the character, this value is deprecated;
+instead the PCRE_INFO_FIRSTLITERALSET and PCRE_INFO_FIRSTLITERAL values should
+be used.
.sp
PCRE_INFO_FIRSTTABLE
.sp
@@ -1282,6 +1287,11 @@
only if it follows something of variable length. For example, for the pattern
/^a\ed+z\ed+/ the returned value is "z", but for /^a\edz\ed/ the returned value
is -1.
+.P
+Since for the 32-bit library using the non-UTF-32 mode, this function is unable
+to return the full 32-bit range of the character, this value is deprecated;
+instead the PCRE_INFO_LASTLITERAL2SET and PCRE_INFO_LASTLITERAL2 values should
+be used.
.sp
PCRE_INFO_MAXLOOKBEHIND
.sp
@@ -1425,6 +1435,69 @@
\fBpcreprecompile\fP
.\"
documentation for details).
+.sp
+ PCRE_INFO_FIRSTLITERALSET
+.sp
+Return information about the first data unit of any matched string, for a
+non-anchored pattern. The fourth argument should point to an \fBint\fP
+variable.
+.P
+If there is a fixed first value, for example, the letter "c" from a pattern
+such as (cat|cow|coyote), 1 is returned, and the character value can be
+retrieved using PCRE_INFO_FIRSTLITERAL.
+.P
+If there is no fixed first value, and if either
+.sp
+(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
+starts with "^", or
+.sp
+(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
+(if it were set, the pattern would be anchored),
+.sp
+2 is returned, indicating that the pattern matches only at the start of a
+subject string or after any newline within the string. Otherwise 0 is
+returned. For anchored patterns, 0 is returned.
+.sp
+ PCRE_INFO_FIRSTLITERAL
+.sp
+Return the fixed first character value, if PCRE_INFO_FIRSTLITERALSET returned 1;
+otherwise returns 0. The fourth argument should point to an \fBuint32_t\fP
+variable.
+.P
+In the 8-bit library, the value is always less than 256. In the 16-bit library
+the value can be up to 0xffff. In the 32-bit library in UTF-32 mode the value
+can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 mode.
+.P
+If there is no fixed first value, and if either
+.sp
+(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
+starts with "^", or
+.sp
+(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
+(if it were set, the pattern would be anchored),
+.sp
+0 is returned, and PCRE_INFO_FIRSTLITERALSET yields 2 to indicate that the pattern
+matches only at the start of a subject string or after any newline within the
+string. Otherwise, including for anchored patterns, 0 is returned.
+.sp
+ PCRE_INFO_LASTLITERAL2SET
+.sp
+Returns 1 if there is a rightmost literal data unit that must exist in any matched
+string, other than at its start. The fourth argument should point to an \fBint\fP
+variable. If there is no such value, 0 is returned. If returning 1, the character
+value itself can be retrieved using PCRE_INFO_LASTLITERAL2.
+.P
+For anchored patterns, a last literal value is recorded only if it follows something
+of variable length. For example, for the pattern /^a\ed+z\ed+/ the returned value is
+1 (with "z" returned from PCRE_INFO_LASTLITERAL2), but for /^a\edz\ed/ the returned
+value is 0.
+.sp
+ PCRE_INFO_LASTLITERAL2
+.sp
+Return the value of the rightmost literal data unit that must exist in any
+matched string, other than at its start, if such a value has been recorded. The
+fourth argument should point to an \fBuint32_t\fP variable. If there is no such
+value, 0 is returned.
.
.
.SH "REFERENCE COUNTS"
Modified: code/trunk/pcre.h.in
===================================================================
--- code/trunk/pcre.h.in 2012-10-16 15:55:04 UTC (rev 1079)
+++ code/trunk/pcre.h.in 2012-10-16 15:55:07 UTC (rev 1080)
@@ -246,6 +246,10 @@
#define PCRE_INFO_JIT 16
#define PCRE_INFO_JITSIZE 17
#define PCRE_INFO_MAXLOOKBEHIND 18
+#define PCRE_INFO_FIRSTLITERAL 19
+#define PCRE_INFO_FIRSTLITERALSET 20
+#define PCRE_INFO_LASTLITERAL2 21
+#define PCRE_INFO_LASTLITERAL2SET 22
/* Request types for pcre_config(). Do not re-arrange, in order to remain
compatible. */
Modified: code/trunk/pcre_fullinfo.c
===================================================================
--- code/trunk/pcre_fullinfo.c 2012-10-16 15:55:04 UTC (rev 1079)
+++ code/trunk/pcre_fullinfo.c 2012-10-16 15:55:07 UTC (rev 1080)
@@ -140,6 +140,17 @@
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
break;
+ case PCRE_INFO_FIRSTLITERAL:
+ *((pcre_uint32 *)where) =
+ (re->flags & PCRE_FIRSTSET) != 0 ? re->first_char : 0;
+ break;
+
+ case PCRE_INFO_FIRSTLITERALSET:
+ *((int *)where) =
+ ((re->flags & PCRE_FIRSTSET) != 0) ? 1 :
+ ((re->flags & PCRE_STARTLINE) != 0) ? 2 : 0;
+ break;
+
/* Make sure we pass back the pointer to the bit vector in the external
block, not the internal copy (with flipped integer fields). */
@@ -166,6 +177,16 @@
((re->flags & PCRE_REQCHSET) != 0)? re->req_char : -1;
break;
+ case PCRE_INFO_LASTLITERAL2:
+ *((pcre_uint32 *)where) =
+ ((re->flags & PCRE_REQCHSET) != 0) ? re->req_char : 0;
+ break;
+
+ case PCRE_INFO_LASTLITERAL2SET:
+ *((int *)where) =
+ ((re->flags & PCRE_REQCHSET) != 0);
+ break;
+
case PCRE_INFO_NAMEENTRYSIZE:
*((int *)where) = re->name_entry_size;
break;
Modified: code/trunk/pcretest.c
===================================================================
--- code/trunk/pcretest.c 2012-10-16 15:55:04 UTC (rev 1079)
+++ code/trunk/pcretest.c 2012-10-16 15:55:07 UTC (rev 1080)
@@ -3757,7 +3757,8 @@
if (do_showinfo)
{
unsigned long int all_options;
- int count, backrefmax, first_char, need_char, okpartial, jchanged,
+ pcre_uint32 first_char, need_char;
+ int count, backrefmax, first_char_set, need_char_set, okpartial, jchanged,
hascrorlf, maxlookbehind;
int nameentrysize, namecount;
const pcre_uint8 *nametable;
@@ -3765,8 +3766,10 @@
if (new_info(re, NULL, PCRE_INFO_SIZE, &size) +
new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count) +
new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax) +
- new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char) +
- new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char) +
+ new_info(re, NULL, PCRE_INFO_FIRSTLITERAL, &first_char) +
+ new_info(re, NULL, PCRE_INFO_FIRSTLITERALSET, &first_char_set) +
+ new_info(re, NULL, PCRE_INFO_LASTLITERAL2, &need_char) +
+ new_info(re, NULL, PCRE_INFO_LASTLITERAL2SET, &need_char_set) +
new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize) +
new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount) +
new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable) +
@@ -3865,16 +3868,12 @@
break;
}
- if (first_char == -1)
+ if (first_char_set == 2)
{
fprintf(outfile, "First char at start or follows newline\n");
}
- else if (first_char < 0)
+ else if (first_char_set == 1)
{
- fprintf(outfile, "No first char\n");
- }
- else
- {
const char *caseless =
((REAL_PCRE_FLAGS(re) & PCRE_FCH_CASELESS) == 0)?
"" : " (caseless)";
@@ -3888,8 +3887,12 @@
fprintf(outfile, "%s\n", caseless);
}
}
+ else
+ {
+ fprintf(outfile, "No first char\n");
+ }
- if (need_char < 0)
+ if (need_char_set == 0)
{
fprintf(outfile, "No need char\n");
}