Revision: 1688
http://vcs.pcre.org/viewvc?view=rev&revision=1688
Author: ph10
Date: 2017-02-24 17:30:30 +0000 (Fri, 24 Feb 2017)
Log Message:
-----------
Fix Unicode property crash for 32-bit characters greater than 0x10ffff.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/maint/MultiStage2.py
code/trunk/pcre_internal.h
code/trunk/pcre_ucd.c
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2017-02-23 17:25:44 UTC (rev 1687)
+++ code/trunk/ChangeLog 2017-02-24 17:30:30 UTC (rev 1688)
@@ -10,7 +10,7 @@
1. Fixed typo in CMakeLists.txt (wrong number of arguments for
PCRE_STATIC_RUNTIME (affects MSVC only).
-2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline
+2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline
mode with --only-matching matched several lines, it restarted scanning at the
next line instead of moving on to the end of the matched string, which can be
several lines after the start.
@@ -29,6 +29,10 @@
(a) Check for values < 256 when calling isprint() in pcretest.
(b) Give an error for too big a number after \O.
+
+7. In the 32-bit library in non-UTF mode, an attempt to find a Unicode
+property for a character with a code point greater than 0x10ffff (the Unicode
+maximum) caused a crash.
Version 8.40 11-January-2017
Modified: code/trunk/maint/MultiStage2.py
===================================================================
--- code/trunk/maint/MultiStage2.py 2017-02-23 17:25:44 UTC (rev 1687)
+++ code/trunk/maint/MultiStage2.py 2017-02-24 17:30:30 UTC (rev 1688)
@@ -1,5 +1,7 @@
#! /usr/bin/python
+# WARNING! This is a Python 2 script.
+
# Multistage table builder
# (c) Peter Kankowski, 2008
@@ -15,10 +17,10 @@
# ./MultiStage2.py >../pcre_ucd.c
#
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
-# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
-# Unicode.tables subdirectory. The first of these is found in the "extracted"
-# subdirectory of the Unicode database (UCD) on the Unicode web site; the
-# second is in the "auxiliary" subdirectory; the other two are directly in the
+# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
+# Unicode.tables subdirectory. The first of these is found in the "extracted"
+# subdirectory of the Unicode database (UCD) on the Unicode web site; the
+# second is in the "auxiliary" subdirectory; the other two are directly in the
# UCD directory.
#
# Minor modifications made to this script:
@@ -42,7 +44,7 @@
# code scans CaseFolding.txt instead of UnicodeData.txt.
#
# The main tables generated by this script are used by macros defined in
-# pcre_internal.h. They look up Unicode character properties using short
+# pcre_internal.h. They look up Unicode character properties using short
# sequences of code that contains no branches, which makes for greater speed.
#
# Conceptually, there is a table of records (of type ucd_record), containing a
@@ -69,13 +71,13 @@
# Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0
# lookup 97 in the first table in stage2 yields 16
-# record 17 is { 33, 5, 11, 0, -32 }
+# record 17 is { 33, 5, 11, 0, -32 }
# 33 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter
# 11 = ucp_gbOther => Grapheme break property "Other"
# 0 => not part of a caseless set
# -32 => Other case is U+0041
-#
+#
# Almost all lowercase latin characters resolve to the same record. One or two
# are different because they are part of a multi-character caseless set (for
# example, k, K and the Kelvin symbol are such a set).
@@ -83,17 +85,17 @@
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 88
# lookup 66 in the 88th table in stage2 yields 467
-# record 470 is { 26, 7, 11, 0, 0 }
+# record 470 is { 26, 7, 11, 0, 0 }
# 26 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter
# 11 = ucp_gbOther => Grapheme break property "Other"
# 0 => not part of a caseless set
-# 0 => No other case
+# 0 => No other case
#
# In these examples, no other blocks resolve to the same "virtual" block, as it
# happens, but plenty of other blocks do share "virtual" blocks.
#
-# There is a fourth table, maintained by hand, which translates from the
+# There is a fourth table, maintained by hand, which translates from the
# individual character types such as ucp_Cc to the general types like ucp_C.
#
# Philip Hazel, 03 July 2008
@@ -101,8 +103,8 @@
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
# July-2012: Updated list of scripts for Unicode 6.1.0
-# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
-# field in the record to hold the value. Luckily, the
+# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
+# field in the record to hold the value. Luckily, the
# structure had a hole in it, so the resulting table is
# not much bigger than before.
# 18-September-2012: Added code for multiple caseless sets. This uses the
@@ -144,14 +146,14 @@
if m.group(3) is None:
last = char
else:
- last = int(m.group(3), 16)
+ last = int(m.group(3), 16)
for i in range(char, last + 1):
# It is important not to overwrite a previously set
# value because in the CaseFolding file there are lines
- # to be ignored (returning the default value of 0)
- # which often come after a line which has already set
- # data.
- if table[i] == default_value:
+ # to be ignored (returning the default value of 0)
+ # which often come after a line which has already set
+ # data.
+ if table[i] == default_value:
table[i] = value
file.close()
return table
@@ -192,7 +194,7 @@
stage2 += block
blocks[block] = start
stage1.append(start)
-
+
return stage1, stage2
# Print a table
@@ -199,7 +201,7 @@
def print_table(table, table_name, block_size = None):
type, size = get_type_size(table)
ELEMS_PER_LINE = 16
-
+
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
if block_size:
s += ", block = %d" % block_size
@@ -245,15 +247,15 @@
size = (size + slice_size - 1) & -slice_size
size += slice_size
structure += '%s property_%d;\n' % (slice_type, i)
-
+
# round up to the first item of the next structure in array
record_slice = map(lambda record: record[0], records)
slice_type, slice_size = get_type_size(record_slice)
size = (size + slice_size - 1) & -slice_size
-
+
structure += '} ucd_record;\n*/\n\n'
return size, structure
-
+
def test_record_size():
tests = [ \
( [(3,), (6,), (6,), (1,)], 1 ), \
@@ -305,7 +307,7 @@
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi'
]
-
+
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
@@ -321,20 +323,20 @@
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
-# This block of code was added by PH in September 2012. I am not a Python
-# programmer, so the style is probably dreadful, but it does the job. It scans
-# the other_case table to find sets of more than two characters that must all
-# match each other caselessly. Later in this script a table of these sets is
-# written out. However, we have to do this work here in order to compute the
+# This block of code was added by PH in September 2012. I am not a Python
+# programmer, so the style is probably dreadful, but it does the job. It scans
+# the other_case table to find sets of more than two characters that must all
+# match each other caselessly. Later in this script a table of these sets is
+# written out. However, we have to do this work here in order to compute the
# offsets in the table that are inserted into the main table.
# The CaseFolding.txt file lists pairs, but the common logic for reading data
-# sets only one value, so first we go through the table and set "return"
+# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.
for c in range(0x10ffff):
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
- other_case[c + other_case[c]] = -other_case[c]
+ other_case[c + other_case[c]] = -other_case[c]
# Now scan again and create equivalence sets.
@@ -344,25 +346,25 @@
o = c + other_case[c]
# Trigger when this character's other case does not point back here. We
- # now have three characters that are case-equivalent.
-
+ # now have three characters that are case-equivalent.
+
if other_case[o] != -other_case[c]:
t = o + other_case[o]
-
- # Scan the existing sets to see if any of the three characters are already
+
+ # Scan the existing sets to see if any of the three characters are already
# part of a set. If so, unite the existing set with the new set.
-
- appended = 0
+
+ appended = 0
for s in sets:
- found = 0
+ found = 0
for x in s:
if x == c or x == o or x == t:
found = 1
-
+
# Add new characters to an existing set
-
+
if found:
- found = 0
+ found = 0
for y in [c, o, t]:
for x in s:
if x == y:
@@ -370,10 +372,10 @@
if not found:
s.append(y)
appended = 1
-
+
# If we have not added to an existing set, create a new one.
- if not appended:
+ if not appended:
sets.append([c, o, t])
# End of loop looking for caseless sets.
@@ -384,7 +386,7 @@
offset = 1;
for s in sets:
- for x in s:
+ for x in s:
caseless_offsets[x] = offset
offset += len(s) + 1
@@ -393,7 +395,7 @@
# Combine the tables
-table, records = combine_tables(script, category, break_props,
+table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case)
record_size, record_struct = get_record_size_struct(records.keys())
@@ -450,6 +452,20 @@
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
print "#else"
print
+print "/* If the 32-bit library is run in non-UTF mode, character values"
+print "greater than 0x10ffff may be encountered. For these we set up a"
+print "special record. */"
+print
+print "#ifdef COMPILE_PCRE32"
+print "const ucd_record PRIV(dummy_ucd_record)[] = {{"
+print " ucp_Common, /* script */"
+print " ucp_Cn, /* type unassigned */"
+print " ucp_gbOther, /* grapheme break property */"
+print " 0, /* case set */"
+print " 0, /* other case */"
+print " }};"
+print "#endif"
+print
print record_struct
# --- Added by PH: output the table of caseless character sets ---
@@ -460,7 +476,7 @@
s = sorted(s)
for x in s:
print ' 0x%04x,' % x,
- print ' NOTACHAR,'
+ print ' NOTACHAR,'
print '};'
print
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2017-02-23 17:25:44 UTC (rev 1687)
+++ code/trunk/pcre_internal.h 2017-02-24 17:30:30 UTC (rev 1688)
@@ -2772,6 +2772,9 @@
extern const pcre_uint16 PRIV(ucd_stage2)[];
extern const pcre_uint32 PRIV(ucp_gentype)[];
extern const pcre_uint32 PRIV(ucp_gbtable)[];
+#ifdef COMPILE_PCRE32
+extern const ucd_record PRIV(dummy_ucd_record)[];
+#endif
#ifdef SUPPORT_JIT
extern const int PRIV(ucp_typerange)[];
#endif
@@ -2780,9 +2783,15 @@
/* UCD access macros */
#define UCD_BLOCK_SIZE 128
-#define GET_UCD(ch) (PRIV(ucd_records) + \
+#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \
PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
+
+#ifdef COMPILE_PCRE32
+#define GET_UCD(ch) (((ch) > 0x10ffff)? PRIV(dummy_ucd_record) : REAL_GET_UCD(ch))
+#else
+#define GET_UCD(ch) REAL_GET_UCD(ch)
+#endif
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
#define UCD_SCRIPT(ch) GET_UCD(ch)->script
Modified: code/trunk/pcre_ucd.c
===================================================================
--- code/trunk/pcre_ucd.c 2017-02-23 17:25:44 UTC (rev 1687)
+++ code/trunk/pcre_ucd.c 2017-02-24 17:30:30 UTC (rev 1688)
@@ -38,6 +38,20 @@
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
#else
+/* If the 32-bit library is run in non-UTF mode, character values
+greater than 0x10ffff may be encountered. For these we set up a
+special record. */
+
+#ifdef COMPILE_PCRE32
+const ucd_record PRIV(dummy_ucd_record)[] = {{
+ ucp_Common, /* script */
+ ucp_Cn, /* type unassigned */
+ ucp_gbOther, /* grapheme break property */
+ 0, /* case set */
+ 0, /* other case */
+ }};
+#endif
+
/* When recompiling tables with a new Unicode version, please check the
types in this structure definition from pcre_internal.h (the actual
field names will be different):