[Pcre-svn] [352] code/trunk: Final (?) tidies for new Unicod…

Página Inicial
Delete this message
Autor: Subversion repository
Data:  
Para: pcre-svn
Assunto: [Pcre-svn] [352] code/trunk: Final (?) tidies for new Unicode property code.
Revision: 352
          http://vcs.pcre.org/viewvc?view=rev&revision=352
Author:   ph10
Date:     2008-07-07 16:12:56 +0100 (Mon, 07 Jul 2008)


Log Message:
-----------
Final (?) tidies for new Unicode property code.

Modified Paths:
--------------
    code/trunk/maint/MultiStage2.py
    code/trunk/maint/README
    code/trunk/pcre_ucd.c


Modified: code/trunk/maint/MultiStage2.py
===================================================================
--- code/trunk/maint/MultiStage2.py    2008-07-04 18:27:16 UTC (rev 351)
+++ code/trunk/maint/MultiStage2.py    2008-07-07 15:12:56 UTC (rev 352)
@@ -25,6 +25,7 @@
 #  Adjusted data file names to take from the Unicode.tables directory
 #  Adjusted global table names by prefixing _pcre_.
 #  Commented out stuff relating to the casefolding table, which isn't used.
+#  Corrected size calculation
 #
 # The tables generated by this script are used by macros defined in
 # pcre_internal.h. They look up Unicode character properties using short 
@@ -189,8 +190,45 @@
                 index.append(i)
         return index, records


-def print_records(records):
-        print 'const ucd_record _pcre_ucd_records[] = { /* %d bytes */' % (len(records) * 4)
+def get_record_size_struct(records):
+        size = 0
+        structure = '/* When recompiling tables with a new Unicode version,\n' + \
+        'please check types in the structure definition from pcre_internal.h:\ntypedef struct {\n'
+        for i in range(len(records[0])):
+                record_slice = map(lambda record: record[i], records)
+                slice_type, slice_size = get_type_size(record_slice)
+                # add padding: round up to the nearest power of slice_size
+                size = (size + slice_size - 1) & -slice_size
+                size += slice_size
+                structure += '%s property_%d;\n' % (slice_type, i)
+        
+        # round up to the first item of the next structure in array
+        record_slice = map(lambda record: record[0], records)
+        slice_type, slice_size = get_type_size(record_slice)
+        size = (size + slice_size - 1) & -slice_size
+        
+        structure += '} ucd_record; */\n\n'
+        return size, structure
+        
+def test_record_size():
+        tests = [ \
+          ( [(3,), (6,), (6,), (1,)], 1 ), \
+          ( [(300,), (600,), (600,), (100,)], 2 ), \
+          ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
+          ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
+          ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+          ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+          ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
+          ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
+        ]
+        for test in tests:
+            size, struct = get_record_size_struct(test[0])
+            assert(size == test[1])
+            #print struct
+
+def print_records(records, record_size):
+        print 'const ucd_record _pcre_ucd_records[] = { ' + \
+              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
         records = zip(records.keys(), records.values())
         records.sort(None, lambda x: x[1])
         for i, record in enumerate(records):
@@ -213,6 +251,7 @@
   'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
   'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]


+test_record_size()

script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
@@ -220,11 +259,12 @@
# case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0)

table, records = combine_tables(script, category, other_case)
+record_size, record_struct = get_record_size_struct(records.keys())

 # Find the optimum block size for the two-stage table
 min_size = sys.maxint
 for block_size in [2 ** i for i in range(5,10)]:
-        size = len(records) * 4
+        size = len(records) * record_size
         stage1, stage2 = compress_table(table, block_size)
         size += get_tables_size(stage1, stage2)
         #print "/* block size %5d  => %5d bytes */" % (block_size, size)
@@ -241,7 +281,8 @@
 print "/* Unicode character database. */"
 print "/* This file was autogenerated by the MultiStage2.py script. */"
 print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
-print_records(records)
+print record_struct
+print_records(records, record_size)
 print_table(min_stage1, '_pcre_ucd_stage1')
 print_table(min_stage2, '_pcre_ucd_stage2', min_block_size)
 print "#if UCD_BLOCK_SIZE != %d" % min_block_size


Modified: code/trunk/maint/README
===================================================================
--- code/trunk/maint/README    2008-07-04 18:27:16 UTC (rev 351)
+++ code/trunk/maint/README    2008-07-07 15:12:56 UTC (rev 352)
@@ -63,10 +63,11 @@


When there is a new release of Unicode, the files in Unicode.tables must be
refreshed from the web site. If the new version of Unicode adds new character
-scripts, both the MultiStage2.py and the GenerateUtt.py scripts must be edited
-to add the new names. Then the MultiStage2.py script can then be run to
-generate a new version of pcre_ucd.c and the GenerateUtt.py can be run to
-generate the tricky tables in pcre_tables.c.
+scripts, the source file ucp.h and both the MultiStage2.py and the
+GenerateUtt.py scripts must be edited to add the new names. Then the
+MultiStage2.py script can be run to generate a new version of pcre_ucd.c,
+and the GenerateUtt.py script can be run to generate the tricky tables for
+inclusion in pcre_tables.c.

The ucptest program can then be compiled and used to check that the new tables
in pcre_ucd.c work properly, using the data files in ucptestdata to check a

Modified: code/trunk/pcre_ucd.c
===================================================================
--- code/trunk/pcre_ucd.c    2008-07-04 18:27:16 UTC (rev 351)
+++ code/trunk/pcre_ucd.c    2008-07-07 15:12:56 UTC (rev 352)
@@ -5,8 +5,17 @@


 /* Unicode character database. */
 /* This file was autogenerated by the MultiStage2.py script. */
-/* Total size: 50980 bytes, block size: 128. */
-const ucd_record _pcre_ucd_records[] = { /* 1828 bytes */
+/* Total size: 52808 bytes, block size: 128. */
+/* When recompiling tables with a new Unicode version,
+please check types in the structure definition from pcre_internal.h:
+typedef struct {
+uschar property_0;
+uschar property_1;
+pcre_int32 property_2;
+} ucd_record; */
+
+
+const ucd_record _pcre_ucd_records[] = { /* 3656 bytes, record size 8 */
   {     9,      0,      0, }, /*   0 */
   {     9,     29,      0, }, /*   1 */
   {     9,     21,      0, }, /*   2 */