Revision: 1025
http://www.exim.org/viewvc/pcre2?view=rev&revision=1025
Author: ph10
Date: 2018-10-14 15:27:16 +0100 (Sun, 14 Oct 2018)
Log Message:
-----------
Upgrade the ucptest program (used only by maintainer) and script run tests.
Modified Paths:
--------------
code/trunk/maint/ucptest.c
code/trunk/testdata/testinput4
code/trunk/testdata/testoutput4
Modified: code/trunk/maint/ucptest.c
===================================================================
--- code/trunk/maint/ucptest.c 2018-10-12 17:02:34 UTC (rev 1024)
+++ code/trunk/maint/ucptest.c 2018-10-14 14:27:16 UTC (rev 1025)
@@ -7,15 +7,43 @@
/* Compile thus:
gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
+ Add -lreadline or -ledit if required.
*/
-/* If there are arguments, they are a list of hexadecimal code points whose
+/* This is a hacked-up program for testing the Unicode properties tables of
+PCRE2. It can also be used for finding characters with certain properties.
+I wrote it to help with debugging PCRE, and have added things that I found
+useful, in a rather haphazard way. The code has never been "tidied" or checked
+for robustness.
+
+If there are arguments, they are a list of hexadecimal code points whose
properties are to be output. Otherwise, the program expects to read commands on
-stdin, and it writes output to stdout. There is only one command, "findprop",
-followed by a list of Unicode code points as hex numbers (without any
-prefixes). The output is one line per character, giving its Unicode properties
-followed by its other case if there is one. */
+stdin, and it writes output to stdout. There are two commands:
+"findprop" must be followed by a list of Unicode code points as hex numbers
+(without any prefixes). The output is one line per character, giving its
+Unicode properties followed by its other case if there is one, followed by its
+Script Extension list if it is not just the same as the base script.
+
+"find" must be followed by a list of property names and their values. This
+finds characters that have those properties. If multiple properties are listed,
+they must all be matched. Currently supported:
+
+ script <name> The character must have this script property. Only one
+ such script may be given.
+ scriptx <name> This script must be in the character's Script Extension
+ property list. If this is used many times, all the given
+ scripts must be present.
+ type <abbrev> The character's type (e.g. Lu or Nd) must match.
+ gbreak <name> The grapheme break property must match.
+
+If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
+Script Extensions, there may be a mixture of positive and negative
+requirements. All must be satisfied.
+
+No more than 100 lines (each a single character or a range of characters) are
+output. When that limit is reached, the list ends with ... */
+
#ifdef HAVE_CONFIG_H
#include "../src/config.h"
#endif
@@ -31,8 +59,24 @@
#include "../src/pcre2_internal.h"
#include "../src/pcre2_ucp.h"
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+#if defined(SUPPORT_LIBREADLINE)
+#include <readline/readline.h>
+#include <readline/history.h>
+#else
+#if defined(HAVE_EDITLINE_READLINE_H)
+#include <editline/readline.h>
+#else
+#include <readline/readline.h>
+#endif
+#endif
+#endif
+
/* -------------------------------------------------------------------*/
#define CS (char *)
@@ -45,186 +89,235 @@
/* -------------------------------------------------------------------*/
-
-
-/*************************************************
-* Find a script name *
-*************************************************/
-
-static unsigned char *
-find_script_name(int script)
-{
-switch(script)
- {
- default: return US"??";
- case ucp_Unknown: return US"Unknown";
- case ucp_Arabic: return US"Arabic";
- case ucp_Armenian: return US"Armenian";
- case ucp_Balinese: return US"Balinese";
- case ucp_Bengali: return US"Bengali";
- case ucp_Bopomofo: return US"Bopomofo";
- case ucp_Braille: return US"Braille";
- case ucp_Buginese: return US"Buginese";
- case ucp_Buhid: return US"Buhid";
- case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal";
- case ucp_Cherokee: return US"Cherokee";
- case ucp_Common: return US"Common";
- case ucp_Coptic: return US"Coptic";
- case ucp_Cuneiform: return US"Cuneiform";
- case ucp_Cypriot: return US"Cypriot";
- case ucp_Cyrillic: return US"Cyrillic";
- case ucp_Deseret: return US"Deseret";
- case ucp_Devanagari: return US"Devanagari";
- case ucp_Ethiopic: return US"Ethiopic";
- case ucp_Georgian: return US"Georgian";
- case ucp_Glagolitic: return US"Glagolitic";
- case ucp_Gothic: return US"Gothic";
- case ucp_Greek: return US"Greek";
- case ucp_Gujarati: return US"Gujarati";
- case ucp_Gurmukhi: return US"Gurmukhi";
- case ucp_Han: return US"Han";
- case ucp_Hangul: return US"Hangul";
- case ucp_Hanunoo: return US"Hanunoo";
- case ucp_Hebrew: return US"Hebrew";
- case ucp_Hiragana: return US"Hiragana";
- case ucp_Inherited: return US"Inherited";
- case ucp_Kannada: return US"Kannada";
- case ucp_Katakana: return US"Katakana";
- case ucp_Kharoshthi: return US"Kharoshthi";
- case ucp_Khmer: return US"Khmer";
- case ucp_Lao: return US"Lao";
- case ucp_Latin: return US"Latin";
- case ucp_Limbu: return US"Limbu";
- case ucp_Linear_B: return US"Linear_B";
- case ucp_Malayalam: return US"Malayalam";
- case ucp_Mongolian: return US"Mongolian";
- case ucp_Myanmar: return US"Myanmar";
- case ucp_New_Tai_Lue: return US"New_Tai_Lue";
- case ucp_Nko: return US"Nko";
- case ucp_Ogham: return US"Ogham";
- case ucp_Old_Italic: return US"Old_Italic";
- case ucp_Old_Persian: return US"Old_Persian";
- case ucp_Oriya: return US"Oriya";
- case ucp_Osmanya: return US"Osmanya";
- case ucp_Phags_Pa: return US"Phags_Pa";
- case ucp_Phoenician: return US"Phoenician";
- case ucp_Runic: return US"Runic";
- case ucp_Shavian: return US"Shavian";
- case ucp_Sinhala: return US"Sinhala";
- case ucp_Syloti_Nagri: return US"Syloti_Nagri";
- case ucp_Syriac: return US"Syriac";
- case ucp_Tagalog: return US"Tagalog";
- case ucp_Tagbanwa: return US"Tagbanwa";
- case ucp_Tai_Le: return US"Tai_Le";
- case ucp_Tamil: return US"Tamil";
- case ucp_Telugu: return US"Telugu";
- case ucp_Thaana: return US"Thaana";
- case ucp_Thai: return US"Thai";
- case ucp_Tibetan: return US"Tibetan";
- case ucp_Tifinagh: return US"Tifinagh";
- case ucp_Ugaritic: return US"Ugaritic";
- case ucp_Yi: return US"Yi";
+const unsigned char *script_names[] = {
+ US"Unknown",
+ US"Arabic",
+ US"Armenian",
+ US"Bengali",
+ US"Bopomofo",
+ US"Braille",
+ US"Buginese",
+ US"Buhid",
+ US"Canadian_Aboriginal",
+ US"Cherokee",
+ US"Common",
+ US"Coptic",
+ US"Cypriot",
+ US"Cyrillic",
+ US"Deseret",
+ US"Devanagari",
+ US"Ethiopic",
+ US"Georgian",
+ US"Glagolitic",
+ US"Gothic",
+ US"Greek",
+ US"Gujarati",
+ US"Gurmukhi",
+ US"Han",
+ US"Hangul",
+ US"Hanunoo",
+ US"Hebrew",
+ US"Hiragana",
+ US"Inherited",
+ US"Kannada",
+ US"Katakana",
+ US"Kharoshthi",
+ US"Khmer",
+ US"Lao",
+ US"Latin",
+ US"Limbu",
+ US"Linear_B",
+ US"Malayalam",
+ US"Mongolian",
+ US"Myanmar",
+ US"New_Tai_Lue",
+ US"Ogham",
+ US"Old_Italic",
+ US"Old_Persian",
+ US"Oriya",
+ US"Osmanya",
+ US"Runic",
+ US"Shavian",
+ US"Sinhala",
+ US"Syloti_Nagri",
+ US"Syriac",
+ US"Tagalog",
+ US"Tagbanwa",
+ US"Tai_Le",
+ US"Tamil",
+ US"Telugu",
+ US"Thaana",
+ US"Thai",
+ US"Tibetan",
+ US"Tifinagh",
+ US"Ugaritic",
+ US"Yi",
+ /* New for Unicode 5.0: */
+ US"Balinese",
+ US"Cuneiform",
+ US"Nko",
+ US"Phags_Pa",
+ US"Phoenician",
/* New for Unicode 5.1: */
- case ucp_Carian: return US"Carian";
- case ucp_Cham: return US"Cham";
- case ucp_Kayah_Li: return US"Kayah_Li";
- case ucp_Lepcha: return US"Lepcha";
- case ucp_Lycian: return US"Lycian";
- case ucp_Lydian: return US"Lydian";
- case ucp_Ol_Chiki: return US"Ol_Chiki";
- case ucp_Rejang: return US"Rejang";
- case ucp_Saurashtra: return US"Saurashtra";
- case ucp_Sundanese: return US"Sundanese";
- case ucp_Vai: return US"Vai";
+ US"Carian",
+ US"Cham",
+ US"Kayah_Li",
+ US"Lepcha",
+ US"Lycian",
+ US"Lydian",
+ US"Ol_Chiki",
+ US"Rejang",
+ US"Saurashtra",
+ US"Sundanese",
+ US"Vai",
/* New for Unicode 5.2: */
- case ucp_Avestan: return US"Avestan";
- case ucp_Bamum: return US"Bamum";
- case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs";
- case ucp_Imperial_Aramaic: return US"Imperial_Aramaic";
- case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi";
- case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian";
- case ucp_Javanese: return US"Javanese";
- case ucp_Kaithi: return US"Kaithi";
- case ucp_Lisu: return US"Lisu";
- case ucp_Meetei_Mayek: return US"Meetei_Mayek";
- case ucp_Old_South_Arabian: return US"Old_South_Arabian";
- case ucp_Old_Turkic: return US"Old_Turkic";
- case ucp_Samaritan: return US"Samaritan";
- case ucp_Tai_Tham: return US"Tai_Tham";
- case ucp_Tai_Viet: return US"Tai_Viet";
+ US"Avestan",
+ US"Bamum",
+ US"Egyptian_Hieroglyphs",
+ US"Imperial_Aramaic",
+ US"Inscriptional_Pahlavi",
+ US"Inscriptional_Parthian",
+ US"Javanese",
+ US"Kaithi",
+ US"Lisu",
+ US"Meetei_Mayek",
+ US"Old_South_Arabian",
+ US"Old_Turkic",
+ US"Samaritan",
+ US"Tai_Tham",
+ US"Tai_Viet",
/* New for Unicode 6.0.0 */
- case ucp_Batak: return US"Batak";
- case ucp_Brahmi: return US"Brahmi";
- case ucp_Mandaic: return US"Mandaic";
-
+ US"Batak",
+ US"Brahmi",
+ US"Mandaic",
/* New for Unicode 6.1.0 */
- case ucp_Chakma: return US"Chakma";
- case ucp_Meroitic_Cursive: return US"Meroitic_Cursive";
- case ucp_Meroitic_Hieroglyphs: return US"Meroitic_Hieroglyphs";
- case ucp_Miao: return US"Miao";
- case ucp_Sharada: return US"Sharada";
- case ucp_Sora_Sompeng: return US"Sora Sompent";
- case ucp_Takri: return US"Takri";
-
+ US"Chakma",
+ US"Meroitic_Cursive",
+ US"Meroitic_Hieroglyphs",
+ US"Miao",
+ US"Sharada",
+ US"Sora Sompent",
+ US"Takri",
/* New for Unicode 7.0.0 */
- case ucp_Bassa_Vah: return US"Bassa_Vah";
- case ucp_Caucasian_Albanian: return US"Caucasian_Albanian";
- case ucp_Duployan: return US"Duployan";
- case ucp_Elbasan: return US"Elbasan";
- case ucp_Grantha: return US"Grantha";
- case ucp_Khojki: return US"Khojki";
- case ucp_Khudawadi: return US"Khudawadi";
- case ucp_Linear_A: return US"Linear_A";
- case ucp_Mahajani: return US"Mahajani";
- case ucp_Manichaean: return US"Manichaean";
- case ucp_Mende_Kikakui: return US"Mende_Kikakui";
- case ucp_Modi: return US"Modi";
- case ucp_Mro: return US"Mro";
- case ucp_Nabataean: return US"Nabataean";
- case ucp_Old_North_Arabian: return US"Old_North_Arabian";
- case ucp_Old_Permic: return US"Old_Permic";
- case ucp_Pahawh_Hmong: return US"Pahawh_Hmong";
- case ucp_Palmyrene: return US"Palmyrene";
- case ucp_Psalter_Pahlavi: return US"Psalter_Pahlavi";
- case ucp_Pau_Cin_Hau: return US"Pau_Cin_Hau";
- case ucp_Siddham: return US"Siddham";
- case ucp_Tirhuta: return US"Tirhuta";
- case ucp_Warang_Citi: return US"Warang_Citi";
-
+ US"Bassa_Vah",
+ US"Caucasian_Albanian",
+ US"Duployan",
+ US"Elbasan",
+ US"Grantha",
+ US"Khojki",
+ US"Khudawadi",
+ US"Linear_A",
+ US"Mahajani",
+ US"Manichaean",
+ US"Mende_Kikakui",
+ US"Modi",
+ US"Mro",
+ US"Nabataean",
+ US"Old_North_Arabian",
+ US"Old_Permic",
+ US"Pahawh_Hmong",
+ US"Palmyrene",
+ US"Psalter_Pahlavi",
+ US"Pau_Cin_Hau",
+ US"Siddham",
+ US"Tirhuta",
+ US"Warang_Citi",
/* New for Unicode 8.0.0 */
- case ucp_Ahom: return US"Ahom";
- case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs";
- case ucp_Hatran: return US"Hatran";
- case ucp_Multani: return US"Multani";
- case ucp_Old_Hungarian: return US"Old_Hungarian";
- case ucp_SignWriting: return US"SignWriting";
-
+ US"Ahom",
+ US"Anatolian_Hieroglyphs",
+ US"Hatran",
+ US"Multani",
+ US"Old_Hungarian",
+ US"SignWriting",
/* New for Unicode 10.0.0 (no update since 8.0.0) */
- case ucp_Adlam: return US"Adlam";
- case ucp_Bhaiksuki: return US"Bhaiksuki";
- case ucp_Marchen: return US"Marchen";
- case ucp_Newa: return US"Newa";
- case ucp_Osage: return US"Osage";
- case ucp_Tangut: return US"Tangut";
- case ucp_Masaram_Gondi: return US"Masaram_Gondi";
- case ucp_Nushu: return US"Nushu";
- case ucp_Soyombo: return US"Soyombo";
- case ucp_Zanabazar_Square: return US"Zanabazar_Square";
+ US"Adlam",
+ US"Bhaiksuki",
+ US"Marchen",
+ US"Newa",
+ US"Osage",
+ US"Tangut",
+ US"Masaram_Gondi",
+ US"Nushu",
+ US"Soyombo",
+ US"Zanabazar_Square",
+ /* New for Unicode 11.0.0 */
+ US"Dogra",
+ US"Gunjala_Gondi",
+ US"Hanifi_Rohingya",
+ US"Makasar",
+ US"Medefaidrin",
+ US"Old_Sogdian",
+ US"Sogdian"
+};
- /* New for Unicode 11.0.0 */
- case ucp_Dogra: return US"Dogra";
- case ucp_Gunjala_Gondi: return US"Gunjala_Gondi";
- case ucp_Hanifi_Rohingya: return US"Hanifi_Rohingya";
- case ucp_Makasar: return US"Makasar";
- case ucp_Medefaidrin: return US"Medefaidrin";
- case ucp_Old_Sogdian: return US"Old_Sogdian";
- case ucp_Sogdian: return US"Sogdian";
- }
-}
+const unsigned char *type_names[] = {
+ US"Cc",
+ US"Cf",
+ US"Cn",
+ US"Co",
+ US"Cs",
+ US"Ll",
+ US"Lm",
+ US"Lo",
+ US"Lt",
+ US"Lu",
+ US"Mc",
+ US"Me",
+ US"Mn",
+ US"Nd",
+ US"Nl",
+ US"No",
+ US"Pc",
+ US"Pd",
+ US"Pe",
+ US"Pf",
+ US"Pi",
+ US"Po",
+ US"Ps",
+ US"Sc",
+ US"Sk",
+ US"Sm",
+ US"So",
+ US"Zl",
+ US"Zp",
+ US"Zs"
+};
+const unsigned char *gb_names[] = {
+ US"CR",
+ US"LF",
+ US"Control",
+ US"Extend",
+ US"Prepend",
+ US"SpacingMark",
+ US"L",
+ US"V",
+ US"T",
+ US"LV",
+ US"LVT",
+ US"RegionalIndicator",
+ US"Other",
+ US"ZWJ",
+ US"Extended_Pictographic"
+};
/*************************************************
+* Test for interaction *
+*************************************************/
+
+static BOOL
+is_stdin_tty(void)
+{
+#if defined WIN32
+return _isatty(_fileno(stdin));
+#else
+return isatty(fileno(stdin));
+#endif
+}
+
+
+/*************************************************
* Print Unicode property info for a char *
*************************************************/
@@ -239,11 +332,13 @@
int othercase = UCD_OTHERCASE(c);
int caseset = UCD_CASESET(c);
-unsigned char *fulltypename = US"??";
-unsigned char *typename = US"??";
-unsigned char *graphbreak = US"??";
+const unsigned char *fulltypename = US"??";
+const unsigned char *typename = US"??";
+const unsigned char *scriptname = US"??";
+const unsigned char *graphbreak = US"??";
-unsigned char *scriptname = find_script_name(script);
+if (script < sizeof(script_names)/sizeof(char *))
+ scriptname = script_names[script];
switch (type)
{
@@ -289,7 +384,7 @@
case ucp_Zp: fulltypename = US"Paragraph separator"; break;
case ucp_Zs: fulltypename = US"Space separator"; break;
}
-
+
switch(gbprop)
{
case ucp_gbCR: graphbreak = US"CR"; break;
@@ -308,12 +403,12 @@
case ucp_gbOther: graphbreak = US"Other"; break;
case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
case ucp_gbExtended_Pictographic:
- graphbreak = US"Extended Pictographic"; break;
- default: graphbreak = US"Unknown"; break;
+ graphbreak = US"Extended Pictographic"; break;
+ default: graphbreak = US"Unknown"; break;
}
-
+
printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
-if (othercase != c)
+if (othercase != c)
{
printf(", %04x", othercase);
if (caseset != 0)
@@ -321,25 +416,33 @@
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
while (*(++p) < NOTACHAR)
if (*p != othercase && *p != c) printf(", %04x", *p);
- }
- }
-
+ }
+ }
+
if (scriptx != script)
{
- printf(", [");
- if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else
+ printf(", [");
+ if (scriptx >= 0)
{
- char *sep = "";
+ scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))?
+ US"??" : script_names[scriptx];
+ printf("%s", scriptname);
+ }
+ else
+ {
+ char *sep = "";
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
while (*p != 0)
{
- printf("%s%s", sep, find_script_name(*p++));
- sep = ", ";
- }
- }
+ scriptname = (*p >= sizeof(script_names)/sizeof(char *))?
+ US"??" : script_names[*p++];
+ printf("%s%s", sep, scriptname);
+ sep = ", ";
+ }
+ }
printf("]");
- }
-
+ }
+
printf("\n");
}
@@ -346,6 +449,267 @@
/*************************************************
+* Find character(s) with given property/ies *
+*************************************************/
+
+static void
+find_chars(unsigned char *s)
+{
+unsigned char name[24];
+unsigned char value[24];
+unsigned char *t;
+unsigned int count= 0;
+int scriptx_list[24];
+unsigned int scriptx_count = 0;
+uint32_t i, c;
+int script = -1;
+int type = -1;
+int gbreak = -1;
+BOOL script_not = FALSE;
+BOOL type_not = FALSE;
+BOOL gbreak_not = FALSE;
+BOOL hadrange = FALSE;
+const ucd_record *ucd, *next_ucd;
+const char *pad = " ";
+
+while (*s != 0)
+ {
+ unsigned int offset = 0;
+ BOOL scriptx_not = FALSE;
+
+ for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
+ *t = 0;
+ while (isspace(*s)) s++;
+
+ for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
+ *t = 0;
+ while (isspace(*s)) s++;
+
+ if (strcmp(CS name, "script") == 0 ||
+ strcmp(CS name, "scriptx") == 0)
+ {
+ if (value[0] == '!')
+ {
+ if (name[6] == 'x') scriptx_not = TRUE;
+ else script_not = TRUE;
+ offset = 1;
+ }
+
+ for (i = 0; i < sizeof(script_names)/sizeof(char *); i++)
+ {
+ if (strcmp(CS value + offset, script_names[i]) == 0)
+ {
+ if (name[6] == 'x')
+ {
+ scriptx_list[scriptx_count++] = scriptx_not? (-i):i;
+ }
+ else
+ {
+ if (script < 0) script = i; else
+ {
+ printf("** Only 1 script value allowed\n");
+ return;
+ }
+ }
+ break;
+ }
+ }
+
+ if (i >= sizeof(script_names)/sizeof(char *))
+ {
+ printf("** Unrecognized script name '%s'\n", value);
+ return;
+ }
+ }
+
+ else if (strcmp(CS name, "type") == 0)
+ {
+ if (type >= 0)
+ {
+ printf("** Only 1 type value allowed\n");
+ return;
+ }
+ else
+ {
+ if (value[0] == '!')
+ {
+ type_not = TRUE;
+ offset = 1;
+ }
+
+ for (i = 0; i < sizeof(type_names)/sizeof(char *); i++)
+ {
+ if (strcmp(CS (value + offset), type_names[i]) == 0)
+ {
+ type = i;
+ break;
+ }
+ }
+ if (i >= sizeof(type_names)/sizeof(char *))
+ {
+ printf("** Unrecognized type name '%s'\n", value);
+ return;
+ }
+ }
+ }
+
+ else if (strcmp(CS name, "gbreak") == 0)
+ {
+ if (gbreak >= 0)
+ {
+ printf("** Only 1 grapheme break value allowed\n");
+ return;
+ }
+ else
+ {
+ if (value[0] == '!')
+ {
+ gbreak_not = TRUE;
+ offset = 1;
+ }
+
+ for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++)
+ {
+ if (strcmp(CS (value + offset), gb_names[i]) == 0)
+ {
+ gbreak = i;
+ break;
+ }
+ }
+ if (i >= sizeof(gb_names)/sizeof(char *))
+ {
+ printf("** Unrecognized gbreak name '%s'\n", value);
+ return;
+ }
+ }
+ }
+
+ else
+ {
+ printf("** Unrecognized property name '%s'\n", name);
+ return;
+ }
+ }
+
+if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
+ {
+ printf("** No properties specified\n");
+ return;
+ }
+
+for (c = 0; c <= 0x10ffff; c++)
+ {
+ if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
+
+ if (scriptx_count > 0)
+ {
+ const uint8_t *char_scriptx = NULL;
+ int found = 0;
+ int scriptx = UCD_SCRIPTX(c);
+
+ if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
+
+ for (i = 0; i < scriptx_count; i++)
+ {
+ /* Positive requirement */
+ if (scriptx_list[i] >= 0)
+ {
+ if (scriptx >= 0)
+ {
+ if (scriptx == scriptx_list[i]) found++;
+ }
+
+ else
+ {
+ const uint8_t *p;
+ for (p = char_scriptx; *p != 0; p++)
+ {
+ if (scriptx_list[i] == *p)
+ {
+ found++;
+ break;
+ }
+ }
+ }
+ }
+ /* Negative requirement */
+ else
+ {
+ if (scriptx >= 0)
+ {
+ if (scriptx != -scriptx_list[i]) found++;
+ }
+ else
+ {
+ const uint8_t *p;
+ for (p = char_scriptx; *p != 0; p++)
+ if (-scriptx_list[i] == *p) break;
+ if (*p == 0) found++;
+ }
+ }
+ }
+
+ if (found != scriptx_count) continue;
+ }
+
+ if (type >= 0)
+ {
+ if (type_not)
+ {
+ if (type == UCD_CHARTYPE(c)) continue;
+ }
+ else
+ {
+ if (type != UCD_CHARTYPE(c)) continue;
+ }
+ }
+
+ if (gbreak >= 0)
+ {
+ if (gbreak_not)
+ {
+ if (gbreak == UCD_GRAPHBREAK(c)) continue;
+ }
+ else
+ {
+ if (gbreak != UCD_GRAPHBREAK(c)) continue;
+ }
+ }
+
+ /* All conditions are met. Look for runs. */
+
+ ucd = GET_UCD(c);
+
+ for (i = c + 1; i < 0x10ffff; i++)
+ {
+ next_ucd = GET_UCD(i);
+ if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
+ }
+
+ if (--i > c)
+ {
+ printf("%04x..", c);
+ c = i;
+ hadrange = TRUE;
+ }
+ else if (hadrange) printf("%s", pad);
+
+ print_prop(c);
+ if (c >= 0x100000) pad = " ";
+ else if (c >= 0x10000) pad = " ";
+ count++;
+ if (count >= 100)
+ {
+ printf("...\n");
+ break;
+ }
+ }
+
+if (count == 0) printf("No characters found\n");
+}
+
+
+/*************************************************
* Main program *
*************************************************/
@@ -352,6 +716,7 @@
int
main(int argc, char **argv)
{
+BOOL interactive;
unsigned char buffer[1024];
if (argc > 1)
@@ -359,19 +724,48 @@
int i;
for (i = 1; i < argc; i++)
{
- unsigned char *endptr;
+ unsigned char *endptr;
int c = strtoul(argv[i], CSS(&endptr), 16);
- print_prop(c);
+ if (*endptr != 0)
+ printf("** Hex number expected; ignored '%s'\n", argv[i]);
+ else print_prop(c);
}
return 0;
- }
+ }
-while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
+interactive = is_stdin_tty();
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+if (interactive) using_history();
+#endif
+
+for(;;)
{
unsigned char name[24];
unsigned char *s, *t;
- printf("%s", buffer);
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+ if (interactive)
+ {
+ size_t len;
+ s = readline("> ");
+ if (s == NULL) break;
+ len = strlen(s);
+ if (len > 0) add_history(s);
+ memcpy(buffer, s, len);
+ buffer[len] = '\n';
+ buffer[len+1] = 0;
+ free(s);
+ }
+ else
+#endif
+
+ {
+ if (interactive) printf("> ");
+ if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
+ if (!interactive) printf("%s", buffer);
+ }
+
s = buffer;
while (isspace(*s)) s++;
if (*s == 0) continue;
@@ -386,15 +780,32 @@
{
unsigned char *endptr;
int c = strtoul(CS s, CSS(&endptr), 16);
- print_prop(c);
+
+ if (*endptr != 0 && !isspace(*endptr))
+ {
+ while (*endptr != 0 && !isspace(*endptr)) endptr++;
+ printf("** Hex number expected; ignored '%.*s'\n", endptr-s, s);
+ }
+ else print_prop(c);
s = endptr;
while (isspace(*s)) s++;
}
}
- else printf("Unknown test command %s\n", name);
+ else if (strcmp(CS name, "find") == 0)
+ {
+ find_chars(s);
+ }
+
+ else printf("** Unknown test command %s\n", name);
}
+if (interactive) printf("\n");
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+if (interactive) clear_history();
+#endif
+
return 0;
}
Modified: code/trunk/testdata/testinput4
===================================================================
--- code/trunk/testdata/testinput4 2018-10-12 17:02:34 UTC (rev 1024)
+++ code/trunk/testdata/testinput4 2018-10-14 14:27:16 UTC (rev 1025)
@@ -2432,6 +2432,8 @@
AB\x{1cf7} Latin Latin Common-extended-Beng
\x{1cf7}AB Common-extend-Beng Latin Latin
\x{1cf7}\x{0993} Common-extend-Beng Bengali
+ A\x{1abe}BC Test enclosing mark
+ \x{0370}\x{1abe}\x{0371} Which can occur with any script (Greek here)
# Test loop breaking for empty string match
Modified: code/trunk/testdata/testoutput4
===================================================================
--- code/trunk/testdata/testoutput4 2018-10-12 17:02:34 UTC (rev 1024)
+++ code/trunk/testdata/testoutput4 2018-10-14 14:27:16 UTC (rev 1025)
@@ -3936,6 +3936,10 @@
0: \x{1cf7}
\x{1cf7}\x{0993} Common-extend-Beng Bengali
0: \x{1cf7}\x{993}
+ A\x{1abe}BC Test enclosing mark
+ 0: A\x{1abe}BC
+ \x{0370}\x{1abe}\x{0371} Which can occur with any script (Greek here)
+ 0: \x{370}\x{1abe}\x{371}
# Test loop breaking for empty string match