[exim-cvs] Expansions: disallow UTF-16 surrogates from ${utf8clean:...}. Bug 2998

Top Page
Delete this message
Reply to this message
Author: Exim Git Commits Mailing List
Date:  
To: exim-cvs
Subject: [exim-cvs] Expansions: disallow UTF-16 surrogates from ${utf8clean:...}. Bug 2998
Gitweb: https://git.exim.org/exim.git/commitdiff/1209e3e19e292cee517e43a2ccfe9b44b33bb1dc
Commit:     1209e3e19e292cee517e43a2ccfe9b44b33bb1dc
Parent:     66ce3fc9291d13fe8a7d4099942b9101aef1c38c
Author:     Jasen Betts <jasen@???>
AuthorDate: Sun Jul 23 13:43:59 2023 +0100
Committer:  Jeremy Harris <jgh146exb@???>
CommitDate: Sun Jul 23 13:49:10 2023 +0100

    Expansions: disallow UTF-16 surrogates from ${utf8clean:...}.  Bug 2998
---
 doc/doc-txt/ChangeLog |  4 ++++
 src/src/expand.c      | 27 +++++++++++++++++----------
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/doc/doc-txt/ChangeLog b/doc/doc-txt/ChangeLog
index a3b43b2f5..3d74d58b0 100644
--- a/doc/doc-txt/ChangeLog
+++ b/doc/doc-txt/ChangeLog
@@ -163,6 +163,10 @@ JH/30 Bug 3006: Fix handling of JSON strings having embedded commas. Previously
       need to be protected by the doublequotes.  While there, add handling for
       backslashes.
 
+JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints.
+      Found and fixed by Jasen Betts. No testcase for this as my usual text
+      editor insists on emitting only valid UTF-8.
+
 Exim version 4.96
 -----------------
 
diff --git a/src/src/expand.c b/src/src/expand.c
index fea6501fe..d8ea7ae6b 100644
--- a/src/src/expand.c
+++ b/src/src/expand.c
@@ -7862,7 +7862,7 @@ NOT_ITEM: ;
     case EOP_UTF8CLEAN:
       {
       int seq_len = 0, index = 0, bytes_left = 0, complete;
-      long codepoint = -1;
+      ulong codepoint = (ulong)-1;
       uschar seq_buff[4];            /* accumulate utf-8 here */
 
       /* Manually track tainting, as we deal in individual chars below */
@@ -7896,6 +7896,15 @@ NOT_ITEM: ;
         if (--bytes_left == 0)        /* codepoint complete */
           if(codepoint > 0x10FFFF)    /* is it too large? */
             complete = -1;    /* error (RFC3629 limit) */
+          else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */
+            /* A UTF-16 surrogate (which should be one of a pair that
+            encode a Unicode codepoint that is outside the Basic
+            Multilingual Plane).  Error, not UTF8.
+            RFC2279.2 is slightly unclear on this, but 
+            https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder
+            says "Surrogates characters are also invalid in UTF-8:
+            characters in U+D800—U+DFFF have to be rejected." */
+            complete = -1;
           else
             {        /* finished; output utf-8 sequence */
             yield = string_catn(yield, seq_buff, seq_len);
@@ -7905,27 +7914,25 @@ NOT_ITEM: ;
           }
         else    /* no bytes left: new sequence */
           {
-          if(!(c & 0x80))    /* 1-byte sequence, US-ASCII, keep it */
+          if (!(c & 0x80))    /* 1-byte sequence, US-ASCII, keep it */
         {
         yield = string_catn(yield, &c, 1);
         continue;
         }
-          if((c & 0xe0) == 0xc0)        /* 2-byte sequence */
-        {
-        if(c == 0xc0 || c == 0xc1)    /* 0xc0 and 0xc1 are illegal */
+          if ((c & 0xe0) == 0xc0)        /* 2-byte sequence */
+        if (c == 0xc0 || c == 0xc1)    /* 0xc0 and 0xc1 are illegal */
           complete = -1;
         else
           {
-            bytes_left = 1;
-            codepoint = c & 0x1f;
+          bytes_left = 1;
+          codepoint = c & 0x1f;
           }
-        }
-          else if((c & 0xf0) == 0xe0)        /* 3-byte sequence */
+          else if ((c & 0xf0) == 0xe0)        /* 3-byte sequence */
         {
         bytes_left = 2;
         codepoint = c & 0x0f;
         }
-          else if((c & 0xf8) == 0xf0)        /* 4-byte sequence */
+          else if ((c & 0xf8) == 0xf0)        /* 4-byte sequence */
         {
         bytes_left = 3;
         codepoint = c & 0x07;


--
## subscription configuration (requires account):
## https://lists.exim.org/mailman3/postorius/lists/exim-cvs.lists.exim.org/
## unsubscribe (doesn't require an account):
## exim-cvs-unsubscribe@???
## Exim details at http://www.exim.org/
## Please use the Wiki with this list - http://wiki.exim.org/