QXmlStreamWriter: decode UTF-8 into code points

We were iterating over code *units* and that yielded wrong results. The one from the bug report was simply caused by the fact that QUtf8StringView::value_type is char, which is signed on x86, so the expression:

    *it <= u'\x1F'

was true for all non-US-ASCII content. But in attempting to fix this, I needed to do the proper UTF-8 decoding, as otherwise we wouldn't catch code points that are encoded as non-Latin1 (multi-byte) sequences and such.

[ChangeLog][QtCore][QXmlStreamWriter] Fixed a bug that caused the class to fail to write UTF-8 strings with non-US-ASCII content when passed as a QUtf8StringView.

Fixes: QTBUG-122241
Pick-to: 6.5 6.6 6.7
Change-Id: I83dda2d36c904517b3c0fffd17b42bbf09a493d0
Reviewed-by: Mate Barany <mate.barany@qt.io>
author: Thiago Macieira <thiago.macieira@intel.com> 2024-02-15 15:04:18 -0800
committer: Thiago Macieira <thiago.macieira@intel.com> 2024-04-18 14:35:09 -0700
commit: 94c62e322264e2e7d61193ae74ba8556a330385c (patch)
tree: 294690436c07cb22159545f38daed4538c51e8fe /src/corelib/serialization/qxmlstream.cpp
parent: 17c964c4e874ab59a2af7859ae23f5cb4ad01d36 (diff)
1 files changed, 36 insertions, 10 deletions
diff --git a/src/corelib/serialization/qxmlstream.cpp b/src/corelib/serialization/qxmlstream.cpp
index 56330027372..f0fad77a085 100644
--- a/src/corelib/serialization/qxmlstream.cpp
+++ b/src/corelib/serialization/qxmlstream.cpp
@@ -2963,54 +2963,80 @@ void QXmlStreamWriterPrivate::write(QAnyStringView s)
 
 void QXmlStreamWriterPrivate::writeEscaped(QAnyStringView s, bool escapeWhitespace)
 {
+    struct NextLatin1 {
+        char32_t operator()(const char *&it, const char *) const
+        { return uchar(*it++); }
+    };
+    struct NextUtf8 {
+        char32_t operator()(const char *&it, const char *end) const
+        {
+            uchar uc = *it++;
+            char32_t utf32 = 0;
+            char32_t *output = &utf32;
+            qsizetype n = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(uc, output, it, end);
+            return n < 0 ? 0 : utf32;
+        }
+    };
+    struct NextUtf16 {
+        char32_t operator()(const QChar *&it, const QChar *) const
+        {
+            return (it++)->unicode();
+        }
+    };
+
     QString escaped;
     escaped.reserve(s.size());
     s.visit([&] (auto s) {
         using View = decltype(s);
+        using Decoder = std::conditional_t<std::is_same_v<View, QLatin1StringView>, NextLatin1,
+                            std::conditional_t<std::is_same_v<View, QUtf8StringView>, NextUtf8, NextUtf16>>;
 
         auto it = s.begin();
         const auto end = s.end();
+        Decoder decoder;
 
         while (it != end) {
             QLatin1StringView replacement;
             auto mark = it;
 
             while (it != end) {
-                if (*it == u'<') {
+                auto next_it = it;
+                char32_t uc = decoder(next_it, end);
+                if (uc == u'<') {
                     replacement = "&lt;"_L1;
                     break;
-                } else if (*it == u'>') {
+                } else if (uc == u'>') {
                     replacement = "&gt;"_L1;
                     break;
-                } else if (*it == u'&') {
+                } else if (uc == u'&') {
                     replacement = "&amp;"_L1;
                     break;
-                } else if (*it == u'\"') {
+                } else if (uc == u'\"') {
                     replacement = "&quot;"_L1;
                     break;
-                } else if (*it == u'\t') {
+                } else if (uc == u'\t') {
                     if (escapeWhitespace) {
                         replacement = "&#9;"_L1;
                         break;
                     }
-                } else if (*it == u'\n') {
+                } else if (uc == u'\n') {
                     if (escapeWhitespace) {
                         replacement = "&#10;"_L1;
                         break;
                     }
-                } else if (*it == u'\v' || *it == u'\f') {
+                } else if (uc == u'\v' || uc == u'\f') {
                     hasEncodingError = true;
                     break;
-                } else if (*it == u'\r') {
+                } else if (uc == u'\r') {
                     if (escapeWhitespace) {
                         replacement = "&#13;"_L1;
                         break;
                     }
-                } else if (*it <= u'\x1F' || *it >= u'\uFFFE') {
+                } else if (uc <= u'\x1F' || uc == u'\uFFFE' || uc == u'\uFFFF') {
                     hasEncodingError = true;
                     break;
                 }
-                ++it;
+                it = next_it;
             }
 
             escaped.append(View{mark, it});
author	Thiago Macieira <thiago.macieira@intel.com>	2024-02-15 15:04:18 -0800
committer	Thiago Macieira <thiago.macieira@intel.com>	2024-04-18 14:35:09 -0700
commit	94c62e322264e2e7d61193ae74ba8556a330385c (patch)
tree	294690436c07cb22159545f38daed4538c51e8fe /src/corelib/serialization/qxmlstream.cpp
parent	17c964c4e874ab59a2af7859ae23f5cb4ad01d36 (diff)