moc: rewrite mergeStringLiterals() to avoid O(N²) algorithm

The old code called QList::erase(it, it) in a loop; which causes quadratic behavior. Fix by writing a std::unique-inspired merge algorithm that works in linear time, at the cost of merging only pairs of consecutive STRING_LITERALs. The old code tried to merge maximal sequences of adjacent STRING_LITERALs. This should not make much of a difference, since QByteArray grows geometrically, so appends (and insert at end - 1 is almost append) take amortized linear time in the final size of the result. Symbol is a QByteArray version of QStringRef (i.e. a cheap substring: string, from, len), except with individual copies of the underlying string, so make sure to rid Symbol::lex of non-lexem() data before merging. Aurélien reports that `lex` may contain a whole file, so this is an important step. Aurélien reports a 6x speedup on one of his benchmark files. Amends 0c9d1f99dac5c118d49e7f2b04f70eae3ba7b837. Reported-by: Aurélien Brooke <aurelien@bahiasoft.fr> Done-with: Aurélien Brooke <aurelien@bahiasoft.fr> Pick-to: 6.10 6.9 6.8 6.5 Change-Id: I1093a43c3727fde1785795b1aac7417be011bad8 Reviewed-by: Fabian Kosmale <fabian.kosmale@qt.io> Reviewed-by: Aurélien Brooke <aurelien@bahiasoft.fr>
author: Marc Mutz <marc.mutz@qt.io> 2025-07-29 15:16:39 +0200
committer: Marc Mutz <marc.mutz@qt.io> 2025-08-01 15:46:03 +0000
commit: 4f3a8a8d68ce804f5d999559ccf11a42a79e3f8f (patch)
tree: 66d3f211d3940b1f8690fb7770f3db98b0962f56 /src/tools/moc/preprocessor.cpp
parent: f0f45d1bcd4b1f34b7f47fb9f0458c38d93e6da3 (diff)
1 files changed, 58 insertions, 22 deletions
diff --git a/src/tools/moc/preprocessor.cpp b/src/tools/moc/preprocessor.cpp
index 76bf08e332b..6259f7a4838 100644
--- a/src/tools/moc/preprocessor.cpp
+++ b/src/tools/moc/preprocessor.cpp
@@ -1003,33 +1003,69 @@ static QByteArray readOrMapFile(QFile *file)
     return rawInput ? QByteArray::fromRawData(rawInput, size) : file->readAll();
 }
 
+void Symbol::mergeStringLiteral(const Symbol &next)
+{
+    Q_ASSERT(len >= 2); // at least `""`
+    Q_ASSERT(from + len <= lex.size());
+    Q_ASSERT(next.len >= 2); // at least `""`
+    Q_ASSERT(next.from + next.len <= next.lex.size());
+
+    if (len != lex.size()) {
+        // "rubbish" around lexem() in `lex`: clean up (`lex` may be the whole file)
+        QByteArray l = lexemView().chopped(1) % next.lexemView().sliced(1);
+        lex = std::move(l); // lexemView() aliases `lex`; only clobber it now
+        from = 0;
+    } else {
+        // like QByteArray::append(), but dealing with the "" around each lexem:
+        const auto unquoted = next.unquotedLexemView();
+        lex.insert(from + len - 1, // before closing `"`
+                   unquoted);
+    }
+    len = lex.size();
+}
+
 static void mergeStringLiterals(Symbols &symbols)
 {
-    for (Symbols::iterator i = symbols.begin(); i != symbols.end(); ++i) {
-        if (i->token == STRING_LITERAL) {
-            Symbols::Iterator mergeSymbol = i;
-            qsizetype literalsLength = mergeSymbol->len;
-            while (++i != symbols.end() && i->token == STRING_LITERAL)
-                literalsLength += i->len - 2; // no quotes
-
-            if (literalsLength != mergeSymbol->len) {
-                QByteArray mergeSymbolOriginalLexem = mergeSymbol->unquotedLexem();
-                QByteArray &mergeSymbolLexem = mergeSymbol->lex;
-                mergeSymbolLexem.resize(0);
-                mergeSymbolLexem.reserve(literalsLength);
-                mergeSymbolLexem.append('"');
-                mergeSymbolLexem.append(mergeSymbolOriginalLexem);
-                for (Symbols::iterator j = mergeSymbol + 1; j != i; ++j)
-                    mergeSymbolLexem.append(j->lex.constData() + j->from + 1, j->len - 2); // append j->unquotedLexem()
-                mergeSymbolLexem.append('"');
-                mergeSymbol->len = mergeSymbol->lex.size();
-                mergeSymbol->from = 0;
-                i = symbols.erase(mergeSymbol + 1, i);
+    // like std::unique, but merges instead of skips adjacent STRING_LITERALs:
+
+    const auto mergeable = [](const Symbol &lhs, const Symbol &rhs) {
+        return lhs.token == STRING_LITERAL && rhs.token == STRING_LITERAL;
+    };
+
+    auto end = symbols.end();
+    auto it = std::adjacent_find(symbols.begin(), symbols.end(), mergeable);
+    if (it == end) // none found
+        return;
+
+    // we know `it`, `it + 1` are both STRING_LITERAL (adjacent_find post-condition)
+    // in particular: it + 1 < end
+
+    auto dst = it;
+    auto lit = dst;
+    ++it;
+    lit->mergeStringLiteral(*it);
+
+    while (++it != end) {
+        // Loop Invariants:
+        // - [begin(), dst] is already processed
+        // - `lit` is the last string literal
+        //   - we can merge if lit == dst
+        // - [it, end[ still to be checked
+        if (it->token == STRING_LITERAL) {
+            if (lit == dst) {            // can merge
+                lit->mergeStringLiteral(*it);
+            } else {                     // can't merge: not adjacent to previous STRING_LITERAL
+                *++dst = std::move(*it);
+                lit = dst;               // remember that this was a literal
             }
-            if (i == symbols.end())
-                break;
+        } else {
+            *++dst = std::move(*it);
         }
     }
+
+    ++dst;
+
+    symbols.erase(dst, end);
 }
 
 static QByteArray searchIncludePaths(const QList<Parser::IncludePath> &includepaths,
author	Marc Mutz <marc.mutz@qt.io>	2025-07-29 15:16:39 +0200
committer	Marc Mutz <marc.mutz@qt.io>	2025-08-01 15:46:03 +0000
commit	4f3a8a8d68ce804f5d999559ccf11a42a79e3f8f (patch)
tree	66d3f211d3940b1f8690fb7770f3db98b0962f56 /src/tools/moc/preprocessor.cpp
parent	f0f45d1bcd4b1f34b7f47fb9f0458c38d93e6da3 (diff)