diff options
Diffstat (limited to 'clang-tools-extra/clangd/SourceCode.cpp')
| -rw-r--r-- | clang-tools-extra/clangd/SourceCode.cpp | 98 |
1 files changed, 90 insertions, 8 deletions
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp index 65a7ca987c13..ddfaec888e2d 100644 --- a/clang-tools-extra/clangd/SourceCode.cpp +++ b/clang-tools-extra/clangd/SourceCode.cpp @@ -16,6 +16,66 @@ namespace clang { namespace clangd { using namespace llvm; +// Here be dragons. LSP positions use columns measured in *UTF-16 code units*! +// Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial. + +// Iterates over unicode codepoints in the (UTF-8) string. For each, +// invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. +// Returns true if CB returned true, false if we hit the end of string. +template <typename Callback> +static bool iterateCodepoints(StringRef U8, const Callback &CB) { + for (size_t I = 0; I < U8.size();) { + unsigned char C = static_cast<unsigned char>(U8[I]); + if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. + if (CB(1, 1)) + return true; + ++I; + continue; + } + // This convenient property of UTF-8 holds for all non-ASCII characters. + size_t UTF8Length = countLeadingOnes(C); + // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. + // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug. + assert((UTF8Length >= 2 && UTF8Length <= 4) && + "Invalid UTF-8, or transcoding bug?"); + I += UTF8Length; // Skip over all trailing bytes. + // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). + // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) + if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1)) + return true; + } + return false; +} + +// Returns the offset into the string that matches \p Units UTF-16 code units. +// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back +// to UTF-8, and returns the length in bytes. +static size_t measureUTF16(StringRef U8, int U16Units, bool &Valid) { + size_t Result = 0; + Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) { + Result += U8Len; + U16Units -= U16Len; + return U16Units <= 0; + }); + if (U16Units < 0) // Offset was into the middle of a surrogate pair. + Valid = false; + // Don't return an out-of-range index if we overran. + return std::min(Result, U8.size()); +} + +// Counts the number of UTF-16 code units needed to represent a string. +// Like most strings in clangd, the input is UTF-8 encoded. +static size_t utf16Len(StringRef U8) { + // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). + // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. + size_t Count = 0; + iterateCodepoints(U8, [&](int U8Len, int U16Len) { + Count += U16Len; + return false; + }); + return Count; +} + llvm::Expected<size_t> positionToOffset(StringRef Code, Position P, bool AllowColumnsBeyondLineLength) { if (P.line < 0) @@ -40,12 +100,15 @@ llvm::Expected<size_t> positionToOffset(StringRef Code, Position P, if (NextNL == StringRef::npos) NextNL = Code.size(); - if (StartOfLine + P.character > NextNL && !AllowColumnsBeyondLineLength) + bool Valid; + size_t ByteOffsetInLine = measureUTF16( + Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid); + if (!Valid && !AllowColumnsBeyondLineLength) return llvm::make_error<llvm::StringError>( - llvm::formatv("Character value is out of range ({0})", P.character), + llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character, + P.line), llvm::errc::invalid_argument); - // FIXME: officially P.character counts UTF-16 code units, not UTF-8 bytes! - return std::min(NextNL, StartOfLine + P.character); + return StartOfLine + ByteOffsetInLine; } Position offsetToPosition(StringRef Code, size_t Offset) { @@ -54,17 +117,26 @@ Position offsetToPosition(StringRef Code, size_t Offset) { int Lines = Before.count('\n'); size_t PrevNL = Before.rfind('\n'); size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1); - // FIXME: officially character counts UTF-16 code units, not UTF-8 bytes! Position Pos; Pos.line = Lines; - Pos.character = static_cast<int>(Offset - StartOfLine); + Pos.character = utf16Len(Before.substr(StartOfLine)); return Pos; } Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) { + // We use the SourceManager's line tables, but its column number is in bytes. + FileID FID; + unsigned Offset; + std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc); Position P; - P.line = static_cast<int>(SM.getSpellingLineNumber(Loc)) - 1; - P.character = static_cast<int>(SM.getSpellingColumnNumber(Loc)) - 1; + P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1; + bool Invalid = false; + StringRef Code = SM.getBufferData(FID, &Invalid); + if (!Invalid) { + auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1; + auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes); + P.character = utf16Len(LineSoFar); + } return P; } @@ -76,6 +148,16 @@ Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) { return {Begin, End}; } +std::pair<size_t, size_t> offsetToClangLineColumn(StringRef Code, + size_t Offset) { + Offset = std::min(Code.size(), Offset); + StringRef Before = Code.substr(0, Offset); + int Lines = Before.count('\n'); + size_t PrevNL = Before.rfind('\n'); + size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1); + return {Lines + 1, Offset - StartOfLine + 1}; +} + std::pair<llvm::StringRef, llvm::StringRef> splitQualifiedName(llvm::StringRef QName) { size_t Pos = QName.rfind("::"); |
