diff options
| author | Edward Welbourne <edward.welbourne@qt.io> | 2019-05-27 19:13:54 +0200 |
|---|---|---|
| committer | Edward Welbourne <edward.welbourne@qt.io> | 2019-07-10 17:05:30 +0200 |
| commit | a9aa206b7b8ac4e69f8c46233b4080e00e845ff5 (patch) | |
| tree | 0d19cb1f1a3b9d79d322e6e63f6f72160977ca67 /src/corelib/tools/qchar.cpp | |
| parent | 85d3061c1cd4617ef09cb381320611c27da205a5 (diff) | |
Move text-related code out of corelib/tools/ to corelib/text/
This includes byte array, string, char, unicode, locale, collation and
regular expressions.
Change-Id: I8b125fa52c8c513eb57a0f1298b91910e5a0d786
Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
Diffstat (limited to 'src/corelib/tools/qchar.cpp')
| -rw-r--r-- | src/corelib/tools/qchar.cpp | 2059 |
1 files changed, 0 insertions, 2059 deletions
diff --git a/src/corelib/tools/qchar.cpp b/src/corelib/tools/qchar.cpp deleted file mode 100644 index 0c190c6a3d6..00000000000 --- a/src/corelib/tools/qchar.cpp +++ /dev/null @@ -1,2059 +0,0 @@ -/**************************************************************************** -** -** Copyright (C) 2016 The Qt Company Ltd. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the QtCore module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 3 as published by the Free Software -** Foundation and appearing in the file LICENSE.LGPL3 included in the -** packaging of this file. Please review the following information to -** ensure the GNU Lesser General Public License version 3 requirements -** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 2.0 or (at your option) the GNU General -** Public license version 3 or any later version approved by the KDE Free -** Qt Foundation. The licenses are as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 -** included in the packaging of this file. 
Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-2.0.html and -** https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ - -// Don't define it while compiling this module, or USERS of Qt will -// not be able to link. -#ifdef QT_NO_CAST_FROM_ASCII -# undef QT_NO_CAST_FROM_ASCII -#endif -#ifdef QT_NO_CAST_TO_ASCII -# undef QT_NO_CAST_TO_ASCII -#endif -#include "qchar.h" - -#include "qdatastream.h" - -#include "qunicodetables_p.h" -#include "qunicodetables.cpp" - -#include <algorithm> - -QT_BEGIN_NAMESPACE - -#define FLAG(x) (1 << (x)) - -/*! - \class QLatin1Char - \inmodule QtCore - \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character. - - \ingroup string-processing - - This class is only useful to construct a QChar with 8-bit character. - - \sa QChar, QLatin1String, QString -*/ - -/*! - \fn const char QLatin1Char::toLatin1() const - - Converts a Latin-1 character to an 8-bit ASCII representation of the character. -*/ - -/*! - \fn const ushort QLatin1Char::unicode() const - - Converts a Latin-1 character to an 16-bit-encoded Unicode representation - of the character. -*/ - -/*! - \fn QLatin1Char::QLatin1Char(char c) - - Constructs a Latin-1 character for \a c. This constructor should be - used when the encoding of the input character is known to be Latin-1. -*/ - -/*! - \class QChar - \inmodule QtCore - \brief The QChar class provides a 16-bit Unicode character. - - \ingroup string-processing - \reentrant - - In Qt, Unicode characters are 16-bit entities without any markup - or structure. This class represents such an entity. It is - lightweight, so it can be used everywhere. Most compilers treat - it like an \c{unsigned short}. 
- - QChar provides a full complement of testing/classification - functions, converting to and from other formats, converting from - composed to decomposed Unicode, and trying to compare and - case-convert if you ask it to. - - The classification functions include functions like those in the - standard C++ header \<cctype\> (formerly \<ctype.h\>), but - operating on the full range of Unicode characters, not just for the ASCII - range. They all return true if the character is a certain type of character; - otherwise they return false. These classification functions are - isNull() (returns \c true if the character is '\\0'), isPrint() - (true if the character is any sort of printable character, - including whitespace), isPunct() (any sort of punctuation), - isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any - sort of numeric character, not just 0-9), isLetterOrNumber(), and - isDigit() (decimal digits). All of these are wrappers around - category() which returns the Unicode-defined category of each - character. Some of these also calculate the derived properties - (for example isSpace() returns \c true if the character is of category - Separator_* or an exceptional code point from Other_Control category). - - QChar also provides direction(), which indicates the "natural" - writing direction of this character. The joiningType() function - indicates how the character joins with its neighbors (needed - mostly for Arabic or Syriac) and finally hasMirrored(), which indicates - whether the character needs to be mirrored when it is printed in - its "unnatural" writing direction. - - Composed Unicode characters (like \a ring) can be converted to - decomposed Unicode ("a" followed by "ring above") by using decomposition(). - - In Unicode, comparison is not necessarily possible and case - conversion is very difficult at best. Unicode, covering the - "entire" world, also includes most of the world's case and - sorting problems. 
operator==() and friends will do comparison - based purely on the numeric Unicode value (code point) of the - characters, and toUpper() and toLower() will do case changes when - the character has a well-defined uppercase/lowercase equivalent. - For locale-dependent comparisons, use QString::localeAwareCompare(). - - The conversion functions include unicode() (to a scalar), - toLatin1() (to scalar, but converts all non-Latin-1 characters to - 0), row() (gives the Unicode row), cell() (gives the Unicode - cell), digitValue() (gives the integer value of any of the - numerous digit characters), and a host of constructors. - - QChar provides constructors and cast operators that make it easy - to convert to and from traditional 8-bit \c{char}s. If you - defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as - explained in the QString documentation, you will need to - explicitly call fromLatin1(), or use QLatin1Char, - to construct a QChar from an 8-bit \c char, and you will need to - call toLatin1() to get the 8-bit value back. - - For more information see - \l{http://www.unicode.org/ucd/}{"About the Unicode Character Database"}. - - \sa Unicode, QString, QLatin1Char -*/ - -/*! - \enum QChar::UnicodeVersion - - Specifies which version of the \l{http://www.unicode.org/}{Unicode standard} - introduced a certain character. 
- - \value Unicode_1_1 Version 1.1 - \value Unicode_2_0 Version 2.0 - \value Unicode_2_1_2 Version 2.1.2 - \value Unicode_3_0 Version 3.0 - \value Unicode_3_1 Version 3.1 - \value Unicode_3_2 Version 3.2 - \value Unicode_4_0 Version 4.0 - \value Unicode_4_1 Version 4.1 - \value Unicode_5_0 Version 5.0 - \value Unicode_5_1 Version 5.1 - \value Unicode_5_2 Version 5.2 - \value Unicode_6_0 Version 6.0 - \value Unicode_6_1 Version 6.1 - \value Unicode_6_2 Version 6.2 - \value Unicode_6_3 Version 6.3 Since Qt 5.3 - \value Unicode_7_0 Version 7.0 Since Qt 5.5 - \value Unicode_8_0 Version 8.0 Since Qt 5.6 - \value Unicode_9_0 Version 9.0 Since Qt 5.11 - \value Unicode_10_0 Version 10.0 Since Qt 5.11 - \value Unicode_Unassigned The value is not assigned to any character - in version 8.0 of Unicode. - - \sa unicodeVersion(), currentUnicodeVersion() -*/ - -/*! - \enum QChar::Category - - This enum maps the Unicode character categories. - - The following characters are normative in Unicode: - - \value Mark_NonSpacing Unicode class name Mn - - \value Mark_SpacingCombining Unicode class name Mc - - \value Mark_Enclosing Unicode class name Me - - \value Number_DecimalDigit Unicode class name Nd - - \value Number_Letter Unicode class name Nl - - \value Number_Other Unicode class name No - - \value Separator_Space Unicode class name Zs - - \value Separator_Line Unicode class name Zl - - \value Separator_Paragraph Unicode class name Zp - - \value Other_Control Unicode class name Cc - - \value Other_Format Unicode class name Cf - - \value Other_Surrogate Unicode class name Cs - - \value Other_PrivateUse Unicode class name Co - - \value Other_NotAssigned Unicode class name Cn - - - The following categories are informative in Unicode: - - \value Letter_Uppercase Unicode class name Lu - - \value Letter_Lowercase Unicode class name Ll - - \value Letter_Titlecase Unicode class name Lt - - \value Letter_Modifier Unicode class name Lm - - \value Letter_Other Unicode class name Lo - - 
\value Punctuation_Connector Unicode class name Pc - - \value Punctuation_Dash Unicode class name Pd - - \value Punctuation_Open Unicode class name Ps - - \value Punctuation_Close Unicode class name Pe - - \value Punctuation_InitialQuote Unicode class name Pi - - \value Punctuation_FinalQuote Unicode class name Pf - - \value Punctuation_Other Unicode class name Po - - \value Symbol_Math Unicode class name Sm - - \value Symbol_Currency Unicode class name Sc - - \value Symbol_Modifier Unicode class name Sk - - \value Symbol_Other Unicode class name So - - \sa category() -*/ - -/*! - \enum QChar::Script - \since 5.1 - - This enum type defines the Unicode script property values. - - For details about the Unicode script property values see - \l{http://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}. - - In order to conform to C/C++ naming conventions "Script_" is prepended - to the codes used in the Unicode Standard. - - \value Script_Unknown For unassigned, private-use, noncharacter, and surrogate code points. - \value Script_Inherited For characters that may be used with multiple scripts - and that inherit their script from the preceding characters. - These include nonspacing marks, enclosing marks, - and zero width joiner/non-joiner characters. - \value Script_Common For characters that may be used with multiple scripts - and that do not inherit their script from the preceding characters. 
- - \value Script_Latin - \value Script_Greek - \value Script_Cyrillic - \value Script_Armenian - \value Script_Hebrew - \value Script_Arabic - \value Script_Syriac - \value Script_Thaana - \value Script_Devanagari - \value Script_Bengali - \value Script_Gurmukhi - \value Script_Gujarati - \value Script_Oriya - \value Script_Tamil - \value Script_Telugu - \value Script_Kannada - \value Script_Malayalam - \value Script_Sinhala - \value Script_Thai - \value Script_Lao - \value Script_Tibetan - \value Script_Myanmar - \value Script_Georgian - \value Script_Hangul - \value Script_Ethiopic - \value Script_Cherokee - \value Script_CanadianAboriginal - \value Script_Ogham - \value Script_Runic - \value Script_Khmer - \value Script_Mongolian - \value Script_Hiragana - \value Script_Katakana - \value Script_Bopomofo - \value Script_Han - \value Script_Yi - \value Script_OldItalic - \value Script_Gothic - \value Script_Deseret - \value Script_Tagalog - \value Script_Hanunoo - \value Script_Buhid - \value Script_Tagbanwa - \value Script_Coptic - \value Script_Limbu - \value Script_TaiLe - \value Script_LinearB - \value Script_Ugaritic - \value Script_Shavian - \value Script_Osmanya - \value Script_Cypriot - \value Script_Braille - \value Script_Buginese - \value Script_NewTaiLue - \value Script_Glagolitic - \value Script_Tifinagh - \value Script_SylotiNagri - \value Script_OldPersian - \value Script_Kharoshthi - \value Script_Balinese - \value Script_Cuneiform - \value Script_Phoenician - \value Script_PhagsPa - \value Script_Nko - \value Script_Sundanese - \value Script_Lepcha - \value Script_OlChiki - \value Script_Vai - \value Script_Saurashtra - \value Script_KayahLi - \value Script_Rejang - \value Script_Lycian - \value Script_Carian - \value Script_Lydian - \value Script_Cham - \value Script_TaiTham - \value Script_TaiViet - \value Script_Avestan - \value Script_EgyptianHieroglyphs - \value Script_Samaritan - \value Script_Lisu - \value Script_Bamum - \value 
Script_Javanese - \value Script_MeeteiMayek - \value Script_ImperialAramaic - \value Script_OldSouthArabian - \value Script_InscriptionalParthian - \value Script_InscriptionalPahlavi - \value Script_OldTurkic - \value Script_Kaithi - \value Script_Batak - \value Script_Brahmi - \value Script_Mandaic - \value Script_Chakma - \value Script_MeroiticCursive - \value Script_MeroiticHieroglyphs - \value Script_Miao - \value Script_Sharada - \value Script_SoraSompeng - \value Script_Takri - \value Script_CaucasianAlbanian - \value Script_BassaVah - \value Script_Duployan - \value Script_Elbasan - \value Script_Grantha - \value Script_PahawhHmong - \value Script_Khojki - \value Script_LinearA - \value Script_Mahajani - \value Script_Manichaean - \value Script_MendeKikakui - \value Script_Modi - \value Script_Mro - \value Script_OldNorthArabian - \value Script_Nabataean - \value Script_Palmyrene - \value Script_PauCinHau - \value Script_OldPermic - \value Script_PsalterPahlavi - \value Script_Siddham - \value Script_Khudawadi - \value Script_Tirhuta - \value Script_WarangCiti - \value Script_Ahom - \value Script_AnatolianHieroglyphs - \value Script_Hatran - \value Script_Multani - \value Script_OldHungarian - \value Script_SignWriting - \value Script_Adlam - \value Script_Bhaiksuki - \value Script_Marchen - \value Script_Newa - \value Script_Osage - \value Script_Tangut - \value Script_MasaramGondi - \value Script_Nushu - \value Script_Soyombo - \value Script_ZanabazarSquare - - \omitvalue ScriptCount - - \sa script() -*/ - -/*! - \enum QChar::Direction - - This enum type defines the Unicode direction attributes. See the - \l{http://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode Standard} for a description - of the values. - - In order to conform to C/C++ naming conventions "Dir" is prepended - to the codes used in the Unicode Standard. 
- - \value DirAL - \value DirAN - \value DirB - \value DirBN - \value DirCS - \value DirEN - \value DirES - \value DirET - \value DirFSI Since Qt 5.3 - \value DirL - \value DirLRE - \value DirLRI Since Qt 5.3 - \value DirLRO - \value DirNSM - \value DirON - \value DirPDF - \value DirPDI Since Qt 5.3 - \value DirR - \value DirRLE - \value DirRLI Since Qt 5.3 - \value DirRLO - \value DirS - \value DirWS - - \sa direction() -*/ - -/*! - \enum QChar::Decomposition - - This enum type defines the Unicode decomposition attributes. See - the \l{http://www.unicode.org/}{Unicode Standard} for a - description of the values. - - \value NoDecomposition - \value Canonical - \value Circle - \value Compat - \value Final - \value Font - \value Fraction - \value Initial - \value Isolated - \value Medial - \value Narrow - \value NoBreak - \value Small - \value Square - \value Sub - \value Super - \value Vertical - \value Wide - - \sa decomposition() -*/ - -/*! - \enum QChar::JoiningType - \since 5.3 - - This enum type defines the Unicode joining type attributes. See the - \l{http://www.unicode.org/}{Unicode Standard} for a description of the values. - - In order to conform to C/C++ naming conventions "Joining_" is prepended - to the codes used in the Unicode Standard. - - \value Joining_None - \value Joining_Causing - \value Joining_Dual - \value Joining_Right - \value Joining_Left - \value Joining_Transparent - - \sa joiningType() -*/ - -#if QT_DEPRECATED_SINCE(5, 3) -/*! - \enum QChar::Joining - \deprecated in 5.3, use JoiningType instead. - - This enum type defines the Unicode joining attributes. See the - \l{http://www.unicode.org/}{Unicode Standard} for a description - of the values. - - \value Center - \value Dual - \value OtherJoining - \value Right - - \sa joining() -*/ -#endif - -/*! - \enum QChar::CombiningClass - - \internal - - This enum type defines names for some of the Unicode combining - classes. 
See the \l{http://www.unicode.org/}{Unicode Standard} - for a description of the values. - - \value Combining_Above - \value Combining_AboveAttached - \value Combining_AboveLeft - \value Combining_AboveLeftAttached - \value Combining_AboveRight - \value Combining_AboveRightAttached - \value Combining_Below - \value Combining_BelowAttached - \value Combining_BelowLeft - \value Combining_BelowLeftAttached - \value Combining_BelowRight - \value Combining_BelowRightAttached - \value Combining_DoubleAbove - \value Combining_DoubleBelow - \value Combining_IotaSubscript - \value Combining_Left - \value Combining_LeftAttached - \value Combining_Right - \value Combining_RightAttached -*/ - -/*! - \enum QChar::SpecialCharacter - - \value Null A QChar with this value isNull(). - \value Tabulation Character tabulation. - \value LineFeed - \value CarriageReturn - \value Space - \value Nbsp Non-breaking space. - \value SoftHyphen - \value ReplacementCharacter The character shown when a font has no glyph - for a certain codepoint. A special question mark character is often - used. Codecs use this codepoint when input data cannot be - represented in Unicode. - \value ObjectReplacementCharacter Used to represent an object such as an - image when such objects cannot be presented. - \value ByteOrderMark - \value ByteOrderSwapped - \value ParagraphSeparator - \value LineSeparator - \value LastValidCodePoint -*/ - -/*! - \fn void QChar::setCell(uchar cell) - \internal -*/ - -/*! - \fn void QChar::setRow(uchar row) - \internal -*/ - -/*! - \fn QChar::QChar() - - Constructs a null QChar ('\\0'). - - \sa isNull() -*/ - -/*! - \fn QChar::QChar(QLatin1Char ch) - - Constructs a QChar corresponding to ASCII/Latin-1 character \a ch. -*/ - -/*! - \fn QChar::QChar(SpecialCharacter ch) - - Constructs a QChar for the predefined character value \a ch. -*/ - -/*! - \fn QChar::QChar(char16_t ch) - \since 5.10 - - Constructs a QChar corresponding to the UTF-16 character \a ch. -*/ - -/*! 
- \fn QChar::QChar(wchar_t ch) - \since 5.10 - - Constructs a QChar corresponding to the wide character \a ch. - - \note This constructor is only available on Windows. -*/ - -/*! - \fn QChar::QChar(char ch) - - Constructs a QChar corresponding to ASCII/Latin-1 character \a ch. - - \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII - is defined. - - \sa QT_NO_CAST_FROM_ASCII -*/ - -/*! - \fn QChar::QChar(uchar ch) - - Constructs a QChar corresponding to ASCII/Latin-1 character \a ch. - - \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII - or \c QT_RESTRICTED_CAST_FROM_ASCII is defined. - - \sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII -*/ - -/*! - \fn QChar::QChar(uchar cell, uchar row) - - Constructs a QChar for Unicode cell \a cell in row \a row. - - \sa cell(), row() -*/ - -/*! - \fn QChar::QChar(ushort code) - - Constructs a QChar for the character with Unicode code point \a code. -*/ - -/*! - \fn QChar::QChar(short code) - - Constructs a QChar for the character with Unicode code point \a code. -*/ - -/*! - \fn QChar::QChar(uint code) - - Constructs a QChar for the character with Unicode code point \a code. -*/ - -/*! - \fn QChar::QChar(int code) - - Constructs a QChar for the character with Unicode code point \a code. -*/ - -/*! - \fn bool QChar::isNull() const - - Returns \c true if the character is the Unicode character 0x0000 - ('\\0'); otherwise returns \c false. -*/ - -/*! - \fn uchar QChar::cell() const - - Returns the cell (least significant byte) of the Unicode character. - - \sa row() -*/ - -/*! - \fn uchar QChar::row() const - - Returns the row (most significant byte) of the Unicode character. - - \sa cell() -*/ - -/*! - \fn bool QChar::isPrint() const - - Returns \c true if the character is a printable character; otherwise - returns \c false. This is any character not of category Other_*. - - Note that this gives no indication of whether the character is - available in a particular font. -*/ - -/*! 
- \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a printable character; otherwise returns \c false. - This is any character not of category Other_*. - - Note that this gives no indication of whether the character is - available in a particular font. -*/ -bool QChar::isPrint(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - const int test = FLAG(Other_Control) | - FLAG(Other_Format) | - FLAG(Other_Surrogate) | - FLAG(Other_PrivateUse) | - FLAG(Other_NotAssigned); - return !(FLAG(qGetProp(ucs4)->category) & test); -} - -/*! - \fn bool QChar::isSpace() const - - Returns \c true if the character is a separator character - (Separator_* categories or certain code points from Other_Control category); - otherwise returns \c false. -*/ - -/*! - \fn bool QChar::isSpace(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a separator character (Separator_* categories or certain code points - from Other_Control category); otherwise returns \c false. -*/ - -/*! - \internal -*/ -bool QT_FASTCALL QChar::isSpace_helper(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - const int test = FLAG(Separator_Space) | - FLAG(Separator_Line) | - FLAG(Separator_Paragraph); - return FLAG(qGetProp(ucs4)->category) & test; -} - -/*! - \fn bool QChar::isMark() const - - Returns \c true if the character is a mark (Mark_* categories); - otherwise returns \c false. - - See QChar::Category for more information regarding marks. -*/ - -/*! - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a mark (Mark_* categories); otherwise returns \c false. -*/ -bool QChar::isMark(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - const int test = FLAG(Mark_NonSpacing) | - FLAG(Mark_SpacingCombining) | - FLAG(Mark_Enclosing); - return FLAG(qGetProp(ucs4)->category) & test; -} - -/*! 
- \fn bool QChar::isPunct() const - - Returns \c true if the character is a punctuation mark (Punctuation_* - categories); otherwise returns \c false. -*/ - -/*! - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a punctuation mark (Punctuation_* categories); otherwise returns \c false. -*/ -bool QChar::isPunct(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - const int test = FLAG(Punctuation_Connector) | - FLAG(Punctuation_Dash) | - FLAG(Punctuation_Open) | - FLAG(Punctuation_Close) | - FLAG(Punctuation_InitialQuote) | - FLAG(Punctuation_FinalQuote) | - FLAG(Punctuation_Other); - return FLAG(qGetProp(ucs4)->category) & test; -} - -/*! - \fn bool QChar::isSymbol() const - - Returns \c true if the character is a symbol (Symbol_* categories); - otherwise returns \c false. -*/ - -/*! - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a symbol (Symbol_* categories); otherwise returns \c false. -*/ -bool QChar::isSymbol(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - const int test = FLAG(Symbol_Math) | - FLAG(Symbol_Currency) | - FLAG(Symbol_Modifier) | - FLAG(Symbol_Other); - return FLAG(qGetProp(ucs4)->category) & test; -} - -/*! - \fn bool QChar::isLetter() const - - Returns \c true if the character is a letter (Letter_* categories); - otherwise returns \c false. -*/ - -/*! - \fn bool QChar::isLetter(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a letter (Letter_* categories); otherwise returns \c false. -*/ - -/*! - \internal -*/ -bool QT_FASTCALL QChar::isLetter_helper(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - const int test = FLAG(Letter_Uppercase) | - FLAG(Letter_Lowercase) | - FLAG(Letter_Titlecase) | - FLAG(Letter_Modifier) | - FLAG(Letter_Other); - return FLAG(qGetProp(ucs4)->category) & test; -} - -/*! 
- \fn bool QChar::isNumber() const - - Returns \c true if the character is a number (Number_* categories, - not just 0-9); otherwise returns \c false. - - \sa isDigit() -*/ - -/*! - \fn bool QChar::isNumber(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a number (Number_* categories, not just 0-9); otherwise returns \c false. - - \sa isDigit() -*/ - -/*! - \internal -*/ -bool QT_FASTCALL QChar::isNumber_helper(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - const int test = FLAG(Number_DecimalDigit) | - FLAG(Number_Letter) | - FLAG(Number_Other); - return FLAG(qGetProp(ucs4)->category) & test; -} - -/*! - \fn bool QChar::isLetterOrNumber() const - - Returns \c true if the character is a letter or number (Letter_* or - Number_* categories); otherwise returns \c false. -*/ - -/*! - \fn bool QChar::isLetterOrNumber(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a letter or number (Letter_* or Number_* categories); otherwise returns \c false. -*/ - -/*! - \internal -*/ -bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - const int test = FLAG(Letter_Uppercase) | - FLAG(Letter_Lowercase) | - FLAG(Letter_Titlecase) | - FLAG(Letter_Modifier) | - FLAG(Letter_Other) | - FLAG(Number_DecimalDigit) | - FLAG(Number_Letter) | - FLAG(Number_Other); - return FLAG(qGetProp(ucs4)->category) & test; -} - -/*! - \fn bool QChar::isDigit() const - - Returns \c true if the character is a decimal digit - (Number_DecimalDigit); otherwise returns \c false. - - \sa isNumber() -*/ - -/*! - \fn bool QChar::isDigit(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 is - a decimal digit (Number_DecimalDigit); otherwise returns \c false. - - \sa isNumber() -*/ - -/*! 
- \fn bool QChar::isNonCharacter() const - \since 5.0 - - Returns \c true if the QChar is a non-character; false otherwise. - - Unicode has a certain number of code points that are classified - as "non-characters": that is, they can be used for internal purposes - in applications but cannot be used for text interchange. - Those are the last two entries of each Unicode Plane ([0xfffe..0xffff], - [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef]. -*/ - -/*! - \fn bool QChar::isHighSurrogate() const - - Returns \c true if the QChar is the high part of a UTF16 surrogate - (for example if its code point is in range [0xd800..0xdbff]); false otherwise. -*/ - -/*! - \fn bool QChar::isLowSurrogate() const - - Returns \c true if the QChar is the low part of a UTF16 surrogate - (for example if its code point is in range [0xdc00..0xdfff]); false otherwise. -*/ - -/*! - \fn bool QChar::isSurrogate() const - \since 5.0 - - Returns \c true if the QChar contains a code point that is in either - the high or the low part of the UTF-16 surrogate range - (for example if its code point is in range [0xd800..0xdfff]); false otherwise. -*/ - -/*! - \fn static bool QChar::isNonCharacter(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - is a non-character; false otherwise. - - Unicode has a certain number of code points that are classified - as "non-characters": that is, they can be used for internal purposes - in applications but cannot be used for text interchange. - Those are the last two entries of each Unicode Plane ([0xfffe..0xffff], - [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef]. -*/ - -/*! - \fn static bool QChar::isHighSurrogate(uint ucs4) - \overload - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - is the high part of a UTF16 surrogate - (for example if its code point is in range [0xd800..0xdbff]); false otherwise. -*/ - -/*! 
- \fn static bool QChar::isLowSurrogate(uint ucs4) - \overload - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - is the low part of a UTF16 surrogate - (for example if its code point is in range [0xdc00..0xdfff]); false otherwise. -*/ - -/*! - \fn static bool QChar::isSurrogate(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - contains a code point that is in either the high or the low part of the - UTF-16 surrogate range (for example if its code point is in range [0xd800..0xdfff]); - false otherwise. -*/ - -/*! - \fn static bool QChar::requiresSurrogates(uint ucs4) - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - can be split into the high and low parts of a UTF16 surrogate - (for example if its code point is greater than or equal to 0x10000); - false otherwise. -*/ - -/*! - \fn static uint QChar::surrogateToUcs4(ushort high, ushort low) - - Converts a UTF16 surrogate pair with the given \a high and \a low values - to its UCS-4-encoded code point. -*/ - -/*! - \fn static uint QChar::surrogateToUcs4(QChar high, QChar low) - \overload - - Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point. -*/ - -/*! - \fn static ushort QChar::highSurrogate(uint ucs4) - - Returns the high surrogate part of a UCS-4-encoded code point. - The returned result is undefined if \a ucs4 is smaller than 0x10000. -*/ - -/*! - \fn static ushort QChar::lowSurrogate(uint ucs4) - - Returns the low surrogate part of a UCS-4-encoded code point. - The returned result is undefined if \a ucs4 is smaller than 0x10000. -*/ - -/*! - \fn int QChar::digitValue() const - - Returns the numeric value of the digit, or -1 if the character is not a digit. -*/ - -/*! - \overload - Returns the numeric value of the digit specified by the UCS-4-encoded - character, \a ucs4, or -1 if the character is not a digit. 
-*/ -int QChar::digitValue(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return -1; - return qGetProp(ucs4)->digitValue; -} - -/*! - \fn QChar::Category QChar::category() const - - Returns the character's category. -*/ - -/*! - \overload - Returns the category of the UCS-4-encoded character specified by \a ucs4. -*/ -QChar::Category QChar::category(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return QChar::Other_NotAssigned; - return (QChar::Category) qGetProp(ucs4)->category; -} - -/*! - \fn QChar::Direction QChar::direction() const - - Returns the character's direction. -*/ - -/*! - \overload - Returns the direction of the UCS-4-encoded character specified by \a ucs4. -*/ -QChar::Direction QChar::direction(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return QChar::DirL; - return (QChar::Direction) qGetProp(ucs4)->direction; -} - -/*! - \fn QChar::JoiningType QChar::joiningType() const - \since 5.3 - - Returns information about the joining type attributes of the character - (needed for certain languages such as Arabic or Syriac). -*/ - -/*! - \overload - \since 5.3 - - Returns information about the joining type attributes of the UCS-4-encoded - character specified by \a ucs4 - (needed for certain languages such as Arabic or Syriac). -*/ -QChar::JoiningType QChar::joiningType(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return QChar::Joining_None; - return QChar::JoiningType(qGetProp(ucs4)->joining); -} - -#if QT_DEPRECATED_SINCE(5, 3) -/*! - \fn QChar::Joining QChar::joining() const - \deprecated in 5.3, use joiningType() instead. - - Returns information about the joining properties of the character - (needed for certain languages such as Arabic). -*/ - -/*! - \overload - \deprecated in 5.3, use joiningType() instead. - - Returns information about the joining properties of the UCS-4-encoded - character specified by \a ucs4 (needed for certain languages such as Arabic). 
-*/ -QChar::Joining QChar::joining(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return QChar::OtherJoining; - switch (qGetProp(ucs4)->joining) { - case QChar::Joining_Causing: return QChar::Center; - case QChar::Joining_Dual: return QChar::Dual; - case QChar::Joining_Right: return QChar::Right; - default: break; - } - return QChar::OtherJoining; -} -#endif - -/*! - \fn bool QChar::hasMirrored() const - - Returns \c true if the character should be reversed if the text - direction is reversed; otherwise returns \c false. - - A bit faster equivalent of (ch.mirroredChar() != ch). - - \sa mirroredChar() -*/ - -/*! - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - should be reversed if the text direction is reversed; otherwise returns \c false. - - A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4). - - \sa mirroredChar() -*/ -bool QChar::hasMirrored(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return false; - return qGetProp(ucs4)->mirrorDiff != 0; -} - -/*! - \fn bool QChar::isLower() const - - Returns \c true if the character is a lowercase letter, for example - category() is Letter_Lowercase. - - \sa isUpper(), toLower(), toUpper() -*/ - -/*! - \fn static bool QChar::isLower(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - is a lowercase letter, for example category() is Letter_Lowercase. - - \sa isUpper(), toLower(), toUpper() -*/ - -/*! - \fn bool QChar::isUpper() const - - Returns \c true if the character is an uppercase letter, for example - category() is Letter_Uppercase. - - \sa isLower(), toUpper(), toLower() -*/ - -/*! - \fn static bool QChar::isUpper(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - is an uppercase letter, for example category() is Letter_Uppercase. - - \sa isLower(), toUpper(), toLower() -*/ - -/*! 
- \fn bool QChar::isTitleCase() const - - Returns \c true if the character is a titlecase letter, for example - category() is Letter_Titlecase. - - \sa isLower(), toUpper(), toLower(), toTitleCase() -*/ - -/*! - \fn static bool QChar::isTitleCase(uint ucs4) - \overload - \since 5.0 - - Returns \c true if the UCS-4-encoded character specified by \a ucs4 - is a titlecase letter, for example category() is Letter_Titlecase. - - \sa isLower(), toUpper(), toLower(), toTitleCase() -*/ -/*! - \fn QChar QChar::mirroredChar() const - - Returns the mirrored character if this character is a mirrored - character; otherwise returns the character itself. - - \sa hasMirrored() -*/ - -/*! - \overload - Returns the mirrored character if the UCS-4-encoded character specified - by \a ucs4 is a mirrored character; otherwise returns the character itself. - - \sa hasMirrored() -*/ -uint QChar::mirroredChar(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return ucs4; - return ucs4 + qGetProp(ucs4)->mirrorDiff; -} - - -// constants for Hangul (de)composition, see UAX #15 -enum { - Hangul_SBase = 0xac00, - Hangul_LBase = 0x1100, - Hangul_VBase = 0x1161, - Hangul_TBase = 0x11a7, - Hangul_LCount = 19, - Hangul_VCount = 21, - Hangul_TCount = 28, - Hangul_NCount = Hangul_VCount * Hangul_TCount, - Hangul_SCount = Hangul_LCount * Hangul_NCount -}; - -// buffer has to have a length of 3. It's needed for Hangul decomposition -static const unsigned short * QT_FASTCALL decompositionHelper - (uint ucs4, int *length, int *tag, unsigned short *buffer) -{ - if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) { - // compute Hangul syllable decomposition as per UAX #15 - const uint SIndex = ucs4 - Hangul_SBase; - buffer[0] = Hangul_LBase + SIndex / Hangul_NCount; // L - buffer[1] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V - buffer[2] = Hangul_TBase + SIndex % Hangul_TCount; // T - *length = buffer[2] == Hangul_TBase ? 
2 : 3; - *tag = QChar::Canonical; - return buffer; - } - - const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4); - if (index == 0xffff) { - *length = 0; - *tag = QChar::NoDecomposition; - return nullptr; - } - - const unsigned short *decomposition = uc_decomposition_map+index; - *tag = (*decomposition) & 0xff; - *length = (*decomposition) >> 8; - return decomposition+1; -} - -/*! - Decomposes a character into it's constituent parts. Returns an empty string - if no decomposition exists. -*/ -QString QChar::decomposition() const -{ - return QChar::decomposition(ucs); -} - -/*! - \overload - Decomposes the UCS-4-encoded character specified by \a ucs4 into it's - constituent parts. Returns an empty string if no decomposition exists. -*/ -QString QChar::decomposition(uint ucs4) -{ - unsigned short buffer[3]; - int length; - int tag; - const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer); - return QString(reinterpret_cast<const QChar *>(d), length); -} - -/*! - \fn QChar::Decomposition QChar::decompositionTag() const - - Returns the tag defining the composition of the character. Returns - QChar::NoDecomposition if no decomposition exists. -*/ - -/*! - \overload - Returns the tag defining the composition of the UCS-4-encoded character - specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists. -*/ -QChar::Decomposition QChar::decompositionTag(uint ucs4) noexcept -{ - if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) - return QChar::Canonical; - const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4); - if (index == 0xffff) - return QChar::NoDecomposition; - return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff); -} - -/*! - \fn unsigned char QChar::combiningClass() const - - Returns the combining class for the character as defined in the - Unicode standard. This is mainly useful as a positioning hint for - marks attached to a base character. 
- - The Qt text rendering engine uses this information to correctly - position non-spacing marks around a base character. -*/ - -/*! - \overload - Returns the combining class for the UCS-4-encoded character specified by - \a ucs4, as defined in the Unicode standard. -*/ -unsigned char QChar::combiningClass(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return 0; - return (unsigned char) qGetProp(ucs4)->combiningClass; -} - -/*! - \fn QChar::Script QChar::script() const - \since 5.1 - - Returns the Unicode script property value for this character. -*/ - -/*! - \overload - \since 5.1 - - Returns the Unicode script property value for the character specified in - its UCS-4-encoded form as \a ucs4. -*/ -QChar::Script QChar::script(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return QChar::Script_Unknown; - return (QChar::Script) qGetProp(ucs4)->script; -} - -/*! - \fn QChar::UnicodeVersion QChar::unicodeVersion() const - - Returns the Unicode version that introduced this character. -*/ - -/*! - \overload - Returns the Unicode version that introduced the character specified in - its UCS-4-encoded form as \a ucs4. -*/ -QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return QChar::Unicode_Unassigned; - return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion; -} - -/*! - Returns the most recent supported Unicode version. -*/ -QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept -{ - return UNICODE_DATA_VERSION; -} - - -template <typename Traits, typename T> -Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc) noexcept -{ - const QUnicodeTables::Properties *prop = qGetProp(uc); - - if (Q_UNLIKELY(Traits::caseSpecial(prop))) { - const ushort *specialCase = specialCaseMap + Traits::caseDiff(prop); - // so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator) - return *specialCase == 1 ? 
specialCase[1] : uc; - } - - return uc + Traits::caseDiff(prop); -} - -/*! - \fn QChar QChar::toLower() const - - Returns the lowercase equivalent if the character is uppercase or titlecase; - otherwise returns the character itself. -*/ - -/*! - \overload - Returns the lowercase equivalent of the UCS-4-encoded character specified - by \a ucs4 if the character is uppercase or titlecase; otherwise returns - the character itself. -*/ -uint QChar::toLower(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return ucs4; - return convertCase_helper<QUnicodeTables::LowercaseTraits>(ucs4); -} - -/*! - \fn QChar QChar::toUpper() const - - Returns the uppercase equivalent if the character is lowercase or titlecase; - otherwise returns the character itself. -*/ - -/*! - \overload - Returns the uppercase equivalent of the UCS-4-encoded character specified - by \a ucs4 if the character is lowercase or titlecase; otherwise returns - the character itself. -*/ -uint QChar::toUpper(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return ucs4; - return convertCase_helper<QUnicodeTables::UppercaseTraits>(ucs4); -} - -/*! - \fn QChar QChar::toTitleCase() const - - Returns the title case equivalent if the character is lowercase or uppercase; - otherwise returns the character itself. -*/ - -/*! - \overload - Returns the title case equivalent of the UCS-4-encoded character specified - by \a ucs4 if the character is lowercase or uppercase; otherwise returns - the character itself. 
-*/ -uint QChar::toTitleCase(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return ucs4; - return convertCase_helper<QUnicodeTables::TitlecaseTraits>(ucs4); -} - -static inline uint foldCase(const ushort *ch, const ushort *start) -{ - uint ucs4 = *ch; - if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(*(ch - 1))) - ucs4 = QChar::surrogateToUcs4(*(ch - 1), ucs4); - return convertCase_helper<QUnicodeTables::CasefoldTraits>(ucs4); -} - -static inline uint foldCase(uint ch, uint &last) noexcept -{ - uint ucs4 = ch; - if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(last)) - ucs4 = QChar::surrogateToUcs4(last, ucs4); - last = ch; - return convertCase_helper<QUnicodeTables::CasefoldTraits>(ucs4); -} - -static inline ushort foldCase(ushort ch) noexcept -{ - return convertCase_helper<QUnicodeTables::CasefoldTraits>(ch); -} - -static inline QChar foldCase(QChar ch) noexcept -{ - return QChar(foldCase(ch.unicode())); -} - -/*! - \fn QChar QChar::toCaseFolded() const - - Returns the case folded equivalent of the character. - For most Unicode characters this is the same as toLower(). -*/ - -/*! - \overload - Returns the case folded equivalent of the UCS-4-encoded character specified - by \a ucs4. For most Unicode characters this is the same as toLower(). -*/ -uint QChar::toCaseFolded(uint ucs4) noexcept -{ - if (ucs4 > LastValidCodePoint) - return ucs4; - return convertCase_helper<QUnicodeTables::CasefoldTraits>(ucs4); -} - -/*! - \fn char QChar::toLatin1() const - - Returns the Latin-1 character equivalent to the QChar, or 0. This - is mainly useful for non-internationalized software. - - \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0 - (NUL) character. Prefer to use unicode(), which does not have this ambiguity. - - \sa unicode() -*/ - -/*! - \fn QChar QChar::fromLatin1(char) - - Converts the Latin-1 character \a c to its equivalent QChar. This - is mainly useful for non-internationalized software. 
- - An alternative is to use QLatin1Char. - - \sa toLatin1(), unicode() -*/ - -/*! - \fn char QChar::toAscii() const - \deprecated - - Returns the Latin-1 character value of the QChar, or 0 if the character is not - representable. - - The main purpose of this function is to preserve ASCII characters used - in C strings. This is mainly useful for developers of non-internationalized - software. - - \note It is not possible to distinguish a non-Latin 1 character from an ASCII 0 - (NUL) character. Prefer to use unicode(), which does not have this ambiguity. - - \note This function does not check whether the character value is inside - the valid range of US-ASCII. - - \sa toLatin1(), unicode() -*/ - -/*! - \fn QChar QChar::fromAscii(char) - \deprecated - - Converts the ASCII character \a c to it's equivalent QChar. This - is mainly useful for non-internationalized software. - - An alternative is to use QLatin1Char. - - \sa fromLatin1(), unicode() -*/ - -#ifndef QT_NO_DATASTREAM -/*! - \relates QChar - - Writes the char \a chr to the stream \a out. - - \sa {Serializing Qt Data Types} -*/ -QDataStream &operator<<(QDataStream &out, QChar chr) -{ - out << quint16(chr.unicode()); - return out; -} - -/*! - \relates QChar - - Reads a char from the stream \a in into char \a chr. - - \sa {Serializing Qt Data Types} -*/ -QDataStream &operator>>(QDataStream &in, QChar &chr) -{ - quint16 u; - in >> u; - chr.unicode() = ushort(u); - return in; -} -#endif // QT_NO_DATASTREAM - -/*! - \fn ushort & QChar::unicode() - - Returns a reference to the numeric Unicode value of the QChar. -*/ - -/*! - \fn ushort QChar::unicode() const - - Returns the numeric Unicode value of the QChar. -*/ - -/***************************************************************************** - Documentation of QChar related functions - *****************************************************************************/ - -/*! 
- \fn bool operator==(QChar c1, QChar c2) - - \relates QChar - - Returns \c true if \a c1 and \a c2 are the same Unicode character; - otherwise returns \c false. -*/ - -/*! - \fn int operator!=(QChar c1, QChar c2) - - \relates QChar - - Returns \c true if \a c1 and \a c2 are not the same Unicode - character; otherwise returns \c false. -*/ - -/*! - \fn int operator<=(QChar c1, QChar c2) - - \relates QChar - - Returns \c true if the numeric Unicode value of \a c1 is less than - or equal to that of \a c2; otherwise returns \c false. -*/ - -/*! - \fn int operator>=(QChar c1, QChar c2) - - \relates QChar - - Returns \c true if the numeric Unicode value of \a c1 is greater than - or equal to that of \a c2; otherwise returns \c false. -*/ - -/*! - \fn int operator<(QChar c1, QChar c2) - - \relates QChar - - Returns \c true if the numeric Unicode value of \a c1 is less than - that of \a c2; otherwise returns \c false. -*/ - -/*! - \fn int operator>(QChar c1, QChar c2) - - \relates QChar - - Returns \c true if the numeric Unicode value of \a c1 is greater than - that of \a c2; otherwise returns \c false. 
-*/ - - -// --------------------------------------------------------------------------- - - -static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, int from) -{ - int length; - int tag; - unsigned short buffer[3]; - - QString &s = *str; - - const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data()); - const unsigned short *uc = utf16 + s.length(); - while (uc != utf16 + from) { - uint ucs4 = *(--uc); - if (QChar(ucs4).isLowSurrogate() && uc != utf16) { - ushort high = *(uc - 1); - if (QChar(high).isHighSurrogate()) { - --uc; - ucs4 = QChar::surrogateToUcs4(high, ucs4); - } - } - - if (QChar::unicodeVersion(ucs4) > version) - continue; - - const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer); - if (!d || (canonical && tag != QChar::Canonical)) - continue; - - int pos = uc - utf16; - s.replace(pos, QChar::requiresSurrogates(ucs4) ? 2 : 1, reinterpret_cast<const QChar *>(d), length); - // since the replace invalidates the pointers and we do decomposition recursive - utf16 = reinterpret_cast<unsigned short *>(s.data()); - uc = utf16 + pos + length; - } -} - - -struct UCS2Pair { - ushort u1; - ushort u2; -}; - -inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2) -{ return ligature1.u1 < ligature2.u1; } -inline bool operator<(ushort u1, const UCS2Pair &ligature) -{ return u1 < ligature.u1; } -inline bool operator<(const UCS2Pair &ligature, ushort u1) -{ return ligature.u1 < u1; } - -struct UCS2SurrogatePair { - UCS2Pair p1; - UCS2Pair p2; -}; - -inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2) -{ return QChar::surrogateToUcs4(ligature1.p1.u1, ligature1.p1.u2) < QChar::surrogateToUcs4(ligature2.p1.u1, ligature2.p1.u2); } -inline bool operator<(uint u1, const UCS2SurrogatePair &ligature) -{ return u1 < QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2); } -inline bool operator<(const UCS2SurrogatePair &ligature, uint u1) -{ return 
QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2) < u1; } - -static uint inline ligatureHelper(uint u1, uint u2) -{ - if (u1 >= Hangul_LBase && u1 <= Hangul_SBase + Hangul_SCount) { - // compute Hangul syllable composition as per UAX #15 - // hangul L-V pair - const uint LIndex = u1 - Hangul_LBase; - if (LIndex < Hangul_LCount) { - const uint VIndex = u2 - Hangul_VBase; - if (VIndex < Hangul_VCount) - return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount; - } - // hangul LV-T pair - const uint SIndex = u1 - Hangul_SBase; - if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) { - const uint TIndex = u2 - Hangul_TBase; - if (TIndex <= Hangul_TCount) - return u1 + TIndex; - } - } - - const unsigned short index = GET_LIGATURE_INDEX(u2); - if (index == 0xffff) - return 0; - const unsigned short *ligatures = uc_ligature_map+index; - ushort length = *ligatures++; - if (QChar::requiresSurrogates(u1)) { - const UCS2SurrogatePair *data = reinterpret_cast<const UCS2SurrogatePair *>(ligatures); - const UCS2SurrogatePair *r = std::lower_bound(data, data + length, u1); - if (r != data + length && QChar::surrogateToUcs4(r->p1.u1, r->p1.u2) == u1) - return QChar::surrogateToUcs4(r->p2.u1, r->p2.u2); - } else { - const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures); - const UCS2Pair *r = std::lower_bound(data, data + length, ushort(u1)); - if (r != data + length && r->u1 == ushort(u1)) - return r->u2; - } - - return 0; -} - -static void composeHelper(QString *str, QChar::UnicodeVersion version, int from) -{ - QString &s = *str; - - if (from < 0 || s.length() - from < 2) - return; - - uint stcode = 0; // starter code point - int starter = -1; // starter position - int next = -1; // to prevent i == next - int lastCombining = 255; // to prevent combining > lastCombining - - int pos = from; - while (pos < s.length()) { - int i = pos; - uint uc = s.at(pos).unicode(); - if (QChar(uc).isHighSurrogate() && pos < s.length()-1) { - ushort low = 
s.at(pos+1).unicode(); - if (QChar(low).isLowSurrogate()) { - uc = QChar::surrogateToUcs4(uc, low); - ++pos; - } - } - - const QUnicodeTables::Properties *p = qGetProp(uc); - if (p->unicodeVersion > version) { - starter = -1; - next = -1; // to prevent i == next - lastCombining = 255; // to prevent combining > lastCombining - ++pos; - continue; - } - - int combining = p->combiningClass; - if ((i == next || combining > lastCombining) && starter >= from) { - // allowed to form ligature with S - uint ligature = ligatureHelper(stcode, uc); - if (ligature) { - stcode = ligature; - QChar *d = s.data(); - // ligatureHelper() never changes planes - if (QChar::requiresSurrogates(ligature)) { - d[starter] = QChar(QChar::highSurrogate(ligature)); - d[starter + 1] = QChar(QChar::lowSurrogate(ligature)); - s.remove(i, 2); - } else { - d[starter] = QChar(ligature); - s.remove(i, 1); - } - continue; - } - } - if (combining == 0) { - starter = i; - stcode = uc; - next = pos + 1; - } - lastCombining = combining; - - ++pos; - } -} - - -static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, int from) -{ - QString &s = *str; - const int l = s.length()-1; - - uint u1, u2; - ushort c1, c2; - - int pos = from; - while (pos < l) { - int p2 = pos+1; - u1 = s.at(pos).unicode(); - if (QChar(u1).isHighSurrogate()) { - ushort low = s.at(p2).unicode(); - if (QChar(low).isLowSurrogate()) { - u1 = QChar::surrogateToUcs4(u1, low); - if (p2 >= l) - break; - ++p2; - } - } - c1 = 0; - - advance: - u2 = s.at(p2).unicode(); - if (QChar(u2).isHighSurrogate() && p2 < l) { - ushort low = s.at(p2+1).unicode(); - if (QChar(low).isLowSurrogate()) { - u2 = QChar::surrogateToUcs4(u2, low); - ++p2; - } - } - - c2 = 0; - { - const QUnicodeTables::Properties *p = qGetProp(u2); - if (p->unicodeVersion <= version) - c2 = p->combiningClass; - } - if (c2 == 0) { - pos = p2+1; - continue; - } - - if (c1 == 0) { - const QUnicodeTables::Properties *p = qGetProp(u1); - if (p->unicodeVersion <= 
version) - c1 = p->combiningClass; - } - - if (c1 > c2) { - QChar *uc = s.data(); - int p = pos; - // exchange characters - if (!QChar::requiresSurrogates(u2)) { - uc[p++] = QChar(u2); - } else { - uc[p++] = QChar(QChar::highSurrogate(u2)); - uc[p++] = QChar(QChar::lowSurrogate(u2)); - } - if (!QChar::requiresSurrogates(u1)) { - uc[p++] = QChar(u1); - } else { - uc[p++] = QChar(QChar::highSurrogate(u1)); - uc[p++] = QChar(QChar::lowSurrogate(u1)); - } - if (pos > 0) - --pos; - if (pos > 0 && s.at(pos).isLowSurrogate()) - --pos; - } else { - ++pos; - if (QChar::requiresSurrogates(u1)) - ++pos; - - u1 = u2; - c1 = c2; // != 0 - p2 = pos + 1; - if (QChar::requiresSurrogates(u1)) - ++p2; - if (p2 > l) - break; - - goto advance; - } - } -} - -// returns true if the text is in a desired Normalization Form already; false otherwise. -// sets lastStable to the position of the last stable code point -static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, int from, int *lastStable) -{ - Q_STATIC_ASSERT(QString::NormalizationForm_D == 0); - Q_STATIC_ASSERT(QString::NormalizationForm_C == 1); - Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2); - Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3); - - enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 }; - - const ushort *string = reinterpret_cast<const ushort *>(str->constData()); - int length = str->length(); - - // this avoids one out of bounds check in the loop - while (length > from && QChar::isHighSurrogate(string[length - 1])) - --length; - - uchar lastCombining = 0; - for (int i = from; i < length; ++i) { - int pos = i; - uint uc = string[i]; - if (uc < 0x80) { - // ASCII characters are stable code points - lastCombining = 0; - *lastStable = pos; - continue; - } - - if (QChar::isHighSurrogate(uc)) { - ushort low = string[i + 1]; - if (!QChar::isLowSurrogate(low)) { - // treat surrogate like stable code point - lastCombining = 0; - *lastStable = pos; - continue; - } - ++i; - uc = 
QChar::surrogateToUcs4(uc, low); - } - - const QUnicodeTables::Properties *p = qGetProp(uc); - - if (p->combiningClass < lastCombining && p->combiningClass > 0) - return false; - - const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03; - if (check != NFQC_YES) - return false; // ### can we quick check NFQC_MAYBE ? - - lastCombining = p->combiningClass; - if (lastCombining == 0) - *lastStable = pos; - } - - if (length != str->length()) // low surrogate parts at the end of text - *lastStable = str->length() - 1; - - return true; -} - -QT_END_NAMESPACE |
