This source file includes the following definitions.
- foldQuoteMarkOrSoftHyphen
- foldQuoteMarksAndSoftHyphens
- foldQuoteMarksAndSoftHyphens
- isNonLatin1Separator
- isSeparator
- isKanaLetter
- isSmallKanaLetter
- composedVoicedSoundMark
- isCombiningVoicedSoundMark
- containsKanaLetters
- normalizeCharactersIntoNFCForm
- compareKanaLetterAndComposedVoicedSoundMarks
- checkOnlyKanaLettersInStrings
- checkKanaStringsEqual
#include "config.h"
#include "platform/text/UnicodeUtilities.h"
#include "wtf/text/StringBuffer.h"
#include "wtf/unicode/CharacterNames.h"
#include <unicode/unorm.h>
using namespace WTF::Unicode;
namespace WebCore {
// Classification of a kana letter according to the sound mark that is part of
// its precomposed form, as reported by composedVoicedSoundMark().
enum VoicedSoundMarkType {
// The letter is not one of the precomposed voiced or semi-voiced letters.
NoVoicedSoundMark,
// The letter is a precomposed voiced letter (for example U+304C).
VoicedSoundMark,
// The letter is a precomposed semi-voiced letter (for example U+3071).
SemiVoicedSoundMark
};
template <typename CharType>
static inline CharType foldQuoteMarkOrSoftHyphen(CharType c)
{
switch (static_cast<UChar>(c)) {
case hebrewPunctuationGershayim:
case leftDoubleQuotationMark:
case rightDoubleQuotationMark:
return '"';
case hebrewPunctuationGeresh:
case leftSingleQuotationMark:
case rightSingleQuotationMark:
return '\'';
case softHyphen:
return 0;
default:
return c;
}
}
// Folds quotation marks and soft hyphens in place over the |length| code units
// starting at |data|, using foldQuoteMarkOrSoftHyphen() for each code unit.
void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
{
    UChar* const end = data + length;
    for (UChar* cursor = data; cursor != end; ++cursor)
        *cursor = foldQuoteMarkOrSoftHyphen(*cursor);
}
// Folds quotation marks and soft hyphens in |s| by replacing each special
// character with its plain counterpart ('\'', '"', or 0 for the soft hyphen).
void foldQuoteMarksAndSoftHyphens(String& s)
{
    struct Folding {
        UChar from;
        UChar to;
    };
    // Applied in this order; the replacement characters never appear as a
    // |from| entry, so the individual replacements do not interact.
    const Folding foldings[] = {
        { hebrewPunctuationGeresh, '\'' },
        { hebrewPunctuationGershayim, '"' },
        { leftDoubleQuotationMark, '"' },
        { leftSingleQuotationMark, '\'' },
        { rightDoubleQuotationMark, '"' },
        { rightSingleQuotationMark, '\'' },
        { softHyphen, 0 },
    };

    const size_t foldingCount = sizeof(foldings) / sizeof(foldings[0]);
    for (size_t i = 0; i < foldingCount; ++i)
        s.replace(foldings[i].from, foldings[i].to);
}
// Returns true when |character| (which must be >= 256) belongs to one of the
// Unicode general categories selected below: symbols (S), punctuation (P),
// separators (Z) or format characters (Cf).
static bool isNonLatin1Separator(UChar32 character)
{
    ASSERT_ARG(character, character >= 256);

    const uint32_t separatorCategories = U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK;
    return (U_GET_GC_MASK(character) & separatorCategories) != 0;
}
// Returns true when |character| is treated as a separator.
//
// Code points below 256 are answered from a precomputed table; everything else
// is classified by Unicode general category via isNonLatin1Separator().
// Negative values (UChar32 is a signed type, e.g. U_SENTINEL) are never
// separators.
bool isSeparator(UChar32 character)
{
    static const bool latin1SeparatorTable[256] = {
        // U+0000..U+001F: none.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // U+0020..U+002F: space ! " # $ % & ' ( ) * + , - . /
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        // U+0030..U+003F: digits are not separators; : ; < = > ? are.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        // U+0040..U+004F: only @.
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // U+0050..U+005F: the five characters after Z (U+005B..U+005F).
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        // U+0060..U+006F: only the grave accent.
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // U+0070..U+007F: { | } ~ are separators; U+007F is not.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
        // U+0080..U+009F: none.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // U+00A0..U+00AF: all except U+00A0 and U+00AA.
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        // U+00B0..U+00BF: all except U+00B5 and U+00BA.
        1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        // U+00C0..U+00CF: none.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // U+00D0..U+00DF: only U+00D7.
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        // U+00E0..U+00EF: none.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // U+00F0..U+00FF: only U+00F7.
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
    };

    // UChar32 is signed: without this guard a negative value would pass the
    // "< 256" test below and index the table out of bounds.
    if (character < 0)
        return false;
    if (character < 256)
        return latin1SeparatorTable[character];
    return isNonLatin1Separator(character);
}
// Returns true when |character| lies in one of the kana letter ranges below.
bool isKanaLetter(UChar character)
{
    // Hiragana letters.
    const bool isHiragana = character >= 0x3041 && character <= 0x3096;
    // Katakana letters.
    const bool isKatakana = character >= 0x30A1 && character <= 0x30FA;
    // Katakana phonetic extensions.
    const bool isKatakanaExtension = character >= 0x31F0 && character <= 0x31FF;
    // Half-width katakana letters; U+FF70 inside the range is excluded.
    const bool isHalfWidthKatakana = character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70;

    return isHiragana || isKatakana || isKatakanaExtension || isHalfWidthKatakana;
}
// Returns true when |character| is one of the small kana letters listed here.
// The caller must pass a kana letter (see isKanaLetter()).
bool isSmallKanaLetter(UChar character)
{
    ASSERT(isKanaLetter(character));

    // Every code unit in U+31F0..U+31FF is listed as a small letter.
    if (character >= 0x31F0 && character <= 0x31FF)
        return true;
    // The half-width small letters form the contiguous run U+FF67..U+FF6F.
    if (character >= 0xFF67 && character <= 0xFF6F)
        return true;

    bool isSmall = false;
    switch (character) {
    case 0x3041: // HIRAGANA LETTER SMALL A
    case 0x3043: // HIRAGANA LETTER SMALL I
    case 0x3045: // HIRAGANA LETTER SMALL U
    case 0x3047: // HIRAGANA LETTER SMALL E
    case 0x3049: // HIRAGANA LETTER SMALL O
    case 0x3063: // HIRAGANA LETTER SMALL TU
    case 0x3083: // HIRAGANA LETTER SMALL YA
    case 0x3085: // HIRAGANA LETTER SMALL YU
    case 0x3087: // HIRAGANA LETTER SMALL YO
    case 0x308E: // HIRAGANA LETTER SMALL WA
    case 0x3095: // HIRAGANA LETTER SMALL KA
    case 0x3096: // HIRAGANA LETTER SMALL KE
    case 0x30A1: // KATAKANA LETTER SMALL A
    case 0x30A3: // KATAKANA LETTER SMALL I
    case 0x30A5: // KATAKANA LETTER SMALL U
    case 0x30A7: // KATAKANA LETTER SMALL E
    case 0x30A9: // KATAKANA LETTER SMALL O
    case 0x30C3: // KATAKANA LETTER SMALL TU
    case 0x30E3: // KATAKANA LETTER SMALL YA
    case 0x30E5: // KATAKANA LETTER SMALL YU
    case 0x30E7: // KATAKANA LETTER SMALL YO
    case 0x30EE: // KATAKANA LETTER SMALL WA
    case 0x30F5: // KATAKANA LETTER SMALL KA
    case 0x30F6: // KATAKANA LETTER SMALL KE
        isSmall = true;
        break;
    default:
        break;
    }
    return isSmall;
}
// Reports which sound mark, if any, is part of the precomposed kana letter
// |character|. The caller must pass a kana letter (see isKanaLetter()).
static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
{
    ASSERT(isKanaLetter(character));

    VoicedSoundMarkType markType = NoVoicedSoundMark;
    switch (character) {
    // Precomposed voiced hiragana.
    case 0x304C: // GA
    case 0x304E: // GI
    case 0x3050: // GU
    case 0x3052: // GE
    case 0x3054: // GO
    case 0x3056: // ZA
    case 0x3058: // ZI
    case 0x305A: // ZU
    case 0x305C: // ZE
    case 0x305E: // ZO
    case 0x3060: // DA
    case 0x3062: // DI
    case 0x3065: // DU
    case 0x3067: // DE
    case 0x3069: // DO
    case 0x3070: // BA
    case 0x3073: // BI
    case 0x3076: // BU
    case 0x3079: // BE
    case 0x307C: // BO
    case 0x3094: // VU
    // Precomposed voiced katakana.
    case 0x30AC: // GA
    case 0x30AE: // GI
    case 0x30B0: // GU
    case 0x30B2: // GE
    case 0x30B4: // GO
    case 0x30B6: // ZA
    case 0x30B8: // ZI
    case 0x30BA: // ZU
    case 0x30BC: // ZE
    case 0x30BE: // ZO
    case 0x30C0: // DA
    case 0x30C2: // DI
    case 0x30C5: // DU
    case 0x30C7: // DE
    case 0x30C9: // DO
    case 0x30D0: // BA
    case 0x30D3: // BI
    case 0x30D6: // BU
    case 0x30D9: // BE
    case 0x30DC: // BO
    case 0x30F4: // VU
    case 0x30F7: // VA
    case 0x30F8: // VI
    case 0x30F9: // VE
    case 0x30FA: // VO
        markType = VoicedSoundMark;
        break;
    // Precomposed semi-voiced hiragana.
    case 0x3071: // PA
    case 0x3074: // PI
    case 0x3077: // PU
    case 0x307A: // PE
    case 0x307D: // PO
    // Precomposed semi-voiced katakana.
    case 0x30D1: // PA
    case 0x30D4: // PI
    case 0x30D7: // PU
    case 0x30DA: // PE
    case 0x30DD: // PO
        markType = SemiVoicedSoundMark;
        break;
    default:
        break;
    }
    return markType;
}
// Returns true for the two combining sound marks: U+3099 (voiced) and
// U+309A (semi-voiced).
static inline bool isCombiningVoicedSoundMark(UChar character)
{
    return character == 0x3099 || character == 0x309A;
}
// Returns true when at least one code unit of |pattern| is a kana letter.
bool containsKanaLetters(const String& pattern)
{
    const unsigned count = pattern.length();
    unsigned index = 0;
    // Advance past everything that is not a kana letter; stopping before the
    // end means a kana letter was found.
    while (index < count && !isKanaLetter(pattern[index]))
        ++index;
    return index < count;
}
// Normalizes the |length| code units at |characters| into NFC and stores the
// result in |buffer|, which is resized to the normalized length.
// |length| must be non-zero.
void normalizeCharactersIntoNFCForm(const UChar* characters, unsigned length, Vector<UChar>& buffer)
{
    ASSERT(length);

    // First attempt: use an output buffer as long as the input.
    buffer.resize(length);
    UErrorCode status = U_ZERO_ERROR;
    const size_t normalizedLength = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);

    const bool fitInFirstAttempt = status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING;
    ASSERT(fitInFirstAttempt || status == U_BUFFER_OVERFLOW_ERROR);
    ASSERT(normalizedLength);

    // The returned length is the size the normalized text needs, whether or not
    // it fit, so the buffer can be sized exactly either way.
    buffer.resize(normalizedLength);

    if (!fitInFirstAttempt) {
        // Second attempt with the exact size reported by the first call.
        status = U_ZERO_ERROR;
        unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), normalizedLength, &status);
        ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
    }
}
static inline size_t compareKanaLetterAndComposedVoicedSoundMarks(const UChar* first, const UChar* firstEnd, const UChar* second, const UChar* secondEnd)
{
const UChar* start = first;
if (isSmallKanaLetter(*first) != isSmallKanaLetter(*second))
return kNotFound;
if (composedVoicedSoundMark(*first) != composedVoicedSoundMark(*second))
return kNotFound;
++first;
++second;
while (true) {
const bool secondIsNotSoundMark = second == secondEnd || !isCombiningVoicedSoundMark(*second);
if (first == firstEnd || !isCombiningVoicedSoundMark(*first)) {
return secondIsNotSoundMark ? first - start : kNotFound;
}
if (secondIsNotSoundMark)
return kNotFound;
if (*first != *second)
return kNotFound;
++first;
++second;
}
}
bool checkOnlyKanaLettersInStrings(const UChar* firstData, unsigned firstLength, const UChar* secondData, unsigned secondLength)
{
const UChar* a = firstData;
const UChar* aEnd = firstData + firstLength;
const UChar* b = secondData;
const UChar* bEnd = secondData + secondLength;
while (true) {
while (a != aEnd && !isKanaLetter(*a))
++a;
while (b != bEnd && !isKanaLetter(*b))
++b;
if (a == aEnd || b == bEnd) {
return a == aEnd && b == bEnd;
}
const size_t offset = compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd);
if (offset == kNotFound)
return false;
a += offset;
b += offset;
}
}
bool checkKanaStringsEqual(const UChar* firstData, unsigned firstLength, const UChar* secondData, unsigned secondLength)
{
const UChar* a = firstData;
const UChar* aEnd = firstData + firstLength;
const UChar* b = secondData;
const UChar* bEnd = secondData + secondLength;
while (true) {
while (a != aEnd && !isKanaLetter(*a) && b != bEnd && !isKanaLetter(*b)) {
if (*a++ != *b++)
return false;
}
if (a == aEnd || b == bEnd) {
return a == aEnd && b == bEnd;
}
if (isKanaLetter(*a) != isKanaLetter(*b))
return false;
const size_t offset = compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd);
if (offset == kNotFound)
return false;
a += offset;
b += offset;
}
}
}