This source file includes the following definitions.
- NormalizeText
- DetectLanguageOfUnicodeText
#include "encodings/compact_lang_det/win/cld_unicodetext.h"

#include <string>
#include <vector>

#include "base/string_util.h"
#include "encodings/compact_lang_det/compact_lang_det.h"
#include "encodings/compact_lang_det/string_byte_sink.h"
#include "unicode/locid.h"
#include "unicode/normlzr.h"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
// Converts zero-terminated UTF-16 |text| into the UTF-8 byte string that is
// handed to the language detector: the text is NFC-normalized, lowercased and
// then re-encoded as UTF-8.
//
// Returns an empty string if ICU reports a normalization failure. An empty
// result therefore does not distinguish "failure" from "empty input".
std::string NormalizeText(const UChar* text) {
  // Read-only aliasing constructor: the first argument (1) declares the buffer
  // as NUL-terminated and the length of -1 asks ICU to compute it.
  icu::UnicodeString source(1, text, -1);

  icu::UnicodeString normalized;
  UErrorCode status = U_ZERO_ERROR;
  icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
  if (U_FAILURE(status))
    return std::string();

  // Lowercase with the root locale instead of the process default locale.
  // The argument-less toLower() follows the default locale, which would make
  // the detector's input depend on the host configuration (for example the
  // Turkish/Azeri mapping of 'I' to dotless 'i').
  normalized.toLower(icu::Locale::getRoot());

  std::string utf8;
  StringByteSink sink(&utf8);
  normalized.toUTF8(sink);
  return utf8;
}
// Detects the language of the zero-terminated UTF-16 string |text|.
//
// The text is run through NormalizeText() and the resulting UTF-8 bytes are
// passed to CompactLangDet::DetectLanguageSummary().
//
// Parameters:
//   detection_tables  Forwarded unchanged to DetectLanguageSummary().
//   text              Input text. A null pointer is rejected.
//   is_plain_text     Forwarded unchanged to DetectLanguageSummary().
//   is_reliable       Optional out-parameter. Receives the flag reported by
//                     DetectLanguageSummary(); false on every failure path.
//   num_languages     Required out-parameter. Receives how many of the three
//                     reported languages are valid, not unknown, and cover at
//                     least kMinTextPercentToCountLanguage percent of the
//                     text; 0 on every failure path.
//   error_code        Accepted for interface compatibility; this function
//                     neither reads nor writes it.
//                     NOTE(review): confirm no caller expects it to be set.
//   text_bytes        Optional out-parameter. Receives the byte count reported
//                     by DetectLanguageSummary(); 0 on every failure path.
//
// Returns the first entry of the language array filled in by
// DetectLanguageSummary() (presumably its best guess), or NUM_LANGUAGES when
// |text| or |num_languages| is null or the normalized text is empty.
Language DetectLanguageOfUnicodeText(
    const CompactLangDet::DetectionTables* detection_tables,
    const UChar* text, bool is_plain_text,
    bool* is_reliable, int* num_languages,
    int* error_code, int* text_bytes) {
  // Put every output owned by this function into a defined state first, so
  // that the early returns below never leave a caller with indeterminate
  // values.
  if (is_reliable)
    *is_reliable = false;
  if (num_languages)
    *num_languages = 0;
  if (text_bytes)
    *text_bytes = 0;

  if (!text || !num_languages)
    return NUM_LANGUAGES;

  // NFC-normalize, lowercase and convert to UTF-8.
  std::string utf8_encoded = NormalizeText(text);
  if (utf8_encoded.empty())
    return NUM_LANGUAGES;

  Language language3[3] = {
    UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
  };
  int percent3[3] = { 0, 0, 0 };

  // The detector writes into local temporaries so that the optional
  // out-parameters may legitimately be null.
  int text_bytes_tmp = 0;
  bool is_reliable_tmp = false;
  CompactLangDet::DetectLanguageSummary(detection_tables,
                                        utf8_encoded.c_str(),
                                        utf8_encoded.length(),
                                        is_plain_text, language3, percent3,
                                        &text_bytes_tmp, &is_reliable_tmp);

  if (is_reliable)
    *is_reliable = is_reliable_tmp;
  if (text_bytes)
    *text_bytes = text_bytes_tmp;

  // A language is only counted if it covers at least this share of the text.
  const int kMinTextPercentToCountLanguage = 20;

  COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
                 language3_and_percent3_should_be_of_the_same_size);
  for (size_t i = 0; i < arraysize(language3); ++i) {
    if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
        percent3[i] >= kMinTextPercentToCountLanguage) {
      ++*num_languages;
    }
  }

  return language3[0];
}