This source file includes the following definitions.
- GetSimilarLanguageGroupCode
- ApplyLanguageCodeCorrection
- GetCLDMajorVersion
- DetermineTextLanguage
- CanCLDComplementSubCode
- DeterminePageLanguage
- CorrectLanguageCodeTypo
- IsValidLanguageCode
- IsSameOrSimilarLanguages
- MaybeServerWrongConfiguration
#include "components/translate/language_detection/language_detection_util.h"
#include "base/logging.h"
#include "base/metrics/field_trial.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "components/translate/core/common/translate_constants.h"
#include "components/translate/core/common/translate_metrics.h"
#include "components/translate/core/common/translate_util.h"
#if !defined(CLD_VERSION) || CLD_VERSION==1
#include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
#endif
#if !defined(CLD_VERSION) || CLD_VERSION==2
#include "third_party/cld_2/src/public/compact_lang_det.h"
#endif
namespace {
// One entry of the similar-language table: a language code together with the
// id of the group it belongs to.
struct SimilarLanguageCode {
  const char* const code;  // Primary language subtag, e.g. "hi".
  int group;               // Non-zero group id shared by similar languages.
};

// Languages that share a group id are treated as interchangeable by
// IsSameOrSimilarLanguages() when the declared page language is compared with
// the detected one.
const SimilarLanguageCode kSimilarLanguageCodes[] = {
  {"bs", 1},
  {"hr", 1},
  {"hi", 2},
  {"ne", 2},
};

// Returns the similar-language group id of |language|, or 0 when the language
// is not part of any group.
//
// Only the primary subtag (the text before the first '-') is compared, and it
// must match a table entry exactly. A region suffix such as "hi-IN" is
// therefore ignored, while unrelated codes that merely begin with the same
// letters (e.g. "hil" or "new") are no longer assigned to a group.
int GetSimilarLanguageGroupCode(const std::string& language) {
  // substr() with npos as length yields the whole string when there is no
  // dash, so both "hi" and "hi-IN" reduce to "hi".
  const std::string main_part = language.substr(0, language.find('-'));
  const size_t entry_count =
      sizeof(kSimilarLanguageCodes) / sizeof(kSimilarLanguageCodes[0]);
  for (size_t i = 0; i < entry_count; ++i) {
    if (main_part == kSimilarLanguageCodes[i].code)
      return kSimilarLanguageCodes[i].group;
  }
  return 0;
}
// Detected languages that MaybeServerWrongConfiguration() accepts as evidence
// that a page declared as "en" is mislabelled. Both the strings and the
// pointers are constant so the table cannot be modified at run time.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};
// Normalizes |code| in place. Typos are corrected first; a code that is still
// not a valid language code afterwards is replaced by the empty string,
// otherwise it is mapped to its Translate synonym.
void ApplyLanguageCodeCorrection(std::string* code) {
  translate::CorrectLanguageCodeTypo(code);

  if (translate::IsValidLanguageCode(*code)) {
    translate::ToTranslateLanguageSynonym(code);
    return;
  }

  // Invalid even after correction: report "no language".
  code->clear();
}
// Returns the major version of CLD to use. When CLD_VERSION is defined at
// build time that value is returned; otherwise the "CLD1VsCLD2" field trial
// decides, with every group other than "CLD2" selecting version 1.
int GetCLDMajorVersion() {
#if defined(CLD_VERSION)
  return CLD_VERSION;
#else
  const std::string trial_group =
      base::FieldTrialList::FindFullName("CLD1VsCLD2");
  return trial_group == "CLD2" ? 2 : 1;
#endif
}
// Runs CLD over |text| and returns the detected language code.
//
// translate::kUnknownLanguageCode is returned unless CLD reports the result
// as reliable, at least 100 bytes of text were used for the detection, and
// the detected language is not one of CLD's "unknown" sentinels.
//
// |is_cld_reliable|, when non-NULL, receives CLD's reliability flag regardless
// of whether the detected language is finally used.
std::string DetermineTextLanguage(const base::string16& text,
                                  bool* is_cld_reliable) {
  std::string language = translate::kUnknownLanguageCode;
  int text_bytes = 0;
  bool is_reliable = false;
  int cld_language = 0;
  bool is_valid_language = false;

  // Resolve the CLD version exactly once. |cld_language| holds an enum value
  // of the library chosen here, so the conversion further down must use the
  // very same version to interpret it.
  const int cld_version = GetCLDMajorVersion();

  switch (cld_version) {
#if !defined(CLD_VERSION) || CLD_VERSION==1
    case 1: {
      int num_languages = 0;
      cld_language =
          DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
                                      &num_languages, NULL, &text_bytes);
      is_valid_language = cld_language != NUM_LANGUAGES &&
                          cld_language != UNKNOWN_LANGUAGE &&
                          cld_language != TG_UNKNOWN_LANGUAGE;
      break;
    }
#endif
#if !defined(CLD_VERSION) || CLD_VERSION==2
    case 2: {
      // CLD2 operates on UTF-8 input.
      std::string utf8_text(base::UTF16ToUTF8(text));
      CLD2::Language language3[3];
      int percent3[3];
      CLD2::DetectLanguageSummary(
          utf8_text.c_str(), static_cast<int>(utf8_text.size()), true,
          language3, percent3, &text_bytes, &is_reliable);
      // Only the first reported language is used.
      cld_language = language3[0];
      is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
                          cld_language != CLD2::UNKNOWN_LANGUAGE &&
                          cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
      break;
    }
#endif
    default:
      NOTREACHED();
  }

  if (is_cld_reliable != NULL)
    *is_cld_reliable = is_reliable;

  // The result is only used when it is reliable, based on at least 100 bytes
  // of text, and names an actual language.
  if (is_reliable && text_bytes >= 100 && is_valid_language) {
    switch (cld_version) {
#if !defined(CLD_VERSION) || CLD_VERSION==1
      case 1:
        language =
            LanguageCodeWithDialects(static_cast<Language>(cld_language));
        break;
#endif
#if !defined(CLD_VERSION) || CLD_VERSION==2
      case 2:
        // The two Chinese variants are mapped to explicit "zh-CN" / "zh-TW"
        // codes instead of whatever CLD2::LanguageCode() would return.
        if (cld_language == CLD2::CHINESE) {
          language = "zh-CN";
        } else if (cld_language == CLD2::CHINESE_T) {
          language = "zh-TW";
        } else {
          language =
              CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
        }
        break;
#endif
      default:
        NOTREACHED();
    }
  }

  VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
          << "\n*************************************\n";
  return language;
}
// Returns true when the page declares plain "zh" and the CLD result starts
// with "zh-" (compared case-insensitively), i.e. CLD can supply the sub code
// that the declaration lacks.
bool CanCLDComplementSubCode(
    const std::string& page_language, const std::string& cld_language) {
  if (page_language != "zh")
    return false;
  return StartsWithASCII(cld_language, "zh-", false);
}
}
namespace translate {
// Decides the language of a page from its declared language codes and from
// CLD's analysis of |contents|.
//
// |code| and |html_lang| are the declared language codes; |html_lang| takes
// precedence over |code| when it is still non-empty after correction.
// |cld_language_p| and |is_cld_reliable_p|, when non-NULL, receive the raw CLD
// result and its reliability flag.
//
// The outcome is, in order of evaluation:
//  - the CLD language when no usable declared language exists,
//  - the declared language when CLD could not determine one,
//  - the CLD language when it adds a sub code to a declared "zh",
//  - the declared language when both agree or are similar,
//  - the CLD language when the declaration looks like a misconfigured "en",
//  - kUnknownLanguageCode otherwise.
// Each outcome is reported through ReportLanguageVerification().
std::string DeterminePageLanguage(const std::string& code,
                                  const std::string& html_lang,
                                  const base::string16& contents,
                                  std::string* cld_language_p,
                                  bool* is_cld_reliable_p) {
  // Run CLD first and record how long the detection took.
  const base::TimeTicks detection_start = base::TimeTicks::Now();
  bool detection_reliable = false;
  std::string detected =
      DetermineTextLanguage(contents, &detection_reliable);
  ReportLanguageDetectionTime(detection_start, base::TimeTicks::Now());

  // Callers get the CLD result as detected, before synonym mapping.
  if (cld_language_p)
    *cld_language_p = detected;
  if (is_cld_reliable_p)
    *is_cld_reliable_p = detection_reliable;
  ToTranslateLanguageSynonym(&detected);

  // Corrected copy of the html lang value.
  std::string corrected_html_lang;
  if (!html_lang.empty()) {
    corrected_html_lang = html_lang;
    ApplyLanguageCodeCorrection(&corrected_html_lang);
    ReportHtmlLang(html_lang, corrected_html_lang);
    VLOG(9) << "html lang based language code: " << corrected_html_lang;
  }

  // Corrected copy of |code|.
  std::string corrected_code;
  if (!code.empty()) {
    corrected_code = code;
    ApplyLanguageCodeCorrection(&corrected_code);
    ReportContentLanguage(code, corrected_code);
  }

  // The html lang value wins over |code| whenever it survived correction.
  const std::string& declared =
      corrected_html_lang.empty() ? corrected_code : corrected_html_lang;

  if (declared.empty()) {
    ReportLanguageVerification(LANGUAGE_VERIFICATION_CLD_ONLY);
    return detected;
  }

  if (detected == kUnknownLanguageCode) {
    ReportLanguageVerification(LANGUAGE_VERIFICATION_UNKNOWN);
    return declared;
  }

  if (CanCLDComplementSubCode(declared, detected)) {
    ReportLanguageVerification(LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
    return detected;
  }

  if (IsSameOrSimilarLanguages(declared, detected)) {
    ReportLanguageVerification(LANGUAGE_VERIFICATION_CLD_AGREE);
    return declared;
  }

  if (MaybeServerWrongConfiguration(declared, detected)) {
    ReportLanguageVerification(LANGUAGE_VERIFICATION_TRUST_CLD);
    return detected;
  }

  // Declaration and detection disagree and neither can be trusted.
  ReportLanguageVerification(LANGUAGE_VERIFICATION_CLD_DISAGREE);
  return kUnknownLanguageCode;
}
// Repairs common mistakes in the language code |code|, in place:
//  - everything from the first comma on is dropped,
//  - surrounding ASCII whitespace is trimmed,
//  - the first underscore is turned into a dash,
//  - the part before the first dash is lower-cased and the rest (dash
//    included) is upper-cased; a code without a dash is lower-cased entirely.
void CorrectLanguageCodeTypo(std::string* code) {
  DCHECK(code);

  // Keep only the text in front of the first comma.
  const size_t comma_pos = code->find(',');
  if (comma_pos != std::string::npos)
    code->erase(comma_pos);

  base::TrimWhitespaceASCII(*code, base::TRIM_ALL, code);

  // Only the first underscore is rewritten.
  const size_t underscore_pos = code->find('_');
  if (underscore_pos != std::string::npos)
    code->replace(underscore_pos, 1, 1, '-');

  const size_t dash_pos = code->find('-');
  if (dash_pos == std::string::npos) {
    *code = StringToLowerASCII(*code);
    return;
  }

  const std::string main_part = StringToLowerASCII(code->substr(0, dash_pos));
  const std::string sub_part = StringToUpperASCII(code->substr(dash_pos));
  *code = main_part + sub_part;
}
namespace {

// Returns true when every character of |text| is an ASCII letter.
bool ContainsOnlyAsciiLetters(const std::string& text) {
  for (size_t i = 0; i < text.size(); ++i) {
    if (!IsAsciiAlpha(text[i]))
      return false;
  }
  return true;
}

}  // namespace

// Returns true when |code| consists of a main code of one to three ASCII
// letters, optionally followed by a dash and a sub code of exactly two ASCII
// letters. Codes with more than one dash-separated sub code are rejected.
bool IsValidLanguageCode(const std::string& code) {
  std::vector<std::string> parts;
  base::SplitString(code, '-', &parts);

  const size_t part_count = parts.size();
  if (part_count != 1 && part_count != 2)
    return false;

  const std::string& main_code = parts[0];
  if (main_code.empty() || main_code.size() > 3)
    return false;
  if (!ContainsOnlyAsciiLetters(main_code))
    return false;

  // No sub code to check.
  if (part_count == 1)
    return true;

  const std::string& sub_code = parts[1];
  return sub_code.size() == 2 && ContainsOnlyAsciiLetters(sub_code);
}
bool IsSameOrSimilarLanguages(const std::string& page_language,
const std::string& cld_language) {
std::vector<std::string> chunks;
base::SplitString(page_language, '-', &chunks);
if (chunks.size() == 0)
return false;
std::string page_language_main_part = chunks[0];
base::SplitString(cld_language, '-', &chunks);
if (chunks.size() == 0)
return false;
std::string cld_language_main_part = chunks[0];
if (page_language_main_part == cld_language_main_part) {
translate::ReportSimilarLanguageMatch(false);
return true;
}
int page_code = GetSimilarLanguageGroupCode(page_language);
bool match = page_code != 0 &&
page_code == GetSimilarLanguageGroupCode(cld_language);
translate::ReportSimilarLanguageMatch(match);
return match;
}
// Returns true when |page_language| starts with "en" (compared
// case-insensitively) and |cld_language| is one of the entries of
// kWellKnownCodesOnWrongConfiguration.
bool MaybeServerWrongConfiguration(const std::string& page_language,
                                   const std::string& cld_language) {
  // Only pages declared as English are considered.
  if (!StartsWithASCII(page_language, "en", false))
    return false;

  // Linear search; |index| stops at the first matching entry, if any.
  const size_t code_count = arraysize(kWellKnownCodesOnWrongConfiguration);
  size_t index = 0;
  while (index < code_count &&
         cld_language != kWellKnownCodesOnWrongConfiguration[index]) {
    ++index;
  }
  return index < code_count;
}
}