This source file includes the following definitions.
- ExtLanguageName
- ExtLanguageDeclaredName
- ExtLanguageCode
- GetLanguageFromNumberOrName
- GetLScriptFromNumberOrName
- NormalizeLanguage
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "encodings/compact_lang_det/ext_lang_enc.h"
#include "encodings/compact_lang_det/win/cld_macros.h"
#include "encodings/compact_lang_det/win/cld_strtoint.h"
// Names of the extended languages. ExtLanguageName() and
// ExtLanguageDeclaredName() index this table with (lang - EXT_LANGUAGE_BASE),
// so the trailing number on each line is that offset.
static const char* const kExtLanguageName[] = {
  "X_BORK_BORK_BORK",  //  0
  "X_PIG_LATIN",       //  1
  "X_HACKER",          //  2
  "X_KLINGON",         //  3
  "X_ELMER_FUDD",      //  4
  "X_OGHAM",           //  5
  "X_RUNIC",           //  6
  "X_YI",              //  7
  "X_OLD_ITALIC",      //  8
  "X_GOTHIC",          //  9
  "X_DESERET",         // 10
  "X_HANUNOO",         // 11
  "X_BUHID",           // 12
  "X_TAGBANWA",        // 13
  "X_TAI_LE",          // 14
  "X_LINEAR_B",        // 15
  "X_UGARITIC",        // 16
  "X_SHAVIAN",         // 17
  "X_OSMANYA",         // 18
  "X_CYPRIOT",         // 19
  "X_BUGINESE",        // 20
  "X_COPTIC",          // 21
  "X_NEW_TAI_LUE",     // 22
  "X_GLAGOLITIC",      // 23
  "X_TIFINAGH",        // 24
  "X_SYLOTI_NAGRI",    // 25
  "X_OLD_PERSIAN",     // 26
  "X_KHAROSHTHI",      // 27
  "X_BALINESE",        // 28
  "X_CUNEIFORM",       // 29
  "X_PHOENICIAN",      // 30
  "X_PHAGS_PA",        // 31
  "X_NKO",             // 32
  "X_SUDANESE",        // 33
  "X_LEPCHA",          // 34
  "X_OL_CHIKI",        // 35
  "X_VAI",             // 36
  "X_SAURASHTRA",      // 37
  "X_KAYAH_LI",        // 38
  "X_REJANG",          // 39
  "X_LYCIAN",          // 40
  "X_CARIAN",          // 41
  "X_LYDIAN",          // 42
  "X_CHAM",            // 43
};
// Declared (identifier-style) names of the base languages.
// ExtLanguageDeclaredName() indexes this table directly with a Language value
// in the range [0, NUM_LANGUAGES), so entry order must not change. The
// numeric comments below mark the array index of selected entries.
static const char* const kExtLangDeclaredName[] = {
"ENGLISH",  // 0
"DANISH",
"DUTCH",
"FINNISH",
"FRENCH",
"GERMAN",
"HEBREW",
"ITALIAN",
"JAPANESE",
"KOREAN",
"NORWEGIAN",
"POLISH",
"PORTUGUESE",
"RUSSIAN",
"SPANISH",
"SWEDISH",
"CHINESE",
"CZECH",
"GREEK",
"ICELANDIC",
"LATVIAN",  // 20
"LITHUANIAN",
"ROMANIAN",
"HUNGARIAN",
"ESTONIAN",
"TG_UNKNOWN_LANGUAGE",
"UNKNOWN_LANGUAGE",
"BULGARIAN",
"CROATIAN",
"SERBIAN",
"IRISH",
"GALICIAN",
"TAGALOG",
"TURKISH",
"UKRAINIAN",
"HINDI",
"MACEDONIAN",
"BENGALI",
"INDONESIAN",
"LATIN",
"MALAY",  // 40
"MALAYALAM",
"WELSH",
"NEPALI",
"TELUGU",
"ALBANIAN",
"TAMIL",
"BELARUSIAN",
"JAVANESE",
"OCCITAN",
"URDU",
"BIHARI",
"GUJARATI",
"THAI",
"ARABIC",
"CATALAN",
"ESPERANTO",
"BASQUE",
"INTERLINGUA",
"KANNADA",
"PUNJABI",  // 60
"SCOTS_GAELIC",
"SWAHILI",
"SLOVENIAN",
"MARATHI",
"MALTESE",
"VIETNAMESE",
"FRISIAN",
"SLOVAK",
"CHINESE_T",
"FAROESE",
"SUNDANESE",
"UZBEK",
"AMHARIC",
"AZERBAIJANI",
"GEORGIAN",
"TIGRINYA",
"PERSIAN",
"BOSNIAN",
"SINHALESE",
"NORWEGIAN_N",  // 80
"PORTUGUESE_P",
"PORTUGUESE_B",
"XHOSA",
"ZULU",
"GUARANI",
"SESOTHO",
"TURKMEN",
"KYRGYZ",
"BRETON",
"TWI",
"YIDDISH",
"SERBO_CROATIAN",
"SOMALI",
"UIGHUR",
"KURDISH",
"MONGOLIAN",
"ARMENIAN",
"LAOTHIAN",
"SINDHI",
"RHAETO_ROMANCE",  // 100
"AFRIKAANS",
"LUXEMBOURGISH",
"BURMESE",
"KHMER",
"TIBETAN",
"DHIVEHI",
"CHEROKEE",
"SYRIAC",
"LIMBU",
"ORIYA",
"ASSAMESE",
"CORSICAN",
"INTERLINGUE",
"KAZAKH",
"LINGALA",
"MOLDAVIAN",
"PASHTO",
"QUECHUA",
"SHONA",
"TAJIK",  // 120
"TATAR",
"TONGA",
"YORUBA",
"CREOLES_AND_PIDGINS_ENGLISH_BASED",
"CREOLES_AND_PIDGINS_FRENCH_BASED",
"CREOLES_AND_PIDGINS_PORTUGUESE_BASED",
"CREOLES_AND_PIDGINS_OTHER",
"MAORI",
"WOLOF",
"ABKHAZIAN",
"AFAR",
"AYMARA",
"BASHKIR",
"BISLAMA",
"DZONGKHA",
"FIJIAN",
"GREENLANDIC",
"HAUSA",
"HAITIAN_CREOLE",
"INUPIAK",  // 140
"INUKTITUT",
"KASHMIRI",
"KINYARWANDA",
"MALAGASY",
"NAURU",
"OROMO",
"RUNDI",
"SAMOAN",
"SANGO",
"SANSKRIT",
"SISWANT",
"TSONGA",
"TSWANA",
"VOLAPUK",
"ZHUANG",
"KHASI",
"SCOTS",
"GANDA",
"MANX",
"MONTENEGRIN",  // 160
};
// Compile-time check that the table has exactly NUM_LANGUAGES entries, so an
// entry added to or removed from the table without a matching change to
// NUM_LANGUAGES breaks the build.
COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
kExtLangDeclaredName_has_incorrect_length);
// Language codes of the extended languages. ExtLanguageCode() indexes this
// table with (lang - EXT_LANGUAGE_BASE); the trailing number on each line is
// that offset.
static const char* const kExtLanguageCode[] = {
  "zzb",      //  0
  "zzp",      //  1
  "zzh",      //  2
  "tlh",      //  3
  "zze",      //  4
  "xx-Ogam",  //  5
  "xx-Runr",  //  6
  "xx-Yiii",  //  7
  "xx-Ital",  //  8
  "xx-Goth",  //  9
  "xx-Dsrt",  // 10
  "xx-Hano",  // 11
  "xx-Buhd",  // 12
  "xx-Tagb",  // 13
  "xx-Tale",  // 14
  "xx-Linb",  // 15
  "xx-Ugar",  // 16
  "xx-Shaw",  // 17
  "xx-Osma",  // 18
  "xx-Cprt",  // 19
  "xx-Bugi",  // 20
  "xx-Copt",  // 21
  "xx-Talu",  // 22
  "xx-Glag",  // 23
  "xx-Tfng",  // 24
  "xx-Sylo",  // 25
  "xx-Xpeo",  // 26
  "xx-Khar",  // 27
  "xx-Bali",  // 28
  "xx-Xsux",  // 29
  "xx-Phnx",  // 30
  "xx-Phag",  // 31
  "xx-Nkoo",  // 32
  "xx-Sund",  // 33
  "xx-Lepc",  // 34
  "xx-Olck",  // 35
  "xx-Vaii",  // 36
  "xx-Saur",  // 37
  "xx-Kali",  // 38
  "xx-Rjng",  // 39
  "xx-Lyci",  // 40
  "xx-Cari",  // 41
  "xx-Lydi",  // 42
  "xx-Cham",  // 43
};
// Returns a display name for |lang|.
//   - negative values                       -> ""
//   - TG_UNKNOWN_LANGUAGE                    -> "Ignore"
//   - [0, NUM_LANGUAGES)                     -> LanguageName(lang)
//   - [EXT_LANGUAGE_BASE, EXT_NUM_LANGUAGES) -> entry of kExtLanguageName
//   - anything else                          -> invalid_language_name()
const char* ExtLanguageName(const Language lang) {
  const char* name;
  if (lang < 0) {
    name = "";
  } else if (lang == TG_UNKNOWN_LANGUAGE) {
    // Checked before the base-language range so it takes priority over it.
    name = "Ignore";
  } else if (lang < NUM_LANGUAGES) {
    // lang is known to be non-negative at this point.
    name = LanguageName(lang);
  } else if ((lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES)) {
    name = kExtLanguageName[lang - EXT_LANGUAGE_BASE];
  } else {
    name = invalid_language_name();
  }
  return name;
}
// Returns the declared (identifier-style) name for |lang|: an entry of
// kExtLangDeclaredName for base languages, an entry of kExtLanguageName for
// extended languages, and "UNKNOWN_LANGUAGE" for any other value.
const char* ExtLanguageDeclaredName(const Language lang) {
  const bool is_base_language = (lang >= 0) && (lang < NUM_LANGUAGES);
  if (is_base_language) {
    return kExtLangDeclaredName[lang];
  }

  const bool is_ext_language =
      (lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES);
  return is_ext_language ? kExtLanguageName[lang - EXT_LANGUAGE_BASE]
                         : "UNKNOWN_LANGUAGE";
}
// Returns the language code for |lang|.
//   - TG_UNKNOWN_LANGUAGE                    -> "xxx"
//   - [0, NUM_LANGUAGES)                     -> LanguageCode(lang)
//   - [EXT_LANGUAGE_BASE, EXT_NUM_LANGUAGES) -> entry of kExtLanguageCode
//   - anything else                          -> "??"
const char* ExtLanguageCode(const Language lang) {
  const char* code = "??";
  if (lang == TG_UNKNOWN_LANGUAGE) {
    // Checked first so it takes priority over the base-language range.
    code = "xxx";
  } else if ((lang >= 0) && (lang < NUM_LANGUAGES)) {
    code = LanguageCode(lang);
  } else if ((lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES)) {
    code = kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
  }
  return code;
}
// Returns true when the NUL-terminated string |s| begins with |prefix|.
// strncmp stops at the terminator of |s|, so a string shorter than |prefix|
// is never read out of bounds (unlike a fixed-length memcmp).
static bool ExtLangStartsWith(const char* s, const char* prefix) {
  return strncmp(s, prefix, strlen(prefix)) == 0;
}

// Converts |src|, either a decimal number or a language code/name, into a
// Language value.
//
// Resolution order:
//   1. NULL                                  -> UNKNOWN_LANGUAGE.
//   2. Only decimal digits (this includes the empty string, which trivially
//      satisfies the all-digits test)        -> strto32() result cast to
//      Language. No range check is applied to the number.
//   3. Locale prefixes ("pt-", "en-", "fr-", "bs-", "hr-", "sr-Latn", ...).
//   4. Three-letter prefixes "xxx", "zzb", "zzp", "zzh", "tlh", "zze".
//   5. Up to three characters: LanguageFromCode(); exactly seven characters
//      starting with "xx-": the extended script-code table.
//   6. "sit-NP", "un-Latn", then anything starting with "zh", "pt" or "fr".
//   7. A hyphen at index 2 or 3: LanguageFromCode() on the part before it.
// Returns UNKNOWN_LANGUAGE (or whatever LanguageFromCode left in the result)
// when nothing matches.
Language GetLanguageFromNumberOrName(const char* src) {
  if (src == NULL) {
    return UNKNOWN_LANGUAGE;
  }
  const size_t len = strlen(src);

  if (strspn(src, "0123456789") == len) {
    return static_cast<Language>(strto32(src, NULL, 10));
  }

  // Locale-style prefixes that collapse onto a single base language.
  if (ExtLangStartsWith(src, "pt-")) {return PORTUGUESE;}
  if (ExtLangStartsWith(src, "en-")) {return ENGLISH;}
  if (ExtLangStartsWith(src, "fr-")) {return FRENCH;}
  if (ExtLangStartsWith(src, "bs-")) {return CROATIAN;}
  if (ExtLangStartsWith(src, "hr-")) {return CROATIAN;}
  if (ExtLangStartsWith(src, "sr-Latn")) {return CROATIAN;}
  if (ExtLangStartsWith(src, "sh-Latn")) {return CROATIAN;}
  if (ExtLangStartsWith(src, "sr-Cyrl")) {return SERBIAN;}
  if (ExtLangStartsWith(src, "sh-Cyrl")) {return SERBIAN;}

  // Three-letter pseudo-language codes; matched as prefixes.
  if (ExtLangStartsWith(src, "xxx")) {return TG_UNKNOWN_LANGUAGE;}
  if (ExtLangStartsWith(src, "zzb")) {return X_BORK_BORK_BORK;}
  if (ExtLangStartsWith(src, "zzp")) {return X_PIG_LATIN;}
  if (ExtLangStartsWith(src, "zzh")) {return X_HACKER;}
  if (ExtLangStartsWith(src, "tlh")) {return X_KLINGON;}
  if (ExtLangStartsWith(src, "zze")) {return X_ELMER_FUDD;}

  Language retlang = UNKNOWN_LANGUAGE;
  if (len <= 3) {
    LanguageFromCode(src, &retlang);
  } else if ((len == 7) && ExtLangStartsWith(src, "xx-")) {
    // Exact seven-character "xx-Ssss" codes of the extended languages.
    static const struct {
      const char* code;
      Language lang;
    } kScriptCodes[] = {
      {"xx-Ogam", X_OGHAM},
      {"xx-Runr", X_RUNIC},
      {"xx-Yiii", X_YI},
      {"xx-Ital", X_OLD_ITALIC},
      {"xx-Goth", X_GOTHIC},
      {"xx-Dsrt", X_DESERET},
      {"xx-Hano", X_HANUNOO},
      {"xx-Buhd", X_BUHID},
      {"xx-Tagb", X_TAGBANWA},
      {"xx-Tale", X_TAI_LE},
      {"xx-Linb", X_LINEAR_B},
      {"xx-Ugar", X_UGARITIC},
      {"xx-Shaw", X_SHAVIAN},
      {"xx-Osma", X_OSMANYA},
      {"xx-Cprt", X_CYPRIOT},
      {"xx-Bugi", X_BUGINESE},
      {"xx-Copt", X_COPTIC},
      {"xx-Talu", X_NEW_TAI_LUE},
      {"xx-Glag", X_GLAGOLITIC},
      {"xx-Tfng", X_TIFINAGH},
      {"xx-Sylo", X_SYLOTI_NAGRI},
      {"xx-Xpeo", X_OLD_PERSIAN},
      {"xx-Khar", X_KHAROSHTHI},
      {"xx-Bali", X_BALINESE},
      {"xx-Xsux", X_CUNEIFORM},
      {"xx-Phnx", X_PHOENICIAN},
      {"xx-Phag", X_PHAGS_PA},
      {"xx-Nkoo", X_NKO},
      {"xx-Sund", X_SUDANESE},
      {"xx-Lepc", X_LEPCHA},
      {"xx-Olck", X_OL_CHIKI},
      {"xx-Vaii", X_VAI},
      {"xx-Saur", X_SAURASHTRA},
      {"xx-Kali", X_KAYAH_LI},
      {"xx-Rjng", X_REJANG},
      {"xx-Lyci", X_LYCIAN},
      {"xx-Cari", X_CARIAN},
      {"xx-Lydi", X_LYDIAN},
      {"xx-Cham", X_CHAM},
    };
    const size_t num_codes = sizeof(kScriptCodes) / sizeof(kScriptCodes[0]);
    for (size_t i = 0; i < num_codes; ++i) {
      if (strcmp(src, kScriptCodes[i].code) == 0) {
        return kScriptCodes[i].lang;
      }
    }
  }

  if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
  if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}

  if (ExtLangStartsWith(src, "zh")) {
    // len >= 2 here, so the last two characters are in bounds.
    const char* tail = src + (len - 2);
    if ((strcmp(tail, "TW") == 0) || (strcmp(tail, "HK") == 0)) {
      return CHINESE_T;
    }
    return CHINESE;
  }
  // Every remaining "pt..." / "fr..." spelling maps to the base language,
  // regardless of any region suffix.
  if (ExtLangStartsWith(src, "pt")) {return PORTUGUESE;}
  if (ExtLangStartsWith(src, "fr")) {return FRENCH;}

  // "ll-..." or "lll-...": look up the part in front of the hyphen. The
  // length guards keep src[2] / src[3] inside the string for short inputs.
  if ((len > 2) && (src[2] == '-')) {
    const char code[3] = {src[0], src[1], '\0'};
    LanguageFromCode(code, &retlang);
  }
  if ((len > 3) && (src[3] == '-')) {
    const char code[4] = {src[0], src[1], src[2], '\0'};
    LanguageFromCode(code, &retlang);
  }
  return retlang;
}
// One row of the script-code lookup table: a script code string paired with
// the UnicodeLScript value it denotes.
typedef struct {
const char* name;  // Script code; compared with strcmp() during lookup.
UnicodeLScript lscript;  // Script value returned when |name| matches.
} NameScriptPair;
// Script code -> UnicodeLScript lookup table.
// GetLScriptFromNumberOrName() binary-searches this table using strcmp(), so
// the entries MUST stay sorted by |name| in strcmp (byte-wise) order.
// NOTE(review): the names look like ISO 15924 four-letter script codes --
// confirm against the standard before adding entries.
static const NameScriptPair kNameScriptPair[] = {
{"Arab", ULScript_Arabic},
{"Armn", ULScript_Armenian},
{"Bali", ULScript_Balinese},
{"Beng", ULScript_Bengali},
{"Bugi", ULScript_Buginese},
{"Buhd", ULScript_Buhid},
{"Cans", ULScript_Canadian_Aboriginal},
{"Cari", ULScript_Carian},
{"Cham", ULScript_Cham},
{"Cher", ULScript_Cherokee},
{"Copt", ULScript_Coptic},
{"Cprt", ULScript_Cypriot},
{"Cyrl", ULScript_Cyrillic},
{"Deva", ULScript_Devanagari},
{"Dsrt", ULScript_Deseret},
{"Ethi", ULScript_Ethiopic},
{"Geor", ULScript_Georgian},
{"Glag", ULScript_Glagolitic},
{"Goth", ULScript_Gothic},
{"Grek", ULScript_Greek},
{"Gujr", ULScript_Gujarati},
{"Guru", ULScript_Gurmukhi},
{"Hani", ULScript_HanCJK},
{"Hano", ULScript_Hanunoo},
{"Hebr", ULScript_Hebrew},
{"Ital", ULScript_Old_Italic},
{"Kali", ULScript_Kayah_Li},
{"Khar", ULScript_Kharoshthi},
{"Khmr", ULScript_Khmer},
{"Knda", ULScript_Kannada},
{"Laoo", ULScript_Lao},
{"Latn", ULScript_Latin},
{"Lepc", ULScript_Lepcha},
{"Limb", ULScript_Limbu},
{"Linb", ULScript_Linear_B},
{"Lyci", ULScript_Lycian},
{"Lydi", ULScript_Lydian},
{"Mlym", ULScript_Malayalam},
{"Mong", ULScript_Mongolian},
{"Mymr", ULScript_Myanmar},
{"Nkoo", ULScript_Nko},
{"Ogam", ULScript_Ogham},
{"Olck", ULScript_Ol_Chiki},
{"Orya", ULScript_Oriya},
{"Osma", ULScript_Osmanya},
{"Phag", ULScript_Phags_Pa},
{"Phnx", ULScript_Phoenician},
{"Rjng", ULScript_Rejang},
{"Runr", ULScript_Runic},
{"Saur", ULScript_Saurashtra},
{"Shaw", ULScript_Shavian},
{"Sinh", ULScript_Sinhala},
{"Sund", ULScript_Sundanese},
{"Sylo", ULScript_Syloti_Nagri},
{"Syrc", ULScript_Syriac},
{"Tagb", ULScript_Tagbanwa},
{"Tale", ULScript_Tai_Le},
{"Talu", ULScript_New_Tai_Lue},
{"Taml", ULScript_Tamil},
{"Telu", ULScript_Telugu},
{"Tfng", ULScript_Tifinagh},
{"Tglg", ULScript_Tagalog},
{"Thaa", ULScript_Thaana},
{"Thai", ULScript_Thai},
{"Tibt", ULScript_Tibetan},
{"Ugar", ULScript_Ugaritic},
{"Vaii", ULScript_Vai},
{"Xpeo", ULScript_Old_Persian},
{"Xsux", ULScript_Cuneiform},
{"Yiii", ULScript_Yi},
{"Zyyy", ULScript_Common},
{"Zzzz", ULScript_Inherited},
};
// Converts |src|, either a decimal number or a language-script code, into a
// UnicodeLScript value.
//
//   - NULL                           -> ULScript_Latin (the general fallback).
//   - Only decimal digits (this includes the empty string, which trivially
//     satisfies the all-digits test) -> strto32() result cast to
//     UnicodeLScript. No range check is applied to the number.
//   - "zh-TW" / "zh-CN"              -> ULScript_HanCJK.
//   - "pt-BR" / "pt-PT" / "sit-NP"   -> ULScript_Latin.
//   - Otherwise the (up to) four characters after the first '-' are looked
//     up in kNameScriptPair; no hyphen or no match -> ULScript_Latin.
UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
  if (src == NULL) {
    return ULScript_Latin;
  }
  if (strspn(src, "0123456789") == strlen(src)) {
    return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
  }

  if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
  if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
  if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
  if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
  if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}

  const char* suffix = strchr(src, '-');
  if (suffix == NULL) {return ULScript_Latin;}
  suffix += 1;

  // Copy at most four characters of the suffix, stopping at its terminator.
  // A fixed four-byte copy would read past the end of |src| whenever fewer
  // than three characters follow the hyphen (e.g. "en-").
  char code[5];
  size_t code_len = 0;
  while ((code_len < 4) && (suffix[code_len] != '\0')) {
    code[code_len] = suffix[code_len];
    ++code_len;
  }
  code[code_len] = '\0';

  // Binary search over kNameScriptPair, which is sorted by name. The bound
  // comes from the table itself so the search can neither run past its end
  // nor skip trailing entries if the table and the script enum diverge.
  size_t lo = 0;
  size_t hi = sizeof(kNameScriptPair) / sizeof(kNameScriptPair[0]);
  while (lo < hi) {
    const size_t mid = lo + (hi - lo) / 2;
    const int order = strcmp(code, kNameScriptPair[mid].name);
    if (order < 0) {
      hi = mid;
    } else if (order > 0) {
      lo = mid + 1;
    } else {
      return kNameScriptPair[mid].lscript;
    }
  }
  return ULScript_Latin;
}
// Folds closely related language variants onto one representative value:
// BOSNIAN -> CROATIAN, SERBO_CROATIAN -> SERBIAN, and both PORTUGUESE_P and
// PORTUGUESE_B -> PORTUGUESE. Every other value is returned unchanged.
Language NormalizeLanguage(Language lang) {
  switch (lang) {
    case BOSNIAN:
      return CROATIAN;
    case SERBO_CROATIAN:
      return SERBIAN;
    case PORTUGUESE_P:
    case PORTUGUESE_B:
      return PORTUGUESE;
    default:
      return lang;
  }
}