This source file includes the following definitions.
- BiHashV25
- QuadHashV25Mix
- QuadHashV25
- QuadHashV25Underscore
- OctaHash40Mix
- OctaHash40
- OctaHash40underscore
- ProcessProbV25UniTote
- ProcessProbV25Tote
- DoUniScoreV3
- DoBigramScoreV3
- DoQuadScoreV3
- DoOctaScoreV3
- ReliabilityDelta
- ReliabilityMainstream
- GetNormalizedScore
- GetReliability
- DemoteNotTop40
#include <string>
#include "encodings/compact_lang_det/cldutil.h"
#include "base/basictypes.h"
#include "encodings/compact_lang_det/cldutil_dbg.h"
#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
#include "encodings/compact_lang_det/win/cld_logging.h"
#include "encodings/compact_lang_det/win/cld_unilib.h"
#include "encodings/compact_lang_det/win/cld_utf.h"
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
// Bits OR'ed into a hash's "prepost" value when the word is preceded by a
// space (or, in the underscore hash variants, begins with '_').
static const uint32 kPreSpaceIndicator = 0x00004444;
// Bits OR'ed into a hash's "prepost" value when the word is followed by a
// space (or, in the underscore hash variants, ends with '_').
static const uint32 kPostSpaceIndicator = 0x44440000;
// Masks applied to the last 32-bit word loaded by the hash functions, indexed
// by (bytecount & 3): index 0 keeps all four bytes of the loaded value,
// indices 1..3 keep only its low-order 1, 2 or 3 bytes.
static const uint32 kWordMask0[4] = {
0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
};
// DoBigramScoreV3 only looks up a character pair whose combined UTF-8 length
// is at least twice this many bytes.
static const int kMinCJKUTF8CharBytes = 3;
// Lower and upper clamps for the score-delta threshold computed in
// ReliabilityDelta.
static const int kMinGramCount = 3;
static const int kMaxGramCount = 16;
// Hashes up to eight bytes starting at word_ptr into 32 bits.
//   word_ptr:  first byte to hash.
//   bytecount: number of bytes in the gram; 0 yields a hash of 0.
// The last 32-bit word loaded is masked with kWordMask0[bytecount & 3] so
// that bytes beyond the gram do not contribute.
uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {return 0;}

  const uint32 tail_mask = kWordMask0[bytecount & 3];

  if (bytecount <= 4) {
    // Everything fits in one (masked) load.
    const uint32 only = UnalignedLoad32(word_ptr) & tail_mask;
    return only ^ (only >> 3);
  }

  // Two loads: the first is used whole, the second is masked.
  const uint32 first = UnalignedLoad32(word_ptr);
  const uint32 second = UnalignedLoad32(word_ptr + 4) & tail_mask;
  return (first ^ (first >> 3)) + (second ^ (second << 18));
}
// Mixes up to twelve bytes starting at word_ptr into a 32-bit hash and folds
// in the caller-supplied prepost indicator bits.
//   word_ptr:  first byte to hash.
//   bytecount: number of bytes in the gram; selects one, two or three 32-bit
//              loads, the last of which is masked with
//              kWordMask0[bytecount & 3].
//   prepost:   indicator bits XOR'ed into the mixed first word.
uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
  const uint32 tail_mask = kWordMask0[bytecount & 3];

  if (bytecount <= 4) {
    const uint32 only = UnalignedLoad32(word_ptr) & tail_mask;
    return (only ^ (only >> 3)) ^ prepost;
  }

  // The first word is always used whole once more than four bytes are hashed.
  const uint32 first = UnalignedLoad32(word_ptr);
  const uint32 head = (first ^ (first >> 3)) ^ prepost;

  if (bytecount <= 8) {
    const uint32 second = UnalignedLoad32(word_ptr + 4) & tail_mask;
    return head + (second ^ (second << 4));
  }

  const uint32 second = UnalignedLoad32(word_ptr + 4);
  const uint32 third = UnalignedLoad32(word_ptr + 8) & tail_mask;
  return head + (second ^ (second << 4)) + (third ^ (third << 2));
}
// Hashes a quadgram, distinguishing grams that touch a word boundary.
//   word_ptr:  first byte of the gram. word_ptr[-1] and word_ptr[bytecount]
//              are both read, so the caller must make those bytes readable.
//   bytecount: number of bytes in the gram; 0 yields a hash of 0.
// A space immediately before/after the gram sets the pre/post indicator bits
// that QuadHashV25Mix folds into the result.
uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {return 0;}

  const bool space_before = (word_ptr[-1] == ' ');
  const bool space_after = (word_ptr[bytecount] == ' ');

  uint32 prepost = 0;
  if (space_before) {prepost |= kPreSpaceIndicator;}
  if (space_after) {prepost |= kPostSpaceIndicator;}
  return QuadHashV25Mix(word_ptr, bytecount, prepost);
}
// Variant of QuadHashV25 in which a leading/trailing '_' inside the gram
// stands for the word boundary instead of an adjacent space.
//   word_ptr:  first byte of the gram (possibly the leading '_').
//   bytecount: number of bytes including any '_' markers; 0 yields 0.
// The '_' markers are stripped before hashing and replaced by the matching
// pre/post indicator bits.
// NOTE(review): for a gram that is exactly "_", the trailing test re-reads
// that same byte and the length passed on becomes -1 -- confirm callers never
// pass such a gram.
uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {return 0;}

  const char* start = word_ptr;
  int remaining = bytecount;
  uint32 prepost = 0;

  // Leading marker: skip it and remember the boundary.
  if (*start == '_') {
    prepost |= kPreSpaceIndicator;
    start += 1;
    remaining -= 1;
  }

  // Trailing marker: drop it and remember the boundary.
  const char* last = start + (remaining - 1);
  if (*last == '_') {
    prepost |= kPostSpaceIndicator;
    remaining -= 1;
  }

  return QuadHashV25Mix(start, remaining, prepost);
}
// Mixes up to 24 bytes starting at word_ptr into a 64-bit hash value.
//   word_ptr:  first byte of the word. word_ptr[-1] and word_ptr[bytecount]
//              are also read here, so those bytes must be readable.
//   bytecount: number of bytes in the word. (bytecount - 1) >> 2 selects how
//              many 32-bit words are loaded (one to five); any other value
//              takes the default case, which loads six. The last word loaded
//              is masked with kWordMask0[bytecount & 3].
//   prepost:   indicator bits supplied by the caller; the space indicators
//              are additionally OR'ed in here when a space is adjacent.
// Returns (mixed words ^ prepost) plus an 8-bit fold of the sum of the
// unmixed words placed in bits 32..39.
uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
  uint64 word0;
  uint64 word1;
  uint64 sum;

  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}

  switch ((bytecount - 1) >> 2) {
    case 0:       // 1..4 bytes
      word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
      sum = word0;
      word0 = word0 ^ (word0 >> 3);
      break;
    case 1:       // 5..8 bytes
      word0 = UnalignedLoad32(word_ptr);
      sum = word0;
      word0 = word0 ^ (word0 >> 3);
      word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
      sum += word1;
      word1 = word1 ^ (word1 << 4);
      word0 += word1;
      break;
    case 2:       // 9..12 bytes
      word0 = UnalignedLoad32(word_ptr);
      sum = word0;
      word0 = word0 ^ (word0 >> 3);
      word1 = UnalignedLoad32(word_ptr + 4);
      sum += word1;
      word1 = word1 ^ (word1 << 4);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
      sum += word1;
      word1 = word1 ^ (word1 << 2);
      word0 += word1;
      break;
    case 3:       // 13..16 bytes
      word0 = UnalignedLoad32(word_ptr);
      sum = word0;
      word0 = word0 ^ (word0 >> 3);
      word1 = UnalignedLoad32(word_ptr + 4);
      sum += word1;
      word1 = word1 ^ (word1 << 4);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 8);
      sum += word1;
      word1 = word1 ^ (word1 << 2);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3];
      sum += word1;
      word1 = word1 ^ (word1 >> 8);
      word0 += word1;
      break;
    case 4:       // 17..20 bytes
      word0 = UnalignedLoad32(word_ptr);
      sum = word0;
      word0 = word0 ^ (word0 >> 3);
      word1 = UnalignedLoad32(word_ptr + 4);
      sum += word1;
      word1 = word1 ^ (word1 << 4);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 8);
      sum += word1;
      word1 = word1 ^ (word1 << 2);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 12);
      sum += word1;
      word1 = word1 ^ (word1 >> 8);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 16) & kWordMask0[bytecount & 3];
      sum += word1;
      word1 = word1 ^ (word1 >> 4);
      word0 += word1;
      break;
    default:      // everything else: only the first 24 bytes contribute
      // Load the word's own bytes. Passing &word_ptr here would hash the
      // bytes of the pointer variable instead, giving an address-dependent
      // result for long words.
      word0 = UnalignedLoad32(word_ptr);
      sum = word0;
      word0 = word0 ^ (word0 >> 3);
      word1 = UnalignedLoad32(word_ptr + 4);
      sum += word1;
      word1 = word1 ^ (word1 << 4);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 8);
      sum += word1;
      word1 = word1 ^ (word1 << 2);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 12);
      sum += word1;
      word1 = word1 ^ (word1 >> 8);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 16);
      sum += word1;
      word1 = word1 ^ (word1 >> 4);
      word0 += word1;
      word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3];
      sum += word1;
      word1 = word1 ^ (word1 >> 6);
      word0 += word1;
      break;
  }

  // Fold the plain sum of the loaded words down to eight bits and place them
  // above the low 32 bits of the result.
  sum += (sum >> 17);
  sum += (sum >> 9);
  sum = (sum & 0xff) << 32;
  return (word0 ^ prepost) + sum;
}
// Hashes a whole word via OctaHash40Mix, marking adjacent spaces.
//   word_ptr:  first byte of the word. word_ptr[-1] and word_ptr[bytecount]
//              are both read, so those bytes must be readable.
//   bytecount: number of bytes in the word; 0 yields a hash of 0.
uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {return 0;}

  const bool space_before = (word_ptr[-1] == ' ');
  const bool space_after = (word_ptr[bytecount] == ' ');

  uint64 prepost = 0;
  if (space_before) {prepost |= kPreSpaceIndicator;}
  if (space_after) {prepost |= kPostSpaceIndicator;}
  return OctaHash40Mix(word_ptr, bytecount, prepost);
}
// Variant of OctaHash40 in which a leading/trailing '_' inside the word
// stands for the word boundary.
//   word_ptr:  first byte of the word (possibly the leading '_').
//   bytecount: number of bytes including any '_' markers; 0 yields 0.
// The '_' markers are stripped and replaced by the matching pre/post
// indicator bits before the remaining bytes go to OctaHash40Mix.
// NOTE(review): for a word that is exactly "_", the trailing test re-reads
// that same byte and the length passed on becomes -1 -- confirm callers never
// pass such a word.
uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {return 0;}

  const char* start = word_ptr;
  int remaining = bytecount;
  uint64 prepost = 0;

  // Leading marker: skip it and remember the boundary.
  if (*start == '_') {
    prepost |= kPreSpaceIndicator;
    start += 1;
    remaining -= 1;
  }

  // Trailing marker: drop it and remember the boundary.
  const char* last = start + (remaining - 1);
  if (*last == '_') {
    prepost |= kPostSpaceIndicator;
    remaining -= 1;
  }

  return OctaHash40Mix(start, remaining, prepost);
}
void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
tote->AddGram();
const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
}
// Records one gram in tote and adds the scores packed into probs.
//   probs: byte 0 selects a probability-table entry (via LgProb2TblEntry);
//          bytes 1, 2 and 3 hold up to three packed language keys.
//   tote:  accumulator; AddGram() is called once, then Add() once for each
//          non-zero language key, scored with LgProb3(entry, slot).
void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
  tote->AddGram();

  const uint8 table_index = probs & 0xff;
  const uint8* entry = cld::LgProb2TblEntry(table_index);

  // Slot 0/1/2 lives in bits 8..15 / 16..23 / 24..31 respectively.
  for (int slot = 0; slot < 3; ++slot) {
    const uint8 packed_lang = (probs >> (8 * (slot + 1))) & 0xff;
    if (packed_lang > 0) {
      tote->Add(packed_lang, cld::LgProb3(entry, slot));
    }
  }
}
int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
const char* isrc, int srclen, int advance_by,
int* tote_grams, int gram_limit, Tote* chunk_tote) {
const char* src = isrc;
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
if (src[0] == ' ') {++src; --srclen;}
const uint8* usrc = reinterpret_cast<const uint8*>(src);
int usrclen = srclen;
while (usrclen > 0) {
int len = kAdvanceOneChar[usrc[0]];
int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
if (FLAGS_dbglookup) {
DbgUniTermToStderr(propval, usrc, len);
}
if (propval > 0) {
ProcessProbV25UniTote(propval, chunk_tote);
++(*tote_grams);
if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
}
if (advance_by == 2) {
} else if (advance_by == 4) {
if (UTFmax <= usrclen) {
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
}
} else if (advance_by == 8) {
if ((UTFmax * 3) <= usrclen) {
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
}
} else {
if ((UTFmax * 7) <= usrclen) {
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
}
}
DCHECK(usrclen >= 0);
if (*tote_grams >= gram_limit) {
break;
}
}
if (FLAGS_dbgscore) {
int len = src - isrc;
DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
DbgScoreFlush();
}
int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
return consumed2;
}
// Scores overlapping two-character grams from isrc into chunk_tote.
//   bigram_obj:  lookup table; the value returned by QuadHashV3Lookup4 is
//                masked with ~kCLDTableKeyMask and used to index kCLDTableInd.
//   isrc/srclen: text to score; scanning stops UTFmax bytes before the end.
//   chunk_tote:  score accumulator.
// Only pairs whose combined length is at least 2 * kMinCJKUTF8CharBytes are
// looked up. The scan always advances by one character.
// Returns the number of bigrams with a non-zero table entry.
int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
                         const char* isrc, int srclen, Tote* chunk_tote) {
  int scored = 0;
  const uint8* cursor = reinterpret_cast<const uint8*>(isrc);
  const uint8* const stop = cursor + srclen - UTFmax;

  if (FLAGS_dbgscore) {
    fprintf(stderr, " " );
  }

  for (; cursor < stop; cursor += kAdvanceOneChar[cursor[0]]) {
    const int first_len = kAdvanceOneChar[cursor[0]];
    const int pair_len = first_len + kAdvanceOneChar[cursor[first_len]];

    // Too short to be two characters of the minimum length: nothing to score.
    if (pair_len < (kMinCJKUTF8CharBytes * 2)) {
      continue;
    }

    const char* pair = reinterpret_cast<const char*>(cursor);
    const uint32 bihash = BiHashV25(pair, pair_len);
    uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
    // Go indirect through the table of packed probabilities.
    probs = bigram_obj->kCLDTableInd[probs & ~bigram_obj->kCLDTableKeyMask];

    if (FLAGS_dbglookup) {
      DbgBiTermToStderr(bihash, probs, pair, pair_len);
      DbgScoreRecord(NULL, probs, pair_len);
    } else if (FLAGS_dbgscore && (probs != 0)) {
      DbgScoreRecord(NULL, probs, pair_len);
      string temp(pair, pair_len);
      fprintf(stderr, "%s ", temp.c_str());
    }

    if (probs != 0) {
      ProcessProbV25Tote(probs, chunk_tote);
      ++scored;
    }
  }

  if (FLAGS_dbgscore) {
    fprintf(stderr, "[%d bigrams scored]\n", scored);
    DbgScoreState();
  }
  return scored;
}
// Scores quadgrams from isrc into chunk_tote.
//   quadgram_obj: lookup table; the value returned by QuadHashV3Lookup4 is
//                 masked with ~kCLDTableKeyMask and used to index
//                 kCLDTableInd.
//   isrc/srclen:  text to score; one leading space is skipped.
//   advance_by:   2 restarts at the middle of the current quadgram, 4 at its
//                 end, 8 and any other value skip further ahead when enough
//                 input remains.
//   tote_grams:   incremented for every quadgram that is scored.
//   gram_limit:   scanning stops once *tote_grams reaches this value.
//   chunk_tote:   score accumulator.
// Returns the number of bytes consumed from isrc, never more than srclen.
// NOTE(review): bytes at and beyond isrc + srclen are read (src_end[0],
// src[0] after the skips); presumably the caller pads the buffer -- confirm.
int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
const char* isrc, int srclen, int advance_by,
int* tote_grams, int gram_limit, Tote* chunk_tote) {
const char* src = isrc;
const char* srclimit = src + srclen;
// The longer skips below are only taken while src is before these limits.
const char* srclimit7 = src + srclen - (UTFmax * 7);
const char* srclimit15 = src + srclen - (UTFmax * 15);
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
// Two-entry record of the most recently scored hashes; a quadgram whose hash
// equals either entry is not scored again.
int next_prior = 0;
uint32 prior_quads[2] = {0, 0};
if (src[0] == ' ') {++src;}
while (src < srclimit) {
// Delimit one quadgram: four table-driven steps from src, remembering the
// position after the second step as the midpoint.
// NOTE(review): kAdvanceOneCharButSpace presumably yields 0 at a space so a
// quadgram does not run past a word end -- confirm against the table.
const char* src_end = src;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
const char* src_mid = src_end;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
int len = src_end - src;
// Hash the quadgram and fetch its packed probabilities.
uint32 quadhash = QuadHashV25(src, len);
uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
probs = quadgram_obj->kCLDTableInd[probs &
~quadgram_obj->kCLDTableKeyMask];
if (FLAGS_dbglookup) {
DbgQuadTermToStderr(quadhash, probs, src, len);
}
if (probs != 0) {
// Score only if this hash differs from both remembered hashes, then
// remember it in the slot that alternates between 0 and 1.
if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
prior_quads[next_prior] = quadhash;
next_prior = (next_prior + 1) & 1;
ProcessProbV25Tote(probs, chunk_tote);
++(*tote_grams);
if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
}
}
// If the quadgram ends right at a space, the advance_by == 2 restart point
// becomes the end of the quadgram rather than its midpoint.
if (src_end[0] == ' ') {
src_mid = src_end;
}
// Choose where the next quadgram starts.
if (advance_by == 2) {
src = src_mid;
} else if (advance_by == 4) {
src = src_end;
} else if (advance_by == 8) {
// Step four more characters past the quadgram while far enough from the end.
if (src < srclimit7) {
src_end += kAdvanceOneChar[(uint8)src_end[0]];
src_end += kAdvanceOneChar[(uint8)src_end[0]];
src_end += kAdvanceOneChar[(uint8)src_end[0]];
src_end += kAdvanceOneChar[(uint8)src_end[0]];
}
src = src_end;
} else {
// Jump ahead by three more times the byte length of this quadgram while far
// enough from the end, then apply two table-driven adjustments.
if (src < srclimit15) {
int fourcharlen = src_end - src;
src = src_end + (3 * fourcharlen);
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
} else {
src = src_end;
}
}
DCHECK(src < srclimit);
// One further table-driven adjustment of the start position.
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
if (*tote_grams >= gram_limit) {
break;
}
}
if (FLAGS_dbgscore) {
int len = src - isrc;
DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
DbgScoreFlush();
}
// src may have moved past the end of the input; never report more than
// srclen bytes consumed.
int consumed = src - isrc;
if (consumed > srclen) {
consumed = srclen;
}
return consumed;
}
// Scores space-delimited words from isrc into chunk_tote.
//   octagram_obj: lookup table; the value returned by OctaHashV3Lookup4 is
//                 masked with ~kCLDTableKeyMask and used to index
//                 kCLDTableInd.
//   isrc/srclen:  text to score; one leading space is skipped and the scan
//                 runs through isrc + srclen inclusive.
//   chunk_tote:   score accumulator.
// Only the first eight characters of each word are hashed. Scanning stops
// early at a space that follows an empty word.
// Returns the number of words with a non-zero table entry.
int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
                       const char* isrc, int srclen, Tote* chunk_tote) {
  int scored = 0;
  const char* cursor = isrc;
  const char* const stop = isrc + srclen + 1;
  int chars_in_word = 0;

  if (cursor[0] == ' ') {++cursor;}
  const char* word_start = cursor;
  const char* word_stop = cursor;

  if (FLAGS_dbgscore) {
    fprintf(stderr, " " );
  }

  while (cursor < stop) {
    if (cursor[0] != ' ') {
      // Still inside a word.
      ++chars_in_word;
    } else {
      // A space ends the current word: look it up and score it.
      const int word_bytes = word_stop - word_start;
      if (word_bytes == 0) {
        break;
      }

      const uint64 wordhash40 = OctaHash40(word_start, word_bytes);
      uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
      // Go indirect through the table of packed probabilities.
      probs = octagram_obj->kCLDTableInd[probs &
                                         ~octagram_obj->kCLDTableKeyMask];

      if (FLAGS_dbglookup) {
        DbgWordTermToStderr(wordhash40, probs, word_start, word_bytes);
        DbgScoreRecord(NULL, probs, word_bytes);
      } else if (FLAGS_dbgscore && (probs != 0)) {
        DbgScoreRecord(NULL, probs, word_bytes);
        string temp(word_start, word_bytes);
        fprintf(stderr, "%s ", temp.c_str());
      }

      if (probs != 0) {
        ProcessProbV25Tote(probs, chunk_tote);
        ++scored;
      }

      // The next word begins just past this space.
      chars_in_word = 0;
      word_start = cursor + 1;
      word_stop = word_start;
    }

    cursor += cld_UniLib::OneCharLen(cursor);
    // The word's end only follows the cursor for its first eight characters.
    if (chars_in_word <= 8) {
      word_stop = cursor;
    }
  }

  if (FLAGS_dbgscore) {
    fprintf(stderr, "[%d words scored]\n", scored);
    DbgScoreState();
  }
  return scored;
}
// Reliability percentage derived from the gap between two scores.
//   value1, value2: the two scores; the gap is value1 - value2.
//   gramcount:      number of grams scored. Below 8 grams the result is
//                   capped at 12 * gramcount instead of 100.
// The gap needed for the maximum result is (gramcount * 5) >> 3, clamped to
// [kMinGramCount, kMaxGramCount]. A gap of zero or less gives 0; gaps in
// between scale linearly, capped at the maximum.
int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
  const int ceiling_percent = (gramcount < 8) ? (12 * gramcount) : 100;

  int full_thresh = (gramcount * 5) >> 3;
  if (full_thresh < kMinGramCount) {
    full_thresh = kMinGramCount;
  } else if (full_thresh > kMaxGramCount) {
    full_thresh = kMaxGramCount;
  }

  const int gap = value1 - value2;
  if (gap <= 0) {
    return 0;
  }
  if (gap >= full_thresh) {
    return ceiling_percent;
  }
  return cld::minint(ceiling_percent, (100 * gap) / full_thresh);
}
// Reliability percentage derived from how far the top score per 1024 bytes
// lies from an expected mean.
//   topscore:   the top score; 0 gives 0.
//   len:        length used to normalise topscore; 0 gives 0.
//   mean_score: expected value of (topscore << 10) / len; 0 gives 100.
// The larger of the two values is divided by the smaller. When the top score
// is above the mean the result ramps from 100 down to 0 as that ratio goes
// from 3.0 to 5.0; otherwise it ramps over 2.0 to 4.0.
int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
  if (mean_score == 0) {return 100;}
  if ((topscore == 0) || (len == 0)) {return 0;}

  const int top_kb = (topscore << 10) / len;
  const bool above_mean = (top_kb > mean_score);

  const double ratio_cutoff = above_mean ? 5.0 : 4.0;
  const double ratio = above_mean ? ((1.0 * top_kb) / mean_score)
                                  : ((1.0 * mean_score) / top_kb);

  if (ratio <= ratio_cutoff - 2.0) {return 100;}
  if (ratio > ratio_cutoff) {return 0;}

  return static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
}
// Returns score rescaled against the expected mean score for lang/lscript:
// (score * 1000.0) / expected_score.
//   lang, lscript: select the expected value kMeanScore[lang * 4 +
//                  LScript4(lscript)]. For ULScript_Common the first positive
//                  entry among columns 2, 1, 0 of the language's row is used
//                  instead, if there is one.
//   bytes:         accepted for interface compatibility; it does not
//                  influence the result.
//   score:         raw score to normalise.
// An expected value below 100 is replaced by 1000.
double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
                               int bytes, int score) {
  // The per-byte score formerly computed from bytes was immediately
  // overwritten; it has been dropped along with its signed left shift.
  static_cast<void>(bytes);

  int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
  if (lscript == ULScript_Common) {
    // Script unknown: take the first positive mean for this language.
    for (int i = 2; i >= 0; --i) {
      if (kMeanScore[lang * 4 + i] > 0) {
        expected_score = kMeanScore[lang * 4 + i];
        break;
      }
    }
  }
  if (expected_score < 100) {
    expected_score = 1000;
  }

  return (score * 1000.0) / expected_score;
}
int cld::GetReliability(int len, UnicodeLScript lscript,
const Tote* chunk_tote) {
Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
if (lscript == ULScript_Common) {
for (int i = 2; i >= 0; --i) {
if (kMeanScore[cur_lang * 4 + i] > 0) {
mean_score = kMeanScore[cur_lang * 4 + i];
break;
}
}
}
int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
chunk_tote->Value(1),
chunk_tote->GetGramCount());
int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
len,
mean_score);
int reliability_min = minint(reliability_delta, reliability_main);
if (FLAGS_dbgreli) {
char temp1[4];
char temp2[4];
cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
if (temp1[2] == ' ') {temp1[2] = '\0';}
cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
if (temp2[2] == ' ') {temp2[2] = '\0';}
int srclen = len;
fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d "
"top/KB=%d mean/KB=%d del=%d%% reli=%d%% "
"lang/lscript %d %d\n",
chunk_tote->GetGramCount(),
chunk_tote->GetIncrCount(),
srclen,
temp1, chunk_tote->Value(0),
temp2, chunk_tote->Value(1),
(chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
mean_score,
reliability_delta,
reliability_main,
cur_lang, lscript);
}
return reliability_min;
}
// Quarters the score of every entry in chunk_tote except those that are
// empty (key 0), equal to packed_plus_one, or flagged in kIsPackedTop40.
//   chunk_tote:      scores to adjust in place.
//   packed_plus_one: key that is always left untouched.
void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
  for (int slot = 0; slot < chunk_tote->MaxSize(); ++slot) {
    const bool keep_score = (chunk_tote->Key(slot) == 0) ||
                            (chunk_tote->Key(slot) == packed_plus_one) ||
                            kIsPackedTop40[chunk_tote->Key(slot)];
    if (!keep_score) {
      chunk_tote->SetValue(slot, chunk_tote->Value(slot) >> 2);
    }
  }
}