This source file includes the following definitions.
- FlagFinish
- FlagSqueeze
- FlagRepeats
- FlagTop40
- FlagShort
- FlagHint
- FlagUseWords
- BackscanToSpace
- ForwardscanToSpace
- CountPredictedBytes
- CountSpaces4
- CheapRepWordsInplace
- CheapSqueezeInplace
- CheapSqueezeTriggerTest
- RemoveExtendedLanguages
- RemoveUnreliableLanguages
- RefineScoredClosePairs
- ApplyLanguageHints
- PrintHtmlEscapedText
- ScoreChunkIntoDoc
- PrintTopLang
- PrintTopLangSpeculative
- ScoreChunkIntoDoc2
- ScoreNilgrams
- ScoreUnigrams
- BackOneUTF8
- ScoreQuadgrams
- PrintLangs
- InitScriptToteLang
- MakeChar4
- HintBinaryLookup4
- ApplyTLDHint
- ApplyEncodingHint
- ApplyLanguageHint
- ExtractLangEtc
- IsFIGS
- IsEFIGS
- CalcSummaryLang
- DetectLanguageSummaryV25
#include <stdio.h>
#include <string.h>
#include <string>
#include <vector>
#include "encodings/lang_enc.h"
#include "encodings/compact_lang_det/compact_lang_det.h"
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
#include "encodings/compact_lang_det/getonescriptspan.h"
#include "encodings/compact_lang_det/letterscript_enum.h"
#include "encodings/compact_lang_det/tote.h"
#include "encodings/compact_lang_det/utf8propjustletter.h"
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
#include "encodings/compact_lang_det/cldutil_dbg.h"
#include "encodings/compact_lang_det/win/cld_basictypes.h"
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
#include "encodings/compact_lang_det/win/cld_google.h"
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
// Lookup objects that are defined in other translation units and only
// declared here: one UTF-8 property object and three CLD table summaries.
// NOTE(review): judging by the names these are the CJK bigram, quadgram and
// long-word tables -- confirm against the generated table sources.
extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj;
extern const cld::CLDTableSummary kCjkBiTable_obj;
extern const cld::CLDTableSummary kQuadTable_obj;
extern const cld::CLDTableSummary kLongWord8Table_obj;
// Debugging / tuning flags. Each DEFINE_* gives the flag name, its default
// value and its help text; the value is read through FLAGS_<name>.
// In this file FLAGS_cld_html gates the "{Unreli ...}" traces written to
// stderr, and FLAGS_cld_showme makes the squeeze/repeat passes leave visible
// markers in the text they rewrite.
DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr");
DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads");
DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text");
DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr");
DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text");
DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams");
// Tuning constants. Only some of them are referenced in the part of the file
// reviewed here; the others are presumably consumed by the scoring code
// further down -- confirm before changing any value.
static const int kLangHintInitial = 12;
static const int kLangHintBoost = 12;
static const int kShortSpanThresh = 32;
static const int kMaxSecondChanceLen = 1024;
static const int kCheapSqueezeTestThresh = 4096;
static const int kCheapSqueezeTestLen = 256;
// Percent-of-testsize thresholds used by CheapSqueezeTriggerTest: spaces
// counted / bytes predicted at or above these levels trigger squeezing.
static const int kSpacesTriggerPercent = 25;
static const int kPredictTriggerPercent = 67;
// Chunk size CheapSqueezeInplace falls back to when the caller passes 0.
static const int kChunksizeDefault = 48;
// Percent-of-chunk thresholds used by CheapSqueezeInplace: a chunk is dropped
// when its space count or predicted-byte count reaches these levels.
static const int kSpacesThreshPercent = 25;
static const int kPredictThreshPercent = 40;
// Upper bound on the bytes examined by BackscanToSpace / ForwardscanToSpace.
static const int kMaxSpaceScan = 32;
static const int kGoodLang1Percent = 70;
static const int kGoodLang1and2Percent = 93;
static const int kShortTextThresh = 256;
static const int kMinChunkSizeQuads = 4;
static const int kMaxChunkSizeQuads = 1024;
static const int kDefaultWordSpan = 256;
static const int kReallyBigWordSpan = 9999999;
static const int kMinReliableSeq = 50;
// Number of int slots in the byte-prediction table. The prediction hash is
// masked with 0xfff, so the table must hold at least 4096 entries.
static const int kPredictionTableSize = 4096;
static const uint32 kEncodingHintProbs[] = {
0x00000000,
0x18120cd5,
0x1d3a4bc9,
0x030819d4,
0x00000000,
0x00003742,
0x00000000,
0x00000742,
0x00002242,
0x060419c9,
0x00000942,
0x00000942,
0x00000942,
0x00004642,
0x00001142,
0x46295fcd,
0x00000a42,
0x00000000,
0x03104674,
0x00000000,
0x0f1146c3,
0x00000942,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x451d12cd,
0x0d06052a,
0x00002242,
0x191516be,
0x08003642,
0x00000000,
0x00003742,
0x00000742,
0x00000000,
0x00000000,
0x00000000,
0x39001242,
0x00000000,
0x00000000,
0x2e001944,
0x08090a74,
0x00001142,
0x4600113d,
0x00004642,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x090646ca,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x06001142,
0x461109c2,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
};
COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS,
kEncodingHintProbs_has_incorrect_size);
static const uint32 kLanguageHintProbs[] = {
0x00000000,
0x00000242,
0x00000342,
0x00000442,
0x00000542,
0x00000642,
0x00000742,
0x00000842,
0x00000942,
0x00000a42,
0x51000b43,
0x00000c42,
0x00000d42,
0x00000000,
0x00000f42,
0x00001042,
0x00001142,
0x00001242,
0x00000000,
0x47001442,
0x00001542,
0x00001642,
0x00001742,
0x00001842,
0x00001942,
0x00000000,
0x00000000,
0x00001c42,
0x00001d42,
0x1e001d46,
0x00000000,
0x0f00203d,
0x5e00213a,
0x00002242,
0x00002342,
0x00000000,
0x1c1e25d4,
0x00002642,
0x00002742,
0x00000000,
0x2700293c,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00002e42,
0x00000000,
0x00003042,
0x00000000,
0x00000000,
0x375f3330,
0x41003436,
0x00000000,
0x0a4636b2,
0x00003742,
0x00003842,
0x00003942,
0x00003a42,
0x00000000,
0x00000000,
0x05060cca,
0x00000000,
0x00003f42,
0x00004042,
0x00004142,
0x00004242,
0x00004342,
0x00000000,
0x12004543,
0x00004642,
0x00000000,
0x00000000,
0x79004944,
0x4d004a46,
0x00004b42,
0x00000000,
0x00000000,
0x00004e42,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x7a005933,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00005f42,
0x00006042,
0x00006142,
0x051130c9,
0x020f0521,
0x64004e35,
0x00000000,
0x00006642,
0x00000000,
0x00006842,
0x00002242,
0x88006a3c,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00007342,
0x00000000,
0x00000000,
0x5f007645,
0x00000000,
0x00000000,
0x00007942,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000542,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x344197d3,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
};
COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES,
kLanguageHintProbs_has_incorrect_size);
// One row of a hint table: a 4-byte key (shorter keys are filled out with
// '_' in kTLDHintProbs) and a packed 32-bit probability word.
struct HintEntry {
  char key[4];
  uint32 probs;
};
static const int kTLDHintProbsSize = 201;
static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = {
{{0x61,0x63,0x5f,0x5f}, 0x0a000945},
{{0x61,0x64,0x5f,0x5f}, 0x00003842},
{{0x61,0x65,0x5f,0x5f}, 0x00003742},
{{0x61,0x66,0x5f,0x5f}, 0x4e00763d},
{{0x61,0x67,0x5f,0x5f}, 0x09000643},
{{0x61,0x69,0x5f,0x5f}, 0x0c180938},
{{0x61,0x6c,0x5f,0x5f}, 0x00002e42},
{{0x61,0x6e,0x5f,0x5f}, 0x6e00033d},
{{0x61,0x6f,0x5f,0x5f}, 0x05000d42},
{{0x61,0x71,0x5f,0x5f}, 0x05000f29},
{{0x61,0x72,0x5f,0x5f}, 0x00000f42},
{{0x61,0x73,0x5f,0x5f}, 0x0f120bcd},
{{0x61,0x74,0x5f,0x5f}, 0x00000642},
{{0x61,0x77,0x5f,0x5f}, 0x0f000345},
{{0x61,0x78,0x5f,0x5f}, 0x00001042},
{{0x61,0x7a,0x5f,0x5f}, 0x00004b42},
{{0x62,0x61,0x5f,0x5f}, 0x00001d42},
{{0x62,0x62,0x5f,0x5f}, 0x00002842},
{{0x62,0x64,0x5f,0x5f}, 0x00002642},
{{0x62,0x65,0x5f,0x5f}, 0x05000335},
{{0x62,0x66,0x5f,0x5f}, 0x00000542},
{{0x62,0x67,0x5f,0x5f}, 0x00001c42},
{{0x62,0x68,0x5f,0x5f}, 0x00003742},
{{0x62,0x69,0x5f,0x5f}, 0x0f00053f},
{{0x62,0x6a,0x5f,0x5f}, 0x00000542},
{{0x62,0x6d,0x5f,0x5f}, 0x98043929},
{{0x62,0x6e,0x5f,0x5f}, 0x00002942},
{{0x62,0x6f,0x5f,0x5f}, 0x00000f42},
{{0x62,0x72,0x5f,0x5f}, 0x00000d42},
{{0x62,0x74,0x5f,0x5f}, 0x00008842},
{{0x62,0x77,0x5f,0x5f}, 0x06059ac4},
{{0x62,0x79,0x5f,0x5f}, 0x00003024},
{{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924},
{{0x63,0x61,0x5f,0x5f}, 0x00000542},
{{0x63,0x61,0x74,0x5f}, 0x00003842},
{{0x63,0x64,0x5f,0x5f}, 0x06051224},
{{0x63,0x66,0x5f,0x5f}, 0x00000542},
{{0x63,0x67,0x5f,0x5f}, 0x00000542},
{{0x63,0x68,0x5f,0x5f}, 0x08050638},
{{0x63,0x69,0x5f,0x5f}, 0x00000542},
{{0x63,0x6c,0x5f,0x5f}, 0x00000f42},
{{0x63,0x6d,0x5f,0x5f}, 0x00000542},
{{0x63,0x6e,0x5f,0x5f}, 0x00001142},
{{0x63,0x6f,0x5f,0x5f}, 0x00000f42},
{{0x63,0x72,0x5f,0x5f}, 0x00000f42},
{{0x63,0x75,0x5f,0x5f}, 0x00000f42},
{{0x63,0x76,0x5f,0x5f}, 0x00000d42},
{{0x63,0x78,0x5f,0x5f}, 0x223a091f},
{{0x63,0x79,0x5f,0x5f}, 0x150622ba},
{{0x63,0x7a,0x5f,0x5f}, 0x00001242},
{{0x64,0x65,0x5f,0x5f}, 0x00000642},
{{0x64,0x6b,0x5f,0x5f}, 0x00000242},
{{0x64,0x6f,0x5f,0x5f}, 0x21000f42},
{{0x64,0x7a,0x5f,0x5f}, 0x37000535},
{{0x65,0x63,0x5f,0x5f}, 0x00000f42},
{{0x65,0x65,0x5f,0x5f}, 0x00001942},
{{0x65,0x67,0x5f,0x5f}, 0x05003742},
{{0x65,0x72,0x5f,0x5f}, 0x00000b42},
{{0x65,0x73,0x5f,0x5f}, 0x38200fd4},
{{0x65,0x74,0x5f,0x5f}, 0x39004a39},
{{0x66,0x69,0x5f,0x5f}, 0x10000444},
{{0x66,0x6a,0x5f,0x5f}, 0x050489e0},
{{0x66,0x6f,0x5f,0x5f}, 0x00004742},
{{0x66,0x72,0x5f,0x5f}, 0x00000542},
{{0x67,0x61,0x5f,0x5f}, 0x00000542},
{{0x67,0x64,0x5f,0x5f}, 0x061d05d5},
{{0x67,0x65,0x5f,0x5f}, 0x00004c2d},
{{0x67,0x66,0x5f,0x5f}, 0x00000542},
{{0x67,0x67,0x5f,0x5f}, 0x06002244},
{{0x67,0x68,0x5f,0x5f}, 0x05000436},
{{0x67,0x69,0x5f,0x5f}, 0x0f0538ce},
{{0x67,0x6c,0x5f,0x5f}, 0x398a0238},
{{0x67,0x6d,0x5f,0x5f}, 0x0600043e},
{{0x67,0x6e,0x5f,0x5f}, 0x00000542},
{{0x67,0x70,0x5f,0x5f}, 0x00000542},
{{0x67,0x71,0x5f,0x5f}, 0x0f000547},
{{0x67,0x73,0x5f,0x5f}, 0x00000942},
{{0x67,0x74,0x5f,0x5f}, 0x00000f42},
{{0x68,0x6b,0x5f,0x5f}, 0x11004643},
{{0x68,0x6d,0x5f,0x5f}, 0x4606092e},
{{0x68,0x6e,0x5f,0x5f}, 0x00000f42},
{{0x68,0x72,0x5f,0x5f}, 0x00001d42},
{{0x68,0x74,0x5f,0x5f}, 0x0f000542},
{{0x68,0x75,0x5f,0x5f}, 0x00001842},
{{0x69,0x64,0x5f,0x5f}, 0x00002742},
{{0x69,0x65,0x5f,0x5f}, 0x050c1f24},
{{0x69,0x6c,0x5f,0x5f}, 0x00000742},
{{0x69,0x6e,0x74,0x5f}, 0x0f060574},
{{0x69,0x6f,0x5f,0x5f}, 0x11090fd5},
{{0x69,0x71,0x5f,0x5f}, 0x60003744},
{{0x69,0x72,0x5f,0x5f}, 0x00004e42},
{{0x69,0x73,0x5f,0x5f}, 0x00001442},
{{0x69,0x74,0x5f,0x5f}, 0x00000842},
{{0x6a,0x65,0x5f,0x5f}, 0x29050328},
{{0x6a,0x6d,0x5f,0x5f}, 0x040f0576},
{{0x6a,0x6f,0x5f,0x5f}, 0x00003742},
{{0x6a,0x70,0x5f,0x5f}, 0x00000942},
{{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3},
{{0x6b,0x69,0x5f,0x5f}, 0x04000643},
{{0x6b,0x6d,0x5f,0x5f}, 0x00000542},
{{0x6b,0x70,0x5f,0x5f}, 0x00000a42},
{{0x6b,0x72,0x5f,0x5f}, 0x00000a42},
{{0x6b,0x77,0x5f,0x5f}, 0x00003742},
{{0x6b,0x79,0x5f,0x5f}, 0x0500083f},
{{0x6b,0x7a,0x5f,0x5f}, 0x0000732d},
{{0x6c,0x62,0x5f,0x5f}, 0x05003747},
{{0x6c,0x63,0x5f,0x5f}, 0x09000645},
{{0x6c,0x69,0x5f,0x5f}, 0x1600063d},
{{0x6c,0x73,0x5f,0x5f}, 0x00005742},
{{0x6c,0x74,0x5f,0x5f}, 0x00001642},
{{0x6c,0x75,0x5f,0x5f}, 0x0600053d},
{{0x6c,0x76,0x5f,0x5f}, 0x00001542},
{{0x6c,0x79,0x5f,0x5f}, 0x05003744},
{{0x6d,0x61,0x5f,0x5f}, 0x3700053d},
{{0x6d,0x63,0x5f,0x5f}, 0x00000542},
{{0x6d,0x64,0x5f,0x5f}, 0x00001724},
{{0x6d,0x65,0x5f,0x5f}, 0x00001d42},
{{0x6d,0x67,0x5f,0x5f}, 0x00000542},
{{0x6d,0x6b,0x5f,0x5f}, 0x1c002543},
{{0x6d,0x6c,0x5f,0x5f}, 0x00000542},
{{0x6d,0x6e,0x5f,0x5f}, 0x00006142},
{{0x6d,0x6f,0x5f,0x5f}, 0x110d4631},
{{0x6d,0x71,0x5f,0x5f}, 0x00000542},
{{0x6d,0x72,0x5f,0x5f}, 0x37000535},
{{0x6d,0x73,0x5f,0x5f}, 0x090f06d5},
{{0x6d,0x74,0x5f,0x5f}, 0x00004242},
{{0x6d,0x75,0x5f,0x5f}, 0x05000934},
{{0x6d,0x76,0x5f,0x5f}, 0x28000436},
{{0x6d,0x77,0x5f,0x5f}, 0x0611092a},
{{0x6d,0x78,0x5f,0x5f}, 0x00000f42},
{{0x6d,0x79,0x5f,0x5f}, 0x00002942},
{{0x6d,0x7a,0x5f,0x5f}, 0x00000d42},
{{0x6e,0x61,0x5f,0x5f}, 0x06006644},
{{0x6e,0x63,0x5f,0x5f}, 0x00000542},
{{0x6e,0x65,0x5f,0x5f}, 0x8b000542},
{{0x6e,0x66,0x5f,0x5f}, 0x00000542},
{{0x6e,0x69,0x5f,0x5f}, 0x00000f42},
{{0x6e,0x6c,0x5f,0x5f}, 0x00000342},
{{0x6e,0x6f,0x5f,0x5f}, 0x51000b43},
{{0x6e,0x75,0x5f,0x5f}, 0x0300103b},
{{0x6f,0x6d,0x5f,0x5f}, 0x00003742},
{{0x70,0x61,0x5f,0x5f}, 0x00000f42},
{{0x70,0x65,0x5f,0x5f}, 0x00000f42},
{{0x70,0x66,0x5f,0x5f}, 0x00000542},
{{0x70,0x67,0x5f,0x5f}, 0x00000f24},
{{0x70,0x68,0x5f,0x5f}, 0x00002142},
{{0x70,0x6b,0x5f,0x5f}, 0x00003342},
{{0x70,0x6c,0x5f,0x5f}, 0x30000c42},
{{0x70,0x6e,0x5f,0x5f}, 0x04000644},
{{0x70,0x72,0x5f,0x5f}, 0x00000f42},
{{0x70,0x72,0x6f,0x5f}, 0x46050fd5},
{{0x70,0x73,0x5f,0x5f}, 0x00003742},
{{0x70,0x74,0x5f,0x5f}, 0x00000d42},
{{0x70,0x79,0x5f,0x5f}, 0x00000f42},
{{0x71,0x61,0x5f,0x5f}, 0x00003742},
{{0x72,0x65,0x5f,0x5f}, 0x00000542},
{{0x72,0x6f,0x5f,0x5f}, 0x00001742},
{{0x72,0x73,0x5f,0x5f}, 0x00001d42},
{{0x72,0x77,0x5f,0x5f}, 0x9000053e},
{{0x73,0x61,0x5f,0x5f}, 0x00003742},
{{0x73,0x62,0x5f,0x5f}, 0x00000442},
{{0x73,0x63,0x5f,0x5f}, 0x060f092f},
{{0x73,0x64,0x5f,0x5f}, 0x00003742},
{{0x73,0x65,0x5f,0x5f}, 0x00001042},
{{0x73,0x69,0x5f,0x5f}, 0x00004042},
{{0x73,0x6b,0x5f,0x5f}, 0x12004543},
{{0x73,0x6d,0x5f,0x5f}, 0x00000842},
{{0x73,0x6e,0x5f,0x5f}, 0x00000542},
{{0x73,0x72,0x5f,0x5f}, 0x03001e44},
{{0x73,0x76,0x5f,0x5f}, 0x00000f42},
{{0x73,0x79,0x5f,0x5f}, 0x00003742},
{{0x74,0x63,0x5f,0x5f}, 0x0a2206cd},
{{0x74,0x66,0x5f,0x5f}, 0x00000642},
{{0x74,0x67,0x5f,0x5f}, 0x00000542},
{{0x74,0x68,0x5f,0x5f}, 0x9e0936c9},
{{0x74,0x6a,0x5f,0x5f}, 0x00007924},
{{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd},
{{0x74,0x6e,0x5f,0x5f}, 0x3700053e},
{{0x74,0x6f,0x5f,0x5f}, 0x064609c5},
{{0x74,0x70,0x5f,0x5f}, 0x06000944},
{{0x74,0x72,0x5f,0x5f}, 0x00002242},
{{0x74,0x72,0x61,0x76}, 0x064509c3},
{{0x74,0x74,0x5f,0x5f}, 0x0f00063e},
{{0x74,0x77,0x5f,0x5f}, 0x00004642},
{{0x74,0x7a,0x5f,0x5f}, 0x00003f42},
{{0x75,0x61,0x5f,0x5f}, 0x0000232d},
{{0x75,0x79,0x5f,0x5f}, 0x00000f42},
{{0x75,0x7a,0x5f,0x5f}, 0x0000492d},
{{0x76,0x61,0x5f,0x5f}, 0x060f0828},
{{0x76,0x63,0x5f,0x5f}, 0x0d000939},
{{0x76,0x65,0x5f,0x5f}, 0x00000f42},
{{0x76,0x67,0x5f,0x5f}, 0x09000f43},
{{0x76,0x69,0x5f,0x5f}, 0x00002942},
{{0x76,0x6e,0x5f,0x5f}, 0x00004342},
{{0x76,0x75,0x5f,0x5f}, 0x00000642},
{{0x77,0x73,0x5f,0x5f}, 0x4b0f0624},
{{0x79,0x65,0x5f,0x5f}, 0x00003742},
{{0x79,0x75,0x5f,0x5f}, 0x1e001d3d},
{{0x7a,0x61,0x5f,0x5f}, 0x00006642},
{{0x7a,0x6d,0x5f,0x5f}, 0x0b000435},
{{0x7a,0x77,0x5f,0x5f}, 0x3f00783e},
};
// Cut-off applied to every row of kClosestAltLanguage: a row whose leading
// percentage is below this value resolves to UNKNOWN_LANGUAGE.
static const int kMinCorrPercent = 24;
// Non-const Language variable holding UNKNOWN_LANGUAGE; used as the
// placeholder alternate in the "0 >= kMinCorrPercent" rows of that table.
static Language Unknown = UNKNOWN_LANGUAGE;
static const Language kClosestAltLanguage[] = {
(28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,
(36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,
(31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,
(15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,
(11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,
(17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,
(27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,
(16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,
( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,
(23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,
(33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,
(28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,
(17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,
( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,
( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,
(15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,
(28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,
( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,
(29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,
(28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,
(37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,
(29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,
(14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,
(46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,
( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,
(46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,
( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,
(15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,
(19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,
(27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,
(36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,
(19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,
( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,
( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,
( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,
(28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE,
(37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,
( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,
( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,
(15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,
(42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,
(24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,
(35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,
(15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,
(17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,
(29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,
(27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,
(37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,
( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,
(29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,
( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,
( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,
( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,
(27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,
(28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,
(12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,
( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,
(15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,
( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,
(10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,
(31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,
(17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,
( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
(14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,
(16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,
( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,
( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,
( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,
(11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,
(19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,
( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,
( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,
(17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,
(13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,
(11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,
( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,
( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,
( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,
( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,
( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,
(13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,
( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,
(45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,
( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,
( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,
( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,
(30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,
( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,
(17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,
(12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,
(30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,
(11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,
( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,
(32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,
(16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,
( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,
(29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,
( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,
(28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,
(15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,
( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,
( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,
};
COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
kClosestAltLanguage_has_incorrect_size);
// Bit-test helpers: each one reports whether the corresponding kCLDFlag* bit
// is set in |flags|.
inline bool FlagFinish(int flags) {
  return 0 != (flags & kCLDFlagFinish);
}
inline bool FlagSqueeze(int flags) {
  return 0 != (flags & kCLDFlagSqueeze);
}
inline bool FlagRepeats(int flags) {
  return 0 != (flags & kCLDFlagRepeats);
}
inline bool FlagTop40(int flags) {
  return 0 != (flags & kCLDFlagTop40);
}
inline bool FlagShort(int flags) {
  return 0 != (flags & kCLDFlagShort);
}
inline bool FlagHint(int flags) {
  return 0 != (flags & kCLDFlagHint);
}
inline bool FlagUseWords(int flags) {
  return 0 != (flags & kCLDFlagUseWords);
}
// File-scope mutable state; nothing in the part of the file reviewed here
// reads or writes it.
// NOTE(review): presumably remembers the language/reliability of the
// previously scored chunk for debug output -- confirm against the readers,
// and note that it is shared by every caller in the process.
static Language prior_lang = UNKNOWN_LANGUAGE;
static bool prior_unreliable = false;
// Looks backwards from |src| for a space, examining at most
// min(limit, kMaxSpaceScan) bytes. Returns how many bytes lie between |src|
// and the first space found behind it (0 when src[-1] is the space), or 0
// when no space is found within the scan window.
int BackscanToSpace(const char* src, int limit) {
  const int window = cld::minint(limit, kMaxSpaceScan);
  for (int back = 0; back < window; ++back) {
    if (src[-back - 1] == ' ') {
      return back;
    }
  }
  return 0;
}
// Looks forwards from |src| for a space, examining at most
// min(limit, kMaxSpaceScan) bytes. Returns the number of bytes to advance so
// that the position lands just past the first space, or 0 when no space is
// found within the scan window.
int ForwardscanToSpace(const char* src, int limit) {
  const int window = cld::minint(limit, kMaxSpaceScan);
  for (int ahead = 0; ahead < window; ++ahead) {
    if (src[ahead] == ' ') {
      return ahead + 1;
    }
  }
  return 0;
}
int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
int p_count = 0;
const uint8* src = reinterpret_cast<const uint8*>(isrc);
const uint8* srclimit = src + srclen;
int local_hash = *hash;
while (src < srclimit) {
int c = src[0];
int incr = 1;
if (c < 0xc0) {
} else if ((c & 0xe0) == 0xc0) {
c = (c << 8) | src[1];
incr = 2;
} else if ((c & 0xf0) == 0xe0) {
c = (c << 16) | (src[1] << 8) | src[2];
incr = 3;
} else {
c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
incr = 4;
}
src += incr;
int p = tbl[local_hash];
tbl[local_hash] = c;
p_count += (c == p);
local_hash = ((local_hash << 4) ^ c) & 0xfff;
}
*hash = local_hash;
return p_count;
}
// Counts the ASCII spaces in |src|, looking only at whole groups of four
// bytes: the last (src_len % 4) bytes are deliberately not examined.
int CountSpaces4(const char* src, int src_len) {
  const int whole_groups_len = src_len & ~3;
  int total = 0;
  for (int base = 0; base < whole_groups_len; base += 4) {
    const char* quad = src + base;
    total += (quad[0] == ' ') + (quad[1] == ' ') +
             (quad[2] == ' ') + (quad[3] == ' ');
  }
  return total;
}
// Rewrites the UTF-8 text in isrc[0..srclen) in place, dropping words that
// are mostly "predicted" by the hash table |tbl|, and returns the new length.
//
// The text is copied onto itself one character at a time (dst never gets
// ahead of src except for the optional marker below). For every character
// the table entry under the running hash is compared with the character and
// then overwritten by it; matching characters add their byte count to the
// current word's predicted total. When a space is reached and more than half
// of the word's bytes were predicted, dst is rewound to the start of that
// word, discarding it.
//
// |hash| is read on entry and written back on exit so a caller can continue
// the same hash sequence across calls.
//
// Continuation bytes are read without a bounds check, so the text must not
// end in the middle of a multi-byte character.
int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
const uint8* src = reinterpret_cast<const uint8*>(isrc);
const uint8* srclimit = src + srclen;
char* dst = isrc;
int local_hash = *hash;
// Output position where the word currently being copied started.
char* word_dst = dst;
int good_predict_bytes = 0;
int word_length_bytes = 0;
while (src < srclimit) {
int c = src[0];
int incr = 1;
// The first byte is always copied; a rewind below may undo it.
*dst++ = c;
if (c == ' ') {
// End of word: drop it if more than half of its bytes were predicted.
if ((good_predict_bytes * 2) > word_length_bytes) {
dst = word_dst;
if (FLAGS_cld_showme) {
// Leave a visible ". " where the word was removed, unless the output
// already has a '.' two bytes back.
if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
*dst++ = '.';
*dst++ = ' ';
}
}
}
word_dst = dst;
good_predict_bytes = 0;
word_length_bytes = 0;
}
// Copy any remaining bytes of the character and pack them all into c.
// The width is decided by the lead byte alone.
if (c < 0xc0) {
} else if ((c & 0xe0) == 0xc0) {
*dst++ = src[1];
c = (c << 8) | src[1];
incr = 2;
} else if ((c & 0xf0) == 0xe0) {
*dst++ = src[1];
*dst++ = src[2];
c = (c << 16) | (src[1] << 8) | src[2];
incr = 3;
} else {
*dst++ = src[1];
*dst++ = src[2];
*dst++ = src[3];
c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
incr = 4;
}
src += incr;
word_length_bytes += incr;
// Check the prediction for this character, then record it.
int p = tbl[local_hash];
tbl[local_hash] = c;
if (c == p) {
good_predict_bytes += incr;
}
local_hash = ((local_hash << 4) ^ c) & 0xfff;
}
*hash = local_hash;
// Terminate the shortened text: three spaces plus NUL when at least four
// bytes were freed, otherwise a single space if any byte was freed.
if ((dst - isrc) < (srclen - 3)) {
dst[0] = ' ';
dst[1] = ' ';
dst[2] = ' ';
dst[3] = '\0';
} else if ((dst - isrc) < srclen) {
dst[0] = ' ';
}
return static_cast<int>(dst - isrc);
}
// Squeezes repetitive or space-heavy stretches out of the UTF-8 text in
// isrc[0..srclen), rewriting it in place, and returns the new length.
//
// The text is processed in chunks of |ichunksize| bytes (kChunksizeDefault
// when |ichunksize| is zero or negative), each chunk extended so that it ends
// on a UTF-8 character boundary. A chunk is dropped when its space count
// reaches kSpacesThreshPercent of the chunk size or its predicted-byte count
// reaches kPredictThreshPercent; otherwise it is copied down to the output
// position. When dropping starts, the output is backed up to just after the
// previous space; when copying resumes, input is skipped to just past the
// next space, so that partial words are not glued together.
//
// With FLAGS_cld_showme set, each squeeze point is marked in the output with
// U+25A0 followed by a space.
//
// The result is terminated with three spaces and a NUL when at least four
// bytes were freed, or with one space when fewer were freed.
int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
                                            int srclen,
                                            int ichunksize) {
  char* src = isrc;
  char* dst = src;
  char* const srclimit = src + srclen;
  bool skipping = false;

  // Prediction state shared by all chunks of this call. The vector is
  // zero-filled on construction and released automatically on return.
  int hash = 0;
  std::vector<int> predict_tbl(kPredictionTableSize);

  // A non-positive chunk size would make the scan stall or run backwards,
  // so treat it like "use the default".
  const int chunksize = (ichunksize > 0) ? ichunksize : kChunksizeDefault;
  const int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
  const int predict_thresh = (chunksize * kPredictThreshPercent) / 100;

  while (src < srclimit) {
    const int remaining_bytes = static_cast<int>(srclimit - src);
    int len = cld::minint(chunksize, remaining_bytes);
    // Extend the chunk to a character boundary, but never look at or past
    // srclimit.
    while ((len < remaining_bytes) && ((src[len] & 0xc0) == 0x80)) {
      ++len;
    }

    const int space_n = CountSpaces4(src, len);
    const int predb_n = CountPredictedBytes(src, len, &hash, &predict_tbl[0]);
    if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
      // Drop this chunk.
      if (!skipping) {
        // First dropped chunk of a run: back the output up to just after the
        // previous space so no partial word is left behind.
        int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
        dst -= n;
        skipping = true;
        if (FLAGS_cld_showme) {
          // UTF-8 for U+25A0 (black square), then a space.
          *dst++ = 0xe2;
          *dst++ = 0x96;
          *dst++ = 0xa0;
          *dst++ = ' ';
        }
        if (dst == isrc) {
          // Keep a leading space so the output never starts mid-word.
          *dst++ = ' ';
        }
      }
    } else {
      // Keep this chunk.
      if (skipping) {
        // First kept chunk after a run of drops: start just past the next
        // space so no partial word is picked up.
        int n = ForwardscanToSpace(src, len);
        src += n;
        len -= n;
        skipping = false;
      }
      if (len > 0) {
        memmove(dst, src, len);
        dst += len;
      }
    }
    src += len;
  }

  if ((dst - isrc) < (srclen - 3)) {
    dst[0] = ' ';
    dst[1] = ' ';
    dst[2] = ' ';
    dst[3] = '\0';
  } else if ((dst - isrc) < srclen) {
    dst[0] = ' ';
  }
  return static_cast<int>(dst - isrc);
}
// Decides whether the first |testsize| bytes of |src| look repetitive enough
// to be worth squeezing.
//
// Returns false when the text is shorter than |testsize|. Otherwise returns
// true when the space count reaches kSpacesTriggerPercent of |testsize| or
// the predicted-byte count reaches kPredictTriggerPercent of |testsize|.
bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
  if (srclen < testsize) {return false;}
  const int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
  const int predict_thresh = (testsize * kPredictTriggerPercent) / 100;

  // The space test needs no scratch memory, so settle it before paying for
  // the prediction table.
  if (CountSpaces4(src, testsize) >= space_thresh) {
    return true;
  }

  // Zero-filled on construction, released automatically on return.
  int hash = 0;
  std::vector<int> predict_tbl(kPredictionTableSize);
  return CountPredictedBytes(src, testsize, &hash, &predict_tbl[0]) >=
         predict_thresh;
}
// Close-pair group id for each language key, indexed by the packed language
// key stored in the document tote (see RefineScoredClosePairs). 0 means the
// language belongs to no group; two keys with the same non-zero id are
// treated as a close pair and have their scores merged.
static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
0,
0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// Clears every slot of |doc_tote| whose language lies outside the base
// language set (unpacked value >= NUM_LANGUAGES): key, byte count and
// reliability are all reset to 0. With FLAGS_dbgscore set, each removed
// language code is logged to stderr.
void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
  for (int slot = 0; slot < doc_tote->MaxSize(); ++slot) {
    const Language lang = cld::UnpackLanguage(doc_tote->Key(slot));
    if (lang < NUM_LANGUAGES) {
      continue;
    }
    if (FLAGS_dbgscore) {
      fprintf(stderr, "{-%s} ", ExtLanguageCode(lang));
    }
    doc_tote->SetKey(slot, 0);
    doc_tote->SetValue(slot, 0);
    doc_tote->SetReliability(slot, 0);
  }
}
// Thresholds used by RemoveUnreliableLanguages.
// A language whose reliability percentage (reliability / bytes) is below
// this value is merged into its closest alternate or removed.
static const int kMinReliableKeepPercent = 41;
// A language that is not in the top-40 set and has fewer bytes than this is
// treated as unreliable, unless it accounts for all of the document's bytes.
static const int kGoodFirstT3MinBytes = 24;
// Prunes languages from |doc_tote| whose reliability percentage
// (Reliability / Value, i.e. reliability per byte) is below
// kMinReliableKeepPercent.
//
// Pass 1: an unreliable language whose closest alternate (per
// kClosestAltLanguage) is also present in the tote is merged with that
// alternate instead of being discarded.
// Pass 2: any language that is still unreliable is removed outright.
//
// With FLAGS_cld_html set, merges and removals involving at least 10 bytes
// are logged to stderr, followed by a closing "<br>".
void RemoveUnreliableLanguages(ToteWithReliability* doc_tote) {
// Pass 1: merge unreliable languages into their closest alternates.
// total_bytes accumulates the byte count of every occupied slot visited.
int total_bytes = 0;
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
int plang = doc_tote->Key(sub);
if (plang == 0) {continue;}
Language lang = cld::UnpackLanguage(plang);
int bytes = doc_tote->Value(sub);
int reli = doc_tote->Reliability(sub);
if (bytes == 0) {continue;}
total_bytes += bytes;
int reliable_percent = reli / bytes;
if (reliable_percent >= kMinReliableKeepPercent) {continue;}
// Unreliable: look for its closest alternate among the scored languages.
Language altlang = UNKNOWN_LANGUAGE;
if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}
if (altlang == UNKNOWN_LANGUAGE) {continue;}
int altsub = doc_tote->Find(cld::PackLanguage(altlang));
if (altsub < 0) {continue;}
int bytes2 = doc_tote->Value(altsub);
int reli2 = doc_tote->Reliability(altsub);
if (bytes2 == 0) {continue;}
int reliable_percent2 = reli2 / bytes2;
// Merge into whichever of the two is more reliable; on a tie the language
// with the smaller enum value wins.
int tosub = altsub;
int fromsub = sub;
bool into_lang = false;
if ((reliable_percent2 < reliable_percent) ||
((reliable_percent2 == reliable_percent) && (lang < altlang))) {
tosub = sub;
fromsub = altsub;
into_lang = true;
}
// The merged entry gets the better of the two percentages, raised to at
// least the keep threshold so that pass 2 does not remove it.
int newpercent = cld::maxint(reliable_percent, reliable_percent2);
newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
int newbytes = bytes + bytes2;
int newreli = newpercent * newbytes;
doc_tote->SetKey(fromsub, 0);
doc_tote->SetValue(fromsub, 0);
doc_tote->SetReliability(fromsub, 0);
doc_tote->SetValue(tosub, newbytes);
doc_tote->SetReliability(tosub, newreli);
if (FLAGS_cld_html && (newbytes >= 10)) {
if (into_lang) {
fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
ExtLanguageCode(altlang), reliable_percent2, bytes2,
ExtLanguageCode(lang));
} else {
fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
ExtLanguageCode(lang), reliable_percent, bytes,
ExtLanguageCode(altlang));
}
}
}
// Pass 2: remove whatever is still below the keep threshold.
for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
int plang = doc_tote->Key(sub);
if (plang == 0) {continue;}
Language lang = cld::UnpackLanguage(plang);
int bytes = doc_tote->Value(sub);
int reli = doc_tote->Reliability(sub);
if (bytes == 0) {continue;}
// A non-top-40 language with very few bytes is forced to zero reliability,
// unless it accounts for all the bytes counted in pass 1.
bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
if (is_tier3 &&
(bytes < kGoodFirstT3MinBytes) &&
(bytes < total_bytes)) {
reli = 0;
}
int reliable_percent = reli / bytes;
if (reliable_percent >= kMinReliableKeepPercent) {continue;}
doc_tote->SetKey(sub, 0);
doc_tote->SetValue(sub, 0);
doc_tote->SetReliability(sub, 0);
if (FLAGS_cld_html && (bytes >= 10)) {
fprintf(stderr, "{Unreli %s.%d(%dB)} ",
ExtLanguageCode(lang), reliable_percent, bytes);
}
}
if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
}
// Merges "close pair" languages in |doc_tote|. For each slot whose key has a
// non-zero kClosePair group id, the first later slot with the same group id
// is located; the entry with fewer bytes is folded into the other one (on a
// tie the later slot is folded into the earlier one). The surviving entry
// gets the summed byte count and a reliability of 100 per byte; the other
// slot is cleared.
//
// With FLAGS_cld_html or FLAGS_dbgscore set, each merge is logged to stderr.
void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
  for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
    const int key1 = doc_tote->Key(sub);
    const int group = kClosePair[key1];
    if (group == 0) {
      continue;
    }

    for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
      if (kClosePair[doc_tote->Key(sub2)] != group) {
        continue;
      }
      const int key2 = doc_tote->Key(sub2);

      // The smaller entry is the one that gets folded away.
      const bool first_is_smaller =
          doc_tote->Value(sub) < doc_tote->Value(sub2);
      const int from_sub = first_is_smaller ? sub : sub2;
      const int to_sub = first_is_smaller ? sub2 : sub;
      const Language from_lang =
          cld::UnpackLanguage(first_is_smaller ? key1 : key2);
      const Language to_lang =
          cld::UnpackLanguage(first_is_smaller ? key2 : key1);

      if (FLAGS_cld_html || FLAGS_dbgscore) {
        const int from_bytes = doc_tote->Value(from_sub);
        const int from_reli = doc_tote->Reliability(from_sub);
        // Guard the division for an empty entry.
        const int divisor = (from_bytes != 0) ? from_bytes : 1;
        fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
                ExtLanguageCode(from_lang),
                from_reli / divisor,
                from_bytes,
                ExtLanguageCode(to_lang));
      }

      const int merged_bytes =
          doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
      doc_tote->SetValue(to_sub, merged_bytes);
      doc_tote->SetReliability(to_sub, 100 * merged_bytes);
      doc_tote->SetKey(from_sub, 0);
      doc_tote->SetValue(from_sub, 0);
      doc_tote->SetReliability(from_sub, 0);
      break;  // Only the first partner found is merged.
    }
  }
}
// Adds the per-language hint boost to every slot of chunk_tote. The boost is
// scaled by tote_grams/8, with tote_grams capped at 8, so the full boost is
// applied once a chunk holds at least eight grams.
//   chunk_tote       tote whose values are boosted in place
//   tote_grams       number of grams scored into the chunk so far
//   lang_hint_boost  boost amounts, indexed by the key stored in each slot
void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
                        uint8* lang_hint_boost) {
  const int scale_grams = (tote_grams > 8) ? 8 : tote_grams;
  const int slot_count = chunk_tote->MaxSize();
  for (int slot = 0; slot < slot_count; ++slot) {
    const int packed_lang = chunk_tote->Key(slot);
    const int boost = lang_hint_boost[packed_lang];
    chunk_tote->SetValue(slot,
                         chunk_tote->Value(slot) + ((boost * scale_grams) >> 3));
    if (FLAGS_dbgscore && (boost > 0)) {
      fprintf(stderr, "[%s+=%d*%d/8] ",
              ExtLanguageCode(cld::UnpackLanguage(packed_lang)),
              boost, scale_grams);
    }
  }
}
// Writes the first |len| bytes of |txt| to |f| with the five HTML-significant
// characters replaced by character references, followed by "<br>\n".
//   <  becomes &lt;     >  becomes &gt;     &  becomes &amp;
//   '  becomes &apos;   "  becomes &quot;
// All other bytes are copied through unchanged. |txt| need not be
// NUL-terminated; exactly |len| bytes are read.
void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
  for (int i = 0; i < len; ++i) {
    const char c = txt[i];
    switch (c) {
      case '<':
        fputs("&lt;", f);
        break;
      case '>':
        fputs("&gt;", f);
        break;
      case '&':
        fputs("&amp;", f);
        break;
      case '\'':
        fputs("&apos;", f);
        break;
      case '"':
        fputs("&quot;", f);
        break;
      default:
        fputc(c, f);
        break;
    }
  }
  // The trailing line break is markup, so it is intentionally not escaped.
  fputs("<br>\n", f);
}
// Folds one scored chunk into the document tote and resets the chunk tote.
//   src, srclen      text of the chunk; only used for debug output
//   advance_by       current scoring stride, used to scale the gram estimate
//   lscript          script of the chunk
//   chunk_tote       per-chunk scores; sorted, consumed and reinitialized here
//   doc_tote         per-document accumulator that receives the chunk
//   tote_grams       grams scored into the chunk; 0 forces reliability 100,
//                    1 forces reliability 0
//   lang_hint_boost  optional per-language boosts (may be NULL)
// An unreliable chunk with a second candidate is split roughly 5/8 : 3/8
// between the top two languages; otherwise all bytes go to the top language.
void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
                       UnicodeLScript lscript,
                       Tote* chunk_tote,
                       ToteWithReliability* doc_tote,
                       int tote_grams,
                       uint8* lang_hint_boost) {
  if (lang_hint_boost != NULL) {
    ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
  }

  // Bring the two best-scoring languages to slots 0 and 1
  chunk_tote->Sort(2);
  Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
  if (cur_lang < 0) {
    // Nothing usable was scored; discard the chunk
    chunk_tote->Reinit();
    return;
  }

  const int chunk_bytes = chunk_tote->GetByteCount();
  int reliability = cld::GetReliability((chunk_bytes * 2) / advance_by,
                                        lscript,
                                        chunk_tote);
  bool cur_unreliable = (reliability < cld::kMinReliable);
  switch (tote_grams) {
    case 0:
      // No grams at all: treat as fully reliable
      reliability = 100;
      cur_unreliable = false;
      break;
    case 1:
      // A single gram: treat as fully unreliable
      reliability = 0;
      cur_unreliable = true;
      break;
    default:
      break;
  }

#if 0
  if (FLAGS_cld_html) {
    if (reliability >= kMinReliableKeepPercent) {
      fprintf(stderr, "R%d%% ", reliability);
    } else {
      fprintf(stderr, "--R%d%% ", reliability);
    }
  }
#endif

  const bool split_top_two = cur_unreliable && (chunk_tote->Key(1) != 0);
  if (!split_top_two) {
    // Credit the whole chunk to the top language
    doc_tote->Add(chunk_tote->Key(0),
                  chunk_bytes, chunk_tote->Value(0), reliability);
    if (FLAGS_dbgscore) {
      fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
              ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
              chunk_tote->Value(0),
              reliability,
              chunk_bytes);
    }
  } else {
    // Divide the bytes: 5/8 (rounded) to the top language, rest to the second
    const int top_bytes = ((chunk_bytes * 5) + 4) >> 3;
    const int second_bytes = chunk_bytes - top_bytes;
    doc_tote->Add(chunk_tote->Key(0),
                  top_bytes, chunk_tote->Value(0), reliability);
    doc_tote->Add(chunk_tote->Key(1),
                  second_bytes, chunk_tote->Value(1), reliability);
    if (FLAGS_dbgscore) {
      fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
              ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
              chunk_tote->Value(0),
              reliability,
              top_bytes,
              ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
              chunk_tote->Value(1),
              reliability,
              second_bytes);
    }
  }

  if (FLAGS_cld_html) {
    // Cannot trigger after the early return above; retained as a safeguard.
    if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;}
    cld::PrintLang(stderr, chunk_tote,
                   cur_lang, cur_unreliable,
                   prior_lang, prior_unreliable);
    prior_lang = cur_lang;
    prior_unreliable = cur_unreliable;

    string shown_text(src, srclen);
    if (shown_text[0] == '=') {
      // Marker text (begins with '='): show the buffered script name instead
      shown_text = "=Buffered_";
      shown_text.append(UnicodeLScriptCode(lscript));
      shown_text.append("=");
    }
    cld::PrintText(stderr, cur_lang, shown_text);
  }

  chunk_tote->Reinit();
}
// Debug helper: prints the name of top_lang in brackets on stderr and records
// it in prior_lang. When top_lang repeats the previously printed language
// (and is not UNKNOWN_LANGUAGE), prints an empty "[] " instead.
void PrintTopLang(Language top_lang) {
  const bool same_as_before =
      (top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE);
  if (!same_as_before) {
    fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
    prior_lang = top_lang;
    return;
  }
  fprintf(stderr, "[] ");
}
// Debug helper: same bracketed output as PrintTopLang, but wrapped in a grey
// (#A0A0A0) HTML span to mark the language as a not-yet-committed guess.
// Updates prior_lang when a new language name is printed.
void PrintTopLangSpeculative(Language top_lang) {
  fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
  const bool same_as_before =
      (top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE);
  if (!same_as_before) {
    fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
    prior_lang = top_lang;
  } else {
    fprintf(stderr, "[] ");
  }
  fprintf(stderr, "</span>\n");
}
// Convenience overload of ScoreChunkIntoDoc for NUL-terminated text: the
// length is taken with strlen() and everything else is forwarded unchanged.
void ScoreChunkIntoDoc2(const char* src, int advance_by,
                        UnicodeLScript lscript,
                        Tote* chunk_tote,
                        ToteWithReliability* doc_tote,
                        int tote_grams,
                        uint8* lang_hint_boost) {
  ScoreChunkIntoDoc(src, static_cast<int>(strlen(src)), advance_by, lscript,
                    chunk_tote, doc_tote, tote_grams, lang_hint_boost);
}
// Scores a span without looking at its text: every byte of the span is
// credited to |lang| (a packed language) in a temporary tote, which is then
// pushed into doc_tote with tote_grams == 0. |flags| and |plus_one| are
// accepted for signature parity with the other scorers and are not used.
void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
                   ToteWithReliability* doc_tote,
                   uint8* lang_hint_boost,
                   int flags, Language plus_one) {
  // Restart the debug-output history for this span
  prior_lang = UNKNOWN_LANGUAGE;
  prior_unreliable = false;

  const int span_bytes = scriptspan->text_bytes;

  Tote span_tote;
  span_tote.AddGram();
  span_tote.Add(lang, span_bytes);
  span_tote.AddBytes(span_bytes);

  const int kAdvanceBy = 2;
  const int kNoGrams = 0;
  ScoreChunkIntoDoc(scriptspan->text, span_bytes, kAdvanceBy,
                    scriptspan->script, &span_tote,
                    doc_tote, kNoGrams, lang_hint_boost);
}
static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
getone::LangSpan* scriptspan,
int* tote_grams, int gram_limit,
Tote* chunk_tote,
ToteWithReliability* doc_tote,
uint8* lang_hint_boost,
int advance_by, int flags,
int* initial_word_span, Language plus_one) {
const char* src = scriptspan->text;
const char* srclimit = src + scriptspan->text_bytes;
prior_lang = UNKNOWN_LANGUAGE;
prior_unreliable = false;
while (src < srclimit) {
int len = cld::DoUniScoreV3(unigram_obj,
src, srclimit - src, advance_by,
tote_grams, gram_limit, chunk_tote);
if (FlagUseWords(flags) || (*initial_word_span > 0)) {
cld::DoBigramScoreV3(&kCjkBiTable_obj,
src, len, chunk_tote);
}
chunk_tote->AddBytes(len);
*initial_word_span -= len;
if (*tote_grams >= gram_limit) {
if (FlagTop40(flags)) {
cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
}
ScoreChunkIntoDoc(src, len, advance_by,
scriptspan->script, chunk_tote,
doc_tote, *tote_grams, lang_hint_boost);
*tote_grams = 0;
} else {
if (FLAGS_cld_html) {
string temp(src, len);
Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
PrintTopLangSpeculative(top_lang);
cld::PrintText(stderr, top_lang, temp);
}
}
src += len;
}
}
// Returns a pointer to the start of the UTF-8 character that ends just before
// |p|: steps back one byte, then over at most three continuation bytes
// (10xxxxxx). The caller must guarantee those bytes are readable.
const uint8* BackOneUTF8(const uint8* p) {
  const uint8* cursor = p - 1;
  for (int skipped = 0; skipped < 3; ++skipped) {
    if ((*cursor & 0xc0) != 0x80) {break;}    // Reached a lead/ASCII byte
    --cursor;
  }
  return cursor;
}
// Scores a script span one chunk at a time using quadgrams (plus long-word
// octagrams while the initial word span lasts or when the UseWords flag is
// set).
//   quadgram_obj       quadgram lookup table
//   scriptspan         text to score
//   tote_grams         in/out running gram count for chunk_tote; reset to 0
//                      each time a chunk is flushed
//   gram_limit         gram count at which a chunk is flushed into doc_tote
//   chunk_tote         accumulator for the current chunk
//   doc_tote           document accumulator
//   lang_hint_boost    optional per-language boosts
//   advance_by         scoring stride
//   flags              kCLDFlag* bits
//   initial_word_span  in/out count of bytes still scored with extra words
//   plus_one           language exempt from Top40 demotion
static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
                           getone::LangSpan* scriptspan,
                           int* tote_grams, int gram_limit,
                           Tote* chunk_tote,
                           ToteWithReliability* doc_tote,
                           uint8* lang_hint_boost,
                           int advance_by, int flags,
                           int* initial_word_span, Language plus_one) {
  const char* src = scriptspan->text;
  const char* srclimit = src + scriptspan->text_bytes;

  // Restart the debug-output history for this span
  prior_lang = UNKNOWN_LANGUAGE;
  prior_unreliable = false;

  while (src < srclimit) {
    // Score as many quadgrams as fit before gram_limit is reached; len is the
    // number of bytes consumed.
    int len = cld::DoQuadScoreV3(quadgram_obj,
                                 src, srclimit - src, advance_by,
                                 tote_grams, gram_limit, chunk_tote);
    if (FlagUseWords(flags) || (*initial_word_span > 0)) {
      cld::DoOctaScoreV3(&kLongWord8Table_obj,
                         src, len, chunk_tote);
    }
    chunk_tote->AddBytes(len);
    *initial_word_span -= len;

    if (*tote_grams >= gram_limit) {
      // Chunk is full: optionally demote non-Top40 languages, then flush
      if (FlagTop40(flags)) {
        cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
      }
      ScoreChunkIntoDoc(src, len, advance_by,
                        scriptspan->script, chunk_tote,
                        doc_tote, *tote_grams, lang_hint_boost);
      *tote_grams = 0;
    } else {
      // Chunk not yet full: only show the current best guess when debugging
      if (FLAGS_cld_html) {
        string temp(src, len);
        Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
        PrintTopLangSpeculative(top_lang);
        cld::PrintText(stderr, top_lang, temp);
      }
    }
    src += len;
  }
}
// Debug helper: prints up to three detected languages with their percentages,
// followed by the text byte count, as one HTML line on |f|. Slots holding
// UNKNOWN_LANGUAGE are omitted. The first language gets a trailing "*" when
// *is_reliable is false.
void PrintLangs(FILE* f, const Language* language3, const int* percent3,
                const int* text_bytes, const bool* is_reliable) {
  fprintf(f, "<br> Initial_Languages ");
  for (int slot = 0; slot < 3; ++slot) {
    if (language3[slot] == UNKNOWN_LANGUAGE) {continue;}
    if (slot == 0) {
      fprintf(f, "%s%s(%d%%) ",
              ExtLanguageName(language3[slot]),
              *is_reliable ? "" : "*",
              percent3[slot]);
    } else {
      fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[slot]),
              percent3[slot]);
    }
  }
  fprintf(f, "%d bytes \n", *text_bytes);
  fprintf(f, "<br>\n");
}
// Seeds a fresh per-script tote with a single point and a single byte for the
// default language of |lscript| (from cld::kDefaultLanguagePerLScript).
void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
  const Language script_default = cld::kDefaultLanguagePerLScript[lscript];
  const int packed_default = cld::PackLanguage(script_default);
  script_tote->Add(packed_default, 1);
  script_tote->AddBytes(1);
#if 0
  if (FLAGS_cld_html) {
    cld::PrintLang(stderr, script_tote,
                   script_default, false,
                   UNKNOWN_LANGUAGE, false);
    prior_lang = cur_lang;
    string temp("+1");
    cld::PrintText(stderr, script_default, temp);
  }
#endif
}
// Placeholder "text" strings, one per tote slot, handed to ScoreChunkIntoDoc2
// in place of real span text. Both sets start with '=', which
// ScoreChunkIntoDoc recognizes and replaces with "=Buffered_<script>=" in its
// debug output.
// kToteName is used for the final flush of each tote at end of document.
static const char* const kToteName[4] =
{"=Latn=", "=Hani=", "=Script2=", "=Script3="};
// kToteSwitch is used when a tote slot is flushed because it is being
// reassigned to a different script.
static const char* const kToteSwitch[4] =
{"=Latn=", "=Hani=", "=Switch2=", "=Switch3="};
// Maps each byte to its normalized form: ASCII digits map to themselves,
// ASCII letters map to lowercase, and every other byte maps to '-' (0x2d).
static const char kCharsetToLowerTbl[256] = {
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
};

// 1 for the ASCII letters A-Z and a-z, 0 for every other byte value.
static const char kIsAlpha[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// 1 for the ASCII digits 0-9, 0 for every other byte value.
static const char kIsDigit[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// Builds the 4-byte lookup key for a hint string.
//   str   NUL-terminated input (e.g. a TLD)
//   norm  output buffer of at least 4 bytes; NOT NUL-terminated by this call
// The first four ASCII letters/digits of |str| are copied in lowercase; all
// other characters are skipped. Keys shorter than four characters are padded
// on the right with '_'.
void MakeChar4(const char* str, char* norm) {
  memcpy(norm, "____", 4);
  int filled = 0;
  // Single pass that stops at the terminator or as soon as the key is full;
  // characters after the fourth kept one can no longer affect the result.
  for (const char* p = str; (*p != '\0') && (filled < 4); ++p) {
    const unsigned char uc = static_cast<unsigned char>(*p);
    if (kIsAlpha[uc] || kIsDigit[uc]) {
      norm[filled] = kCharsetToLowerTbl[uc];
      ++filled;
    }
  }
}
static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
const char* norm_key) {
int lo = 0;
int hi = hintprobssize;
while (lo < hi) {
int mid = (lo + hi) >> 1;
int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
if (comp < 0) {
lo = mid + 1;
} else if (comp > 0) {
hi = mid;
} else {
return mid;
}
}
return -1;
}
// Looks up tld_hint in kTLDHintProbs and, when found, adds the table's boost
// to up to three languages in lang_hint_boost.
// Each table word packs four bytes: the low byte selects a probability row
// and the next three bytes are the packed languages ranked 1..3 (0 = none).
void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
  if (FLAGS_dbgscore) {
    fprintf(stderr, "TLD hint %s\n", tld_hint);
  }

  char tld_key[8];
  MakeChar4(tld_hint, tld_key);
  const int entry = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
                                      tld_key);
  if (entry < 0) {return;}      // Unknown TLD: no boost

  const uint32 packed = kTLDHintProbs[entry].probs;
  const uint8 prob_index = (packed >> 0) & 0xff;
  const uint8* prob_row = cld::LgProb2TblEntry(prob_index);
  for (int rank = 0; rank < 3; ++rank) {
    const uint8 packed_lang = (packed >> (8 * (rank + 1))) & 0xff;
    if (packed_lang > 0) {
      lang_hint_boost[packed_lang] += cld::LgProb3(prob_row, rank);
    }
  }
}
// Uses kEncodingHintProbs[encoding_hint] to add a boost to up to three
// languages in lang_hint_boost. Hints outside [ISO_8859_1, NUM_ENCODINGS)
// are ignored.
// Each table word packs four bytes: the low byte selects a probability row
// and the next three bytes are the packed languages ranked 1..3 (0 = none).
void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
  if (FLAGS_dbgscore) {
    Encoding tempenc = static_cast<Encoding>(encoding_hint);
    fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
  }

  const bool in_range =
      (encoding_hint >= ISO_8859_1) && (encoding_hint < NUM_ENCODINGS);
  if (!in_range) {return;}

  const uint32 packed = kEncodingHintProbs[encoding_hint];
  const uint8 prob_index = (packed >> 0) & 0xff;
  const uint8* prob_row = cld::LgProb2TblEntry(prob_index);
  for (int rank = 0; rank < 3; ++rank) {
    const uint8 packed_lang = (packed >> (8 * (rank + 1))) & 0xff;
    if (packed_lang > 0) {
      lang_hint_boost[packed_lang] += cld::LgProb3(prob_row, rank);
    }
  }
}
// Uses kLanguageHintProbs[language_hint] to add a boost to up to three
// languages in lang_hint_boost. Hints outside [ENGLISH, NUM_LANGUAGES) are
// ignored.
// Each table word packs four bytes: the low byte selects a probability row
// and the next three bytes are the packed languages ranked 1..3 (0 = none).
void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
  if (FLAGS_dbgscore) {
    fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
  }

  const bool in_range =
      !(language_hint < ENGLISH) && !(language_hint >= NUM_LANGUAGES);
  if (!in_range) {return;}

  const uint32 packed = kLanguageHintProbs[language_hint];
  const uint8 prob_index = (packed >> 0) & 0xff;
  const uint8* prob_row = cld::LgProb2TblEntry(prob_index);
  for (int rank = 0; rank < 3; ++rank) {
    const uint8 packed_lang = (packed >> (8 * (rank + 1))) & 0xff;
    if (packed_lang > 0) {
      lang_hint_boost[packed_lang] += cld::LgProb3(prob_row, rank);
    }
  }
}
// Reads the top three slots of an already-sorted doc_tote and fills the
// caller's result arrays.
//   doc_tote            document accumulator; slots 0..2 are read
//   total_text_bytes    bytes scanned; raised locally if the three slots
//                       together hold more bytes than this
//   reliable_percent3   out: reliability / bytes for each slot
//   language3           out: language per slot, UNKNOWN_LANGUAGE when empty
//   percent3            out: share of total bytes per slot, in percent
//   normalized_score3   out: normalized score per slot
//   text_bytes          out: (possibly raised) total byte count
//   is_reliable         out: true when slot 0 is empty, otherwise whether
//                       slot 0 reaches cld::kMinReliable
void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
                    int* reliable_percent3, Language* language3, int* percent3,
                    double* normalized_score3,
                    int* text_bytes, bool* is_reliable) {
  // Defaults for "nothing detected"
  for (int i = 0; i < 3; ++i) {reliable_percent3[i] = 0;}
  for (int i = 0; i < 3; ++i) {language3[i] = UNKNOWN_LANGUAGE;}
  percent3[0] = 100;
  percent3[1] = 0;
  percent3[2] = 0;
  for (int i = 0; i < 3; ++i) {normalized_score3[i] = 0.0;}
  *text_bytes = total_text_bytes;
  *is_reliable = false;

  // An empty first slot is treated as owning all the text.
  int slot_bytes[3] = {total_text_bytes, 0, 0};
  int slot_key[3] = {0, 0, 0};
  for (int slot = 0; slot < 3; ++slot) {
    slot_key[slot] = doc_tote->Key(slot);
    if (slot_key[slot] == 0) {continue;}

    language3[slot] = cld::UnpackLanguage(slot_key[slot]);
    slot_bytes[slot] = doc_tote->Value(slot);
    const int reli = doc_tote->Reliability(slot);
    reliable_percent3[slot] = reli / (slot_bytes[slot] ? slot_bytes[slot] : 1);
    normalized_score3[slot] = cld::GetNormalizedScore(language3[slot],
                                                      ULScript_Common,
                                                      slot_bytes[slot],
                                                      doc_tote->Score(slot));
  }

  // Running byte totals over the first one, two and three slots
  const int bytes_thru1 = slot_bytes[0];
  const int bytes_thru2 = bytes_thru1 + slot_bytes[1];
  const int bytes_thru3 = bytes_thru2 + slot_bytes[2];
  if (total_text_bytes < bytes_thru3) {
    total_text_bytes = bytes_thru3;
    *text_bytes = total_text_bytes;
  }

  // Compute cumulative percentages first and then difference them, so the
  // three shares never add up to more than the cumulative total.
  const int divisor = cld::maxint(1, total_text_bytes);
  const int pct_thru1 = (bytes_thru1 * 100) / divisor;
  const int pct_thru2 = (bytes_thru2 * 100) / divisor;
  const int pct_thru3 = (bytes_thru3 * 100) / divisor;
  percent3[0] = pct_thru1;
  percent3[1] = pct_thru2 - pct_thru1;
  percent3[2] = pct_thru3 - pct_thru2;

  // Rounding may leave a later slot one point ahead of an earlier one;
  // move a single point forward to reduce that.
  if (percent3[1] < percent3[2]) {
    ++percent3[1];
    --percent3[2];
  }
  if (percent3[0] < percent3[1]) {
    ++percent3[0];
    --percent3[1];
  }

  *text_bytes = total_text_bytes;

  if (slot_key[0] == 0) {
    *is_reliable = true;
  } else {
    const int top_bytes = doc_tote->Value(0);
    const int top_reli = doc_tote->Reliability(0);
    const int top_percent = top_reli / (top_bytes ? top_bytes : 1);
    *is_reliable = top_percent >= cld::kMinReliable;
  }
}
// True for French, Italian, German or Spanish.
bool IsFIGS(Language lang) {
  return (lang == FRENCH) ||
         (lang == ITALIAN) ||
         (lang == GERMAN) ||
         (lang == SPANISH);
}
// True for English, French, Italian, German or Spanish.
bool IsEFIGS(Language lang) {
  return (lang == ENGLISH) ||
         (lang == FRENCH) ||
         (lang == ITALIAN) ||
         (lang == GERMAN) ||
         (lang == SPANISH);
}
// Thresholds used by CalcSummaryLang.
// Minimum percent for a non-English second language to be chosen over an
// English first language.
static const int kNonEnBoilerplateMinPercent = 17;
// Minimum percent for a non-EFIGS second language to be chosen over a FIGS
// first language.
static const int kNonFIGSBoilerplateMinPercent = 20;
// Below this adjusted percent the summary language becomes UNKNOWN_LANGUAGE.
static const int kGoodFirstMinPercent = 26;
// Below this adjusted percent the summary is marked unreliable.
static const int kGoodFirstReliableMinPercent = 51;
// If more than this percent of the text was ignored, the summary is marked
// unreliable.
static const int kIgnoreMaxPercent = 95;
// A chosen language with less than this percent is marked unreliable.
static const int kKeepMinPercent = 2;
// Minimum estimated bytes for a second language to be chosen: the first value
// when cld::kIsPackedTop40 is non-zero for it, the second value otherwise.
static const int kGoodSecondT1T2MinBytes = 15;
static const int kGoodSecondT3MinBytes = 128;
// Chooses a single summary language from the top-three results.
//   doc_tote, reliable_percent3   accepted but not consulted here
//   total_text_bytes              total bytes, used to estimate the byte
//                                 count of the second language
//   language3, percent3           top-three languages and their percentages
//   summary_lang                  out: chosen language
//   is_reliable                   out: whether the choice is trustworthy
// TG_UNKNOWN_LANGUAGE slots are skipped. A sufficiently large second language
// is preferred over an English (or FIGS) first language, and an English (or
// FIGS) second language is discounted when the first is something else. The
// winner's percentage is rescaled against the text that was not ignored
// before the final thresholds are applied.
void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
                     const int* reliable_percent3,
                     const Language* language3,
                     const int* percent3,
                     Language* summary_lang, bool* is_reliable) {
  int slot_count = 3;
  int active_slot[3] = {0, 1, 2};

  int ignore_percent = 0;
  int return_percent = percent3[0];
  *summary_lang = language3[0];
  *is_reliable = true;
  if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}

  // Drop TG_UNKNOWN_LANGUAGE entries and close up the active slots
  for (int i = 0; i < 3; ++i) {
    if (language3[i] != TG_UNKNOWN_LANGUAGE) {continue;}
    ignore_percent += percent3[i];
    for (int j = i + 1; j < 3; ++j) {
      active_slot[j - 1] = active_slot[j];
    }
    --slot_count;
    return_percent = (percent3[0] * 100) / (101 - ignore_percent);
    *summary_lang = language3[active_slot[0]];
    if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
  }

  // The two leading candidates after skipping
  const int first = active_slot[0];
  const int second = active_slot[1];
  const Language first_lang = language3[first];
  const Language second_lang = language3[second];
  const int first_percent = percent3[first];
  const int second_percent = percent3[second];

  // Languages outside the Top40 table need more bytes to be believed
  const int second_bytes = (total_text_bytes * second_percent) / 100;
  const int plang_second = cld::PackLanguage(second_lang);
  const bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
  const int minbytesneeded =
      is_tier3 ? kGoodSecondT3MinBytes : kGoodSecondT1T2MinBytes;

  const bool second_is_usable =
      (second_lang != UNKNOWN_LANGUAGE) && (second_bytes >= minbytesneeded);
  const bool english_then_other =
      (first_lang == ENGLISH) &&
      (second_lang != ENGLISH) &&
      second_is_usable &&
      (second_percent >= kNonEnBoilerplateMinPercent);
  const bool figs_then_other =
      IsFIGS(first_lang) &&
      !IsEFIGS(second_lang) &&
      second_is_usable &&
      (second_percent >= kNonFIGSBoilerplateMinPercent);
  const bool other_then_english =
      (second_lang == ENGLISH) && (first_lang != ENGLISH);
  const bool other_then_figs =
      IsFIGS(second_lang) && !IsEFIGS(first_lang);

  if (english_then_other || figs_then_other) {
    // Ignore the first language and report the second
    ignore_percent += first_percent;
    return_percent = (second_percent * 100) / (101 - ignore_percent);
    *summary_lang = second_lang;
    if (second_percent < kKeepMinPercent) {*is_reliable = false;}
  } else if (other_then_english || other_then_figs) {
    // Ignore the second language and rescale the first
    ignore_percent += second_percent;
    return_percent = (first_percent * 100) / (101 - ignore_percent);
  }

  // Final sanity thresholds
  if (return_percent < kGoodFirstMinPercent) {
    *summary_lang = UNKNOWN_LANGUAGE;
    *is_reliable = false;
  }
  if (return_percent < kGoodFirstReliableMinPercent) {
    *is_reliable = false;
  }
  if (ignore_percent > kIgnoreMaxPercent) {
    *is_reliable = false;
  }
  if (slot_count == 0) {
    *summary_lang = UNKNOWN_LANGUAGE;
    *is_reliable = false;
  }
}
// Detects the language(s) of |buffer| and returns a single summary language.
//   tables               scoring tables; NULL selects the built-in defaults
//   buffer, buffer_length  text to analyze
//   is_plain_text        passed through to the ScriptScanner
//   tld_hint             optional TLD hint (NULL or "" for none)
//   encoding_hint        optional encoding hint (UNKNOWN_ENCODING for none)
//   language_hint        optional language hint (UNKNOWN_LANGUAGE for none)
//   allow_extended_lang  when false, extended languages are removed
//   flags                kCLDFlag* bits controlling squeeze/repeat/top40/...
//   plus_one             language exempt from Top40 demotion
//   language3, percent3, normalized_score3
//                        out: top three languages with percent and score
//   text_bytes           out: number of text bytes scored
//   is_reliable          out: whether the result is considered reliable
//
// The text is scored span by span into per-script totes which are flushed
// into one document tote. If the first pass does not produce a good answer
// the function calls itself again with additional flags, always including
// kCLDFlagFinish, which forces the second pass to be accepted. A pass may
// also restart with kCLDFlagSqueeze when a large span looks squeezable.
Language CompactLangDetImpl::DetectLanguageSummaryV25(
    const CompactLangDet::DetectionTables* tables,
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,
    int encoding_hint,
    Language language_hint,
    bool allow_extended_lang,
    int flags,
    Language plus_one,
    Language* language3,
    int* percent3,
    double* normalized_score3,
    int* text_bytes,
    bool* is_reliable) {
  if (!tables) {
    static const CompactLangDet::DetectionTables default_cld_tables = {
      &kQuadTable_obj,
      &compact_lang_det_generated_ctjkvz_b1_obj
    };
    tables = &default_cld_tables;
  }

  // Default outputs: nothing detected
  language3[0] = UNKNOWN_LANGUAGE;
  language3[1] = UNKNOWN_LANGUAGE;
  language3[2] = UNKNOWN_LANGUAGE;
  percent3[0] = 100;
  percent3[1] = 0;
  percent3[2] = 0;
  normalized_score3[0] = 0.0;
  normalized_score3[1] = 0.0;
  normalized_score3[2] = 0.0;
  *text_bytes = 0;
  *is_reliable = false;

  ToteWithReliability doc_tote;

  // Accumulate the boosts from whichever hints were supplied
  uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
  memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
  if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
    ApplyTLDHint(lang_hint_boost, tld_hint);
  }
  if (encoding_hint != UNKNOWN_ENCODING) {
    ApplyEncodingHint(lang_hint_boost, encoding_hint);
  }
  if (language_hint != UNKNOWN_LANGUAGE) {
    ApplyLanguageHint(lang_hint_boost, language_hint);
  }

  // Four per-script totes: [0] Latin, [1] Han/CJK, [2] and [3] are shared
  // alternately by all other scripts (next_other_tote toggles 2 <-> 3).
  int next_other_tote = 2;
  Tote totes[4];
  bool tote_seen[4] = {false, false, false, false};
  int tote_grams[4] = {0, 0, 0, 0};
  UnicodeLScript tote_script[4] =
      {ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};

  ScriptScanner ss(buffer, buffer_length, is_plain_text);
  getone::LangSpan scriptspan;
  scriptspan.text = NULL;
  scriptspan.text_bytes = 0;
  scriptspan.offset = 0;
  scriptspan.script = ULScript_Common;
  scriptspan.lang = UNKNOWN_LANGUAGE;

  int total_text_bytes = 0;
  // FLAGS_cld_textlimit is multiplied by 1024; zero means no limit
  int textlimit = FLAGS_cld_textlimit << 10;
  if (textlimit == 0) {textlimit = 0x7fffffff;}

  // The scoring stride starts at 2 and doubles each time total_text_bytes
  // passes advance_limit (which starts at textlimit / 8 and doubles too).
  int advance_by = 2;
  int advance_limit = textlimit >> 3;

  int initial_word_span = kDefaultWordSpan;
  if (FLAGS_cld_forcewords) {
    initial_word_span = kReallyBigWordSpan;
  }

  // Chunk sizes, clamped to [kMinChunkSizeQuads, kMaxChunkSizeQuads]
  int chunksizequads = FLAGS_cld_smoothwidth;
  chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
                               kMaxChunkSizeQuads);
  int chunksizeunis = (chunksizequads * 5) >> 1;

  // NOTE(review): spantooshortlimit is updated below but never read in this
  // function.
  int spantooshortlimit = kShortSpanThresh;

  prior_lang = UNKNOWN_LANGUAGE;
  prior_unreliable = false;

  // State for CheapRepWordsInplace; the table is only cleared (and only
  // used) when the Repeats flag is set. Freed on every exit path below.
  int hash = 0;
  int* predict_tbl = new int[kPredictionTableSize];
  if (FlagRepeats(flags)) {
    memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
  }

  // Main loop: score one single-script span at a time
  while (ss.GetOneScriptSpanLower(&scriptspan)) {
    UnicodeLScript lscript = scriptspan.script;

    if (FLAGS_cld_echotext) {
      PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
    }

    if (FlagSqueeze(flags)) {
      // Squeeze mode: shrink the span in place before scoring
      int newlen;
      int chunksize = 0;
      newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
                                   chunksize);
      scriptspan.text_bytes = newlen;
    } else {
      // Not squeezing yet: if enough text has been seen, this is not the
      // final pass, and this large span trips the trigger test, restart the
      // whole detection with squeezing enabled.
      if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
          !FlagFinish(flags) &&
          ((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
          CheapSqueezeTriggerTest(scriptspan.text,
                                  scriptspan.text_bytes,
                                  kCheapSqueezeTestLen)) {
        if (FLAGS_cld_html || FLAGS_dbgscore) {
          fprintf(stderr,
                  "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
                  total_text_bytes);
        }
        delete[] predict_tbl;

        return DetectLanguageSummaryV25(
            tables,
            buffer,
            buffer_length,
            is_plain_text,
            tld_hint,
            encoding_hint,
            language_hint,
            allow_extended_lang,
            flags | kCLDFlagSqueeze,
            plus_one,
            language3,
            percent3,
            normalized_score3,
            text_bytes,
            is_reliable);
      }
    }

    if (FlagRepeats(flags)) {
      // Remove repeated words in place
      int newlen;
      newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
                                    &hash, predict_tbl);
      scriptspan.text_bytes = newlen;
    }

    Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
    if (onlylang != UNKNOWN_LANGUAGE) {
      // The script maps to exactly one language; no gram scoring needed
      ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
                    lang_hint_boost, flags, plus_one);
    } else if (cld::kScoreUniPerLScript[lscript] != 0) {
      // Script scored with unigrams; always uses tote 1
      int tote_num = 1;
      if (!tote_seen[tote_num]) {
        tote_seen[tote_num] = true;
        total_text_bytes += 1;      // Accounts for the byte added by the seed
        InitScriptToteLang(&totes[tote_num], lscript);
      }
      ScoreUnigrams(tables->unigram_obj,
                    &scriptspan, &tote_grams[tote_num], chunksizeunis,
                    &totes[tote_num],
                    &doc_tote, lang_hint_boost,
                    advance_by, flags, &initial_word_span, plus_one);
    } else {
      // Script scored with quadgrams; find the tote currently assigned to it
      int tote_num = -1;
      for (int t = 0; t < 4; ++t) {
        if (lscript == tote_script[t]) {
          tote_num = t;
          break;
        }
      }
      if (tote_num < 0) {
        // No tote for this script: take over the next "other" tote, flushing
        // whatever the previous script had accumulated in it.
        tote_num = next_other_tote;
        next_other_tote ^= 1;
        if (tote_seen[tote_num]) {
          ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
                             tote_script[tote_num], &totes[tote_num],
                             &doc_tote, tote_grams[tote_num], lang_hint_boost);
          totes[tote_num].Reinit();
        }
        tote_script[tote_num] = lscript;
      }

      if (!tote_seen[tote_num]) {
        tote_seen[tote_num] = true;
        total_text_bytes += 1;      // Accounts for the byte added by the seed
        InitScriptToteLang(&totes[tote_num], lscript);
      }
      ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
                     chunksizequads,
                     &totes[tote_num],
                     &doc_tote, lang_hint_boost,
                     advance_by, flags, &initial_word_span, plus_one);
    }

    total_text_bytes += scriptspan.text_bytes;

    if (total_text_bytes > advance_limit) {
      if (total_text_bytes > textlimit) {
        // Enough text has been scored; stop scanning
        if (FLAGS_cld_html || FLAGS_dbgscore) {
          fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
                  total_text_bytes, textlimit);
        }
        break;
      }
      // Double the stride and the thresholds
      advance_by <<= 1;
      advance_limit <<= 1;
      spantooshortlimit <<= 1;
      if (FLAGS_cld_html || FLAGS_dbgscore) {
        fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
                total_text_bytes, advance_by);
      }
    }
  }

  delete[] predict_tbl;

  // Flush whatever is left in each per-script tote
  for (int tote_num = 0; tote_num < 4; ++tote_num) {
    if (tote_seen[tote_num]) {
      ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
                         tote_script[tote_num], &totes[tote_num], &doc_tote,
                         tote_grams[tote_num], lang_hint_boost);
    }
  }

  if (!allow_extended_lang) {
    RemoveExtendedLanguages(&doc_tote);
  }

  RefineScoredClosePairs(&doc_tote);

  int reliable_percent3[3];

  doc_tote.Sort(3);
  ExtractLangEtc(&doc_tote, total_text_bytes,
                 reliable_percent3, language3, percent3, normalized_score3,
                 text_bytes, is_reliable);

  // Decide whether this pass is good enough to return
  bool have_good_answer = false;
  if (FlagFinish(flags)) {
    // Final pass: accept whatever was found
    have_good_answer = true;
  } else if (total_text_bytes <= kShortTextThresh) {
    have_good_answer = true;
  } else if (*is_reliable &&
             (percent3[0] >= kGoodLang1Percent)) {
    have_good_answer = true;
  } else if (*is_reliable &&
             ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
    have_good_answer = true;
  }

  if (have_good_answer) {
    // Drop unreliable languages and re-extract the top three
    RemoveUnreliableLanguages(&doc_tote);

    doc_tote.Sort(3);
    ExtractLangEtc(&doc_tote, total_text_bytes,
                   reliable_percent3, language3, percent3, normalized_score3,
                   text_bytes, is_reliable);

#if 0
    if (language3[2] == TG_UNKNOWN_LANGUAGE) {
      reliable_percent3[2] = 0;
      language3[2] = UNKNOWN_LANGUAGE;
      percent3[2] = 0;
    } else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
      reliable_percent3[1] = reliable_percent3[2];
      language3[1] = language3[2];
      percent3[1] = percent3[2];
      reliable_percent3[2] = 0;
      language3[2] = UNKNOWN_LANGUAGE;
      percent3[2] = 0;
    } else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
      language3[0] = ENGLISH;
    }
    if (language3[0] == UNKNOWN_LANGUAGE) {
      language3[0] = ENGLISH;
      percent3[0] = 100;
      *is_reliable = true;
    }
#endif

#if 0
    if (FLAGS_cld_html) {
      static const int kMaxSubsetSeq = 12;
      uint8 subseq[kMaxSubsetSeq];
      doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
      fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
      for (int i = 0; i < kMaxSubsetSeq; ++i) {
        fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
        if ((i % 4) == 3) {fprintf(stderr, " ");}
      }
      fprintf(stderr, " ");
      for (int i = 0; i < 3; ++i) {
        if (language3[i] != UNKNOWN_LANGUAGE) {
          fprintf(stderr, "%s.%d(%d%%) ",
                  ExtLanguageCode(language3[i]),
                  reliable_percent3[i],
                  percent3[i]);
        }
      }
      fprintf(stderr, "%d B ", total_text_bytes);
      fprintf(stderr, "<br>\n");
    }
#endif

    Language summary_lang;
    CalcSummaryLang(&doc_tote, total_text_bytes,
                    reliable_percent3, language3, percent3,
                    &summary_lang, is_reliable);

    if (FLAGS_cld_html) {
      for (int i = 0; i < 3; ++i) {
        if (language3[i] != UNKNOWN_LANGUAGE) {
          fprintf(stderr, "%s.%d(%d%%) ",
                  ExtLanguageCode(language3[i]),
                  reliable_percent3[i],
                  percent3[i]);
        }
      }
      fprintf(stderr, "%d B ", total_text_bytes);
      // Dereference the flag: testing the pointer itself is always true and
      // would never print the '*' unreliable marker.
      fprintf(stderr, "= %s%c ",
              ExtLanguageName(summary_lang), *is_reliable ? ' ' : '*');
      fprintf(stderr, "<br>\n");
    }

    return summary_lang;
  }

  // No good answer yet: run one more, final pass with refinements enabled
  if (FLAGS_cld_html || FLAGS_dbgscore) {
    PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
  }

  // If one of the top two languages is outside the Top40 table, protect it
  // from demotion in the next pass.
  Language new_plus_one = UNKNOWN_LANGUAGE;
  if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
    new_plus_one = language3[0];
  } else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
    new_plus_one = language3[1];
  }

  // NOTE(review): text at or below kShortTextThresh is already accepted as a
  // good answer above, so this branch looks unreachable as written.
  if (total_text_bytes < kShortTextThresh) {
    if (FLAGS_cld_html || FLAGS_dbgscore) {
      fprintf(stderr, " ---text_bytes[%d] "
              "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
              total_text_bytes);
    }
    return DetectLanguageSummaryV25(
        tables,
        buffer,
        buffer_length,
        is_plain_text,
        tld_hint,
        encoding_hint,
        language_hint,
        allow_extended_lang,
        flags | kCLDFlagTop40 | kCLDFlagRepeats |
          kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
        new_plus_one,
        language3,
        percent3,
        normalized_score3,
        text_bytes,
        is_reliable);
  }

  if (FLAGS_cld_html || FLAGS_dbgscore) {
    fprintf(stderr,
            " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
            total_text_bytes);
  }
  return DetectLanguageSummaryV25(
      tables,
      buffer,
      buffer_length,
      is_plain_text,
      tld_hint,
      encoding_hint,
      language_hint,
      allow_extended_lang,
      flags | kCLDFlagTop40 | kCLDFlagRepeats |
        kCLDFlagFinish,
      new_plus_one,
      language3,
      percent3,
      normalized_score3,
      text_bytes,
      is_reliable);
}