This source file includes the following definitions.
- IsSpecial
- ScanToLetterOrSpecial
- ScanToPossibleLetter
- is_plain_text_
- SkipToFrontOfSpan
- GetOneScriptSpan
- LowerScriptSpan
- GetOneScriptSpanLower
- GetUTF8LetterScriptNum
#include "encodings/compact_lang_det/getonescriptspan.h"
#include <stdio.h>
#include <string.h>
#include "base/basictypes.h"
#include "encodings/lang_enc.h"
#include "encodings/compact_lang_det/utf8propjustletter.h"
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
#include "encodings/compact_lang_det/win/cld_basictypes.h"
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
#include "encodings/compact_lang_det/win/cld_google.h"
#include "encodings/compact_lang_det/win/cld_htmlutils.h"
#include "encodings/compact_lang_det/win/cld_unilib.h"
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
#include "encodings/compact_lang_det/win/cld_utf8utils.h"
// File-local tuning constants. None of the three is referenced by the code
// visible in this file.
// NOTE(review): GRAY_LANG looks like a sentinel Language value (254) - confirm
// its meaning against the Language enum.
static const Language GRAY_LANG = static_cast<Language>(254);
// NOTE(review): presumably byte limits for searching for a word boundary -
// confirm against whichever code consumes them.
static const int kMaxUpToWordBoundary = 50;
static const int kMaxAdvanceToWordBoundary = 10;
// Per-byte flag table consulted by IsSpecial(): a nonzero entry marks a byte
// that the scanners handle specially when the input is not plain text.
// Exactly three entries are set: ampersand (0x26), less-than (0x3C) and
// greater-than (0x3E). All three live in the second row (bytes 0x20-0x3F).
static const char kSpecialSymbol[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// Character classes used as column indices (0..19) into kTagParseTbl_0.
// The short macro names exist only to keep the kCharToSub initializer
// readable and are undefined again right after the table.
//
// As assigned by the table below:
//   LT  less-than sign (0x3C)
//   GT  greater-than sign (0x3E)
//   EX  exclamation mark (0x21)
//   HY  hyphen (0x2D)
//   QU  double quote (0x22)
//   AP  apostrophe (0x27)
//   SL  slash (0x2F)
//   S_ C_ R_ I_ P_ T_ Y_ L_ E_  the matching ASCII letter, upper or lower case
//   CR  line feed (0x0A) and carriage return (0x0D)
//   NL  every other control byte, space, digits, the remaining ASCII
//       punctuation, DEL, and bytes 0x80-0xBF
//   PL  the remaining ASCII letters, ampersand (0x26), at-sign (0x40),
//       grave accent (0x60), and bytes 0xC0-0xFF
//   xx  class 19; no byte maps to it in kCharToSub
// NOTE(review): NL and PL presumably abbreviate non-letter and possible
// letter - confirm against upstream documentation.
#define LT 0
#define GT 1
#define EX 2
#define HY 3
#define QU 4
#define AP 5
#define SL 6
#define S_ 7
#define C_ 8
#define R_ 9
#define I_ 10
#define P_ 11
#define T_ 12
#define Y_ 13
#define L_ 14
#define E_ 15
#define CR 16
#define NL 17
#define PL 18
#define xx 19
// Maps each input byte value to its character class; 16 entries per row,
// so row k covers bytes 16*k .. 16*k+15.
static const uint8 kCharToSub[256] = {
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};
#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
// State-transition table driving ScanToPossibleLetter.
//
// Layout: 40 rows (states 0..39) of 20 columns each; the column is the
// character class produced by kCharToSub and the entry is the next state.
// ScanToPossibleLetter stops on any entry e with (e & ~1) == 0, i.e. on the
// two values defined here, and treats X_ (1) as the error exit.
//
// Summary of the transitions, as read from the rows below:
//   rows 0, 2    outside a tag: less-than enters row 3, a letter class or PL
//                yields OK, anything else stays in state 2
//   row 1        every entry is X_
//   row 3        just after less-than: exclamation mark goes to row 4,
//                S goes to row 13, another less-than is X_
//   rows 4-8     exclamation mark followed by hyphens: rows 6-8 loop until
//                two hyphens and a greater-than have been seen
//   row 9        generic tag interior: greater-than returns to state 2,
//                quotes enter rows 10/11, less-than is X_
//   rows 10, 11  inside a double-quoted / apostrophe-quoted run; the CR
//                class moves to row 12
//   row 12       stays put until greater-than (state 2) or less-than (X_)
//   rows 13-27   letters S,C,R,I,P,T after less-than, then everything up to
//                less-than, slash, S,C,R,I,P,T, greater-than
//   rows 28-39   same shape for S,T,Y,L,E
//   column 19    X_ in every row
#define OK 0
#define X_ 1
static const uint8 kTagParseTbl_0[] = {
3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_,
X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_,
3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_,
X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_,
X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_,
X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_,
6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_,
6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_,
6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_,
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_,
10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_,
11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_,
X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_,
X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_,
X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_,
X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_,
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_,
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_,
X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_,
20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_,
19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_,
19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_,
19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_,
19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_,
19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_,
19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_,
19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_,
19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_,
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_,
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_,
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_,
X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_,
33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_,
32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_,
32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_,
32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_,
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_,
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_,
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_,
32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_,
};
#undef OK
#undef X_
// Reports whether byte c is flagged in kSpecialSymbol.
// The mask test admits only bytes 0x20..0x3F (top three bits 001), so the
// table is consulted for that range alone and every other byte is rejected
// without a lookup.
inline bool IsSpecial(char c) {
  const bool in_flagged_range = ((c & 0xe0) == 0x20);
  return in_flagged_range && (kSpecialSymbol[static_cast<uint8>(c)] != 0);
}
// Runs cld::UTF8GenericScan with the utf8scannotjustletterspecial_obj state
// table over src[0..len) and returns the byte count that scan reports as
// consumed.
// NOTE(review): judging by the table name and by how callers use the result,
// this is the length of the leading run of bytes that are neither letters
// nor special symbols - confirm against UTF8GenericScan.
int ScanToLetterOrSpecial(const char* src, int len) {
  int consumed = 0;
  cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len, &consumed);
  return consumed;
}
int ScanToPossibleLetter(const char* isrc, int len) {
const uint8* src = reinterpret_cast<const uint8*>(isrc);
const uint8* srclimit = src + len;
const uint8* tagParseTbl = kTagParseTbl_0;
int e = 0;
while (src < srclimit) {
e = tagParseTbl[kCharToSub[*src++]];
if ((e & ~1) == 0) {
--src;
break;
}
tagParseTbl = &kTagParseTbl_0[e * 20];
}
if (src >= srclimit) {
return len;
}
if ((e != 0) && (e != 2)) {
int offset = src - reinterpret_cast<const uint8*>(isrc);
--offset;
while ((0 < offset) && (isrc[offset] != '<')) {
--offset;
}
return offset + 1;
}
return src - reinterpret_cast<const uint8*>(isrc);
}
// Sets up a scanner over the buffer_length bytes starting at buffer.
//
// buffer:        input text; only pointers into it are stored, no copy is
//                made here. NOTE(review): the caller presumably has to keep
//                it alive for the lifetime of the scanner - confirm.
// buffer_length: number of input bytes.
// is_plain_text: when true, the scanning methods skip their special
//                handling of ampersand, less-than and greater-than.
//
// Also allocates the two scratch buffers (span text and its lowercased
// copy) that the destructor releases.
ScriptScanner::ScriptScanner(const char* buffer,
int buffer_length,
bool is_plain_text)
: start_byte_(buffer),
next_byte_(buffer),
next_byte_limit_(buffer + buffer_length),
byte_length_(buffer_length),
is_plain_text_(is_plain_text) {
script_buffer_ = new char[getone::kMaxScriptBuffer];
script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
}
// Releases the two scratch buffers allocated by the constructor. Any
// LangSpan whose text pointer refers to one of them dangles afterwards.
ScriptScanner::~ScriptScanner() {
delete[] script_buffer_;
delete[] script_buffer_lower_;
}
// Locates the first character in src[0..len) whose script number is nonzero,
// stepping over everything before it. Unless is_plain_text_ is set, tags
// starting at a less-than sign are skipped via ScanToPossibleLetter, a lone
// greater-than sign is skipped as one byte, and an ampersand is decoded with
// EntityToBuffer and classified by the decoded character.
//
// src, len: text to scan.
// script:   out-param; always written. Receives the script number of the
//           character found, or the last value seen (UNKNOWN_LSCRIPT or 0)
//           when the input is exhausted first.
// Returns the number of leading bytes to skip; len when nothing was found.
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
  int sc = UNKNOWN_LSCRIPT;
  int skip = 0;
  int tlen = 1;
  int plen = 0;
  while (skip < len) {
    skip += ScanToLetterOrSpecial(src + skip, len - skip);
    if (skip >= len) {
      // Input exhausted. Store the script on this path too, so the caller
      // never observes an unwritten out-param.
      *script = sc;
      return len;
    }
    if (IsSpecial(src[skip]) && !is_plain_text_) {
      if (src[skip] == '<') {
        tlen = ScanToPossibleLetter(src + skip, len - skip);
        sc = 0;
      } else if (src[skip] == '>') {
        tlen = 1;
        sc = 0;
      } else if (src[skip] == '&') {
        // Zero-filled so that the script lookup below reads a NUL instead of
        // indeterminate stack bytes should EntityToBuffer leave the buffer
        // untouched (presumably possible for a malformed entity - confirm).
        char temp[4] = {0};
        EntityToBuffer(src + skip, len - skip,
                       temp, &tlen, &plen);
        sc = getone::GetUTF8LetterScriptNum(temp);
      }
    } else {
      tlen = cld_UniLib::OneCharLen(src + skip);
      sc = getone::GetUTF8LetterScriptNum(src + skip);
    }
    if (sc != 0) {
      break;   // found the front of a span
    }
    skip += tlen;
  }
  *script = sc;
  return skip;
}
// Compile-time switch: true when the build defines NEED_ALIGNED_LOADS, in
// which case GetOneScriptSpan avoids its fixed-width 4-byte copy and copies
// exactly one character length instead.
#ifndef NEED_ALIGNED_LOADS
static const bool kNeedsAlignedLoads = false;
#else
static const bool kNeedsAlignedLoads = true;
#endif
// Copies the next run of same-script text from the input into script_buffer_
// and describes it in *span.
//
// On entry the span is reset (text = script_buffer_, text_bytes = 0, script
// and lang unknown, truncated = false). span->offset is the scanner position
// at the time of the call, i.e. before leading non-letters are skipped.
//
// The buffer always starts with one space. Each run of letters is followed
// by a single space standing in for whatever non-letter bytes (and, unless
// is_plain_text_, tags and non-letter entities) separated it from the next
// run. The text ends with three more spaces and a NUL that are not counted
// in span->text_bytes.
//
// A span ends when the next letter belongs to another script, when the input
// is exhausted, or when the buffer fills up (span->truncated is then set).
// A single character of another script is tolerated inside a span if the
// character after it is ULScript_Common or of the span script again.
//
// Returns false, leaving the reset span, when no letter remains in the
// input; true otherwise. Advances next_byte_ / byte_length_ past the bytes
// consumed.
bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
  span->text = script_buffer_;
  span->text_bytes = 0;
  span->offset = next_byte_ - start_byte_;
  span->script = UNKNOWN_LSCRIPT;
  span->lang = UNKNOWN_LANGUAGE;
  span->truncated = false;

  // Initialized so the value is defined even on a path where
  // SkipToFrontOfSpan returns without storing a script.
  int spanscript = UNKNOWN_LSCRIPT;
  int sc = UNKNOWN_LSCRIPT;
  int tlen = 0;   // bytes consumed from the input by the current character
  int plen = 0;   // bytes produced in script_buffer_ by the current character

  script_buffer_[0] = ' ';
  script_buffer_[1] = '\0';
  int take = 0;   // read offset relative to next_byte_
  int put = 1;    // write offset into script_buffer_

  const int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
  next_byte_ += skip;
  byte_length_ -= skip;
  if (byte_length_ <= 0) {
    return false;
  }
  span->script = static_cast<UnicodeLScript>(spanscript);

  // Alternate between copying a run of letters and skipping the non-letters
  // that follow it.
  while (take < byte_length_) {
    bool need_break = false;

    // Phase 1: copy characters while they stay in the span script.
    while (take < byte_length_) {
      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
        if ((next_byte_[take] == '<') || (next_byte_[take] == '>')) {
          // Tag boundary: the letter run is over; phase 2 skips the tag.
          sc = 0;
          break;
        } else if (next_byte_[take] == '&') {
          // Decode the entity straight into the output position.
          // NOTE(review): if EntityToBuffer can report plen == 0 without
          // writing, the lookup below classifies a stale buffer byte -
          // confirm against EntityToBuffer.
          EntityToBuffer(next_byte_ + take, byte_length_ - take,
                         script_buffer_ + put, &tlen, &plen);
          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
        }
      } else {
        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
        if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) {
          // Fast path: copy a fixed four bytes whenever that many remain in
          // the input; bytes beyond plen are overwritten by later writes.
          // memcpy rather than a cast to uint32*, which would be an
          // unaligned, aliasing-violating access on char buffers.
          memcpy(script_buffer_ + put, next_byte_ + take, sizeof(uint32));
        } else {
          memcpy(script_buffer_ + put, next_byte_ + take, plen);
        }
        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
      }

      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
        if (sc == ULScript_Common) {
          need_break = true;
        } else {
          // One foreign-script character: peek at the character after it.
          // Only peek when that character lies inside the input; at the very
          // end of the input treat the foreign character as followed by a
          // non-letter, so nothing past the buffer is read.
          int sc2 = ULScript_Common;
          if ((take + tlen) < byte_length_) {
            sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
          }
          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
            need_break = true;
          }
        }
      }
      if (need_break) {
        break;
      }

      take += tlen;
      put += plen;
      if (put >= getone::kMaxScriptBytes) {
        span->truncated = true;
        break;
      }
    }

    // Phase 2: skip non-letters, tags and non-letter entities up to the next
    // character with a nonzero script number.
    while (take < byte_length_) {
      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
      if (take >= byte_length_) {
        take = byte_length_;
        break;
      }
      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
        if (next_byte_[take] == '<') {
          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
          sc = 0;
        } else if (next_byte_[take] == '>') {
          tlen = 1;
          sc = 0;
        } else if (next_byte_[take] == '&') {
          // script_buffer_ + put is used as scratch space only; put is not
          // advanced here.
          EntityToBuffer(next_byte_ + take, byte_length_ - take,
                         script_buffer_ + put, &tlen, &plen);
          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
        }
      } else {
        tlen = cld_UniLib::OneCharLen(next_byte_ + take);
        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
      }
      if (sc != 0) {
        break;
      }
      take += tlen;
    }

    // One space replaces everything that was skipped.
    script_buffer_[put++] = ' ';

    if (sc != spanscript) {
      break;   // next letter starts a different span
    }
    if (put >= getone::kMaxScriptBytes - 8) {
      // Leave headroom for the trailing padding written below.
      span->truncated = true;
      break;
    }
  }

  next_byte_ += take;
  byte_length_ -= take;

  // Trailing padding: three spaces and a terminating NUL, not counted in
  // text_bytes.
  script_buffer_[put + 0] = ' ';
  script_buffer_[put + 1] = ' ';
  script_buffer_[put + 2] = ' ';
  script_buffer_[put + 3] = '\0';

  span->text_bytes = put;
  return true;
}
// Lowercases the text of *span into script_buffer_lower_ and repoints the
// span at that buffer. Only Latin, Cyrillic and Greek spans are touched;
// spans in any other script are left as they are. Compiles to a no-op when
// CLD_WINDOWS is defined.
//
// The four bytes after span->text_bytes are included in the conversion and
// excluded again from the resulting text_bytes.
void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
#if !defined(CLD_WINDOWS)
  const bool is_converted_script = (span->script == ULScript_Latin) ||
                                   (span->script == ULScript_Cyrillic) ||
                                   (span->script == ULScript_Greek);
  if (!is_converted_script) {
    return;
  }

  const int kTailBytes = 4;   // extra bytes following the counted text
  int consumed, filled;
  UniLib::ToLower(span->text, span->text_bytes + kTailBytes,
                  script_buffer_lower_, getone::kMaxScriptLowerBuffer,
                  &consumed, &filled);
  span->text = script_buffer_lower_;
  span->text_bytes = filled - kTailBytes;
#endif
}
// Fetches the next script span with GetOneScriptSpan and then passes it
// through LowerScriptSpan. The lowering step runs whether or not a span was
// found. Returns the result of GetOneScriptSpan.
bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
  const bool found_span = GetOneScriptSpan(span);
  LowerScriptSpan(span);
  return found_span;
}
int getone::GetUTF8LetterScriptNum(const char* src) {
int srclen = cld_UniLib::OneCharLen(src);
const uint8* usrc = reinterpret_cast<const uint8*>(src);
return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
}