root/third_party/cld/encodings/compact_lang_det/cldutil.cc

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. BiHashV25
  2. QuadHashV25Mix
  3. QuadHashV25
  4. QuadHashV25Underscore
  5. OctaHash40Mix
  6. OctaHash40
  7. OctaHash40underscore
  8. ProcessProbV25UniTote
  9. ProcessProbV25Tote
  10. DoUniScoreV3
  11. DoBigramScoreV3
  12. DoQuadScoreV3
  13. DoOctaScoreV3
  14. ReliabilityDelta
  15. ReliabilityMainstream
  16. GetNormalizedScore
  17. GetReliability
  18. DemoteNotTop40

// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <string>
#include "encodings/compact_lang_det/cldutil.h"

#include "base/basictypes.h"
#include "encodings/compact_lang_det/cldutil_dbg.h"
#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
#include "encodings/compact_lang_det/win/cld_logging.h"
#include "encodings/compact_lang_det/win/cld_unilib.h"
#include "encodings/compact_lang_det/win/cld_utf.h"
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"

// Runtime routines for hashing, looking up, and scoring
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
// Unigrams and bigrams are for CJK languages only, including simplified/
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
// Zhuang Han characters. Surrounding spaces are not considered.
// Quadgrams and octagrams for for non-CJK and include two bits indicating
// preceding and trailing spaces (word boundaries).


// Indicator bits for leading/trailing space around quad/octagram
// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
// 1-, 2-, or 3-bytes each.
static const uint32 kPreSpaceIndicator =  0x00004444;
static const uint32 kPostSpaceIndicator = 0x44440000;

// Little-endian masks for 0..24 bytes picked up as uint32's
static const uint32 kWordMask0[4] = {
  0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
};

static const int kMinCJKUTF8CharBytes = 3;

static const int kMinGramCount = 3;
static const int kMaxGramCount = 16;




// Routines to access a hash table of <key:wordhash, value:probs> pairs
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
// bucket subscript.
// Probs is a packed: three languages plus a subscript for probability table
// Buckets have all the keys together, then all the values.Key array never
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
// Match case may sometimes take an additional cache miss on value access.
//
// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
// byte buckets with single cache miss.
// Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
//------------------------------------------------------------------------------


//------------------------------------------------------------------------------
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
//------------------------------------------------------------------------------

// Design principles for these hash functions
// - Few operations
// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
//   Latin script expect 1- and 2-byte mixtures.
// - Last byte of each character has about 5 bits of information
// - Spread good bits around so they can interact in at least two ways
//   with other characters
// - Use add for additional mixing thorugh carries

// CJK Three-byte bigram
//   ....dddd..cccccc..bbbbbb....aaaa
//   ..................ffffff..eeeeee
// make
//   ....dddd..cccccc..bbbbbb....aaaa
//   000....dddd..cccccc..bbbbbb....a
//   ..................ffffff..eeeeee
//   ffffff..eeeeee000000000000000000
//
// CJK Four-byte bigram
//   ..dddddd..cccccc....bbbb....aaaa
//   ..hhhhhh..gggggg....ffff....eeee
// make
//   ..dddddd..cccccc....bbbb....aaaa
//   000..dddddd..cccccc....bbbb....a
//   ..hhhhhh..gggggg....ffff....eeee
//   ..ffff....eeee000000000000000000

// BIGRAM
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
// OVERSHOOTS up to 3 bytes
// For runtime use of tables
uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  uint32 word0, word1;
  if (bytecount <= 4) {
    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
    word0 = word0 ^ (word0 >> 3);
    return word0;
  }
  // Else do 8 bytes
  word0 = UnalignedLoad32(word_ptr);
  word0 = word0 ^ (word0 >> 3);
  word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
  word1 = word1 ^ (word1 << 18);
  return word0 + word1;
}

//
// Ascii-7 One-byte chars
//   ...ddddd...ccccc...bbbbb...aaaaa
// make
//   ...ddddd...ccccc...bbbbb...aaaaa
//   000...ddddd...ccccc...bbbbb...aa
//
// Latin 1- and 2-byte chars
//   ...ddddd...ccccc...bbbbb...aaaaa
//   ...................fffff...eeeee
// make
//   ...ddddd...ccccc...bbbbb...aaaaa
//   000...ddddd...ccccc...bbbbb...aa
//   ...................fffff...eeeee
//   ...............fffff...eeeee0000
//
// Non-CJK Two-byte chars
//   ...ddddd...........bbbbb........
//   ...hhhhh...........fffff........
// make
//   ...ddddd...........bbbbb........
//   000...ddddd...........bbbbb.....
//   ...hhhhh...........fffff........
//   hhhh...........fffff........0000
//
// Non-CJK Three-byte chars
//   ...........ccccc................
//   ...................fffff........
//   ...lllll...................iiiii
// make
//   ...........ccccc................
//   000...........ccccc.............
//   ...................fffff........
//   ...............fffff........0000
//   ...lllll...................iiiii
//   .lllll...................iiiii00
//

// QUADGRAM
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
// OVERSHOOTS up to 3 bytes
// For runtime use of tables
uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
  uint32 word0, word1, word2;
  if (bytecount <= 4) {
    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
    word0 = word0 ^ (word0 >> 3);
    return word0 ^ prepost;
  } else if (bytecount <= 8) {
    word0 = UnalignedLoad32(word_ptr);
    word0 = word0 ^ (word0 >> 3);
    word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
    word1 = word1 ^ (word1 << 4);
    return (word0 ^ prepost) + word1;
  }
  // else do 12 bytes
  word0 = UnalignedLoad32(word_ptr);
  word0 = word0 ^ (word0 >> 3);
  word1 = UnalignedLoad32(word_ptr + 4);
  word1 = word1 ^ (word1 << 4);
  word2 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
  word2 = word2 ^ (word2 << 2);
  return (word0 ^ prepost) + word1 + word2;
}


// QUADGRAM wrapper with surrounding spaces
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// For runtime use of tables
uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  uint32 prepost = 0;
  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
  return QuadHashV25Mix(word_ptr, bytecount, prepost);
}

// QUADGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
// OVERSHOOTS up to 3 bytes
// For offline construction of tables
uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  const char* local_word_ptr = word_ptr;
  int local_bytecount = bytecount;
  uint32 prepost = 0;
  if (local_word_ptr[0] == '_') {
    prepost |= kPreSpaceIndicator;
    ++local_word_ptr;
    --local_bytecount;
  }
  if (local_word_ptr[local_bytecount - 1] == '_') {
    prepost |= kPostSpaceIndicator;
    --local_bytecount;
  }
  return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
}


// OCTAGRAM
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
//
// The low 32 bits follow the pattern from above, tuned to different scripts
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
// For runtime use of tables V3
uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
  uint64 word0;
  uint64 word1;
  uint64 sum;

  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
  switch ((bytecount - 1) >> 2) {
  case 0:       // 1..4 bytes
    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
    sum = word0;
    word0 = word0 ^ (word0 >> 3);
    break;
  case 1:       // 5..8 bytes
    word0 = UnalignedLoad32(word_ptr);
    sum = word0;
    word0 = word0 ^ (word0 >> 3);
    word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
    sum += word1;
    word1 = word1 ^ (word1 << 4);
    word0 += word1;
    break;
  case 2:       // 9..12 bytes
    word0 = UnalignedLoad32(word_ptr);
    sum = word0;
    word0 = word0 ^ (word0 >> 3);
    word1 = UnalignedLoad32(word_ptr + 4);
    sum += word1;
    word1 = word1 ^ (word1 << 4);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
    sum += word1;
    word1 = word1 ^ (word1 << 2);
    word0 += word1;
    break;
  case 3:       // 13..16 bytes
    word0 = UnalignedLoad32(word_ptr);
    sum = word0;
    word0 = word0 ^ (word0 >> 3);
    word1 = UnalignedLoad32(word_ptr + 4);
    sum += word1;
    word1 = word1 ^ (word1 << 4);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 8);
    sum += word1;
    word1 = word1 ^ (word1 << 2);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3];
    sum += word1;
    word1 = word1 ^ (word1 >> 8);
    word0 += word1;
    break;
  case 4:       // 17..20 bytes
    word0 = UnalignedLoad32(word_ptr);
    sum = word0;
    word0 = word0 ^ (word0 >> 3);
    word1 = UnalignedLoad32(word_ptr + 4);
    sum += word1;
    word1 = word1 ^ (word1 << 4);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 8);
    sum += word1;
    word1 = word1 ^ (word1 << 2);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 12);
    sum += word1;
    word1 = word1 ^ (word1 >> 8);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 16) & kWordMask0[bytecount & 3];
    sum += word1;
    word1 = word1 ^ (word1 >> 4);
    word0 += word1;
    break;
  default:      // 21..24 bytes and higher (ignores beyond 24)
    word0 = UnalignedLoad32(&word_ptr);
    sum = word0;
    word0 = word0 ^ (word0 >> 3);
    word1 = UnalignedLoad32(word_ptr + 4);
    sum += word1;
    word1 = word1 ^ (word1 << 4);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 8);
    sum += word1;
    word1 = word1 ^ (word1 << 2);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 12);
    sum += word1;
    word1 = word1 ^ (word1 >> 8);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 16);
    sum += word1;
    word1 = word1 ^ (word1 >> 4);
    word0 += word1;
    word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3];
    sum += word1;
    word1 = word1 ^ (word1 >> 6);
    word0 += word1;
    break;
  }

  sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
  sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
  sum = (sum & 0xff) << 32;
  return (word0 ^ prepost) + sum;
}

// OCTAGRAM wrapper with surrounding spaces
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
//
// The low 32 bits follow the pattern from above, tuned to different scripts
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
// For runtime use of tables V3
uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  uint64 prepost = 0;
  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
  return OctaHash40Mix(word_ptr, bytecount, prepost);
}


// OCTAGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
//
// The low 32 bits follow the pattern from above, tuned to different scripts
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
// For offline construction of tables
uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  const char* local_word_ptr = word_ptr;
  int local_bytecount = bytecount;
  uint64 prepost = 0;
  if (local_word_ptr[0] == '_') {
    prepost |= kPreSpaceIndicator;
    ++local_word_ptr;
    --local_bytecount;
  }
  if (local_word_ptr[local_bytecount - 1] == '_') {
    prepost |= kPostSpaceIndicator;
    --local_bytecount;
  }
  return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
}




//------------------------------------------------------------------------------
// Scoring single groups of letters
//------------------------------------------------------------------------------

// UNIGRAM score one => tote
// Input: 1-byte entry of subscript into unigram probs, plus
//  an accumulator tote.
// Output: running sums in tote updated
void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
  tote->AddGram();
  const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
  if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
  if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
  if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
  if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
  if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
  if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
}

// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
//  an accumulator tote. (language 0 means unused entry)
// Output: running sums in tote updated
void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
  tote->AddGram();
  uint8 prob123 = (probs >> 0) & 0xff;
  const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);

  uint8 top1 = (probs >> 8) & 0xff;
  if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
  uint8 top2 = (probs >> 16) & 0xff;
  if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
  uint8 top3 = (probs >> 24) & 0xff;
  if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
}


//------------------------------------------------------------------------------
// Routines to accumulate probabilities
//------------------------------------------------------------------------------


// UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
// Score up to n unigrams, returning number of bytes consumed
// Updates tote_grams
int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
                      const char* isrc, int srclen, int advance_by,
                      int* tote_grams, int gram_limit, Tote* chunk_tote) {
  const char* src = isrc;
  if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}

  // Property-based CJK unigram lookup
  if (src[0] == ' ') {++src; --srclen;}

  const uint8* usrc = reinterpret_cast<const uint8*>(src);
  int usrclen = srclen;

  while (usrclen > 0) {
    int len = kAdvanceOneChar[usrc[0]];
    // Look up property of one UTF-8 character and advance over it
    // Return 0 if input length is zero
    // Return 0 and advance one byte if input is ill-formed

    int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);

    if (FLAGS_dbglookup) {
      DbgUniTermToStderr(propval, usrc, len);
    }

    if (propval > 0) {
      ProcessProbV25UniTote(propval, chunk_tote);
      ++(*tote_grams);
      if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
    }

    // Advance by 1/2/4/8 characters (half of quad advance)
    if (advance_by == 2) {
      // Already advanced by 1
    } else if (advance_by == 4) {
      // Advance by 2 chars total, if not at end
      if (UTFmax <= usrclen) {
        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
      }
    } else if (advance_by == 8) {
      // Advance by 4 chars total, if not at end
      if ((UTFmax * 3) <= usrclen) {
        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
      }
    } else {
      // Advance by 8 chars total, if not at end
      if ((UTFmax * 7) <= usrclen) {
        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
      }
    }
    DCHECK(usrclen >= 0);

    if (*tote_grams >= gram_limit) {
      break;
    }
  }
  if (FLAGS_dbgscore) {
    // With advance_by>2, we consume more input to get the same number of quads
    int len = src - isrc;
    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
    DbgScoreFlush();
  }

  int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
  return consumed2;
}


// BIGRAM, using hash table, always advancing by 1 char
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
// Score all bigrams in isrc, using languages that have bigrams (CJK)
// Return number of bigrams that hit in the hash table
int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
                         const char* isrc, int srclen, Tote* chunk_tote) {
  int hit_count = 0;
  const char* src = isrc;

  // Hashtable-based CJK bigram lookup
  const uint8* usrc = reinterpret_cast<const uint8*>(src);
  const uint8* usrclimit1 = usrc + srclen - UTFmax;
  if (FLAGS_dbgscore) {
    fprintf(stderr, "  " );
  }

  while (usrc < usrclimit1) {
    int len = kAdvanceOneChar[usrc[0]];
    int len2 = kAdvanceOneChar[usrc[len]] + len;

    if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
      // Lookup and score this bigram
      // Always ignore pre/post spaces
      uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
      uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
      // Now go indirect on the subscript
      probs = bigram_obj->kCLDTableInd[probs &
        ~bigram_obj->kCLDTableKeyMask];

      // Process the bigram
      if (FLAGS_dbglookup) {
        const char* ssrc = reinterpret_cast<const char*>(usrc);
        DbgBiTermToStderr(bihash, probs, ssrc, len2);
        DbgScoreRecord(NULL, probs, len2);
      } else if (FLAGS_dbgscore && (probs != 0)) {
        const char* ssrc = reinterpret_cast<const char*>(usrc);
        DbgScoreRecord(NULL, probs, len2);
        string temp(ssrc, len2);
        fprintf(stderr, "%s ", temp.c_str());
      }

      if (probs != 0) {
        ProcessProbV25Tote(probs, chunk_tote);
        ++hit_count;
      }
    }
    usrc += len;  // Advance by one char
  }

  if (FLAGS_dbgscore) {
    fprintf(stderr, "[%d bigrams scored]\n", hit_count);
    DbgScoreState();
  }
  return hit_count;
}



// QUADGRAM, using hash table, advancing by 2/4/8/16 chars
// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
// Score up to n quadgrams, returning number of bytes consumed
// Updates tote_grams
int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
                       const char* isrc, int srclen, int advance_by,
                       int* tote_grams, int gram_limit, Tote* chunk_tote) {
  const char* src = isrc;
  const char* srclimit = src + srclen;
  // Limit is end, which has extra 20 20 20 00 past len
  const char* srclimit7 = src + srclen - (UTFmax * 7);
  const char* srclimit15 = src + srclen - (UTFmax * 15);

  if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}

  // Run a little cache of last hits to catch overly-repetitive "text"
  int next_prior = 0;
  uint32 prior_quads[2] = {0, 0};

  // Visit all quadgrams
  if (src[0] == ' ') {++src;}
  while (src < srclimit) {
    // Find one quadgram
    const char* src_end = src;
    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    const char* src_mid = src_end;
    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    int len = src_end - src;

    // Lookup and score this quadgram
    uint32 quadhash = QuadHashV25(src, len);
    uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
    // Now go indirect on the subscript
    probs = quadgram_obj->kCLDTableInd[probs &
      ~quadgram_obj->kCLDTableKeyMask];

    // Process the quadgram
    if (FLAGS_dbglookup) {
      DbgQuadTermToStderr(quadhash, probs, src, len);
    }
    if (probs != 0) {
      // Filter out recent repeats. If this works out, use in the other lookups
      if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
        prior_quads[next_prior] = quadhash;
        next_prior = (next_prior + 1) & 1;
        ProcessProbV25Tote(probs, chunk_tote);
        ++(*tote_grams);
        if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
      }
    }

    // Advance all the way past word if at end-of-word
    if (src_end[0] == ' ') {
      src_mid = src_end;
    }

    // Advance by 2/4/8/16 characters
    if (advance_by == 2) {
      src = src_mid;
    } else if (advance_by == 4) {
      src = src_end;
    } else if (advance_by == 8) {
      // Advance by 8 chars total (4 more), if not at end
      if (src < srclimit7) {
        src_end += kAdvanceOneChar[(uint8)src_end[0]];
        src_end += kAdvanceOneChar[(uint8)src_end[0]];
        src_end += kAdvanceOneChar[(uint8)src_end[0]];
        src_end += kAdvanceOneChar[(uint8)src_end[0]];
      }
      src = src_end;
    } else {
      // Advance by 16 chars total (12 more), if not at end
      if (src < srclimit15) {
        // Advance by ~16 chars by adding 3 * current bytelen
        int fourcharlen = src_end - src;
        src = src_end + (3 * fourcharlen);
        // Advance a bit more if mid-character
        src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
        src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
      } else {
        src = src_end;
      }
    }
    DCHECK(src < srclimit);
    src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];

    if (*tote_grams >= gram_limit) {
      break;
    }
  }

  if (FLAGS_dbgscore) {
    // With advance_by>2, we consume more input to get the same number of quads
    int len = src - isrc;
    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
    DbgScoreFlush();
  }

  int consumed = src - isrc;

  // If advancing by more than 2, src may have overshot srclimit
  if (consumed > srclen) {
    consumed = srclen;
  }

  return consumed;
}


// OCTAGRAM, using hash table, always advancing by 1 word
// Caller supplies table, such as &kLongWord8Table_obj
// Score all words in isrc, using languages that have quadgrams
// We don't normally use this routine except on the first quadgram run,
// but it can be used to resolve unreliable pages.
// This routine does not have an optimized advance_by
// SOON: Uses indirect language/probability longword
//
// Return number of words that hit in the hash table
int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
                       const char* isrc, int srclen, Tote* chunk_tote) {
  int hit_count = 0;
  const char* src = isrc;
  const char* srclimit = src + srclen + 1;
  // Limit is end+1, to include extra space char (0x20) off the end
  //
  // Score all words truncated to 8 characters
  int charcount = 0;
  // Skip any initial space
  if (src[0] == ' ') {++src;}
  const char* word_ptr = src;
  const char* word_end = word_ptr;
  if (FLAGS_dbgscore) {
    fprintf(stderr, "  " );
  }
  while (src < srclimit) {
    // Terminate previous word or continue current word
    if (src[0] == ' ') {
      int bytecount = word_end - word_ptr;
      if (bytecount == 0)
        break;
      // Lookup and score this word
      uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
      uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
      // Now go indirect on the subscript
      probs = octagram_obj->kCLDTableInd[probs &
        ~octagram_obj->kCLDTableKeyMask];

      // // Lookup and score this word
      // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
      // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
      //                                kLongWord8TableSize);
      //
      if (FLAGS_dbglookup) {
        DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
        DbgScoreRecord(NULL, probs, bytecount);
      } else if (FLAGS_dbgscore && (probs != 0)) {
        DbgScoreRecord(NULL, probs, bytecount);
        string temp(word_ptr, bytecount);
        fprintf(stderr, "%s ", temp.c_str());
      }

      if (probs != 0) {
        ProcessProbV25Tote(probs, chunk_tote);
        ++hit_count;
      }
      charcount = 0;
      word_ptr = src + 1;   // Over the space
      word_end = word_ptr;
    } else {
      ++charcount;
    }

    // Advance to next char
    src += cld_UniLib::OneCharLen(src);
    if (charcount <= 8) {
      word_end = src;
    }
  }

  if (FLAGS_dbgscore) {
    fprintf(stderr, "[%d words scored]\n", hit_count);
    DbgScoreState();
  }
  return hit_count;
}



//------------------------------------------------------------------------------
// Reliability calculations, for single language and between languages
//------------------------------------------------------------------------------

// Return reliablity of result 0..100 for top two scores
// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
// (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
// Threshold is uni/quadgram increment count, bounded above and below.
//
// Requiring a factor of 3 improvement (e.g. +1 log base 3)
// for each scored quadgram is too stringent, so I've backed this off to a
// factor of 2 (e.g. +5/8 log base 3).
//
// I also somewhat lowered the Min/MaxGramCount limits above
//
// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
//
int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
  int max_reliability_percent = 100;
  if (gramcount < 8) {
    max_reliability_percent = 12 * gramcount;
  }
  int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
  if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
    fully_reliable_thresh = kMinGramCount;
  } else if (fully_reliable_thresh > kMaxGramCount) {
    fully_reliable_thresh = kMaxGramCount;
  }

  int delta = value1 - value2;
  if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
  if (delta <= 0) {return 0;}
  return cld::minint(max_reliability_percent,
                     (100 * delta) / fully_reliable_thresh);
}

// Return reliablity of result 0..100 for top score vs. mainsteam score
// Values are score per 1024 bytes of input
// ratio = max(top/mainstream, mainstream/top)
// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
// Change: short-text word scoring can give unusually good results.
//  Let top exceed mainstream by 4x at 50% reliable
int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
  if (mean_score == 0) {return 100;}    // No reliability data available yet
  if (topscore == 0) {return 0;}        // zero score = unreliable
  if (len == 0) {return 0;}             // zero len = unreliable
  int top_kb = (topscore << 10) / len;
  double ratio;
  double ratio_cutoff;
  if (top_kb > mean_score) {
    ratio = (1.0 * top_kb) / mean_score;
    ratio_cutoff = 5.0;                 // ramp down from 100% to 0%: 3.0-5.0
  } else {
    ratio = (1.0 * mean_score) / top_kb;
    ratio_cutoff = 4.0;                 // ramp down from 100% to 0%: 2.0-4.0
  }
  if (ratio <= ratio_cutoff - 2.0) {return 100;}
  if (ratio > ratio_cutoff) {return 0;}

  int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
  return iratio;
}

// Calculate ratio of score per 1KB vs. expected score per 1KB
double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
                          int bytes, int score) {
  // Average training-data score for this language-script combo, per 1KB
  int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
  if (lscript == ULScript_Common) {
    // We don't know the script (only happens with second-chance score)
    // Look for first non-zero mean value
    for (int i = 2; i >= 0; --i) {
      if (kMeanScore[lang * 4 + i] > 0) {
        expected_score = kMeanScore[lang * 4 + i];
        break;
      }
    }
  }
  if (expected_score < 100) {
      expected_score = 1000;
  }

  // Our score per 1KB
  double our_score = (score << 10) / (bytes ? bytes : 1);  // Avoid zdiv
  double ratio = our_score / expected_score;

  // Just the raw count normalized as though each language has mean=1000;
  ratio = (score * 1000.0) /  expected_score;
  return ratio;
}

// Calculate reliablity of len bytes of script lscript with chunk_tote
int cld::GetReliability(int len, UnicodeLScript lscript,
                   const Tote* chunk_tote) {
  Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
  // Average score for this language-script combo
  int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
  if (lscript == ULScript_Common) {
    // We don't know the script (only happens with second-chance score)
    // Look for first non-zero mean value
    for (int i = 2; i >= 0; --i) {
      if (kMeanScore[cur_lang * 4 + i] > 0) {
        mean_score = kMeanScore[cur_lang * 4 + i];
        break;
      }
    }
  }
  int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
                                           chunk_tote->Value(1),
                                           chunk_tote->GetGramCount());

  int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
                                               len,
                                               mean_score);

  int reliability_min = minint(reliability_delta, reliability_main);


  if (FLAGS_dbgreli) {
    char temp1[4];
    char temp2[4];
    cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
    if (temp1[2] == ' ') {temp1[2] = '\0';}
    cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
    if (temp2[2] == ' ') {temp2[2] = '\0';}
    int srclen = len;
    fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d,  %s=%d %s=%d "
                   "top/KB=%d mean/KB=%d del=%d%% reli=%d%%   "
                   "lang/lscript %d %d\n",
           chunk_tote->GetGramCount(),
           chunk_tote->GetIncrCount(),
           srclen,
           temp1, chunk_tote->Value(0),
           temp2, chunk_tote->Value(1),
           (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
           mean_score,
           reliability_delta,
           reliability_main,
           cur_lang, lscript);
  }

  return reliability_min;
}


//------------------------------------------------------------------------------
// Miscellaneous
//------------------------------------------------------------------------------

// Demote all languages except Top40 and plus_one
// Do this just before sorting chunk_tote results
void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
  for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
    if (chunk_tote->Key(sub) == 0) continue;
    if (chunk_tote->Key(sub) == packed_plus_one) continue;
    if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
    // Quarter the score of others
    chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
  }
}

/* [<][>][^][v][top][bottom][index][help] */