root/third_party/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. InStateZero
  2. UTF8GenericProperty
  3. UTF8GenericPropertyBigOneByte
  4. UTF8GenericScan

// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "encodings/compact_lang_det/win/cld_utf8statetable.h"

#include "base/basictypes.h"

// Return true if current Tbl pointer is within state0 range
// Note that unsigned compare checks both ends of range simultaneously
static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
  const uint8* Tbl0 = &st->state_table[st->state0];
  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
}


// Returns the property byte for the UTF-8 character at *src and steps
// *src / *srclen past that character.
//  - *srclen <= 0: returns 0, nothing is advanced.
//  - Lead byte matching no 1..4 byte pattern, or a character that would run
//    past *srclen: returns 0 and advances exactly one byte.
// Only the lead byte and the remaining length are checked; trailing bytes are
// fed to the state table as-is.
uint8 UTF8GenericProperty(const UTF8PropObj* st,
                          const uint8** src,
                          int* srclen) {
  const int remaining = *srclen;
  if (remaining <= 0) {
    return 0;
  }

  const uint8* const bytes = *src;
  const uint8* const base = &st->state_table[st->state0];
  const int shift = st->entry_shift;
  const unsigned char lead = bytes[0];

  // Classify by lead byte; 0 means ill-formed or truncated.
  // The 7-bit ASCII test comes first because it is the common case.
  int char_len;
  if (lead < 0x80) {
    char_len = 1;
  } else if (((lead & 0xe0) == 0xc0) && (remaining >= 2)) {
    char_len = 2;
  } else if (((lead & 0xf0) == 0xe0) && (remaining >= 3)) {
    char_len = 3;
  } else if (((lead & 0xf8) == 0xf0) && (remaining >= 4)) {
    char_len = 4;
  } else {
    char_len = 0;
  }

  int prop = 0;
  int advance = 1;
  if (char_len != 0) {
    // The lead byte indexes state0; every following byte indexes the state
    // selected by the previous entry. The last entry read is the property.
    prop = base[lead];
    for (int i = 1; i < char_len; ++i) {
      prop = base[(prop << shift) + bytes[i]];
    }
    advance = char_len;
  }

  *src += advance;
  *srclen -= advance;
  return static_cast<uint8>(prop);
}

// BigOneByte versions are needed for tables > 240 states, but most
// won't need the TwoByte versions.
// Internally, the next-to-last offset is multiplied by 16 and the last
// offset is relative instead of absolute.
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
//   st      property tables (state_table, state0, entry_shift are read here)
//   src     in/out: pointer to the current byte; advanced past the character
//   srclen  in/out: bytes remaining; reduced by the number of bytes advanced
// Only the lead byte and the remaining length are checked; trailing bytes are
// fed to the state table as-is.
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
                          const uint8** src,
                          int* srclen) {
  if (*srclen <= 0) {
    return 0;
  }

  const uint8* lsrc = *src;
  const uint8* Tbl_0 = &st->state_table[st->state0];
  const uint8* Tbl = Tbl_0;
  int e;
  const int eshift = st->entry_shift;
  // Number of entries per state. The relative step below multiplies by this
  // instead of shifting, because the signed entry may be negative and
  // left-shifting a negative int is undefined behavior before C++20.
  const int state_stride = 1 << eshift;

  // Short series of tests faster than switch, optimizes 7-bit ASCII
  unsigned char c = lsrc[0];
  if (static_cast<signed char>(c) >= 0) {           // one byte
    e = Tbl[c];
    *src += 1;
    *srclen -= 1;
  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
    e = Tbl[c];
    Tbl = &Tbl_0[e << eshift];
    e = Tbl[lsrc[1]];
    *src += 2;
    *srclen -= 2;
  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
    e = Tbl[c];
    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
    // Entry is read as signed: it is an offset relative to the current state
    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
    Tbl = Tbl + e * state_stride;     // Relative +/-
    e = Tbl[lsrc[2]];
    *src += 3;
    *srclen -= 3;
  } else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
    e = Tbl[c];
    Tbl = &Tbl_0[e << eshift];
    e = Tbl[lsrc[1]];
    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
    // Entry is read as signed: it is an offset relative to the current state
    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
    Tbl = Tbl + e * state_stride;     // Relative +/-
    e = Tbl[lsrc[3]];
    *src += 4;
    *srclen -= 4;
  } else {                                                // Ill-formed
    e = 0;
    *src += 1;
    *srclen -= 1;
  }
  return static_cast<uint8>(e);
}

// Scan a UTF-8 byte string based on a state table.
// Always scan complete UTF-8 characters: when the scan stops on an exit code
// or on a truncated final character, the position is backed up to the start
// of the offending character.
//   st              scan tables (state table, fast-scan table, range params)
//   str, len        input bytes and their count
//   bytes_consumed  out: number of bytes accepted before stopping
// Returns kExitOK when the whole input was consumed while in state0,
// kExitIllegalStructure when the input ends inside a character, or the
// exit code (>= kExitIllegalStructure) read from the state table.
int UTF8GenericScan(const UTF8ScanObj* st,
                    const uint8* str,
                    const int len,
                    int* bytes_consumed) {
  *bytes_consumed = 0;
  // Empty (or negative-length) input: nothing to scan. Returning here keeps
  // a non-positive len out of the pointer arithmetic below.
  if (len <= 0) return kExitOK;

  int eshift = st->entry_shift;        // 6 (space optimized) or 8

  const uint8* isrc = str;
  const uint8* src = isrc;
  const uint8* srclimit = isrc + len;
  // The fast loop reads 8 bytes at a time, so it may only start while at
  // least 8 bytes remain. For inputs shorter than 7 bytes, "srclimit - 7"
  // would point before the buffer (undefined pointer arithmetic), so clamp
  // to the start; the fast loop is then skipped entirely.
  const uint8* srclimit8 = (len < 7) ? isrc : srclimit - 7;

  const uint8* Tbl_0 = &st->state_table[st->state0];

DoAgain:
  // Do state-table scan
  int e = 0;
  uint8 c;

  // Do fast for groups of 8 identity bytes.
  // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
  // including slowing slightly on cr/lf/ht
  //----------------------------
  const uint8* Tbl2 = &st->fast_state[0];
  uint32 losub = st->losub;
  uint32 hiadd = st->hiadd;
  while (src < srclimit8) {
    uint32 s0123 = UnalignedLoad32(src);
    uint32 s4567 = UnalignedLoad32(src + 4);
    src += 8;
    // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
    uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
                  (s4567 - losub) | (s4567 + hiadd);
    if ((temp & 0x80808080) != 0) {
      // We typically end up here on cr/lf/ht; src was incremented
      int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
                  (Tbl2[src[-6]] | Tbl2[src[-5]]);
      if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange
      e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
              (Tbl2[src[-2]] | Tbl2[src[-1]]);
      if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange
      // Else OK, go around again
    }
  }
  //----------------------------

  // Byte-at-a-time scan
  //----------------------------
  const uint8* Tbl = Tbl_0;
  while (src < srclimit) {
    c = *src;
    e = Tbl[c];
    src++;
    if (e >= kExitIllegalStructure) {break;}
    Tbl = &Tbl_0[e << eshift];
  }
  //----------------------------


  // Exit possibilities:
  //  Some exit code, !state0, back up over last char
  //  Some exit code, state0, back up one byte exactly
  //  source consumed, !state0, back up over partial char
  //  source consumed, state0, exit OK
  // For illegal byte in state0, avoid backing up over PREVIOUS char
  // For truncated last char, back up to beginning of it

  if (e >= kExitIllegalStructure) {
    // Back up over exactly one byte of rejected/illegal UTF-8 character
    src--;
    // Back up more if needed
    if (!InStateZero(st, Tbl)) {
      // Skip back over continuation bytes (10xxxxxx) to the lead byte
      do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
    }
  } else if (!InStateZero(st, Tbl)) {
    // Back up over truncated UTF-8 character
    e = kExitIllegalStructure;
    do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
  } else {
    // Normal termination, source fully consumed
    e = kExitOK;
  }

  if (e == kExitDoAgain) {
    // Loop back up to the fast scan
    // NOTE(review): src has been backed up to the rejected character, so the
    // rescan restarts there from state0; presumably the tables guarantee
    // forward progress in that case -- confirm.
    goto DoAgain;
  }

  *bytes_consumed = static_cast<int>(src - isrc);
  return e;
}

/* [<][>][^][v][top][bottom][index][help] */