root/third_party/cld/encodings/compact_lang_det/win/cld_utf8statetable.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_

#if !defined(CLD_WINDOWS)

#include "util/utf8/utf8statetable.h"

#else

#include "encodings/compact_lang_det/win/cld_basictypes.h"

// These four-byte entries compactly encode how many bytes 0..255 to delete
// in making a string replacement, how many bytes to add 0..255, and the offset
// 0..64k-1 of the replacement string in remap_string.
struct RemapEntry {
  uint8 delete_bytes;
  uint8 add_bytes;
  uint16 bytes_offset;
};

// Exit type codes for state tables. All but the first get stuffed into
// signed one-byte entries. The first is only generated by executable code.
// To distinguish from next-state entries, these must be contiguous and
// all <= kExitNone
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure,  // 240
  kExitOK,                // 241
  kExitReject,            // ...
  kExitReplace1,
  kExitReplace2,
  kExitReplace3,
  kExitReplace21,
  kExitReplace31,
  kExitReplace32,
  kExitReplaceOffset1,
  kExitReplaceOffset2,
  kExitReplace1S0,
  kExitSpecial,
  kExitDoAgain,
  kExitRejectAlt,
  kExitNone               // 255
} ExitReason;

typedef enum {
  kExitDstSpaceFull_2 = -32769,
  kExitIllegalStructure_2,  // -32768
  kExitOK_2,                // -32767
  kExitReject_2,            // ...
  kExitReplace1_2,
  kExitReplace2_2,
  kExitReplace3_2,
  kExitReplace21_2,
  kExitReplace31_2,
  kExitReplace32_2,
  kExitReplaceOffset1_2,
  kExitReplaceOffset2_2,
  kExitReplace1S0_2,
  kExitSpecial_2,
  kExitDoAgain_2,
  kExitRejectAlt_2,
  kExitNone_2               // -32753
} ExitReason_2;

// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
typedef struct {
  const uint32 state0;
  const uint32 state0_size;
  const uint32 total_size;
  const int max_expand;
  const int entry_shift;
  const int bytes_per_entry;
  const uint32 losub;
  const uint32 hiadd;
  const uint8* state_table;
  const RemapEntry* remap_base;
  const uint8* remap_string;
  const uint8* fast_state;
} UTF8StateMachineObj;

// Near-duplicate declaration for tables with two-byte entries
typedef struct {
  const uint32 state0;
  const uint32 state0_size;
  const uint32 total_size;
  const int max_expand;
  const int entry_shift;
  const int bytes_per_entry;
  const uint32 losub;
  const uint32 hiadd;
  const signed short* state_table;
  const RemapEntry* remap_base;
  const uint8* remap_string;
  const uint8* fast_state;
} UTF8StateMachineObj_2;


typedef UTF8StateMachineObj UTF8PropObj;
typedef UTF8StateMachineObj UTF8ScanObj;
typedef UTF8StateMachineObj_2 UTF8PropObj_2;


// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericProperty(const UTF8PropObj* st,
                          const uint8** src,
                          int* srclen);

// BigOneByte versions are needed for tables > 240 states, but most
// won't need the TwoByte versions.

// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
                          const uint8** src,
                          int* srclen);

// Scan a UTF-8 stringpiece based on a state table.
// Always scan complete UTF-8 characters
// Set number of bytes scanned. Return reason for exiting
int UTF8GenericScan(const UTF8ScanObj* st,
                    const uint8* str,
                    const int len,
                    int* bytes_consumed);

#endif

#endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_

/* [<][>][^][v][top][bottom][index][help] */