// root/third_party/cld/encodings/compact_lang_det/getonescriptspan.h

/* [<][>][^][v][top][bottom][index][help] */

// INCLUDED FROM


// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_

#include "encodings/compact_lang_det/letterscript_enum.h"
#include "encodings/compact_lang_det/compact_lang_det_impl.h"

namespace getone {
  // Buffer-size limits, in bytes.
  // NOTE(review): the names suggest these size the scanner buffers declared
  // later in this header -- confirm against the implementation file.
  static const int kMaxScriptBuffer = 4096;
  // One and a half times kMaxScriptBuffer.
  static const int kMaxScriptLowerBuffer =
      kMaxScriptBuffer + (kMaxScriptBuffer / 2);
  // Eight bytes short of the full script buffer, to leave some room.
  static const int kMaxScriptBytes = kMaxScriptBuffer - 8;
  static const int kMaxAnswerBuffer = 256;

  // Short alias for the UnicodeLScript enum; used for LangSpan::script.
  // NOTE(review): UnicodeLScript is presumably declared in
  // letterscript_enum.h, included above -- confirm.
  typedef enum UnicodeLScript ULScript;

  // Describes one span of text plus what is known about it (script,
  // language, position).  The scanner classes declared later in this header
  // take and return spans through LangSpan* parameters.
  typedef struct {
    char* text;             // Pointer to the span's text; which buffer it
                            // points into is not fixed by this struct
    int text_bytes;         // Number of bytes of text in the span
    int offset;             // Offset of start of span in original input buffer
    ULScript script;        // Script of all the letters in this span
    Language lang;          // Language identified for this span
    bool truncated;         // true if buffer filled up before a
                            // different script or EOF was found
  } LangSpan;


  // Reports whether c is a UTF-8 continuation (trail) byte, i.e. a byte whose
  // two high bits are 10 (values 0x80 through 0xBF).
  static inline bool IsContinuationByte(char c) {
    const unsigned char byte = static_cast<unsigned char>(c);
    return (byte & 0xC0) == 0x80;
  }

  // Returns the letter-script (lscript) number for the text at src.
  // Letters yield their script number; non-letters always yield
  // 0 (common script).
  // NOTE(review): the name implies src points at UTF-8 text; how many bytes
  // are read is not visible in this header -- confirm in the implementation.
  int GetUTF8LetterScriptNum(const char* src);


  // Returns a pointer to the next quadgram, which lies 2 to 5 bytes past
  // src.  The argument is passed by value, so the caller must use the
  // returned pointer to advance.
  // Looks at src[0..4]
  // NOTE(review): presumably those five bytes must therefore be readable --
  // confirm against the callers.
  const char* AdvanceQuad(const char* src);
}     // end namespace getone






// Scans an input buffer and hands back runs of same-script letters, one
// LangSpan per call.
class ScriptScanner {
 public:
  // buffer and buffer_length describe the input text to scan.
  // NOTE(review): is_plain_text presumably switches off the tag/entity
  // handling mentioned in the comments below -- confirm in the
  // implementation file.
  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
  ~ScriptScanner();

  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  // NOTE(review): the bool result presumably reports whether a span was
  // produced -- confirm.
  bool GetOneScriptSpan(getone::LangSpan* span);

  // Force Latin and Cyrillic scripts to be lowercase
  void LowerScriptSpan(getone::LangSpan* span);

  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  // Force Latin and Cyrillic scripts to be lowercase
  bool GetOneScriptSpanLower(getone::LangSpan* span);

 private:
  // Not copyable: declared but intentionally left undefined.
  // NOTE(review): the user-declared destructor together with the raw
  // script_buffer_ / script_buffer_lower_ pointers suggests this class owns
  // those buffers, in which case an implicit member-wise copy would release
  // them twice -- confirm ownership in the implementation file.
  ScriptScanner(const ScriptScanner&);
  ScriptScanner& operator=(const ScriptScanner&);

  int SkipToFrontOfSpan(const char* src, int len, int* script);

  // Positions within the input text.
  const char* start_byte_;
  const char* next_byte_;
  const char* next_byte_limit_;
  int byte_length_;
  bool is_plain_text_;
  char* script_buffer_;           // Holds text with expanded entities
  char* script_buffer_lower_;     // Holds lowercased text
};


// Produces language spans (see the GetOne*LangSpan methods) from the text
// supplied at construction or through NewText(), keeping smoothing state
// between calls.
class LangScanner {
 public:
  // NOTE(review): smoothwidth, smoothcandidates, maxlangs and minlangspan
  // presumably initialise the like-named private members below -- confirm in
  // the implementation file.
  LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
              getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
              int maxlangs, int minlangspan);
  ~LangScanner();


  // Returns the stored script number.  Declared const so it can also be
  // called through a const LangScanner.
  int script() const { return script_; }

  // Use new text
  // Keep smoothing state if same script, otherwise reinit smoothing
  void NewText(getone::LangSpan* spn);

  bool GetOneShortLangSpanBoot(getone::LangSpan* span);  // Just for bootstrapping
  bool GetOneLangSpanBoot(getone::LangSpan* span);       // Just for bootstrapping

  // The real ones
  bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
                           getone::LangSpan* span);
  bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
                      getone::LangSpan* span);

  // Increases language bias by delta
  void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
                       Language key, int delta);

  // For debugging output
  int next_answer_;
  char answer_buffer_[getone::kMaxAnswerBuffer];
  char answer_buffer2_[getone::kMaxAnswerBuffer];
  char answer_buffer3_[getone::kMaxAnswerBuffer];
  char answer_buffer4_[getone::kMaxAnswerBuffer];

 private:
  // Not copyable: declared but intentionally left undefined.
  // NOTE(review): the user-declared destructor together with the raw rb_ /
  // offset_rb_ pointers suggests this class owns those arrays, in which case
  // an implicit member-wise copy would release them twice -- confirm
  // ownership in the implementation file.
  LangScanner(const LangScanner&);
  LangScanner& operator=(const LangScanner&);

  // Positions within the text being scanned.
  const char* start_byte_;
  const char* next_byte_limit_;
  const char* next_byte_;
  const char* onelangspan_begin_;
  int byte_length_;
  int script_;
  Language spanlang_;
  int smoothwidth_;
  int smoothwidth_2_;
  int smoothcandidates_;
  int maxlangs_;
  int minlangspan_;
  // NOTE(review): the rb_ prefix presumably stands for "ring buffer"
  // (size, next slot, index mask, storage) -- confirm.
  int rb_size_;
  int next_rb_;
  int rb_mask_;
  uint32* rb_;
  int* offset_rb_;
};

#endif  // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_

/* [<][>][^][v][top][bottom][index][help] */