root/third_party/cld/encodings/compact_lang_det/compact_lang_det.cc

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. DetectLanguage
  2. DetectLanguageSummary
  3. DetectLanguageSummary
  4. ExtDetectLanguageSummary
  5. ExtDetectLanguageSummary
  6. ExtDetectLanguageSummary
  7. DetectLanguageVersion

// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "encodings/compact_lang_det/compact_lang_det.h"
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
#include "encodings/compact_lang_det/win/cld_basictypes.h"

// Version text handed out by DetectLanguageVersion().
// Format is "code_version - data_scrape_date".
// Held as a const character array (not a re-assignable pointer) so the name
// always refers to this exact string; it still decays to const char* wherever
// a pointer is expected.
static const char kDetectLanguageVersion[] = "V1.6 - 20081121";

// Large-table version for all ~160 languages (all Tiers)

// Scan interchange-valid UTF-8 bytes and return the single most likely
// language.
//
// No TLD, encoding or language hint is supplied, and extended languages are
// not allowed. The top-3 summary arguments required by the implementation are
// pointed at local scratch storage and discarded on return; only the returned
// language and the is_reliable pointer are visible to the caller.
//
// A result of UNKNOWN_LANGUAGE is reported as ENGLISH.
Language CompactLangDet::DetectLanguage(
    const DetectionTables* tables,
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    bool* is_reliable) {
  // Neutral hints: nothing is boosted.
  const char* no_tld_hint = "";
  int no_encoding_hint = UNKNOWN_ENCODING;
  Language no_language_hint = UNKNOWN_LANGUAGE;

  // Fixed settings for this entry point.
  bool extended_ok = false;
  int detect_flags = 0;
  Language plus_one_lang = UNKNOWN_LANGUAGE;

  // Scratch space for summary values this entry point does not expose.
  Language scratch_languages[3];
  int scratch_percents[3];
  double scratch_scores[3];
  int scratch_text_bytes;

  Language detected = CompactLangDetImpl::DetectLanguageSummaryV25(
      tables, buffer, buffer_length, is_plain_text,
      no_tld_hint, no_encoding_hint, no_language_hint,
      extended_ok, detect_flags, plus_one_lang,
      scratch_languages, scratch_percents, scratch_scores,
      &scratch_text_bytes, is_reliable);

  // Default to English.
  if (detected == UNKNOWN_LANGUAGE) {
    return ENGLISH;
  }
  return detected;
}

// Scan interchange-valid UTF-8 bytes and detect the list of top 3 languages.
//
// No TLD, encoding or language hint is supplied, and extended languages are
// not allowed. language3, percent3, text_bytes and is_reliable are forwarded
// unchanged to the implementation; the normalized scores it also takes are
// pointed at a local array and discarded.
// (Presumably language3 and percent3 must each have room for 3 entries --
// confirm against the header.)
//
// A result of UNKNOWN_LANGUAGE is reported as ENGLISH.
Language CompactLangDet::DetectLanguageSummary(
    const DetectionTables* tables,
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // Neutral hints: nothing is boosted.
  const char* no_tld_hint = "";
  int no_encoding_hint = UNKNOWN_ENCODING;
  Language no_language_hint = UNKNOWN_LANGUAGE;

  // Fixed settings for this entry point.
  bool extended_ok = false;
  int detect_flags = 0;
  Language plus_one_lang = UNKNOWN_LANGUAGE;

  // Scores are not part of this overload's interface.
  double scratch_scores[3];

  Language detected = CompactLangDetImpl::DetectLanguageSummaryV25(
      tables, buffer, buffer_length, is_plain_text,
      no_tld_hint, no_encoding_hint, no_language_hint,
      extended_ok, detect_flags, plus_one_lang,
      language3, percent3, scratch_scores,
      text_bytes, is_reliable);

  // Default to English.
  if (detected == UNKNOWN_LANGUAGE) {
    return ENGLISH;
  }
  return detected;
}

// Scan interchange-valid UTF-8 bytes and detect the list of top 3 languages,
// with caller-supplied hints:
//   tld_hint       e.g. "id" boosts Indonesian
//   encoding_hint  e.g. SJS boosts Japanese
//   language_hint  e.g. ITALIAN boosts it
//
// Extended languages are not allowed. language3, percent3, text_bytes and
// is_reliable are forwarded unchanged to the implementation; the normalized
// scores it also takes are pointed at a local array and discarded.
//
// A result of UNKNOWN_LANGUAGE is reported as ENGLISH.
Language CompactLangDet::DetectLanguageSummary(
    const DetectionTables* tables,
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,
    int encoding_hint,
    Language language_hint,
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // Fixed settings for this entry point.
  bool extended_ok = false;
  int detect_flags = 0;
  Language plus_one_lang = UNKNOWN_LANGUAGE;

  // Scores are not part of this overload's interface.
  double scratch_scores[3];

  Language detected = CompactLangDetImpl::DetectLanguageSummaryV25(
      tables, buffer, buffer_length, is_plain_text,
      tld_hint, encoding_hint, language_hint,
      extended_ok, detect_flags, plus_one_lang,
      language3, percent3, scratch_scores,
      text_bytes, is_reliable);

  // Default to English.
  if (detected == UNKNOWN_LANGUAGE) {
    return ENGLISH;
  }
  return detected;
}


// Scan interchange-valid UTF-8 bytes and detect the list of top 3 extended
// languages. Extended languages are additional Google interface languages and
// Unicode single-language scripts, from ext_lang_enc.h.
//
// No TLD, encoding or language hint is supplied. language3, percent3,
// text_bytes and is_reliable are forwarded unchanged to the implementation;
// the normalized scores it also takes are pointed at a local array and
// discarded.
//
// Unlike the non-extended entry points, the implementation's result is
// returned as-is: UNKNOWN_LANGUAGE is not replaced by ENGLISH.
Language CompactLangDet::ExtDetectLanguageSummary(
    const DetectionTables* tables,
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // Neutral hints: nothing is boosted.
  const char* no_tld_hint = "";
  int no_encoding_hint = UNKNOWN_ENCODING;
  Language no_language_hint = UNKNOWN_LANGUAGE;

  // Fixed settings for this entry point.
  bool extended_ok = true;
  int detect_flags = 0;
  Language plus_one_lang = UNKNOWN_LANGUAGE;

  // Scores are not part of this overload's interface.
  double scratch_scores[3];

  // Do not default to English.
  return CompactLangDetImpl::DetectLanguageSummaryV25(
      tables, buffer, buffer_length, is_plain_text,
      no_tld_hint, no_encoding_hint, no_language_hint,
      extended_ok, detect_flags, plus_one_lang,
      language3, percent3, scratch_scores,
      text_bytes, is_reliable);
}

// Scan interchange-valid UTF-8 bytes and detect the list of top 3 extended
// languages, with caller-supplied hints:
//   tld_hint       e.g. "id" boosts Indonesian
//   encoding_hint  e.g. SJS boosts Japanese
//   language_hint  e.g. ITALIAN boosts it
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h.
//
// language3, percent3, text_bytes and is_reliable are forwarded unchanged to
// the implementation; the normalized scores it also takes are pointed at a
// local array and discarded.
//
// The implementation's result is returned as-is: UNKNOWN_LANGUAGE is not
// replaced by ENGLISH.
Language CompactLangDet::ExtDetectLanguageSummary(
    const DetectionTables* tables,
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,
    int encoding_hint,
    Language language_hint,
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // Fixed settings for this entry point.
  bool extended_ok = true;
  int detect_flags = 0;
  Language plus_one_lang = UNKNOWN_LANGUAGE;

  // Scores are not part of this overload's interface.
  double scratch_scores[3];

  // Do not default to English.
  return CompactLangDetImpl::DetectLanguageSummaryV25(
      tables, buffer, buffer_length, is_plain_text,
      tld_hint, encoding_hint, language_hint,
      extended_ok, detect_flags, plus_one_lang,
      language3, percent3, scratch_scores,
      text_bytes, is_reliable);
}

// Scan interchange-valid UTF-8 bytes and detect the list of top 3 extended
// languages, with caller-supplied hints (tld_hint: "id" boosts Indonesian;
// encoding_hint: SJS boosts Japanese; language_hint: ITALIAN boosts it).
//
// This overload additionally exposes normalized_score3: internal language
// scores expressed as a ratio to the normal score for real text in that
// language. Scores close to 1.0 indicate normal text, while scores far away
// from 1.0 indicate badly-skewed text or gibberish.
//
// All output pointers are forwarded unchanged to the implementation, and its
// result is returned as-is: UNKNOWN_LANGUAGE is not replaced by ENGLISH.
Language CompactLangDet::ExtDetectLanguageSummary(
    const DetectionTables* tables,
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,
    int encoding_hint,
    Language language_hint,
    Language* language3,
    int* percent3,
    double* normalized_score3,
    int* text_bytes,
    bool* is_reliable) {
  // Fixed settings for this entry point.
  bool extended_ok = true;
  int detect_flags = 0;
  Language plus_one_lang = UNKNOWN_LANGUAGE;

  // Do not default to English.
  return CompactLangDetImpl::DetectLanguageSummaryV25(
      tables, buffer, buffer_length, is_plain_text,
      tld_hint, encoding_hint, language_hint,
      extended_ok, detect_flags, plus_one_lang,
      language3, percent3, normalized_score3,
      text_bytes, is_reliable);
}



// Return the detector's version text.
// The string has the form "code_version - data_scrape_date" and is the
// file-level kDetectLanguageVersion constant; the caller does not own it.
const char* CompactLangDet::DetectLanguageVersion() {
  const char* const version_text = kDetectLanguageVersion;
  return version_text;
}


/* [<][>][^][v][top][bottom][index][help] */