root/third_party/cld/encodings/compact_lang_det/subsetsequence.cc

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. DumpInts
  2. DumpUint8s
  3. Median3
  4. Init
  5. NewLevel
  6. DoCarries
  7. Add
  8. Flush
  9. Extract

// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Remember a subset of a sequence of values, using a modest amount of memory

/***
  Design:
  Accumulate in powers of three, using 3-way median to collapse entries.
  At any given time, there is one most-dense (highest power of 3) range of
  entries and a series of less-dense ranges that hold 0..2 entries each. There
  is a bounded-size storage array of S cells for all the entries.

  The overflow detect is set up so that a new higher power of 3, K+1, is
  triggered precisely when range K has 3n entries and all ranges < K have
  zero entries.

  In general, think of the range sizes as a multi-digit base 3 number, except
  the highest digit may exceed 2:

  3**6  3**5  3**4  3**3  3**2  3**1  3**0  K=2
    0     0     0     0   3n-1    2     2   unused:1

  There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
  one more than that, and we add a new 3**0 entry and "carry" by performing
  medians on any group of 3 elements:

  3**6  3**5  3**4  3**3  3**2  3**1  3**0       K=2
    0     0     0     0   3n-1    2     3        unused:0
    0     0     0     0   3n-1    3     0 carry  unused:2
    0     0     0     0    3n     0     0 carry  unused:4

  To accumulate 2 entries at all levels < K and 3 just before the first carry at
  level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
  in this case. Since we only have 4 cells in the example above, we need to
  make room by starting a new power of three:

  3**6  3**5  3**4  3**3  3**2  3**1  3**0
    0     0     0     0    3n     0     0        K=2 unused:4
    0     0     0     n     0     0     0        K=3 unused:2n+4

  In the code below, we don't worry about overflow from the topmost place.


***/

#include "encodings/compact_lang_det/subsetsequence.h"
#include <stdio.h>

#include "encodings/compact_lang_det/win/cld_logging.h"

void DumpInts(const char* label, const int* v, int n) {
  printf("%s ", label);
  for (int i = 0; i < n; ++i) {
    printf("%d ", v[i]);
  }
  printf("\n");
}

void DumpUint8s(const char* label, const uint8* v, int n) {
  printf("%s ", label);
  for (int i = 0; i < n; ++i) {
    printf("%d ", v[i]);
  }
  printf("\n");
}

// Return median of seq_[sub] .. seq_[sub+2], favoring middle element
uint8 SubsetSequence::Median3(int sub) {
  if (seq_[sub] == seq_[sub + 1]) {
    return seq_[sub];
  }
  if (seq_[sub] == seq_[sub + 2]) {
    return seq_[sub];
  }
  return seq_[sub + 1];
}

void SubsetSequence::Init() {
  // printf("Init\n");

  k_ = 0;
  count_[0] = 0;
  next_e_ = 0;
  seq_[0] = 0;    // Default value if no calls to Add

  // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
  int reserve = (2 * k_ + 1);
  level_limit_e_ = kMaxSeq_ - reserve;
  level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3
  limit_e_ = level_limit_e_;
}

// Compress level k by 3x, creating level k+1
void SubsetSequence::NewLevel() {
  // printf("NewLevel 3 ** %d\n", k_ + 1);
  //DumpUint8s("count[k]", count_, k_ + 1);
  //DumpUint8s("seq[next]", seq_, next_e_);

  // Incoming level must be an exact multiple of three in size
  CHECK((count_[k_] % 3) == 0);
  int k_size = count_[k_];
  int new_size = k_size / 3;

  // Compress down by 3x, via median
  for (int j = 0; j < new_size; ++j) {
    seq_[j] = Median3(j * 3);
  }

  // Update counts
  count_[k_] = 0;
  // Else Overflow -- just continue with 3x dense Level K
  if (k_ < (kMaxLevel_ - 1)) {++k_;}
  count_[k_] = new_size;

  // Update limits
  next_e_ = new_size;
  limit_e_ = next_e_ + 3;

  // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
  int reserve = (2 * k_ + 1);
  level_limit_e_ = kMaxSeq_ - reserve;
  level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3
                                              //
  //DumpUint8s("after: count[k]", count_, k_ + 1);
  //DumpUint8s("after: seq[next]", seq_, next_e_);
}

void SubsetSequence::DoCarries() {
  CHECK(count_[k_] > 3);    // We depend on count_[k_] being > 3 to stop while
  // Make room by carrying

  //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
  //DumpUint8s("DoCarries seq[next]", seq_, next_e_);

  int i = 0;
  while (count_[i] == 3) {
    next_e_ -= 3;
    seq_[next_e_] = Median3(next_e_);
    ++next_e_;
    count_[i] = 0;
    ++count_[i + 1];
    ++i;
  }
  limit_e_ = next_e_ + 3;

  //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
  //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);

  // If we just fully carried into level K,
  // Make sure there is now enough room, else start level K + 1
  if (i >= k_) {
    CHECK(count_[k_] == next_e_);
    if (next_e_ >= level_limit_e_) {
      NewLevel();
    }
  }
}

void SubsetSequence::Add(uint8 e) {
  // Add an entry then carry as needed
  seq_[next_e_] = e;
  ++next_e_;
  ++count_[0];

  if (next_e_ >= limit_e_) {
    DoCarries();
  }
}


// Collapse tail end by simple median across disparate-weight values,
// dropping or duplicating last value if need be.
// This routine is idempotent.
void SubsetSequence::Flush() {
  // printf("Flush %d\n", count_[k_]);
  int start_tail = count_[k_];
  int size_tail = next_e_ - start_tail;
  if ((size_tail % 3) == 2) {
    seq_[next_e_] = seq_[next_e_ - 1];      // Duplicate last value
    ++size_tail;
  }

  // Compress tail down by 3x, via median
  int new_size = size_tail / 3;             // May delete last value
  for (int j = 0; j < new_size; ++j) {
    seq_[start_tail + j] = Median3(start_tail + j * 3);
  }

  next_e_ = start_tail + new_size;
  count_[k_] = next_e_;
}


// Extract representative pattern of exactly N values into dst[0..n-1]
// This routine may be called multiple times, but it may downsample as a
// side effect, causing subsequent calls with larger N to get poor answers.
void SubsetSequence::Extract(int to_n, uint8* dst) {
  // Collapse partial-carries in tail
  Flush();

  // Just use Bresenham to resample
  int from_n = next_e_;
  if (to_n >= from_n) {
    // Up-sample from_n => to_n
    int err = to_n - 1;          // bias toward no overshoot
    int j = 0;
    for (int i = 0; i < to_n; ++i) {
      dst[i] = seq_[j];
      err -= from_n;
      if (err < 0) {
        ++j;
        err += to_n;
      }
    }
  } else {
    // Get to the point that the number of samples is <= 3 * to_n
    while (next_e_ > (to_n * 3)) {
      // Compress down by 3x, via median
      // printf("Extract, median %d / 3\n", next_e_);
      if ((next_e_ % 3) == 2) {
        seq_[next_e_] = seq_[next_e_ - 1];    // Duplicate last value
        ++next_e_;
      }
      int new_size = next_e_ / 3;             // May delete last value
      for (int j = 0; j < new_size; ++j) {
        seq_[j] = Median3(j * 3);
      }
      next_e_ = new_size;
      count_[k_] = next_e_;
    }
    from_n = next_e_;

    if (to_n == from_n) {
      // Copy verbatim
      for (int i = 0; i < to_n; ++i) {
        dst[i] = seq_[i];
      }
      return;
    }

    // Down-sample from_n => to_n, using medians
    int err = 0;          // Bias to immediate median sample
    int j = 0;
    for (int i = 0; i < from_n; ++i) {
      err -= to_n;
      if (err < 0) {
        if (i <= (next_e_ - 2)) {
          dst[j] = Median3(i);
        } else {
          dst[j] = seq_[i];
        }
        ++j;
        err += from_n;
      }
    }
  }

}

/* [<][>][^][v][top][bottom][index][help] */