root/sync/internal_api/public/base/unique_position.cc

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. IsValidSuffix
  2. IsValidBytes
  3. CreateInvalid
  4. FromProto
  5. FromInt64
  6. InitialPosition
  7. Before
  8. After
  9. Between
  10. LessThan
  11. Equals
  12. ToProto
  13. SerializeToString
  14. ToInt64
  15. IsValid
  16. ToDebugString
  17. GetSuffixForTest
  18. FindSmallerWithSuffix
  19. FindGreaterWithSuffix
  20. FindBetweenWithSuffix
  21. is_valid_
  22. is_valid_
  23. WriteEncodedRunLength
  24. ReadEncodedRunLength
  25. IsRepeatedCharPrefix
  26. Compress
  27. CompressImpl
  28. Uncompress
  29. IsValidCompressed

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "sync/internal_api/public/base/unique_position.h"

#include "base/basictypes.h"
#include "base/logging.h"
#include "base/stl_util.h"
#include "base/strings/string_number_conversions.h"
#include "sync/protocol/unique_position.pb.h"
#include "third_party/zlib/zlib.h"

namespace syncer {

const size_t UniquePosition::kSuffixLength = 28;
const size_t UniquePosition::kCompressBytesThreshold = 128;

// static.
bool UniquePosition::IsValidSuffix(const std::string& suffix) {
  // The suffix must be exactly the specified length, otherwise unique suffixes
  // are not sufficient to guarantee unique positions (because prefix + suffix
  // == p + refixsuffix).
  return suffix.length() == kSuffixLength;
}

// static.
bool UniquePosition::IsValidBytes(const std::string& bytes) {
  // The first condition ensures that our suffix uniqueness is sufficient to
  // guarantee position uniqueness.  Otherwise, it's possible the end of some
  // prefix + some short suffix == some long suffix.
  // The second condition ensures that FindSmallerWithSuffix can always return a
  // result.
  return bytes.length() >= kSuffixLength
      && bytes[bytes.length()-1] != 0;
}

// static.
UniquePosition UniquePosition::CreateInvalid() {
  UniquePosition pos;
  DCHECK(!pos.IsValid());
  return pos;
}

// static.
UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) {
  if (proto.has_custom_compressed_v1()) {
    return UniquePosition(proto.custom_compressed_v1());
  } else if (proto.has_value() && !proto.value().empty()) {
    return UniquePosition(Compress(proto.value()));
  } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) {
    uLongf uncompressed_len = proto.uncompressed_length();
    std::string un_gzipped;

    un_gzipped.resize(uncompressed_len);
    int result = uncompress(
        reinterpret_cast<Bytef*>(string_as_array(&un_gzipped)),
        &uncompressed_len,
        reinterpret_cast<const Bytef*>(proto.compressed_value().data()),
        proto.compressed_value().size());
    if (result != Z_OK) {
      DLOG(ERROR) << "Unzip failed " << result;
      return UniquePosition::CreateInvalid();
    }
    if (uncompressed_len != proto.uncompressed_length()) {
      DLOG(ERROR)
          << "Uncompressed length " << uncompressed_len
          << " did not match specified length " << proto.uncompressed_length();
      return UniquePosition::CreateInvalid();
    }
    return UniquePosition(Compress(un_gzipped));
  } else {
    return UniquePosition::CreateInvalid();
  }
}

// static.
UniquePosition UniquePosition::FromInt64(
    int64 x, const std::string& suffix) {
  uint64 y = static_cast<uint64>(x);
  y ^= 0x8000000000000000ULL; // Make it non-negative.
  std::string bytes(8, 0);
  for (int i = 7; i >= 0; --i) {
    bytes[i] = static_cast<uint8>(y);
    y >>= 8;
  }
  return UniquePosition(bytes + suffix, suffix);
}

// static.
UniquePosition UniquePosition::InitialPosition(
    const std::string& suffix) {
  DCHECK(IsValidSuffix(suffix));
  return UniquePosition(suffix, suffix);
}

// static.
UniquePosition UniquePosition::Before(
    const UniquePosition& x,
    const std::string& suffix) {
  DCHECK(IsValidSuffix(suffix));
  DCHECK(x.IsValid());
  const std::string& before = FindSmallerWithSuffix(
      Uncompress(x.compressed_), suffix);
  return UniquePosition(before + suffix, suffix);
}

// static.
UniquePosition UniquePosition::After(
    const UniquePosition& x,
    const std::string& suffix) {
  DCHECK(IsValidSuffix(suffix));
  DCHECK(x.IsValid());
  const std::string& after = FindGreaterWithSuffix(
      Uncompress(x.compressed_), suffix);
  return UniquePosition(after + suffix, suffix);
}

// static.
UniquePosition UniquePosition::Between(
    const UniquePosition& before,
    const UniquePosition& after,
    const std::string& suffix) {
  DCHECK(before.IsValid());
  DCHECK(after.IsValid());
  DCHECK(before.LessThan(after));
  DCHECK(IsValidSuffix(suffix));
  const std::string& mid = FindBetweenWithSuffix(
      Uncompress(before.compressed_),
      Uncompress(after.compressed_),
      suffix);
  return UniquePosition(mid + suffix, suffix);
}

UniquePosition::UniquePosition() : is_valid_(false) {}

bool UniquePosition::LessThan(const UniquePosition& other) const {
  DCHECK(this->IsValid());
  DCHECK(other.IsValid());

  return compressed_ < other.compressed_;
}

bool UniquePosition::Equals(const UniquePosition& other) const {
  if (!this->IsValid() && !other.IsValid())
    return true;

  return compressed_ == other.compressed_;
}

void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const {
  proto->Clear();

  // This is the current preferred foramt.
  proto->set_custom_compressed_v1(compressed_);

  // Older clients used to write other formats.  We don't bother doing that
  // anymore because that form of backwards compatibility is expensive.  We no
  // longer want to pay that price just too support clients that have been
  // obsolete for a long time.  See the proto definition for details.
}

void UniquePosition::SerializeToString(std::string* blob) const {
  DCHECK(blob);
  sync_pb::UniquePosition proto;
  ToProto(&proto);
  proto.SerializeToString(blob);
}

int64 UniquePosition::ToInt64() const {
  uint64 y = 0;
  const std::string& s = Uncompress(compressed_);
  size_t l = sizeof(int64);
  if (s.length() < l) {
    NOTREACHED();
    l = s.length();
  }
  for (size_t i = 0; i < l; ++i) {
    const uint8 byte = s[l - i - 1];
    y |= static_cast<uint64>(byte) << (i * 8);
  }
  y ^= 0x8000000000000000ULL;
  // This is technically implementation-defined if y > INT64_MAX, so
  // we're assuming that we're on a twos-complement machine.
  return static_cast<int64>(y);
}

bool UniquePosition::IsValid() const {
  return is_valid_;
}

std::string UniquePosition::ToDebugString() const {
  const std::string bytes = Uncompress(compressed_);
  if (bytes.empty())
    return std::string("INVALID[]");

  std::string debug_string = base::HexEncode(bytes.data(), bytes.length());
  if (!IsValid()) {
    debug_string = "INVALID[" + debug_string + "]";
  }

  std::string compressed_string =
      base::HexEncode(compressed_.data(), compressed_.length());
  debug_string.append(", compressed: " + compressed_string);
  return debug_string;
}

std::string UniquePosition::GetSuffixForTest() const {
  const std::string bytes = Uncompress(compressed_);
  const size_t prefix_len = bytes.length() - kSuffixLength;
  return bytes.substr(prefix_len, std::string::npos);
}

std::string UniquePosition::FindSmallerWithSuffix(
    const std::string& reference,
    const std::string& suffix) {
  size_t ref_zeroes = reference.find_first_not_of('\0');
  size_t suffix_zeroes = suffix.find_first_not_of('\0');

  // Neither of our inputs are allowed to have trailing zeroes, so the following
  // must be true.
  DCHECK_NE(ref_zeroes, std::string::npos);
  DCHECK_NE(suffix_zeroes, std::string::npos);

  if (suffix_zeroes > ref_zeroes) {
    // Implies suffix < ref.
    return std::string();
  }

  if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) {
    // Prepend zeroes so the result has as many zero digits as |reference|.
    return std::string(ref_zeroes - suffix_zeroes, '\0');
  } else if (suffix_zeroes > 1) {
    // Prepend zeroes so the result has one more zero digit than |reference|.
    // We could also take the "else" branch below, but taking this branch will
    // give us a smaller result.
    return std::string(ref_zeroes - suffix_zeroes + 1, '\0');
  } else {
    // Prepend zeroes to match those in the |reference|, then something smaller
    // than the first non-zero digit in |reference|.
    char lt_digit = static_cast<uint8>(reference[ref_zeroes])/2;
    return std::string(ref_zeroes, '\0') + lt_digit;
  }
}

// static
std::string UniquePosition::FindGreaterWithSuffix(
    const std::string& reference,
    const std::string& suffix) {
  size_t ref_FFs = reference.find_first_not_of(kuint8max);
  size_t suffix_FFs = suffix.find_first_not_of(kuint8max);

  if (ref_FFs == std::string::npos) {
    ref_FFs = reference.length();
  }
  if (suffix_FFs == std::string::npos) {
    suffix_FFs = suffix.length();
  }

  if (suffix_FFs > ref_FFs) {
    // Implies suffix > reference.
    return std::string();
  }

  if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) {
    // Prepend FF digits to match those in |reference|.
    return std::string(ref_FFs - suffix_FFs, kuint8max);
  } else if (suffix_FFs > 1) {
    // Prepend enough leading FF digits so result has one more of them than
    // |reference| does.  We could also take the "else" branch below, but this
    // gives us a smaller result.
    return std::string(ref_FFs - suffix_FFs + 1, kuint8max);
  } else {
    // Prepend FF digits to match those in |reference|, then something larger
    // than the first non-FF digit in |reference|.
    char gt_digit = static_cast<uint8>(reference[ref_FFs]) +
        (kuint8max - static_cast<uint8>(reference[ref_FFs]) + 1) / 2;
    return std::string(ref_FFs, kuint8max) + gt_digit;
  }
}

// static
std::string UniquePosition::FindBetweenWithSuffix(
    const std::string& before,
    const std::string& after,
    const std::string& suffix) {
  DCHECK(IsValidSuffix(suffix));
  DCHECK_NE(before, after);
  DCHECK_LT(before, after);

  std::string mid;

  // Sometimes our suffix puts us where we want to be.
  if (before < suffix && suffix < after) {
    return std::string();
  }

  size_t i = 0;
  for ( ; i < std::min(before.length(), after.length()); ++i) {
    uint8 a_digit = before[i];
    uint8 b_digit = after[i];

    if (b_digit - a_digit >= 2) {
      mid.push_back(a_digit + (b_digit - a_digit)/2);
      return mid;
    } else if (a_digit == b_digit) {
      mid.push_back(a_digit);

      // Both strings are equal so far.  Will appending the suffix at this point
      // give us the comparison we're looking for?
      if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) {
        return mid;
      }
    } else {
      DCHECK_EQ(b_digit - a_digit, 1);  // Implied by above if branches.

      // The two options are off by one digit.  The choice of whether to round
      // up or down here will have consequences on what we do with the remaining
      // digits.  Exploring both options is an optimization and is not required
      // for the correctness of this algorithm.

      // Option A: Round down the current digit.  This makes our |mid| <
      // |after|, no matter what we append afterwards.  We then focus on
      // appending digits until |mid| > |before|.
      std::string mid_a = mid;
      mid_a.push_back(a_digit);
      mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix));

      // Option B: Round up the current digit.  This makes our |mid| > |before|,
      // no matter what we append afterwards.  We then focus on appending digits
      // until |mid| < |after|.  Note that this option may not be viable if the
      // current digit is the last one in |after|, so we skip the option in that
      // case.
      if (after.length() > i+1) {
        std::string mid_b = mid;
        mid_b.push_back(b_digit);
        mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix));

        // Does this give us a shorter position value?  If so, use it.
        if (mid_b.length() < mid_a.length()) {
          return mid_b;
        }
      }
      return mid_a;
    }
  }

  // If we haven't found a midpoint yet, the following must be true:
  DCHECK_EQ(before.substr(0, i), after.substr(0, i));
  DCHECK_EQ(before, mid);
  DCHECK_LT(before.length(), after.length());

  // We know that we'll need to append at least one more byte to |mid| in the
  // process of making it < |after|.  Appending any digit, regardless of the
  // value, will make |before| < |mid|.  Therefore, the following will get us a
  // valid position.

  mid.append(FindSmallerWithSuffix(after.substr(i), suffix));
  return mid;
}

UniquePosition::UniquePosition(const std::string& internal_rep)
    : compressed_(internal_rep),
      is_valid_(IsValidBytes(Uncompress(internal_rep))) {
}

UniquePosition::UniquePosition(
    const std::string& uncompressed,
    const std::string& suffix)
  : compressed_(Compress(uncompressed)),
    is_valid_(IsValidBytes(uncompressed)) {
  DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length());
  DCHECK(IsValidSuffix(suffix));
  DCHECK(IsValid());
}

// On custom compression:
//
// Let C(x) be the compression function and U(x) be the uncompression function.
//
// This compression scheme has a few special properties.  For one, it is
// order-preserving.  For any two valid position strings x and y:
//   x < y <=> C(x) < C(y)
// This allows us keep the position strings compressed as we sort them.
//
// The compressed format and the decode algorithm:
//
// The compressed string is a series of blocks, almost all of which are 8 bytes
// in length.  The only exception is the last block in the compressed string,
// which may be a remainder block, which has length no greater than 7.  The
// full-length blocks are either repeated character blocks or plain data blocks.
// All blocks are entirely self-contained.  Their decoded values are independent
// from that of their neighbours.
//
// A repeated character block is encoded into eight bytes and represents between
// 4 and 2^31 repeated instances of a given character in the unencoded stream.
// The encoding consists of a single character repeated four times, followed by
// an encoded count.  The encoded count is stored as a big-endian 32 bit
// integer.  There are 2^31 possible count values, and two encodings for each.
// The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc =
// count'.  At compression time, the algorithm will choose between the two
// encodings based on which of the two will maintain the appropriate sort
// ordering (by a process which will be described below).  The decompression
// algorithm need not concern itself with which encoding was used; it needs only
// to decode it.  The decoded value of this block is "count" instances of the
// character that was repeated four times in the first half of this block.
//
// A plain data block is encoded into eight bytes and represents exactly eight
// bytes of data in the unencoded stream.  The plain data block must not begin
// with the same character repeated four times.  It is allowed to contain such a
// four-character sequence, just not at the start of the block.  The decoded
// value of a plain data block is identical to its encoded value.
//
// A remainder block has length of at most seven.  It is a shorter version of
// the plain data block.  It occurs only at the end of the encoded stream and
// represents exactly as many bytes of unencoded data as its own length.  Like a
// plain data block, the remainder block never begins with the same character
// repeated four times.  The decoded value of this block is identical to its
// encoded value.
//
// The encode algorithm:
//
// From the above description, it can be seen that there may be more than one
// way to encode a given input string.  The encoder must be careful to choose
// the encoding that guarantees sort ordering.
//
// The rules for the encoder are as follows:
// 1. Iterate through the input string and produce output blocks one at a time.
// 2. Where possible (ie. where the next four bytes of input consist of the
//    same character repeated four times), produce a repeated data block of
//    maximum possible length.
// 3. If there is at least 8 bytes of data remaining and it is not possible
//    to produce a repeated character block, produce a plain data block.
// 4. If there are less than 8 bytes of data remaining and it is not possible
//    to produce a repeated character block, produce a remainder block.
// 5. When producing a repeated character block, the count encoding must be
//    chosen in such a way that the sort ordering is maintained.  The choice is
//    best illustrated by way of example:
//
//      When comparing two strings, the first of which begins with of 8
//      instances of the letter 'B' and the second with 10 instances of the
//      letter 'B', which of the two should compare lower?  The result depends
//      on the 9th character of the first string, since it will be compared
//      against the 9th 'B' in the second string.  If that character is an 'A',
//      then the first string will compare lower.  If it is a 'C', then the
//      first string will compare higher.
//
//    The key insight is that the comparison value of a repeated character block
//    depends on the value of the character that follows it.  If the character
//    follows the repeated character has a value greater than the repeated
//    character itself, then a shorter run length should translate to a higher
//    comparison value.  Therefore, we encode its count using the low encoding.
//    Similarly, if the following character is lower, we use the high encoding.

namespace {

// Appends an encoded run length to |output_str|.
static void WriteEncodedRunLength(uint32 length,
                                  bool high_encoding,
                                  std::string* output_str) {
  CHECK_GE(length, 4U);
  CHECK_LT(length, 0x80000000);

  // Step 1: Invert the count, if necessary, to account for the following digit.
  uint32 encoded_length;
  if (high_encoding) {
    encoded_length = 0xffffffff - length;
  } else {
    encoded_length = length;
  }

  // Step 2: Write it as big-endian so it compares correctly with memcmp(3).
  output_str->append(1, 0xff & (encoded_length >> 24U));
  output_str->append(1, 0xff & (encoded_length >> 16U));
  output_str->append(1, 0xff & (encoded_length >> 8U));
  output_str->append(1, 0xff & (encoded_length >> 0U));
}

// Reads an encoded run length for |str| at position |i|.
static uint32 ReadEncodedRunLength(const std::string& str, size_t i) {
  DCHECK_LE(i + 4, str.length());

  // Step 1: Extract the big-endian count.
  uint32 encoded_length =
      ((uint8)(str[i+3]) << 0)  |
      ((uint8)(str[i+2]) << 8)  |
      ((uint8)(str[i+1]) << 16) |
      ((uint8)(str[i+0]) << 24);

  // Step 2: If this was an inverted count, un-invert it.
  uint32 length;
  if (encoded_length & 0x80000000) {
    length = 0xffffffff - encoded_length;
  } else {
    length = encoded_length;
  }

  return length;
}

// A series of four identical chars at the beginning of a block indicates
// the beginning of a repeated character block.
static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) {
  return chars[start_index] == chars[start_index+1]
      && chars[start_index] == chars[start_index+2]
      && chars[start_index] == chars[start_index+3];
}

}  // namespace

// static
// Wraps the CompressImpl function with a bunch of DCHECKs.
std::string UniquePosition::Compress(const std::string& str) {
  DCHECK(IsValidBytes(str));
  std::string compressed = CompressImpl(str);
  DCHECK(IsValidCompressed(compressed));
  DCHECK_EQ(str, Uncompress(compressed));
  return compressed;
}

// static
// Performs the order preserving run length compression of a given input string.
std::string UniquePosition::CompressImpl(const std::string& str) {
  std::string output;

  // The compressed length will usually be at least as long as the suffix (28),
  // since the suffix bytes are mostly random.  Most are a few bytes longer; a
  // small few are tens of bytes longer.  Some early tests indicated that
  // roughly 99% had length 40 or smaller.  We guess that pre-sizing for 48 is a
  // good trade-off, but that has not been confirmed through profiling.
  output.reserve(48);

  // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the
  // length of a string of identical digits starting at i.
  for (size_t i = 0; i < str.length(); ) {
    if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) {
      // Four identical bytes in a row at this position means that we must start
      // a repeated character block.  Begin by outputting those four bytes.
      output.append(str, i, 4);

      // Determine the size of the run.
      const char rep_digit = str[i];
      const size_t runs_until = str.find_first_not_of(rep_digit, i+4);

      // Handle the 'runs until end' special case specially.
      size_t run_length;
      bool encode_high;  // True if the next byte is greater than |rep_digit|.
      if (runs_until == std::string::npos) {
        run_length = str.length() - i;
        encode_high = false;
      } else {
        run_length = runs_until - i;
        encode_high = static_cast<uint8>(str[runs_until]) >
            static_cast<uint8>(rep_digit);
      }
      DCHECK_LT(run_length, static_cast<size_t>(kint32max))
          << "This implementation can't encode run-lengths greater than 2^31.";

      WriteEncodedRunLength(run_length, encode_high, &output);
      i += run_length;  // Jump forward by the size of the run length.
    } else {
      // Output up to eight bytes without any encoding.
      const size_t len = std::min(static_cast<size_t>(8), str.length() - i);
      output.append(str, i, len);
      i += len;  // Jump forward by the amount of input consumed (usually 8).
    }
  }

  return output;
}

// static
// Uncompresses strings that were compresed with UniquePosition::Compress.
std::string UniquePosition::Uncompress(const std::string& str) {
  std::string output;
  size_t i = 0;
  // Iterate through the compressed string one block at a time.
  for (i = 0; i + 8 <= str.length(); i += 8) {
    if (IsRepeatedCharPrefix(str, i)) {
      // Found a repeated character block.  Expand it.
      const char rep_digit = str[i];
      uint32 length = ReadEncodedRunLength(str, i+4);
      output.append(length, rep_digit);
    } else {
      // Found a regular block.  Copy it.
      output.append(str, i, 8);
    }
  }
  // Copy the remaining bytes that were too small to form a block.
  output.append(str, i, std::string::npos);
  return output;
}

bool UniquePosition::IsValidCompressed(const std::string& str) {
  for (size_t i = 0; i + 8 <= str.length(); i += 8) {
    if (IsRepeatedCharPrefix(str, i)) {
      uint32 count = ReadEncodedRunLength(str, i+4);
      if (count < 4) {
        // A repeated character block should at least represent the four
        // characters that started it.
        return false;
      }
      if (str[i] == str[i+4]) {
        // Does the next digit after a count match the repeated character?  Then
        // this is not the highest possible count.
        return false;
      }
    }
  }
  // We don't bother looking for the existence or checking the validity of
  // any partial blocks.  There's no way they could be invalid anyway.
  return true;
}

}  // namespace syncer

/* [<][>][^][v][top][bottom][index][help] */