root/base/i18n/streaming_utf8_validator_perftest.cc

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. IsString7Bit
  2. NextUtf8Sequence
  3. RunTest
  4. ConstructRepeatedTestString
  5. ConstructRangedTestString
  6. RunSomeTests
  7. TEST
  8. TEST
  9. TEST
  10. TEST
  11. TEST
  12. TEST
  13. TEST
  14. TEST

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// All data that is passed through a WebSocket with type "Text" needs to be
// validated as UTF8. Since this is done on the IO thread, it needs to be
// reasonably fast.

// We are only interested in the performance on valid UTF8. Invalid UTF8 will
// result in a connection failure, so is unlikely to become a source of
// performance issues.

#include "base/i18n/streaming_utf8_validator.h"

#include <string>

#include "base/basictypes.h"
#include "base/bind.h"
#include "base/callback.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/test/perf_time_logger.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace base {
namespace {

// Endpoints of the ranges of valid UTF-8 sequences used to build test strings.
// Each range is inclusive at both ends. The ranges are intended to be large
// enough that the validator needs to do meaningful work while being in some
// sense "realistic" (eg. control characters are not included).

// One-byte sequences.
const char kOneByteSeqRangeStart[] = " ";  // U+0020
const char kOneByteSeqRangeEnd[] = "~";    // U+007E

// Two-byte sequences.
const char kTwoByteSeqRangeStart[] = "\xc2\xa0";  // U+00A0 non-breaking space
const char kTwoByteSeqRangeEnd[] = "\xc9\x8f";    // U+024F small y with stroke

// Three-byte sequences.
const char kThreeByteSeqRangeStart[] = "\xe3\x81\x82";  // U+3042 Hiragana "a"
const char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83";    // U+9FC3 "to blink"

// Four-byte sequences.
const char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b";  // U+2000B
const char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2";    // U+2A6B2

// The different lengths of strings to test, in bytes. These are requested
// lengths: the string construction helpers may return something slightly
// longer, so RunSomeTests() reports the actual length of each string.
const size_t kTestLengths[] = {1, 32, 256, 32768, 1 << 20};

// Baseline "validator": scans |s| one byte at a time and reports whether every
// byte has its top bit clear. It exists only to give the real validators
// something to be compared against, and is only run on strings made of 1-byte
// UTF-8 sequences, because its result is not meaningful once top-bit-set bytes
// are present.
//
// Returns true if no byte of |s| has bit 0x80 set (including when |s| is
// empty), false as soon as one such byte is found.
bool IsString7Bit(const std::string& s) {
  const std::string::size_type size = s.size();
  for (std::string::size_type pos = 0; pos < size; ++pos) {
    const bool top_bit_set = (s[pos] & 0x80) != 0;
    if (top_bit_set)
      return false;
  }
  return true;
}

// Returns the UTF-8 sequence that follows |previous|, treating the sequence as
// a little odometer that is incremented from its last byte. |previous| must be
// a valid UTF-8 sequence (DCHECKed), and the result is DCHECKed to be valid as
// well. This is just barely smart enough to iterate through the ranges defined
// above; it makes no attempt to skip over gaps in the set of valid sequences.
std::string NextUtf8Sequence(const std::string& previous) {
  DCHECK(StreamingUtf8Validator::Validate(previous));
  std::string next = previous;
  int pos = static_cast<int>(previous.length() - 1);
  while (pos >= 0) {
    // Every byte of a UTF-8 sequence other than the first is constrained to
    // the range 0x80 to 0xbf, inclusive. A trailing byte that is already at
    // 0xbf therefore wraps around to 0x80 and the increment carries into the
    // byte before it. The first byte (pos == 0) never wraps; it is simply
    // incremented.
    const bool trailing_byte_at_max = pos > 0 && next[pos] == '\xbf';
    if (!trailing_byte_at_max) {
      ++next[pos];
      break;  // no carry
    }
    next[pos] = '\x80';
    --pos;  // carry
  }
  DCHECK(StreamingUtf8Validator::Validate(next))
      << "Result \"" << next << "\" failed validation";
  return next;
}

// Signature shared by every function that is benchmarked: it receives the
// string to check and returns a bool. RunSomeTests() expects the result to be
// true for all of the (valid) strings it generates.
typedef bool (*TestTargetType)(const std::string&);

// Runs function |target| over |test_string| |times| times inside a
// PerfTimeLogger labelled with |description|.
//
// Returns true only if every single call to |target| returned true. |target|
// is still called the full |times| times after a failure, so that the amount of
// work being timed does not depend on the results.
bool RunTest(const std::string& description,
             TestTargetType target,
             const std::string& test_string,
             int times) {
  base::PerfTimeLogger timer(description.c_str());
  bool all_succeeded = true;
  for (int remaining = times; remaining > 0; --remaining) {
    if (!target(test_string))
      all_succeeded = false;
  }
  timer.Done();
  return all_succeeded;
}

// Constructs a string by repeating |input| enough times to equal or exceed
// |length| bytes.
//
// The result always consists of whole copies of |input| (so a multi-byte UTF-8
// sequence is never cut in half) and always contains at least one copy, even
// when |length| is 0. It is the shortest such string whose length is at least
// |length|.
//
// An empty |input| can never reach a non-zero |length|; in that case an empty
// string is returned instead of looping forever.
std::string ConstructRepeatedTestString(const std::string& input,
                                        size_t length) {
  if (input.empty())
    return std::string();

  const size_t unit = input.length();
  std::string output = input;
  while (output.length() < length) {
    const size_t missing = length - output.length();
    if (missing >= output.length()) {
      // Doubling keeps the number of appends logarithmic in |length|.
      output += output;
    } else {
      // Less than one doubling is left: append just enough whole copies of
      // |input| to cover the remainder. |output| is itself made of whole
      // copies and is longer than |missing|, so a prefix of it can be reused.
      const size_t copies = (missing + unit - 1) / unit;
      output.append(output, 0, copies * unit);
    }
  }
  return output;
}

// Builds a test string out of the UTF-8 sequences from |input_start| to
// |input_end|, inclusive, concatenated in order. Expansion stops early once
// the string has reached |length| bytes. If the whole range has been used and
// the string is still shorter than |length|, the expanded range is repeated
// until it equals or exceeds |length| bytes. |input_start| and |input_end|
// must be valid UTF-8 sequences.
std::string ConstructRangedTestString(const std::string& input_start,
                                      const std::string& input_end,
                                      size_t length) {
  std::string output = input_start;
  for (std::string current = input_start;
       output.length() < length && current != input_end;) {
    current = NextUtf8Sequence(current);
    output += current;
  }
  if (output.length() >= length)
    return output;
  // The range was exhausted before reaching |length|: pad by repetition.
  return ConstructRepeatedTestString(output, length);
}

// Pairs a function to benchmark with the name used for it in the test output.
struct TestFunctionDescription {
  // The function to run over each test string.
  TestTargetType function;
  // Name substituted for the "%s" of the format string given to
  // RunSomeTests().
  const char* function_name;
};

// The functions to benchmark. The tests pass a count to RunSomeTests() that
// selects a prefix of this array, so the order matters: IsString7Bit is
// intentionally placed last so it can be excluded easily (by passing 2 rather
// than 3) from the tests whose input is not 1-byte sequences.
const TestFunctionDescription kTestFunctions[] = {
    {&StreamingUtf8Validator::Validate, "StreamingUtf8Validator"},
    {&IsStringUTF8, "IsStringUTF8"}, {&IsString7Bit, "IsString7Bit"}};

// Benchmarks the first |test_count| entries of |test_functions| against one
// test string for each of the lengths in |kTestLengths|, in turn.
//
// |construct_test_string| is run with the requested length to produce the test
// string. The length it actually returns is what gets reported and is used to
// choose the iteration count, such that the total number of bytes validated by
// each benchmark is around 16MB.
//
// |format| is a printf-style format for the benchmark description. It receives
// the function name (%s), the actual string length (%d) and the iteration
// count (%d), in that order. Every benchmarked function is expected to return
// true for every string.
void RunSomeTests(
    const char format[],
    base::Callback<std::string(size_t length)> construct_test_string,
    const TestFunctionDescription* test_functions,
    size_t test_count) {
  // 1 << 24 bytes is 16MB.
  const int kBytesPerBenchmark = 1 << 24;
  for (size_t length_index = 0; length_index != arraysize(kTestLengths);
       ++length_index) {
    const std::string test_string =
        construct_test_string.Run(kTestLengths[length_index]);
    const int real_length = static_cast<int>(test_string.length());
    const int times = kBytesPerBenchmark / real_length;
    for (size_t test_index = 0; test_index != test_count; ++test_index) {
      // The expression below is kept exactly as written: gtest prints it
      // verbatim when the expectation fails.
      EXPECT_TRUE(RunTest(StringPrintf(format,
                                       test_functions[test_index].function_name,
                                       real_length,
                                       times),
                          test_functions[test_index].function,
                          test_string,
                          times));
    }
  }
}

// Strings made by repeating the single 1-byte sequence kOneByteSeqRangeStart.
// All 3 entries of kTestFunctions are run, including the IsString7Bit
// baseline, since the input contains only 1-byte sequences.
TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated) {
  RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kOneByteSeqRangeStart),
               kTestFunctions,
               3);
}

// Strings made from the range of 1-byte sequences kOneByteSeqRangeStart to
// kOneByteSeqRangeEnd. All 3 entries of kTestFunctions are run, including the
// IsString7Bit baseline, since the input contains only 1-byte sequences.
TEST(StreamingUtf8ValidatorPerfTest, OneByteRange) {
  RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kOneByteSeqRangeStart,
                          kOneByteSeqRangeEnd),
               kTestFunctions,
               3);
}

// Strings made by repeating the single 2-byte sequence kTwoByteSeqRangeStart.
// Only the first 2 entries of kTestFunctions are run: IsString7Bit is left out
// because its results are not meaningful for top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated) {
  RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kTwoByteSeqRangeStart),
               kTestFunctions,
               2);
}

// Strings made from the range of 2-byte sequences kTwoByteSeqRangeStart to
// kTwoByteSeqRangeEnd. Only the first 2 entries of kTestFunctions are run:
// IsString7Bit is left out because its results are not meaningful for
// top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange) {
  RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kTwoByteSeqRangeStart,
                          kTwoByteSeqRangeEnd),
               kTestFunctions,
               2);
}

// Strings made by repeating the single 3-byte sequence
// kThreeByteSeqRangeStart. Only the first 2 entries of kTestFunctions are run:
// IsString7Bit is left out because its results are not meaningful for
// top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated) {
  RunSomeTests(
      "%s: bytes=3 repeated length=%d repeat=%d",
      base::Bind(ConstructRepeatedTestString, kThreeByteSeqRangeStart),
      kTestFunctions,
      2);
}

// Strings made from the range of 3-byte sequences kThreeByteSeqRangeStart to
// kThreeByteSeqRangeEnd. Only the first 2 entries of kTestFunctions are run:
// IsString7Bit is left out because its results are not meaningful for
// top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange) {
  RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kThreeByteSeqRangeStart,
                          kThreeByteSeqRangeEnd),
               kTestFunctions,
               2);
}

// Strings made by repeating the single 4-byte sequence kFourByteSeqRangeStart.
// Only the first 2 entries of kTestFunctions are run: IsString7Bit is left out
// because its results are not meaningful for top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated) {
  RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d",
               base::Bind(ConstructRepeatedTestString, kFourByteSeqRangeStart),
               kTestFunctions,
               2);
}

// Strings made from the range of 4-byte sequences kFourByteSeqRangeStart to
// kFourByteSeqRangeEnd. Only the first 2 entries of kTestFunctions are run:
// IsString7Bit is left out because its results are not meaningful for
// top-bit-set bytes.
TEST(StreamingUtf8ValidatorPerfTest, FourByteRange) {
  RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d",
               base::Bind(ConstructRangedTestString,
                          kFourByteSeqRangeStart,
                          kFourByteSeqRangeEnd),
               kTestFunctions,
               2);
}

}  // namespace
}  // namespace base

/* [<][>][^][v][top][bottom][index][help] */