This source file includes following definitions.
- to_
- AddByte
- from
- to
- current_offset_
- PrintValue
- NewLine
- InitializeCharacters
- ConstructPairAndAppend
- MoveRightMostCharToSet
- MoveAllCharsToSets
- LogStringSets
- GenerateInvalidState
- MakeState
- GenerateStates
- PrintStates
- main
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "base/basictypes.h"
#include "base/command_line.h"
#include "base/file_util.h"
#include "base/files/file_path.h"
#include "base/logging.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/stringprintf.h"
#include "third_party/icu/source/common/unicode/utf8.h"
namespace {
const char kHelpText[] =
"Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
const char kProlog[] =
"// Copyright 2013 The Chromium Authors. All rights reserved.\n"
"// Use of this source code is governed by a BSD-style license that can "
"be\n"
"// found in the LICENSE file.\n"
"\n"
"// This file is auto-generated by build_utf8_validator_tables.\n"
"// DO NOT EDIT.\n"
"\n"
"#include \"base/i18n/utf8_validator_tables.h\"\n"
"\n"
"namespace base {\n"
"namespace internal {\n"
"\n"
"const uint8 kUtf8ValidatorTables[] = {\n";
const char kEpilog[] =
"};\n"
"\n"
"const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"
"\n"
"} // namespace internal\n"
"} // namespace base\n";
class Range {
public:
explicit Range(uint8 value) : from_(value), to_(value) {}
void AddByte(uint8 to) {
CHECK(to == to_ + 1);
to_ = to;
}
uint8 from() const { return from_; }
uint8 to() const { return to_; }
bool operator<(const Range& rhs) const {
return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
}
bool operator==(const Range& rhs) const {
return from() == rhs.from() && to() == rhs.to();
}
private:
uint8 from_;
uint8 to_;
};
typedef std::vector<Range> StringSet;
typedef std::vector<uint8> Character;
struct Pair {
Character character;
StringSet set;
};
typedef std::vector<Pair> PairVector;
class TablePrinter {
public:
explicit TablePrinter(FILE* stream)
: stream_(stream), values_on_this_line_(0), current_offset_(0) {}
void PrintValue(uint8 value) {
if (values_on_this_line_ == 0) {
fputs(" ", stream_);
} else if (values_on_this_line_ == kMaxValuesPerLine) {
fprintf(stream_, " // 0x%02x\n ", current_offset_);
values_on_this_line_ = 0;
}
fprintf(stream_, " 0x%02x,", static_cast<int>(value));
++values_on_this_line_;
++current_offset_;
}
void NewLine() {
while (values_on_this_line_ < kMaxValuesPerLine) {
fputs(" ", stream_);
++values_on_this_line_;
}
fprintf(stream_, " // 0x%02x\n", current_offset_);
values_on_this_line_ = 0;
}
private:
FILE* stream_;
int values_on_this_line_;
int current_offset_;
static const int kMaxValuesPerLine = 8;
DISALLOW_COPY_AND_ASSIGN(TablePrinter);
};
PairVector InitializeCharacters() {
PairVector vector;
for (int i = 0; i <= 0x10FFFF; ++i) {
if (i >= 0xD800 && i < 0xE000) {
continue;
}
uint8 bytes[4];
unsigned int offset = 0;
UBool is_error = false;
U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);
DCHECK(!is_error);
DCHECK_GT(offset, 0u);
DCHECK_LE(offset, arraysize(bytes));
Pair pair = {Character(bytes, bytes + offset), StringSet()};
vector.push_back(pair);
}
return vector;
}
void ConstructPairAndAppend(const Character& character,
const Range& new_range,
const StringSet& existing_set,
PairVector* pairs) {
Pair new_pair = {character, StringSet(1, new_range)};
new_pair.set.insert(
new_pair.set.end(), existing_set.begin(), existing_set.end());
pairs->push_back(new_pair);
}
void MoveRightMostCharToSet(PairVector* pairs) {
PairVector new_pairs;
PairVector::const_iterator it = pairs->begin();
while (it != pairs->end() && it->character.empty()) {
new_pairs.push_back(*it);
++it;
}
CHECK(it != pairs->end());
Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
Range new_range(it->character.back());
StringSet converted = it->set;
++it;
while (it != pairs->end()) {
const Pair& current_pair = *it++;
if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
std::equal(unconverted_bytes.begin(),
unconverted_bytes.end(),
current_pair.character.begin()) &&
converted == current_pair.set) {
DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
new_range.AddByte(current_pair.character.back());
continue;
}
ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
unconverted_bytes = Character(current_pair.character.begin(),
current_pair.character.end() - 1);
new_range = Range(current_pair.character.back());
converted = current_pair.set;
}
ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
new_pairs.swap(*pairs);
}
void MoveAllCharsToSets(PairVector* pairs) {
for (int i = 0; i < 4; ++i) {
MoveRightMostCharToSet(pairs);
}
#if DCHECK_IS_ON
for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
++it) {
DCHECK(it->character.empty());
}
#endif
}
void LogStringSets(const PairVector& pairs) {
for (PairVector::const_iterator pair_it = pairs.begin();
pair_it != pairs.end();
++pair_it) {
std::string set_as_string;
for (StringSet::const_iterator set_it = pair_it->set.begin();
set_it != pair_it->set.end();
++set_it) {
set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
static_cast<int>(set_it->from()),
static_cast<int>(set_it->to()));
}
VLOG(1) << set_as_string;
}
}
struct StateRange {
uint8 from;
uint8 target_state;
};
typedef std::vector<StateRange> State;
State GenerateInvalidState() {
const StateRange range = {0, 1};
return State(1, range);
}
typedef std::map<StringSet, uint8> StateMap;
uint8 MakeState(const StringSet& set,
std::vector<State>* states,
StateMap* state_map) {
DCHECK(!set.empty());
const Range& range = set.front();
const StringSet rest(set.begin() + 1, set.end());
const StateMap::const_iterator where = state_map->find(rest);
const uint8 target_state = where == state_map->end()
? MakeState(rest, states, state_map)
: where->second;
DCHECK_LT(0, range.from());
DCHECK_LT(range.to(), 0xFF);
const StateRange new_state_initializer[] = {
{0, 1}, {range.from(), target_state},
{static_cast<uint8>(range.to() + 1), 1}};
states->push_back(
State(new_state_initializer,
new_state_initializer + arraysize(new_state_initializer)));
const uint8 new_state_number =
base::checked_cast<uint8>(states->size() - 1);
CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
return new_state_number;
}
std::vector<State> GenerateStates(const PairVector& pairs) {
std::vector<State> states(2, GenerateInvalidState());
StateMap state_map;
state_map.insert(std::make_pair(StringSet(), 0));
for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {
DCHECK(it->character.empty());
DCHECK(!it->set.empty());
const Range& range = it->set.front();
const StringSet rest(it->set.begin() + 1, it->set.end());
const StateMap::const_iterator where = state_map.find(rest);
const uint8 target_state = where == state_map.end()
? MakeState(rest, &states, &state_map)
: where->second;
if (states[0].back().from == range.from()) {
DCHECK_EQ(1, states[0].back().target_state);
states[0].back().target_state = target_state;
DCHECK_LT(range.to(), 0xFF);
const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1};
states[0].push_back(new_range);
} else {
DCHECK_LT(range.to(), 0xFF);
const StateRange new_range_initializer[] = {{range.from(), target_state},
{static_cast<uint8>(range.to() + 1), 1}};
states[0]
.insert(states[0].end(),
new_range_initializer,
new_range_initializer + arraysize(new_range_initializer));
}
}
return states;
}
void PrintStates(const std::vector<State>& states, FILE* stream) {
std::vector<uint8> state_offset(1, 0);
std::vector<uint8> shifts;
uint8 pos = 0;
for (std::vector<State>::const_iterator state_it = states.begin();
state_it != states.end();
++state_it) {
uint8 shift = 7;
for (State::const_iterator range_it = state_it->begin();
range_it != state_it->end();
++range_it) {
while (shift > 0 && range_it->from % (1 << shift) != 0) {
--shift;
}
}
shifts.push_back(shift);
pos += 1 + (1 << (7 - shift));
state_offset.push_back(pos);
}
DCHECK_EQ(129, state_offset[1]);
fputs(kProlog, stream);
TablePrinter table_printer(stream);
for (uint8 state_index = 0; state_index < states.size(); ++state_index) {
const uint8 shift = shifts[state_index];
uint8 next_range = 0;
uint8 target_state = 1;
fprintf(stream,
" // State %d, offset 0x%02x\n",
static_cast<int>(state_index),
static_cast<int>(state_offset[state_index]));
table_printer.PrintValue(shift);
for (int i = 0; i < 0x100; i += (1 << shift)) {
if (next_range < states[state_index].size() &&
states[state_index][next_range].from == i) {
target_state = states[state_index][next_range].target_state;
++next_range;
}
if (i >= 0x80) {
table_printer.PrintValue(state_offset[target_state]);
}
}
table_printer.NewLine();
}
fputs(kEpilog, stream);
}
}
int main(int argc, char* argv[]) {
CommandLine::Init(argc, argv);
logging::LoggingSettings settings;
settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
logging::InitLogging(settings);
if (CommandLine::ForCurrentProcess()->HasSwitch("help")) {
fwrite(kHelpText, 1, arraysize(kHelpText), stdout);
exit(EXIT_SUCCESS);
}
base::FilePath filename =
CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
FILE* output = stdout;
if (!filename.empty()) {
output = base::OpenFile(filename, "wb");
if (!output)
PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
<< "' for writing";
}
PairVector pairs = InitializeCharacters();
MoveAllCharsToSets(&pairs);
if (VLOG_IS_ON(1)) {
LogStringSets(pairs);
}
std::vector<State> states = GenerateStates(pairs);
PrintStates(states, output);
if (!filename.empty()) {
if (!base::CloseFile(output))
PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
<< "'";
}
return EXIT_SUCCESS;
}