This source file includes the following definitions.
- PairFirstLessThan
- CoalescePositionsFrom
- AddMatch
- AdvanceAndReturnUTF16Pos
- MoveByNGraphemes
- IsNextMatchWithinSnippetWindow
- ExtractMatchPositions
- ConvertMatchPositionsToWide
- ComputeSnippet
- Swap
#include "chrome/browser/history/snippet.h"
#include <algorithm>
#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "third_party/icu/source/common/unicode/brkiter.h"
#include "third_party/icu/source/common/unicode/utext.h"
#include "third_party/icu/source/common/unicode/utf8.h"
namespace {
// Ordering predicate handed to std::lower_bound: compares two match positions
// by their start offset (|first|) only; the end offset (|second|) is ignored.
bool PairFirstLessThan(const Snippet::MatchPosition& a,
                       const Snippet::MatchPosition& b) {
  return b.first > a.first;
}
void CoalescePositionsFrom(size_t offset,
Snippet::MatchPositions* match_positions) {
DCHECK(offset < match_positions->size());
Snippet::MatchPosition& pair((*match_positions)[offset]);
++offset;
while (offset < match_positions->size() &&
pair.second >= (*match_positions)[offset].first) {
pair.second = std::max(pair.second, (*match_positions)[offset].second);
match_positions->erase(match_positions->begin() + offset);
}
}
// Adds the range (|start|, |end|) to |match_positions|, keeping the vector
// sorted by start offset and merging the new range with any existing ranges
// it overlaps or touches. Requires |start| < |end|.
void AddMatch(size_t start,
              size_t end,
              Snippet::MatchPositions* match_positions) {
  DCHECK(start < end);
  DCHECK(match_positions);
  Snippet::MatchPosition pair(start, end);
  if (match_positions->empty()) {
    match_positions->push_back(pair);
    return;
  }
  // There is at least one range already. Locate the first existing range whose
  // start is not less than |start|.
  Snippet::MatchPositions::iterator i =
      std::lower_bound(match_positions->begin(), match_positions->end(),
                       pair, &PairFirstLessThan);
  if (i != match_positions->end() && i->first == start) {
    // An existing range begins at the same offset. Only act when the new range
    // reaches further; otherwise it is fully contained and nothing changes.
    if (end > i->second) {
      i->second = end;
      CoalescePositionsFrom(i - match_positions->begin(), match_positions);
    }
  } else if (i == match_positions->begin()) {
    // The new range starts before every existing one: insert it in front and
    // let coalescing absorb whatever it overlaps.
    match_positions->insert(i, pair);
    CoalescePositionsFrom(0, match_positions);
  } else {
    // There is a range that starts before |start|; inspect it first.
    --i;
    if (start <= i->second && end > i->second) {
      // Overlaps (or touches) the previous range and extends past it.
      i->second = end;
      CoalescePositionsFrom(i - match_positions->begin(), match_positions);
    } else if (end > i->second) {
      // Does not touch the previous range. Check the following one.
      ++i;
      if (i == match_positions->end() || end < i->first) {
        // Stands alone between its neighbours.
        match_positions->insert(i, pair);
      } else {
        // Intersects the following range: pull that range's start back to
        // |start|. The following range may already reach beyond |end|, so keep
        // whichever end is larger instead of overwriting it (overwriting would
        // drop the tail of the existing match).
        i->first = start;
        i->second = std::max(i->second, end);
        CoalescePositionsFrom(i - match_positions->begin(), match_positions);
      }
    }
    // Otherwise the new range lies entirely inside the previous range.
  }
}
size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
int32_t utf8_length,
int32_t offset,
int32_t* utf8_pos,
size_t* utf16_pos) {
DCHECK(offset >= *utf8_pos && offset <= utf8_length);
UChar32 wide_char;
while (*utf8_pos < offset) {
U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
*utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
}
return *utf16_pos;
}
// Positions |bi| relative to the offset |*utf8_pos|, steps it by |count|
// boundaries (callers in this file pass both positive and negative counts) and
// stores the iterator's resulting position back into |*utf8_pos|.
void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
  // isBoundary() is invoked only to move the iterator to the requested offset;
  // its boolean result is deliberately unused.
  bi->isBoundary(*utf8_pos);
  bi->next(count);
  // The offset is taken from current() rather than from next()'s return
  // value, so whatever next() reports, the iterator's own position is used.
  const int32_t landed = bi->current();
  *utf8_pos = static_cast<size_t>(landed);
}
// Amount of context kept on each side of a match, expressed as a number of
// break-iterator steps.
const int kSnippetContext = 50;

// Returns true when the match starting at |next_match_start| begins inside the
// context window that follows |previous_match_end|. Repositions |bi| as a side
// effect when the quick offset comparison is not conclusive.
bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
                                    size_t previous_match_end,
                                    size_t next_match_start) {
  // Quick accept based on raw offsets, without touching the iterator.
  const size_t offset_window_end = previous_match_end + kSnippetContext;
  if (next_match_start < offset_window_end)
    return true;

  // Otherwise measure the window with the break iterator.
  bi->isBoundary(previous_match_end);
  bi->next(kSnippetContext);
  const int64 window_end = bi->current();
  if (window_end == icu::BreakIterator::DONE)
    return true;
  return next_match_start < static_cast<uint64>(window_end);
}
}
// Parses |offsets_str|, a space-separated list of integers grouped in records
// of four tokens, and adds to |match_positions| one range per record whose
// first token equals |column_num|. Within a record the third token is the
// match start and the fourth is the match length; the second token is not
// used here. Ranges are merged and kept sorted by AddMatch(). An empty
// |offsets_str| leaves |match_positions| untouched, and a trailing incomplete
// record (fewer than four tokens) is ignored.
void Snippet::ExtractMatchPositions(const std::string& offsets_str,
                                    const std::string& column_num,
                                    MatchPositions* match_positions) {
  DCHECK(match_positions);
  if (offsets_str.empty())
    return;
  std::vector<std::string> offsets;
  base::SplitString(offsets_str, ' ', &offsets);
  // Bound the loop with |i + 3 < size| rather than |i < size - 3|: size() is
  // unsigned, so the subtraction wraps around when fewer than three tokens are
  // present and the loop would then index past the end of |offsets|.
  for (size_t i = 0; i + 3 < offsets.size(); i += 4) {
    if (offsets[i] != column_num)
      continue;
    const size_t start = atoi(offsets[i + 2].c_str());
    const size_t end = start + atoi(offsets[i + 3].c_str());
    CHECK(end >= start);
    AddMatch(start, end, match_positions);
  }
}
void Snippet::ConvertMatchPositionsToWide(
const std::string& utf8_string,
Snippet::MatchPositions* match_positions) {
DCHECK(match_positions);
int32_t utf8_pos = 0;
size_t utf16_pos = 0;
const char* utf8_cstring = utf8_string.c_str();
const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
for (Snippet::MatchPositions::iterator i = match_positions->begin();
i != match_positions->end(); ++i) {
i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
i->first, &utf8_pos, &utf16_pos);
i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
i->second, &utf8_pos, &utf16_pos);
}
}
// Default constructor; performs no explicit initialization.
Snippet::Snippet() {}
// Destructor; has no explicit cleanup to perform.
Snippet::~Snippet() {}
void Snippet::ComputeSnippet(const MatchPositions& match_positions,
const std::string& document) {
const size_t kSnippetMaxLength = 200;
const base::string16 kEllipsis = base::ASCIIToUTF16(" ... ");
UText* document_utext = NULL;
UErrorCode status = U_ZERO_ERROR;
document_utext = utext_openUTF8(document_utext, document.data(),
document.size(), &status);
scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
icu::Locale::getDefault(), status));
bi->setText(document_utext, status);
DCHECK(U_SUCCESS(status));
base::string16 snippet;
size_t start = 0;
for (size_t i = 0; i < match_positions.size(); ++i) {
const size_t match_start = match_positions[i].first;
const size_t match_end = match_positions[i].second;
CHECK(match_end > match_start);
CHECK(match_end <= document.size());
size_t context_start = match_start;
MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
start = std::max(start, context_start);
if (start < match_start) {
if (start > 0)
snippet += kEllipsis;
CHECK(start < document.size());
snippet += base::UTF8ToUTF16(document.substr(start, match_start - start));
}
const size_t first = snippet.size();
snippet += base::UTF8ToUTF16(document.substr(match_start,
match_end - match_start));
matches_.push_back(std::make_pair(first, snippet.size()));
size_t end;
if (i + 1 < match_positions.size() &&
IsNextMatchWithinSnippetWindow(bi.get(), match_end,
match_positions[i + 1].first)) {
end = match_positions[i + 1].first;
CHECK(end >= match_end);
CHECK(end <= document.size());
snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
} else {
end = match_end;
MoveByNGraphemes(bi.get(), kSnippetContext, &end);
CHECK(end >= match_end);
CHECK(end <= document.size());
snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
if (end < document.size())
snippet += kEllipsis;
}
start = end;
if (snippet.size() >= kSnippetMaxLength)
break;
}
utext_close(document_utext);
swap(text_, snippet);
}
// Exchanges this snippet's text and match list with those of |other|.
void Snippet::Swap(Snippet* other) {
  Snippet& that = *other;
  matches_.swap(that.matches_);
  text_.swap(that.text_);
}