This source file includes the following definitions:
- weak_factory_
- ExtractFeatures
- CancelPendingExtraction
- ExtractFeaturesWithTimeout
- HandleWord
- CheckNoPendingExtraction
- RunCallback
- Clear
#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
#include <list>
#include <map>
#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/i18n/case_conversion.h"
#include "base/logging.h"
#include "base/message_loop/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "crypto/sha2.h"
#include "third_party/icu/source/common/unicode/ubrk.h"
#include "ui/base/l10n/l10n_util.h"
namespace safe_browsing {
// Time budget, in milliseconds, for a single run of
// ExtractFeaturesWithTimeout().  Once a clock check finds the budget used up,
// the remaining work is re-posted to the message loop.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;
// Number of words handled between two consecutive clock reads inside the
// extraction loop.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;
// Time budget, in milliseconds, measured from the start of the extraction.
// Once a clock check finds it exceeded, extraction is abandoned and the
// callback is run with |false|.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
// Size argument handed to the negative_word_cache_ constructor.
const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
// Bookkeeping for one in-progress extraction.  An instance is created by
// ExtractFeatures() and destroyed by Clear().
struct PhishingTermFeatureExtractor::ExtractionState {
  // Lower-cased words most recently accepted by HandleWord(), each one
  // followed by a single space once HandleWord() has returned.
  std::string previous_words;

  // Length of every entry in |previous_words|, counting the trailing space.
  std::list<size_t> previous_word_sizes;

  // ICU word-break iterator over the page text, or NULL if it could not be
  // opened.
  // NOTE(review): the iterator presumably refers to the caller's text buffer
  // rather than copying it -- confirm against the ICU documentation.
  UBreakIterator* iterator;

  // Most recent break offset returned by |iterator|; -1 until initialized.
  int position;

  // Whether |position| has been seeded with ubrk_first().
  bool position_initialized;

  // Clock reading taken when the extraction was started.
  base::TimeTicks start_time;

  // Number of times ExtractFeaturesWithTimeout() has run for this extraction.
  int num_iterations;

  // Opens a word-break iterator over |text|.  On failure the error is logged
  // (debug builds) and |iterator| is left NULL.
  ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
      : iterator(NULL),
        position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator* opened =
        ubrk_open(UBRK_WORD, NULL, text.data(), text.size(), &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      return;
    }
    iterator = opened;
  }

  // Releases the break iterator, if one was opened.
  ~ExtractionState() {
    if (iterator != NULL)
      ubrk_close(iterator);
  }
};
// Stores the (caller-owned) hash sets, term-length limit, hash seed and clock,
// sizes the negative word cache, and starts out with no extraction pending.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32>* page_word_hashes,
    size_t max_words_per_term,
    uint32 murmurhash3_seed,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      negative_word_cache_(kMaxNegativeWordCacheSize),
      clock_(clock),
      weak_factory_(this) {
  // Put every per-extraction member into its idle state.
  Clear();
}
// Destroying the extractor while an extraction is still pending is reported
// by CheckNoPendingExtraction().
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  CheckNoPendingExtraction();
}
// Starts an asynchronous extraction over |page_text|.  Matching terms are
// added to |features| and |done_callback| is run with the outcome once the
// extraction finishes or gives up.  Only the pointers are stored, so both
// objects must stay valid until then.
void PhishingTermFeatureExtractor::ExtractFeatures(
    const base::string16* page_text,
    FeatureMap* features,
    const DoneCallback& done_callback) {
  // A still-pending extraction is a caller error and is reported; it is then
  // cancelled anyway so this extraction begins from a clean state.
  CheckNoPendingExtraction();
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_ = done_callback;

  const base::TimeTicks extraction_start = clock_->Now();
  state_.reset(new ExtractionState(*page_text_, extraction_start));

  // The work itself happens in a posted task.  Binding through a weak pointer
  // lets CancelPendingExtraction() drop the task.
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}
// Abandons any extraction in progress without running its callback.
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Tasks already posted hold weak pointers; invalidating them turns those
  // tasks into no-ops.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
DCHECK(state_.get());
++state_->num_iterations;
base::TimeTicks current_chunk_start_time = clock_->Now();
if (!state_->iterator) {
UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
RunCallback(false);
return;
}
if (!state_->position_initialized) {
state_->position = ubrk_first(state_->iterator);
if (state_->position == UBRK_DONE) {
RunCallback(true);
return;
}
state_->position_initialized = true;
}
int num_words = 0;
for (int next = ubrk_next(state_->iterator);
next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
HandleWord(base::StringPiece16(page_text_->data() + state_->position,
next - state_->position));
++num_words;
}
state_->position = next;
if (num_words >= kClockCheckGranularity) {
num_words = 0;
base::TimeTicks now = clock_->Now();
if (now - state_->start_time >=
base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
DLOG(ERROR) << "Feature extraction took too long, giving up";
UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
RunCallback(false);
return;
}
base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
if (chunk_elapsed >=
base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
chunk_elapsed);
base::MessageLoop::current()->PostTask(
FROM_HERE,
base::Bind(
&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
weak_factory_.GetWeakPtr()));
return;
}
}
}
RunCallback(true);
}
// Processes a single word of page text.  If the word can be part of a term,
// it is combined with the preceding candidate words and every resulting
// n-gram whose SHA-256 hash is in |page_term_hashes_| is added to the feature
// map.  Otherwise the accumulated previous-word state is discarded.
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // Cheapest check first: a cache hit means this exact word was already
  // rejected, so lower-casing and hashing are skipped.
  const bool known_negative =
      negative_word_cache_.Get(word) != negative_word_cache_.end();
  if (known_negative) {
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  const std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
  const uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Not a word of any term: the current n-gram is broken, and the word is
    // remembered so the next occurrence takes the fast path above.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    negative_word_cache_.Put(word, true);
    return;
  }

  // Maps SHA-256 hash -> term text for every candidate ending at this word.
  typedef std::map<std::string, std::string> HashToTerm;
  HashToTerm hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Longest candidate first; each step strips the leading word (and its
  // trailing space) to obtain the next shorter one.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::const_iterator size_it =
           state_->previous_word_sizes.begin();
       size_it != state_->previous_word_sizes.end(); ++size_it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *size_it);
  }

  for (HashToTerm::const_iterator candidate = hashes_to_check.begin();
       candidate != hashes_to_check.end(); ++candidate) {
    if (page_term_hashes_->find(candidate->first) == page_term_hashes_->end())
      continue;
    features_->AddBooleanFeature(features::kPageTerm + candidate->second);
  }

  // Store the word with a trailing space, then drop the oldest word once the
  // number of stored words reaches max_words_per_term_.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
DCHECK(done_callback_.is_null());
DCHECK(!state_.get());
if (!done_callback_.is_null() || state_.get()) {
LOG(ERROR) << "Extraction in progress, missing call to "
<< "CancelPendingExtraction";
}
}
void PhishingTermFeatureExtractor::RunCallback(bool success) {
DCHECK(state_.get());
UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
state_->num_iterations);
UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
clock_->Now() - state_->start_time);
DCHECK(!done_callback_.is_null());
done_callback_.Run(success);
Clear();
}
// Returns the extractor to its idle state: forgets the page text and feature
// map pointers, drops the callback and extraction state, and empties the
// negative word cache.
void PhishingTermFeatureExtractor::Clear() {
  page_text_ = NULL;
  features_ = NULL;
  done_callback_.Reset();
  state_.reset();
  negative_word_cache_.Clear();
}
}