This source file includes the following definitions.
- weak_factory_
- ExtractFeatures
- CancelPendingExtraction
- ExtractFeaturesWithTimeout
- HandleLink
- HandleForm
- HandleImage
- HandleInput
- HandleScript
- CheckNoPendingExtraction
- RunCallback
- Clear
- ResetFrameData
- GetNextDocument
- IsExternalDomain
- InsertFeatures
#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/containers/hash_tables.h"
#include "base/logging.h"
#include "base/message_loop/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_util.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "content/public/renderer/render_view.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "third_party/WebKit/public/platform/WebString.h"
#include "third_party/WebKit/public/web/WebElement.h"
#include "third_party/WebKit/public/web/WebElementCollection.h"
#include "third_party/WebKit/public/web/WebFrame.h"
#include "third_party/WebKit/public/web/WebView.h"
namespace safe_browsing {
// Time budget, in milliseconds, for a single invocation ("chunk") of
// ExtractFeaturesWithTimeout() before it re-posts itself and returns.
const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
// Number of elements processed between consecutive clock checks in
// ExtractFeaturesWithTimeout().
const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
// Total time budget, in milliseconds, measured from the start of the
// extraction; once reached, extraction is abandoned and reported as failed.
const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
// Running counters accumulated while walking the page's elements. The Handle*
// methods increment these, and InsertFeatures() turns them into entries of the
// output FeatureMap.
struct PhishingDOMFeatureExtractor::PageFeatureState {
// Link (<a>) statistics, updated by HandleLink().
// Number of links whose domain differs from the current frame's domain.
int external_links;
// Distinct domains seen among those external links.
base::hash_set<std::string> external_domains;
// Number of links whose completed URL has the "https" scheme.
int secure_links;
// Number of links for which a domain could be extracted.
int total_links;
// Form and input statistics, updated by HandleForm() and HandleInput().
int num_forms;
int num_text_inputs;
int num_pswd_inputs;
int num_radio_inputs;
int num_check_inputs;
// Form actions pointing at another domain, and form actions for which a
// domain could be extracted at all.
int action_other_domain;
int total_actions;
// Images counted as being on another domain, and images for which a domain
// could be extracted at all. Updated by HandleImage().
int img_other_domain;
int total_imgs;
// Number of <script> elements seen, updated by HandleScript().
int num_script_tags;
// Time at which the extraction started; used for the total-time limit and
// for the timing histograms.
base::TimeTicks start_time;
// Number of times ExtractFeaturesWithTimeout() has run for this extraction.
int num_iterations;
// Zeroes every counter and records |start_time_ticks| as the start time.
explicit PageFeatureState(base::TimeTicks start_time_ticks)
: external_links(0),
secure_links(0),
total_links(0),
num_forms(0),
num_text_inputs(0),
num_pswd_inputs(0),
num_radio_inputs(0),
num_check_inputs(0),
action_other_domain(0),
total_actions(0),
img_other_domain(0),
total_imgs(0),
num_script_tags(0),
start_time(start_time_ticks),
num_iterations(0) {}
~PageFeatureState() {}
};
// Per-document iteration state, created by ResetFrameData() for the document
// currently being processed.
struct PhishingDOMFeatureExtractor::FrameData {
// Collection obtained from the document's all(); it also carries the current
// iteration position (firstItem()/nextItem()).
blink::WebElementCollection elements;
// Result of GetDomainAndRegistry() for the document's URL; may be empty.
std::string domain;
};
// Stores |render_view| and |clock| for later use and puts the extractor into
// its idle state via Clear(). Neither pointer is dereferenced here.
PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
content::RenderView* render_view,
FeatureExtractorClock* clock)
: render_view_(render_view),
clock_(clock),
weak_factory_(this) {
Clear();
}
// Verifies (DCHECK plus an error log) that no extraction is still in progress
// when the extractor is destroyed.
PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
CheckNoPendingExtraction();
}
// Starts an asynchronous feature extraction. Results are written to
// |features| and |done_callback| is run with the outcome once the extraction
// finishes or gives up. The actual work happens in
// ExtractFeaturesWithTimeout(), which is posted to the current message loop.
void PhishingDOMFeatureExtractor::ExtractFeatures(
    FeatureMap* features,
    const DoneCallback& done_callback) {
  // Complain if a previous extraction was left pending, then drop whatever
  // state remains so this extraction starts from a clean slate.
  CheckNoPendingExtraction();
  CancelPendingExtraction();

  features_ = features;
  done_callback_ = done_callback;
  page_feature_state_.reset(new PageFeatureState(clock_->Now()));

  // Begin at the main frame's document when there is one. If not,
  // cur_document_ stays null and the posted task reports failure.
  blink::WebView* view = render_view_->GetWebView();
  blink::WebFrame* main_frame = view ? view->mainFrame() : NULL;
  if (main_frame) {
    cur_document_ = main_frame->document();
  }

  // The weak pointer lets CancelPendingExtraction() cancel the posted task.
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}
// Abandons any extraction in progress without running the done callback.
void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
// Invalidate the weak pointers bound into already-posted
// ExtractFeaturesWithTimeout tasks, then reset the per-extraction members.
weak_factory_.InvalidateWeakPtrs();
Clear();
}
// Processes one time-limited chunk of the extraction. Walks the elements of
// the current document and then of each following document, dispatching
// every element to the matching Handle* method. If the per-chunk budget is
// used up, the method re-posts itself and returns, leaving cur_document_ and
// cur_frame_data_ in place so the next invocation resumes where this one
// stopped. If the total budget is used up, or there is no document at all,
// the callback is run with false. When every document has been processed, the
// features are inserted and the callback is run with true.
void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
DCHECK(page_feature_state_.get());
++page_feature_state_->num_iterations;
base::TimeTicks current_chunk_start_time = clock_->Now();
// No document to look at: report failure.
if (cur_document_.isNull()) {
RunCallback(false);
return;
}
// Elements handled since the last clock check.
int num_elements = 0;
for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
blink::WebElement cur_element;
if (cur_frame_data_.get()) {
// Resuming a document from an earlier chunk. The element at which that
// chunk stopped was already handled before it yielded, so continue with
// the one after it.
cur_element = cur_frame_data_->elements.nextItem();
UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
clock_->Now() - current_chunk_start_time);
} else {
// Starting a new document: build its frame data and begin at the first
// element.
ResetFrameData();
cur_element = cur_frame_data_->elements.firstItem();
}
for (; !cur_element.isNull();
cur_element = cur_frame_data_->elements.nextItem()) {
// Dispatch on tag name; elements with any other tag are ignored.
if (cur_element.hasTagName("a")) {
HandleLink(cur_element);
} else if (cur_element.hasTagName("form")) {
HandleForm(cur_element);
} else if (cur_element.hasTagName("img")) {
HandleImage(cur_element);
} else if (cur_element.hasTagName("input")) {
HandleInput(cur_element);
} else if (cur_element.hasTagName("script")) {
HandleScript(cur_element);
}
// Only consult the clock once every kClockCheckGranularity elements.
if (++num_elements >= kClockCheckGranularity) {
num_elements = 0;
base::TimeTicks now = clock_->Now();
// Total budget exhausted: give up and report failure.
if (now - page_feature_state_->start_time >=
base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
DLOG(ERROR) << "Feature extraction took too long, giving up";
UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
RunCallback(false);
return;
}
// Chunk budget exhausted: schedule a continuation and return without
// touching cur_document_ or cur_frame_data_.
base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
if (chunk_elapsed >=
base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
chunk_elapsed);
base::MessageLoop::current()->PostTask(
FROM_HERE,
base::Bind(
&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
weak_factory_.GetWeakPtr()));
return;
}
}
}
// Finished this document; drop its frame data so the next document gets a
// fresh one via ResetFrameData().
cur_frame_data_.reset();
}
// Every document has been processed.
InsertFeatures();
RunCallback(true);
}
// Updates the link counters for one <a> element. Anchors without an href, and
// anchors for which no domain can be determined, are not counted.
void PhishingDOMFeatureExtractor::HandleLink(
    const blink::WebElement& element) {
  if (!element.hasAttribute("href")) {
    DVLOG(1) << "Skipping anchor tag with no href";
    return;
  }

  // Complete the href through the owning document before inspecting it.
  blink::WebURL link_url =
      element.document().completeURL(element.getAttribute("href"));

  std::string link_domain;
  const bool points_elsewhere = IsExternalDomain(link_url, &link_domain);
  if (link_domain.empty()) {
    DVLOG(1) << "Could not extract domain from link: " << link_url;
    return;
  }

  PageFeatureState* state = page_feature_state_.get();
  ++state->total_links;
  if (GURL(link_url).SchemeIs("https")) {
    ++state->secure_links;
  }
  if (points_elsewhere) {
    ++state->external_links;
    state->external_domains.insert(link_domain);
  }
}
// Updates the form counters for one <form> element. Every form increments
// num_forms; the action counters are only touched when the form has an action
// attribute and a domain can be determined for it.
void PhishingDOMFeatureExtractor::HandleForm(
    const blink::WebElement& element) {
  PageFeatureState* state = page_feature_state_.get();
  ++state->num_forms;

  if (!element.hasAttribute("action")) {
    return;
  }

  // Complete the action through the owning document before inspecting it.
  blink::WebURL action_url =
      element.document().completeURL(element.getAttribute("action"));

  std::string action_domain;
  const bool points_elsewhere = IsExternalDomain(action_url, &action_domain);
  if (action_domain.empty()) {
    DVLOG(1) << "Could not extract domain from form action: " << action_url;
    return;
  }

  ++state->total_actions;
  if (points_elsewhere) {
    ++state->action_other_domain;
  }
}
// Updates the image counters for one <img> element. Images without a src
// attribute, and images for which no domain can be determined, are not
// counted.
void PhishingDOMFeatureExtractor::HandleImage(
    const blink::WebElement& element) {
  if (!element.hasAttribute("src")) {
    DVLOG(1) << "Skipping img tag with no src";
    // Actually skip the element, as the log message says and as HandleLink()
    // does for anchors without an href. Without this return the missing
    // attribute's value was still passed to completeURL() and the image could
    // end up in total_imgs, skewing the image frequency feature.
    return;
  }

  // Complete the src through the owning document before inspecting it.
  blink::WebURL full_url = element.document().completeURL(
      element.getAttribute("src"));

  std::string domain;
  bool is_external = IsExternalDomain(full_url, &domain);
  if (domain.empty()) {
    DVLOG(1) << "Could not extract domain from image src: " << full_url;
    return;
  }

  if (is_external) {
    ++page_feature_state_->img_other_domain;
  }
  ++page_feature_state_->total_imgs;
}
// Classifies one <input> element by its lower-cased type attribute and bumps
// the matching counter. Password, radio and checkbox inputs have their own
// counters; submit, reset, file, hidden, image and button inputs are ignored;
// every other type value (including an empty one) counts as a text input.
void PhishingDOMFeatureExtractor::HandleInput(
    const blink::WebElement& element) {
  std::string input_type = element.getAttribute("type").utf8();
  StringToLowerASCII(&input_type);

  PageFeatureState* state = page_feature_state_.get();
  if (input_type == "password") {
    ++state->num_pswd_inputs;
    return;
  }
  if (input_type == "radio") {
    ++state->num_radio_inputs;
    return;
  }
  if (input_type == "checkbox") {
    ++state->num_check_inputs;
    return;
  }

  const bool is_ignored_type =
      input_type == "submit" || input_type == "reset" ||
      input_type == "file" || input_type == "hidden" ||
      input_type == "image" || input_type == "button";
  if (!is_ignored_type) {
    ++state->num_text_inputs;
  }
}
// Counts one <script> element. |element| itself is not inspected.
void PhishingDOMFeatureExtractor::HandleScript(
const blink::WebElement& element) {
++page_feature_state_->num_script_tags;
}
// Asserts that the extractor is idle: no callback stored, no frame data and
// no current document. The DCHECKs cover builds where they are enabled; the
// explicit check below additionally logs an error when any of the three is
// still set.
void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
  DCHECK(done_callback_.is_null());
  DCHECK(!cur_frame_data_.get());
  DCHECK(cur_document_.isNull());

  const bool is_idle = done_callback_.is_null() &&
                       !cur_frame_data_.get() &&
                       cur_document_.isNull();
  if (is_idle) {
    return;
  }
  LOG(ERROR) << "Extraction in progress, missing call to "
             << "CancelPendingExtraction";
}
void PhishingDOMFeatureExtractor::RunCallback(bool success) {
DCHECK(page_feature_state_.get());
UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
page_feature_state_->num_iterations);
UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
clock_->Now() - page_feature_state_->start_time);
DCHECK(!done_callback_.is_null());
done_callback_.Run(success);
Clear();
}
// Returns the extractor to its idle state by dropping the stored callback,
// the output feature map pointer, the per-frame data and the current
// document. page_feature_state_ is left untouched.
void PhishingDOMFeatureExtractor::Clear() {
  done_callback_.Reset();
  features_ = NULL;
  cur_frame_data_.reset();
  cur_document_.reset();
}
void PhishingDOMFeatureExtractor::ResetFrameData() {
DCHECK(!cur_document_.isNull());
DCHECK(!cur_frame_data_.get());
cur_frame_data_.reset(new FrameData());
cur_frame_data_->elements = cur_document_.all();
cur_frame_data_->domain =
net::registry_controlled_domains::GetDomainAndRegistry(
cur_document_.url(),
net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
}
// Returns the document of the next frame, in traverseNext(false) order after
// the current document's frame, that has a non-null document. Returns a null
// WebDocument when there is no such frame, or when the current document no
// longer has a frame (which is also recorded in a histogram).
blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
  DCHECK(!cur_document_.isNull());

  blink::WebFrame* frame = cur_document_.frame();
  if (!frame) {
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
    return blink::WebDocument();
  }

  for (frame = frame->traverseNext(false); frame;
       frame = frame->traverseNext(false)) {
    blink::WebDocument candidate = frame->document();
    if (!candidate.isNull()) {
      return candidate;
    }
  }
  return blink::WebDocument();
}
// Determines the domain of |url| and whether it differs from the domain of
// the frame currently being processed.
//
// |domain| receives the URL's host when the host is an IP address, otherwise
// its domain-and-registry string. When the current frame's own domain is
// empty, |domain| is left untouched and false is returned.
//
// Returns true only when a non-empty domain was found and it is not equal to
// the current frame's domain.
bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
                                                   std::string* domain) const {
  DCHECK(domain);
  DCHECK(cur_frame_data_.get());

  const std::string& frame_domain = cur_frame_data_->domain;
  if (frame_domain.empty()) {
    return false;
  }

  if (url.HostIsIPAddress()) {
    *domain = url.host();
  } else {
    *domain = net::registry_controlled_domains::GetDomainAndRegistry(
        url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
  }

  if (domain->empty()) {
    return false;
  }
  return *domain != frame_domain;
}
void PhishingDOMFeatureExtractor::InsertFeatures() {
DCHECK(page_feature_state_.get());
if (page_feature_state_->total_links > 0) {
double link_freq = static_cast<double>(
page_feature_state_->external_links) /
page_feature_state_->total_links;
features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
for (base::hash_set<std::string>::iterator it =
page_feature_state_->external_domains.begin();
it != page_feature_state_->external_domains.end(); ++it) {
features_->AddBooleanFeature(features::kPageLinkDomain + *it);
}
double secure_freq = static_cast<double>(
page_feature_state_->secure_links) / page_feature_state_->total_links;
features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
}
if (page_feature_state_->num_forms > 0) {
features_->AddBooleanFeature(features::kPageHasForms);
}
if (page_feature_state_->num_text_inputs > 0) {
features_->AddBooleanFeature(features::kPageHasTextInputs);
}
if (page_feature_state_->num_pswd_inputs > 0) {
features_->AddBooleanFeature(features::kPageHasPswdInputs);
}
if (page_feature_state_->num_radio_inputs > 0) {
features_->AddBooleanFeature(features::kPageHasRadioInputs);
}
if (page_feature_state_->num_check_inputs > 0) {
features_->AddBooleanFeature(features::kPageHasCheckInputs);
}
if (page_feature_state_->total_actions > 0) {
double action_freq = static_cast<double>(
page_feature_state_->action_other_domain) /
page_feature_state_->total_actions;
features_->AddRealFeature(features::kPageActionOtherDomainFreq,
action_freq);
}
if (page_feature_state_->total_imgs > 0) {
double img_freq = static_cast<double>(
page_feature_state_->img_other_domain) /
page_feature_state_->total_imgs;
features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
}
if (page_feature_state_->num_script_tags > 1) {
features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
if (page_feature_state_->num_script_tags > 6) {
features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
}
}
}
}