root/chrome/browser/safe_browsing/browser_feature_extractor.cc

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. FilterBenignIpsOnIOThread
  2. resource_type
  3. AddFeature
  4. AddMalwareIpUrlInfo
  5. AddNavigationFeatures
  6. weak_factory_
  7. ExtractFeatures
  8. ExtractMalwareFeatures
  9. ExtractBrowseInfoFeatures
  10. StartExtractFeatures
  11. QueryUrlHistoryDone
  12. QueryHttpHostVisitsDone
  13. QueryHttpsHostVisitsDone
  14. SetHostVisitsFeatures
  15. StorePendingQuery
  16. GetPendingQuery
  17. GetHistoryService
  18. FinishExtractMalwareFeatures

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/browser_feature_extractor.h"

#include <map>
#include <utility>

#include "base/bind.h"
#include "base/bind_helpers.h"
#include "base/format_macros.h"
#include "base/stl_util.h"
#include "base/strings/stringprintf.h"
#include "base/time/time.h"
#include "chrome/browser/common/cancelable_request.h"
#include "chrome/browser/history/history_service.h"
#include "chrome/browser/history/history_service_factory.h"
#include "chrome/browser/history/history_types.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/safe_browsing/browser_features.h"
#include "chrome/browser/safe_browsing/client_side_detection_host.h"
#include "chrome/browser/safe_browsing/database_manager.h"
#include "chrome/common/safe_browsing/csd.pb.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/navigation_controller.h"
#include "content/public/browser/navigation_entry.h"
#include "content/public/browser/web_contents.h"
#include "content/public/common/page_transition_types.h"
#include "url/gurl.h"

using content::BrowserThread;
using content::NavigationController;
using content::NavigationEntry;
using content::WebContents;

namespace safe_browsing {

namespace {

const int kMaxMalwareIPPerRequest = 5;

void FilterBenignIpsOnIOThread(
    scoped_refptr<SafeBrowsingDatabaseManager> database_manager,
    IPUrlMap* ips) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  for (IPUrlMap::iterator it = ips->begin(); it != ips->end();) {
    if (!database_manager.get() ||
        !database_manager->MatchMalwareIP(it->first)) {
      // it++ here returns a copy of the old iterator and passes it to erase.
      ips->erase(it++);
    } else {
      ++it;
    }
  }
}
}  // namespace

IPUrlInfo::IPUrlInfo(const std::string& url,
                     const std::string& method,
                     const std::string& referrer,
                     const ResourceType::Type& resource_type)
      : url(url),
        method(method),
        referrer(referrer),
        resource_type(resource_type) {
}

IPUrlInfo::~IPUrlInfo() {}

BrowseInfo::BrowseInfo() : http_status_code(0) {}

BrowseInfo::~BrowseInfo() {}

static void AddFeature(const std::string& feature_name,
                       double feature_value,
                       ClientPhishingRequest* request) {
  DCHECK(request);
  ClientPhishingRequest::Feature* feature =
      request->add_non_model_feature_map();
  feature->set_name(feature_name);
  feature->set_value(feature_value);
  VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value();
}

static void AddMalwareIpUrlInfo(const std::string& ip,
                                const std::vector<IPUrlInfo>& meta_infos,
                                ClientMalwareRequest* request) {
  DCHECK(request);
  for (std::vector<IPUrlInfo>::const_iterator it = meta_infos.begin();
       it != meta_infos.end(); ++it) {
    ClientMalwareRequest::UrlInfo* urlinfo =
        request->add_bad_ip_url_info();
    // We add the information about url on the bad ip.
    urlinfo->set_ip(ip);
    urlinfo->set_url(it->url);
    urlinfo->set_method(it->method);
    urlinfo->set_referrer(it->referrer);
    urlinfo->set_resource_type(static_cast<int>(it->resource_type));
  }
  DVLOG(2) << "Added url info for bad ip: " << ip;
}

static void AddNavigationFeatures(
    const std::string& feature_prefix,
    const NavigationController& controller,
    int index,
    const std::vector<GURL>& redirect_chain,
    ClientPhishingRequest* request) {
  NavigationEntry* entry = controller.GetEntryAtIndex(index);
  bool is_secure_referrer = entry->GetReferrer().url.SchemeIsSecure();
  if (!is_secure_referrer) {
    AddFeature(base::StringPrintf("%s%s=%s",
                                  feature_prefix.c_str(),
                                  features::kReferrer,
                                  entry->GetReferrer().url.spec().c_str()),
               1.0,
               request);
  }
  AddFeature(feature_prefix + features::kHasSSLReferrer,
             is_secure_referrer ? 1.0 : 0.0,
             request);
  AddFeature(feature_prefix + features::kPageTransitionType,
             static_cast<double>(
                 content::PageTransitionStripQualifier(
                    entry->GetTransitionType())),
             request);
  AddFeature(feature_prefix + features::kIsFirstNavigation,
             index == 0 ? 1.0 : 0.0,
             request);
  // Redirect chain should always be at least of size one, as the rendered
  // url is the last element in the chain.
  if (redirect_chain.empty()) {
    NOTREACHED();
    return;
  }
  if (redirect_chain.back() != entry->GetURL()) {
    // I originally had this as a DCHECK but I saw a failure once that I
    // can't reproduce. It looks like it might be related to the
    // navigation controller only keeping a limited number of navigation
    // events. For now we'll just attach a feature specifying that this is
    // a mismatch and try and figure out what to do with it on the server.
    DLOG(WARNING) << "Expected:" << entry->GetURL()
                 << " Actual:" << redirect_chain.back();
    AddFeature(feature_prefix + features::kRedirectUrlMismatch,
               1.0,
               request);
    return;
  }
  // We skip the last element since it should just be the current url.
  for (size_t i = 0; i < redirect_chain.size() - 1; i++) {
    std::string printable_redirect = redirect_chain[i].spec();
    if (redirect_chain[i].SchemeIsSecure()) {
      printable_redirect = features::kSecureRedirectValue;
    }
    AddFeature(base::StringPrintf("%s%s[%" PRIuS "]=%s",
                                  feature_prefix.c_str(),
                                  features::kRedirect,
                                  i,
                                  printable_redirect.c_str()),
               1.0,
               request);
  }
}

BrowserFeatureExtractor::BrowserFeatureExtractor(
    WebContents* tab,
    ClientSideDetectionHost* host)
    : tab_(tab),
      host_(host),
      weak_factory_(this) {
  DCHECK(tab);
}

BrowserFeatureExtractor::~BrowserFeatureExtractor() {
  weak_factory_.InvalidateWeakPtrs();
  // Delete all the pending extractions (delete callback and request objects).
  STLDeleteContainerPairFirstPointers(pending_extractions_.begin(),
                                      pending_extractions_.end());

  // Also cancel all the pending history service queries.
  HistoryService* history;
  bool success = GetHistoryService(&history);
  DCHECK(success || pending_queries_.size() == 0);
  // Cancel all the pending history lookups and cleanup the memory.
  for (PendingQueriesMap::iterator it = pending_queries_.begin();
       it != pending_queries_.end(); ++it) {
    if (history) {
      history->CancelRequest(it->first);
    }
    ExtractionData& extraction = it->second;
    delete extraction.first;  // delete request
  }
  pending_queries_.clear();
}

void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info,
                                              ClientPhishingRequest* request,
                                              const DoneCallback& callback) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  DCHECK(request);
  DCHECK(info);
  DCHECK_EQ(0U, request->url().find("http:"));
  DCHECK(!callback.is_null());
  // Extract features pertaining to this navigation.
  const NavigationController& controller = tab_->GetController();
  int url_index = -1;
  int first_host_index = -1;

  GURL request_url(request->url());
  int index = controller.GetCurrentEntryIndex();
  // The url that we are extracting features for should already be commited.
  DCHECK_NE(index, -1);
  for (; index >= 0; index--) {
    NavigationEntry* entry = controller.GetEntryAtIndex(index);
    if (url_index == -1 && entry->GetURL() == request_url) {
      // It's possible that we've been on the on the possibly phishy url before
      // in this tab, so make sure that we use the latest navigation for
      // features.
      // Note that it's possible that the url_index should always be the
      // latest entry, but I'm worried about possible races during a navigation
      // and transient entries (i.e. interstiatials) so for now we will just
      // be cautious.
      url_index = index;
    } else if (index < url_index) {
      if (entry->GetURL().host() == request_url.host()) {
        first_host_index = index;
      } else {
        // We have found the possibly phishing url, but we are no longer on the
        // host. No reason to look back any further.
        break;
      }
    }
  }

  // Add features pertaining to how we got to
  //   1) The candidate url
  //   2) The first url on the same host as the candidate url (assuming that
  //      it's different from the candidate url).
  if (url_index != -1) {
    AddNavigationFeatures(
        std::string(), controller, url_index, info->url_redirects, request);
  }
  if (first_host_index != -1) {
    AddNavigationFeatures(features::kHostPrefix,
                          controller,
                          first_host_index,
                          info->host_redirects,
                          request);
  }

  ExtractBrowseInfoFeatures(*info, request);
  pending_extractions_[request] = callback;
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&BrowserFeatureExtractor::StartExtractFeatures,
                 weak_factory_.GetWeakPtr(), request, callback));
}

void BrowserFeatureExtractor::ExtractMalwareFeatures(
    BrowseInfo* info,
    ClientMalwareRequest* request,
    const MalwareDoneCallback& callback) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  DCHECK(!callback.is_null());

  // Grab the IPs because they might go away before we're done
  // checking them against the IP blacklist on the IO thread.
  scoped_ptr<IPUrlMap> ips(new IPUrlMap);
  ips->swap(info->ips);

  IPUrlMap* ips_ptr = ips.get();

  // The API doesn't take a scoped_ptr because the API gets mocked and we
  // cannot mock an API that takes scoped_ptr as arguments.
  scoped_ptr<ClientMalwareRequest> req(request);

  // IP blacklist lookups have to happen on the IO thread.
  BrowserThread::PostTaskAndReply(
      BrowserThread::IO,
      FROM_HERE,
      base::Bind(&FilterBenignIpsOnIOThread,
                 host_->database_manager(),
                 ips_ptr),
      base::Bind(&BrowserFeatureExtractor::FinishExtractMalwareFeatures,
                 weak_factory_.GetWeakPtr(),
                 base::Passed(&ips), callback, base::Passed(&req)));
}

void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
    const BrowseInfo& info,
    ClientPhishingRequest* request) {
  if (info.unsafe_resource.get()) {
    // A SafeBrowsing interstitial was shown for the current URL.
    AddFeature(features::kSafeBrowsingMaliciousUrl +
               info.unsafe_resource->url.spec(),
               1.0,
               request);
    AddFeature(features::kSafeBrowsingOriginalUrl +
               info.unsafe_resource->original_url.spec(),
               1.0,
               request);
    AddFeature(features::kSafeBrowsingIsSubresource,
               info.unsafe_resource->is_subresource ? 1.0 : 0.0,
               request);
    AddFeature(features::kSafeBrowsingThreatType,
               static_cast<double>(info.unsafe_resource->threat_type),
               request);
  }
  if (info.http_status_code != 0) {
    AddFeature(features::kHttpStatusCode, info.http_status_code, request);
  }
}

void BrowserFeatureExtractor::StartExtractFeatures(
    ClientPhishingRequest* request,
    const DoneCallback& callback) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  size_t removed = pending_extractions_.erase(request);
  DCHECK_EQ(1U, removed);
  HistoryService* history;
  if (!request || !request->IsInitialized() || !GetHistoryService(&history)) {
    callback.Run(false, request);
    return;
  }
  CancelableRequestProvider::Handle handle = history->QueryURL(
      GURL(request->url()),
      true /* wants_visits */,
      &request_consumer_,
      base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone,
                 base::Unretained(this)));

  StorePendingQuery(handle, request, callback);
}

void BrowserFeatureExtractor::QueryUrlHistoryDone(
    CancelableRequestProvider::Handle handle,
    bool success,
    const history::URLRow* row,
    history::VisitVector* visits) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  ClientPhishingRequest* request;
  DoneCallback callback;
  if (!GetPendingQuery(handle, &request, &callback)) {
    DLOG(FATAL) << "No pending history query found";
    return;
  }
  DCHECK(request);
  DCHECK(!callback.is_null());
  if (!success) {
    // URL is not found in the history.  In practice this should not
    // happen (unless there is a real error) because we just visited
    // that URL.
    callback.Run(false, request);
    return;
  }
  AddFeature(features::kUrlHistoryVisitCount,
             static_cast<double>(row->visit_count()),
             request);

  base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1);
  int num_visits_24h_ago = 0;
  int num_visits_typed = 0;
  int num_visits_link = 0;
  for (history::VisitVector::const_iterator it = visits->begin();
       it != visits->end(); ++it) {
    if (!content::PageTransitionIsMainFrame(it->transition)) {
      continue;
    }
    if (it->visit_time < threshold) {
      ++num_visits_24h_ago;
    }
    content::PageTransition transition = content::PageTransitionStripQualifier(
        it->transition);
    if (transition == content::PAGE_TRANSITION_TYPED) {
      ++num_visits_typed;
    } else if (transition == content::PAGE_TRANSITION_LINK) {
      ++num_visits_link;
    }
  }
  AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo,
             static_cast<double>(num_visits_24h_ago),
             request);
  AddFeature(features::kUrlHistoryTypedCount,
             static_cast<double>(num_visits_typed),
             request);
  AddFeature(features::kUrlHistoryLinkCount,
             static_cast<double>(num_visits_link),
             request);

  // Issue next history lookup for host visits.
  HistoryService* history;
  if (!GetHistoryService(&history)) {
    callback.Run(false, request);
    return;
  }
  CancelableRequestProvider::Handle next_handle =
      history->GetVisibleVisitCountToHost(
          GURL(request->url()),
          &request_consumer_,
          base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone,
                     base::Unretained(this)));
  StorePendingQuery(next_handle, request, callback);
}

void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
    CancelableRequestProvider::Handle handle,
    bool success,
    int num_visits,
    base::Time first_visit) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  ClientPhishingRequest* request;
  DoneCallback callback;
  if (!GetPendingQuery(handle, &request, &callback)) {
    DLOG(FATAL) << "No pending history query found";
    return;
  }
  DCHECK(request);
  DCHECK(!callback.is_null());
  if (!success) {
    callback.Run(false, request);
    return;
  }
  SetHostVisitsFeatures(num_visits, first_visit, true, request);

  // Same lookup but for the HTTPS URL.
  HistoryService* history;
  if (!GetHistoryService(&history)) {
    callback.Run(false, request);
    return;
  }
  std::string https_url = request->url();
  CancelableRequestProvider::Handle next_handle =
      history->GetVisibleVisitCountToHost(
          GURL(https_url.replace(0, 5, "https:")),
          &request_consumer_,
          base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone,
                     base::Unretained(this)));
  StorePendingQuery(next_handle, request, callback);
}

void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
    CancelableRequestProvider::Handle handle,
    bool success,
    int num_visits,
    base::Time first_visit) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  ClientPhishingRequest* request;
  DoneCallback callback;
  if (!GetPendingQuery(handle, &request, &callback)) {
    DLOG(FATAL) << "No pending history query found";
    return;
  }
  DCHECK(request);
  DCHECK(!callback.is_null());
  if (!success) {
    callback.Run(false, request);
    return;
  }
  SetHostVisitsFeatures(num_visits, first_visit, false, request);
  callback.Run(true, request);  // We're done with all the history lookups.
}

void BrowserFeatureExtractor::SetHostVisitsFeatures(
    int num_visits,
    base::Time first_visit,
    bool is_http_query,
    ClientPhishingRequest* request) {
  DCHECK(request);
  AddFeature(is_http_query ?
             features::kHttpHostVisitCount : features::kHttpsHostVisitCount,
             static_cast<double>(num_visits),
             request);
  if (num_visits > 0) {
    AddFeature(
        is_http_query ?
        features::kFirstHttpHostVisitMoreThan24hAgo :
        features::kFirstHttpsHostVisitMoreThan24hAgo,
        (first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
        1.0 : 0.0,
        request);
  }
}

void BrowserFeatureExtractor::StorePendingQuery(
    CancelableRequestProvider::Handle handle,
    ClientPhishingRequest* request,
    const DoneCallback& callback) {
  DCHECK_EQ(0U, pending_queries_.count(handle));
  pending_queries_[handle] = std::make_pair(request, callback);
}

bool BrowserFeatureExtractor::GetPendingQuery(
    CancelableRequestProvider::Handle handle,
    ClientPhishingRequest** request,
    DoneCallback* callback) {
  PendingQueriesMap::iterator it = pending_queries_.find(handle);
  DCHECK(it != pending_queries_.end());
  if (it != pending_queries_.end()) {
    *request = it->second.first;
    *callback = it->second.second;
    pending_queries_.erase(it);
    return true;
  }
  return false;
}

bool BrowserFeatureExtractor::GetHistoryService(HistoryService** history) {
  *history = NULL;
  if (tab_ && tab_->GetBrowserContext()) {
    Profile* profile = Profile::FromBrowserContext(tab_->GetBrowserContext());
    *history = HistoryServiceFactory::GetForProfile(profile,
                                                    Profile::EXPLICIT_ACCESS);
    if (*history) {
      return true;
    }
  }
  VLOG(2) << "Unable to query history.  No history service available.";
  return false;
}

void BrowserFeatureExtractor::FinishExtractMalwareFeatures(
    scoped_ptr<IPUrlMap> bad_ips,
    MalwareDoneCallback callback,
    scoped_ptr<ClientMalwareRequest> request) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  int matched_bad_ips = 0;
  for (IPUrlMap::const_iterator it = bad_ips->begin();
       it != bad_ips->end(); ++it) {
    AddMalwareIpUrlInfo(it->first, it->second, request.get());
    ++matched_bad_ips;
    // Limit the number of matched bad IPs in one request to control
    // the request's size
    if (matched_bad_ips >= kMaxMalwareIPPerRequest) {
      break;
    }
  }
  callback.Run(true, request.Pass());
}

}  // namespace safe_browsing

/* [<][>][^][v][top][bottom][index][help] */