root/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingTermFeatureExtractor handles computing term features from the text
// of a web page for the client-side phishing detection model.  To do this, it
// takes a list of terms that appear in the model, and scans through the page
// text looking for them.  Any terms that appear will cause a corresponding
// features::kPageTerm feature to be added to the FeatureMap.
//
// To make it harder for a phisher to enumerate all of the relevant terms in
// the model, the terms are provided as SHA-256 hashes, rather than plain text.
//
// There is one PhishingTermFeatureExtractor per RenderView.
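//
// A rough usage sketch (illustrative only; the hash-set contents, the seed
// name, and the clock shown here are placeholders, not the values the real
// classifier uses):
//
//   base::hash_set<std::string> term_hashes;  // SHA-256 hashes of the terms.
//   base::hash_set<uint32> word_hashes;       // murmur3 hashes of the words.
//   FeatureExtractorClock clock;              // Or a mock clock for tests.
//   PhishingTermFeatureExtractor extractor(
//       &term_hashes, &word_hashes, /* max_words_per_term= */ 2,
//       kMurmurHash3Seed, &clock);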

#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_

#include <string>

#include "base/basictypes.h"
#include "base/callback.h"
#include "base/containers/hash_tables.h"
#include "base/containers/mru_cache.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/weak_ptr.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"

namespace safe_browsing {
class FeatureExtractorClock;
class FeatureMap;

class PhishingTermFeatureExtractor {
 public:
  // Callback to be run when feature extraction finishes.  The callback
  // argument is true if extraction was successful, false otherwise.
  typedef base::Callback<void(bool)> DoneCallback;

  // Creates a PhishingTermFeatureExtractor which will extract features for
  // all of the terms whose SHA-256 hashes are in |page_term_hashes|.  These
  // terms may be multi-word n-grams, with at most |max_words_per_term| words.
  //
  // |page_word_hashes| contains the murmur3 hashes of all of the individual
  // words that make up the terms.  Both the terms and the words are lowercased
  // and UTF-8 encoded prior to hashing.  The caller owns both hash sets, and
  // must ensure that they remain valid until the PhishingTermFeatureExtractor
  // is destroyed.
  //
  // |clock| is used for timing feature extractor operations, and may be mocked
  // for testing.  The caller keeps ownership of the clock.
  PhishingTermFeatureExtractor(
      const base::hash_set<std::string>* page_term_hashes,
      const base::hash_set<uint32>* page_word_hashes,
      size_t max_words_per_term,
      uint32 murmurhash3_seed,
      FeatureExtractorClock* clock);
  ~PhishingTermFeatureExtractor();

  // Begins extracting features from |page_text| into the given FeatureMap.
  // |page_text| should contain the plain text of a web page, including any
  // subframes, as returned by RenderView::CaptureText().
  //
  // To avoid blocking the render thread for too long, the feature extractor
  // may run in several chunks of work, posting a task to the current
  // MessageLoop to continue processing.  Once feature extraction is complete,
  // |done_callback| is run on the current thread.
  // PhishingTermFeatureExtractor keeps its own copy of the callback.
  //
  // |page_text| and |features| are owned by the caller, and must not be
  // destroyed until either |done_callback| is run or
  // CancelPendingExtraction() is called.
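  //
  // A hypothetical call site (the |extractor_|, |page_text_|, |features_|,
  // classifier class, and |weak_factory_| names below are illustrative and
  // not part of this class):
  //
  //   extractor_->ExtractFeatures(
  //       &page_text_, features_.get(),
  //       base::Bind(&MyClassifier::OnTermExtractionDone,
  //                  weak_factory_.GetWeakPtr()));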
  void ExtractFeatures(const base::string16* page_text,
                       FeatureMap* features,
                       const DoneCallback& done_callback);

  // Cancels any pending feature extraction.  The DoneCallback will not be run.
  // Must be called if there is a feature extraction in progress when the page
  // is unloaded or the PhishingTermFeatureExtractor is destroyed.
  void CancelPendingExtraction();

 private:
  struct ExtractionState;

  // The maximum amount of wall time that we will spend on a single extraction
  // iteration before pausing to let other MessageLoop tasks run.
  static const int kMaxTimePerChunkMs;

  // The number of words that we will process before checking to see whether
  // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
  // slow, we don't do this on every word processed.
  static const int kClockCheckGranularity;

  // The maximum total amount of time that the feature extractor will run
  // before giving up on the current page.
  static const int kMaxTotalTimeMs;

  // The maximum size of the negative word cache, which we use to avoid
  // repeating the lowercasing, UTF-8 conversion, and hashing of recently seen
  // words.

  // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
  // until a predefined maximum amount of time has elapsed, then posts a task
  // to the current MessageLoop to continue extraction.  When extraction
  // finishes, calls RunCallback().
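  //
  // A rough sketch of the presumed control flow, pieced together from the
  // comments on the timing constants above (illustrative, not the actual
  // implementation):
  //
  //   for each remaining word in |page_text_|:
  //     HandleWord(word);
  //     every kClockCheckGranularity words, consult |clock_|:
  //       if the total elapsed time exceeds kMaxTotalTimeMs, give up and
  //           run the callback with success == false;
  //       if the current chunk exceeds kMaxTimePerChunkMs, post a task to
  //           the current MessageLoop to resume ExtractFeaturesWithTimeout()
  //           and return;
  //   RunCallback(true);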
  void ExtractFeaturesWithTimeout();

  // Handles a single word in the page text.
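  //
  // Presumed per-word flow, based on the member comments below (illustrative):
  //   1. Skip the word if it is in |negative_word_cache_|.
  //   2. Lowercase the word, convert it to UTF-8, and murmur3-hash it; if the
  //      hash is not in |page_word_hashes_|, add the word to the negative
  //      cache and skip it.
  //   3. Otherwise, build the n-grams of up to |max_words_per_term_| words
  //      that end at this word, SHA-256 hash each one, and add a
  //      features::kPageTerm feature for every hash that appears in
  //      |page_term_hashes_|.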
  void HandleWord(const base::StringPiece16& word);

  // Helper to verify that there is no pending feature extraction.  Dies in
  // debug builds if the state is not as expected.  This is a no-op in release
  // builds.
  void CheckNoPendingExtraction();

  // Runs |done_callback_| and then clears all internal state.
  void RunCallback(bool success);

  // Clears all internal feature extraction state.
  void Clear();

  // All of the term hashes that we are looking for in the page.
  const base::hash_set<std::string>* page_term_hashes_;

  // Murmur3 hashes of all the individual words in page_term_hashes_.  If
  // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
  // would contain (hashed) "one" and "two".  We do this so that we can have a
  // quick out in the common case that the current word we are processing
  // doesn't contain any part of one of our terms.
  const base::hash_set<uint32>* page_word_hashes_;

  // The maximum number of words in an n-gram.
  const size_t max_words_per_term_;

  // The seed for murmurhash3.
  const uint32 murmurhash3_seed_;

  // This cache is used to see if we need to check the word at all, as
  // converting to UTF-8, lowercasing, and hashing are all relatively expensive
  // operations.  Despite the name, base::HashingMRUCache evicts the least
  // recently used entries first, so it behaves like an LRU cache.
  typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
  WordCache negative_word_cache_;

  // Non-owned pointer to our clock.
  FeatureExtractorClock* clock_;

  // The output parameters from the most recent call to ExtractFeatures().
  const base::string16* page_text_;  // The caller keeps ownership of this.
  FeatureMap* features_;  // The caller keeps ownership of this.
  DoneCallback done_callback_;

  // Stores the current state of term extraction from |page_text_|.
  scoped_ptr<ExtractionState> state_;

  // Used in scheduling ExtractFeaturesWithTimeout tasks.
  // These pointers are invalidated if extraction is cancelled.
  base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;

  DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
};

}  // namespace safe_browsing

#endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_