root/components/url_matcher/url_matcher.cc

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. IsRegexCriterion
  2. IsOriginAndPathRegexCriterion
  3. string_pattern_
  4. string_pattern_
  5. string_pattern_
  6. IsFullURLCondition
  7. IsRegexCondition
  8. IsOriginAndPathRegexCondition
  9. IsMatch
  10. CanonicalizeURLForComponentSearches
  11. CreateHostPrefixCondition
  12. CreateHostSuffixCondition
  13. CreateHostContainsCondition
  14. CreateHostEqualsCondition
  15. CreatePathPrefixCondition
  16. CreatePathSuffixCondition
  17. CreatePathContainsCondition
  18. CreatePathEqualsCondition
  19. CreateQueryPrefixCondition
  20. CreateQuerySuffixCondition
  21. CreateQueryContainsCondition
  22. CreateQueryEqualsCondition
  23. CreateHostSuffixPathPrefixCondition
  24. CreateHostEqualsPathPrefixCondition
  25. CanonicalizeURLForFullSearches
  26. CanonicalizeURLForRegexSearchesHelper
  27. CanonicalizeURLForRegexSearches
  28. CanonicalizeURLForOriginAndPathRegexSearches
  29. CreateURLPrefixCondition
  30. CreateURLSuffixCondition
  31. CreateURLContainsCondition
  32. CreateURLEqualsCondition
  33. CreateURLMatchesCondition
  34. CreateOriginAndPathMatchesCondition
  35. ForgetUnusedPatterns
  36. IsEmpty
  37. CreateCondition
  38. CanonicalizeHostname
  39. IsMatch
  40. IsMatch
  41. CreateRange
  42. CreateRange
  43. conditions_
  44. port_filter_
  45. IsMatch
  46. AddConditionSets
  47. RemoveConditionSets
  48. ClearUnusedConditionSets
  49. MatchURL
  50. IsEmpty
  51. UpdateSubstringSetMatcher
  52. UpdateRegexSetMatcher
  53. UpdateTriggers
  54. UpdateConditionFactory
  55. UpdateInternalDatastructures

// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_matcher/url_matcher.h"

#include <algorithm>
#include <iterator>

#include "base/logging.h"
#include "url/gurl.h"
#include "url/url_canon.h"

namespace url_matcher {

// This set of classes implement a mapping of URL Component Patterns, such as
// host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
// for use in substring comparisons.
//
// The idea of this mapping is to reduce the problem of comparing many
// URL Component Patterns against one URL to the problem of searching many
// substrings in one string:
//
// ----------------------                    -----------------
// | URL Query operator | ----translate----> | StringPattern |
// ----------------------                    -----------------
//                                                   ^
//                                                   |
//                                                compare
//                                                   |
//                                                   v
// ----------------------                    -----------------
// | URL to compare     |                    |               |
// | to all URL Query   | ----translate----> | String        |
// | operators          |                    |               |
// ----------------------                    -----------------
//
// The reason for this problem reduction is that there are efficient algorithms
// for searching many substrings in one string (see Aho-Corasick algorithm).
//
// Additionally, some of the same pieces are reused to implement regular
// expression comparisons. The FilteredRE2 implementation for matching many
// regular expressions against one string uses prefiltering, in which a set
// of substrings (derived from the regexes) are first searched for, to reduce
// the number of regular expressions to test; the prefiltering step also
// uses Aho-Corasick.
//
// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
// ==========================================================
//
// For searches in this class, we normalize URLs as follows:
//
// Step 1:
// Remove scheme, port and segment from URL:
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    www.example.com/index.html?search=foo
//
// We remove the scheme and port number because they can be checked later
// in a secondary filter step. We remove the segment (the #... part) because
// this is not guaranteed to be ASCII-7 encoded.
//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - ED = End of Domain
// - EP = End of Path
// - EU = End of URL
// Furthermore, the hostname is canonicalized to start with a ".".
//
// Position markers are represented as characters >127, which are therefore
// guaranteed not to be part of the ASCII-7 encoded URL character set.
//
// -> www.example.com/index.html?search=foo becomes
// BU .www.example.com ED /index.html EP ?search=foo EU
//
// -> www.example.com/index.html becomes
// BU .www.example.com ED /index.html EP EU
//
// Step 3:
// Translate URL Component Patterns as follows:
//
// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
// -> host_prefix("www.example") = BU .www.example
//
// host_suffix(suffix) = suffix ED
// -> host_suffix("example.com") = example.com ED
// -> host_suffix(".example.com") = .example.com ED
//
// host_equals(domain) = BU add_missing_dot_prefix(domain) ED
// -> host_equals("www.example.com") = BU .www.example.com ED
//
// Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
//
// With this, we can search the StringPatterns in the normalized URL.
//
//
// Case 2: url_{prefix,suffix,equals,contains} searches.
// =====================================================
//
// Step 1: as above, except that
// - the scheme is not removed
// - the port is not removed if it is specified and does not match the default
//   port for the given scheme.
//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - EU = End of URL
//
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
// BU http://www.example.com:8080/index.html?search=foo EU
// -> http://www.example.com:80/index.html?search=foo#first_match becomes
// BU http://www.example.com/index.html?search=foo EU
//
// url_prefix(prefix) = BU prefix
// -> url_prefix("http://www.example") = BU http://www.example
//
// url_contains(substring) = substring
// -> url_contains("index") = index
//
//
// Case 3: {host,path,query}_contains searches.
// ============================================
//
// These kinds of searches are not supported directly but can be derived
// by a combination of a url_contains() query followed by an explicit test:
//
// host_contains(str) = url_contains(str) followed by test whether str occurs
//   in host component of original URL.
// -> host_contains("example.co") = example.co
//    followed by gurl.host().find("example.co");
//
// [similarly for path_contains and query_contains].
//
//
// Regular expression matching (url_matches searches)
// ==================================================
//
// This class also supports matching regular expressions (RE2 syntax)
// against full URLs, which are transformed as in case 2.

namespace {

bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
  return criterion == URLMatcherCondition::URL_MATCHES;
}

bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
  return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
}

}  // namespace

//
// URLMatcherCondition
//

URLMatcherCondition::URLMatcherCondition()
    : criterion_(HOST_PREFIX),
      string_pattern_(NULL) {}

URLMatcherCondition::~URLMatcherCondition() {}

URLMatcherCondition::URLMatcherCondition(
    Criterion criterion,
    const StringPattern* string_pattern)
    : criterion_(criterion),
      string_pattern_(string_pattern) {}

URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
    : criterion_(rhs.criterion_),
      string_pattern_(rhs.string_pattern_) {}

URLMatcherCondition& URLMatcherCondition::operator=(
    const URLMatcherCondition& rhs) {
  criterion_ = rhs.criterion_;
  string_pattern_ = rhs.string_pattern_;
  return *this;
}

bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
  if (criterion_ < rhs.criterion_) return true;
  if (criterion_ > rhs.criterion_) return false;
  if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    return *string_pattern_ < *rhs.string_pattern_;
  if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
  // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
  // or both are NULL.
  return false;
}

bool URLMatcherCondition::IsFullURLCondition() const {
  // For these criteria the SubstringMatcher needs to be executed on the
  // GURL that is canonicalized with
  // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
  switch (criterion_) {
    case HOST_CONTAINS:
    case PATH_CONTAINS:
    case QUERY_CONTAINS:
    case URL_PREFIX:
    case URL_SUFFIX:
    case URL_CONTAINS:
    case URL_EQUALS:
      return true;
    default:
      break;
  }
  return false;
}

bool URLMatcherCondition::IsRegexCondition() const {
  return IsRegexCriterion(criterion_);
}

bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
  return IsOriginAndPathRegexCriterion(criterion_);
}

bool URLMatcherCondition::IsMatch(
    const std::set<StringPattern::ID>& matching_patterns,
    const GURL& url) const {
  DCHECK(string_pattern_);
  if (!ContainsKey(matching_patterns, string_pattern_->id()))
    return false;
  // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
  // a substring match on the raw URL. In case of a match, we need to verify
  // that the match was found in the correct component of the URL.
  switch (criterion_) {
    case HOST_CONTAINS:
      return url.host().find(string_pattern_->pattern()) !=
          std::string::npos;
    case PATH_CONTAINS:
      return url.path().find(string_pattern_->pattern()) !=
          std::string::npos;
    case QUERY_CONTAINS:
      return url.query().find(string_pattern_->pattern()) !=
          std::string::npos;
    default:
      break;
  }
  return true;
}

//
// URLMatcherConditionFactory
//

namespace {
// These are symbols that are not contained in 7-bit ASCII used in GURLs.
const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
const char kEndOfDomain[] = {static_cast<char>(-2), 0};
const char kEndOfPath[] = {static_cast<char>(-3), 0};
const char kEndOfURL[] = {static_cast<char>(-4), 0};
}  // namespace

URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}

URLMatcherConditionFactory::~URLMatcherConditionFactory() {
  STLDeleteElements(&substring_pattern_singletons_);
  STLDeleteElements(&regex_pattern_singletons_);
  STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
}

std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
    const GURL& url) const {
  return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
         url.path() + kEndOfPath +
         (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::HOST_PREFIX,
      kBeginningOfURL + CanonicalizeHostname(prefix));
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
      suffix + kEndOfDomain);
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::HOST_EQUALS,
      kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::PATH_PREFIX,
      kEndOfDomain + prefix);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::PATH_EQUALS,
      kEndOfDomain + str + kEndOfPath);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
    const std::string& prefix) {
  std::string pattern;
  if (!prefix.empty() && prefix[0] == '?')
    pattern = kEndOfPath + prefix;
  else
    pattern = kEndOfPath + ('?' + prefix);

  return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
    const std::string& suffix) {
  if (!suffix.empty() && suffix[0] == '?') {
    return CreateQueryEqualsCondition(suffix);
  } else {
    return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
                           suffix + kEndOfURL);
  }
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
    const std::string& str) {
  if (!str.empty() && str[0] == '?')
    return CreateQueryPrefixCondition(str);
  else
    return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
    const std::string& str) {
  std::string pattern;
  if (!str.empty() && str[0] == '?')
    pattern = kEndOfPath + str + kEndOfURL;
  else
    pattern = kEndOfPath + ('?' + str) + kEndOfURL;

  return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
}

URLMatcherCondition
    URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
    const std::string& host_suffix,
    const std::string& path_prefix) {
  return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
      host_suffix + kEndOfDomain + path_prefix);
}

URLMatcherCondition
URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
    const std::string& host,
    const std::string& path_prefix) {
  return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
      kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
      path_prefix);
}

std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
    const GURL& url) const {
  GURL::Replacements replacements;
  replacements.ClearPassword();
  replacements.ClearUsername();
  replacements.ClearRef();
  // Clear port if it is implicit from scheme.
  if (url.has_port()) {
    const std::string& port = url.scheme();
    if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
            url.EffectiveIntPort()) {
      replacements.ClearPort();
    }
  }
  return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
      kEndOfURL;
}

static std::string CanonicalizeURLForRegexSearchesHelper(
    const GURL& url,
    bool clear_query) {
  GURL::Replacements replacements;
  replacements.ClearPassword();
  replacements.ClearUsername();
  replacements.ClearRef();
  if (clear_query)
    replacements.ClearQuery();
  // Clear port if it is implicit from scheme.
  if (url.has_port()) {
    const std::string& port = url.scheme();
    if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
            url.EffectiveIntPort()) {
      replacements.ClearPort();
    }
  }
  return url.ReplaceComponents(replacements).spec();
}

std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
    const GURL& url) const {
  return CanonicalizeURLForRegexSearchesHelper(url, false);
}

std::string
URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
    const GURL& url) const {
  return CanonicalizeURLForRegexSearchesHelper(url, true);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::URL_PREFIX,
      kBeginningOfURL + prefix);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::URL_EQUALS,
      kBeginningOfURL + str + kEndOfURL);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
    const std::string& regex) {
  return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
}

URLMatcherCondition
URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
    const std::string& regex) {
  return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
}

void URLMatcherConditionFactory::ForgetUnusedPatterns(
      const std::set<StringPattern::ID>& used_patterns) {
  PatternSingletons::iterator i = substring_pattern_singletons_.begin();
  while (i != substring_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      substring_pattern_singletons_.erase(i++);
    }
  }
  i = regex_pattern_singletons_.begin();
  while (i != regex_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      regex_pattern_singletons_.erase(i++);
    }
  }
  i = origin_and_path_regex_pattern_singletons_.begin();
  while (i != origin_and_path_regex_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      origin_and_path_regex_pattern_singletons_.erase(i++);
    }
  }
}

bool URLMatcherConditionFactory::IsEmpty() const {
  return substring_pattern_singletons_.empty() &&
      regex_pattern_singletons_.empty() &&
      origin_and_path_regex_pattern_singletons_.empty();
}

URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
    URLMatcherCondition::Criterion criterion,
    const std::string& pattern) {
  StringPattern search_pattern(pattern, 0);
  PatternSingletons* pattern_singletons = NULL;
  if (IsRegexCriterion(criterion))
    pattern_singletons = &regex_pattern_singletons_;
  else if (IsOriginAndPathRegexCriterion(criterion))
    pattern_singletons = &origin_and_path_regex_pattern_singletons_;
  else
    pattern_singletons = &substring_pattern_singletons_;

  PatternSingletons::const_iterator iter =
      pattern_singletons->find(&search_pattern);

  if (iter != pattern_singletons->end()) {
    return URLMatcherCondition(criterion, *iter);
  } else {
    StringPattern* new_pattern =
        new StringPattern(pattern, id_counter_++);
    pattern_singletons->insert(new_pattern);
    return URLMatcherCondition(criterion, new_pattern);
  }
}

std::string URLMatcherConditionFactory::CanonicalizeHostname(
    const std::string& hostname) const {
  if (!hostname.empty() && hostname[0] == '.')
    return hostname;
  else
    return "." + hostname;
}

bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
    StringPattern* lhs,
    StringPattern* rhs) const {
  if (lhs == NULL && rhs != NULL) return true;
  if (lhs != NULL && rhs != NULL)
    return lhs->pattern() < rhs->pattern();
  // Either both are NULL or only rhs is NULL.
  return false;
}

//
// URLMatcherSchemeFilter
//

URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
    : filters_(1) {
  filters_.push_back(filter);
}

URLMatcherSchemeFilter::URLMatcherSchemeFilter(
    const std::vector<std::string>& filters)
    : filters_(filters) {}

URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}

bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
  return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
      filters_.end();
}

//
// URLMatcherPortFilter
//

URLMatcherPortFilter::URLMatcherPortFilter(
    const std::vector<URLMatcherPortFilter::Range>& ranges)
    : ranges_(ranges) {}

URLMatcherPortFilter::~URLMatcherPortFilter() {}

bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
  int port = url.EffectiveIntPort();
  for (std::vector<Range>::const_iterator i = ranges_.begin();
       i != ranges_.end(); ++i) {
    if (i->first <= port && port <= i->second)
      return true;
  }
  return false;
}

// static
URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
                                                              int to) {
  return Range(from, to);
}

// static
URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
  return Range(port, port);
}

//
// URLMatcherConditionSet
//

URLMatcherConditionSet::~URLMatcherConditionSet() {}

URLMatcherConditionSet::URLMatcherConditionSet(
    ID id,
    const Conditions& conditions)
    : id_(id),
      conditions_(conditions) {}

URLMatcherConditionSet::URLMatcherConditionSet(
    ID id,
    const Conditions& conditions,
    scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    scoped_ptr<URLMatcherPortFilter> port_filter)
    : id_(id),
      conditions_(conditions),
      scheme_filter_(scheme_filter.Pass()),
      port_filter_(port_filter.Pass()) {}

bool URLMatcherConditionSet::IsMatch(
    const std::set<StringPattern::ID>& matching_patterns,
    const GURL& url) const {
  for (Conditions::const_iterator i = conditions_.begin();
       i != conditions_.end(); ++i) {
    if (!i->IsMatch(matching_patterns, url))
      return false;
  }
  if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
    return false;
  if (port_filter_.get() && !port_filter_->IsMatch(url))
    return false;
  return true;
}

//
// URLMatcher
//

URLMatcher::URLMatcher() {}

URLMatcher::~URLMatcher() {}

void URLMatcher::AddConditionSets(
    const URLMatcherConditionSet::Vector& condition_sets) {
  for (URLMatcherConditionSet::Vector::const_iterator i =
       condition_sets.begin(); i != condition_sets.end(); ++i) {
    DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
        url_matcher_condition_sets_.end());
    url_matcher_condition_sets_[(*i)->id()] = *i;
  }
  UpdateInternalDatastructures();
}

void URLMatcher::RemoveConditionSets(
    const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
  for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
       condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
    DCHECK(url_matcher_condition_sets_.find(*i) !=
        url_matcher_condition_sets_.end());
    url_matcher_condition_sets_.erase(*i);
  }
  UpdateInternalDatastructures();
}

void URLMatcher::ClearUnusedConditionSets() {
  UpdateConditionFactory();
}

std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
    const GURL& url) const {
  // Find all IDs of StringPatterns that match |url|.
  // See URLMatcherConditionFactory for the canonicalization of URLs and the
  // distinction between full url searches and url component searches.
  std::set<StringPattern::ID> matches;
  if (!full_url_matcher_.IsEmpty()) {
    full_url_matcher_.Match(
        condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
  }
  if (!url_component_matcher_.IsEmpty()) {
    url_component_matcher_.Match(
        condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
  }
  if (!regex_set_matcher_.IsEmpty()) {
    regex_set_matcher_.Match(
        condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
  }
  if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
    origin_and_path_regex_set_matcher_.Match(
        condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
        &matches);
  }

  // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
  // were fulfilled.
  std::set<URLMatcherConditionSet::ID> result;
  for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
       i != matches.end(); ++i) {
    // For each URLMatcherConditionSet there is exactly one condition
    // registered in substring_match_triggers_. This means that the following
    // logic tests each URLMatcherConditionSet exactly once if it can be
    // completely fulfilled.
    StringPatternTriggers::const_iterator triggered_condition_sets_iter =
        substring_match_triggers_.find(*i);
    if (triggered_condition_sets_iter == substring_match_triggers_.end())
      continue;  // Not all substring matches are triggers for a condition set.
    const std::set<URLMatcherConditionSet::ID>& condition_sets =
        triggered_condition_sets_iter->second;
    for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
         condition_sets.begin(); j != condition_sets.end(); ++j) {
      URLMatcherConditionSets::const_iterator condition_set_iter =
          url_matcher_condition_sets_.find(*j);
      DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
      if (condition_set_iter->second->IsMatch(matches, url))
        result.insert(*j);
    }
  }

  return result;
}

bool URLMatcher::IsEmpty() const {
  return condition_factory_.IsEmpty() &&
      url_matcher_condition_sets_.empty() &&
      substring_match_triggers_.empty() &&
      full_url_matcher_.IsEmpty() &&
      url_component_matcher_.IsEmpty() &&
      regex_set_matcher_.IsEmpty() &&
      origin_and_path_regex_set_matcher_.IsEmpty() &&
      registered_full_url_patterns_.empty() &&
      registered_url_component_patterns_.empty();
}

void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
  // The purpose of |full_url_conditions| is just that we need to execute
  // the same logic once for Full URL searches and once for URL Component
  // searches (see URLMatcherConditionFactory).

  // Determine which patterns need to be registered when this function
  // terminates.
  std::set<const StringPattern*> new_patterns;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      // If we are called to process Full URL searches, ignore others, and
      // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
      if (!condition_iter->IsRegexCondition() &&
          !condition_iter->IsOriginAndPathRegexCondition() &&
          full_url_conditions == condition_iter->IsFullURLCondition())
        new_patterns.insert(condition_iter->string_pattern());
    }
  }

  // This is the set of patterns that were registered before this function
  // is called.
  std::set<const StringPattern*>& registered_patterns =
      full_url_conditions ? registered_full_url_patterns_
                          : registered_url_component_patterns_;

  // Add all patterns that are in new_patterns but not in registered_patterns.
  std::vector<const StringPattern*> patterns_to_register =
      base::STLSetDifference<std::vector<const StringPattern*> >(
          new_patterns, registered_patterns);

  // Remove all patterns that are in registered_patterns but not in
  // new_patterns.
  std::vector<const StringPattern*> patterns_to_unregister =
      base::STLSetDifference<std::vector<const StringPattern*> >(
           registered_patterns, new_patterns);

  // Update the SubstringSetMatcher.
  SubstringSetMatcher& url_matcher =
      full_url_conditions ? full_url_matcher_ : url_component_matcher_;
  url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
                                            patterns_to_unregister);

  // Update the set of registered_patterns for the next time this function
  // is being called.
  registered_patterns.swap(new_patterns);
}

void URLMatcher::UpdateRegexSetMatcher() {
  std::vector<const StringPattern*> new_patterns;
  std::vector<const StringPattern*> new_origin_and_path_patterns;

  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      if (condition_iter->IsRegexCondition()) {
        new_patterns.push_back(condition_iter->string_pattern());
      } else if (condition_iter->IsOriginAndPathRegexCondition()) {
        new_origin_and_path_patterns.push_back(
            condition_iter->string_pattern());
      }
    }
  }

  // Start over from scratch. We can't really do better than this, since the
  // FilteredRE2 backend doesn't support incremental updates.
  regex_set_matcher_.ClearPatterns();
  regex_set_matcher_.AddPatterns(new_patterns);
  origin_and_path_regex_set_matcher_.ClearPatterns();
  origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
}

void URLMatcher::UpdateTriggers() {
  // Count substring pattern frequencies.
  std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      const StringPattern* pattern = condition_iter->string_pattern();
      substring_pattern_frequencies[pattern->id()]++;
    }
  }

  // Update trigger conditions: Determine for each URLMatcherConditionSet which
  // URLMatcherCondition contains a StringPattern that occurs least
  // frequently in this URLMatcher. We assume that this condition is very
  // specific and occurs rarely in URLs. If a match occurs for this
  // URLMatcherCondition, we want to test all other URLMatcherCondition in the
  // respective URLMatcherConditionSet as well to see whether the entire
  // URLMatcherConditionSet is considered matching.
  substring_match_triggers_.clear();
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    if (conditions.empty())
      continue;
    URLMatcherConditionSet::Conditions::const_iterator condition_iter =
        conditions.begin();
    StringPattern::ID trigger = condition_iter->string_pattern()->id();
    // We skip the first element in the following loop.
    ++condition_iter;
    for (; condition_iter != conditions.end(); ++condition_iter) {
      StringPattern::ID current_id =
          condition_iter->string_pattern()->id();
      if (substring_pattern_frequencies[trigger] >
          substring_pattern_frequencies[current_id]) {
        trigger = current_id;
      }
    }
    substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
  }
}

void URLMatcher::UpdateConditionFactory() {
  std::set<StringPattern::ID> used_patterns;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      used_patterns.insert(condition_iter->string_pattern()->id());
    }
  }
  condition_factory_.ForgetUnusedPatterns(used_patterns);
}

void URLMatcher::UpdateInternalDatastructures() {
  UpdateSubstringSetMatcher(false);
  UpdateSubstringSetMatcher(true);
  UpdateRegexSetMatcher();
  UpdateTriggers();
  UpdateConditionFactory();
}

}  // namespace url_matcher

/* [<][>][^][v][top][bottom][index][help] */