// Copyright 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_ #define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_ #include <limits> #include <map> #include <set> #include <string> #include <vector> #include "base/basictypes.h" #include "components/url_matcher/string_pattern.h" #include "components/url_matcher/url_matcher_export.h" namespace url_matcher { // Class that store a set of string patterns and can find for a string S, // which string patterns occur in S. class URL_MATCHER_EXPORT SubstringSetMatcher { public: SubstringSetMatcher(); ~SubstringSetMatcher(); // Registers all |patterns|. The ownership remains with the caller. // The same pattern cannot be registered twice and each pattern needs to have // a unique ID. // Ownership of the patterns remains with the caller. void RegisterPatterns(const std::vector<const StringPattern*>& patterns); // Unregisters the passed |patterns|. void UnregisterPatterns(const std::vector<const StringPattern*>& patterns); // Analogous to RegisterPatterns and UnregisterPatterns but executes both // operations in one step, which is cheaper in the execution. void RegisterAndUnregisterPatterns( const std::vector<const StringPattern*>& to_register, const std::vector<const StringPattern*>& to_unregister); // Matches |text| against all registered StringPatterns. Stores the IDs // of matching patterns in |matches|. |matches| is not cleared before adding // to it. bool Match(const std::string& text, std::set<StringPattern::ID>* matches) const; // Returns true if this object retains no allocated data. Only for debugging. bool IsEmpty() const; private: // A node of an Aho Corasick Tree. This is implemented according to // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf // // The algorithm is based on the idea of building a trie of all registered // patterns. Each node of the tree is annotated with a set of pattern // IDs that are used to report matches. // // The root of the trie represents an empty match. If we were looking whether // any registered pattern matches a text at the beginning of the text (i.e. // whether any pattern is a prefix of the text), we could just follow // nodes in the trie according to the matching characters in the text. // E.g., if text == "foobar", we would follow the trie from the root node // to its child labeled 'f', from there to child 'o', etc. In this process we // would report all pattern IDs associated with the trie nodes as matches. // // As we are not looking for all prefix matches but all substring matches, // this algorithm would need to compare text.substr(0), text.substr(1), ... // against the trie, which is in O(|text|^2). // // The Aho Corasick algorithm improves this runtime by using failure edges. // In case we have found a partial match of length k in the text // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at // a node at depth k, but cannot find a match in the trie for character // text[i + k] at depth k + 1, we follow a failure edge. This edge // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that // is a prefix of any registered pattern. // // If your brain thinks "Forget it, let's go shopping.", don't worry. // Take a nap and read an introductory text on the Aho Corasick algorithm. // It will make sense. Eventually. class AhoCorasickNode { public: // Key: label of the edge, value: node index in |tree_| of parent class. typedef std::map<char, uint32> Edges; typedef std::set<StringPattern::ID> Matches; static const uint32 kNoSuchEdge; // Represents an invalid node index. AhoCorasickNode(); ~AhoCorasickNode(); AhoCorasickNode(const AhoCorasickNode& other); AhoCorasickNode& operator=(const AhoCorasickNode& other); uint32 GetEdge(char c) const; void SetEdge(char c, uint32 node); const Edges& edges() const { return edges_; } uint32 failure() const { return failure_; } void set_failure(uint32 failure) { failure_ = failure; } void AddMatch(StringPattern::ID id); void AddMatches(const Matches& matches); const Matches& matches() const { return matches_; } private: // Outgoing edges of current node. Edges edges_; // Node index that failure edge leads to. uint32 failure_; // Identifiers of matches. Matches matches_; }; typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap; typedef std::vector<const StringPattern*> SubstringPatternVector; // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string. void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns); // Inserts a path for |pattern->pattern()| into the tree and adds // |pattern->id()| to the set of matches. Ownership of |pattern| remains with // the caller. void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern); void CreateFailureEdges(); // Set of all registered StringPatterns. Used to regenerate the // Aho-Corasick tree in case patterns are registered or unregistered. SubstringPatternMap patterns_; // The nodes of a Aho-Corasick tree. std::vector<AhoCorasickNode> tree_; DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher); }; } // namespace url_matcher #endif // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_