// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // // PhishingUrlFeatureExtractor handles computing URL-based features for // the client-side phishing detection model. These include tokens in the // host and path, features pertaining to host length, and IP addresses. #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_ #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_ #include <string> #include <vector> #include "base/basictypes.h" class GURL; namespace safe_browsing { class FeatureMap; class PhishingUrlFeatureExtractor { public: PhishingUrlFeatureExtractor(); ~PhishingUrlFeatureExtractor(); // Extracts features for |url| into the given feature map. // Returns true on success. bool ExtractFeatures(const GURL& url, FeatureMap* features); private: friend class PhishingUrlFeatureExtractorTest; static const size_t kMinPathComponentLength = 3; // Given a string, finds all substrings of consecutive alphanumeric // characters of length >= kMinPathComponentLength and inserts them into // tokens. static void SplitStringIntoLongAlphanumTokens( const std::string& full, std::vector<std::string>* tokens); DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor); }; } // namespace safe_browsing #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_