root/chrome/browser/safe_browsing/safe_browsing_store_file.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
#define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_

#include <set>
#include <vector>

#include "chrome/browser/safe_browsing/safe_browsing_store.h"

#include "base/callback.h"
#include "base/files/file_path.h"
#include "base/files/scoped_file.h"

// Implement SafeBrowsingStore in terms of a flat file.  The file
// format is pretty literal:
//
// int32 magic;             // magic number "validating" file
// int32 version;           // format version
//
// // Counts for the various data which follows the header.
// uint32 add_chunk_count;  // Chunks seen, including empties.
// uint32 sub_chunk_count;  // Ditto.
// uint32 shard_stride;     // SBPrefix space covered per shard.
//                          // 0==entire space in one shard.
// // Sorted by chunk_id.
// array[add_chunk_count] {
//   int32 chunk_id;
// }
// // Sorted by chunk_id.
// array[sub_chunk_count] {
//   int32 chunk_id;
// }
// MD5Digest header_checksum;  // Checksum over preceding data.
//
// // Sorted by prefix, then add chunk_id, then hash, both within shards and
// // overall.
// array[from 0 to wraparound to 0 by shard_stride] {
//   uint32 add_prefix_count;
//   uint32 sub_prefix_count;
//   uint32 add_hash_count;
//   uint32 sub_hash_count;
//   array[add_prefix_count] {
//     int32 chunk_id;
//     uint32 prefix;
//   }
//   array[sub_prefix_count] {
//     int32 chunk_id;
//     int32 add_chunk_id;
//     uint32 add_prefix;
//   }
//   array[add_hash_count] {
//     int32 chunk_id;
//     int32 received_time;     // From base::Time::ToTimeT().
//     char[32] full_hash;
//   }
//   array[sub_hash_count] {
//     int32 chunk_id;
//     int32 add_chunk_id;
//     char[32] add_full_hash;
//   }
// }
// MD5Digest checksum;      // Checksum over entire file.
//
// The checksums are used to allow writing the file without doing an expensive
// fsync().  Since the data can be re-fetched, failing the checksum is not
// catastrophic.  Histograms indicate that file corruption here is pretty
// uncommon.
//
// The |header_checksum| is present to guarantee valid header and chunk data for
// updates.  Only that part of the file needs to be read to post the update.
//
// |shard_stride| breaks the file into approximately-equal portions, allowing
// updates to stream from one file to another with modest memory usage.  It is
// dynamic to adjust to different file sizes without adding excessive overhead.
//
// During the course of an update, uncommitted data is stored in a
// temporary file (which is later re-used to commit).  This is an
// array of chunks, with the count kept in memory until the end of the
// transaction.  The format of this file is like the main file, with
// the list of chunks seen omitted, as that data is tracked in-memory:
//
// array[] {
//   uint32 add_prefix_count;
//   uint32 sub_prefix_count;
//   uint32 add_hash_count;
//   uint32 sub_hash_count;
//   array[add_prefix_count] {
//     int32 chunk_id;
//     uint32 prefix;
//   }
//   array[sub_prefix_count] {
//     int32 chunk_id;
//     int32 add_chunk_id;
//     uint32 add_prefix;
//   }
//   array[add_hash_count] {
//     int32 chunk_id;
//     int32 received_time;     // From base::Time::ToTimeT().
//     char[32] full_hash;
//   }
//   array[sub_hash_count] {
//     int32 chunk_id;
//     int32 add_chunk_id;
//     char[32] add_full_hash;
//   }
// }
//
// The overall transaction works like this:
// - Open the original file to get the chunks-seen data.
// - Open a temp file for storing new chunk info.
// - Write new chunks to the temp file.
// - When the transaction is finished:
//   - Read the update data from the temp file into memory.
//   - Overwrite the temp file with new header data.
//   - Until done:
//     - Read shards of the original file's data into memory.
//     - Merge from the update data.
//     - Write shards to the temp file.
//   - Delete original file.
//   - Rename temp file to original filename.

// SafeBrowsingStore implementation backed by a flat file plus a temporary
// scratch file (named by TemporaryFileForFilename()) that accumulates
// uncommitted chunk data during an update.  Not copyable.
class SafeBrowsingStoreFile : public SafeBrowsingStore {
 public:
  SafeBrowsingStoreFile();
  virtual ~SafeBrowsingStoreFile();

  // |filename| names the main database file; |corruption_callback| is the
  // closure OnCorruptDatabase() invokes (see |corruption_callback_|).
  // NOTE(review): whether Init() touches the disk is not visible from this
  // header -- confirm against the implementation.
  virtual void Init(const base::FilePath& filename,
                    const base::Closure& corruption_callback) OVERRIDE;

  // Delete any on-disk files, including the permanent storage.
  virtual bool Delete() OVERRIDE;

  // Get all add hash prefixes and full-length hashes, respectively, from
  // the store.
  virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) OVERRIDE;
  virtual bool GetAddFullHashes(
      std::vector<SBAddFullHash>* add_full_hashes) OVERRIDE;

  // Chunk accumulation.  Data is collected in the in-memory chunk buffers
  // (|add_prefixes_|, |sub_prefixes_|, |add_hashes_|, |sub_hashes_|)
  // between BeginChunk() and FinishChunk().
  // NOTE(review): presumably the Write*() calls append to those buffers and
  // FinishChunk() flushes them to |new_file_| -- confirm in the .cc.
  virtual bool BeginChunk() OVERRIDE;

  virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) OVERRIDE;
  virtual bool WriteAddHash(int32 chunk_id,
                            base::Time receive_time,
                            const SBFullHash& full_hash) OVERRIDE;
  virtual bool WriteSubPrefix(int32 chunk_id,
                              int32 add_chunk_id, SBPrefix prefix) OVERRIDE;
  virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                            const SBFullHash& full_hash) OVERRIDE;
  virtual bool FinishChunk() OVERRIDE;

  // Update transaction boundaries.  The chunks-seen caches are loaded from
  // the database on BeginUpdate() so they can be queried during the
  // transaction.
  virtual bool BeginUpdate() OVERRIDE;
  // Store updates with pending add full hashes (|pending_adds|) in the
  // file store.  Results are handed back through |builder| and
  // |add_full_hashes_result|.
  // NOTE(review): an earlier version of this comment named an
  // |add_prefixes_result| out-parameter that no longer exists; presumably
  // the add prefixes now go to |builder| -- confirm against the .cc.
  virtual bool FinishUpdate(
      const std::vector<SBAddFullHash>& pending_adds,
      safe_browsing::PrefixSetBuilder* builder,
      std::vector<SBAddFullHash>* add_full_hashes_result) OVERRIDE;
  virtual bool CancelUpdate() OVERRIDE;

  // Chunks-seen bookkeeping for the current transaction.
  // NOTE(review): presumably backed by |add_chunks_cache_| and
  // |sub_chunks_cache_| -- confirm in the .cc.
  virtual void SetAddChunk(int32 chunk_id) OVERRIDE;
  virtual bool CheckAddChunk(int32 chunk_id) OVERRIDE;
  virtual void GetAddChunks(std::vector<int32>* out) OVERRIDE;
  virtual void SetSubChunk(int32 chunk_id) OVERRIDE;
  virtual bool CheckSubChunk(int32 chunk_id) OVERRIDE;
  virtual void GetSubChunks(std::vector<int32>* out) OVERRIDE;

  // Chunk deletion requests.  The set of deleted chunks is cached during a
  // transaction and applied on FinishUpdate() (see |add_del_cache_| and
  // |sub_del_cache_|).
  virtual void DeleteAddChunk(int32 chunk_id) OVERRIDE;
  virtual void DeleteSubChunk(int32 chunk_id) OVERRIDE;

  // Verify |file_|'s checksum, calling the corruption callback if it
  // does not check out.  Empty input is considered valid.
  virtual bool CheckValidity() OVERRIDE;

  // Returns the name of the temporary file used to buffer data for
  // |filename|: the same path with "_new" appended.  Exported for unit
  // tests.
  static const base::FilePath TemporaryFileForFilename(
      const base::FilePath& filename) {
    return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
  }

  // Static counterpart of Delete(): delete any on-disk files, including
  // the permanent storage, for the store identified by |basename|.
  // NOTE(review): presumably this also removes the "_new" temporary file --
  // confirm in the .cc.
  static bool DeleteStore(const base::FilePath& basename);

 private:
  // Update store file with pending full hashes.  Takes the same arguments
  // as FinishUpdate().
  // NOTE(review): presumably FinishUpdate() delegates here -- confirm.
  virtual bool DoUpdate(const std::vector<SBAddFullHash>& pending_adds,
                        safe_browsing::PrefixSetBuilder* builder,
                        std::vector<SBAddFullHash>* add_full_hashes_result);

  // Enumerate different format-change events for histogramming
  // purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
  // TODO(shess): Remove this once the format change is complete.
  enum FormatEventType {
    // Corruption detected, broken down by file format.
    FORMAT_EVENT_FILE_CORRUPT,
    FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

    // The type of format found in the file.  The expected case (new
    // file format) is intentionally not covered.
    FORMAT_EVENT_FOUND_SQLITE,
    FORMAT_EVENT_FOUND_UNKNOWN,

    // The number of SQLite-format files deleted should be the same as
    // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
    // or if a failure prevents the update from succeeding.
    FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
    FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

    // Found and deleted (or failed to delete) the ancient "Safe
    // Browsing" file.
    FORMAT_EVENT_DELETED_ORIGINAL,
    FORMAT_EVENT_DELETED_ORIGINAL_FAILED,

    // The checksum did not check out in CheckValidity() or in
    // FinishUpdate().  This most likely indicates that the machine
    // crashed before the file was fully sync'ed to disk.
    FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
    FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

    // Memory space for histograms is determined by the max.  ALWAYS
    // ADD NEW VALUES BEFORE THIS ONE.
    FORMAT_EVENT_MAX
  };

  // Helper to record an event related to format conversion from
  // SQLite to file.
  static void RecordFormatEvent(FormatEventType event_type);

  // Some very lucky users have an original-format file still in their
  // profile.  Check for it and delete, recording a histogram for the
  // result (no histogram for not-found).  Logically this
  // would make more sense at the SafeBrowsingDatabase level, but
  // practically speaking that code doesn't touch files directly.
  static void CheckForOriginalAndDelete(const base::FilePath& filename);

  // Close all files and clear all buffers.
  bool Close();

  // Calls |corruption_callback_| if non-NULL, always returns false as
  // a convenience to the caller.
  bool OnCorruptDatabase();

  // Helper for creating a corruption callback for |old_store_|.
  // NOTE(review): no |old_store_| member is declared in this class, so the
  // sentence above looks stale -- confirm what this is still used for.
  // TODO(shess): Remove after migration.
  void HandleCorruptDatabase();

  // Clear temporary buffers used to accumulate chunk data.  Each buffer is
  // swapped with a freshly constructed empty container so that its storage
  // is actually released.  Always returns true.
  bool ClearChunkBuffers() {
    // NOTE: .clear() doesn't release memory.
    // TODO(shess): Figure out if this is overkill.  Some amount of
    // pre-reserved space is probably reasonable between each chunk
    // collected.
    SBAddPrefixes().swap(add_prefixes_);
    SBSubPrefixes().swap(sub_prefixes_);
    std::vector<SBAddFullHash>().swap(add_hashes_);
    std::vector<SBSubFullHash>().swap(sub_hashes_);
    return true;
  }

  // Clear all buffers used during update: the chunk buffers, the
  // chunks-written counter, the chunks-seen caches and the deleted-chunk
  // caches.  Uses the same swap-with-empty idiom as ClearChunkBuffers().
  void ClearUpdateBuffers() {
    ClearChunkBuffers();
    chunks_written_ = 0;
    std::set<int32>().swap(add_chunks_cache_);
    std::set<int32>().swap(sub_chunks_cache_);
    base::hash_set<int32>().swap(add_del_cache_);
    base::hash_set<int32>().swap(sub_del_cache_);
  }

  // Buffers for collecting data between BeginChunk() and
  // FinishChunk().
  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_hashes_;
  std::vector<SBSubFullHash> sub_hashes_;

  // Count of chunks collected in |new_file_|.  Reset to 0 by
  // ClearUpdateBuffers().
  int chunks_written_;

  // Name of the main database file.
  base::FilePath filename_;

  // Handles to the main and scratch files.  |empty_| is true if the
  // main file didn't exist when the update was started.
  base::ScopedFILE file_;
  base::ScopedFILE new_file_;
  bool empty_;

  // Cache of chunks which have been seen.  Loaded from the database
  // on BeginUpdate() so that it can be queried during the
  // transaction.
  std::set<int32> add_chunks_cache_;
  std::set<int32> sub_chunks_cache_;

  // Cache the set of deleted chunks during a transaction, applied on
  // FinishUpdate().
  // TODO(shess): If the set is small enough, hash_set<> might be
  // slower than plain set<>.
  base::hash_set<int32> add_del_cache_;
  base::hash_set<int32> sub_del_cache_;

  // Invoked by OnCorruptDatabase() when non-NULL.
  base::Closure corruption_callback_;

  // Tracks whether corruption has already been seen in the current
  // update, so that only one instance is recorded in the stats.
  // TODO(shess): Remove with format-migration support.
  bool corruption_seen_;

  DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
};

#endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_

/* [<][>][^][v][top][bottom][index][help] */