This source file includes following definitions.
- can_inline_
- can_inline_
- MatchScoreGreater
- FilterTermMatchesByWordStarts
- GetTopicalityScore
- FillInTermScoreToTopicalityScoreArray
- GetRecencyScore
- FillInDaysAgoToRecencyScoreArray
- GetFrecency
- GetFinalRelevancyScore
- Init
#include "chrome/browser/history/scored_history_match.h"
#include <algorithm>
#include <functional>
#include <iterator>
#include <numeric>
#include <set>
#include <math.h>
#include "base/logging.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/autocomplete/history_url_provider.h"
#include "chrome/browser/autocomplete/url_prefix.h"
#include "chrome/browser/bookmarks/bookmark_service.h"
#include "chrome/browser/omnibox/omnibox_field_trial.h"
#include "chrome/common/chrome_switches.h"
#include "content/public/browser/browser_thread.h"
namespace history {
const size_t ScoredHistoryMatch::kMaxVisitsToScore = 10;
const int ScoredHistoryMatch::kDaysToPrecomputeRecencyScoresFor = 366;
const int ScoredHistoryMatch::kMaxRawTermScore = 30;
float* ScoredHistoryMatch::raw_term_score_to_topicality_score_ = NULL;
float* ScoredHistoryMatch::days_ago_to_recency_score_ = NULL;
bool ScoredHistoryMatch::initialized_ = false;
int ScoredHistoryMatch::bookmark_value_ = 1;
bool ScoredHistoryMatch::discount_frecency_when_few_visits_ = false;
bool ScoredHistoryMatch::allow_tld_matches_ = false;
bool ScoredHistoryMatch::allow_scheme_matches_ = false;
bool ScoredHistoryMatch::also_do_hup_like_scoring_ = false;
int ScoredHistoryMatch::max_assigned_score_for_non_inlineable_matches_ = -1;
ScoredHistoryMatch::ScoredHistoryMatch()
: raw_score_(0),
can_inline_(false) {
Init();
}
ScoredHistoryMatch::ScoredHistoryMatch(
const URLRow& row,
const VisitInfoVector& visits,
const std::string& languages,
const base::string16& lower_string,
const String16Vector& terms,
const WordStarts& terms_to_word_starts_offsets,
const RowWordStarts& word_starts,
const base::Time now,
BookmarkService* bookmark_service)
: HistoryMatch(row, 0, false, false),
raw_score_(0),
can_inline_(false) {
Init();
GURL gurl = row.url();
if (!gurl.is_valid())
return;
base::string16 url = CleanUpUrlForMatching(gurl, languages);
base::string16 title = CleanUpTitleForMatching(row.title());
int term_num = 0;
for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
++iter, ++term_num) {
base::string16 term = *iter;
TermMatches url_term_matches = MatchTermInString(term, url, term_num);
TermMatches title_term_matches = MatchTermInString(term, title, term_num);
if (url_term_matches.empty() && title_term_matches.empty())
return;
url_matches_.insert(url_matches_.end(), url_term_matches.begin(),
url_term_matches.end());
title_matches_.insert(title_matches_.end(), title_term_matches.begin(),
title_term_matches.end());
}
url_matches_ = SortAndDeoverlapMatches(url_matches_);
title_matches_ = SortAndDeoverlapMatches(title_matches_);
const URLPrefix* best_inlineable_prefix =
(!url_matches_.empty() && (terms.size() == 1)) ?
URLPrefix::BestURLPrefix(base::UTF8ToUTF16(gurl.spec()), terms[0]) :
NULL;
can_inline_ = (best_inlineable_prefix != NULL) &&
!IsWhitespace(*(lower_string.rbegin()));
if (can_inline_) {
const URLPrefix* best_prefix = URLPrefix::BestURLPrefix(
base::UTF8ToUTF16(gurl.spec()), base::string16());
DCHECK(best_prefix != NULL);
const int num_components_in_best_prefix = best_prefix->num_components;
DCHECK(best_inlineable_prefix != NULL);
const int num_components_in_best_inlineable_prefix =
best_inlineable_prefix->num_components;
innermost_match = (num_components_in_best_inlineable_prefix ==
num_components_in_best_prefix);
}
const float topicality_score = GetTopicalityScore(
terms.size(), url, terms_to_word_starts_offsets, word_starts);
const float frecency_score = GetFrecency(
now, (bookmark_service && bookmark_service->IsBookmarked(gurl)), visits);
raw_score_ = GetFinalRelevancyScore(topicality_score, frecency_score);
raw_score_ =
(raw_score_ <= kint32max) ? static_cast<int>(raw_score_) : kint32max;
if (also_do_hup_like_scoring_ && can_inline_) {
const bool promote_to_inline = (row.typed_count() > 1) ||
(IsHostOnly() && (row.typed_count() == 1));
int hup_like_score = promote_to_inline ?
HistoryURLProvider::kScoreForBestInlineableResult :
HistoryURLProvider::kBaseScoreForNonInlineableResult;
if (base::UTF8ToUTF16(gurl.host()) == terms[0])
hup_like_score = HistoryURLProvider::kScoreForBestInlineableResult;
if (!promote_to_inline && IsHostOnly())
hup_like_score++;
raw_score_ = std::max(raw_score_, hup_like_score);
}
if (!can_inline_ && (max_assigned_score_for_non_inlineable_matches_ != -1)) {
raw_score_ = std::min(max_assigned_score_for_non_inlineable_matches_,
raw_score_);
}
}
ScoredHistoryMatch::~ScoredHistoryMatch() {}
bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1,
const ScoredHistoryMatch& m2) {
if (m1.raw_score_ != m2.raw_score_)
return m1.raw_score_ > m2.raw_score_;
if (!m1.url_info.typed_count() != !m2.url_info.typed_count())
return m1.url_info.typed_count() > m2.url_info.typed_count();
if (m1.innermost_match != m2.innermost_match)
return m1.innermost_match;
if (m1.url_info.typed_count() != m2.url_info.typed_count())
return m1.url_info.typed_count() > m2.url_info.typed_count();
if (m1.url_info.typed_count() == 1) {
if (m1.IsHostOnly() != m2.IsHostOnly())
return m1.IsHostOnly();
}
if (m1.url_info.visit_count() != m2.url_info.visit_count())
return m1.url_info.visit_count() > m2.url_info.visit_count();
return m1.url_info.last_visit() > m2.url_info.last_visit();
}
TermMatches ScoredHistoryMatch::FilterTermMatchesByWordStarts(
const TermMatches& term_matches,
const WordStarts& terms_to_word_starts_offsets,
const WordStarts& word_starts,
size_t start_pos,
size_t end_pos) {
if (start_pos == std::string::npos)
return term_matches;
TermMatches filtered_matches;
WordStarts::const_iterator next_word_starts = word_starts.begin();
WordStarts::const_iterator end_word_starts = word_starts.end();
for (TermMatches::const_iterator iter = term_matches.begin();
iter != term_matches.end(); ++iter) {
const size_t term_offset = terms_to_word_starts_offsets[iter->term_num];
while ((next_word_starts != end_word_starts) &&
(*next_word_starts < (iter->offset + term_offset)))
++next_word_starts;
if ((iter->offset < start_pos) ||
((end_pos != std::string::npos) && (iter->offset >= end_pos)) ||
((next_word_starts != end_word_starts) &&
(*next_word_starts == iter->offset + term_offset)))
filtered_matches.push_back(*iter);
}
return filtered_matches;
}
float ScoredHistoryMatch::GetTopicalityScore(
const int num_terms,
const base::string16& url,
const WordStarts& terms_to_word_starts_offsets,
const RowWordStarts& word_starts) {
DCHECK(!content::BrowserThread::IsThreadInitialized(
content::BrowserThread::UI) ||
content::BrowserThread::CurrentlyOn(content::BrowserThread::UI));
if (raw_term_score_to_topicality_score_ == NULL) {
raw_term_score_to_topicality_score_ = new float[kMaxRawTermScore];
FillInTermScoreToTopicalityScoreArray();
}
DCHECK_GT(num_terms, 0);
std::vector<int> term_scores(num_terms, 0);
WordStarts::const_iterator next_word_starts =
word_starts.url_word_starts_.begin();
WordStarts::const_iterator end_word_starts =
word_starts.url_word_starts_.end();
const size_t question_mark_pos = url.find('?');
const size_t colon_pos = url.find(':');
const size_t end_of_hostname_pos = (colon_pos != std::string::npos) ?
url.find('/', colon_pos + 3) : url.find('/');
size_t last_part_of_hostname_pos =
(end_of_hostname_pos != std::string::npos) ?
url.rfind('.', end_of_hostname_pos) : url.rfind('.');
url_matches_ = FilterTermMatchesByWordStarts(
url_matches_, terms_to_word_starts_offsets, word_starts.url_word_starts_,
end_of_hostname_pos,
std::string::npos);
if (colon_pos != std::string::npos) {
url_matches_ = FilterTermMatchesByWordStarts(
url_matches_, terms_to_word_starts_offsets,
word_starts.url_word_starts_, 0, colon_pos);
}
for (TermMatches::const_iterator iter = url_matches_.begin();
iter != url_matches_.end(); ++iter) {
const size_t term_offset = terms_to_word_starts_offsets[iter->term_num];
while ((next_word_starts != end_word_starts) &&
(*next_word_starts < (iter->offset + term_offset))) {
++next_word_starts;
}
const bool at_word_boundary = (next_word_starts != end_word_starts) &&
(*next_word_starts == iter->offset + term_offset);
if ((question_mark_pos != std::string::npos) &&
(iter->offset > question_mark_pos)) {
DCHECK(at_word_boundary);
term_scores[iter->term_num] += 5;
} else if ((end_of_hostname_pos != std::string::npos) &&
(iter->offset > end_of_hostname_pos)) {
DCHECK(at_word_boundary);
term_scores[iter->term_num] += 8;
} else if ((colon_pos == std::string::npos) ||
(iter->offset > colon_pos)) {
if ((last_part_of_hostname_pos == std::string::npos) ||
(iter->offset < last_part_of_hostname_pos)) {
term_scores[iter->term_num] += at_word_boundary ? 10 : 2;
} else {
if (allow_tld_matches_)
term_scores[iter->term_num] += at_word_boundary ? 10 : 0;
}
} else {
DCHECK(at_word_boundary);
match_in_scheme = true;
if (allow_scheme_matches_)
term_scores[iter->term_num] += 10;
}
}
next_word_starts = word_starts.title_word_starts_.begin();
end_word_starts = word_starts.title_word_starts_.end();
int word_num = 0;
title_matches_ = FilterTermMatchesByWordStarts(
title_matches_, terms_to_word_starts_offsets,
word_starts.title_word_starts_, 0, std::string::npos);
for (TermMatches::const_iterator iter = title_matches_.begin();
iter != title_matches_.end(); ++iter) {
const size_t term_offset = terms_to_word_starts_offsets[iter->term_num];
while ((next_word_starts != end_word_starts) &&
(*next_word_starts < (iter->offset + term_offset))) {
++next_word_starts;
++word_num;
}
if (word_num >= 10) break;
DCHECK(next_word_starts != end_word_starts);
DCHECK_EQ(*next_word_starts, iter->offset + term_offset)
<< "not at word boundary";
term_scores[iter->term_num] += 8;
}
float topicality_score = 0;
for (size_t i = 0; i < term_scores.size(); ++i) {
if (term_scores[i] == 0)
return 0;
topicality_score += raw_term_score_to_topicality_score_[
(term_scores[i] >= kMaxRawTermScore) ? (kMaxRawTermScore - 1) :
term_scores[i]];
}
return topicality_score / num_terms;
}
void ScoredHistoryMatch::FillInTermScoreToTopicalityScoreArray() {
for (int term_score = 0; term_score < kMaxRawTermScore; ++term_score) {
float topicality_score;
if (term_score < 10) {
topicality_score = 0.1 * term_score;
} else {
topicality_score = (1.0 + 2.25 * log10(0.1 * term_score));
}
raw_term_score_to_topicality_score_[term_score] = topicality_score;
}
}
float ScoredHistoryMatch::GetRecencyScore(int last_visit_days_ago) {
DCHECK(!content::BrowserThread::IsThreadInitialized(
content::BrowserThread::UI) ||
content::BrowserThread::CurrentlyOn(content::BrowserThread::UI));
if (days_ago_to_recency_score_ == NULL) {
days_ago_to_recency_score_ = new float[kDaysToPrecomputeRecencyScoresFor];
FillInDaysAgoToRecencyScoreArray();
}
return days_ago_to_recency_score_[
std::max(
std::min(last_visit_days_ago, kDaysToPrecomputeRecencyScoresFor - 1),
0)];
}
void ScoredHistoryMatch::FillInDaysAgoToRecencyScoreArray() {
for (int days_ago = 0; days_ago < kDaysToPrecomputeRecencyScoresFor;
days_ago++) {
int unnormalized_recency_score;
if (days_ago <= 4) {
unnormalized_recency_score = 100;
} else if (days_ago <= 14) {
unnormalized_recency_score = 70 + (14 - days_ago) * (100 - 70) / (14 - 4);
} else if (days_ago <= 31) {
unnormalized_recency_score = 50 + (31 - days_ago) * (70 - 50) / (31 - 14);
} else if (days_ago <= 90) {
unnormalized_recency_score = 30 + (90 - days_ago) * (50 - 30) / (90 - 30);
} else {
unnormalized_recency_score =
10 + (365 - days_ago) * (20 - 10) / (365 - 90);
}
days_ago_to_recency_score_[days_ago] = unnormalized_recency_score / 100.0;
if (days_ago > 0) {
DCHECK_LE(days_ago_to_recency_score_[days_ago],
days_ago_to_recency_score_[days_ago - 1]);
}
}
}
float ScoredHistoryMatch::GetFrecency(const base::Time& now,
const bool bookmarked,
const VisitInfoVector& visits) {
const int total_sampled_visits = std::min(visits.size(), kMaxVisitsToScore);
if (total_sampled_visits == 0)
return 0.0f;
float summed_visit_points = 0;
for (int i = 0; i < total_sampled_visits; ++i) {
int value_of_transition =
(visits[i].second == content::PAGE_TRANSITION_TYPED) ? 20 : 1;
if (bookmarked)
value_of_transition = std::max(value_of_transition, bookmark_value_);
const float bucket_weight =
GetRecencyScore((now - visits[i].first).InDays());
summed_visit_points += (value_of_transition * bucket_weight);
}
return visits.size() * summed_visit_points /
(discount_frecency_when_few_visits_ ?
kMaxVisitsToScore : total_sampled_visits);
}
float ScoredHistoryMatch::GetFinalRelevancyScore(float topicality_score,
float frecency_score) {
if (topicality_score == 0)
return 0;
const float intermediate_score = topicality_score * frecency_score;
if (intermediate_score <= 1) {
const float slope = (600 - 400) / (1.5f - 0.0f);
return 400 + slope * intermediate_score;
}
if (intermediate_score <= 12.0) {
const float slope = (1300 - 600) / (12.0f - 1.5f);
return 600 + slope * (intermediate_score - 1.5);
}
const float slope = (1399 - 1300) / (20.0f - 12.0f);
return std::min(1399.0, 1300 + slope * (intermediate_score - 12.0));
}
void ScoredHistoryMatch::Init() {
if (initialized_)
return;
also_do_hup_like_scoring_ = false;
if (also_do_hup_like_scoring_) {
max_assigned_score_for_non_inlineable_matches_ =
HistoryURLProvider::kScoreForBestInlineableResult - 1;
}
bookmark_value_ = OmniboxFieldTrial::HQPBookmarkValue();
discount_frecency_when_few_visits_ =
OmniboxFieldTrial::HQPDiscountFrecencyWhenFewVisits();
allow_tld_matches_ = OmniboxFieldTrial::HQPAllowMatchInTLDValue();
allow_scheme_matches_ = OmniboxFieldTrial::HQPAllowMatchInSchemeValue();
initialized_ = true;
}
}