This source file includes following definitions.
- Simplify
- AndOr
- And
- Or
- SimplifyStringSet
- OrStrings
- ToLowerRune
- ToLowerRuneLatin1
- FromString
- exact
- is_exact
- match_
- TakeMatch
- ToString
- CopyIn
- CrossProduct
- Concat
- And
- Alt
- Quest
- Star
- Plus
- RuneToString
- RuneToStringLatin1
- Literal
- LiteralLatin1
- AnyChar
- NoMatch
- AnyMatch
- EmptyString
- CClass
- Walker
- latin1
- BuildInfo
- ShortVisit
- PostVisit
- FromRegexp
- DebugString
- FromRE2
#include "util/util.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
static const int Trace = false;
typedef set<string>::iterator SSIter;
typedef set<string>::const_iterator ConstSSIter;
static int alloc_id = 100000;
Prefilter::Prefilter(Op op) {
op_ = op;
subs_ = NULL;
if (op_ == AND || op_ == OR)
subs_ = new vector<Prefilter*>;
alloc_id_ = alloc_id++;
VLOG(10) << "alloc_id: " << alloc_id_;
}
Prefilter::~Prefilter() {
VLOG(10) << "Deleted: " << alloc_id_;
if (subs_) {
for (int i = 0; i < subs_->size(); i++)
delete (*subs_)[i];
delete subs_;
subs_ = NULL;
}
}
Prefilter* Prefilter::Simplify() {
if (op_ != AND && op_ != OR) {
return this;
}
if (subs_->size() == 0) {
if (op_ == AND)
op_ = ALL;
else
op_ = NONE;
return this;
}
if (subs_->size() == 1) {
Prefilter* a = (*subs_)[0];
subs_->clear();
delete this;
return a->Simplify();
}
return this;
}
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
a = a->Simplify();
b = b->Simplify();
if (a->op() > b->op()) {
Prefilter* t = a;
a = b;
b = t;
}
if (a->op() == ALL || a->op() == NONE) {
if ((a->op() == ALL && op == AND) ||
(a->op() == NONE && op == OR)) {
delete a;
return b;
} else {
delete b;
return a;
}
}
if (a->op() == op && b->op() == op) {
for (int i = 0; i < b->subs()->size(); i++) {
Prefilter* bb = (*b->subs())[i];
a->subs()->push_back(bb);
}
b->subs()->clear();
delete b;
return a;
}
if (b->op() == op) {
Prefilter* t = a;
a = b;
b = t;
}
if (a->op() == op) {
a->subs()->push_back(b);
return a;
}
Prefilter* c = new Prefilter(op);
c->subs()->push_back(a);
c->subs()->push_back(b);
return c;
}
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
return AndOr(AND, a, b);
}
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
return AndOr(OR, a, b);
}
static void SimplifyStringSet(set<string> *ss) {
for (SSIter i = ss->begin(); i != ss->end(); ++i) {
SSIter j = i;
++j;
while (j != ss->end()) {
SSIter old_j = j;
++j;
if (old_j->find(*i) != string::npos)
ss->erase(old_j);
}
}
}
Prefilter* Prefilter::OrStrings(set<string>* ss) {
SimplifyStringSet(ss);
Prefilter* or_prefilter = NULL;
if (!ss->empty()) {
or_prefilter = new Prefilter(NONE);
for (SSIter i = ss->begin(); i != ss->end(); ++i)
or_prefilter = Or(or_prefilter, FromString(*i));
}
return or_prefilter;
}
static Rune ToLowerRune(Rune r) {
if (r < Runeself) {
if ('A' <= r && r <= 'Z')
r += 'a' - 'A';
return r;
}
CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
if (f == NULL || r < f->lo)
return r;
return ApplyFold(f, r);
}
static Rune ToLowerRuneLatin1(Rune r) {
if ('A' <= r && r <= 'Z')
r += 'a' - 'A';
return r;
}
Prefilter* Prefilter::FromString(const string& str) {
Prefilter* m = new Prefilter(Prefilter::ATOM);
m->atom_ = str;
return m;
}
class Prefilter::Info {
public:
Info();
~Info();
static Info* Alt(Info* a, Info* b);
static Info* Concat(Info* a, Info* b);
static Info* And(Info* a, Info* b);
static Info* Star(Info* a);
static Info* Plus(Info* a);
static Info* Quest(Info* a);
static Info* EmptyString();
static Info* NoMatch();
static Info* AnyChar();
static Info* CClass(CharClass* cc, bool latin1);
static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
static Info* AnyMatch();
string ToString();
Prefilter* TakeMatch();
set<string>& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
class Walker;
private:
set<string> exact_;
bool is_exact_;
Prefilter* match_;
};
Prefilter::Info::Info()
: is_exact_(false),
match_(NULL) {
}
Prefilter::Info::~Info() {
delete match_;
}
Prefilter* Prefilter::Info::TakeMatch() {
if (is_exact_) {
match_ = Prefilter::OrStrings(&exact_);
is_exact_ = false;
}
Prefilter* m = match_;
match_ = NULL;
return m;
}
string Prefilter::Info::ToString() {
if (this == NULL) {
return "";
}
if (is_exact_) {
int n = 0;
string s;
for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
if (n++ > 0)
s += ",";
s += *i;
}
return s;
}
if (match_)
return match_->DebugString();
return "";
}
static void CopyIn(const set<string>& src, set<string>* dst) {
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
dst->insert(*i);
}
static void CrossProduct(const set<string>& a,
const set<string>& b,
set<string>* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
}
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
if (a == NULL)
return b;
DCHECK(a->is_exact_);
DCHECK(b && b->is_exact_);
Info *ab = new Info();
CrossProduct(a->exact_, b->exact_, &ab->exact_);
ab->is_exact_ = true;
delete a;
delete b;
return ab;
}
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
if (a == NULL)
return b;
if (b == NULL)
return a;
Info *ab = new Info();
ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
ab->is_exact_ = false;
delete a;
delete b;
return ab;
}
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
Info *ab = new Info();
if (a->is_exact_ && b->is_exact_) {
CopyIn(a->exact_, &ab->exact_);
CopyIn(b->exact_, &ab->exact_);
ab->is_exact_ = true;
} else {
ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
ab->is_exact_ = false;
}
delete a;
delete b;
return ab;
}
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
Info *ab = new Info();
ab->is_exact_ = false;
ab->match_ = new Prefilter(ALL);
delete a;
return ab;
}
Prefilter::Info* Prefilter::Info::Star(Info *a) {
return Quest(a);
}
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
Info *ab = new Info();
ab->match_ = a->TakeMatch();
ab->is_exact_ = false;
delete a;
return ab;
}
static string RuneToString(Rune r) {
char buf[UTFmax];
int n = runetochar(buf, &r);
return string(buf, n);
}
static string RuneToStringLatin1(Rune r) {
char c = r & 0xff;
return string(&c, 1);
}
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
Info* info = new Info();
info->exact_.insert(RuneToString(ToLowerRune(r)));
info->is_exact_ = true;
return info;
}
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
Info* info = new Info();
info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
info->is_exact_ = true;
return info;
}
Prefilter::Info* Prefilter::Info::AnyChar() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
}
Prefilter::Info* Prefilter::Info::NoMatch() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(NONE);
return info;
}
Prefilter::Info* Prefilter::Info::AnyMatch() {
Prefilter::Info *info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
}
Prefilter::Info* Prefilter::Info::EmptyString() {
Prefilter::Info* info = new Prefilter::Info();
info->is_exact_ = true;
info->exact_.insert("");
return info;
}
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
bool latin1) {
if (Trace) {
VLOG(0) << "CharClassInfo:";
for (CCIter i = cc->begin(); i != cc->end(); ++i)
VLOG(0) << " " << i->lo << "-" << i->hi;
}
if (cc->size() > 10)
return AnyChar();
Prefilter::Info *a = new Prefilter::Info();
for (CCIter i = cc->begin(); i != cc->end(); ++i)
for (Rune r = i->lo; r <= i->hi; r++) {
if (latin1) {
a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
} else {
a->exact_.insert(RuneToString(ToLowerRune(r)));
}
}
a->is_exact_ = true;
if (Trace) {
VLOG(0) << " = " << a->ToString();
}
return a;
}
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
public:
Walker(bool latin1) : latin1_(latin1) {}
virtual Info* PostVisit(
Regexp* re, Info* parent_arg,
Info* pre_arg,
Info** child_args, int nchild_args);
virtual Info* ShortVisit(
Regexp* re,
Info* parent_arg);
bool latin1() { return latin1_; }
private:
bool latin1_;
DISALLOW_EVIL_CONSTRUCTORS(Walker);
};
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
if (Trace) {
LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
}
bool latin1 = re->parse_flags() & Regexp::Latin1;
Prefilter::Info::Walker w(latin1);
Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
if (w.stopped_early()) {
delete info;
return NULL;
}
return info;
}
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
Regexp* re, Prefilter::Info* parent_arg) {
return AnyMatch();
}
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
Regexp* re, Prefilter::Info* parent_arg,
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
int nchild_args) {
Prefilter::Info *info;
switch (re->op()) {
default:
case kRegexpRepeat:
LOG(DFATAL) << "Bad regexp op " << re->op();
info = EmptyString();
break;
case kRegexpNoMatch:
info = NoMatch();
break;
case kRegexpEmptyMatch:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpEndText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
info = EmptyString();
break;
case kRegexpLiteral:
if (latin1()) {
info = LiteralLatin1(re->rune());
}
else {
info = Literal(re->rune());
}
break;
case kRegexpLiteralString:
if (re->nrunes() == 0) {
info = NoMatch();
break;
}
if (latin1()) {
info = LiteralLatin1(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, LiteralLatin1(re->runes()[i]));
}
} else {
info = Literal(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, Literal(re->runes()[i]));
}
}
break;
case kRegexpConcat: {
info = NULL;
Info* exact = NULL;
for (int i = 0; i < nchild_args; i++) {
Info* ci = child_args[i];
if (!ci->is_exact() ||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
info = And(info, exact);
exact = NULL;
info = And(info, ci);
} else {
exact = Concat(exact, ci);
}
}
info = And(info, exact);
}
break;
case kRegexpAlternate:
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
VLOG(10) << "Alt: " << info->ToString();
break;
case kRegexpStar:
info = Star(child_args[0]);
break;
case kRegexpQuest:
info = Quest(child_args[0]);
break;
case kRegexpPlus:
info = Plus(child_args[0]);
break;
case kRegexpAnyChar:
info = AnyChar();
break;
case kRegexpCharClass:
info = CClass(re->cc(), latin1());
break;
case kRegexpCapture:
info = child_args[0];
break;
}
if (Trace) {
VLOG(0) << "BuildInfo " << re->ToString()
<< ": " << info->ToString();
}
return info;
}
Prefilter* Prefilter::FromRegexp(Regexp* re) {
if (re == NULL)
return NULL;
Regexp* simple = re->Simplify();
Prefilter::Info *info = BuildInfo(simple);
simple->Decref();
if (info == NULL)
return NULL;
Prefilter* m = info->TakeMatch();
delete info;
return m;
}
string Prefilter::DebugString() const {
if (this == NULL)
return "<nil>";
switch (op_) {
default:
LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
return StringPrintf("op%d", op_);
case NONE:
return "*no-matches*";
case ATOM:
return atom_;
case ALL:
return "";
case AND: {
string s = "";
for (int i = 0; i < subs_->size(); i++) {
if (i > 0)
s += " ";
s += (*subs_)[i]->DebugString();
}
return s;
}
case OR: {
string s = "(";
for (int i = 0; i < subs_->size(); i++) {
if (i > 0)
s += "|";
s += (*subs_)[i]->DebugString();
}
s += ")";
return s;
}
}
}
Prefilter* Prefilter::FromRE2(const RE2* re2) {
if (re2 == NULL)
return NULL;
Regexp* regexp = re2->Regexp();
if (regexp == NULL)
return NULL;
return FromRegexp(regexp);
}
}