This source file includes the following definitions.
- FreeThread
- AllocThread
- CopyCapture
- AddToThreadq
- Step
- FormatCapture
- StringPieceContains
- Search
- ComputeFirstByte
- SearchNFA
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
namespace re2 {
// Simulates a compiled Prog as a nondeterministic finite automaton,
// keeping one set of capture pointers per live thread so that submatch
// boundaries can be reported.
class NFA {
public:
// Builds a matcher for prog.  The pointer is stored, not copied.
NFA(Prog* prog);
~NFA();
// Searches for a match in text, which must lie inside context (a context
// with a NULL begin() means "same as text").  If anchored is true, a match
// may only begin at text.begin().  If longest is true, the leftmost-longest
// match is reported; otherwise the first match found in thread priority
// order is reported.  On success returns true and fills
// submatch[0..nsubmatch) with the capture boundaries.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
// When nonzero, the search writes a trace to stderr.
static const int Debug = 0;
private:
// One execution thread of the simulation.
struct Thread {
union {
// Instruction id, valid while the thread sits in a Threadq.
int id;
// Link to the next entry, valid while the thread is on the free list.
Thread* next;
};
// Capture pointers for this thread; allocated with ncapture_ entries.
const char** capture;
};
// Entry on the explicit work stack used by AddToThreadq.
struct AddState {
// Instruction to explore; 0 means "nothing to explore".
int id;
// If j >= 0, popping this entry restores capture[j] to cap_j.
int j;
const char* cap_j;
AddState()
: id(0), j(-1), cap_j(NULL) {}
explicit AddState(int id)
: id(id), j(-1), cap_j(NULL) {}
AddState(int id, const char* cap_j, int j)
: id(id), j(j), cap_j(cap_j) {}
};
// Run queue: maps instruction id to its thread (NULL for ids that were
// visited but hold no thread).
typedef SparseArray<Thread*> Threadq;
// Thread allocation is backed by the free_threads_ list.
inline Thread* AllocThread();
inline void FreeThread(Thread*);
// Adds id and everything reachable from it without consuming input to q.
void AddToThreadq(Threadq* q, int id, int flag,
const char* p, const char** capture);
// Advances every thread in runq over byte c, filling nextq.
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
// Renders a capture array as text for the Debug trace.
string FormatCapture(const char** capture);
// Copies ncapture_ capture pointers from src to dst.
inline void CopyCapture(const char** dst, const char** src);
// Returns the single byte every match must start with, or -1 if none.
int ComputeFirstByte();
Prog* prog_;              // program being run
int start_;               // start instruction; 0 means the program cannot match
int ncapture_;            // number of capture pointers tracked per thread
bool longest_;            // leftmost-longest mode for the current search
bool endmatch_;           // a match must end exactly at etext_
const char* btext_;       // start of context; used only for Debug offsets
const char* etext_;       // required match end when endmatch_ is set
Threadq q0_, q1_;         // the two queues Search alternates between
const char** match_;      // best match found so far (ncapture_ entries)
bool matched_;            // whether match_ holds a match
AddState* astack_;        // preallocated work stack for AddToThreadq
int nastack_;             // capacity of astack_ (2 * program size)
int first_byte_;          // result of ComputeFirstByte()
Thread* free_threads_;    // singly linked list of recycled threads
// Presumably disables copy construction and assignment (macro is defined
// outside this file).
DISALLOW_EVIL_CONSTRUCTORS(NFA);
};
// Prepares a matcher for prog: sizes both run queues and the AddToThreadq
// work stack from the program size, and precomputes the required first byte.
// Per-search state starts out empty and is filled in by Search().
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      match_(NULL),
      matched_(false),
      astack_(NULL),
      nastack_(0),
      first_byte_(-1),
      free_threads_(NULL) {
  const int ninst = prog_->size();
  q0_.resize(ninst);
  q1_.resize(ninst);

  // Two stack slots per instruction: each instruction is expanded at most
  // once and pushes at most two entries.
  nastack_ = 2 * ninst;
  astack_ = new AddState[nastack_];

  // Needs prog_ and start_, both already set above.
  first_byte_ = ComputeFirstByte();
}
// Releases the match buffer, the work stack, and every thread parked on the
// free list together with its capture array.
NFA::~NFA() {
  delete[] match_;
  delete[] astack_;

  Thread* t = free_threads_;
  while (t != NULL) {
    // Grab the link first; it lives inside the object about to be deleted.
    Thread* following = t->next;
    delete[] t->capture;
    delete t;
    t = following;
  }
}
// Returns t to the free list for reuse by AllocThread.  A NULL t is ignored.
// The thread's capture array is kept attached to it.
void NFA::FreeThread(Thread *t) {
  if (t != NULL) {
    t->next = free_threads_;
    free_threads_ = t;
  }
}
// Hands out a thread, reusing one from the free list when available and
// otherwise allocating a new one with room for ncapture_ capture pointers.
// The capture contents are not initialized; callers fill them in.
NFA::Thread* NFA::AllocThread() {
  if (free_threads_ != NULL) {
    Thread* recycled = free_threads_;
    free_threads_ = recycled->next;
    return recycled;
  }

  Thread* fresh = new Thread;
  fresh->capture = new const char*[ncapture_];
  return fresh;
}
// Copies the capture pointers from src into dst, one begin/end pair at a
// time, covering indices below ncapture_.
void NFA::CopyCapture(const char** dst, const char** src) {
  int i = 0;
  while (i < ncapture_) {
    dst[i] = src[i];
    dst[i+1] = src[i+1];
    i += 2;
  }
}
// Adds to q the instruction id0 and every instruction reachable from it
// without consuming input.  p is the current input position, recorded into
// capture slots by kInstCapture.  flag is the set of empty-width conditions
// that hold at p.  capture holds the capture pointers of the thread being
// expanded; it is modified during the walk and restored by the time the
// corresponding alternatives have been explored.  An id0 of 0 is a no-op.
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
const char* p, const char** capture) {
if (id0 == 0)
return;
// Explicit stack instead of recursion.  astack_ was sized by the
// constructor to 2*prog_->size(): each instruction is expanded at most
// once (see the has_index check) and pushes at most two entries.
int nstk = 0;
AddState* stk = astack_;
stk[nstk++] = AddState(id0);
while (nstk > 0) {
DCHECK_LE(nstk, nastack_);
const AddState& a = stk[--nstk];
// A restore entry: put back the capture pointer saved by kInstCapture.
if (a.j >= 0)
capture[a.j] = a.cap_j;
// Copy the id out now; the slot `a` refers to may be overwritten by the
// pushes below.
int id = a.id;
if (id == 0)
continue;
// Already visited in this queue: nothing more to do.
if (q->has_index(id)) {
if (Debug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
continue;
}
// Always create the entry, even if no thread is stored in it, so that the
// instruction is not expanded a second time.
q->set_new(id, NULL);
Thread** tp = &q->find(id)->second;
int j;
Thread* t;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
break;
case kInstFail:
break;
case kInstAltMatch:
// Save a thread for Step to examine, then explore both branches.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
// Intentional fall through into kInstAlt.
case kInstAlt:
// out() is pushed last so that it is explored first.
stk[nstk++] = AddState(ip->out1());
stk[nstk++] = AddState(ip->out());
break;
case kInstNop:
stk[nstk++] = AddState(ip->out());
break;
case kInstCapture:
// Capture slots at or beyond ncapture_ are not tracked.
if ((j=ip->cap()) < ncapture_) {
// Push the restore entry first so it is popped only after everything
// reachable through out() has been explored.
stk[nstk++] = AddState(0, capture[j], j);
capture[j] = p;
}
stk[nstk++] = AddState(ip->out());
break;
case kInstMatch:
case kInstByteRange:
// These need input (or end the match): save a thread for Step.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
if (Debug)
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
break;
case kInstEmptyWidth:
// Proceed only if every condition the instruction requires is in flag.
if (ip->empty() & ~flag)
break;
stk[nstk++] = AddState(ip->out());
break;
}
}
}
// Runs every thread in runq over the byte c, adding the surviving threads to
// nextq (which is cleared first).  p is the position of c; flag is the set
// of empty-width conditions that hold just after c (at p+1) and is forwarded
// to AddToThreadq.  Every thread in runq is released and runq is left empty.
// Returns 0 normally.  Returns a nonzero instruction id when a kInstAltMatch
// at the head of runq decides the match; the caller continues from that
// instruction.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
nextq->clear();
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
// Entries without a thread are only visit markers left by AddToThreadq.
if (t == NULL)
continue;
if (longest_) {
// A thread that started to the right of the current best match cannot
// produce a better leftmost-longest match.
if (matched_ && match_[0] < t->capture[0]) {
FreeThread(t);
continue;
}
}
int id = t->id;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
break;
case kInstByteRange:
if (ip->Matches(c))
AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
break;
case kInstAltMatch:
// Only honored when it is the very first entry of runq.
if (i != runq->begin())
break;
if (ip->greedy(prog_) || longest_) {
// Take the match: record the captures, release every remaining thread,
// and tell the caller which branch to finish from.
CopyCapture((const char**)match_, t->capture);
FreeThread(t);
for (++i; i != runq->end(); ++i)
FreeThread(i->second);
runq->clear();
matched_ = true;
if (ip->greedy(prog_))
return ip->out1();
return ip->out();
}
break;
case kInstMatch:
// When endmatch_ is set, only a match ending at etext_ counts.
if (endmatch_ && p != etext_)
break;
const char* old = t->capture[1];
t->capture[1] = p;
if (longest_) {
// Keep this match only if it starts further left than the best so far,
// or starts at the same place and ends further right.
if (!matched_ || t->capture[0] < match_[0] ||
(t->capture[0] == match_[0] && t->capture[1] > match_[1]))
CopyCapture((const char**)match_, t->capture);
} else {
// First-match mode: this match wins.  Threads later in runq are
// dropped; threads already placed in nextq are left alone.
CopyCapture((const char**)match_, t->capture);
// NOTE(review): `old` is the saved capture[1] but is stored into
// capture[0].  The thread is released right after and its captures are
// overwritten on reuse, so the store has no visible effect here.
t->capture[0] = old;
FreeThread(t);
for (++i; i != runq->end(); ++i)
FreeThread(i->second);
runq->clear();
matched_ = true;
return 0;
}
// NOTE(review): same capture[0]/capture[1] index mismatch as above; also
// without visible effect because the thread is released below.
t->capture[0] = old;
matched_ = true;
break;
}
FreeThread(t);
}
runq->clear();
return 0;
}
// Renders capture as a sequence of "(begin,end)" pairs for the Debug trace.
// Positions are printed as offsets from btext_; an unset position prints as
// "?", and an unset begin prints the whole pair as "(?,?)".
string NFA::FormatCapture(const char** capture) {
  string out;
  for (int k = 0; k < ncapture_; k += 2) {
    const char* begin = capture[k];
    if (begin == NULL) {
      StringAppendF(&out, "(?,?)");
      continue;
    }

    const char* end = capture[k+1];
    if (end == NULL) {
      StringAppendF(&out, "(%d,?)", static_cast<int>(begin - btext_));
      continue;
    }

    StringAppendF(&out, "(%d,%d)",
                  static_cast<int>(begin - btext_),
                  static_cast<int>(end - btext_));
  }
  return out;
}
// Reports whether the bytes referred to by needle lie entirely within the
// range referred to by haystack.  This compares addresses, not contents.
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
  if (needle.begin() < haystack.begin())
    return false;
  return needle.end() <= haystack.end();
}
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
if (start_ == 0)
return false;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
if (!StringPieceContains(context, text)) {
LOG(FATAL) << "Bad args: context does not contain text "
<< reinterpret_cast<const void*>(context.begin())
<< "+" << context.size() << " "
<< reinterpret_cast<const void*>(text.begin())
<< "+" << text.size();
return false;
}
if (prog_->anchor_start() && context.begin() != text.begin())
return false;
if (prog_->anchor_end() && context.end() != text.end())
return false;
anchored |= prog_->anchor_start();
if (prog_->anchor_end()) {
longest = true;
endmatch_ = true;
etext_ = text.end();
}
if (nsubmatch < 0) {
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
return false;
}
ncapture_ = 2*nsubmatch;
longest_ = longest;
if (nsubmatch == 0) {
ncapture_ = 2;
}
match_ = new const char*[ncapture_];
matched_ = false;
memset(match_, 0, ncapture_*sizeof match_[0]);
btext_ = context.begin();
if (Debug) {
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
text.as_string().c_str(), context.as_string().c_str(), anchored,
longest);
}
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
const char* bp = context.begin();
int c = -1;
int wasword = 0;
if (text.begin() > context.begin()) {
c = text.begin()[-1] & 0xFF;
wasword = Prog::IsWordChar(c);
}
for (const char* p = text.begin();; p++) {
int flag = 0;
if (p == context.begin())
flag |= kEmptyBeginText | kEmptyBeginLine;
else if (p <= context.end() && p[-1] == '\n')
flag |= kEmptyBeginLine;
if (p == context.end())
flag |= kEmptyEndText | kEmptyEndLine;
else if (p < context.end() && p[0] == '\n')
flag |= kEmptyEndLine;
int isword = 0;
if (p < context.end())
isword = Prog::IsWordChar(p[0] & 0xFF);
if (isword != wasword)
flag |= kEmptyWordBoundary;
else
flag |= kEmptyNonWordBoundary;
if (Debug) {
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
fprintf(stderr, " %d%s", t->id,
FormatCapture((const char**)t->capture).c_str());
}
fprintf(stderr, "\n");
}
int id = Step(runq, nextq, c, flag, p-1);
DCHECK_EQ(runq->size(), 0);
swap(nextq, runq);
nextq->clear();
if (id != 0) {
p = text.end();
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
break;
case kInstCapture:
match_[ip->cap()] = p;
id = ip->out();
continue;
case kInstNop:
id = ip->out();
continue;
case kInstMatch:
match_[1] = p;
matched_ = true;
break;
case kInstEmptyWidth:
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
break;
}
id = ip->out();
continue;
}
break;
}
break;
}
if (p > text.end())
break;
if (!matched_ && (!anchored || p == text.begin())) {
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
p < text.end() && (p[0] & 0xFF) != first_byte_) {
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
text.end() - p));
if (p == NULL) {
p = text.end();
isword = 0;
} else {
isword = Prog::IsWordChar(p[0] & 0xFF);
}
flag = Prog::EmptyFlags(context, p);
}
match_[0] = p;
AddToThreadq(runq, start_, flag, p, match_);
match_[0] = NULL;
}
if (runq->size() == 0) {
if (Debug)
fprintf(stderr, "dead\n");
break;
}
if (p == text.end())
c = 0;
else
c = *p & 0xFF;
wasword = isword;
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
FreeThread(i->second);
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
if (Debug)
fprintf(stderr, "match (%d,%d)\n",
static_cast<int>(match_[0] - btext_),
static_cast<int>(match_[1] - btext_));
return true;
}
VLOG(1) << "No matches found";
return false;
}
int NFA::ComputeFirstByte() {
if (start_ == 0)
return -1;
int b = -1;
typedef SparseSet Workq;
Workq q(prog_->size());
q.insert(start_);
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
return -1;
case kInstByteRange:
if (ip->lo() != ip->hi())
return -1;
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
return -1;
if (b == -1)
b = ip->lo();
else if (b != ip->lo())
return -1;
break;
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
if (ip->out())
q.insert(ip->out());
break;
case kInstAlt:
case kInstAltMatch:
if (ip->out())
q.insert(ip->out());
if (ip->out1())
q.insert(ip->out1());
break;
case kInstFail:
break;
}
}
return b;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (NFA::Debug)
Dump();
NFA nfa(this);
StringPiece sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
match = &sp;
nmatch = 1;
}
}
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
}