#ifndef HTMLTokenizer_h
#define HTMLTokenizer_h
#include "core/html/parser/HTMLParserOptions.h"
#include "core/html/parser/HTMLToken.h"
#include "core/html/parser/InputStreamPreprocessor.h"
#include "platform/text/SegmentedString.h"
namespace WebCore {
// Stateful tokenizer for HTML input. Callers feed it a SegmentedString through
// nextToken(), which fills in an HTMLToken; the current position in the state
// machine is tracked in m_state. Instances can only be obtained through
// create() because the constructor is private, and the class is non-copyable.
class HTMLTokenizer {
    WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
    WTF_MAKE_FAST_ALLOCATED;
public:
    // Allocates a tokenizer built from |options| and returns it wrapped in a
    // PassOwnPtr.
    static PassOwnPtr<HTMLTokenizer> create(const HTMLParserOptions& options)
    {
        return adoptPtr(new HTMLTokenizer(options));
    }

    ~HTMLTokenizer();

    void reset();

    // Every state the tokenizer can be in.
    // NOTE(review): the names appear to mirror the tokenization states of the
    // HTML parsing specification - confirm against the spec before editing.
    // The enumerators rely on implicit numbering, so their order fixes their
    // numeric values. DataState is first, which makes it the value a
    // value-initialized State (see Checkpoint) ends up with.
    enum State {
        DataState,
        CharacterReferenceInDataState,
        RCDATAState,
        CharacterReferenceInRCDATAState,
        RAWTEXTState,
        ScriptDataState,
        PLAINTEXTState,
        TagOpenState,
        EndTagOpenState,
        TagNameState,

        RCDATALessThanSignState,
        RCDATAEndTagOpenState,
        RCDATAEndTagNameState,

        RAWTEXTLessThanSignState,
        RAWTEXTEndTagOpenState,
        RAWTEXTEndTagNameState,

        ScriptDataLessThanSignState,
        ScriptDataEndTagOpenState,
        ScriptDataEndTagNameState,
        ScriptDataEscapeStartState,
        ScriptDataEscapeStartDashState,
        ScriptDataEscapedState,
        ScriptDataEscapedDashState,
        ScriptDataEscapedDashDashState,
        ScriptDataEscapedLessThanSignState,
        ScriptDataEscapedEndTagOpenState,
        ScriptDataEscapedEndTagNameState,
        ScriptDataDoubleEscapeStartState,
        ScriptDataDoubleEscapedState,
        ScriptDataDoubleEscapedDashState,
        ScriptDataDoubleEscapedDashDashState,
        ScriptDataDoubleEscapedLessThanSignState,
        ScriptDataDoubleEscapeEndState,

        BeforeAttributeNameState,
        AttributeNameState,
        AfterAttributeNameState,
        BeforeAttributeValueState,
        AttributeValueDoubleQuotedState,
        AttributeValueSingleQuotedState,
        AttributeValueUnquotedState,
        CharacterReferenceInAttributeValueState,
        AfterAttributeValueQuotedState,
        SelfClosingStartTagState,

        BogusCommentState,
        ContinueBogusCommentState,
        MarkupDeclarationOpenState,
        CommentStartState,
        CommentStartDashState,
        CommentState,
        CommentEndDashState,
        CommentEndState,
        CommentEndBangState,

        DOCTYPEState,
        BeforeDOCTYPENameState,
        DOCTYPENameState,
        AfterDOCTYPENameState,
        AfterDOCTYPEPublicKeywordState,
        BeforeDOCTYPEPublicIdentifierState,
        DOCTYPEPublicIdentifierDoubleQuotedState,
        DOCTYPEPublicIdentifierSingleQuotedState,
        AfterDOCTYPEPublicIdentifierState,
        BetweenDOCTYPEPublicAndSystemIdentifiersState,
        AfterDOCTYPESystemKeywordState,
        BeforeDOCTYPESystemIdentifierState,
        DOCTYPESystemIdentifierDoubleQuotedState,
        DOCTYPESystemIdentifierSingleQuotedState,
        AfterDOCTYPESystemIdentifierState,
        BogusDOCTYPEState,

        CDATASectionState,
        CDATASectionRightSquareBracketState,
        CDATASectionDoubleRightSquareBracketState,
    };

    // Value type passed to createCheckpoint() and restoreFromCheckpoint().
    // A default-constructed Checkpoint has options built from 0, a
    // value-initialized state, a NUL additional allowed character and both
    // flags cleared.
    struct Checkpoint {
        HTMLParserOptions options;
        State state;
        UChar additionalAllowedCharacter;
        bool skipNextNewLine;
        bool shouldAllowCDATA;

        Checkpoint()
            : options(0)
            , state()
            , additionalAllowedCharacter('\0')
            , skipNextNewLine(false)
            , shouldAllowCDATA(false)
        {
        }
    };

    bool canCreateCheckpoint() const;
    void createCheckpoint(Checkpoint&) const;
    void restoreFromCheckpoint(const Checkpoint&);

    bool nextToken(SegmentedString&, HTMLToken&);

    String bufferedCharacters() const;

    // Returns 0 when the temporary buffer is empty, otherwise its size plus 2.
    // NOTE(review): the extra 2 presumably stands for characters consumed by
    // the state machine but not stored in m_temporaryBuffer (likely "</") -
    // confirm against bufferedCharacters().
    size_t numberOfBufferedCharacters() const
    {
        const size_t temporaryLength = m_temporaryBuffer.size();
        if (!temporaryLength)
            return 0;
        return temporaryLength + 2;
    }

    void updateStateFor(const String& tagName);

    // Plain accessors for the tokenizer's flags and current state.
    bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
    void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }

    bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
    void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }

    State state() const { return m_state; }
    void setState(State state) { m_state = state; }

    // True only when null-character replacement is not forced and the current
    // state is one of DataState, RCDATAState or RAWTEXTState.
    inline bool shouldSkipNullCharacters() const
    {
        if (m_forceNullCharacterReplacement)
            return false;

        switch (m_state) {
        case HTMLTokenizer::DataState:
        case HTMLTokenizer::RCDATAState:
        case HTMLTokenizer::RAWTEXTState:
            return true;
        default:
            return false;
        }
    }

private:
    explicit HTMLTokenizer(const HTMLParserOptions&);

    inline bool processEntity(SegmentedString&);

    inline void parseError();

    // Appends |character| to the current token after making sure the token is
    // a character token. The end-of-file marker must never be buffered.
    inline void bufferCharacter(UChar character)
    {
        ASSERT(character != kEndOfFileMarker);
        m_token->ensureIsCharacterToken();
        m_token->appendToCharacter(character);
    }

    // Same as emitAndReconsumeIn(), but additionally advances |source| past
    // the current character once the new state has been recorded.
    // Always returns true.
    inline bool emitAndResumeIn(SegmentedString& source, State state)
    {
        const bool emitted = emitAndReconsumeIn(source, state);
        source.advanceAndUpdateLineNumber();
        return emitted;
    }

    // Records the end tag name if the current token is a start tag, then
    // switches to |state| without consuming any input. Always returns true.
    // NOTE(review): the true result presumably tells nextToken() that a token
    // is ready - confirm in the implementation file.
    inline bool emitAndReconsumeIn(SegmentedString&, State state)
    {
        saveEndTagNameIfNeeded();
        m_state = state;
        return true;
    }

    // If a character token is currently being built, returns immediately and
    // leaves the state, |source| and the token untouched. Otherwise moves to
    // DataState, advances |source|, clears the token and turns it into an
    // end-of-file token. Returns true on both paths.
    inline bool emitEndOfFile(SegmentedString& source)
    {
        if (!haveBufferedCharacterToken()) {
            m_state = HTMLTokenizer::DataState;
            source.advanceAndUpdateLineNumber();
            m_token->clear();
            m_token->makeEndOfFile();
        }
        return true;
    }

    inline bool flushEmitAndResumeIn(SegmentedString&, State);

    inline bool flushBufferedEndTag(SegmentedString&);
    inline bool temporaryBufferIs(const String&);

    inline void addToPossibleEndTag(LChar cc);

    // Copies the current token's name into m_appropriateEndTagName, but only
    // when that token is a start tag. The token must already be initialized.
    inline void saveEndTagNameIfNeeded()
    {
        ASSERT(m_token->type() != HTMLToken::Uninitialized);
        if (m_token->type() != HTMLToken::StartTag)
            return;
        m_appropriateEndTagName = m_token->name();
    }

    inline bool isAppropriateEndTag();

    // True while the current token is of type Character.
    inline bool haveBufferedCharacterToken()
    {
        return m_token->type() == HTMLToken::Character;
    }

    // Current position in the state machine.
    State m_state;
    bool m_forceNullCharacterReplacement;
    bool m_shouldAllowCDATA;

    // Token currently being filled in; held as a raw pointer.
    HTMLToken* m_token;

    UChar m_additionalAllowedCharacter;

    InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;

    // Name of the most recent start tag, written by saveEndTagNameIfNeeded().
    Vector<UChar, 32> m_appropriateEndTagName;

    // Scratch buffer whose size feeds numberOfBufferedCharacters().
    Vector<LChar, 32> m_temporaryBuffer;

    Vector<LChar, 32> m_bufferedEndTagName;

    HTMLParserOptions m_options;
};
}
#endif