root/core/XMLParser16.cpp
/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes the following definitions.
- skipWhiteSpace
- _condenseWhitespace
- getNext
- unescape
- truncateAtFirstNullChar
- m_pos
- parse
- nextAttribute
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is [Open Source Virtual Machine.].
*
* The Initial Developer of the Original Code is
* Adobe System Incorporated.
* Portions created by the Initial Developer are Copyright (C) 1993-2006
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Adobe AS3 Team
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "avmplus.h"
namespace avmplus
{
//
// XMLParser
//
bool XMLParser::skipWhiteSpace()
{
bool eof = atEnd();
while (!eof)
{
wchar ch = m_str[m_pos];
if (!String::isSpace(ch))
break;
m_pos++;
eof = atEnd();
}
return !eof;
}
// !!@ The original purpose of this helper is unclear; it has been rewritten
// to strip the leading and trailing white space of a text element, leaving
// interior white space untouched:
//     "  5 4 3  "  becomes  "5 4 3"
// This simulates the behaviour of the E4X XML parser.
//
// A string consisting only of white space yields an empty substring.
static Stringp _condenseWhitespace(Stringp text)
{
    StringIndexer chars(text);
    const int32_t length = text->length();

    // Index of the first character that is not white space (== length if none).
    int32_t first = 0;
    for (; first < length && String::isSpace(chars[first]); ++first)
    {
        // scanning only
    }

    // Index of the last character that is not white space; never moves
    // below 'first'.
    int32_t last = length - 1;
    for (; last > first && String::isSpace(chars[last]); --last)
    {
        // scanning only
    }

    return text->substring(first, last + 1);
}
// Scans the next token of the input, starting at m_pos, and describes it in
// 'tag'. 'tag' is reset on entry.
//
// Token kinds are tested in this order:
//   - text up to the next '<' (or end of input)  -> XMLTag::kTextNodeType
//   - "<?xml " ... "?>"                           -> XMLTag::kXMLDeclaration
//   - "<!DOCTYPE" ... matching ">"                -> XMLTag::kDocTypeDeclaration
//   - "<![CDATA[" ... "]]>"                       -> XMLTag::kCDataSection
//   - "<?" ... "?>"                               -> XMLTag::kProcessingInstruction
//   - "<!--" ... "-->"                            -> XMLTag::kComment
//   - any other '<'                               -> XMLTag::kElementType; the
//     name goes to tag.text, name/value pairs are appended to tag.attributes,
//     and tag.empty is set when the element is closed with "/>".
//
// Returns XMLParser::kNoError on success (m_pos is then just past the token),
// XMLParser::kEndOfDocument when no input remains, or one of the
// kUnterminated... / kMalformedElement codes. On an error return m_pos may
// have been partially advanced.
int XMLParser::getNext(XMLTag& tag)
{
    wchar ch;
    int32_t start, end;
    tag.reset();
    // If there's nothing left, exit.
    if (atEnd()) {
        return XMLParser::kEndOfDocument;
    }
    // R41
    // If the ignore whitespace flag is on, don't produce
    // all-whitespace text nodes.
    if (m_ignoreWhite)
    {
        if (!skipWhiteSpace())
            return XMLParser::kEndOfDocument;
    }
    // end R41
    // If it starts with <, it's an XML element.
    // If it doesn't, it must be a text element.
    start = m_pos;
    ch = m_str[m_pos];
    if (ch != '<')
    {
        // Treat it as text. Scan up to the next < or until EOF.
        m_pos = m_str->indexOfCharCode('<', m_pos + 1);
        if (m_pos < 0)
            m_pos = m_str->length();
        tag.text = unescape(start, m_pos, false);
        // Condense whitespace if desired
        if (m_ignoreWhite && m_condenseWhite)
            tag.text = _condenseWhitespace(tag.text);
        tag.nodeType = XMLTag::kTextNodeType;
        return XMLParser::kNoError;
    }
    // Is this a <?xml> declaration?
    start = m_pos;
    if (m_str->matchesLatin1("<?xml ", 6, start))
    {
        end = m_str->indexOfLatin1("?>", 2, start + 6);
        if (end >= 0)
        {
            // We have the end of the XML declaration
            // !!@ changed to not return <?...?> parts
            tag.text = m_str->substring(start + 2, end);
            m_pos = end + 2;
            tag.nodeType = XMLTag::kXMLDeclaration;
            return XMLParser::kNoError;
        }
        return XMLParser::kUnterminatedXMLDeclaration;
    }
    // Is this a <!DOCTYPE> declaration?
    // The literal is 9 characters long; comparing only 8 would also accept
    // things like "<!DOCTYPx".
    if (m_str->matchesLatin1("<!DOCTYPE", 9, start))
    {
        // Scan forward for '>', but check for embedded <>
        int32_t depth = 0;
        end = start + 1;
        // m_pos is kept equal to 'end' at the bottom of each iteration so
        // that atEnd() bounds the read of m_str[end].
        while (!atEnd())
        {
            ch = m_str[end++];
            if (ch == '<')
                depth++;
            else if (ch == '>')
            {
                if (!depth)
                {
                    // We've reached the end of the DOCTYPE.
                    tag.text = m_str->substring(start, end - 1);
                    tag.nodeType = XMLTag::kDocTypeDeclaration;
                    m_pos = end;
                    return XMLParser::kNoError;
                }
                depth--;
            }
            m_pos = end;
        }
        return XMLParser::kUnterminatedDocTypeDeclaration;
    }
    // Is this a CDATA section?
    if (m_str->matchesLatin1("<![CDATA[", 9, start))
    {
        start += 9;
        end = m_str->indexOfLatin1("]]>", 3, start);
        if (end >= 0)
        {
            // We have the end of the CDATA section.
            tag.text = m_str->substring(start, end);
            tag.nodeType = XMLTag::kCDataSection;
            m_pos = end + 3;
            return XMLParser::kNoError;
        }
        return XMLParser::kUnterminatedCDataSection;
    }
    // Is this a processing instruction?
    if (m_str->matchesLatin1("<?", 2, start))
    {
        // Scan forward for "?>"
        start += 2;
        end = m_str->indexOfLatin1("?>", 2, start);
        if (end >= 0)
        {
            // We have the end of the processing instruction.
            tag.text = m_str->substring(start, end);
            tag.nodeType = XMLTag::kProcessingInstruction;
            m_pos = end + 2;
            return XMLParser::kNoError;
        }
        return XMLParser::kUnterminatedProcessingInstruction;
    }
    // Advance past the "<"
    start = ++m_pos;
    // Is this a comment? Return a comment tag.
    if (m_str->matchesLatin1("!--", 3, start))
    {
        // Skip up to '-->'.
        start += 3;
        end = m_str->indexOfLatin1("-->", 3, start);
        if (end >= 0)
        {
            tag.text = m_str->substring(start, end);
            tag.nodeType = XMLTag::kComment;
            m_pos = end + 3;
            return XMLParser::kNoError;
        }
        // Got to the end of the buffer without finding the end of the comment.
        return XMLParser::kUnterminatedComment;
    }
    // Extract the tag name. Scan up to ">" or whitespace.
    start = m_pos;
    while (!atEnd())
    {
        ch = m_str[m_pos];
        if (ch == '>' || String::isSpace(ch))
            break;
        if (ch == '/' && (m_pos < m_str->length() - 1) && m_str[m_pos+1] == '>')
        {
            // Found close of an empty element.
            // Exit!
            break;
        }
        m_pos++;
    }
    if (atEnd() || m_pos == start)
        // Premature end, or empty tag name
        return XMLParser::kMalformedElement;
    tag.text = unescape(start, m_pos, true);
    tag.nodeType = XMLTag::kElementType;
    // Extract attributes.
    for (;;)
    {
        if (!skipWhiteSpace())
            // Premature end!
            return XMLParser::kMalformedElement;
        ch = m_str[m_pos];
        if (ch == '>')
            break;
        if (ch == '/' && (m_pos < m_str->length() - 1) && m_str[m_pos+1] == '>')
        {
            // Found close of an empty element.
            // Exit!
            tag.empty = true;
            ch = m_str[++m_pos];
            break;
        }
        // Extract the attribute name.
        start = m_pos;
        while (!String::isSpace(ch) && ch != '=' && ch != '>')
        {
            // Advance first and bounds-check before reading, so that the
            // character one past the end of the input is never indexed.
            m_pos++;
            if (atEnd())
                // Premature end!
                return XMLParser::kMalformedElement;
            ch = m_str[m_pos];
        }
        if (start == m_pos)
            // Empty attribute name?
            return XMLParser::kMalformedElement;
        Stringp attributeName = unescape(start, m_pos, true);
        if (!skipWhiteSpace())
            // No attribute value, error!
            return XMLParser::kMalformedElement;
        ch = m_str[m_pos++];
        if (ch != '=')
            // No attribute value, error!
            return XMLParser::kMalformedElement;
        if (!skipWhiteSpace())
            // No attribute value, error!
            return XMLParser::kMalformedElement;
        wchar delimiter = m_str[m_pos++];
        // Extract the attribute value.
        if (delimiter != '"' && delimiter != '\'')
            // Error; no opening quote for attribute value.
            return XMLParser::kMalformedElement;
        // Extract up to the next quote.
        start = m_pos;
        ch = 0;
        while (ch != delimiter)
        {
            if (atEnd() || ch == '<')
                // '<' is not permitted in an attribute value
                // Changed this from kMalformedElement to kUnterminatedAttributeValue for bug 117058(105422)
                return XMLParser::kUnterminatedAttributeValue;
            ch = m_str[m_pos++];
        }
        // m_pos is now one past the closing quote; exclude the quote itself.
        Stringp attributeValue = unescape(start, m_pos - 1, false);
        AvmAssert(attributeName->isInterned());
        // Attributes are stored as consecutive name, value entries.
        tag.attributes.add(attributeName);
        tag.attributes.add(attributeValue);
    }
    // Advance past the end > of this element.
    if (ch == '>')
        m_pos++;
    return XMLParser::kNoError;
}
// Returns the text of m_str in the half-open range [start, last) with
// "&...;" sequences decoded.
//
// Recognized sequences:
//   - "&#DDD;"   decimal character reference
//   - "&#xHHH;"  hexadecimal character reference (upper- or lower-case digits)
//   - "&name;"   where 'name' is 2 to 4 characters long and is registered in
//                core->xmlEntities
// Numeric references must not exceed 0xFFFF, because the result is appended
// as a single 16-bit character. Any sequence that is not recognized is
// copied to the output unchanged, and an '&' with no following ';' inside
// the range ends the scan with the remaining text copied as-is.
//
// start, last  bounds of the source range within m_str
// intern       when true the returned string is interned
Stringp XMLParser::unescape(int32_t start, int32_t last, bool intern)
{
    Stringp dest = core->kEmptyString;
    if (start == last)
        return dest;

    int32_t amp = m_str->indexOfCharCode('&', start, last);
    if (amp < 0)
    {
        // Nothing to decode: hand back the source range directly.
        return intern ?
            m_str->intern_substring(start, last) :
            m_str->substring(start, last);
    }

    // Index of the first source character that has not yet been copied
    // into 'dest'.
    int32_t copied = start;
    while (amp >= start && amp < last)
    {
        const int32_t nameStart = amp + 1;
        const int32_t semi = m_str->indexOfCharCode(';', nameStart, last);
        if (semi < 0)
            // &xxx without semicolon - we are done
            break;

        const int32_t len = semi - nameStart;
        bool ok = false;
        wchar decoded = 0;

        // an &xx; sequence is at least two characters
        if (len >= 2)
        {
            if (m_str[nameStart] == '#')
            {
                // Parse a &#DD; decimal sequence, or a &#xDD; hex sequence.
                int32_t pos = nameStart + 1;
                int base = 10;
                // The 'x' prefix only counts when at least one digit follows it.
                if (semi - pos >= 2 && m_str[pos] == 'x')
                {
                    base = 16;
                    pos++;
                }
                int32_t value = 0;
                ok = (pos < semi);
                for (; ok && pos < semi; pos++)
                {
                    const int32_t ch = m_str[pos];
                    // Map the character to its digit value; 'base' itself
                    // marks "not a digit". Explicit ranges keep punctuation
                    // between '9' and 'A' from being taken as hex digits.
                    int32_t digit = base;
                    if (ch >= '0' && ch <= '9')
                        digit = ch - '0';
                    else if (ch >= 'A' && ch <= 'F')
                        digit = ch - 'A' + 10;
                    else if (ch >= 'a' && ch <= 'f')
                        digit = ch - 'a' + 10;

                    if (digit < base)
                        value = (value * base) + digit;
                    else
                        ok = false;
                    // Checked after every digit, so 'value' cannot overflow.
                    if (value > 0xFFFF)
                        ok = false;
                }
                decoded = (wchar) value;
            }
            else if (len <= 4) // Our xmlEntities are only 4 characters or less
            {
                Atom entityAtom = m_str->intern_substring(nameStart, semi)->atom();
                Atom result = core->xmlEntities->get(entityAtom);
                if (result != undefinedAtom)
                {
                    AvmAssert(atomIsIntptr(result));
                    decoded = (wchar) atomGetIntptr(result);
                    ok = true;
                }
            }
        }

        if (ok)
        {
            // add the text between the last sequence and this sequence,
            // then the decoded character
            dest = dest->append(m_str->substring(copied, amp));
            // note: this code is allowed to construct a string
            // containing illegal UTF16 sequences!
            dest = dest->append16(&decoded, 1);
            copied = semi + 1;
            amp = m_str->indexOfCharCode('&', copied, last);
        }
        else
        {
            // Not a sequence we understand. Leave 'copied' where it is so
            // the text, including this '&', is emitted verbatim, and resume
            // the search just after this '&'.
            amp = m_str->indexOfCharCode('&', nameStart, last);
        }
    }

    // add any remaining text
    if (copied < last)
        dest = dest->append(m_str->substring(copied, last));
    if (intern)
        dest = core->internString(dest);
    return dest;
}
/*
This provides backwards-compatibility for a rather obscure case:
The old XMLParser considered any null terminator to end the parse,
regardless of actual string length. Some buggy SWFs take a random
ByteArray and try to convert it to XML. The old parser would stop parsing
at the first null, but the new one won't, and thus is much more likely to
throw an exception (since random binary rarely parses as XML), while the
old one would return quietly if there happened to be a null character
before the first '<' character. Rather than add null-char checking
back into the parser, let's just do a quick pre-check for a null char,
and if one is found, truncate the string there.
(Watson #2471228)
*/
// Returns 'in' cut off just before its first null character. A string with
// no null character is returned unchanged; a string that begins with one
// yields core->kEmptyString.
static Stringp truncateAtFirstNullChar(AvmCore* core, Stringp in)
{
    const int32_t nullIndex = in->indexOfCharCode(0);

    if (nullIndex < 0)
        return in;                  // no null character present

    if (nullIndex == 0)
        return core->kEmptyString;  // nothing precedes the null

    return in->substr(0, nullIndex);
}
// Creates a parser over 'str', truncated at its first null character, and
// positioned at the start of the input. The first parser created for a
// given core also builds core->xmlEntities, the table that maps entity
// names to their replacement character codes.
//
// NOTE(review): m_ignoreWhite and m_condenseWhite are not set here;
// confirm they are initialized elsewhere before getNext() reads them.
XMLParser::XMLParser(AvmCore* core, Stringp str) : m_str(truncateAtFirstNullChar(core, str)), m_pos (0)
{
    this->core = core;
    if (!core->xmlEntities)
    {
        // Lazy creation of the XML entities table.
        core->xmlEntities = new (core->GetGC()) HeapHashtable(core->GetGC());
        // Packed records: one replacement character, then the entity name,
        // then a terminating NUL. The literal's implicit final NUL ends the
        // list. The first record must spell out "amp"; an empty name there
        // would leave "&amp;" unregistered.
        const char *entities = "&amp\0\"quot\0'apos\0<lt\0>gt\0\xA0nbsp\0";
        while (*entities)
        {
            // Go through unsigned char: where plain char is signed, '\xA0'
            // is negative and would be stored as a negative code that later
            // truncates to 0xFFA0 instead of 0x00A0.
            core->xmlEntities->add(core->internConstantStringLatin1(entities+1)->atom(),
                core->intToAtom(static_cast<int>(static_cast<unsigned char>(*entities))));
            // Skip to the character following this record's NUL terminator.
            while (*entities++) {
                // do nothing
            }
        }
    }
}
// Prepares the parser for a fresh pass over its input: records the
// ignoreWhite flag that getNext() consults and rewinds the read position
// to the start of the string.
void XMLParser::parse(bool ignoreWhite /*=false*/ )
{
    m_ignoreWhite = ignoreWhite;
    m_pos = 0;
}
// Fetches the attribute stored at 'index' in the flat attribute list, where
// each attribute occupies two consecutive entries (name, then value).
//
// index  in/out cursor; advanced by two when an attribute is returned
// name   receives the attribute name
// value  receives the attribute value
//
// Returns false, leaving all arguments untouched, once 'index' is at or
// beyond the end of the list; true otherwise.
bool XMLTag::nextAttribute(uint32& index,
                           Stringp& name,
                           Stringp& value)
{
    const bool available = index < attributes.size();
    if (available)
    {
        name = attributes.get(index);
        value = attributes.get(index + 1);
        index += 2;
    }
    return available;
}
} // namespace