root/core/UnicodeUtils.cpp
/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes the following definitions.
- Utf16ToUtf8
- Utf8ToUtf16
- Utf8ToUcs4
- Ucs4ToUtf8
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is [Open Source Virtual Machine.].
*
* The Initial Developer of the Original Code is
* Adobe System Incorporated.
* Portions created by the Initial Developer are Copyright (C) 2004-2006
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Adobe AS3 Team
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "avmplus.h"
namespace avmplus
{
/**
 * Table of Unicode characters that can be represented in ECMAScript
 * (ECMA-262 Section 15.1.3), and their UTF-8 transformation.
 *
 *  Code Point Value   Representation        1st Octet  2nd Octet  3rd Octet  4th Octet
 *
 *  0x0000 - 0x007F    00000000 0zzzzzzz     0zzzzzzz
 *  0x0080 - 0x07FF    00000yyy yyzzzzzz     110yyyyy   10zzzzzz
 *  0x0800 - 0xD7FF    xxxxyyyy yyzzzzzz     1110xxxx   10yyyyyy   10zzzzzz
 *
 *  0xD800 - 0xDBFF    110110vv vvwwwwxx     11110uuu   10uuwwww   10xxyyyy   10zzzzzz
 *    followed by        followed by
 *  0xDC00 - 0xDFFF    110111yy yyzzzzzz
 *                     (where uuuuu = vvvv + 1, which accounts for the
 *                      0x10000 added when combining the surrogate pair)
 *
 *  0xD800 - 0xDBFF
 *    not followed by                        causes URIError
 *  0xDC00 - 0xDFFF
 *
 *  0xDC00 - 0xDFFF                          causes URIError
 *
 *  0xE000 - 0xFFFF    xxxxyyyy yyzzzzzz     1110xxxx   10yyyyyy   10zzzzzz
 */
int UnicodeUtils::Utf16ToUtf8(const wchar *in,
int inLen,
uint8 *out,
int outMax)
{
int outLen = 0;
if (out)
{
// Output buffer passed in; actually encode data.
while (inLen > 0)
{
wchar ch = *in;
inLen--;
if (ch < 0x80) {
if (--outMax < 0) {
return -1;
}
*out++ = (uint8)ch;
outLen++;
}
else if (ch < 0x800) {
if ((outMax -= 2) < 0) {
return -1;
}
*out++ = (uint8)(0xC0 | ((ch>>6)&0x1F));
*out++ = (uint8)(0x80 | (ch&0x3F));
outLen += 2;
}
else if (ch >= 0xD800 && ch <= 0xDBFF) {
if (--inLen < 0) {
return -1;
}
wchar ch2 = *++in;
if (ch2 < 0xDC00 || ch2 > 0xDFFF) {
// This is an invalid UTF-16 surrogate pair sequence
// Encode the replacement character instead
ch = 0xFFFD;
goto Encode3;
}
uint32 ucs4 = ((ch-0xD800)<<10) + (ch2-0xDC00) + 0x10000;
if ((outMax -= 4) < 0) {
return -1;
}
*out++ = (uint8)(0xF0 | ((ucs4>>18)&0x07));
*out++ = (uint8)(0x80 | ((ucs4>>12)&0x3F));
*out++ = (uint8)(0x80 | ((ucs4>>6)&0x3F));
*out++ = (uint8)(0x80 | (ucs4&0x3F));
outLen += 4;
}
else {
if (ch >= 0xDC00 && ch <= 0xDFFF) {
// This is an invalid UTF-16 surrogate pair sequence
// Encode the replacement character instead
ch = 0xFFFD;
}
Encode3:
if ((outMax -= 3) < 0) {
return -1;
}
*out++ = (uint8)(0xE0 | ((ch>>12)&0x0F));
*out++ = (uint8)(0x80 | ((ch>>6)&0x3F));
*out++ = (uint8)(0x80 | (ch&0x3F));
outLen += 3;
}
in++;
}
}
else
{
// Count output characters without actually encoding.
while (inLen > 0)
{
wchar ch = *in;
inLen--;
if (ch < 0x80) {
outLen++;
}
else if (ch < 0x800) {
outLen += 2;
}
else if (ch >= 0xD800 && ch <= 0xDBFF) {
if (--inLen < 0) {
return -1;
}
wchar ch2 = *++in;
if (ch2 < 0xDC00 || ch2 > 0xDFFF) {
// Invalid...
// We'll encode 0xFFFD for this
outLen += 3;
} else {
outLen += 4;
}
}
else {
outLen += 3;
}
in++;
}
}
return outLen;
}
/**
 * Converts UTF-8 bytes to UTF-16 code units.
 *
 * When `out` is null nothing is written and `outMax` is ignored; the
 * function only counts the code units the conversion would produce.
 *
 * Malformed input (truncated sequence, bad continuation byte, overlong
 * encoding, stray continuation byte as lead) is handled according to
 * `strict`:
 *   - strict:     the function returns -1 at the first malformed sequence.
 *   - non-strict: the offending lead byte is copied through as a code unit
 *                 carrying the byte's value, and decoding resumes at the
 *                 next byte.
 *
 * Three-byte sequences that decode into the surrogate range are passed
 * through unchanged in both modes.
 *
 * @param in     UTF-8 bytes to decode
 * @param inLen  number of bytes available at `in`
 * @param out    destination buffer, or null to measure only
 * @param outMax capacity of `out` in UTF-16 code units
 * @param strict reject malformed input instead of passing bytes through
 * @return number of UTF-16 code units produced (or required); -1 if `out`
 *         is too small, or if `strict` is set and the input is malformed.
 *         Units written before the failure are left in `out`.
 */
int32_t UnicodeUtils::Utf8ToUtf16(const uint8 *in,
                                  int32_t inLen,
                                  wchar *out,
                                  int32_t outMax,
                                  bool strict)
{
    int32_t outLen = 0;
    uint32_t outch;
    while (inLen > 0)
    {
        uint32_t c = uint32_t (*in);
        switch (c >> 4)
        {
        case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
            // 0xxx xxxx
            // Plain ASCII: handled by the single-unit copy after the switch.
            break;

        case 12: case 13:
            // 110xxxxx 10xxxxxx
            if (inLen < 2) {
                // Truncated sequence
                goto invalid;
            }
            if ((in[1]&0xC0) != 0x80) {
                // Second byte is not a continuation byte
                goto invalid;
            }
            outch = ((c<<6 & 0x7C0) | (in[1] & 0x3F));
            if (outch < 0x80) {
                // Overlong sequence, reject as invalid.
                goto invalid;
            }
            in += 2;
            inLen -= 2;
            if (out) {
                if (--outMax < 0) {
                    return -1;
                }
                *out++ = (wchar)(outch);
            }
            outLen++;
            continue;

        case 14:
            // 1110xxxx 10xxxxxx 10xxxxxx
            if (inLen < 3) {
                // Truncated sequence
                goto invalid;
            }
            if ((in[1]&0xC0) != 0x80 || (in[2]&0xC0) != 0x80) {
                // A trailing byte is not a continuation byte
                goto invalid;
            }
            outch = ((c<<12 & 0xF000) | (in[1]<<6 & 0xFC0) | (in[2] & 0x3F));
            if (outch < 0x800) {
                // Overlong sequence, reject as invalid.
                goto invalid;
            }
            in += 3;
            inLen -= 3;
            if (out) {
                if (--outMax < 0) {
                    return -1;
                }
                *out++ = (wchar)(outch);
            }
            outLen++;
            continue;

        case 15:
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // 111110xx ... is always invalid
            // 1111110x ... is always invalid
            // note: when 'strict' is false, we need to mimic the behavior of FP9/FP10,
            // which did not do the c&8 test. so skip it for bug-compatibility.
            if ((strict && (c & 0x08)) || (inLen < 4)) {
                // Invalid lead byte (strict only) or truncated sequence
                goto invalid;
            }
            if ((in[1]&0xC0) != 0x80 ||
                (in[2]&0xC0) != 0x80 ||
                (in[3]&0xC0) != 0x80)
            {
                goto invalid;
            }
            outch = ((c<<18     & 0x1C0000) |
                     (in[1]<<12 & 0x3F000) |
                     (in[2]<<6  & 0xFC0) |
                     (in[3]     & 0x3F));
            if (outch < 0x10000) {
                // Overlong sequence, reject as invalid.
                goto invalid;
            }
            if (strict && outch > 0x10FFFF) {
                // Beyond the last Unicode code point: it has no surrogate-pair
                // form, and the 10-bit masks below would silently alias it to
                // a different, valid-looking character. Only enforced in
                // strict mode so the non-strict path stays bug-compatible.
                goto invalid;
            }
            in += 4;
            inLen -= 4;
            // Encode as UTF-16 surrogate sequence
            if (out) {
                if ((outMax -= 2) < 0) {
                    return -1;
                }
                *out++ = (wchar)((((outch-0x10000)>>10) & 0x3FF) + 0xD800);
                *out++ = (wchar)(((outch-0x10000) & 0x3FF) + 0xDC00);
            }
            outLen += 2;
            continue;

        default:
            // 10xxxxxx: a continuation byte cannot start a sequence.
        invalid:
            if (strict)
                return -1;
            // else fall thru and copy the byte as-is
        }

        // Not converted by the switch: ASCII, or (non-strict only) a byte of
        // a malformed sequence. Copy it through as a single code unit.
        if (out) {
            if (--outMax < 0) {
                return -1;
            }
            *out++ = (wchar)c;
        }
        inLen--;
        in++;
        outLen++;
    }
    return outLen;
}
/**
 * Decodes one UTF-8 sequence from the start of `chars` into a UCS-4 value.
 *
 * Sequences of 1 to 6 bytes are accepted, i.e. values up to 0x7FFFFFFF.
 * The function does not restrict the result to U+10FFFF, nor does it
 * reject values in the surrogate range; callers that need that must check
 * the decoded value themselves.
 *
 * @param chars bytes to decode
 * @param len   number of bytes available at `chars`
 * @param out   receives the decoded value; written only on success
 * @return number of bytes consumed (1-6), or 0 if `len` is less than 1, the
 *         lead byte is invalid, the sequence is truncated, a trailing byte
 *         is not a continuation byte, or the encoding is overlong
 */
int32_t UnicodeUtils::Utf8ToUcs4(const uint8 *chars,
                                 int32_t len,
                                 uint32_t *out)
{
    // U-00000000 - U-0000007F: 0xxxxxxx
    // U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
    // U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
    // U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

    // The minUCS4 table enforces the security rule that an
    // overlong UTF-8 sequence is forbidden, if a shorter
    // sequence could encode the same character.
    // Indexed by (sequence length - 1).
    static const uint32 minUCS4[] = {
        0x00000000,
        0x00000080,
        0x00000800,
        0x00010000,
        0x00200000,
        0x04000000
    };

    int32_t n = 0;
    uint32_t b = 0;

    if (len < 1) {
        return 0;
    }

    // The lead byte determines the sequence length n and contributes the
    // most significant payload bits to b.
    switch (chars[0]>>4) {
    case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
        n = 1;
        b = chars[0];
        break;

    case 12: case 13:
        n = 2;
        b = chars[0]&0x1F;
        break;

    case 14:
        n = 3;
        b = chars[0]&0x0F;
        break;

    case 15:
        switch (chars[0]&0x0C) {
        case 0x00:
        case 0x04:
            // 11110xxx
            n = 4;
            b = chars[0]&0x07;
            break;
        case 0x08:
            // 111110xx
            n = 5;
            b = chars[0]&0x03;
            break;
        default:
            // 0x0C: lead byte is in 0xFC..0xFF. Only 1111110x is a valid
            // 6-byte lead; 0xFE and 0xFF never start a UTF-8 sequence.
            if (chars[0]&0x02) {
                return 0;
            }
            n = 6;
            b = chars[0]&0x01;
            break;
        }
        // Leave the outer switch here. Falling into the default case below
        // would reject every 4-, 5- and 6-byte sequence.
        break;

    default: // 10xxxxxx: a continuation byte cannot start a sequence
        return 0;
    }

    if (len < n) {
        // Truncated sequence
        return 0;
    }

    // Fold in six payload bits from each continuation byte.
    for (int i=1; i<n; i++) {
        if ((chars[i]&0xC0) != 0x80) {
            return 0;
        }
        b = (b<<6) | (chars[i]&0x3F);
    }

    if (b < minUCS4[n-1]) {
        // Overlong encoding
        return 0;
    }

    *out = b;
    return n;
}
/**
 * Encodes a single UCS-4 value as a UTF-8 sequence of 1 to 6 bytes.
 *
 *   U-00000000 - U-0000007F: 0xxxxxxx
 *   U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
 *   U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 *   U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * @param value value to encode
 * @param chars destination; the function writes as many bytes as it
 *              returns and performs no bounds checking of its own
 * @return number of bytes written (1-6), or 0 with nothing written when
 *         `value` is 0x80000000 or above
 */
int32_t UnicodeUtils::Ucs4ToUtf8(uint32_t value,
                                 uint8 *chars)
{
    // Single-byte values are stored verbatim.
    if (value < 0x80) {
        chars[0] = (uint8)value;
        return 1;
    }

    // Choose the sequence length and the marker bits of the lead byte.
    int32_t count;
    uint32_t leadMark;
    if (value < 0x800) {
        count = 2;
        leadMark = 0xC0;
    }
    else if (value < 0x10000) {
        count = 3;
        leadMark = 0xE0;
    }
    else if (value < 0x200000) {
        count = 4;
        leadMark = 0xF0;
    }
    else if (value < 0x4000000) {
        count = 5;
        leadMark = 0xF8;
    }
    else if (value < 0x80000000) {
        count = 6;
        leadMark = 0xFC;
    }
    else {
        // Not representable
        return 0;
    }

    // Fill the continuation bytes from the tail, six bits at a time.
    uint32_t rest = value;
    for (int32_t pos = count - 1; pos > 0; pos--) {
        chars[pos] = (uint8)(0x80 | (rest & 0x3F));
        rest >>= 6;
    }

    // The range checks above guarantee the remaining bits fit beside the
    // lead-byte marker.
    chars[0] = (uint8)(leadMark | rest);
    return count;
}
}