This source file includes the following definitions.
- ConvertUTF16toUTF8
- isLegalUTF8
- ConvertUTF8toUTF16
- gf_utf8_wcslen
- gf_utf8_wcstombs
- gf_utf8_mbstowcs
- gf_utf8_wcslen
- gf_utf8_wcstombs
- gf_utf8_mbstowcs
#ifndef GPAC_DISABLE_CORE_TOOLS
#include <gpac/utf.h>
#if 1
/* Code-unit types for the three Unicode encoding forms. They are aliases of
   u32/u16/u8, which are not defined in this file (presumably they come in
   through <gpac/utf.h> - confirm). */
typedef u32 UTF32;
typedef u16 UTF16;
typedef u8 UTF8;
/* Truth value returned by isLegalUTF8; see the false/true macros below. */
typedef u8 Boolean;
/* U+FFFD, written in place of code points that cannot be represented. */
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
/* Largest code point that fits in a single UTF-16 code unit. */
#define UNI_MAX_BMP (UTF32)0x0000FFFF
/* Largest code point ConvertUTF8toUTF16 will emit (as a surrogate pair). */
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
/* The next two limits are not referenced by the code visible in this file. */
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
/* Status returned by ConvertUTF16toUTF8 / ConvertUTF8toUTF16. */
typedef enum {
/* every input code unit was converted */
conversionOK,
/* the input ends in the middle of a multi-unit character */
sourceExhausted,
/* the next character does not fit in the output buffer */
targetExhausted,
/* the input is malformed */
sourceIllegal
} ConversionResult;
/* How the converters treat surrogate problems and out-of-range code points:
   strictConversion stops with sourceIllegal, lenientConversion lets them
   through or substitutes UNI_REPLACEMENT_CHAR. */
typedef enum {
strictConversion = 0,
lenientConversion
} ConversionFlags;
/* Surrogate-pair arithmetic: a supplementary code point minus halfBase is split
   into a high part (shifted right by halfShift) and a low part (masked with
   halfMask), each added to the start of its surrogate range. */
static const int halfShift = 10;
static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;
/* Inclusive bounds of the high (leading) and low (trailing) surrogate ranges. */
#define UNI_SUR_HIGH_START (UTF32)0xD800
#define UNI_SUR_HIGH_END (UTF32)0xDBFF
#define UNI_SUR_LOW_START (UTF32)0xDC00
#define UNI_SUR_LOW_END (UTF32)0xDFFF
/* Plain integer truth values used with Boolean. */
#define false 0
#define true 1
/* Indexed by the first byte of a UTF-8 sequence: the number of bytes that are
   expected to follow it. 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
   0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5. The table does not validate
   anything; isLegalUTF8 does that. */
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/* Indexed by trailing-byte count. ConvertUTF8toUTF16 sums the raw bytes of a
   sequence (shifting left by 6 between bytes) and then subtracts this value,
   which removes the marker bits in one step: e.g. 0x3080 is (0xC0 << 6) + 0x80,
   the lead and continuation markers of a 2-byte sequence. */
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL
};
/* Indexed by the total length in bytes of an encoded character: the marker
   bits OR-ed into its first byte by ConvertUTF16toUTF8. */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/*
 * Converts a run of UTF-16 code units to UTF-8.
 *
 * sourceStart  in: address of the first code unit to convert;
 *              out: address of the first code unit that was not converted.
 * sourceEnd    one past the last code unit to convert.
 * targetStart  in: address of the first free output byte;
 *              out: one past the last byte written.
 * targetEnd    one past the end of the output buffer.
 * flags        strictConversion stops on an unpaired surrogate; with
 *              lenientConversion an unpaired surrogate is encoded like any
 *              other 16-bit value (a 3-byte sequence).
 *
 * Returns conversionOK when the whole input was consumed, sourceExhausted when
 * the input ends right after a high surrogate, targetExhausted when the next
 * character does not fit in the output, and sourceIllegal (strict mode only)
 * for an unpaired surrogate. On any result other than conversionOK,
 * *sourceStart is left on the code unit that starts the character which could
 * not be converted. No terminator is written to the output.
 */
ConversionResult ConvertUTF16toUTF8 (
    const UTF16** sourceStart, const UTF16* sourceEnd,
    UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
	ConversionResult result = conversionOK;
	const UTF16* source = *sourceStart;
	UTF8* target = *targetStart;
	while (source < sourceEnd) {
		UTF32 ch;
		unsigned short bytesToWrite = 0;
		const UTF32 byteMask = 0xBF;
		const UTF32 byteMark = 0x80;
		/* Remembered so the whole character can be un-read if the output is full. */
		const UTF16* oldSource = source;
		ch = *source++;
		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
			/* High surrogate: a low surrogate must follow to form one code point. */
			if (source < sourceEnd) {
				UTF32 ch2 = *source;
				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
					ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
					     + (ch2 - UNI_SUR_LOW_START) + halfBase;
					++source;
				} else if (flags == strictConversion) {
					/* Unpaired high surrogate: step back onto it and stop. */
					--source;
					result = sourceIllegal;
					break;
				}
			} else {
				/* The input ends between the two halves of a pair. */
				--source;
				result = sourceExhausted;
				break;
			}
		} else if (flags == strictConversion) {
			/* A low surrogate without a preceding high surrogate. */
			if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				--source;
				result = sourceIllegal;
				break;
			}
		}
		/* Number of UTF-8 bytes needed for this code point. */
		if (ch < (UTF32)0x80) {
			bytesToWrite = 1;
		} else if (ch < (UTF32)0x800) {
			bytesToWrite = 2;
		} else if (ch < (UTF32)0x10000) {
			bytesToWrite = 3;
		} else if (ch < (UTF32)0x110000) {
			bytesToWrite = 4;
		} else {
			bytesToWrite = 3;
			ch = UNI_REPLACEMENT_CHAR;
		}
		/* Compare sizes instead of advancing target first: forming a pointer
		   more than one element past the end of the buffer is undefined. */
		if ((targetEnd - target) < bytesToWrite) {
			source = oldSource;
			result = targetExhausted;
			break;
		}
		/* The bytes are produced last-to-first, so start from the end. */
		target += bytesToWrite;
		switch (bytesToWrite) {
		case 4:
			*--target = (UTF8)((ch | byteMark) & byteMask);
			ch >>= 6;
			/* fallthrough */
		case 3:
			*--target = (UTF8)((ch | byteMark) & byteMask);
			ch >>= 6;
			/* fallthrough */
		case 2:
			*--target = (UTF8)((ch | byteMark) & byteMask);
			ch >>= 6;
			/* fallthrough */
		case 1:
			*--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
		}
		target += bytesToWrite;
	}
	*sourceStart = source;
	*targetStart = target;
	return result;
}
/*
 * Checks whether the `length` bytes starting at `source` form one well-formed
 * UTF-8 sequence. `length` is the total sequence length (lead byte included);
 * anything outside 1..4 is rejected. The caller must make sure `length` bytes
 * are readable.
 *
 * Returns true for a well-formed sequence, false otherwise. Overlong forms
 * (lead 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by < 0x90), encoded
 * surrogates (0xED followed by > 0x9F) and values above U+10FFFF (0xF4 followed
 * by > 0x8F, or a lead byte above 0xF4) are rejected.
 */
static Boolean isLegalUTF8(const UTF8 *source, int length) {
	UTF8 a;
	const UTF8 *srcptr = source+length;
	/* The bytes are examined from the last one back to the first; each case
	   deliberately falls through to the next. */
	switch (length) {
	default:
		return false;
	case 4:
		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
		/* fallthrough */
	case 3:
		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
		/* fallthrough */
	case 2:
		/* The second byte must be a continuation byte whatever the lead byte
		   is. Checking only the upper bound here let sequences such as
		   ED 41 80 or F4 20 80 80 through, because the 0xED and 0xF4 cases
		   below only narrow the upper bound. */
		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
		/* Lead bytes that further restrict the second byte. */
		switch (*source) {
		case 0xE0:
			if (a < 0xA0) return false;
			break;
		case 0xED:
			if (a > 0x9F) return false;
			break;
		case 0xF0:
			if (a < 0x90) return false;
			break;
		case 0xF4:
			if (a > 0x8F) return false;
			break;
		default:
			break;
		}
		/* fallthrough */
	case 1:
		/* 0x80-0xBF cannot start a sequence; 0xC0 and 0xC1 are always overlong. */
		if (*source >= 0x80 && *source < 0xC2) return false;
	}
	if (*source > 0xF4) return false;
	return true;
}
/*
 * Converts a run of UTF-8 bytes to UTF-16 code units.
 *
 * sourceStart  in: address of the first byte to convert;
 *              out: address of the first byte that was not converted.
 * sourceEnd    one past the last byte to convert.
 * targetStart  in: address of the first free output code unit;
 *              out: one past the last code unit written.
 * targetEnd    one past the end of the output buffer.
 * flags        strictConversion stops on an encoded surrogate or a value above
 *              UNI_MAX_UTF16; lenientConversion writes UNI_REPLACEMENT_CHAR
 *              for those instead.
 *
 * Returns conversionOK when the whole input was consumed, sourceExhausted when
 * the last sequence is cut short by sourceEnd, targetExhausted when the output
 * is full, and sourceIllegal for malformed input. A sequence rejected by
 * isLegalUTF8 stops the conversion in both modes. On any result other than
 * conversionOK, *sourceStart is left on the first byte of the sequence that
 * could not be converted. No terminator is written to the output.
 */
ConversionResult ConvertUTF8toUTF16 (
    const UTF8** sourceStart, const UTF8* sourceEnd,
    UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
	ConversionResult result = conversionOK;
	const UTF8* source = *sourceStart;
	UTF16* target = *targetStart;
	while (source < sourceEnd) {
		UTF32 ch = 0;
		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
		/* Compare counts instead of computing source + extraBytesToRead, which
		   could form a pointer beyond one past the end of the input. */
		if (extraBytesToRead >= sourceEnd - source) {
			result = sourceExhausted;
			break;
		}
		if (! isLegalUTF8(source, extraBytesToRead+1)) {
			result = sourceIllegal;
			break;
		}
		/* Sum the raw bytes, shifting by 6 between them; the marker bits are
		   removed afterwards by subtracting offsetsFromUTF8[]. Every case
		   deliberately falls through. */
		switch (extraBytesToRead) {
		case 5:
			ch += *source++;
			ch <<= 6;
			/* fallthrough */
		case 4:
			ch += *source++;
			ch <<= 6;
			/* fallthrough */
		case 3:
			ch += *source++;
			ch <<= 6;
			/* fallthrough */
		case 2:
			ch += *source++;
			ch <<= 6;
			/* fallthrough */
		case 1:
			ch += *source++;
			ch <<= 6;
			/* fallthrough */
		case 0:
			ch += *source++;
		}
		ch -= offsetsFromUTF8[extraBytesToRead];
		if (target >= targetEnd) {
			/* Un-read the whole sequence so the caller can resume from it. */
			source -= (extraBytesToRead+1);
			result = targetExhausted;
			break;
		}
		if (ch <= UNI_MAX_BMP) {
			/* Surrogate values are not characters and must not appear in UTF-8. */
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				if (flags == strictConversion) {
					source -= (extraBytesToRead+1);
					result = sourceIllegal;
					break;
				} else {
					*target++ = UNI_REPLACEMENT_CHAR;
				}
			} else {
				*target++ = (UTF16)ch;
			}
		} else if (ch > UNI_MAX_UTF16) {
			if (flags == strictConversion) {
				result = sourceIllegal;
				source -= (extraBytesToRead+1);
				break;
			} else {
				*target++ = UNI_REPLACEMENT_CHAR;
			}
		} else {
			/* Supplementary code point: needs two output units. */
			if (target + 1 >= targetEnd) {
				source -= (extraBytesToRead+1);
				result = targetExhausted;
				break;
			}
			ch -= halfBase;
			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
		}
	}
	*sourceStart = source;
	*targetStart = target;
	return result;
}
GF_EXPORT
/*
 * Returns the number of 16-bit code units in s that precede the terminating
 * zero unit. Units are counted, not characters, so a surrogate pair counts as
 * two. A NULL s is treated as an empty string and yields 0.
 */
size_t gf_utf8_wcslen (const unsigned short *s)
{
	const unsigned short *ptr = s;

	if (!s) return 0;
	while (*ptr != (unsigned short)'\0') {
		ptr++;
	}
	return (size_t)(ptr - s);
}
/*
 * Converts the zero-terminated UTF-16 string at *srcp to a zero-terminated
 * UTF-8 string in dest, using strict conversion.
 *
 * dest  output buffer.
 * len   maximum number of UTF-8 bytes to produce, terminator not included.
 * srcp  address of the source pointer. Set to NULL when the whole string was
 *       converted; on failure it is left wherever ConvertUTF16toUTF8 stopped.
 *
 * Returns the number of bytes written, terminator not included, or (size_t)-1
 * when the input is malformed, the output does not fit, or dest is NULL. A NULL
 * srcp or *srcp is treated as an empty string: 0 is returned and, if dest has
 * room, it is set to the empty string.
 *
 * NOTE(review): the terminator is stored right after the last converted byte
 * without being checked against len, so when the result is exactly len bytes
 * long it lands on dest[len]. dest therefore appears to need room for len + 1
 * bytes - confirm against the callers.
 */
GF_EXPORT
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
	const UTF16** sourceStart = srcp;
	const UTF16* sourceEnd;
	UTF8* targetStart = (UTF8*) dest;
	UTF8* targetEnd;
	ConversionResult res;

	if (!srcp || !*srcp) {
		if (dest && len) dest[0] = 0;
		return 0;
	}
	if (!dest) return (size_t)-1;

	sourceEnd = *srcp + gf_utf8_wcslen(*srcp);
	targetEnd = (UTF8*) dest + len;
	res = ConvertUTF16toUTF8(sourceStart, sourceEnd, &targetStart, targetEnd, strictConversion);
	if (res != conversionOK) return (size_t)-1;
	*targetStart = 0;
	*srcp=NULL;
	/* The source stops at its first zero unit, so no zero byte was produced and
	   this difference equals strlen(dest) without rescanning the output. */
	return (size_t)(targetStart - (UTF8*) dest);
}
/*
 * Converts the zero-terminated UTF-8 string at *srcp to a zero-terminated
 * UTF-16 string in dest, using strict conversion.
 *
 * dest  output buffer.
 * len   maximum number of 16-bit units to produce, terminator not included.
 * srcp  address of the source pointer. Set to NULL when the whole string was
 *       converted; on failure it is left wherever ConvertUTF8toUTF16 stopped.
 *
 * Returns the number of 16-bit units written, terminator not included, or
 * (size_t)-1 when the input is malformed, the output does not fit, or dest is
 * NULL. A NULL srcp or *srcp is treated as an empty string: 0 is returned and,
 * if dest has room, it is set to the empty string.
 *
 * NOTE(review): the terminator is stored right after the last converted unit
 * without being checked against len, so when the result is exactly len units
 * long it lands on dest[len]. dest therefore appears to need room for len + 1
 * units - confirm against the callers.
 */
GF_EXPORT
size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
{
	const UTF8** sourceStart = (const UTF8**) srcp;
	const UTF8* sourceEnd;
	UTF16* targetStart = (UTF16* ) dest;
	UTF16* targetEnd;
	ConversionResult res;

	if (!srcp || !*srcp) {
		if (dest && len) dest[0] = 0;
		return 0;
	}
	if (!dest) return (size_t)-1;

	sourceEnd = (const UTF8*) ( *srcp + strlen( *srcp) );
	targetEnd = (UTF16* ) (dest + len);
	res = ConvertUTF8toUTF16(sourceStart, sourceEnd, &targetStart, targetEnd, strictConversion);
	if (res != conversionOK) return (size_t)-1;
	*targetStart = 0;
	*srcp=NULL;
	/* The source stops at its first zero byte and no legal sequence decodes to
	   zero, so this difference equals the length of the string in dest. */
	return (size_t)(targetStart - (UTF16* ) dest);
}
#else
GF_EXPORT
/*
 * Returns the number of 16-bit code units in s that precede the terminating
 * zero unit. A NULL s is treated as an empty string and yields 0.
 */
size_t gf_utf8_wcslen (const unsigned short *s)
{
	const unsigned short *ptr = s;

	if (!s) return 0;
	while (*ptr != (unsigned short)'\0') {
		ptr++;
	}
	return (size_t)(ptr - s);
}
GF_EXPORT
/*
 * Encodes the zero-terminated string of 16-bit units at *srcp as UTF-8.
 * Every unit is encoded on its own as a 1-, 2- or 3-byte sequence; surrogate
 * pairs are not combined.
 *
 * With dest == NULL nothing is written: the function returns the number of
 * bytes the encoded string would take (terminator not included) and sets
 * *srcp to NULL.
 *
 * Otherwise at most len bytes are written to dest. If the whole string and its
 * terminator fit, the terminator is stored and *srcp is set to NULL; if not,
 * *srcp is left on the first unit that was not encoded (the zero unit itself
 * when only the terminator did not fit) and no terminator is stored. The
 * return value is the number of bytes written, terminator not included.
 */
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
	/* Marker bits of the first byte, indexed by the number of bytes that follow. */
	static const unsigned char lead_mark[3] = { 0x00, 0xC0, 0xE0 };
	const unsigned short *in = *srcp;
	char *out = dest;
	size_t room = len;

	if (dest == NULL) {
		/* Measuring mode: add up the encoded size of every unit. */
		size_t needed = 0;
		for (; *in != 0; in++) {
			needed += (*in < 0x80) ? 1 : ((*in < 0x800) ? 2 : 3);
		}
		*srcp = NULL;
		return needed;
	}

	while (*in != 0) {
		unsigned short unit = *in;
		size_t trail = (unit < 0x80) ? 0 : ((unit < 0x800) ? 1 : 2);

		/* The whole sequence (trail + 1 bytes) has to fit. */
		if (room <= trail) {
			*srcp = in;
			return out - dest;
		}
		room -= trail + 1;
		*out++ = (unsigned char)((unit >> (6 * trail)) | lead_mark[trail]);
		while (trail > 0) {
			trail--;
			*out++ = (unsigned char)(((unit >> (6 * trail)) & 0x3F) | 0x80);
		}
		in++;
	}

	/* End of input reached: store the terminator only if a byte is left. */
	if (room == 0) {
		*srcp = in;
	} else {
		*out = '\0';
		*srcp = NULL;
	}
	return out - dest;
}
/* Decoder state for a multibyte character that has only been read in part.
   gf_utf8_mbstowcs resumes from it when count is non-zero. */
typedef struct
{
/* continuation bytes still expected; 0 means no character is pending */
u32 count : 16;
/* bits collected so far for the pending character */
u32 value : 16;
} gf_utf8_mbstate_t;
/* The single state object used by gf_utf8_mbstowcs. It has static storage and
   so starts zeroed; the code visible here only ever stores 0 into its count. */
static gf_utf8_mbstate_t internal;
/*
 * Decodes the zero-terminated UTF-8 string at *srcp into 16-bit units.
 * Only 1-, 2- and 3-byte sequences are accepted; a lead byte of 0xF0 or above
 * is treated as bad input.
 *
 * dest  output buffer.
 * len   maximum number of 16-bit units to store.
 * srcp  address of the source pointer. Set to NULL when the terminating zero
 *       was reached (and stored); set to the next unread byte when len units
 *       were stored first (no terminator is stored in that case); set to the
 *       first byte of the offending sequence on bad input.
 *
 * Returns the number of units stored, terminator not included, or (size_t)-1
 * on bad input.
 */
GF_EXPORT
size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
{
gf_utf8_mbstate_t* ps = &internal;
const char *src = *srcp;
unsigned short* destptr = dest;
/* One output unit per iteration. */
for (; len > 0; destptr++, len--) {
/* Start of the current sequence, restored if it turns out to be malformed. */
const char* backup_src = src;
unsigned char c;
unsigned short wc;
size_t count;
if (ps->count == 0) {
c = (unsigned char) *src;
if (c < 0x80) {
/* Single-byte character; the terminator is stored too but not counted,
   because the break skips the loop's destptr++. */
*destptr = (wchar_t) c;
if (c == 0) {
src = NULL;
break;
}
src++;
continue;
} else if (c < 0xC0) {
/* A continuation byte cannot start a sequence. */
goto bad_input;
}
if (c < 0xE0) {
/* Two-byte sequence; lead bytes 0xC0 and 0xC1 are overlong forms. */
wc = (wchar_t)(c & 0x1F) << 6;
count = 1;
if (c < 0xC2) goto bad_input;
} else if (c < 0xF0) {
/* Three-byte sequence. */
wc = (wchar_t)(c & 0x0F) << 12;
count = 2;
}
else goto bad_input;
src++;
} else {
/* Resume a character left pending in the shared state. */
wc = ps->value << 6;
count = ps->count;
}
/* Read the continuation bytes. */
for (;;) {
/* XOR with 0x80 maps 0x80-0xBF to 0x00-0x3F; any other byte ends up >= 0x40. */
c = (unsigned char) *src++ ^ 0x80;
if (!(c < 0x40)) goto bad_input_backup;
wc |= (unsigned short) c << (6 * --count);
if (count == 0)
break;
/* Bytes remain: a value this small could have been encoded in fewer bytes. */
if ((unsigned short) wc < ((unsigned short) 1 << (5 * count + 6)))
goto bad_input_backup;
}
*destptr = wc;
ps->count = 0;
continue;
bad_input_backup:
/* Rewind to the lead byte so *srcp reports where the bad sequence starts. */
src = backup_src;
goto bad_input;
}
*srcp = src;
return destptr-dest;
bad_input:
*srcp = src;
return (size_t)(-1);
}
#endif