root/src/utils/utf.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ConvertUTF16toUTF8
  2. isLegalUTF8
  3. ConvertUTF8toUTF16
  4. gf_utf8_wcslen
  5. gf_utf8_wcstombs
  6. gf_utf8_mbstowcs
  7. gf_utf8_wcslen
  8. gf_utf8_wcstombs
  9. gf_utf8_mbstowcs

/*
 *                      GPAC - Multimedia Framework C SDK
 *
 *                      Authors: Jean Le Feuvre
 *                      Copyright (c) Telecom ParisTech 2007-2012
 *                                      All rights reserved
 *
 *  This file is part of GPAC / common tools sub-project
 *
 *  GPAC is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  GPAC is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#ifndef GPAC_DISABLE_CORE_TOOLS

#include <gpac/utf.h>


#if 1


/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
        mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
        source sequences, enhanced error detection, added casts
        to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "ConvertUTF.h" for complete documentation.

------------------------------------------------------------------------ */

/*
 * Code-unit types used by the converters below, expressed with the
 * project's fixed-width integer typedefs (u32/u16/u8 are declared outside
 * this file).
 */
typedef u32 UTF32;      /* at least 32 bits */
typedef u16 UTF16;      /* at least 16 bits */
typedef u8 UTF8;        /* typically 8 bits */
typedef u8 Boolean; /* 0 or 1 */

/* Some fundamental constants */
/* U+FFFD: substituted by the converters for values they cannot represent. */
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
/* Largest value that fits in a single UTF-16 code unit. */
#define UNI_MAX_BMP (UTF32)0x0000FFFF
/* Largest value representable in UTF-16 (with a surrogate pair). */
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
/* NOTE(review): the two constants below are not referenced by the code visible in this file. */
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF

/*
 * Status returned by ConvertUTF16toUTF8 and ConvertUTF8toUTF16.
 * Anything other than conversionOK means the conversion stopped early;
 * the converters still publish how far they got through their
 * sourceStart/targetStart out-parameters.
 */
typedef enum {
        conversionOK,           /* conversion successful */
        sourceExhausted,        /* partial character in source, but hit end */
        targetExhausted,        /* insuff. room in target for conversion */
        sourceIllegal           /* source sequence is illegal/malformed */
} ConversionResult;

/*
 * Selects how the converters treat surrogate code points that are not part
 * of a valid pair:
 *  - strictConversion: stop and report sourceIllegal.
 *  - lenientConversion: keep going (ConvertUTF16toUTF8 encodes the value
 *    as-is, ConvertUTF8toUTF16 substitutes UNI_REPLACEMENT_CHAR).
 * The gf_utf8_* wrappers in the "#if 1" branch always pass strictConversion.
 */
typedef enum {
        strictConversion = 0,
        lenientConversion
} ConversionFlags;

/*
 * Surrogate-pair arithmetic: a supplementary code point is
 * halfBase + (high - UNI_SUR_HIGH_START) * 2^halfShift + (low - UNI_SUR_LOW_START),
 * and halfMask extracts the low 10 bits that go into the low surrogate.
 */
static const int halfShift  = 10; /* used for shifting by 10 bits */

static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;

/* Inclusive bounds of the high (lead) and low (trail) surrogate ranges. */
#define UNI_SUR_HIGH_START  (UTF32)0xD800
#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
#define UNI_SUR_LOW_START   (UTF32)0xDC00
#define UNI_SUR_LOW_END     (UTF32)0xDFFF
/* Values returned through the Boolean typedef by isLegalUTF8. */
#define false      0
#define true        1

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
 *
 * Layout (32 entries per row): 0x00-0xBF -> 0 (this includes the
 * continuation bytes 0x80-0xBF), 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
 * 0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
 */
static const char trailingBytesForUTF8[256] = {
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 *
 * ConvertUTF8toUTF16 accumulates the raw bytes (shifting left by 6 between
 * bytes) without masking off the marker bits; entry [n] is the sum of those
 * marker bits for a sequence with n trailing bytes, e.g.
 * [1] = (0xC0 << 6) + 0x80 and [2] = (0xE0 << 12) + (0x80 << 6) + 0x80.
 */
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
                                          0x03C82080UL, 0xFA082080UL, 0x82082080UL
                                        };

/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow.  There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 *
 * The table is indexed by the total number of bytes in the sequence
 * (ConvertUTF16toUTF8 uses firstByteMark[bytesToWrite]); index 0 is unused.
 */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

/* --------------------------------------------------------------------- */

/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" at the bottom of the file for equivalent code.)
 * If your compiler supports it, the "isLegalUTF8" call can be turned
 * into an inline function.
 */

/* --------------------------------------------------------------------- */

/*
 * Converts the UTF-16 code units in [*sourceStart, sourceEnd) to UTF-8 bytes
 * written to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was fully converted. No terminator is written to the target.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the source ends with a high surrogate; *sourceStart
 *                      points at that surrogate.
 *   sourceIllegal    - (strictConversion only) an unpaired surrogate was
 *                      found; *sourceStart points at it.
 *   targetExhausted  - the next character does not fit in the target;
 *                      *sourceStart points at the first unit of that character.
 *
 * With lenientConversion an unpaired surrogate is encoded as-is as a 3-byte
 * sequence.
 */
ConversionResult ConvertUTF16toUTF8 (
    const UTF16** sourceStart, const UTF16* sourceEnd,
    UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
        ConversionResult result = conversionOK;
        const UTF16* source = *sourceStart;
        UTF8* target = *targetStart;
        while (source < sourceEnd) {
                UTF32 ch;
                unsigned short bytesToWrite = 0;
                const UTF32 byteMask = 0xBF;
                const UTF32 byteMark = 0x80;
                const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
                ch = *source++;
                /* If we have a surrogate pair, convert to UTF32 first. */
                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
                        /* If the 16 bits following the high surrogate are in the source buffer... */
                        if (source < sourceEnd) {
                                UTF32 ch2 = *source;
                                /* If it's a low surrogate, convert to UTF32. */
                                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                                        ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                                             + (ch2 - UNI_SUR_LOW_START) + halfBase;
                                        ++source;
                                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                                        --source; /* return to the illegal value itself */
                                        result = sourceIllegal;
                                        break;
                                }
                        } else { /* We don't have the 16 bits following the high surrogate. */
                                --source; /* return to the high surrogate */
                                result = sourceExhausted;
                                break;
                        }
                } else if (flags == strictConversion) {
                        /* UTF-16 surrogate values are illegal in UTF-32 */
                        if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                                --source; /* return to the illegal value itself */
                                result = sourceIllegal;
                                break;
                        }
                }
                /* Figure out how many bytes the result will require */
                if (ch < (UTF32)0x80) {
                        bytesToWrite = 1;
                } else if (ch < (UTF32)0x800) {
                        bytesToWrite = 2;
                } else if (ch < (UTF32)0x10000) {
                        bytesToWrite = 3;
                } else if (ch < (UTF32)0x110000) {
                        bytesToWrite = 4;
                } else {
                        bytesToWrite = 3;
                        ch = UNI_REPLACEMENT_CHAR;
                }

                /*
                 * Check the remaining room BEFORE moving the write pointer:
                 * advancing target beyond one-past-the-end of the buffer and
                 * then comparing it is undefined behavior.
                 */
                if ((targetEnd - target) < (int)bytesToWrite) {
                        source = oldSource; /* Back up source pointer! */
                        result = targetExhausted;
                        break;
                }
                /* The bytes are emitted last-to-first, so start from the end of the sequence. */
                target += bytesToWrite;
                switch (bytesToWrite) { /* note: everything falls through. */
                case 4:
                        *--target = (UTF8)((ch | byteMark) & byteMask);
                        ch >>= 6;
                        /* fallthrough */
                case 3:
                        *--target = (UTF8)((ch | byteMark) & byteMask);
                        ch >>= 6;
                        /* fallthrough */
                case 2:
                        *--target = (UTF8)((ch | byteMark) & byteMask);
                        ch >>= 6;
                        /* fallthrough */
                case 1:
                        *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
                }
                target += bytesToWrite;
        }
        *sourceStart = source;
        *targetStart = target;
        return result;
}

/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from ConvertUTF8to*, then the length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 * If presented with a length > 4, this returns false.  The Unicode
 * definition of UTF-8 goes up to 4-byte sequences.
 *
 * source: first byte of the sequence; source[0..length-1] must be readable.
 * length: total number of bytes in the sequence (lead byte included).
 * Returns true when the sequence is well-formed, false otherwise
 * (bad continuation byte, overlong form, encoded surrogate, value above
 * U+10FFFF, or a lead byte that cannot start a sequence).
 */

static Boolean isLegalUTF8(const UTF8 *source, int length) {
        UTF8 a;
        const UTF8 *srcptr = source+length;
        /* The bytes are checked from the last one back towards the lead byte. */
        switch (length) {
        default:
                return false;
        /* Everything else falls through when "true"... */
        case 4:
                if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
                /* fallthrough */
        case 3:
                if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
                /* fallthrough */
        case 2:
                /*
                 * The second byte must be a continuation byte (0x80..0xBF)
                 * whatever the lead byte is. Checking only the upper bound
                 * here let lead bytes 0xED and 0xF4 accept a second byte
                 * below 0x80, because their cases below only test an upper
                 * limit.
                 */
                if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

                /* Lead-byte specific narrowing of the second byte. */
                switch (*source) {
                /* no fall-through in this inner switch */
                case 0xE0: /* reject overlong 3-byte forms */
                        if (a < 0xA0) return false;
                        break;
                case 0xED: /* reject encoded surrogates (U+D800..U+DFFF) */
                        if (a > 0x9F) return false;
                        break;
                case 0xF0: /* reject overlong 4-byte forms */
                        if (a < 0x90) return false;
                        break;
                case 0xF4: /* reject values above U+10FFFF */
                        if (a > 0x8F) return false;
                        break;
                default:   /* the generic range check above is sufficient */
                        break;
                }
                /* fallthrough */
        case 1:
                /* 0x80..0xBF are continuation bytes, 0xC0/0xC1 only start overlong forms. */
                if (*source >= 0x80 && *source < 0xC2) return false;
        }
        if (*source > 0xF4) return false;
        return true;
}

/* --------------------------------------------------------------------- */

/*
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code units
 * written to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was fully converted. No terminator is written to the target.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the source ends in the middle of a sequence;
 *                      *sourceStart points at the lead byte of that sequence.
 *   sourceIllegal    - isLegalUTF8 rejected the sequence at *sourceStart
 *                      (checked for both strict and lenient conversion), or,
 *                      with strictConversion, the decoded value is a
 *                      surrogate or is above UNI_MAX_UTF16.
 *   targetExhausted  - the next character does not fit in the target;
 *                      *sourceStart points at the lead byte of that character.
 */
ConversionResult ConvertUTF8toUTF16 (
    const UTF8** sourceStart, const UTF8* sourceEnd,
    UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
        ConversionResult result = conversionOK;
        const UTF8* source = *sourceStart;
        UTF16* target = *targetStart;
        while (source < sourceEnd) {
                UTF32 ch = 0;
                unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
                /*
                 * Compare against the number of bytes left instead of forming
                 * source + extraBytesToRead, which could point beyond
                 * one-past-the-end of the buffer (undefined behavior).
                 */
                if ((sourceEnd - source) <= (int)extraBytesToRead) {
                        result = sourceExhausted;
                        break;
                }
                /* Do this check whether lenient or strict */
                if (! isLegalUTF8(source, extraBytesToRead+1)) {
                        result = sourceIllegal;
                        break;
                }
                /*
                 * The cases all fall through: each one adds the next raw byte
                 * and makes room for the following one. The marker bits that
                 * get accumulated along the way are removed afterwards by
                 * subtracting offsetsFromUTF8[extraBytesToRead].
                 */
                switch (extraBytesToRead) {
                case 5:
                        ch += *source++;
                        ch <<= 6; /* remember, illegal UTF-8 */
                        /* fallthrough */
                case 4:
                        ch += *source++;
                        ch <<= 6; /* remember, illegal UTF-8 */
                        /* fallthrough */
                case 3:
                        ch += *source++;
                        ch <<= 6;
                        /* fallthrough */
                case 2:
                        ch += *source++;
                        ch <<= 6;
                        /* fallthrough */
                case 1:
                        ch += *source++;
                        ch <<= 6;
                        /* fallthrough */
                case 0:
                        ch += *source++;
                }
                ch -= offsetsFromUTF8[extraBytesToRead];

                if (target >= targetEnd) {
                        source -= (extraBytesToRead+1); /* Back up source pointer! */
                        result = targetExhausted;
                        break;
                }
                if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
                        /* UTF-16 surrogate values are illegal in UTF-32 */
                        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                                if (flags == strictConversion) {
                                        source -= (extraBytesToRead+1); /* return to the illegal value itself */
                                        result = sourceIllegal;
                                        break;
                                } else {
                                        *target++ = UNI_REPLACEMENT_CHAR;
                                }
                        } else {
                                *target++ = (UTF16)ch; /* normal case */
                        }
                } else if (ch > UNI_MAX_UTF16) {
                        if (flags == strictConversion) {
                                result = sourceIllegal;
                                source -= (extraBytesToRead+1); /* return to the start */
                                break; /* Bail out; shouldn't continue */
                        } else {
                                *target++ = UNI_REPLACEMENT_CHAR;
                        }
                } else {
                        /* target is a character in range 0x10000 - 0x10FFFF: needs a surrogate pair. */
                        if (target + 1 >= targetEnd) {
                                source -= (extraBytesToRead+1); /* Back up source pointer! */
                                result = targetExhausted;
                                break;
                        }
                        ch -= halfBase;
                        *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
                        *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
                }
        }
        *sourceStart = source;
        *targetStart = target;
        return result;
}



GF_EXPORT
size_t gf_utf8_wcslen (const unsigned short *s)
{
        /*
         * Returns the number of 16-bit units stored before the first zero
         * unit of s; the terminating zero itself is not counted.
         */
        size_t units = 0;

        while (s[units] != 0) {
                units++;
        }
        return units;
}

/*
 * Converts the zero-terminated UTF-16 string *srcp to UTF-8.
 *
 * dest: output buffer.
 * len:  capacity of dest in bytes.
 * srcp: in/out pointer to the source string.
 *
 * Returns the number of bytes stored in dest, not counting the terminating
 * NUL, or (size_t)-1 when an argument is NULL or when the conversion fails
 * (illegal or truncated surrogate pair, or output larger than len). On a
 * conversion failure *srcp is left at the position reported by
 * ConvertUTF16toUTF8; on success it is set to NULL.
 *
 * The terminating NUL is stored only when there is room for it: when the
 * converted text fills dest exactly (return value == len) the result is NOT
 * NUL-terminated. Writing the terminator unconditionally would store one
 * byte past the end of the buffer.
 */
GF_EXPORT
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
        const UTF16** sourceStart = srcp;
        const UTF16* sourceEnd;
        UTF8* targetStart = (UTF8*) dest;
        UTF8* targetEnd = (UTF8*) dest + len;
        ConversionFlags flags = strictConversion;
        ConversionResult res;

        if (!dest || !srcp || !*srcp) return (size_t)-1;

        sourceEnd = *srcp + gf_utf8_wcslen(*srcp);
        res = ConvertUTF16toUTF8(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
        if (res != conversionOK) return (size_t)-1;
        if (targetStart < targetEnd) *targetStart = 0;
        *srcp=NULL;
        /* Use the write position: dest may not be NUL-terminated, so it cannot be measured as a string. */
        return (size_t)(targetStart - (UTF8*) dest);
}

/*
 * Converts the NUL-terminated UTF-8 string *srcp to UTF-16.
 *
 * dest: output buffer.
 * len:  capacity of dest in 16-bit units.
 * srcp: in/out pointer to the source string.
 *
 * Returns the number of 16-bit units stored in dest, not counting the
 * terminating zero, or (size_t)-1 when an argument is NULL or when the
 * conversion fails (malformed or truncated UTF-8, or output larger than
 * len). On a conversion failure *srcp is left at the position reported by
 * ConvertUTF8toUTF16; on success it is set to NULL.
 *
 * The terminating zero is stored only when there is room for it: when the
 * converted text fills dest exactly (return value == len) the result is NOT
 * zero-terminated. Writing the terminator unconditionally would store one
 * unit past the end of the buffer.
 */
GF_EXPORT
size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
{
        const UTF8** sourceStart = (const UTF8**) srcp;
        const UTF8* sourceEnd;
        UTF16* targetStart = (UTF16* ) dest;
        UTF16* targetEnd = (UTF16* ) (dest + len);
        ConversionFlags flags = strictConversion;
        ConversionResult res;

        if (!dest || !srcp || !*srcp) return (size_t)-1;

        sourceEnd = (const UTF8*) ( *srcp + strlen( *srcp) );
        res = ConvertUTF8toUTF16(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
        if (res != conversionOK) return (size_t)-1;
        if (targetStart < targetEnd) *targetStart = 0;
        *srcp=NULL;
        /* Use the write position: dest may not be zero-terminated, so it cannot be scanned for its length. */
        return (size_t)(targetStart - (UTF16* ) dest);
}


#else

GF_EXPORT
size_t gf_utf8_wcslen (const unsigned short *s)
{
        /* Length of s in 16-bit units, excluding the terminating zero unit. */
        size_t n;

        for (n = 0; s[n] != 0; n++) {
                /* nothing to do: the loop header does the scan */
        }
        return n;
}

GF_EXPORT
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
        /*
         * Encodes the zero-terminated 16-bit string *srcp as UTF-8
         * (derived from the GNU UTF-8 Library).
         *
         * dest == NULL: nothing is written; the number of bytes the whole
         * string needs (terminator excluded) is returned and *srcp is set
         * to NULL.
         *
         * dest != NULL: at most len bytes are written. Each 16-bit unit is
         * encoded on its own as 1, 2 or 3 bytes. When the terminator is
         * reached and still fits, it is stored and *srcp is set to NULL;
         * otherwise *srcp is left pointing at the first unit that was not
         * converted. The return value is the number of bytes written, not
         * counting the terminator.
         */
        const unsigned short *in = *srcp;
        char *out = dest;

        if (dest == NULL) {
                /* Counting mode: ignore dest and len. */
                size_t needed = 0;

                while (*in != 0) {
                        if (*in < 0x80) {
                                needed += 1;
                        } else if (*in < 0x800) {
                                needed += 2;
                        } else {
                                needed += 3;
                        }
                        in++;
                }
                *srcp = NULL;
                return needed;
        }

        while (1) {
                unsigned short unit = *in;
                unsigned char lead;
                size_t trailing;

                if (unit == 0) {
                        /* End of input: terminate the output only if a byte is left. */
                        if (len == 0) {
                                *srcp = in;
                        } else {
                                *out = '\0';
                                *srcp = NULL;
                        }
                        break;
                }

                /* Pick the lead byte and the number of continuation bytes. */
                if (unit < 0x80) {
                        trailing = 0;
                        lead = (unsigned char) unit;
                } else if (unit < 0x800) {
                        trailing = 1;
                        lead = (unsigned char) ((unit >> 6) | 0xC0);
                } else {
                        trailing = 2;
                        lead = (unsigned char) ((unit >> 12) | 0xE0);
                }

                /* Stop before a character that would not fit entirely. */
                if (len <= trailing) {
                        *srcp = in;
                        break;
                }
                len -= trailing + 1;

                *out++ = lead;
                /* Continuation bytes carry 6 payload bits each, most significant group first. */
                while (trailing > 0) {
                        trailing--;
                        *out++ = (unsigned char)(((unit >> (6 * trailing)) & 0x3F) | 0x80);
                }
                in++;
        }
        return out - dest;
}


/*
 * Decoder state for the gf_utf8_mbstowcs implementation in this "#else"
 * branch: it describes a multi-byte sequence that has been started but not
 * finished. Both fields are 16-bit bit-fields packed into one u32.
 */
typedef struct
{
        u32 count : 16;   /* number of bytes remaining to be processed */
        u32 value : 16;   /* if count > 0: partial wide character */
        /*
           If WCHAR_T_BITS == 16, need 2 bits for count,
           12 bits for value (10 for mbstowcs direction, 12 for wcstombs direction).
        */
} gf_utf8_mbstate_t;

/*
 * Single file-scope state object used by every call to gf_utf8_mbstowcs.
 * As a static object it starts zeroed, i.e. with no sequence pending.
 */
static gf_utf8_mbstate_t internal;

/*
 * Decodes the NUL-terminated UTF-8 string *srcp into at most len 16-bit
 * units stored in dest.
 *
 * Only 1-, 2- and 3-byte sequences are accepted; a lead byte >= 0xF0 is
 * treated as bad input, so characters outside the BMP are not handled.
 *
 * Returns the number of units stored, not counting the terminating zero.
 * When the terminating NUL is reached, a zero unit is stored and *srcp is
 * set to NULL. When dest fills up first, *srcp is left pointing at the next
 * byte to convert.
 *
 * On malformed input (size_t)-1 is returned and *srcp points at the start
 * of the offending sequence.
 *
 * The function works on the file-scope state object `internal`, which is
 * shared by all callers.
 * NOTE(review): nothing visible here rejects the surrogate range encoded
 * with lead byte 0xED - confirm whether that is intended.
 */
GF_EXPORT
size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
{
        gf_utf8_mbstate_t* ps = &internal;
        const char *src = *srcp;

        unsigned short* destptr = dest;
        for (; len > 0; destptr++, len--) {
                /* start of the current sequence, restored if it turns out to be malformed */
                const char* backup_src = src;
                unsigned char c;
                unsigned short wc;
                size_t count;
                if (ps->count == 0) {
                        /* No sequence pending: examine the lead byte. */
                        c = (unsigned char) *src;
                        if (c < 0x80) {
                                /* ASCII maps to itself; the NUL is stored too, then conversion ends. */
                                *destptr = (wchar_t) c;
                                if (c == 0) {
                                        src = NULL;
                                        break; /* skips destptr++, so the terminator is not counted */
                                }
                                src++;
                                continue;
                        } else if (c < 0xC0) {
                                /* Spurious 10XXXXXX byte is invalid. */
                                goto bad_input;
                        }
                        if (c < 0xE0) {
                                /* 2-byte sequence: 5 payload bits in the lead byte. */
                                wc = (wchar_t)(c & 0x1F) << 6;
                                count = 1;
                                /* 0xC0 and 0xC1 could only start overlong encodings. */
                                if (c < 0xC2) goto bad_input;
                        } else if (c < 0xF0) {
                                /* 3-byte sequence: 4 payload bits in the lead byte. */
                                wc = (wchar_t)(c & 0x0F) << 12;
                                count = 2;
                        }
                        else goto bad_input;
                        src++;
                } else {
                        /*
                         * Resume a sequence recorded in the state object.
                         * NOTE(review): this function only ever resets ps->count
                         * to 0, so this branch looks unreachable unless the state
                         * is modified elsewhere - confirm.
                         */
                        wc = ps->value << 6;
                        count = ps->count;
                }
                /* Consume the continuation bytes, 6 payload bits each. */
                for (;;) {
                        /* XOR with 0x80 maps a valid continuation byte (0x80..0xBF) to 0x00..0x3F. */
                        c = (unsigned char) *src++ ^ 0x80;
                        if (!(c < 0x40)) goto bad_input_backup;
                        wc |= (unsigned short) c << (6 * --count);
                        if (count == 0)
                                break;
                        /* The following test is only necessary once for every character,
                        but it would be too complicated to perform it once only, on
                        the first pass through this loop. */
                        /* Overlong check: reached with count == 1 for a 3-byte sequence, whose value must be >= 0x800. */
                        if ((unsigned short) wc < ((unsigned short) 1 << (5 * count + 6)))
                                goto bad_input_backup;
                }
                *destptr = wc;
                ps->count = 0;
                continue;

bad_input_backup:
                /* Rewind to the lead byte so that *srcp reports where the bad sequence starts. */
                src = backup_src;
                goto bad_input;
        }
        *srcp = src;
        return destptr-dest;

bad_input:
        *srcp = src;
        return (size_t)(-1);
}

#endif

/* [<][>][^][v][top][bottom][index][help] */