root/Source/wtf/text/TextEncoding.cpp

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. UTF7Encoding
  2. m_backslashAsCurrencySymbol
  3. m_backslashAsCurrencySymbol
  4. decode
  5. encode
  6. normalizeAndEncode
  7. usesVisualOrdering
  8. backslashAsCurrencySymbol
  9. isNonByteBasedEncoding
  10. isUTF7Encoding
  11. closestByteBasedEquivalent
  12. encodingForFormSubmission
  13. ASCIIEncoding
  14. Latin1Encoding
  15. UTF16BigEndianEncoding
  16. UTF16LittleEndianEncoding
  17. UTF32BigEndianEncoding
  18. UTF32LittleEndianEncoding
  19. UTF8Encoding
  20. WindowsLatin1Encoding

/*
 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
 * Copyright (C) 2007-2009 Torch Mobile, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "wtf/text/TextEncoding.h"

#include "wtf/text/TextEncodingRegistry.h"
#include <unicode/unorm.h>
#include "wtf/OwnPtr.h"
#include "wtf/StdLibExtras.h"
#include "wtf/text/CString.h"
#include "wtf/text/WTFString.h"

namespace WTF {

static const TextEncoding& UTF7Encoding()
{
    static TextEncoding globalUTF7Encoding("UTF-7");
    return globalUTF7Encoding;
}

TextEncoding::TextEncoding(const char* name)
    : m_name(atomicCanonicalTextEncodingName(name))
    , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
{
}

TextEncoding::TextEncoding(const String& name)
    : m_name(atomicCanonicalTextEncodingName(name))
    , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
{
}

String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
{
    if (!m_name)
        return String();

    return newTextCodec(*this)->decode(data, length, DataEOF, stopOnError, sawError);
}

CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    CString encodedString;
    if (string.is8Bit())
        encodedString = textCodec->encode(string.characters8(), string.length(), handling);
    else
        encodedString = textCodec->encode(string.characters16(), string.length(), handling);
    return encodedString;
}

CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
{
    if (!m_name)
        return CString();

    if (string.isEmpty())
        return "";

    // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
    // unaffected by NFC. This is effectively the same as saying that all
    // Latin-1 text is already normalized to NFC.
    // Source: http://unicode.org/reports/tr15/
    if (string.is8Bit())
        return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);

    const UChar* source = string.characters16();
    size_t length = string.length();

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.grow(length);
        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        source = normalizedCharacters.data();
        length = normalizedLength;
    }

    return newTextCodec(*this)->encode(source, length, handling);
}

bool TextEncoding::usesVisualOrdering() const
{
    if (noExtendedTextEncodingNameUsed())
        return false;

    static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
    return m_name == a;
}

UChar TextEncoding::backslashAsCurrencySymbol() const
{
    return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\';
}

bool TextEncoding::isNonByteBasedEncoding() const
{
    if (noExtendedTextEncodingNameUsed()) {
        return *this == UTF16LittleEndianEncoding()
            || *this == UTF16BigEndianEncoding();
    }

    return *this == UTF16LittleEndianEncoding()
        || *this == UTF16BigEndianEncoding()
        || *this == UTF32BigEndianEncoding()
        || *this == UTF32LittleEndianEncoding();
}

bool TextEncoding::isUTF7Encoding() const
{
    if (noExtendedTextEncodingNameUsed())
        return false;

    return *this == UTF7Encoding();
}

const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
{
    if (isNonByteBasedEncoding())
        return UTF8Encoding();
    return *this;
}

// HTML5 specifies that UTF-8 be used in form submission when a form is
// is a part of a document in UTF-16 probably because UTF-16 is not a
// byte-based encoding and can contain 0x00. By extension, the same
// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
// but it's fraught with problems and we'd rather steer clear of it.
const TextEncoding& TextEncoding::encodingForFormSubmission() const
{
    if (isNonByteBasedEncoding() || isUTF7Encoding())
        return UTF8Encoding();
    return *this;
}

const TextEncoding& ASCIIEncoding()
{
    static TextEncoding globalASCIIEncoding("ASCII");
    return globalASCIIEncoding;
}

const TextEncoding& Latin1Encoding()
{
    static TextEncoding globalLatin1Encoding("latin1");
    return globalLatin1Encoding;
}

const TextEncoding& UTF16BigEndianEncoding()
{
    static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
    return globalUTF16BigEndianEncoding;
}

const TextEncoding& UTF16LittleEndianEncoding()
{
    static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
    return globalUTF16LittleEndianEncoding;
}

const TextEncoding& UTF32BigEndianEncoding()
{
    static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
    return globalUTF32BigEndianEncoding;
}

const TextEncoding& UTF32LittleEndianEncoding()
{
    static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
    return globalUTF32LittleEndianEncoding;
}

const TextEncoding& UTF8Encoding()
{
    static TextEncoding globalUTF8Encoding("UTF-8");
    ASSERT(globalUTF8Encoding.isValid());
    return globalUTF8Encoding;
}

const TextEncoding& WindowsLatin1Encoding()
{
    static TextEncoding globalWindowsLatin1Encoding("WinLatin1");
    return globalWindowsLatin1Encoding;
}

} // namespace WTF

/* [<][>][^][v][top][bottom][index][help] */