root/core/StringObject.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 4 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is [Open Source Virtual Machine.].
 *
 * The Initial Developer of the Original Code is
 * Adobe System Incorporated.
 * Portions created by the Initial Developer are Copyright (C) 2008
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Michael Daumling
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#ifndef __avmplus_NewString__
#define __avmplus_NewString__

namespace avmplus 
{
        class ByteArray;

        /// The utf8_t data type expressively means UTF-8 data.
        typedef uint8_t utf8_t;

        /**
        A String can have many faces, dependent on the way the string data is stored. 
        The most common is the kDirect type, meaning that the string data follows the
        instance data. The kStatic type has the string instance point to a static buffer.
        This buffer must exist as long as the string exists; character constants are
        ideal candidates for this type. The kDependent type is a string that points
        into another string; this type is created in a substring or concatenation operation. 
        Strings cannot be deleted directly, because they may be referenced by dependent strings.
        <p>
        String concatenation attempts first to use additional memory that the memory 
        allocator has left behind when a kDirect string was allocated. If the right-hand
        string fits into that buffer, the data is appended, and a dependent string is returned,
        pointing to the new, larger buffer(the original string keeps its initial length
        and thus does not know about the additional data). If the data does not fit, a new
        kDirect string is allocated with some extra bytes at the end, assuming that there may
        be more characters to append. The minimum of extra characters is 32, then it is twice 
        the new length, up to a platform-dependent maximum(usually 64K). This value can be 
        tweaked for platforms with memory constraints in favor of more copying operations.
        <p>
        Strings exist in 8-bit and 16-bit flavors (see Width). The 8-bit flavor only holds the first
        256 Unicode characters. All widths ignore Unicode surrogate pairs, treating them
        as ordinary characters. Use the createUTFxx() methods to deal with surrogate pairs and 
        UTF-8 encoding. If the kAuto type is used during creation, a quick scan is made to
        see if a string would fit into a narrower width than the buffer suggests, i.e. if
        a 16-bit buffer only contains 8-bit characters.
        <p>
        Strings cannot be deleted, since the create() methods may choose to return standard
        string constants, or interned strings, or other strings that other code depends on.
        */
        class String : public AvmPlusScriptableObject
        {
        public:

                /// String type constants. Note that isDependent() and isStatic() rely on using these values as bitflags.
                /// getType() extracts the value from m_bitsAndFlags using TSTR_TYPE_MASK and TSTR_TYPE_SHIFT.
                enum Type
                {
                        kDynamic                        = 0,    // buffer is on the heap (neither flag bit set)
                        kStatic                         = 1,    // buffer is static (the bit tested by isStatic())
                        kDependent                      = 2             // string points into master string (the bit tested by isDependent())
                };
                /// String width constants. The values of k8 and k16 are usable as shift counts
                /// that turn a character count into a byte count for that width.
                enum Width
                {
                        kAuto   = -1,   // only used in APIs; asks for a string as narrow as possible
                        k8              = 0,    // chosen such that i<<k8 == i*sizeof(uint8_t)
                        k16             = 1             // chosen such that i<<k16 == i*sizeof(uint16_t)
                };
                /**
                Use this constant to define the default width for this system. If you use anything
                else but kAuto, this would create strings of that width. This is not recommended.
                */
                static  const Width kDefaultWidth = kAuto;

                /**
                Create a string using Latin-1 data. Characters are just widened and copied.
                To create an UTF-8 string, use createUTF8().
                @param  core                            the AvmCore instance to use
                @param  buffer                          the character buffer; if NULL, assume an empty string.
                @param  len                                     the size in characters. If < 0, assume NULL termination and calculate.
                @param  desiredWidth            the desired width; use kAuto to get a string as narrow as possible
                @param  staticBuf                       if true, the buffer is static, and may be used by the string
                @return                                         the String instance
                */
                static  Stringp                         createLatin1(AvmCore* core, const char* buffer, int32_t len = -1, Width desiredWidth = kDefaultWidth, bool staticBuf = false);

                /**
                Create a string using UTF-8 data. To preserve backwards compatibility, an additional
                "strict" flag can be set to false. This allows a bug in the UTF-8 conversion routine
                to prevail, where invalid UTF-8 sequences are copied as single characters.
                @param  core                                    the AvmCore instance to use
                @param  buffer                          the UTF-8 buffer; if NULL, assume an empty string.
                @param  len                                     the size in bytes. If < 0, assume NULL termination and calculate.
                @param  desiredWidth            the desired width; use kAuto to get a string as narrow as possible
                @param  staticBuf                       if true, the buffer is static, and may be used by the string
                @param  strict                          if true, return NULL on invalid characters; if false, copy invalid sequences as single characters (see above)
                @return                                         the String instance, or NULL on bad characters
                */
                static  Stringp                         createUTF8(AvmCore* core, const utf8_t* buffer, int32_t len = -1, 
                                                                                                String::Width desiredWidth = String::kDefaultWidth, 
                                                                                                bool staticBuf = false, bool strict = true);
                /**
                Create a string using UTF-16 data. If the desired width is too small to fit the source data, 
                return NULL.
                @param  core                                    the AvmCore instance to use
                @param  buffer                          the UTF-16 buffer; if NULL, assume an empty string.
                @param  len                                     the size in characters. If < 0, assume NULL termination and calculate.
                @param  desiredWidth            the desired width; use kAuto to get a string as narrow as possible
                @param  staticBuf                       if true, the buffer is static, and may be used by the string
                @return                                         the String instance, or NULL on  characters too wide
                */
                static  Stringp                         createUTF16(AvmCore* core, const wchar* buffer, int32_t len = -1, 
                                                                                                 String::Width desiredWidth = String::kDefaultWidth, bool staticBuf = false);

                virtual                                         ~String();

                /**
                Create a string with a given width out of this string. If the width is equal to the current
                width, return this instance. If the desired width is too narrow to fit, or kAuto is passed
                in, return NULL.
                @param  w                                       the width of the new string(kAuto is not supported)
                @return                                         the String instance, or NULL on kAuto, or string too wide
                */
                                Stringp FASTCALL        getFixedWidthString(Width w) const;
                /**
                Build the Atom that refers to this String: the address of this
                object with the string type bits or'ed in.
                @return                                         the Atom equivalent of this String
                */
                REALLY_INLINE   Atom            atom() const
                {
                        const uintptr_t taggedBits = AtomConstants::kStringType | uintptr_t(this);
                        return Atom(taggedBits);
                }
                /**
                Virtual counterpart of atom(); yields exactly what atom() returns.
                */
                virtual Atom                            toAtom() const
                {
                        return atom();
                }
                /**
                If this string is a static or dependent string, make it dynamic so the static 
                data can be released. The string is only
                made dynamic if the static string pointer falls within the given data buffer.
                This prevents unnecessary dynamization of static strings if the string data
                belongs to a different data buffer.
                */
                                void                            makeDynamic(const uint8_t* dataStart, uint32_t dataSize);
                /**
                Check the master of this string if this is a dependent string. If there is
                any indication that using this string would hold a lock on a big master
                string, create a dynamic string and release the master string. The AvmCore 
                uses this method to ensure that interned strings do not keep a lock on a huge
                master string if the master string is dynamic, trying to limit the waste of
                memory.
                */
                                void                            fixDependentString();
                /**
                Produce a hash code of this string.
                */
                                int32_t                         hashCode() const;
                /**
                Use the same algorithm to produce a hash code for Latin1 data.
                */
                static  int32_t FASTCALL        hashCodeLatin1(const char* buf, int32_t len);
                /**
                Use the same algorithm to produce a hash code for UTF-16 data.
                */
                static  int32_t FASTCALL        hashCodeUTF16(const wchar* buf, int32_t len);
                /// Number of characters in this string (the stored m_length).
                REALLY_INLINE   int32_t         length() const
                {
                        return m_length;
                }
                // Length accessor for the AS3 glue code; reports m_length as a plain int.
                                int                                     get_length() const
                                {
                                        return m_length;
                                }
                /// True when the string holds no characters at all.
                REALLY_INLINE   bool            isEmpty() const
                {
                        const bool hasCharacters = (m_length != 0);
                        return !hasCharacters;
                }
                /// Extract the Width constant stored in the width bits of m_bitsAndFlags.
                REALLY_INLINE   Width           getWidth() const
                {
                        return static_cast<Width>(m_bitsAndFlags & TSTR_WIDTH_MASK);
                }
                /// Extract the string type: mask the type bits out of m_bitsAndFlags, then shift them down.
                REALLY_INLINE   int32_t         getType() const
                {
                        return (m_bitsAndFlags & TSTR_TYPE_MASK) >> TSTR_TYPE_SHIFT;
                }
                /// True when the kDependent bit is set in the type field of m_bitsAndFlags.
                REALLY_INLINE   bool            isDependent() const
                {
                        if (m_bitsAndFlags & (kDependent << TSTR_TYPE_SHIFT))
                                return true;
                        return false;
                }
                /// True when the kStatic bit is set in the type field of m_bitsAndFlags.
                REALLY_INLINE   bool            isStatic() const
                {
                        if (m_bitsAndFlags & (kStatic << TSTR_TYPE_SHIFT))
                                return true;
                        return false;
                }
                /// True when TSTR_INTERNED_FLAG is set on this string.
                REALLY_INLINE   bool            isInterned() const
                {
                        if (m_bitsAndFlags & TSTR_INTERNED_FLAG)
                                return true;
                        return false;
                }
                /// Set TSTR_INTERNED_FLAG on this string; the other bits of m_bitsAndFlags are left untouched.
                REALLY_INLINE   void            setInterned()
                {
                        m_bitsAndFlags |= TSTR_INTERNED_FLAG;
                }
                /**
                Return the character at the given position. No index checks!
                @param  index                           the index
                @return                                         the character at the index
                */
                                wchar FASTCALL          charAt(int32_t index) const;

                /**
                Compare this String with another string. If other_length is > 0, compare
                the other string up to the given length.
                @param  other                           the string to compare with
                @param  other_start                     the starting position (in other)
                @param  other_length                    the length to compare (if > 0) (in other)
                @return = 0 if the strings are identical,
                        < 0 if this string is less than other,
                        > 0 if this string is greater than other
                 */
                                int32_t FASTCALL        Compare(String& other, int32_t other_start = 0, int32_t other_length = 0) const;

                /**
                Compare this string with another string.
                */
                                bool FASTCALL           equals(Stringp that) const;
                /**
                Compare this string with a Latin1 string.
                */
                                bool    FASTCALL        equalsLatin1(const char* p, int32_t len = -1) const;
                /**
                Compare this string with a UTF-16 string.
                */
                                bool    FASTCALL        equalsUTF16(const wchar* p, int32_t len) const;
                /**
                Localized compare - maps to compare().
                */
                                int32_t FASTCALL        localeCompare(Stringp other, const Atom* argv, int32_t argc);

                /*@{*/
                /**
                 * Returns the length of str, in # of characters.  If the length does not
                 * fit in an int32_t, these functions do not return.
                 */
                static int32_t FASTCALL Length(const wchar* str);
                static int32_t FASTCALL Length(const char* str);
                /*@}*/

                /**
                Implements String.indexOf().
                */
                                int32_t FASTCALL        indexOf(Stringp s, int32_t offset = 0) const;

                /**
                Boolean shorthand for indexOf(), kept for old code.
                @param  s                                       the string to look for
                @return                                         true if indexOf(s) yields a non-negative position
                */
                REALLY_INLINE   bool            contains(Stringp s) const
                {
                        const int32_t foundAt = indexOf(s);
                        return foundAt >= 0;
                }
                /**
                Convenience method: indexOf() for a Latin-1 string within a given range.
                @param  p                                       the character string to compare; NULL returns -1
                @param  len                                     the number of characters to compare; if < 0, call Length()
                @param  start                           the starting position
                @param  end                                     the ending position
                @return                                         the index of the found position, or -1 if no match
                */
                                int32_t FASTCALL        indexOfLatin1(const char* p, int32_t len = -1, int32_t start = 0, int32_t end = 0x7FFFFFFF) const;

                /**
                Convenience method: indexOf() for a string of length 1.
                @param  c                                       the character code to compare
                @param  start                           the starting position
                @param  end                                     the ending position
                @return                                         the index of the found position, or -1 if no match
                */
                                int32_t FASTCALL        indexOfCharCode(wchar c, int32_t start = 0, int32_t end = 0x7FFFFFFF) const;
                /**
                Boolean shorthand for indexOfLatin1(), kept for old code.
                @param  p                                       the Latin-1 character string to look for
                @return                                         true if indexOfLatin1(p) yields a non-negative position
                */
                REALLY_INLINE   bool            containsLatin1(const char* p) const
                {
                        const int32_t foundAt = indexOfLatin1(p);
                        return foundAt >= 0;
                }

                /**
                Convenience method: Does a Latin-1 string match at the current position?
                @param  p                                       the character string to compare; NULL returns false
                @param  len                                     the number of characters to compare; if < 0, call Length()
                @param  pos                                     the position to match
                @return                                         true if the string matches
                */
                                bool     FASTCALL       matchesLatin1(const char* p, int32_t len, int32_t pos);

                /**
                Convenience method: Does a Latin-1 string match at the current position, ignoring case?
                @param  p                                       the character string to compare; NULL returns false
                @param  len                                     the number of characters to compare; if < 0, call Length()
                @param  pos                                     the position to match
                @return                                         true if the string matches
                */
                                bool     FASTCALL       matchesLatin1_caseless(const char* p, int32_t len, int32_t pos);
                /**
                Implements String.lastIndexOf().
                */
                                int32_t FASTCALL        lastIndexOf(Stringp s, int32_t offset = 0x7fffffff) const;
                /**
                Concatenate two strings, and return the result. If the right string fits into the buffer
                end of the left string, append the data and return a new dependent string pointing
                to that buffer. If it does not fit, create a kDirect string containing the entire
                buffer, with extra padding at the end to support in-place concatenation.
                @param  leftStr                         the left string; may be NULL
                @param  rightStr                        the right string; may be NULL, although not meaningful
                @return                                         the concatenated string
                */
                static  Stringp FASTCALL        concatStrings(Stringp leftStr, Stringp rightStr);
                /**
                Append a String instance.
                @param  str                                     the string to append
                @return                                         the concatenated string
                */
                                Stringp FASTCALL        append(Stringp str);

                /*
                Append an 8-bit-wide string. For Unicode, strings should be Latin1, not UTF8.
                The one-argument form takes the length from Length(p); both forms hand the
                data to _append() with width k8.
                */
                REALLY_INLINE   Stringp         appendLatin1(const char* p)
                {
                        return appendLatin1(p, Length(p));
                }
                REALLY_INLINE   Stringp         appendLatin1(const char* p, int32_t len)
                {
                        const uint8_t* const latin1Data = reinterpret_cast<const uint8_t*>(p);
                        return _append(NULL, Pointers(latin1Data), len, k8);
                }
                /*
                Append a 16-bit-wide string. For Unicode, strings should be UTF16, but this method
                does not enforce that: several callers expect to be able to create "illegal" UTF16
                sequences through it, for backwards compatibility. That makes this a dangerous call
                to be used with caution (and is the reason it is not named "appendUTF16").
                The one-argument form takes the length from Length(p); both forms hand the
                data to _append() with width k16.
                */
                REALLY_INLINE   Stringp         append16(const wchar* p)
                {
                        return append16(p, Length(p));
                }
                REALLY_INLINE   Stringp         append16(const wchar* p, int32_t len)
                {
                        return _append(NULL, Pointers(p), len, k16);
                }
                /**
                Implement String.substr(). The resulting String object points into the original string, 
                and holds a reference to the original string.
                */
                                Stringp FASTCALL        substr(int32_t start, int32_t len = 0x7fffffff);
                /**
                Implement String.substring(). The resulting String object points into the original string, 
                and holds a reference to the original string.
                */
                                Stringp FASTCALL        substring(int32_t start, int32_t end = 0x7fffffff);
                                Stringp FASTCALL        intern_substring(int32_t start, int32_t end = 0x7fffffff);

                /**
                Implement String.slice(). The resulting String object points into the original string, 
                and holds a reference to the original string.
                */
                                Stringp FASTCALL        slice(int32_t start, int32_t end);
                /**
                This routine is a very specific parser to generate a positive integer from a string.
                The following are supported:
                "0" - one single digit for zero - NOT "00" or any other form of zero
                [1-9]+[0-9]* up to 2^32-2(4294967294)
                2^32-1(4294967295) is not supported(see ECMA quote below).
                The ECMA that we're supporting with this routine is...
                cn:  the ES3 test for a valid array index is 
                 "A property name P(in the form of a string value) is an array index if and 
                 only if ToString(ToUint32(P)) is equal to P and ToUint32(P) is not equal to 2^32-1."
                Don't support 000000 as 0.
                We don't support 0x1234 as 1234 in hex since string(1234) doesn't equal '0x1234')
                No leading zeros are supported
                */
                                bool    FASTCALL        parseIndex(uint32_t& result) const;
                /**
                Returns a new string object which is a copy of this string object, with all 
                characters in the string converted to uppercase. 
                Unicode character classes for uppercase and lowercase are used. The conversion 
                behavior is compliant with the String.toUpperCase method. The method returns
                this instance if no changes were detected.
                @return                                         the resulting string or NULL
                */
                                Stringp FASTCALL        toUpperCase();
                /**
                Returns a new string object which is a copy of this string object, with all 
                characters in the string converted to lowercase. 
                Unicode character classes for uppercase and lowercase are used. The conversion 
                behavior is compliant with the String.toLowerCase method. The method returns
                this instance if no changes were detected.
                @return                                         the resulting string or NULL
                */
                                Stringp FASTCALL        toLowerCase();
                /**
                Change the case of a string according to the case mapper supplied.
                If no changes were detected, return this instance, otherwise, return
                a new instance.
                @param  unimapper                       the mapping function to call
                @return                                         the changed string
                */
                                Stringp FASTCALL        caseChange(uint32_t(*unimapper)(uint32_t));
                /**
                Returns a kIntegerAtom Atom if the string holds an integer that fits into
                such an atom. For use in our ScriptObject HashTable implementation.  If we 
                have a valid integer equivalent, it will never be zero since kIntptrType tag != 0.
                */
                                Atom    FASTCALL        getIntAtom() const;
                /**
                This conversion handles hex, octal, base 10 integer, float, and "Infinity"/"-Infinity".
                */
                                double                          toNumber();
                /**
                Check if this character is a valid space character.
                */
                static  bool                            isSpace(wchar ch);
                /**
                Is this string all whitespace?
                */
                                bool                            isWhitespace() const;

                /// Native functions, used by StringClass.cpp
                                int                                     _indexOf(Stringp s, int i=0);
                                int                                     AS3_indexOf(Stringp s, double i=0);

                                int                                     _lastIndexOf(Stringp s, int i=0x7fffffff);
                                int                                     AS3_lastIndexOf(Stringp s, double i=0x7fffffff);

                                Stringp                         _charAt(int i=0); 
                                Stringp                         AS3_charAt(double i=0); 

                                double                          _charCodeAt(int i); // returns NaN for out-of-bounds
                                double                          AS3_charCodeAt(double i); // returns NaN for out-of-bounds

                                int                                     AS3_localeCompare(Stringp other);

                                Stringp                         _substring(int i_start, int i_count);
                                Stringp                         AS3_substring(double d_start, double d_count);

                                Stringp                         _slice(int dStart, int dEnd);
                                Stringp                         AS3_slice(double dStart, double dEnd);

                                Stringp                         _substr(int dStart, int dEnd);
                                Stringp                         AS3_substr(double dStart, double dEnd);

                                Stringp                         AS3_toUpperCase();
                                Stringp                         AS3_toLowerCase();

                // Useful utilities used by the core code.
                static  wchar                           wCharToUpper(wchar ch)
                {
                        // Map through the 32-bit routine, then narrow the result back to wchar.
                        const uint32_t upperCode = unicharToUpper(ch);
                        return static_cast<wchar>(upperCode);
                }
                static  wchar                           wCharToLower(wchar ch)
                {
                        // Map through the 32-bit routine, then narrow the result back to wchar.
                        const uint32_t lowerCode = unicharToLower(ch);
                        return static_cast<wchar>(lowerCode);
                }
                static  uint32_t                        unicharToUpper(uint32_t ch);
                static  uint32_t                        unicharToLower(uint32_t ch);
#ifdef DEBUGGER
                // Only declared when DEBUGGER is defined.
                // NOTE(review): presumably reports the memory footprint of this string to the
                // debugger/profiler -- confirm against the definition.
                virtual uint64                          size() const;
#endif

        private:
                // Helper classes that need access to String's private members:
                // - StringIndexer holds a String::Pointers member, and Pointers is a private nested type.
                // - StUTF8String sets TSTR_7BIT_FLAG lazily (see the comment on that flag).
                // - StUTF16String: NOTE(review): presumably reads the character buffer directly -- confirm.
                friend class StringIndexer;
                friend class StUTF8String;
                friend class StUTF16String;

        private:
                /**
                        Storage for the location of the string's character data. The anonymous union
                        holds either a pointer (viewable as untyped, 8-bit or 16-bit) or an offset
                        value. Which of these is valid depends on the kind of String (static, dynamic,
                        dependent), so the kind must be known before the field is interpreted.

                        The offset is always counted in bytes, whatever the width of the string!

                        *** WARNING ***
                        Buffer is only meant to be used inside of String itself and should never be
                        allocated on the stack. To obtain a pointer to the start of a string's
                        characters, the simplest and safest way is the Pointers struct declared
                        further down in this class.
                */
                struct Buffer
                {
                        union
                        {
                                void*                   pv;
                                uint8_t*                p8;
                                wchar*                  p16;
                                uintptr_t               offset_bytes;
                        };

                        // Initialize with a pointer; constness is cast away for storage.
                        REALLY_INLINE explicit Buffer(const void* _pv)
                                : pv(const_cast<void*>(_pv))
                        {
                        }

                        // Initialize with an offset, given in bytes.
                        REALLY_INLINE explicit Buffer(uintptr_t _offset_bytes)
                                : offset_bytes(_offset_bytes)
                        {
                        }
                };

                /**
                        Additional per-string storage. Dependent strings keep their master
                        pointer here; all other strings use the same space for a lazily
                        calculated index value.
                */
                struct Extra
                {
                        union
                        {
                                Stringp                 master; // valid for dependent strings only
                mutable uint32_t                index;  // otherwise: the index value for getIntAtom/parseIndex
                        };

                        // Only construction from a master pointer is provided.
                        REALLY_INLINE explicit Extra(Stringp _master)
                                : master(_master)
                        {
                        }
                };
                
                                Buffer                  m_buffer;       // buffer pointer (dynamic, static, or offset into master)
                                Extra                   m_extra;        // master pointer (dependent strings) or cached index value (other strings)
                                int32_t                 m_length;                                       // length in characters
                mutable uint32_t                m_bitsAndFlags;                         // various bits and flags, see below (must be unsigned)
                                // Layout of m_bitsAndFlags: bit 0 = width, bits 1-2 = type, bits 3-8 = flags,
                                // bits 9-31 = number of characters left in the buffer.
                                enum {
                                        TSTR_WIDTH_MASK                 = 0x00000001,   // string width (right-aligned for fast access)
                                        TSTR_TYPE_MASK                  = 0x00000006,   // type index, 2 bits
                                        TSTR_TYPE_SHIFT                 = 1,                    // shift that right-aligns the type index
                                        // If TSTR_7BIT_FLAG is set, the string has width of k8, with no characters having the high bit set.
                                        // Thus the string is both 7-bit ascii and "utf8-compatible" as-is; knowing this can produce
                                        // huge speedups in code that uses utf8 conversion heavily (in conjunction with "ascii" strings, of course).
                                        // Note that this bit is set lazily (and currently, only by StUTF8String), thus, if this bit is clear,
                                        // the string might still be 7-bit-ascii... we just haven't checked yet.
                                        TSTR_7BIT_FLAG                  = 0x00000008,   
                                        TSTR_7BIT_SHIFT                 = 3,                    // bit position of TSTR_7BIT_FLAG
                                        TSTR_INTERNED_FLAG              = 0x00000010,   // this string is interned
                                        TSTR_NOINT_FLAG                 = 0x00000020,   // set in getIntAtom() if the string is not a 28-bit integer
                                        TSTR_NOUINT_FLAG                = 0x00000040,   // set in parseIndex() if the string is not an unsigned integer
                                        TSTR_UINT28_FLAG                = 0x00000080,   // set if m_extra.index contains value for getIntAtom()
                                        TSTR_UINT32_FLAG                = 0x00000100,   // set if m_extra.index contains value for parseIndex()
                                        TSTR_CHARSLEFT_MASK             = 0xFFFFFE00,   // characters left in buffer field (for inplace concat); 23 bits wide
                                        TSTR_CHARSLEFT_SHIFT    = 9                             // shift that right-aligns the characters-left field
                                };

                /**
                        A TEMPORARY helper, always allocated on the stack, that yields the current start
                        pointer of a string's characters. It resembles the "Buffer" struct on the surface,
                        but here the pointer is always a real pointer, whereas Buffer may actually hold
                        an offset into a master string.
                */
                struct Pointers
                {
                        union
                        {
                                void*                   pv;
                                uint8_t*                p8;
                                wchar*                  p16;
                        };

                        // Extract the pointer from a String (only declared here).
                        REALLY_INLINE explicit Pointers(const String* const self);

                        // Wrap an existing 8-bit character pointer; constness is cast away.
                        REALLY_INLINE explicit Pointers(const uint8_t* _p8)
                                : p8(const_cast<uint8_t*>(_p8))
                        {
                        }

                        // Wrap an existing 16-bit character pointer; constness is cast away.
                        REALLY_INLINE explicit Pointers(const uint16_t* _p16)
                                : p16(const_cast<uint16_t*>(_p16))
                        {
                        }
                };
                
                // Store a new type index in the TSTR_TYPE_MASK bits of m_bitsAndFlags,
                // leaving every other bit as it was. The index is not range-checked here.
                REALLY_INLINE   void            setType(char index)
                {
                        const uint32_t otherBits = m_bitsAndFlags & ~TSTR_TYPE_MASK;
                        m_bitsAndFlags = otherBits | (index << TSTR_TYPE_SHIFT);
                }
                // Read the "characters left in buffer" field from the upper bits of m_bitsAndFlags.
                REALLY_INLINE   int32_t         getCharsLeft() const
                {
                        const uint32_t field = m_bitsAndFlags & TSTR_CHARSLEFT_MASK;
                        return field >> TSTR_CHARSLEFT_SHIFT;
                }
                // Store n in the "characters left in buffer" field of m_bitsAndFlags, leaving the
                // lower bits as they were. n is not range-checked here; the field is 23 bits wide.
                REALLY_INLINE   void            setCharsLeft(int32_t n)
                {
                        const uint32_t lowBits = m_bitsAndFlags & ~TSTR_CHARSLEFT_MASK;
                        m_bitsAndFlags = lowBits | (n << TSTR_CHARSLEFT_SHIFT);
                }

                // Create a dependent string: it has no buffer of its own and refers to its master instead.
                // NOTE(review): start/len presumably select a range of characters in master -- confirm units
                // (the stored buffer offset is documented as being in bytes).
                static  Stringp                         createDependent(MMgc::GC* gc, Stringp master, int32_t start, int32_t len);
                // Create a string with a dynamic buffer.
                // NOTE(review): "extra" (default 0) is presumably spare capacity for in-place
                // concatenation (cf. TSTR_CHARSLEFT_MASK) -- confirm against the implementation.
                static  Stringp                         createDynamic(MMgc::GC* gc, const void* data, int32_t len, Width w, bool is7bit, int32_t extra=0);
                // Create a string with a static buffer.
                static  Stringp                         createStatic(MMgc::GC* gc, const void* data, int32_t len, Width w, bool is7bit);

                // Convert the string data to a dynamic buffer.
                                void                            convertToDynamic();

                /**
                Low-level append worker. 
                Takes the right-hand data both as a pointer to a (volatile) Stringp and as an
                already extracted Pointers value, together with a character count and a width.
                NOTE(review): presumably rightStrPtr may be null when raw character data is
                appended, and the result may be either this string or a new one -- confirm
                against the implementation.
                */
                                Stringp                         _append(Stringp volatile * rightStrPtr, const Pointers& rightStr, int32_t numChars, Width width);

                // VERIFY_7BIT(s) currently expands to an empty statement in every configuration.
                // In _DEBUG builds verify7bit() is declared, but the macro variant that would
                // call it is commented out.
                #ifdef _DEBUG
                        void verify7bit() const;
                        //#define VERIFY_7BIT(s) do { if (s) (s)->verify7bit(); } while (0)
                        // extremely slow, so disabled by default, even in debug mode.
                        #define VERIFY_7BIT(s) do {  } while (0)
                #else
                        #define VERIFY_7BIT(s) do { } while (0)
                #endif

                /**
                Allocation is kept private: Strings are meant to be made through the
                create functions. The request is passed on to AvmPlusScriptableObject.
                */
                REALLY_INLINE   void*           operator new(size_t size, MMgc::GC *gc)
                {
                        void* const storage = AvmPlusScriptableObject::operator new(size, gc);
                        return storage;
                }

                // Deliberately does nothing: Strings cannot be deleted.
                REALLY_INLINE   void            operator delete(void*)
                {
                }

                // The constructors are private; strings are made via createStatic/createDynamic/createDependent.

                // ctor for a static string.
                REALLY_INLINE                           String(const void* buffer, Width w, int32_t length, bool is7bit);
                // ctor for a dynamic string.
                // charsLeft: NOTE(review): presumably the initial "characters left in buffer" value
                // (see getCharsLeft/setCharsLeft) -- confirm against the definition.
                REALLY_INLINE                           String(MMgc::GC* gc, void* buffer, Width w, int32_t length, int32_t charsLeft, bool is7bit);
                // ctor for a dependent string.
                REALLY_INLINE                           String(MMgc::GC* gc, Stringp master, int32_t start, int32_t length);
        };

        // Comparison helpers for String references.

        // Equality: forwards to String::equals.
        REALLY_INLINE bool operator==(String& s1, String& s2)
        {
                String* const rhs = &s2;
                return s1.equals(rhs);
        }
        // Inequality: the negation of String::equals.
        REALLY_INLINE bool operator!=(String& s1, String& s2)
        {
                if (s1.equals(&s2))
                        return false;
                return true;
        }
        // True when s2.Compare(s1) is negative.
        // NOTE(review): the operands are handed to Compare in swapped order; this relies on
        // Compare's sign convention, which is declared elsewhere -- confirm.
        REALLY_INLINE bool operator<(String& s1, String& s2)
        {
                return 0 > s2.Compare(s1);
        }
        // True when s2.Compare(s1) is positive.
        // NOTE(review): the operands are handed to Compare in swapped order; this relies on
        // Compare's sign convention, which is declared elsewhere -- confirm.
        REALLY_INLINE bool operator>(String& s1, String& s2)
        {
                return 0 < s2.Compare(s1);
        }
        // True when s2.Compare(s1) is zero or negative.
        // NOTE(review): the operands are handed to Compare in swapped order; this relies on
        // Compare's sign convention, which is declared elsewhere -- confirm.
        REALLY_INLINE bool operator<=(String& s1, String& s2)
        {
                return 0 >= s2.Compare(s1);
        }
        // True when s2.Compare(s1) is zero or positive.
        // NOTE(review): the operands are handed to Compare in swapped order; this relies on
        // Compare's sign convention, which is declared elsewhere -- confirm.
        REALLY_INLINE bool operator>=(String& s1, String& s2)
        {
                return 0 <= s2.Compare(s1);
        }

        /**
        Test whether a character is one of the four characters this helper treats as
        whitespace: space (32), tab (9), LF (10) or CR (13).

        The test is branch-free: the character code is moved down by one so that the
        codes 1..32 map onto bits 0..31 of a 32-bit mask, and the range check and the
        mask lookup are combined with a bitwise AND.

        @param ch  the character to test
        @return true if ch is space, tab, LF or CR; false for everything else
        */
        REALLY_INLINE /*static*/ bool String::isSpace(wchar ch)
        {
                // Bit (c-1) is set for every recognized character code c.
                const uint32_t IS_SPACE_MASK =
                        (1U << (32-1)) |        // space
                        (1U << (9-1)) |         // tab
                        (1U << (10-1)) |        // LF
                        (1U << (13-1));         // CR

                // Codes 1..32 become 0..31. Code 0 becomes a value >= 32 (it wraps around,
                // assuming wchar is an unsigned 16-bit type) and is rejected by the range check.
                ch -= 1;

                // Both operands of the bitwise & are always evaluated, so the shift also runs
                // for characters outside the range. Shifting a 32-bit value by 32 or more is
                // undefined behavior, hence the count is masked to 0..31. For in-range
                // characters the mask changes nothing; for all others the range check already
                // forces the result to false.
                return (ch < 32) &              // bitwise and, *not* logical and -- avoids a branch
                                ((IS_SPACE_MASK & (1U << (ch & 31))) != 0);
        }

        /**
        StringIndexer gives quick access to the individual characters of a string by index.
        Put an instance on the stack whenever several characters have to be read by index:
        unlike charAt() without a Pointers argument, which calls getData() internally on
        every access, this class does not need to call getData() for each index access.
        */

        class StringIndexer
        {
        public:
                /// Construct an indexer for the given string.
                                explicit                        StringIndexer(Stringp s);

                /// Access the embedded string.
                REALLY_INLINE   String*         operator->() const { return m_str; }

                /// Quick index operator. The index is asserted to lie within [0, length).
                REALLY_INLINE   wchar           operator[](int index) const
                {
                        AvmAssert(index >= 0 && index < m_str->length());
                        // m_latin1 selects between the 8-bit and the 16-bit view of the data.
                        if (m_latin1)
                                return m_ptrs.p8[index];
                        return m_ptrs.p16[index];
                }

        private:
                // Heap allocation is disabled: these are declared but not implemented.
                                void*           operator new(size_t);
                                void            operator delete(void*);

                                Stringp const volatile  m_str;          // the embedded string
                                String::Pointers const  m_ptrs;         // character pointer used by operator[]
                                int const                               m_latin1;       // really a bool; kept int-sized for speed
        };

        /**
        StUTF8String is simply a data buffer that contains 0-terminated UTF-8 data.
        Instances can only be created on the stack, which preserves the data buffer during GC.
        length() reports the length without the 0-terminator.
        Be aware that the data may contain interior NULL characters (if the original String
        did), so String::Length, strlen, etc might return misleading values.
        */

        class StUTF8String
        {
        public:
                                explicit                        StUTF8String(Stringp str);
                                                                        ~StUTF8String();

                /// Length of the UTF-8 data, not counting the 0-terminator.
                REALLY_INLINE   int32_t         length() const { return m_length; }
                /// Start of the 0-terminated UTF-8 data.
                REALLY_INLINE   const char*     c_str() const { return m_buffer; }

        private:
                                const char*                     m_buffer;
                                int32_t                         m_length;

                // Heap allocation is disabled: these are declared but not implemented.
                void*                           operator new(size_t);
                void                            operator delete(void*);
        };

        /**
        StUTF16String is simply a data buffer that contains 0-terminated UTF-16 data.
        Instances can only be created on the stack, which preserves the data buffer during GC.
        length() reports the length without the 0-terminator.
        Be aware that the data may contain interior NULL characters (if the original String
        did), so String::Length, strlen, etc might return misleading values.
        For a 32-bit string, characters > 0xFFFF are converted to surrogate pairs.
        */

        class StUTF16String
        {
        public:
                                explicit                StUTF16String(Stringp str);
                                                                ~StUTF16String();

                /// Length of the UTF-16 data, not counting the 0-terminator.
                REALLY_INLINE   int32_t                 length() const { return m_length; }
                /// Start of the 0-terminated UTF-16 data.
                REALLY_INLINE   const wchar*    c_str() const { return m_buffer; }

        private:
                                const wchar*    m_buffer;
                                int32_t                 m_length;

                // Heap allocation is disabled: these are declared but not implemented.
                void*                                   operator new(size_t);
                void                                    operator delete(void*);
        };

        /**
        A StUTF8String that can additionally translate between string indexes and
        indexes into the UTF-8 data.
        */
        class StIndexableUTF8String : public StUTF8String
        {
        public:

                explicit StIndexableUTF8String(Stringp s);

                /**
                Map a string index to the matching UTF-8 index.
                An out-of-range index is returned unchanged.
                */
                int32_t FASTCALL toUtf8Index(int32_t pos);

                /**
                Map a UTF-8 index to the matching string index.
                An index < 0 is returned unchanged.
                */
                int32_t FASTCALL toIndex(int32_t uf8Pos);

        private:
                // NOTE(review): presumably the last translated position pair, cached to speed up
                // later lookups, and a flag saying whether translation is needed -- confirm
                // against the implementation.
                int32_t         m_lastPos;
                int32_t         m_lastUtf8Pos;
                bool            m_indexable;
        };

}

#endif  // __avmplus_NewString__

/* [<][>][^][v][top][bottom][index][help] */