dcl/_charset_encoder_8cpp_source.html

#if 0

#include <errno.h>

#endif

#include <dcl/Config.h>


#include <string.h>     // memcpy, strcpy

#include <wchar.h>      // wcrtomb


#include <dcl/Charset.h>

#include <dcl/Exception.h>


#if __DCL_HAVE_ALLOC_DEBUG

#undef __DCL_ALLOC_LEVEL

#define __DCL_ALLOC_LEVEL   __DCL_ALLOC_INTERNAL

#endif


#if __DCL_HAVE_THIS_FILE__

#undef __THIS_FILE__

static const char_t __THIS_FILE__[] = __T("dcl/CharsetEncoder.cpp");

#endif


__DCL_BEGIN_NAMESPACE


IMPLEMENT_CLASSINFO(CharsetConvertException, Exception)

// Creates a conversion exception that remembers the given error code
// (toString() interprets it as one of the CS_* values).
// The base Exception is constructed with NULL.
CharsetConvertException::CharsetConvertException(int _errorCode)
    : Exception(NULL)
{
    this->__errorCode = _errorCode;
}


// Translates the stored CS_* error code into a human-readable message.
// An unrecognised code trips a debug assertion and yields "Unknown error".
String CharsetConvertException::toString() const
{
    const int code = __errorCode;
    String str;

    if (code == CS_NOERROR) {
        str = L"No Error";
    }
    else if (code == CS_ILLEGAL_UCS) {
        str = L"Illegal UCS value.. can't convert to multi-bytes";
    }
    else if (code == CS_SOURCE_FEW) {
        str = L"Source multi bytes few";
    }
    else if (code == CS_ILLEGAL_SEQUENCE) {
        str = L"Illegal bytes sequence";
    }
    else if (code == CS_ILLEGAL_UCS2) {
        str = L"Can not convert UCS4 to UCS2";
    }
    else {
        // Not one of the codes handled above.
        __DCL_ASSERT(false);
        str = L"Unknown error";
    }

    return str;
}


// Negative status values returned by the toMultiByte() implementations in this file.
#define ILLEGAL_UCS4        -1  // the code point cannot be represented in the target encoding

#define BUFFER_SMALL        -2  // the output buffer is too small to hold the encoded character


IMPLEMENT_CLASSINFO(CharsetEncoder, Object)


// Default constructor. In __DCL_DEBUG builds it verifies the build-time
// assumptions about ucs4_t/wchar_t sizes and byte order; otherwise the
// body is empty.
CharsetEncoder::CharsetEncoder()

{

#ifdef __DCL_DEBUG

    // ucs4_t must be exactly four bytes wide.
    __DCL_ASSERT(sizeof(ucs4_t) == 4);

    // The actual wchar_t size must agree with the __SIZEOF_WCHAR_T__ macro.
#if __SIZEOF_WCHAR_T__ == 4

    __DCL_ASSERT(sizeof(wchar_t) == 4);

#endif

#if __SIZEOF_WCHAR_T__ == 2

    __DCL_ASSERT(sizeof(wchar_t) == 2);

#endif

    // 0x31323334 is the ASCII text "1234" when stored most-significant byte first.
    ucs4_t uc = 0x31323334;

    // The in-memory layout must agree with the byte-order macros.
    // NOTE(review): if neither macro is defined, both sides evaluate to 0 in
    // #if and the big-endian branch is selected - confirm they are always defined.
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

    __DCL_ASSERT(memcmp(&uc, "1234", sizeof(uc)) == 0);

#else

    __DCL_ASSERT(memcmp(&uc, "4321", sizeof(uc)) == 0);

#endif

#endif


}


int CharsetEncoder::encode(     // UCS ==> MB, UTF

    const wchar_t*  _in,        // in: input wide characters

    size_t&         _inCount,   // in: count of input wchars, out: count of processed wchars

    byte_t*           _out,       // out: output buffer

    size_t&         _outCount   // in: buffer size (countof(_out[], byte_t), out: count of converted bytes

)

{

    __DCL_ASSERT(_in != NULL && _out != NULL);


    byte_t* dst = _out;

    byte_t* dstend = dst + _outCount;

    size_t dstlen;;


    const wchar_t* src = _in;

    const wchar_t* srcend = src + _inCount;


    int n = 0;


    while(src < srcend && (dstlen = dstend - dst) > 0) {

        n = toMultiByte(*src, dst, dstlen);

        if (n <= 0)

            break;


        dst += (size_t)n;

        src++;

    }


    _inCount = src - _in;

    _outCount = dst - _out;


    __DCL_ASSERT(n != 0);


    if (n == ILLEGAL_UCS4)

        return CS_ILLEGAL_UCS;

/*

    switch (n)

    {

    case ILLEGAL_UCS4 :

        return CS_ILLEGAL_UCS;

    case BUFFER_SMALL :

        return CS_BUFFER_SMALL;

    }

*/

    return CS_NOERROR;

}


// Computes how many bytes encode() produces for _wcslen wide characters by
// running the conversion through a small scratch buffer and summing the
// produced byte counts.
// Throws CharsetConvertException* when encode() reports an error.
size_t CharsetEncoder::getEncodedLength(const wchar_t* _wcs, size_t _wcslen)
    __DCL_THROWS1(CharsetConvertException*)
{
    byte_t scratch[24];
    const wchar_t* cursor = _wcs;
    size_t remaining = _wcslen;
    size_t total = 0;

    do {
        size_t consumed = remaining;
        size_t produced = __countof(scratch, byte_t);

        const int status = encode(cursor, consumed, scratch, produced);
        total += produced;

        if (status != CS_NOERROR) {
            throw(new CharsetConvertException(status));
        }

        // CS_NOERROR with input left over means the scratch buffer filled up;
        // continue from where this pass stopped.
        remaining -= consumed;
        cursor += consumed;
    } while (remaining != 0);

    return total;
}


// Encodes _wcslen wide characters (or the whole NUL-terminated string when
// _wcslen == (size_t)-1) and returns the result as a ByteString.
// Throws CharsetConvertException* when the encoder reports an error.
ByteString CharsetEncoder::encode(const wchar_t* _wcs, size_t _wcslen)
    __DCL_THROWS1(CharsetConvertException*)
{
    __DCL_ASSERT(_wcs != NULL);

    if (_wcslen == (size_t)-1)
        _wcslen = String::length(_wcs);

    ByteString rstr;
    if (_wcslen) {
        size_t _inCount = _wcslen;

        // Worst case is 6 bytes per character (UTF-8) plus room for a
        // Byte Order Mark; the largest BOM in this file is 4 bytes (UTF-32).
        // Without that headroom a one-character input could be dropped because
        // the BOM used up the space the character needed.
        size_t _outCount = _inCount * 6 + 4;

        ByteBuffer* buf = ByteBuffer::create(_outCount);
        int rn = encode(_wcs, _inCount, (byte_t*)buf->data(), _outCount);
        __DCL_ASSERT(buf->__allocLength >= _outCount);

        if (rn != CS_NOERROR) {
            buf->release();
            throw new CharsetConvertException(rn);
        }

        // The buffer is sized for the worst case, so every input character
        // must have been consumed; anything else would be silent truncation.
        __DCL_ASSERT(_inCount == _wcslen);

        buf->__dataLength = _outCount;
        ByteBuffer::shrink(buf);

        rstr.assign(buf);
        buf->release();
    }
    return rstr;
}


// Convenience overload: encodes the full contents of _str.
// Throws CharsetConvertException* when the encoder reports an error.
ByteString CharsetEncoder::encode(const String& _str)
    __DCL_THROWS1(CharsetConvertException*)
{
    const size_t charCount = _str.length();
    return encode(_str, charCount);
}


IMPLEMENT_CLASSINFO(UTF8Encoder, CharsetEncoder)

// Creates a UTF-8 encoder; _addBOM selects whether encode() emits a
// Byte Order Mark before the first output.
UTF8Encoder::UTF8Encoder(
    bool _addBOM            // Byte Order Mark
)
{
    this->__addBOM = _addBOM;
    reset();
}


void UTF8Encoder::reset()

{

    __addedBOM = false;

}


// Encodes wide characters to UTF-8, writing the 3-byte BOM (EF BB BF) first
// if it was requested and has not been written yet.
// When the buffer cannot even hold the BOM, nothing is consumed or produced
// and CS_NOERROR is returned.
int UTF8Encoder::encode(        // UCS ==> MB, UTF
    const wchar_t*  _in,        // in: input wide characters
    size_t&         _inCount,   // in: count of input wchars, out: count of processed wchars
    byte_t*         _out,       // out: output buffer
    size_t&         _outCount   // in: buffer size in bytes, out: count of converted bytes
)
{
    __DCL_ASSERT(_in != NULL && _out != NULL);

    // No BOM pending: plain character-by-character encoding.
    if (!__addBOM || __addedBOM)
        return CharsetEncoder::encode(_in, _inCount, _out, _outCount);

    // BOM pending but it does not fit: report "no progress".
    // (CS_BUFFER_SMALL is deliberately not returned here.)
    if (_outCount < 3) {
        _inCount = 0;
        _outCount = 0;
        return CS_NOERROR;
    }

    __addedBOM = true;

    _out[0] = 0xEF;
    _out[1] = 0xBB;
    _out[2] = 0xBF;

    // Encode the characters behind the BOM, then count the BOM bytes as output.
    size_t bodyCount = _outCount - 3;
    const int status = CharsetEncoder::encode(_in, _inCount, _out + 3, bodyCount);
    _outCount = bodyCount + 3;
    return status;
}


// Encodes one code point as a 1..6 byte UTF-8 sequence.
// Returns the number of bytes written, ILLEGAL_UCS4 for values above
// 0x7fffffff, or BUFFER_SMALL when _mbslen is too short for the sequence.
int UTF8Encoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
{
    // upperBound[i] is the first value that no longer fits in (i + 1) bytes.
    static const ucs4_t upperBound[6] = {
        0x80, 0x800, 0x10000, 0x200000, 0x4000000, 0x80000000u
    };
    // leadMark[n] is the marker bit pattern of the first byte of an n-byte sequence.
    static const byte_t leadMark[7] = {
        0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
    };

    int count = 0;
    for (int i = 0; i < 6; ++i) {
        if (_uc < upperBound[i]) {
            count = i + 1;
            break;
        }
    }
    if (count == 0)
        return ILLEGAL_UCS4;

    if (_mbslen < (size_t)count)
        return BUFFER_SMALL;

    // Fill the continuation bytes from the end, six payload bits each.
    for (int pos = count - 1; pos > 0; --pos) {
        _mbs[pos] = (byte_t)(0x80 | (_uc & 0x3f));
        _uc >>= 6;
    }
    // The remaining bits go into the lead byte together with its marker.
    _mbs[0] = (byte_t)(leadMark[count] | _uc);

    return count;
}


IMPLEMENT_CLASSINFO(UTF16Encoder, CharsetEncoder)

// Creates a UTF-16 encoder with the requested BOM behaviour and byte order.
// _byteOrder must be CS_BIG_ENDIAN or CS_LITTLE_ENDIAN (checked by assertion).
UTF16Encoder::UTF16Encoder(
    bool _addBOM,   // emit a Byte Order Mark (original note: "= true")
    int _byteOrder  // byte order (original note: "= CS_DEFAULT_ENDIAN")
)
{
    const bool wantBigEndian = (_byteOrder == CS_BIG_ENDIAN);
    __DCL_ASSERT(wantBigEndian || _byteOrder == CS_LITTLE_ENDIAN);

    this->__addBOM = _addBOM;
    this->__bigEndian = wantBigEndian;
    reset();
}


void UTF16Encoder::reset()

{

    __addedBOM = false;

}


// Encodes wide characters to UTF-16, writing the 2-byte BOM first if it was
// requested and has not been written yet (FE FF big-endian, FF FE little-endian).
//
// On return _outCount is the total number of bytes written to _out, BOM
// included. When the buffer cannot even hold the BOM, nothing is consumed or
// produced and CS_NOERROR is returned.
int UTF16Encoder::encode(       // UCS ==> MB, UTF
    const wchar_t*  _in,        // in: input wide characters
    size_t&         _inCount,   // in: count of input wchars, out: count of processed wchars
    byte_t*         _out,       // out: output buffer
    size_t&         _outCount   // in: buffer size in bytes, out: count of converted bytes
)
{
    __DCL_ASSERT(_in != NULL && _out != NULL);

    if (__addBOM && !__addedBOM) {
        if (_outCount >= 2) {
            __addedBOM = true;

            if (__bigEndian) {
                _out[0] = 0xFE;
                _out[1] = 0xFF;
            }
            else {
                _out[0] = 0xFF;
                _out[1] = 0xFE;
            }

            _outCount -= 2;
            _out += 2;

            int r = CharsetEncoder::encode(_in, _inCount, _out, _outCount);

            // Count the BOM as produced output; otherwise callers that use
            // _outCount as the data length lose the last two bytes.
            _outCount += 2;
            return r;
        }

        // Not enough room for the BOM: report "no progress".
        // (CS_BUFFER_SMALL is deliberately not returned here.)
        _inCount = 0;
        _outCount = 0;
        return CS_NOERROR;
    }
    return CharsetEncoder::encode(_in, _inCount, _out, _outCount);
}


// Encodes one code point as UTF-16 in the configured byte order.
//
// Returns 2 for a BMP character, 4 for a surrogate pair, BUFFER_SMALL when
// _mbslen is too short, and ILLEGAL_UCS4 for 0xFFFE, for surrogate code
// points (0xD800..0xDFFF) and for values of 0x110000 and above.
int UTF16Encoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
{
    if (_uc == 0xFFFE || (_uc >= 0xD800 && _uc < 0xE000) || _uc >= 0x110000)
        return ILLEGAL_UCS4;

    if (_uc < 0x10000) {
        // Single 16-bit code unit.
        if (_mbslen < 2)
            return BUFFER_SMALL;

        if (__bigEndian) {
            _mbs[0] = (unsigned char) (_uc >> 8);
            _mbs[1] = (unsigned char) _uc;
        }
        else {
            _mbs[0] = (unsigned char) _uc;
            _mbs[1] = (unsigned char) (_uc >> 8);
        }
        return 2;
    }

    // Supplementary plane: high surrogate followed by low surrogate.
    if (_mbslen < 4)
        return BUFFER_SMALL;

    const ucs4_t uc1 = 0xd800 + ((_uc - 0x10000) >> 10);    // high surrogate
    const ucs4_t uc2 = 0xdc00 + ((_uc - 0x10000) & 0x3FF);  // low surrogate

    if (__bigEndian) {
        _mbs[0] = (unsigned char) (uc1 >> 8);
        _mbs[1] = (unsigned char) uc1;
        _mbs[2] = (unsigned char) (uc2 >> 8);
        _mbs[3] = (unsigned char) uc2;
    }
    else {
        // Byte order only swaps the bytes inside each 16-bit unit; the high
        // surrogate still comes first. (The previous code reversed all four
        // bytes and so emitted the low surrogate first.)
        _mbs[0] = (unsigned char) uc1;
        _mbs[1] = (unsigned char) (uc1 >> 8);
        _mbs[2] = (unsigned char) uc2;
        _mbs[3] = (unsigned char) (uc2 >> 8);
    }
    return 4;
}


IMPLEMENT_CLASSINFO(UTF32Encoder, CharsetEncoder)

// Creates a UTF-32 encoder with the requested BOM behaviour and byte order.
// _byteOrder must be CS_BIG_ENDIAN or CS_LITTLE_ENDIAN (checked by assertion).
UTF32Encoder::UTF32Encoder(
    bool _addBOM,       // emit a Byte Order Mark (original note: "= true")
    int _byteOrder      // byte order (original note: "= CS_DEFAULT_ENDIAN")
)
{
    const bool wantBigEndian = (_byteOrder == CS_BIG_ENDIAN);
    __DCL_ASSERT(wantBigEndian || _byteOrder == CS_LITTLE_ENDIAN);

    this->__addBOM = _addBOM;
    this->__bigEndian = wantBigEndian;
    reset();
}


void UTF32Encoder::reset()

{

    __addedBOM = false;

}


// Encodes wide characters to UTF-32, writing the 4-byte BOM first if it was
// requested and has not been written yet (00 00 FE FF big-endian,
// FF FE 00 00 little-endian).
//
// On return _outCount is the total number of bytes written to _out, BOM
// included. When the buffer cannot even hold the BOM, nothing is consumed or
// produced and CS_NOERROR is returned.
int UTF32Encoder::encode(       // UCS ==> MB, UTF
    const wchar_t*  _in,        // in: input wide characters
    size_t&         _inCount,   // in: count of input wchars, out: count of processed wchars
    byte_t*         _out,       // out: output buffer
    size_t&         _outCount   // in: buffer size in bytes, out: count of converted bytes
)
{
    __DCL_ASSERT(_in != NULL && _out != NULL);

    if (__addBOM && !__addedBOM) {
        if (_outCount >= 4) {
            __addedBOM = true;

            if (__bigEndian) {
                _out[0] = 0x00;
                _out[1] = 0x00;
                _out[2] = 0xFE;
                _out[3] = 0xFF;
            }
            else {
                _out[0] = 0xFF;
                _out[1] = 0xFE;
                _out[2] = 0x00;
                _out[3] = 0x00;
            }

            _outCount -= 4;
            _out += 4;

            int r = CharsetEncoder::encode(_in, _inCount, _out, _outCount);

            // Count the BOM as produced output; otherwise callers that use
            // _outCount as the data length lose the last four bytes.
            _outCount += 4;
            return r;
        }

        // Not enough room for the BOM: report "no progress".
        // (CS_BUFFER_SMALL is deliberately not returned here.)
        _inCount = 0;
        _outCount = 0;
        return CS_NOERROR;
    }
    return CharsetEncoder::encode(_in, _inCount, _out, _outCount);
}


// Encodes one code point as four UTF-32 bytes in the configured byte order.
// Returns 4 on success, BUFFER_SMALL when fewer than four bytes are
// available, and ILLEGAL_UCS4 for surrogate code points (0xD800..0xDFFF)
// and for values of 0x110000 and above.
int UTF32Encoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
{
    const bool isSurrogate = (_uc >= 0xd800 && _uc < 0xe000);
    if (_uc >= 0x110000 || isSurrogate)
        return ILLEGAL_UCS4;

    if (_mbslen < 4)
        return BUFFER_SMALL;

    // A valid code point is below 0x110000, so the top byte is always zero.
    const unsigned char high = (unsigned char) (_uc >> 16);
    const unsigned char mid  = (unsigned char) (_uc >> 8);
    const unsigned char low  = (unsigned char) _uc;

    if (__bigEndian) {
        _mbs[0] = 0;
        _mbs[1] = high;
        _mbs[2] = mid;
        _mbs[3] = low;
    }
    else {
        _mbs[0] = low;
        _mbs[1] = mid;
        _mbs[2] = high;
        _mbs[3] = 0;
    }
    return 4;
}


IMPLEMENT_CLASSINFO(AsciiEncoder, CharsetEncoder)


// Default constructor; the body is empty, so no state is set up here.
AsciiEncoder::AsciiEncoder()

{

}


// Encodes one code point as a single ASCII byte.
// Returns 1 on success, ILLEGAL_UCS4 for values of 0x80 and above, and
// BUFFER_SMALL when no output space is available (the same contract as the
// UTF encoders in this file).
int AsciiEncoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
{
    if (_uc >= 0x80)
        return ILLEGAL_UCS4;

    // Never write through _mbs when the caller supplied an empty buffer.
    if (_mbslen < 1)
        return BUFFER_SMALL;

    *_mbs = (byte_t)_uc;
    return 1;
}


IMPLEMENT_CLASSINFO(Latin1Encoder, CharsetEncoder)


// Default constructor; the body is empty, so no state is set up here.
Latin1Encoder::Latin1Encoder()

{

}


// Encodes one code point as a single Latin-1 byte.
// Returns 1 on success, ILLEGAL_UCS4 for values of 0x100 and above, and
// BUFFER_SMALL when no output space is available (the same contract as the
// UTF encoders in this file).
int Latin1Encoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
{
    if (_uc >= 0x100)
        return ILLEGAL_UCS4;

    // Never write through _mbs when the caller supplied an empty buffer.
    if (_mbslen < 1)
        return BUFFER_SMALL;

    *_mbs = (byte_t)_uc;
    return 1;
}


IMPLEMENT_CLASSINFO(LocaleEncoder, CharsetEncoder)

// Creates a locale-based encoder; reset() puts the conversion state into
// its zeroed starting condition.
LocaleEncoder::LocaleEncoder()
{
    this->reset();
}


// Zero-fills the stored multibyte conversion state (__mbstate).
void LocaleEncoder::reset()
{
    const size_t stateSize = sizeof(__mbstate);
    memset(&__mbstate, 0, stateSize);
}


// wcsrtombs


int LocaleEncoder::encode(     // UCS ==> MB, UTF

    const wchar_t*  _in,        // in: input wide characters

    size_t&         _inCount,   // in: count of input wchars, out: count of processed wchars

    byte_t*           _out,       // out: output buffer

    size_t&         _outCount   // in: buffer size (countof(_out[], byte_t), out: count of converted bytes

)

{

    __DCL_ASSERT(_in != NULL && _out != NULL);


    byte_t* dst = _out;

    byte_t* dstend = dst + _outCount;

    size_t dstlen;


    const wchar_t* src = _in;

    const wchar_t* srcend = src + _inCount;


    size_t n = 0;

    char aBuf[24];

    while (src < srcend && (dstlen = dstend - dst) > 0) {

        n = wcrtomb(aBuf, *src, &__mbstate);

        if (n == (size_t) -1 || n > dstlen)

            // 에러이거나, 남은 버퍼보다 크면

            break;


        strncpy((char*)dst, aBuf, n);


        dst += n;

        src++;

    }


    _inCount = src - _in;

    _outCount = dst - _out;


    if (n == (size_t) -1)   // errno == EILSEQ

        return CS_ILLEGAL_UCS;

/*

    if (src < srcend && dst == dstend)

        return CS_BUFFER_SMALL;

*/

    return CS_NOERROR;

}


// Not expected to be called: LocaleEncoder::encode() in this file converts
// with wcrtomb() and never goes through toMultiByte().
// Trips a debug assertion and returns 0.
int LocaleEncoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
{
    const int nothingWritten = 0;
    __DCL_ASSERT(false);
    return nothingWritten;
}


__DCL_END_NAMESPACE

__THIS_FILE__
#define __THIS_FILE__
Definition _trace.h:14

Charset.h

CS_BIG_ENDIAN
@ CS_BIG_ENDIAN
Definition Charset.h:64

CS_LITTLE_ENDIAN
@ CS_LITTLE_ENDIAN
Definition Charset.h:63

ucs4_t
__DCL_BEGIN_NAMESPACE typedef uint32_t ucs4_t
Definition Charset.h:35

CS_SOURCE_FEW
@ CS_SOURCE_FEW
Definition Charset.h:72

CS_NOERROR
@ CS_NOERROR
Definition Charset.h:68

CS_ILLEGAL_UCS
@ CS_ILLEGAL_UCS
Definition Charset.h:70

CS_ILLEGAL_UCS2
@ CS_ILLEGAL_UCS2
Definition Charset.h:74

CS_ILLEGAL_SEQUENCE
@ CS_ILLEGAL_SEQUENCE
Definition Charset.h:73

ILLEGAL_UCS4
#define ILLEGAL_UCS4
Definition CharsetEncoder.cpp:58

BUFFER_SMALL
#define BUFFER_SMALL
Definition CharsetEncoder.cpp:59

Config.h

NULL
#define NULL
Definition Config.h:316

__countof
#define __countof(array, type)
Definition Config.h:340

char_t
wchar_t char_t
Definition Config.h:251

byte_t
unsigned char byte_t
Definition Config.h:250

__DCL_THROWS1
#define __DCL_THROWS1(e)
Definition Config.h:152

Exception.h

r
IOException *size_t r
Definition MediaInfo.cpp:82

__DCL_ASSERT
#define __DCL_ASSERT(expr)
Definition Object.h:396

IMPLEMENT_CLASSINFO
#define IMPLEMENT_CLASSINFO(class_name, base_class_name)
Definition Object.h:247

__T
#define __T(str)
Definition Object.h:62

String.h

AsciiEncoder
Definition Charset.h:328

AsciiEncoder::toMultiByte
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
Definition CharsetEncoder.cpp:460

CharsetConvertException
Definition Charset.h:78

CharsetConvertException::__errorCode
int __errorCode
Definition Charset.h:85

CharsetEncoder
Definition Charset.h:89

CharsetEncoder::toMultiByte
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)=0

CharsetEncoder::CharsetEncoder
CharsetEncoder()

Exception
Definition Exception.h:26

Exception::toString
virtual String toString() const
Definition Exception.cpp:40

Latin1Encoder
Definition Charset.h:364

Latin1Encoder::toMultiByte
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
Definition CharsetEncoder.cpp:475

LocaleEncoder
Definition Charset.h:389

LocaleEncoder::encode
virtual int encode(const wchar_t *_in, size_t &_inCount, byte_t *_out, size_t &_outCount)
Definition CharsetEncoder.cpp:497

LocaleEncoder::toMultiByte
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
Definition CharsetEncoder.cpp:539

LocaleEncoder::LocaleEncoder
LocaleEncoder()

LocaleEncoder::reset
virtual void reset()
Definition CharsetEncoder.cpp:491

Object
Definition Object.h:137

UTF16Encoder
Definition Charset.h:218

UTF16Encoder::toMultiByte
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
Definition CharsetEncoder.cpp:327

UTF32Encoder
Definition Charset.h:273

UTF32Encoder::toMultiByte
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
Definition CharsetEncoder.cpp:427

UTF8Encoder
Definition Charset.h:142

UTF8Encoder::toMultiByte
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
Definition CharsetEncoder.cpp:243