DCL 4.0
Loading...
Searching...
No Matches
CharsetEncoder.cpp
Go to the documentation of this file.
1#if 0
2#include <errno.h>
3#endif
4#include <dcl/Config.h>
5
6#include <string.h> // memcpy, strcpy
7#include <wchar.h> // wcrtomb
8
9#include <dcl/Charset.h>
10#include <dcl/Exception.h>
11
12#if __DCL_HAVE_ALLOC_DEBUG
13#undef __DCL_ALLOC_LEVEL
14#define __DCL_ALLOC_LEVEL __DCL_ALLOC_INTERNAL
15#endif
16
17#if __DCL_DEBUG
18#undef __THIS_FILE__
19static const char_t __THIS_FILE__[] = __T("dcl/CharsetEncoder.cpp");
20#endif
21
22__DCL_BEGIN_NAMESPACE
23
25
26CharsetConvertException::CharsetConvertException(int _errorCode)
28{
29 __errorCode = _errorCode;
30}
31
33{
34 String str;
35 switch(__errorCode) {
36 case CS_NOERROR :
37 str = L"No Error";
38 break;
39 case CS_ILLEGAL_UCS :
40 str = L"Illegal UCS value.. can't convert to multi-bytes";
41 break;
42 case CS_SOURCE_FEW :
43 str = L"Source multi bytes few";
44 break;
46 str = L"Illegal bytes sequence";
47 break;
48 case CS_ILLEGAL_UCS2 :
49 str = L"Can not convert UCS4 to UCS2";
50 break;
51 default :
52 __DCL_ASSERT(false);
53 str = L"Unknown error";
54 }
55 return str;
56}
57
// Negative status values returned by the toMultiByte() implementations in
// this file. The values are parenthesized so that the leading minus sign can
// never combine with an operator next to the expansion site.
#define ILLEGAL_UCS4 (-1) // encode: the character cannot be encoded
#define BUFFER_SMALL (-2) // encode: the output buffer is too short
60
62
64{
65#if __DCL_DEBUG
66 __DCL_ASSERT(sizeof(ucs4_t) == 4);
67#if __SIZEOF_WCHAR_T__ == 4
68 __DCL_ASSERT(sizeof(wchar_t) == 4);
69#endif
70#if __SIZEOF_WCHAR_T__ == 2
71 __DCL_ASSERT(sizeof(wchar_t) == 2);
72#endif
73 ucs4_t uc = 0x31323334;
74#if __BYTE_ORDER == __BIG_ENDIAN
75 __DCL_ASSERT(memcmp(&uc, "1234", sizeof(uc)) == 0);
76#else
77 __DCL_ASSERT(memcmp(&uc, "4321", sizeof(uc)) == 0);
78#endif
79#endif
80
81}
82
83int CharsetEncoder::encode( // UCS ==> MB, UTF
84 const wchar_t* _in, // in: input wide characters
85 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
86 byte_t* _out, // out: output buffer
87 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
88 )
89{
90 __DCL_ASSERT(_in != NULL && _out != NULL);
91
92 byte_t* dst = _out;
93 byte_t* dstend = dst + _outCount;
94 size_t dstlen;;
95
96 const wchar_t* src = _in;
97 const wchar_t* srcend = src + _inCount;
98
99 int n = 0;
100
101 while(src < srcend && (dstlen = dstend - dst) > 0) {
102 n = toMultiByte(*src, dst, dstlen);
103 if (n <= 0)
104 break;
105
106 dst += (size_t)n;
107 src++;
108 }
109
110 _inCount = src - _in;
111 _outCount = dst - _out;
112
113 __DCL_ASSERT(n != 0);
114
115 if (n == ILLEGAL_UCS4)
116 return CS_ILLEGAL_UCS;
117/*
118 switch (n)
119 {
120 case ILLEGAL_UCS4 :
121 return CS_ILLEGAL_UCS;
122 case BUFFER_SMALL :
123 return CS_BUFFER_SMALL;
124 }
125*/
126 return CS_NOERROR;
127}
128
129size_t CharsetEncoder::getEncodedLength(const wchar_t* _wcs, size_t _wcslen)
131{
132 const wchar_t* _in = _wcs;
133 size_t inTotal = _wcslen;
134 size_t outTotal = 0;
135 byte_t buf[24];
136
137 for ( ; ; ) {
138 size_t _inCount = inTotal;
139 size_t _outCount = __countof(buf, byte_t);
140 int r = encode(_in, _inCount, buf, _outCount);
141 outTotal += _outCount;
142 if (r == CS_NOERROR) {
143 inTotal -= _inCount;
144 if (inTotal == 0)
145 break;
146 // else buffer small
147 }
148 else {
149 throw(new CharsetConvertException(r));
150 }
151 _in += _inCount;
152 }
153 return outTotal;
154}
155
156ByteString CharsetEncoder::encode(const wchar_t* _wcs, size_t _wcslen)
158{
159 __DCL_ASSERT(_wcs != NULL);
160 if (_wcslen == (size_t)-1)
161 _wcslen = String::length(_wcs);
162
163 ByteString rstr;
164 if (_wcslen) {
165 const wchar_t* _in = _wcs;
166 size_t _inCount = _wcslen;
167#if 0
168 size_t _outCount = getEncodedLength(_wcs, _wcslen);
169#else
170 size_t _outCount = _inCount * 6; // UTF-8
171#endif
172
173 ByteBuffer* buf = ByteBuffer::create(_outCount);
174 int rn = encode(_in, _inCount, (byte_t*)buf->data(), _outCount);
175 __DCL_ASSERT(buf->__allocLength >= _outCount);
176 if (rn != CS_NOERROR) {
177 buf->release();
178 throw new CharsetConvertException(rn);
179 }
180 buf->__dataLength = _outCount;
181 ByteBuffer::shrink(buf);
182
183 rstr.assign(buf);
184 buf->release();
185 }
186 return rstr;
187}
188
189ByteString CharsetEncoder::encode(const String& _str)
191{
192 return encode(_str, _str.length());
193}
194
196
197UTF8Encoder::UTF8Encoder(
198 bool _addBOM // Byte Order Mark
199 )
200{
201 __addBOM = _addBOM;
202 reset();
203}
204
205void UTF8Encoder::reset()
206{
207 __addedBOM = false;
208}
209
210int UTF8Encoder::encode( // UCS ==> MB, UTF
211 const wchar_t* _in, // in: input wide characters
212 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
213 byte_t* _out, // out: output buffer
214 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
215 )
216{
217 __DCL_ASSERT(_in != NULL && _out != NULL);
218
219 if (__addBOM && !__addedBOM) {
220 if (_outCount >= 3) {
221 __addedBOM = true;
222
223 _out[0] = 0xEF;
224 _out[1] = 0xBB;
225 _out[2] = 0xBF;
226
227 _outCount -= 3;
228 _out += 3;
229
230 int r = CharsetEncoder::encode(_in, _inCount, _out, _outCount);
231 _outCount += 3;
232 return r;
233 }
234 _inCount = 0;
235 _outCount = 0;
236 _inCount = 0;
237 // return CS_BUFFER_SMALL;
238 return CS_NOERROR;
239 }
240 return CharsetEncoder::encode(_in, _inCount, _out, _outCount);
241}
242
243int UTF8Encoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
244{
245 int count;
246 if (_uc < 0x80)
247 count = 1;
248 else if (_uc < 0x800)
249 count = 2;
250 else if (_uc < 0x10000)
251 count = 3;
252 else if (_uc < 0x200000)
253 count = 4;
254 else if (_uc < 0x4000000)
255 count = 5;
256 else if (_uc <= 0x7fffffff)
257 count = 6;
258 else
259 return ILLEGAL_UCS4;
260
261 if (_mbslen < (size_t)count)
262 return BUFFER_SMALL;
263
264 switch (count) { /* note: code falls through cases! */
265 case 6: _mbs[5] = 0x80 | (_uc & 0x3f); _uc = _uc >> 6; _uc |= 0x4000000;
266 case 5: _mbs[4] = 0x80 | (_uc & 0x3f); _uc = _uc >> 6; _uc |= 0x200000;
267 case 4: _mbs[3] = 0x80 | (_uc & 0x3f); _uc = _uc >> 6; _uc |= 0x10000;
268 case 3: _mbs[2] = 0x80 | (_uc & 0x3f); _uc = _uc >> 6; _uc |= 0x800;
269 case 2: _mbs[1] = 0x80 | (_uc & 0x3f); _uc = _uc >> 6; _uc |= 0xc0;
270 case 1: _mbs[0] = _uc;
271 }
272 return count;
273}
274
276
277UTF16Encoder::UTF16Encoder(
278 bool _addBOM, //= true // Byte Order Mark
279 int _byteOrder // = CS_DEFAULT_ENDIAN
280 )
281{
282 __DCL_ASSERT(_byteOrder == CS_BIG_ENDIAN || _byteOrder == CS_LITTLE_ENDIAN);
283 __addBOM = _addBOM;
284 __bigEndian = _byteOrder == CS_BIG_ENDIAN;
285 reset();
286}
287
288void UTF16Encoder::reset()
289{
290 __addedBOM = false;
291}
292
293int UTF16Encoder::encode( // UCS ==> MB, UTF
294 const wchar_t* _in, // in: input wide characters
295 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
296 byte_t* _out, // out: output buffer
297 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
298 )
299{
300 __DCL_ASSERT(_in != NULL && _out != NULL);
301
302 if (__addBOM && !__addedBOM) {
303 if (_outCount >= 2) {
304 __addedBOM = true;
305
306 if (__bigEndian) {
307 _out[0] = 0xFE;
308 _out[1] = 0xFF;
309 }
310 else {
311 _out[0] = 0xFF;
312 _out[1] = 0xFE;
313 }
314
315 _outCount -= 2;
316 _out += 2;
317
318 return CharsetEncoder::encode(_in, _inCount, _out, _outCount);
319 }
320 // return CS_BUFFER_SMALL;
321 _inCount = 0;
322 return CS_NOERROR;
323 }
324 return CharsetEncoder::encode(_in, _inCount, _out, _outCount);
325}
326
327int UTF16Encoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
328{
329 if (_uc != 0xFFFE && !(_uc >= 0xD800 && _uc < 0xE000)) {
330 int count = 0;
331 if (_uc < 0x10000) {
332 if (_mbslen >= 2) {
333 if (__bigEndian) {
334 _mbs[0] = (unsigned char) (_uc >> 8);
335 _mbs[1] = (unsigned char) _uc;
336 }
337 else {
338 _mbs[1] = (unsigned char) (_uc >> 8);
339 _mbs[0] = (unsigned char) _uc;
340 }
341 return count + 2;
342 }
343 else
344 return BUFFER_SMALL;
345 }
346 else if (_uc < 0x110000) {
347 if (_mbslen >= 4) {
348 ucs4_t uc1 = 0xd800 + ((_uc - 0x10000) >> 10);
349 ucs4_t uc2 = 0xdc00 + ((_uc - 0x10000) & 0x3FF);
350 if (__bigEndian) {
351 _mbs[0] = (unsigned char) (uc1 >> 8);
352 _mbs[1] = (unsigned char) uc1;
353 _mbs[2] = (unsigned char) (uc2 >> 8);
354 _mbs[3] = (unsigned char) uc2;
355 }
356 else {
357 _mbs[3] = (unsigned char) (uc1 >> 8);
358 _mbs[2] = (unsigned char) uc1;
359 _mbs[1] = (unsigned char) (uc2 >> 8);
360 _mbs[0] = (unsigned char) uc2;
361 }
362 return count + 4;
363 }
364 else
365 return BUFFER_SMALL;
366 }
367 }
368 return ILLEGAL_UCS4;
369}
370
372
373UTF32Encoder::UTF32Encoder(
374 bool _addBOM, //= true // Byte Order Mark
375 int _byteOrder // = CS_DEFAULT_ENDIAN
376 )
377{
378 __DCL_ASSERT(_byteOrder == CS_BIG_ENDIAN || _byteOrder == CS_LITTLE_ENDIAN);
379 __addBOM = _addBOM;
380 __bigEndian = _byteOrder == CS_BIG_ENDIAN;
381 reset();
382}
383
384void UTF32Encoder::reset()
385{
386 __addedBOM = false;
387}
388
389int UTF32Encoder::encode( // UCS ==> MB, UTF
390 const wchar_t* _in, // in: input wide characters
391 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
392 byte_t* _out, // out: output buffer
393 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
394 )
395{
396 __DCL_ASSERT(_in != NULL && _out != NULL);
397
398 if (__addBOM && !__addedBOM) {
399 if (_outCount >= 4) {
400 __addedBOM = true;
401
402 if (__bigEndian) {
403 _out[0] = 0x00;
404 _out[1] = 0x00;
405 _out[2] = 0xFE;
406 _out[3] = 0xFF;
407 }
408 else {
409 _out[0] = 0xFF;
410 _out[1] = 0xFE;
411 _out[2] = 0x00;
412 _out[3] = 0x00;
413 }
414
415 _outCount -= 4;
416 _out += 4;
417
418 return CharsetEncoder::encode(_in, _inCount, _out, _outCount);
419 }
420 // return CS_BUFFER_SMALL;
421 _inCount = 0;
422 return CS_NOERROR;
423 }
424 return CharsetEncoder::encode(_in, _inCount, _out, _outCount);
425}
426
427int UTF32Encoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
428{
429 if (_uc < 0x110000 && !(_uc >= 0xd800 && _uc < 0xe000)) {
430 int count = 0;
431 if (_uc < 0x110000) {
432 if (_mbslen >= 4) {
433 if (__bigEndian) {
434 _mbs[0] = 0;
435 _mbs[1] = (unsigned char) (_uc >> 16);
436 _mbs[2] = (unsigned char) (_uc >> 8);
437 _mbs[3] = (unsigned char) _uc;
438 }
439 else {
440 _mbs[3] = 0;
441 _mbs[2] = (unsigned char) (_uc >> 16);
442 _mbs[1] = (unsigned char) (_uc >> 8);
443 _mbs[0] = (unsigned char) _uc;
444 }
445 return count + 4;
446 }
447 else
448 return BUFFER_SMALL;
449 }
450 }
451 return ILLEGAL_UCS4;
452}
453
455
456AsciiEncoder::AsciiEncoder()
457{
458}
459
460int AsciiEncoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
461{
462 if (_uc < 0x80) {
463 *_mbs = _uc;
464 return 1;
465 }
466 return ILLEGAL_UCS4;
467}
468
470
471Latin1Encoder::Latin1Encoder()
472{
473}
474
475int Latin1Encoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
476{
477 if (_uc < 0x100) {
478 *_mbs = _uc;
479 return 1;
480 }
481 return ILLEGAL_UCS4;
482}
483
490
492{
493 memset(&__mbstate, 0, sizeof(__mbstate));
494}
495
496// wcsrtombs
497int LocaleEncoder::encode( // UCS ==> MB, UTF
498 const wchar_t* _in, // in: input wide characters
499 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
500 byte_t* _out, // out: output buffer
501 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
502 )
503{
504 __DCL_ASSERT(_in != NULL && _out != NULL);
505
506 byte_t* dst = _out;
507 byte_t* dstend = dst + _outCount;
508 size_t dstlen;
509
510 const wchar_t* src = _in;
511 const wchar_t* srcend = src + _inCount;
512
513 size_t n = 0;
514 char aBuf[24];
515 while (src < srcend && (dstlen = dstend - dst) > 0) {
516 n = wcrtomb(aBuf, *src, &__mbstate);
517 if (n == (size_t) -1 || n > dstlen)
518 // 에러이거나, 남은 버퍼보다 크면
519 break;
520
521 strncpy((char*)dst, aBuf, n);
522
523 dst += n;
524 src++;
525 }
526
527 _inCount = src - _in;
528 _outCount = dst - _out;
529
530 if (n == (size_t) -1) // errno == EILSEQ
531 return CS_ILLEGAL_UCS;
532/*
533 if (src < srcend && dst == dstend)
534 return CS_BUFFER_SMALL;
535*/
536 return CS_NOERROR;
537}
538
539int LocaleEncoder::toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen)
540{
541 __DCL_ASSERT(false);
542 return 0;
543}
544
545__DCL_END_NAMESPACE
#define __THIS_FILE__
Definition _trace.h:14
@ CS_BIG_ENDIAN
Definition Charset.h:57
@ CS_LITTLE_ENDIAN
Definition Charset.h:56
__DCL_BEGIN_NAMESPACE typedef uint32_t ucs4_t
Definition Charset.h:29
@ CS_SOURCE_FEW
Definition Charset.h:65
@ CS_NOERROR
Definition Charset.h:61
@ CS_ILLEGAL_UCS
Definition Charset.h:63
@ CS_ILLEGAL_UCS2
Definition Charset.h:67
@ CS_ILLEGAL_SEQUENCE
Definition Charset.h:66
#define ILLEGAL_UCS4
#define BUFFER_SMALL
#define NULL
Definition Config.h:340
#define __countof(array, type)
Definition Config.h:365
wchar_t char_t
Definition Config.h:275
unsigned char byte_t
Definition Config.h:274
#define __DCL_THROWS1(e)
Definition Config.h:167
#define __DCL_ASSERT(expr)
Definition Object.h:371
#define IMPLEMENT_CLASSINFO(class_name, base_class_name)
Definition Object.h:228
#define __T(str)
Definition Object.h:44
ByteString r
ByteBuffer * buf
void CharsetConvertException *size_t n
Definition SQLField.cpp:253
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)=0
virtual String toString() const
Definition Exception.cpp:40
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int encode(const wchar_t *_in, size_t &_inCount, byte_t *_out, size_t &_outCount)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual void reset()
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)