DCL 4.0
Loading...
Searching...
No Matches
CharsetDecoder.cpp
Go to the documentation of this file.
1#if 0
2#include <errno.h>
3#endif
4#include <dcl/Config.h>
5
6#include <string.h> // memcmp, memcpy
7#include <wchar.h> // mbrtowc
8
9#include <dcl/Charset.h>
10
11#if __DCL_HAVE_ALLOC_DEBUG
12#undef __DCL_ALLOC_LEVEL
13#define __DCL_ALLOC_LEVEL __DCL_ALLOC_INTERNAL
14#endif
15
16#if __DCL_DEBUG
17#undef __THIS_FILE__
18static const char_t __THIS_FILE__[] = __T("dcl/CharsetDecoder.cpp");
19#endif
20
21__DCL_BEGIN_NAMESPACE
22
23#define ILLEGAL_UCS2 -1 // decode
24#define ILLEGAL_SEQUENCE -2 // decode
25#define SOURCE_FEW -3 // decode
26#define SOURCE_FEW_N(_mbslen) (-3-(_mbslen))
27
29
31{
32#if __DCL_DEBUG
33 __DCL_ASSERT(sizeof(ucs4_t) == 4);
34#if __SIZEOF_WCHAR_T__ == 4
35 __DCL_ASSERT(sizeof(wchar_t) == 4);
36#endif
37#if __SIZEOF_WCHAR_T__ == 2
38 __DCL_ASSERT(sizeof(wchar_t) == 2);
39#endif
40 ucs4_t uc = 0x31323334;
41#if __BYTE_ORDER == __BIG_ENDIAN
42 __DCL_ASSERT(memcmp(&uc, "1234", sizeof(uc)) == 0);
43#else
44 __DCL_ASSERT(memcmp(&uc, "4321", sizeof(uc)) == 0);
45#endif
46#endif
47
48}
49
50int CharsetDecoder::decode( // MB, UTF ==> UCS
51 const byte_t* _in, // in: input bytes
52 size_t& _inCount, // in: count of input bytes, out: count of processed bytes
53 wchar_t* _out, // out: output buffer
54 size_t& _outCount // in: buffer size (countof(_out[], wchar_t)), out: count of converted wchars
55 )
56{
57 __DCL_ASSERT(_in != NULL && _out != NULL);
58
59 wchar_t* dst = _out;
60 wchar_t* dstend = dst + _outCount;
61
62 const byte_t* src = _in;
63 const byte_t* srcend = src + _inCount;
64 size_t srclen;
65
66 int _mbslen = 0;
67 while((srclen = srcend - src) > 0 && dst < dstend) {
68 ucs4_t uc;
69 _mbslen = toWideChar(src, srclen, &uc);
70
71 if (_mbslen <= 0)
72 break;
73
74#if __SIZEOF_WCHAR_T__ == 2
75 if (uc > 0xFFFF || (0xD800 <= uc && uc < 0xE000)) {
76 _mbslen = ILLEGAL_UCS2;
77 break;
78 }
79#endif
80 *dst = (wchar_t)uc;
81
82 src += (size_t)_mbslen;
83 dst++;
84 }
85
86 _inCount = src - _in;
87 _outCount = dst - _out;
88
89 if (_mbslen >= 0)
90 return CS_NOERROR;
91
92 switch (_mbslen) {
93 case ILLEGAL_UCS2 :
94 return CS_ILLEGAL_UCS2;
95 case ILLEGAL_SEQUENCE :
97 }
98 return CS_SOURCE_FEW;
99}
100
101size_t CharsetDecoder::getDecodedLength(const char* _mbs, size_t _mbslen)
103{
104 const byte_t* _in = (const byte_t*)_mbs;
105 size_t inTotal = _mbslen;
106 size_t outTotal = 0;
107 wchar_t buf[6];
108
109 for ( ; ; ) {
110 size_t _inCount = inTotal;
111 size_t _outCount = __countof(buf, wchar_t);
112 int r = decode(_in, _inCount, buf, _outCount);
113 outTotal += _outCount;
114 if (r == CS_NOERROR) {
115 inTotal -= _inCount;
116 if (inTotal == 0)
117 break;
118 // else buffer small
119 }
120 else {
121 throw(new CharsetConvertException(r));
122 }
123 _in += _inCount;
124 }
125 return outTotal;
126}
127
128String CharsetDecoder::decode(const char* _mbs, size_t _mbslen)
130{
131 __DCL_ASSERT(_mbs != NULL);
132 if (_mbslen == (size_t) -1)
133 _mbslen = ByteString::length(_mbs);
134
135 String rstr;
136 if (_mbslen) {
137 const byte_t* _in = (const byte_t*) _mbs;
138 size_t _inCount = _mbslen;
139#if 0
140 size_t _outCount = getDecodedLength(_mbs, _mbslen);
141#else
142 size_t _outCount = _mbslen;
143#endif
144
145 CharBuffer* buf = CharBuffer::create(_outCount);
146 int rn = decode(_in, _inCount, buf->data(), _outCount);
147 __DCL_ASSERT(_outCount <= buf->__allocLength);
148 if (rn != CS_NOERROR) {
149 buf->release();
150 throw new CharsetConvertException(rn);
151 }
152 buf->data()[_outCount] = L'\0';
153 buf->__dataLength = _outCount;
154 CharBuffer::shrink(buf);
155
156 rstr.assign(buf);
157 buf->release();
158 }
159 return rstr;
160}
161
162String CharsetDecoder::decode(const ByteString& _str)
164{
165 return decode(_str, _str.length());
166}
167
169
170UTF8Decoder::UTF8Decoder()
171{
172 reset();
173}
174
175void UTF8Decoder::reset()
176{
177 __hasBOM = false;
178}
179
180int UTF8Decoder::toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc)
181{
182 int count = 0;
183 while (_mbslen > 0) {
184 byte_t c = _mbs[0];
185
186 if (c < 0x80) {
187 *_uc = c;
188 return count + 1;
189 }
190 else if (c < 0xC2) {
191 return ILLEGAL_SEQUENCE;
192 }
193 else if (c < 0xE0) {
194 if (_mbslen < 2)
195 return SOURCE_FEW_N(0);
196
197 if (!((_mbs[1] ^ 0x80) < 0x40))
198 return ILLEGAL_SEQUENCE;
199
200 *_uc = ((ucs4_t)(c & 0x1F) << 6)
201 | (ucs4_t)(_mbs[1] ^ 0x80);
202
203 return count + 2;
204 }
205 else if (c < 0xF0) {
206 if (_mbslen < 3)
207 return SOURCE_FEW_N(0);
208
209 if (!((_mbs[1] ^ 0x80) < 0x40 && (_mbs[2] ^ 0x80) < 0x40
210 && (c >= 0xE1 || _mbs[1] >= 0xA0)))
211 return ILLEGAL_SEQUENCE;
212
213 ucs4_t ucTemp = ((ucs4_t) (c & 0x0F) << 12)
214 | ((ucs4_t) (_mbs[1] ^ 0x80) << 6)
215 | (ucs4_t) (_mbs[2] ^ 0x80);
216 if (ucTemp == 0xFEFF) {
217 // BOM
218 __hasBOM = true;
219 _mbslen -= 3;
220 _mbs += 3;
221 }
222 else {
223 *_uc = ucTemp;
224 return count + 3;
225 }
226 }
227 else if (c < 0xF8) {
228 if (_mbslen < 4)
229 return SOURCE_FEW_N(0);
230
231 if (!((_mbs[1] ^ 0x80) < 0x40 && (_mbs[2] ^ 0x80) < 0x40
232 && (_mbs[3] ^ 0x80) < 0x40 && (c >= 0xF1 || _mbs[1] >= 0x90)))
233 return ILLEGAL_SEQUENCE;
234
235 *_uc = ((ucs4_t) (c & 0x07) << 18)
236 | ((ucs4_t) (_mbs[1] ^ 0x80) << 12)
237 | ((ucs4_t) (_mbs[2] ^ 0x80) << 6)
238 | (ucs4_t) (_mbs[3] ^ 0x80);
239
240 return count + 4;
241 }
242 else if (c < 0xFC) {
243 if (_mbslen < 5)
244 return SOURCE_FEW_N(0);
245
246 if (!((_mbs[1] ^ 0x80) < 0x40 && (_mbs[2] ^ 0x80) < 0x40
247 && (_mbs[3] ^ 0x80) < 0x40 && (_mbs[4] ^ 0x80) < 0x40
248 && (c >= 0xF9 || _mbs[1] >= 0x88)))
249 return ILLEGAL_SEQUENCE;
250
251 *_uc = ((ucs4_t) (c & 0x03) << 24)
252 | ((ucs4_t) (_mbs[1] ^ 0x80) << 18)
253 | ((ucs4_t) (_mbs[2] ^ 0x80) << 12)
254 | ((ucs4_t) (_mbs[3] ^ 0x80) << 6)
255 | (ucs4_t) (_mbs[4] ^ 0x80);
256
257 return count + 5;
258 }
259 else if (c < 0xFE) {
260 if (_mbslen < 6)
261 return SOURCE_FEW_N(0);
262
263 if (!((_mbs[1] ^ 0x80) < 0x40 && (_mbs[2] ^ 0x80) < 0x40
264 && (_mbs[3] ^ 0x80) < 0x40 && (_mbs[4] ^ 0x80) < 0x40
265 && (_mbs[5] ^ 0x80) < 0x40
266 && (c >= 0xFD || _mbs[1] >= 0x84)))
267 return ILLEGAL_SEQUENCE;
268
269 *_uc = ((ucs4_t) (c & 0x01) << 30)
270 | ((ucs4_t) (_mbs[1] ^ 0x80) << 24)
271 | ((ucs4_t) (_mbs[2] ^ 0x80) << 18)
272 | ((ucs4_t) (_mbs[3] ^ 0x80) << 12)
273 | ((ucs4_t) (_mbs[4] ^ 0x80) << 6)
274 | (ucs4_t) (_mbs[5] ^ 0x80);
275
276 return count + 6;
277 }
278 else
279 return ILLEGAL_SEQUENCE;
280 }
281 return SOURCE_FEW_N(0);
282}
283
285
286UTF16Decoder::UTF16Decoder(
287 int nDefaultByteOrder // = CS_DEFAULT_ENDIAN
288 )
289{
290 __DCL_ASSERT(nDefaultByteOrder == CS_LITTLE_ENDIAN
291 || nDefaultByteOrder == CS_BIG_ENDIAN);
292
293 __defaultBigEndian = nDefaultByteOrder == CS_BIG_ENDIAN;
294 reset();
295}
296
297void UTF16Decoder::reset()
298{
299 __hasBOM = false;
300 __bigEndian = __defaultBigEndian;
301}
302
303int UTF16Decoder::toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc)
304{
305 int count = 0;
306 while(_mbslen >= 2) {
307 ucs4_t uc = __bigEndian ? (_mbs[0] << 8) + _mbs[1] : _mbs[0] + (_mbs[1] << 8);
308 if (uc == 0xFEFF) {
309 __hasBOM = true;
310 }
311 else if (uc == 0xFFFE) {
312 __hasBOM = true;
313 __bigEndian = !__bigEndian;
314 }
315 else if (uc >= 0xD800 && uc < 0xDC00) {
316 if (_mbslen >= 4) {
317 ucs4_t uc2 = __bigEndian ? (_mbs[2] << 8) + _mbs[3] : _mbs[2] + (_mbs[3] << 8);
318 if (!(uc2 >= 0xdc00 && uc2 < 0xe000))
319 return ILLEGAL_SEQUENCE;
320 *_uc = 0x10000 + ((uc - 0xd800) << 10) + (uc2 - 0xdc00);
321 return count + 4;
322 }
323 else
324 break;
325 }
326 else if (uc >= 0xdc00 && uc < 0xe000) {
327 return ILLEGAL_SEQUENCE;
328 }
329 else {
330 *_uc = uc;
331 return count + 2;
332 }
333 _mbs += 2; _mbslen -= 2; count += 2;
334 }
335 return SOURCE_FEW_N(count);
336}
337
339
340UTF32Decoder::UTF32Decoder(
341 int nDefaultByteOrder // = CS_DEFAULT_ENDIAN
342 )
343{
344 __DCL_ASSERT(nDefaultByteOrder == CS_LITTLE_ENDIAN
345 || nDefaultByteOrder == CS_BIG_ENDIAN);
346
347 __defaultBigEndian = nDefaultByteOrder == CS_BIG_ENDIAN;
348}
349
350void UTF32Decoder::reset()
351{
352 __hasBOM = false;
353 __bigEndian = __defaultBigEndian;
354}
355
356int UTF32Decoder::toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc)
357{
358 int count = 0;
359 while(_mbslen >= 4) {
360 ucs4_t uc = __bigEndian ? (_mbs[0] << 24) + (_mbs[1] << 16) + (_mbs[2] << 8) + _mbs[3]
361 : _mbs[0] + (_mbs[1] << 8) + (_mbs[2] << 16) + (_mbs[3] << 24);
362
363 count += 4;
364
365 if (uc == 0x0000FEFF) {
366 __hasBOM = true;
367 }
368 else if (uc == 0xFFFE0000U) {
369 __hasBOM = true;
370 __bigEndian = !__bigEndian;
371 }
372 else {
373 if (uc < 0x110000 && !(uc >= 0xD800 && uc < 0xE000)) {
374 *_uc = uc;
375 return count;
376 }
377 else
378 return ILLEGAL_SEQUENCE;
379 }
380 _mbs += 4; _mbslen -= 4;
381 }
382 return SOURCE_FEW_N(count);
383}
384
386
387AsciiDecoder::AsciiDecoder()
388{
389}
390
391int AsciiDecoder::toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc)
392{
393 byte_t c = *_mbs;
394 if (c < 0x80) {
395 *_uc = c;
396 return 1;
397 }
398 return ILLEGAL_SEQUENCE;
399}
400
401String AsciiDecoder::decode(const char* _mbs, size_t _mbslen)
402{
403 if (_mbslen == (size_t)-1)
404 _mbslen = ByteString::length(_mbs);
405
406 CharBuffer* buf = CharBuffer::create(_mbslen);
407 wchar_t* p = buf->data();
408 for(size_t i = 0; i < _mbslen; i++)
409 p[i] = (wchar_t)(byte_t)_mbs[i];
410
411 p[_mbslen] = L'\0';
412 buf->__dataLength = _mbslen;
413 __DCL_ASSERT(buf->data()[buf->__dataLength] == L'\0');
414
415 String r = buf;
416 buf->release();
417 return r;
418}
419
421
422Latin1Decoder::Latin1Decoder()
423{
424}
425
426int Latin1Decoder::toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc)
427{
428 *_uc = (byte_t)*_mbs;
429 return 1;
430}
431
432String Latin1Decoder::decode(const char* _mbs, size_t _mbslen)
433{
434 if (_mbslen == (size_t)-1)
435 _mbslen = ByteString::length(_mbs);
436
437 CharBuffer* buf = CharBuffer::create(_mbslen);
438 wchar_t* p = buf->data();
439 for (size_t i = 0; i < _mbslen; i++)
440 p[i] = (wchar_t)(byte_t)_mbs[i];
441
442 p[_mbslen] = L'\0';
443 buf->__dataLength = _mbslen;
444 __DCL_ASSERT(buf->data()[buf->__dataLength] == L'\0');
445
446 String r = buf;
447 buf->release();
448 return r;
449}
450
452
453LocaleDecoder::LocaleDecoder()
454{
455 reset();
456}
457
458void LocaleDecoder::reset()
459{
460 memset(&__mbstate, 0, sizeof(__mbstate));
461}
462
463// mbsrtowcs
464int LocaleDecoder::decode( // MB ==> UCS
465 const byte_t* _in, // in: input bytes
466 size_t& _inCount, // in: count of input bytes, out: count of processed bytes
467 wchar_t* _out, // out: output buffer
468 size_t& _outCount // in: buffer size (countof(_out[], wchar_t)), out: count of converted wchars
469 )
470{
471 __DCL_ASSERT(_in != NULL && _out != NULL);
472
473 wchar_t* dst = _out;
474 wchar_t* dstend = dst + _outCount;
475
476 const byte_t* src = _in;
477 const byte_t* srcend = src + _inCount;
478 size_t srclen;
479 size_t _mbslen = 0;
480
481 while(dst < dstend && (srclen = srcend - src) > 0) {
482 _mbslen = mbrtowc(dst, (const char*) src, srclen, &__mbstate);
483 if (_mbslen == 0 || _mbslen == (size_t) -1 || _mbslen == (size_t) -2)
484 break;
485
486 src += _mbslen;
487 dst++;
488 }
489
490 _inCount = src - _in;
491 _outCount = dst - _out;
492
493 switch(_mbslen) {
494 case (size_t) -2 :
495 return CS_SOURCE_FEW;
496 case (size_t) -1 :
497 return CS_ILLEGAL_SEQUENCE;
498 }
499 return CS_NOERROR;
500}
501
502int LocaleDecoder::toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc)
503{
504 __DCL_ASSERT(false);
505 return 0;
506}
507
508__DCL_END_NAMESPACE
#define __THIS_FILE__
Definition _trace.h:14
@ CS_BIG_ENDIAN
Definition Charset.h:57
@ CS_LITTLE_ENDIAN
Definition Charset.h:56
__DCL_BEGIN_NAMESPACE typedef uint32_t ucs4_t
Definition Charset.h:29
@ CS_SOURCE_FEW
Definition Charset.h:65
@ CS_NOERROR
Definition Charset.h:61
@ CS_ILLEGAL_UCS2
Definition Charset.h:67
@ CS_ILLEGAL_SEQUENCE
Definition Charset.h:66
#define ILLEGAL_SEQUENCE
#define SOURCE_FEW_N(_mbslen)
#define ILLEGAL_UCS2
#define NULL
Definition Config.h:340
#define __countof(array, type)
Definition Config.h:365
wchar_t char_t
Definition Config.h:275
unsigned char byte_t
Definition Config.h:274
#define __DCL_THROWS1(e)
Definition Config.h:167
#define __DCL_ASSERT(expr)
Definition Object.h:371
#define IMPLEMENT_CLASSINFO(class_name, base_class_name)
Definition Object.h:228
#define __T(str)
Definition Object.h:44
ByteString r
ByteBuffer * buf
static String decode(const char *_mbs, size_t _mbslen=(size_t) -1)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
size_t getDecodedLength(const char *_mbs, size_t _mbslen) __DCL_THROWS1(CharsetConvertException *)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)=0
virtual int decode(const byte_t *_in, size_t &_inCount, wchar_t *_out, size_t &_outCount)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
static String decode(const char *_mbs, size_t _nmbs=(size_t) -1)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)