DCL 3.7.6
Loading...
Searching...
No Matches
Charset.h
Go to the documentation of this file.
1#ifndef __DCL_CHARSET_H__
2#define __DCL_CHARSET_H__ 20071008
3
4#ifndef __DCL_CONFIG_H__
5#include <dcl/Config.h>
6#endif
7
8#ifdef __WINNT__
9 #ifndef _MSC_VER
10 // MinGW
11 #include <wchar.h>
12 #endif
13#elif defined(_AIX) || defined(__sun__) || defined(__FreeBSD__)
14 #include <wchar.h>
15#elif defined(__APPLE__)
16 #include <sys/_types/_mbstate_t.h>
17#elif defined(__ANDROID__)
18 #include <bits/mbstate_t.h>
19#elif !defined(__DEFINED_mbstate_t) && defined(__linux__)
20 #include <bits/types/mbstate_t.h>
21#endif
22
23#ifndef __DCL_OBJECT_H__
24#include <dcl/Object.h>
25#endif
26#ifndef __DCL_EXCEPTION_H__
27#include <dcl/Exception.h>
28#endif
29#ifndef __DCL_STRING_H__
30#include <dcl/String.h>
31#endif
32
33__DCL_BEGIN_NAMESPACE
34
35typedef uint32_t ucs4_t;
36typedef uint16_t utf16_t;
37typedef uint32_t utf32_t;
38
39#define IS_UTF8(bom) ((bom[0] == '\xef') && (bom[1] == '\xbb') \
40 && (bom[2] == '\xbf'))
41#define IS_UTF16BE(bom) ((bom[0] == '\xfe') && (bom[1] == '\xff'))
42#define IS_UTF16LE(bom) ((bom[0] == '\xff') && (bom[1] == '\xfe'))
43#define IS_UTF16(bom) (IS_UTF16BE(bom) || IS_UTF16LE(bom))
44#define IS_UTF32BE(bom) ((bom[0] == '\x00') && (bom[1] == '\x00') \
45 && (bom[2] == '\xfe') && (bom[3] == '\xff'))
46#define IS_UTF32LE(bom) ((bom[0] == '\xff') && (bom[1] == '\xfe') \
47 && (bom[2] == '\x00') && (bom[3] == '\x00'))
48#define IS_UTF32(bom) (IS_UTF32BE(bom) || IS_UTF32LE(bom))
49
51{
52 CS_LOCALE = (int)0,
53 CS_ASCII, // 7bit, US_ASCII
54 CS_LATIN1, // 8bit, ISO-8859-1
55 CS_UTF8, // http://www.faqs.org/rfcs/rfc3629.html
58};
59
61 // UTF16, UTF32
62 CS_DEFAULT_ENDIAN = __BYTE_ORDER__, // platform dependent default
65};
66
67enum {
69 // encode()
70 CS_ILLEGAL_UCS, // can't UCS4 ==> MB
71 // decode()
72 CS_SOURCE_FEW, // source bytes few
73 CS_ILLEGAL_SEQUENCE, // illegal bytes sequence
74 CS_ILLEGAL_UCS2 // can't UCS4 ==> UCS2 , on sizeof(wchar_t) == 2
75};
76
78{
80public:
81 CharsetConvertException(int _errorCode);
82
83 virtual String toString() const;
84protected:
86};
87
89{
91public:
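92 // return: CS_BUFFER_SMALL, CS_ILLEGAL_UCS (NOTE(review): CS_BUFFER_SMALL is not among the error codes visible in this header's enum - confirm)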
93 virtual int encode( // UCS ==> MB, UTF
94 const wchar_t* _in, // in: input wide characters
95 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
96 byte_t* _out, // out: output buffer
97 size_t& _outCount // in: buffer size (countof(_out[], byte_t)), out: count of converted bytes
98 );
99
100 size_t getEncodedLength(const wchar_t* _wcs, size_t _wcslen)
102
103 ByteString encode(const wchar_t* _wcs, size_t _wcslen = (size_t)-1)
105
106 ByteString encode(const String& _str)
108
109protected:
111
112 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen) = 0;
113};
114
116{
118public:
119 // return: CS_SOURCE_FEW, CS_ILLEGAL_SEQUENCE, CS_ILLEGAL_UCS2
120 virtual int decode( // MB, UTF ==> UCS
121 const byte_t* _in, // in: input bytes
122 size_t& _inCount, // in: count of input bytes, out: count of processed bytes
123 wchar_t* _out, // out: output buffer
124 size_t& _outCount // in: buffer size (countof(_out[], wchar_t)), out: count of converted wchars
125 );
126
127 size_t getDecodedLength(const char* _mbs, size_t _mbslen)
129
130 String decode(const char* _mbs, size_t _mbslen = (size_t)-1)
132
133 String decode(const ByteString& _str)
135
136protected:
138 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc) = 0;
139};
140
142{
144public:
146 bool _addBOM = false // Byte Order Mark
147 );
148
149 void reset();
150
151 virtual int encode( // UCS ==> MB, UTF
152 const wchar_t* _in, // in: input wide characters
153 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
154 byte_t* _out, // out: output buffer
155 size_t& _outCount // in: buffer size (countof(_out[], byte_t)), out: count of converted bytes
156 );
157
158 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
160 {
161 UTF8Encoder encoder(false);
162 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
163 }
164
165 static ByteString encode(const String& _str)
167 {
168 return UTF8Encoder::encode(_str, _str.length());
169 }
170
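171 // return countof(byte_t): upper bound on the number of output bytes
172 // Unicode 5.0 : countOfWchars * 4 + 3 (UTF-8 is at most 4 bytes per code point per RFC 3629; the +3 is presumably the 3-byte BOM)
173 // current implementation countOfWchars * 6 + 3 -- NOTE(review): the function below returns countOfWchars * 4 + 3; confirm which bound toMultiByte() actually needs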
174 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars * 4 + 3; }
175
176protected:
177 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
178
179private:
180 bool __addBOM;
181 bool __addedBOM;
182};
183
185{
187public:
188 UTF8Decoder();
189
190 void reset();
191
192 bool hasBOM() const { return __hasBOM; }
193
194 static String decode(const char* _mbs, size_t _mbslen)
196 {
197 UTF8Decoder decoder;
198 return ((CharsetDecoder*)&decoder)->decode(_mbs, _mbslen);
199 }
200
201 static String decode(const ByteString& _str)
203 {
204 return UTF8Decoder::decode(_str, _str.length());
205 }
206
207 // return countof(wchar_t)
208 static size_t maxOutCount(size_t _countOfBytes) { return _countOfBytes; }
209
210protected:
211 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
212
213private:
214 bool __hasBOM; // BOM decoded?
215};
216
218{
220public:
222 bool _addBOM = true, // Byte Order Mark
223 int _byteOrder = CS_DEFAULT_ENDIAN
224 );
225
226 void reset();
227
228 // return countof(utf16_t)
229 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars * 2 + 1; }
230
231 virtual int encode( // UCS ==> MB, UTF
232 const wchar_t* _in, // in: input wide characters
233 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
234 byte_t* _out, // out: output buffer
235 size_t& _outCount // in: buffer size (countof(_out[], byte_t)), out: count of converted bytes
236 );
237
238protected:
239 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
240
241private:
242 bool __addBOM;
243 bool __addedBOM;
244 bool __bigEndian;
245};
246
248{
250public:
252 int nDefaultByteOrder = CS_DEFAULT_ENDIAN
253 );
254
255 void reset();
256
257 bool hasBOM() const { return __hasBOM; }
258
259 int byteOrder() const { return __bigEndian ? CS_BIG_ENDIAN : CS_LITTLE_ENDIAN; }
260
261 bool byteOrderChanged() const { return __bigEndian != __defaultBigEndian; }
262
263protected:
264 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
265
266private:
267 bool __hasBOM; // BOM decoded?
268 bool __bigEndian;
269 bool __defaultBigEndian;
270};
271
273{
275public:
277 bool _addBOM = true, // Byte Order Mark
278 int _byteOrder = CS_DEFAULT_ENDIAN
279 );
280
281 void reset();
282
283 virtual int encode( // UCS ==> MB, UTF
284 const wchar_t* _in, // in: input wide characters
285 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
286 byte_t* _out, // out: output buffer
287 size_t& _outCount // in: buffer size (countof(_out[], byte_t)), out: count of converted bytes
288 );
289
290 // return countof(utf32_t)
291 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars + 1; }
292
293protected:
294 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
295
296private:
297 bool __addBOM;
298 bool __addedBOM;
299 bool __bigEndian;
300};
301
303{
305public:
307 int nDefaultByteOrder = CS_DEFAULT_ENDIAN
308 );
309
310 void reset();
311
312 bool hasBOM() const { return __hasBOM; }
313
314 int byteOrder() const { return __bigEndian ? CS_BIG_ENDIAN : CS_LITTLE_ENDIAN; }
315
316 bool byteOrderChanged() const { return __bigEndian != __defaultBigEndian; }
317
318protected:
319 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
320
321private:
322 bool __hasBOM; // BOM decoded?
323 bool __bigEndian;
324 bool __defaultBigEndian;
325};
326
328{
330public:
331 AsciiEncoder();
332
333 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
335 {
336 AsciiEncoder encoder;
337 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
338 }
339
340 static ByteString encode(const String& _str)
342 {
343 return AsciiEncoder::encode(_str, _str.length());
344 }
345
346protected:
347 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
348};
349
351{
353public:
354 AsciiDecoder();
355
356protected:
357 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
358
359public:
360 static String decode(const char* _mbs, size_t _mbslen = (size_t)-1);
361};
362
364{
366public:
368
369protected:
370 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
371};
372
374{
376public:
378
379protected:
380 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
381
382public:
383 static String decode(const char* _mbs, size_t _nmbs = (size_t)-1);
384};
385
386// note: presumably requires a prior setlocale(LC_CTYPE, ""); call
387// locale dependent encoder
389{
391public:
393 virtual void reset();
394
395 // return: CS_BUFFER_SMALL, CS_ILLEGAL_UCS
396 virtual int encode( // UCS ==> MB, UTF
397 const wchar_t* _in, // in: input wide characters
398 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
399 byte_t* _out, // out: output buffer
400 size_t& _outCount // in: buffer size (countof(_out[], byte_t)), out: count of converted bytes
401 );
402
403 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
405 {
406 LocaleEncoder encoder;
407 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
408 }
409
410 static ByteString encode(const String& _str)
412 {
413 return LocaleEncoder::encode(_str, _str.length());
414 }
415
416protected:
417 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
418
419private:
420 mbstate_t __mbstate;
421};
422
424{
426public:
428 virtual void reset();
429
430 // return: CS_SOURCE_FEW, CS_ILLEGAL_SEQUENCE, CS_ILLEGAL_UCS2
431 virtual int decode( // MB, UTF ==> UCS
432 const byte_t* _in, // in: input bytes
433 size_t& _inCount, // in: count of input bytes, out: count of processed bytes
434 wchar_t* _out, // out: output buffer
435 size_t& _outCount // in: buffer size (countof(_out[], wchar_t)), out: count of converted wchars
436 );
437
438 static String decode(const char* _mbs, size_t _nmbs = (size_t)-1)
440 {
441 LocaleDecoder decoder;
442 return ((CharsetDecoder*)&decoder)->decode(_mbs, _nmbs);
443 }
444
445 static String decode(const ByteString& _str)
447 {
448 return LocaleDecoder::decode(_str, _str.length());
449 }
450
451protected:
452 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
453
454private:
455 mbstate_t __mbstate;
456};
457
458__DCL_END_NAMESPACE
459
460#endif // __DCL_CHARSET_H__
uint32_t utf32_t
Definition Charset.h:37
uint16_t utf16_t
Definition Charset.h:36
UnicodeByteOrder
Definition Charset.h:60
@ CS_DEFAULT_ENDIAN
Definition Charset.h:62
@ CS_BIG_ENDIAN
Definition Charset.h:64
@ CS_LITTLE_ENDIAN
Definition Charset.h:63
Charset
Definition Charset.h:51
@ CS_UTF8
Definition Charset.h:55
@ CS_LATIN1
Definition Charset.h:54
@ CS_UTF32
Definition Charset.h:57
@ CS_UTF16
Definition Charset.h:56
@ CS_ASCII
Definition Charset.h:53
@ CS_LOCALE
Definition Charset.h:52
__DCL_BEGIN_NAMESPACE typedef uint32_t ucs4_t
Definition Charset.h:35
@ CS_SOURCE_FEW
Definition Charset.h:72
@ CS_NOERROR
Definition Charset.h:68
@ CS_ILLEGAL_UCS
Definition Charset.h:70
@ CS_ILLEGAL_UCS2
Definition Charset.h:74
@ CS_ILLEGAL_SEQUENCE
Definition Charset.h:73
#define __ORDER_BIG_ENDIAN__
Definition Config.h:228
#define DCLCAPI
Definition Config.h:95
#define __ORDER_LITTLE_ENDIAN__
Definition Config.h:227
unsigned char byte_t
Definition Config.h:250
#define __DCL_THROWS1(e)
Definition Config.h:152
#define DECLARE_CLASSINFO(class_name)
Definition Object.h:229
static String decode(const char *_mbs, size_t _mbslen=(size_t) -1)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
size_t getDecodedLength(const char *_mbs, size_t _mbslen) __DCL_THROWS1(CharsetConvertException *)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)=0
virtual int decode(const byte_t *_in, size_t &_inCount, wchar_t *_out, size_t &_outCount)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)=0
virtual String toString() const
Definition Exception.cpp:40
Exception(Exception *_cause=NULL)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
static String decode(const char *_mbs, size_t _nmbs=(size_t) -1)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
static ByteString encode(const wchar_t *_wcs, size_t _wcslen) __DCL_THROWS1(CharsetConvertException *)
Definition Charset.h:403
virtual int encode(const wchar_t *_in, size_t &_inCount, byte_t *_out, size_t &_outCount)
static ByteString encode(const String &_str) __DCL_THROWS1(CharsetConvertException *)
Definition Charset.h:410
virtual void reset()
Object()
Definition Object.cpp:183
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)