DCL 3.7.4
Loading...
Searching...
No Matches
Charset.h
Go to the documentation of this file.
1#ifndef __DCL_CHARSET_H__
2#define __DCL_CHARSET_H__ 20071008
3
4#ifndef __DCL_CONFIG_H__
5#include <dcl/Config.h>
6#endif
7
8#ifdef __WINNT__
9 #ifndef _MSC_VER
10 // MinGW
11 #include <wchar.h>
12 #endif
13#elif defined(_AIX) || defined(__sun__)
14 #include <wchar.h>
15#elif defined(__APPLE__)
16 #include <sys/_types/_mbstate_t.h>
17#else
18 #include <bits/types/mbstate_t.h>
19#endif
20
21#ifndef __DCL_OBJECT_H__
22#include <dcl/Object.h>
23#endif
24#ifndef __DCL_EXCEPTION_H__
25#include <dcl/Exception.h>
26#endif
27#ifndef __DCL_STRING_H__
28#include <dcl/String.h>
29#endif
30
31__DCL_BEGIN_NAMESPACE
32
33typedef uint32_t ucs4_t;
34typedef uint16_t utf16_t;
35typedef uint32_t utf32_t;
36
/*
 * Byte-order-mark (BOM) detection helpers.
 *
 * `bom` is any expression that can be indexed with [] and yields byte-sized
 * values (char, signed char, unsigned char / byte_t).  The caller must
 * guarantee that at least 2 (UTF-16), 3 (UTF-8) or 4 (UTF-32) bytes are
 * readable.
 *
 * Each byte is converted to unsigned char and compared with an integer
 * constant.  Comparing against a plain char literal such as '\xef' is wrong
 * for unsigned buffers on platforms where char is signed: the literal is
 * negative, the buffer byte is promoted to a positive int, and the test can
 * never succeed.
 *
 * The argument is parenthesized so that pointer expressions such as
 * IS_UTF8(p + n) expand correctly.  Note that `bom` is evaluated several
 * times, so it must not have side effects.
 *
 * The UTF-32LE BOM (FF FE 00 00) starts with the UTF-16LE BOM (FF FE), so
 * IS_UTF16LE is also true for UTF-32LE data; test IS_UTF32 first when both
 * encodings are possible.
 */
#define IS_UTF8(bom) \
	(((unsigned char)((bom)[0]) == 0xEF) \
		&& ((unsigned char)((bom)[1]) == 0xBB) \
		&& ((unsigned char)((bom)[2]) == 0xBF))
#define IS_UTF16BE(bom) \
	(((unsigned char)((bom)[0]) == 0xFE) \
		&& ((unsigned char)((bom)[1]) == 0xFF))
#define IS_UTF16LE(bom) \
	(((unsigned char)((bom)[0]) == 0xFF) \
		&& ((unsigned char)((bom)[1]) == 0xFE))
#define IS_UTF16(bom) (IS_UTF16BE(bom) || IS_UTF16LE(bom))
#define IS_UTF32BE(bom) \
	(((unsigned char)((bom)[0]) == 0x00) \
		&& ((unsigned char)((bom)[1]) == 0x00) \
		&& ((unsigned char)((bom)[2]) == 0xFE) \
		&& ((unsigned char)((bom)[3]) == 0xFF))
#define IS_UTF32LE(bom) \
	(((unsigned char)((bom)[0]) == 0xFF) \
		&& ((unsigned char)((bom)[1]) == 0xFE) \
		&& ((unsigned char)((bom)[2]) == 0x00) \
		&& ((unsigned char)((bom)[3]) == 0x00))
#define IS_UTF32(bom) (IS_UTF32BE(bom) || IS_UTF32LE(bom))
47
49{
50 CS_LOCALE = (int)0,
51 CS_ASCII, // 7bit, US_ASCII
52 CS_LATIN1, // 8bit, ISO-8859-1
53 CS_UTF8, // http://www.faqs.org/rfcs/rfc3629.html
56};
57
59 // UTF16, UTF32
60 CS_DEFAULT_ENDIAN = __BYTE_ORDER__, // platform dependent default
63};
64
65enum {
67 // encode()
68 CS_ILLEGAL_UCS, // can't UCS4 ==> MB
69 // decode()
70 CS_SOURCE_FEW, // source bytes few
71 CS_ILLEGAL_SEQUENCE, // illegal bytes sequence
72 CS_ILLEGAL_UCS2 // can't UCS4 ==> UCS2 , on sizeof(wchar_t) == 2
73};
74
76{
78public:
79 CharsetConvertException(int _errorCode);
80
81 virtual String toString() const;
82protected:
84};
85
87{
89public:
90 // return: CS_BUFFER_SMALL, CS_ILLEGAL_UCS
91 virtual int encode( // UCS ==> MB, UTF
92 const wchar_t* _in, // in: input wide characters
93 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
94 byte_t* _out, // out: output buffer
95 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
96 );
97
98 size_t getEncodedLength(const wchar_t* _wcs, size_t _wcslen)
100
101 ByteString encode(const wchar_t* _wcs, size_t _wcslen = (size_t)-1)
103
104 ByteString encode(const String& _str)
106
107protected:
109
110 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen) = 0;
111};
112
114{
116public:
117 // return: CS_SOURCE_FEW, CS_ILLEGAL_SEQUENCE, CS_ILLEGAL_UCS2
118 virtual int decode( // MB, UTF ==> UCS
119 const byte_t* _in, // in: input bytes
120 size_t& _inCount, // in: count of input bytes, out: count of processed bytes
121 wchar_t* _out, // out: output buffer
122 size_t& _outCount // in: buffer size (countof(_out[], wchar_t)), out: count of converted wchars
123 );
124
125 size_t getDecodedLength(const char* _mbs, size_t _mbslen)
127
128 String decode(const char* _mbs, size_t _mbslen = (size_t)-1)
130
131 String decode(const ByteString& _str)
133
134protected:
136 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc) = 0;
137};
138
140{
142public:
144 bool _addBOM = false // Byte Order Mark
145 );
146
147 void reset();
148
149 virtual int encode( // UCS ==> MB, UTF
150 const wchar_t* _in, // in: input wide characters
151 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
152 byte_t* _out, // out: output buffer
153 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
154 );
155
156 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
158 {
159 UTF8Encoder encoder(false);
160 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
161 }
162
163 static ByteString encode(const String& _str)
165 {
166 return UTF8Encoder::encode(_str, _str.length());
167 }
168
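169 // Returns the maximum output size, counted in byte_t.
170 // Unicode 5.0: at most 4 bytes per character, plus 3 (presumably for the BOM): countOfWchars * 4 + 3.
171 // NOTE(review): this comment used to say "current implementation countOfWchars * 6 + 3", but the function returns countOfWchars * 4 + 3 - confirm which bound toMultiByte() actually requires.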
172 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars * 4 + 3; }
173
174protected:
175 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
176
177private:
178 bool __addBOM;
179 bool __addedBOM;
180};
181
183{
185public:
186 UTF8Decoder();
187
188 void reset();
189
190 bool hasBOM() const { return __hasBOM; }
191
192 static String decode(const char* _mbs, size_t _mbslen)
194 {
195 UTF8Decoder decoder;
196 return ((CharsetDecoder*)&decoder)->decode(_mbs, _mbslen);
197 }
198
199 static String decode(const ByteString& _str)
201 {
202 return UTF8Decoder::decode(_str, _str.length());
203 }
204
205 // return countof(wchar_t)
206 static size_t maxOutCount(size_t _countOfBytes) { return _countOfBytes; }
207
208protected:
209 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
210
211private:
212 bool __hasBOM; // BOM decoded?
213};
214
216{
218public:
220 bool _addBOM = true, // Byte Order Mark
221 int _byteOrder = CS_DEFAULT_ENDIAN
222 );
223
224 void reset();
225
226 // return countof(utf16_t)
227 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars * 2 + 1; }
228
229 virtual int encode( // UCS ==> MB, UTF
230 const wchar_t* _in, // in: input wide characters
231 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
232 byte_t* _out, // out: output buffer
233 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
234 );
235
236protected:
237 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
238
239private:
240 bool __addBOM;
241 bool __addedBOM;
242 bool __bigEndian;
243};
244
246{
248public:
250 int nDefaultByteOrder = CS_DEFAULT_ENDIAN
251 );
252
253 void reset();
254
255 bool hasBOM() const { return __hasBOM; }
256
257 int byteOrder() const { return __bigEndian ? CS_BIG_ENDIAN : CS_LITTLE_ENDIAN; }
258
259 bool byteOrderChanged() const { return __bigEndian != __defaultBigEndian; }
260
261protected:
262 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
263
264private:
265 bool __hasBOM; // BOM decoded?
266 bool __bigEndian;
267 bool __defaultBigEndian;
268};
269
271{
273public:
275 bool _addBOM = true, // Byte Order Mark
276 int _byteOrder = CS_DEFAULT_ENDIAN
277 );
278
279 void reset();
280
281 virtual int encode( // UCS ==> MB, UTF
282 const wchar_t* _in, // in: input wide characters
283 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
284 byte_t* _out, // out: output buffer
285 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
286 );
287
288 // return countof(utf32_t)
289 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars + 1; }
290
291protected:
292 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
293
294private:
295 bool __addBOM;
296 bool __addedBOM;
297 bool __bigEndian;
298};
299
301{
303public:
305 int nDefaultByteOrder = CS_DEFAULT_ENDIAN
306 );
307
308 void reset();
309
310 bool hasBOM() const { return __hasBOM; }
311
312 int byteOrder() const { return __bigEndian ? CS_BIG_ENDIAN : CS_LITTLE_ENDIAN; }
313
314 bool byteOrderChanged() const { return __bigEndian != __defaultBigEndian; }
315
316protected:
317 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
318
319private:
320 bool __hasBOM; // BOM decoded?
321 bool __bigEndian;
322 bool __defaultBigEndian;
323};
324
326{
328public:
329 AsciiEncoder();
330
331 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
333 {
334 AsciiEncoder encoder;
335 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
336 }
337
338 static ByteString encode(const String& _str)
340 {
341 return AsciiEncoder::encode(_str, _str.length());
342 }
343
344protected:
345 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
346};
347
349{
351public:
352 AsciiDecoder();
353
354protected:
355 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
356
357public:
358 static String decode(const char* _mbs, size_t _mbslen = (size_t)-1);
359};
360
362{
364public:
366
367protected:
368 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
369};
370
372{
374public:
376
377protected:
378 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
379
380public:
381 static String decode(const char* _mbs, size_t _nmbs = (size_t)-1);
382};
383
384// Note: call setlocale(LC_CTYPE, "") first so that the environment's locale is in effect.
385// Locale-dependent encoder.
387{
389public:
391 virtual void reset();
392
393 // return: CS_BUFFER_SMALL, CS_ILLEGAL_UCS
394 virtual int encode( // UCS ==> MB, UTF
395 const wchar_t* _in, // in: input wide characters
396 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
397 byte_t* _out, // out: output buffer
398 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
399 );
400
401 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
403 {
404 LocaleEncoder encoder;
405 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
406 }
407
408 static ByteString encode(const String& _str)
410 {
411 return LocaleEncoder::encode(_str, _str.length());
412 }
413
414protected:
415 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
416
417private:
418 mbstate_t __mbstate;
419};
420
422{
424public:
426 virtual void reset();
427
428 // return: CS_SOURCE_FEW, CS_ILLEGAL_SEQUENCE, CS_ILLEGAL_UCS2
429 virtual int decode( // MB, UTF ==> UCS
430 const byte_t* _in, // in: input bytes
431 size_t& _inCount, // in: count of input bytes, out: count of processed bytes
432 wchar_t* _out, // out: output buffer
433 size_t& _outCount // in: buffer size (countof(_out[], wchar_t)), out: count of converted wchars
434 );
435
436 static String decode(const char* _mbs, size_t _nmbs = (size_t)-1)
438 {
439 LocaleDecoder decoder;
440 return ((CharsetDecoder*)&decoder)->decode(_mbs, _nmbs);
441 }
442
443 static String decode(const ByteString& _str)
445 {
446 return LocaleDecoder::decode(_str, _str.length());
447 }
448
449protected:
450 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
451
452private:
453 mbstate_t __mbstate;
454};
455
456__DCL_END_NAMESPACE
457
458#endif // __DCL_CHARSET_H__
uint32_t utf32_t
Definition Charset.h:35
uint16_t utf16_t
Definition Charset.h:34
UnicodeByteOrder
Definition Charset.h:58
@ CS_DEFAULT_ENDIAN
Definition Charset.h:60
@ CS_BIG_ENDIAN
Definition Charset.h:62
@ CS_LITTLE_ENDIAN
Definition Charset.h:61
Charset
Definition Charset.h:49
@ CS_UTF8
Definition Charset.h:53
@ CS_LATIN1
Definition Charset.h:52
@ CS_UTF32
Definition Charset.h:55
@ CS_UTF16
Definition Charset.h:54
@ CS_ASCII
Definition Charset.h:51
@ CS_LOCALE
Definition Charset.h:50
__DCL_BEGIN_NAMESPACE typedef uint32_t ucs4_t
Definition Charset.h:33
@ CS_SOURCE_FEW
Definition Charset.h:70
@ CS_NOERROR
Definition Charset.h:66
@ CS_ILLEGAL_UCS
Definition Charset.h:68
@ CS_ILLEGAL_UCS2
Definition Charset.h:72
@ CS_ILLEGAL_SEQUENCE
Definition Charset.h:71
#define __ORDER_BIG_ENDIAN__
Definition Config.h:224
#define DCLCAPI
Definition Config.h:95
#define __ORDER_LITTLE_ENDIAN__
Definition Config.h:223
unsigned char byte_t
Definition Config.h:246
#define __DCL_THROWS1(e)
Definition Config.h:152
#define DECLARE_CLASSINFO(class_name)
Definition Object.h:227
static String decode(const char *_mbs, size_t _mbslen=(size_t) -1)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
size_t getDecodedLength(const char *_mbs, size_t _mbslen) __DCL_THROWS1(CharsetConvertException *)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)=0
virtual int decode(const byte_t *_in, size_t &_inCount, wchar_t *_out, size_t &_outCount)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)=0
virtual String toString() const
Definition Exception.cpp:40
Exception(Exception *_cause=NULL)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
static String decode(const char *_mbs, size_t _nmbs=(size_t) -1)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
static ByteString encode(const wchar_t *_wcs, size_t _wcslen) __DCL_THROWS1(CharsetConvertException *)
Definition Charset.h:401
virtual int encode(const wchar_t *_in, size_t &_inCount, byte_t *_out, size_t &_outCount)
static ByteString encode(const String &_str) __DCL_THROWS1(CharsetConvertException *)
Definition Charset.h:408
virtual void reset()
Object()
Definition Object.cpp:183
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)