DCL 4.1
Loading...
Searching...
No Matches
Charset.h
Go to the documentation of this file.
1#ifndef __DCL_CHARSET_H__
2#define __DCL_CHARSET_H__ 20071008
3
4#ifndef __DCL_CONFIG_H__
5#include <dcl/Config.h>
6#endif
7
8#if __DCL_WINDOWS
9 #ifndef _MSC_VER
10 // MinGW
11 #include <wchar.h>
12 #endif
13#else
14 #include <bits/types/mbstate_t.h>
15#endif
16
17#ifndef __DCL_OBJECT_H__
18#include <dcl/Object.h>
19#endif
20#ifndef __DCL_EXCEPTION_H__
21#include <dcl/Exception.h>
22#endif
23#ifndef __DCL_STRING_H__
24#include <dcl/String.h>
25#endif
26
27__DCL_BEGIN_NAMESPACE
28
29typedef uint32_t ucs4_t; // 32-bit value for one UCS-4 character; the type taken by toMultiByte() and written through by toWideChar() in this header
30typedef uint16_t utf16_t; // 16-bit unit; a maxOutCount() comment in this header counts its result in "countof(utf16_t)"
31typedef uint32_t utf32_t; // 32-bit unit; a maxOutCount() comment in this header counts its result in "countof(utf32_t)"
32
/*
 * Byte-order-mark (BOM) tests.
 *
 * `bom` is any expression that can be indexed with [] and yields byte-sized
 * elements (array or pointer of char, unsigned char, byte_t, ...).
 * The caller must guarantee enough readable bytes:
 *   IS_UTF8            3 bytes  (EF BB BF)
 *   IS_UTF16BE / LE    2 bytes  (FE FF / FF FE)
 *   IS_UTF32BE / LE    4 bytes  (00 00 FE FF / FF FE 00 00)
 *
 * Each byte is converted to unsigned char before it is compared, so the
 * tests also work on plain `char` buffers when char is signed (there a raw
 * 0xEF byte reads as a negative value and would never compare equal to 0xEF).
 * The argument is parenthesised so that expressions such as `p + n` are
 * indexed as a whole.
 *
 * NOTE: `bom` is evaluated several times; do not pass an expression with
 * side effects.
 * NOTE: a UTF-32LE BOM (FF FE 00 00) also satisfies IS_UTF16LE, because the
 * first two bytes are identical; test for UTF-32 before UTF-16 when both
 * encodings are possible.
 */
#define IS_UTF8(bom)    (((unsigned char)(bom)[0] == 0xEF) \
                      && ((unsigned char)(bom)[1] == 0xBB) \
                      && ((unsigned char)(bom)[2] == 0xBF))
#define IS_UTF16BE(bom) (((unsigned char)(bom)[0] == 0xFE) \
                      && ((unsigned char)(bom)[1] == 0xFF))
#define IS_UTF16LE(bom) (((unsigned char)(bom)[0] == 0xFF) \
                      && ((unsigned char)(bom)[1] == 0xFE))
#define IS_UTF16(bom)   (IS_UTF16BE(bom) || IS_UTF16LE(bom))
#define IS_UTF32BE(bom) (((unsigned char)(bom)[0] == 0x00) \
                      && ((unsigned char)(bom)[1] == 0x00) \
                      && ((unsigned char)(bom)[2] == 0xFE) \
                      && ((unsigned char)(bom)[3] == 0xFF))
#define IS_UTF32LE(bom) (((unsigned char)(bom)[0] == 0xFF) \
                      && ((unsigned char)(bom)[1] == 0xFE) \
                      && ((unsigned char)(bom)[2] == 0x00) \
                      && ((unsigned char)(bom)[3] == 0x00))
#define IS_UTF32(bom)   (IS_UTF32BE(bom) || IS_UTF32LE(bom))
42
44{
45 CS_LOCALE = (int)0,
46 CS_ASCII, // 7bit, US_ASCII
47 CS_LATIN1, // 8bit, ISO-8859-1
48 CS_UTF8, // http://www.faqs.org/rfcs/rfc3629.html
51};
52
54 // UTF16, UTF32
55 CS_DEFAULT_ENDIAN = __BYTE_ORDER, // platform dependent default
58};
59
60enum { // status codes for charset conversion; the "return:" comments on encode()/decode() in this header name these values
62 // codes produced by encode()
63 CS_ILLEGAL_UCS, // a UCS4 character cannot be converted to the multibyte target (UCS4 ==> MB)
64 // codes produced by decode()
65 CS_SOURCE_FEW, // too few source bytes
66 CS_ILLEGAL_SEQUENCE, // illegal byte sequence in the source
67 CS_ILLEGAL_UCS2 // a UCS4 character cannot be converted to UCS2; applies when sizeof(wchar_t) == 2
68};
69
71{
73public:
74 CharsetConvertException(int _errorCode);
75 virtual String toString() const;
76protected:
78};
79
81{
83public:
84 // return: CS_BUFFER_SMALL, CS_ILLEGAL_UCS
85 virtual int encode( // UCS ==> MB, UTF
86 const wchar_t* _in, // in: input wide characters
87 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
88 byte_t* _out, // out: output buffer
89 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
90 );
91
92 size_t getEncodedLength(const wchar_t* _wcs, size_t _wcslen)
94
95 ByteString encode(const wchar_t* _wcs, size_t _wcslen = (size_t)-1)
97
98 ByteString encode(const String& _str)
100
101protected:
103 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen) = 0;
104};
105
107{
109public:
110 // return: CS_SOURCE_FEW, CS_ILLEGAL_SEQUENCE, CS_ILLEGAL_UCS2
111 virtual int decode( // MB, UTF ==> UCS
112 const byte_t* _in, // in: input bytes
113 size_t& _inCount, // in: count of input bytes, out: count of processed bytes
114 wchar_t* _out, // out: output buffer
115 size_t& _outCount // in: buffer size (countof(_out[], wchar_t)), out: count of converted wchars
116 );
117
118 size_t getDecodedLength(const char* _mbs, size_t _mbslen)
120
121 String decode(const char* _mbs, size_t _mbslen = (size_t)-1)
123
124 String decode(const ByteString& _str)
126
127protected:
129 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc) = 0;
130};
131
133{
135public:
137 bool _addBOM = false // Byte Order Mark
138 );
139 void reset();
140
141 virtual int encode( // UCS ==> MB, UTF
142 const wchar_t* _in, // in: input wide characters
143 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
144 byte_t* _out, // out: output buffer
145 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
146 );
147
148 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
150 {
151 UTF8Encoder encoder(false);
152 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
153 }
154
155 static ByteString encode(const String& _str)
157 {
158 return UTF8Encoder::encode(_str, _str.length());
159 }
160
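161 // return countof(byte_t)
162 // Unicode 5.0 : countOfWchars * 4 + 3 (at most 4 bytes per character, plus a 3-byte BOM)
163 // NOTE(review): this comment previously stated "current implementation countOfWchars * 6 + 3", but the function below returns countOfWchars * 4 + 3 -- confirm which bound the encoder actually needs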
164 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars * 4 + 3; }
165
166protected:
167 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
168
169private:
170 bool __addBOM;
171 bool __addedBOM;
172};
173
175{
177public:
178 UTF8Decoder();
179 void reset();
180 bool hasBOM() const { return __hasBOM; }
181
182 static String decode(const char* _mbs, size_t _mbslen)
184 {
185 UTF8Decoder decoder;
186 return ((CharsetDecoder*)&decoder)->decode(_mbs, _mbslen);
187 }
188
189 static String decode(const ByteString& _str)
191 {
192 return UTF8Decoder::decode(_str, _str.length());
193 }
194
195 // return countof(wchar_t)
196 static size_t maxOutCount(size_t _countOfBytes) { return _countOfBytes; }
197
198protected:
199 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
200
201private:
202 bool __hasBOM; // BOM decoded?
203};
204
206{
208public:
210 bool _addBOM = true, // Byte Order Mark
211 int _byteOrder = CS_DEFAULT_ENDIAN
212 );
213 void reset();
214
215 // return countof(utf16_t)
216 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars * 2 + 1; }
217
218 virtual int encode( // UCS ==> MB, UTF
219 const wchar_t* _in, // in: input wide characters
220 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
221 byte_t* _out, // out: output buffer
222 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
223 );
224
225protected:
226 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
227
228private:
229 bool __addBOM;
230 bool __addedBOM;
231 bool __bigEndian;
232};
233
235{
237public:
239 int nDefaultByteOrder = CS_DEFAULT_ENDIAN
240 );
241 void reset();
242 bool hasBOM() const { return __hasBOM; }
243 int byteOrder() const { return __bigEndian ? CS_BIG_ENDIAN : CS_LITTLE_ENDIAN; }
244 bool byteOrderChanged() const { return __bigEndian != __defaultBigEndian; }
245
246protected:
247 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
248
249private:
250 bool __hasBOM; // BOM decoded?
251 bool __bigEndian;
252 bool __defaultBigEndian;
253};
254
256{
258public:
260 bool _addBOM = true, // Byte Order Mark
261 int _byteOrder = CS_DEFAULT_ENDIAN
262 );
263 void reset();
264
265 virtual int encode( // UCS ==> MB, UTF
266 const wchar_t* _in, // in: input wide characters
267 size_t& _inCount, // in: count of input wchars, out: count of processed wchars
268 byte_t* _out, // out: output buffer
269 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
270 );
271
272 // return countof(utf32_t)
273 static size_t maxOutCount(size_t countOfWchars) { return countOfWchars + 1; }
274
275protected:
276 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
277
278private:
279 bool __addBOM;
280 bool __addedBOM;
281 bool __bigEndian;
282};
283
285{
287public:
289 int nDefaultByteOrder = CS_DEFAULT_ENDIAN
290 );
291 void reset();
292 bool hasBOM() const { return __hasBOM; }
293 int byteOrder() const { return __bigEndian ? CS_BIG_ENDIAN : CS_LITTLE_ENDIAN; }
294 bool byteOrderChanged() const { return __bigEndian != __defaultBigEndian; }
295
296protected:
297 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
298
299private:
300 bool __hasBOM; // BOM decoded?
301 bool __bigEndian;
302 bool __defaultBigEndian;
303};
304
306{
308public:
309 AsciiEncoder();
310
311 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
313 {
314 AsciiEncoder encoder;
315 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
316 }
317
318 static ByteString encode(const String& _str)
320 {
321 return AsciiEncoder::encode(_str, _str.length());
322 }
323
324protected:
325 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
326};
327
329{
331public:
332 AsciiDecoder();
333
334protected:
335 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
336
337public:
338 static String decode(const char* _mbs, size_t _mbslen = (size_t)-1);
339};
340
342{
344public:
346
347protected:
348 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
349};
350
352{
354public:
356
357protected:
358 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
359
360public:
361 static String decode(const char* _mbs, size_t _nmbs = (size_t)-1);
362};
363
364// note: setlocale(LC_CTYPE, "");
365// locale dependent encoder
367{
369public:
371 virtual void reset();
372
373 // return: CS_BUFFER_SMALL, CS_ILLEGAL_UCS
374 virtual int encode( // UCS ==> MB, UTF
375 const wchar_t* _in, // in: input wide characters
376 size_t& _inCount , // in: count of input wchars, out: count of processed wchars
377 byte_t* _out, // out: output buffer
378 size_t& _outCount // in: buffer size (countof(_out[], byte_t), out: count of converted bytes
379 );
380
381 static ByteString encode(const wchar_t* _wcs, size_t _wcslen)
383 {
384 LocaleEncoder encoder;
385 return ((CharsetEncoder*)&encoder)->encode(_wcs, _wcslen);
386 }
387
388 static ByteString encode(const String& _str)
390 {
391 return LocaleEncoder::encode(_str, _str.length());
392 }
393
394protected:
395 virtual int toMultiByte(ucs4_t _uc, byte_t* _mbs, size_t _mbslen);
396
397private:
398 mbstate_t __mbstate;
399};
400
402{
404public:
406 virtual void reset();
407
408 // return: CS_SOURCE_FEW, CS_ILLEGAL_SEQUENCE, CS_ILLEGAL_UCS2
409 virtual int decode( // MB, UTF ==> UCS
410 const byte_t* _in, // in: input bytes
411 size_t& _inCount, // in: count of input bytes, out: count of processed bytes
412 wchar_t* _out, // out: output buffer
413 size_t& _outCount // in: buffer size (countof(_out[], wchar_t)), out: count of converted wchars
414 );
415
416 static String decode(const char* _mbs, size_t _nmbs = (size_t)-1)
418 {
419 LocaleDecoder decoder;
420 return ((CharsetDecoder*)&decoder)->decode(_mbs, _nmbs);
421 }
422
423 static String decode(const ByteString& _str)
425 {
426 return LocaleDecoder::decode(_str, _str.length());
427 }
428
429protected:
430 virtual int toWideChar(const byte_t* _mbs, size_t _mbslen, ucs4_t* _uc);
431
432private:
433 mbstate_t __mbstate;
434};
435
436__DCL_END_NAMESPACE
437
438#endif // __DCL_CHARSET_H__
uint32_t utf32_t
Definition Charset.h:31
uint16_t utf16_t
Definition Charset.h:30
UnicodeByteOrder
Definition Charset.h:53
@ CS_DEFAULT_ENDIAN
Definition Charset.h:55
@ CS_BIG_ENDIAN
Definition Charset.h:57
@ CS_LITTLE_ENDIAN
Definition Charset.h:56
Charset
Definition Charset.h:44
@ CS_UTF8
Definition Charset.h:48
@ CS_LATIN1
Definition Charset.h:47
@ CS_UTF32
Definition Charset.h:50
@ CS_UTF16
Definition Charset.h:49
@ CS_ASCII
Definition Charset.h:46
@ CS_LOCALE
Definition Charset.h:45
__DCL_BEGIN_NAMESPACE typedef uint32_t ucs4_t
Definition Charset.h:29
@ CS_SOURCE_FEW
Definition Charset.h:65
@ CS_NOERROR
Definition Charset.h:61
@ CS_ILLEGAL_UCS
Definition Charset.h:63
@ CS_ILLEGAL_UCS2
Definition Charset.h:67
@ CS_ILLEGAL_SEQUENCE
Definition Charset.h:66
#define DCLCAPI
Definition Config.h:100
unsigned char byte_t
Definition Config.h:274
#define __BIG_ENDIAN
Definition Config.h:242
#define __LITTLE_ENDIAN
Definition Config.h:241
#define __DCL_THROWS1(e)
Definition Config.h:167
#define DECLARE_CLASSINFO(class_name)
Definition Object.h:210
static String decode(const char *_mbs, size_t _mbslen=(size_t) -1)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
size_t getDecodedLength(const char *_mbs, size_t _mbslen) __DCL_THROWS1(CharsetConvertException *)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)=0
virtual int decode(const byte_t *_in, size_t &_inCount, wchar_t *_out, size_t &_outCount)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)=0
virtual String toString() const
Definition Exception.cpp:40
Exception(Exception *_cause=NULL)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
static String decode(const char *_mbs, size_t _nmbs=(size_t) -1)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
static ByteString encode(const wchar_t *_wcs, size_t _wcslen) __DCL_THROWS1(CharsetConvertException *)
Definition Charset.h:381
virtual int encode(const wchar_t *_in, size_t &_inCount, byte_t *_out, size_t &_outCount)
static ByteString encode(const String &_str) __DCL_THROWS1(CharsetConvertException *)
Definition Charset.h:388
virtual void reset()
Object()
Definition Object.cpp:183
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)
virtual int toWideChar(const byte_t *_mbs, size_t _mbslen, ucs4_t *_uc)
virtual int toMultiByte(ucs4_t _uc, byte_t *_mbs, size_t _mbslen)