ReactOS 0.4.15-dev-7918-g2a2556c
utf_converter Class Reference
Collaboration diagram for utf_converter:

Public Types

enum  enc_types {
  detect , utf8 , utf16le , utf16be ,
  utf32le , utf32be
}
 
enum  err_types {
  none , iopen , oopen , eof ,
  read , write , decode
}
 
enum  bom_types { bom , nobom }
 

Public Member Functions

 utf_converter (string ifname, string ofname, bom_types ofbom=bom, enc_types enc=detect)
 
err_types getError ()
 
enc_types getBOM ()
 
std::streamsize getByte (unsigned char &c)
 
std::streamsize getWord (unsigned short &w)
 
std::streamsize getDWord (wchar_t &d)
 
wchar_t get_wchar_t ()
 
void convert2utf16le ()
 
 ~utf_converter ()
 

Protected Attributes

err_types error
 
enc_types encoding
 
bom_types bom_type
 
unsigned char buffer [4]
 
unsigned char index
 
std::streamsize fill
 
fstream inputfile
 
fstream outputfile
 

Static Protected Attributes

static const unsigned char utf8table [64]
 

Detailed Description

Definition at line 26 of file utf16le.cpp.

Member Enumeration Documentation

◆ bom_types

Enumerator
bom 
nobom 

Definition at line 33 of file utf16le.cpp.

◆ enc_types

Enumerator
detect 
utf8 
utf16le 
utf16be 
utf32le 
utf32be 

Definition at line 31 of file utf16le.cpp.

◆ err_types

Enumerator
none 
iopen 
oopen 
eof 
read 
write 
decode 

Definition at line 32 of file utf16le.cpp.

Constructor & Destructor Documentation

◆ utf_converter()

utf_converter::utf_converter ( string  ifname,
string  ofname,
bom_types  ofbom = bom,
enc_types  enc = detect 
)
inline

Definition at line 43 of file utf16le.cpp.

43 : error(none), bom_type(ofbom), encoding(enc), fill(0), index(0)
44 {
45 enc_types tmp_enc;
46 inputfile.open(ifname.c_str(), ios::in | ios::binary);
47 if (!inputfile)
48 {
49 error = iopen;
50 return;
51 }
52 outputfile.open(ofname.c_str(), ios::out | ios::binary);
53 if (!outputfile)
54 {
55 error = oopen;
56 return;
57 }
58 tmp_enc = getBOM();
59 if (enc != detect)
60 {
61 if (enc != tmp_enc)
62 cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << endl;
63 }
64 else
65 encoding = tmp_enc;
66 }
basic_ostream< _CharT, _Traits > &_STLP_CALL endl(basic_ostream< _CharT, _Traits > &__os)
Definition: _ostream.h:357
void open(const char *__s, ios_base::openmode __mod=ios_base::in|ios_base::out)
Definition: _fstream.h:674
const _CharT * c_str() const
Definition: _string.h:949
enc_types encoding
Definition: utf16le.cpp:36
bom_types bom_type
Definition: utf16le.cpp:37
fstream inputfile
Definition: utf16le.cpp:40
std::streamsize fill
Definition: utf16le.cpp:39
unsigned char index
Definition: utf16le.cpp:38
fstream outputfile
Definition: utf16le.cpp:40
err_types error
Definition: utf16le.cpp:35
enc_types getBOM()
Definition: utf16le.cpp:71
#define cerr
Definition: iostream.cpp:39

◆ ~utf_converter()

utf_converter::~utf_converter ( )
inline

Definition at line 258 of file utf16le.cpp.

259 {
260 if (inputfile)
261 inputfile.close();
262 if (outputfile)
263 outputfile.close();
264 }
void close()
Definition: _fstream.h:681

Member Function Documentation

◆ convert2utf16le()

void utf_converter::convert2utf16le ( )
inline

Definition at line 239 of file utf16le.cpp.

240 {
241 unsigned char buffer[2] = { 0xff, 0xfe };
242
243 if (bom_type == bom)
244 {
245 outputfile.write(reinterpret_cast<char*>(&buffer), 2); // write BOM
246 }
247
248 wchar_t c = get_wchar_t();
249
250 while (!inputfile.eof())
251 {
252 buffer[0] = c & 0xff;
253 buffer[1] = (c >> 8) & 0xff; // create utf16-le char
254 outputfile.write(reinterpret_cast<char*>(&buffer),2); // write char
255 c = get_wchar_t();
256 }
257 }
_Self & write(const char_type *__s, streamsize __n)
Definition: _ostream.c:430
bool eof() const
Definition: _ios_base.h:173
wchar_t get_wchar_t()
Definition: utf16le.cpp:178
GLuint buffer
Definition: glext.h:5915
const GLubyte * c
Definition: glext.h:8905

Referenced by main().

◆ get_wchar_t()

wchar_t utf_converter::get_wchar_t ( )
inline

Definition at line 178 of file utf16le.cpp.

179 {
180 wchar_t ret = (wchar_t)-1;
181 switch (encoding)
182 {
183 case detect: // if still unknown
184 encoding = utf8; // assume utf8 as default
185 case utf8:
186 unsigned char c, tmp;
187 if (!getByte(tmp))
188 return ret;
189 // table for 64 bytes (all 11xxxxxx resp. >=192)
190 // resulting byte is determined:
191 // lower 3 bits: number of following bytes (max.8) 0=error
192 // upper 5 bits: data filled with 0
193 if (tmp & 0x80)
194 {
195 if ((tmp & 0xc0) != 0xc0)
196 {
197 cerr << "UTF-8 Error: invalid data byte" << endl;
198 return ret;
199 }
200 unsigned char i = utf8table[tmp & 0x3f];
201 ret = i >> 3;
202 i &= 7;
203 while (i--)
204 {
205 ret <<= 6;
206 if (!getByte(c))
207 return wchar_t(-1);
208 ret |= c & 0x3f;
209 }
210 return ret;
211 }
212 else
213 return wchar_t(tmp);
214 case utf16le:
215 case utf16be:
216 unsigned short w,w2;
217 if (getWord(w) != 2)
218 return ret;
219 if ((w & 0xfc00) == 0xd800) // high surrogate first
220 {
221 if (getWord(w2) != 2)
222 return ret;
223 if ((w2 & 0xfc00) != 0xdc00)
224 {
225 cerr << "UTF-16 Error: invalid low surrogate" << endl;
226 return ret;
227 }
228 return (((w & 0x3ff) + 0x40) << 10) | (w2 & 0x3ff);
229 }
230 return w;
231 case utf32le:
232 case utf32be:
233 if (getDWord(ret) != 4)
234 return wchar_t (-1);
235 return ret;
236 }
237 return ret;
238 }
std::streamsize getWord(unsigned short &w)
Definition: utf16le.cpp:153
static const unsigned char utf8table[64]
Definition: utf16le.cpp:41
std::streamsize getByte(unsigned char &c)
Definition: utf16le.cpp:139
std::streamsize getDWord(wchar_t &d)
Definition: utf16le.cpp:166
GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint GLdouble GLdouble w2
Definition: glext.h:8308
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:6102
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248
#define c
Definition: ke_i.h:80
#define wchar_t
Definition: wchar.h:102
int ret

Referenced by convert2utf16le().

◆ getBOM()

enc_types utf_converter::getBOM ( )
inline

Definition at line 71 of file utf16le.cpp.

72 {
73 index = 0;
74 /* first byte can also detect with:
75 if ((buffer[0] & 0x11) || !buffer[0]))
76 valid values are 0xef, 0xff, 0xfe, 0x00
77 */
78 inputfile.read(reinterpret_cast<char*>(&buffer),4);
79 fill = inputfile.gcount();
80 // stupid utf8 bom
81 if ((fill > 2) &&
82 (buffer[0] == 0xef) &&
83 (buffer[1] == 0xbb) &&
84 (buffer[2] == 0xbf))
85 {
86 index += 3;
87 fill -=3;
88 #ifdef DISPLAY_DETECTED_UNICODE
89 cerr << "UTF-8 BOM found" << endl;
90 #endif
91 return utf8;
92 }
93 if ((fill > 1) &&
94 (buffer[0] == 0xfe) &&
95 (buffer[1] == 0xff))
96 {
97 index += 2;
98 fill -= 2;
99 #ifdef DISPLAY_DETECTED_UNICODE
100 cerr << "UTF-16BE BOM found" << endl;
101 #endif
102 return utf16be;
103 }
104 if ((fill > 1) &&
105 (buffer[0] == 0xff) &&
106 (buffer[1] == 0xfe))
107 {
108 if ((fill == 4) &&
109 (buffer[2] == 0x00) &&
110 (buffer[3] == 0x00))
111 {
112 cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << endl;
113 fill = 0;
114 index = 0;
115 return utf32le;
116 }
117 fill -= 2;
118 index += 2;
119 #ifdef DISPLAY_DETECTED_UNICODE
120 cerr << "UTF-16LE BOM found" << endl;
121 #endif
122 return utf16le;
123 }
124 if ((fill == 4) &&
125 (buffer[0] == 0x00) &&
126 (buffer[1] == 0x00) &&
127 (buffer[2] == 0xfe) &&
128 (buffer[3] == 0xff))
129 {
130 fill = 0;
131 index = 0;
132 #ifdef DISPLAY_DETECTED_UNICODE
133 cerr << "UTF-32BE BOM found" << endl;
134 #endif
135 return utf32be;
136 }
137 return utf8; // no valid bom so use utf8 as default
138 }
streamsize gcount() const
Definition: _istream.h:125
_Self & read(char_type *__s, streamsize __n)
Definition: _istream.c:783
GLuint index
Definition: glext.h:6031

Referenced by utf_converter().

◆ getByte()

std::streamsize utf_converter::getByte ( unsigned char & c)
inline

Definition at line 139 of file utf16le.cpp.

140 {
141 if (fill)
142 {
143 index %= 4;
144 --fill;
145 c = buffer[index++];
146 return 1;
147 } else
148 {
149 inputfile.read(reinterpret_cast<char*>(&c),1);
150 return inputfile.gcount();
151 }
152 }

Referenced by get_wchar_t(), getDWord(), and getWord().

◆ getDWord()

std::streamsize utf_converter::getDWord ( wchar_t & d)
inline

Definition at line 166 of file utf16le.cpp.

167 {
168 unsigned char c[4];
169 for (int i=0;i<4;i++)
170 if (!getByte(c[i]))
171 return i;
172 if (encoding == utf32le)
173 d = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24);
174 else
175 d = c[3] | (c[2] << 8) | (c[1] << 16) | (c[0] << 24);
176 return 4;
177 }
#define d
Definition: ke_i.h:81

Referenced by get_wchar_t().

◆ getError()

err_types utf_converter::getError ( )
inline

Definition at line 67 of file utf16le.cpp.

68 {
69 return error;
70 }

Referenced by main().

◆ getWord()

std::streamsize utf_converter::getWord ( unsigned short & w)
inline

Definition at line 153 of file utf16le.cpp.

154 {
155 unsigned char c[2];
156 if (!getByte(c[0]))
157 return 0;
158 if (!getByte(c[1]))
159 return 1;
160 if (encoding == utf16le)
161 w = c[0] | (c[1] << 8);
162 else
163 w = c[1] | (c[0] << 8);
164 return 2;
165 }

Referenced by get_wchar_t().

Member Data Documentation

◆ bom_type

bom_types utf_converter::bom_type
protected

Definition at line 37 of file utf16le.cpp.

Referenced by convert2utf16le().

◆ buffer

unsigned char utf_converter::buffer[4]
protected

Definition at line 38 of file utf16le.cpp.

◆ encoding

enc_types utf_converter::encoding
protected

Definition at line 36 of file utf16le.cpp.

Referenced by get_wchar_t(), getDWord(), getWord(), and utf_converter().

◆ error

err_types utf_converter::error
protected

Definition at line 35 of file utf16le.cpp.

Referenced by getError(), and utf_converter().

◆ fill

std::streamsize utf_converter::fill
protected

Definition at line 39 of file utf16le.cpp.

Referenced by getBOM(), and getByte().

◆ index

unsigned char utf_converter::index
protected

Definition at line 38 of file utf16le.cpp.

◆ inputfile

fstream utf_converter::inputfile
protected

Definition at line 40 of file utf16le.cpp.

Referenced by convert2utf16le(), getBOM(), getByte(), utf_converter(), and ~utf_converter().

◆ outputfile

fstream utf_converter::outputfile
protected

Definition at line 40 of file utf16le.cpp.

Referenced by convert2utf16le(), utf_converter(), and ~utf_converter().

◆ utf8table

const unsigned char utf_converter::utf8table[64]
staticprotected
Initial value:
= {
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
}

Definition at line 41 of file utf16le.cpp.

Referenced by get_wchar_t().


The documentation for this class was generated from the following file: