ReactOS 0.4.15-dev-7834-g00c4b3d
utf16le.cpp
Go to the documentation of this file.
1/*
2 * Usage: utf16le inputfile outputfile [nobom]
3 *
4 * This is a tool and is compiled using the host compiler,
5 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
6 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
7 * to utf-16 LE and especially made for automatic conversions of
8 * INF-files from utf-8 to utf-16LE (so that the INF files can still be
9 * stored as utf-8 in Subversion).
10 *
11 * Author: Matthias Kupfer (mkupfer@reactos.org)
12 */
13
14#include <fstream>
15#include <iostream>
16#include <string.h>
17
18//#define DISPLAY_DETECTED_UNICODE
19
20using namespace std;
21
22#ifdef _MSC_VER
23#define strcasecmp _stricmp
24#endif
25
27{
28public:
29 // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
30 // due to ambiguous BOM
33 enum bom_types { bom, nobom };
34protected:
38 unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
39 std::streamsize fill;
41 static const unsigned char utf8table[64];
42public:
43 utf_converter(string ifname, string ofname, bom_types ofbom = bom, enc_types enc = detect) : error(none), bom_type(ofbom), encoding(enc), fill(0), index(0)
44 {
45 enc_types tmp_enc;
46 inputfile.open(ifname.c_str(), ios::in | ios::binary);
47 if (!inputfile)
48 {
49 error = iopen;
50 return;
51 }
52 outputfile.open(ofname.c_str(), ios::out | ios::binary);
53 if (!outputfile)
54 {
55 error = oopen;
56 return;
57 }
58 tmp_enc = getBOM();
59 if (enc != detect)
60 {
61 if (enc != tmp_enc)
62 cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << endl;
63 }
64 else
65 encoding = tmp_enc;
66 }
// Body of the error accessor: returns the `error` member, which the
// constructor sets to iopen / oopen when a file cannot be opened.
// NOTE(review): the signature line is missing from this listing; the
// cross-reference section at the end gives it as `err_types getError()`.
68 {
69 return error;
70 }
// Body of the BOM sniffer. NOTE(review): the signature line is missing from
// this listing; the cross-reference section gives it as `enc_types getBOM()`.
//
// Reads up to four bytes into `buffer` and compares them against the UTF-8,
// UTF-16BE, UTF-16LE / UTF-32LE and UTF-32BE byte-order marks. Returns the
// matching enc_types value, or utf8 when nothing matches. On a match, `index`
// and `fill` are adjusted so getByte() does not hand the BOM bytes out again.
72 {
73 index = 0;
74 /* first byte can also detect with:
75 if ((buffer[0] & 0x11) || !buffer[0]))
76 valid values are 0xef, 0xff, 0xfe, 0x00
77 */
78 inputfile.read(reinterpret_cast<char*>(&buffer),4);
// NOTE(review): original line 79 is absent from this listing. `fill` is tested
// right below, so it is presumably set from the byte count of the read above
// at that point -- confirm against the real file.
80 // stupid utf8 bom
81 if ((fill > 2) &&
82 (buffer[0] == 0xef) &&
83 (buffer[1] == 0xbb) &&
84 (buffer[2] == 0xbf))
85 {
// skip the three BOM bytes; whatever else is buffered stays available
86 index += 3;
87 fill -=3;
88#ifdef DISPLAY_DETECTED_UNICODE
89 cerr << "UTF-8 BOM found" << endl;
90#endif
91 return utf8;
92 }
// FE FF: UTF-16 big endian
93 if ((fill > 1) &&
94 (buffer[0] == 0xfe) &&
95 (buffer[1] == 0xff))
96 {
97 index += 2;
98 fill -= 2;
99#ifdef DISPLAY_DETECTED_UNICODE
100 cerr << "UTF-16BE BOM found" << endl;
101#endif
102 return utf16be;
103 }
// FF FE: UTF-16 little endian, unless two zero bytes follow
104 if ((fill > 1) &&
105 (buffer[0] == 0xff) &&
106 (buffer[1] == 0xfe))
107 {
// FF FE 00 00 matches both a UTF-32LE BOM and a UTF-16LE BOM followed by
// U+0000; the code chooses UTF-32LE and drops all four buffered bytes.
108 if ((fill == 4) &&
109 (buffer[2] == 0x00) &&
110 (buffer[3] == 0x00))
111 {
112 cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << endl;
113 fill = 0;
114 index = 0;
115 return utf32le;
116 }
117 fill -= 2;
118 index += 2;
119#ifdef DISPLAY_DETECTED_UNICODE
120 cerr << "UTF-16LE BOM found" << endl;
121#endif
122 return utf16le;
123 }
// 00 00 FE FF: UTF-32 big endian; the BOM fills the whole buffer, so it is emptied
124 if ((fill == 4) &&
125 (buffer[0] == 0x00) &&
126 (buffer[1] == 0x00) &&
127 (buffer[2] == 0xfe) &&
128 (buffer[3] == 0xff))
129 {
130 fill = 0;
131 index = 0;
132#ifdef DISPLAY_DETECTED_UNICODE
133 cerr << "UTF-32BE BOM found" << endl;
134#endif
135 return utf32be;
136 }
// No BOM: index is still 0 and fill untouched, so getByte() replays the
// buffered bytes as ordinary data.
137 return utf8; // no valid bom so use utf8 as default
138 }
139 std::streamsize getByte(unsigned char &c)
140 {
141 if (fill)
142 {
143 index %= 4;
144 --fill;
145 c = buffer[index++];
146 return 1;
147 } else
148 {
149 inputfile.read(reinterpret_cast<char*>(&c),1);
150 return inputfile.gcount();
151 }
152 }
153 std::streamsize getWord(unsigned short &w)
154 {
155 unsigned char c[2];
156 if (!getByte(c[0]))
157 return 0;
158 if (!getByte(c[1]))
159 return 1;
160 if (encoding == utf16le)
161 w = c[0] | (c[1] << 8);
162 else
163 w = c[1] | (c[0] << 8);
164 return 2;
165 }
166 std::streamsize getDWord(wchar_t &d)
167 {
168 unsigned char c[4];
169 for (int i=0;i<4;i++)
170 if (!getByte(c[i]))
171 return i;
172 if (encoding == utf32le)
173 d = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24);
174 else
175 d = c[3] | (c[2] << 8) | (c[1] << 16) | (c[0] << 24);
176 return 4;
177 }
178 wchar_t get_wchar_t()
179 {
180 wchar_t ret = (wchar_t)-1;
181 switch (encoding)
182 {
183 case detect: // if still unknown
184 encoding = utf8; // assume utf8 as default
185 case utf8:
186 unsigned char c, tmp;
187 if (!getByte(tmp))
188 return ret;
189 // table for 64 bytes (all 11xxxxxx resp. >=192)
190 // resulting byte is determined:
191 // lower 3 bits: number of following bytes (max.8) 0=error
192 // upper 5 bits: data filled with 0
193 if (tmp & 0x80)
194 {
195 if ((tmp & 0xc0) != 0xc0)
196 {
197 cerr << "UTF-8 Error: invalid data byte" << endl;
198 return ret;
199 }
200 unsigned char i = utf8table[tmp & 0x3f];
201 ret = i >> 3;
202 i &= 7;
203 while (i--)
204 {
205 ret <<= 6;
206 if (!getByte(c))
207 return wchar_t(-1);
208 ret |= c & 0x3f;
209 }
210 return ret;
211 }
212 else
213 return wchar_t(tmp);
214 case utf16le:
215 case utf16be:
216 unsigned short w,w2;
217 if (getWord(w) != 2)
218 return ret;
219 if ((w & 0xfc00) == 0xd800) // high surrogate first
220 {
221 if (getWord(w2) != 2)
222 return ret;
223 if ((w2 & 0xfc00) != 0xdc00)
224 {
225 cerr << "UTF-16 Error: invalid low surrogate" << endl;
226 return ret;
227 }
228 return (((w & 0x3ff) + 0x40) << 10) | (w2 & 0x3ff);
229 }
230 return w;
231 case utf32le:
232 case utf32be:
233 if (getDWord(ret) != 4)
234 return wchar_t (-1);
235 return ret;
236 }
237 return ret;
238 }
240 {
241 unsigned char buffer[2] = { 0xff, 0xfe };
242
243 if (bom_type == bom)
244 {
245 outputfile.write(reinterpret_cast<char*>(&buffer), 2); // write BOM
246 }
247
248 wchar_t c = get_wchar_t();
249
250 while (!inputfile.eof())
251 {
252 buffer[0] = c & 0xff;
253 buffer[1] = (c >> 8) & 0xff; // create utf16-le char
254 outputfile.write(reinterpret_cast<char*>(&buffer),2); // write char
255 c = get_wchar_t();
256 }
257 }
// NOTE(review): presumably the destructor body -- the signature line
// (original line 258) and the statements guarded by the two checks below
// (original lines 261 and 263) are missing from this listing. Each check
// tests one of the two file streams; what is done with the stream when the
// check passes must be confirmed against the real file.
259 {
260 if (inputfile)
262 if (outputfile)
264 }
265};
266
267const unsigned char utf_converter::utf8table[64] = {
2681, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
269129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
2702, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
2713, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
272};
273
274
275int main(int argc, char* argv[])
276{
278
279 if (argc < 3)
280 {
281 cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
282 return -1;
283 }
284
286
287 if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
288 {
289 bom_type = utf_converter::nobom;
290 }
291
292 utf_converter conv(argv[1], argv[2], bom_type);
293
294 if ((err = conv.getError())!=utf_converter::none)
295 {
296 switch (err)
297 {
299 cerr << "Couldn't open input file." << endl;
300 break;
302 cerr << "Couldn't open output file." << endl;
303 break;
304 default:
305 cerr << "Unknown error." << endl;
306 }
307 return -1;
308 }
309 else
310 {
311 conv.convert2utf16le();
312 }
313
314 return 0;
315}
static int argc
Definition: ServiceArgs.c:12
basic_ostream< _CharT, _Traits > &_STLP_CALL endl(basic_ostream< _CharT, _Traits > &__os)
Definition: _ostream.h:357
void open(const char *__s, ios_base::openmode __mod=ios_base::in|ios_base::out)
Definition: _fstream.h:674
void close()
Definition: _fstream.h:681
streamsize gcount() const
Definition: _istream.h:125
_Self & read(char_type *__s, streamsize __n)
Definition: _istream.c:783
_Self & write(const char_type *__s, streamsize __n)
Definition: _ostream.c:430
bool eof() const
Definition: _ios_base.h:173
std::streamsize getWord(unsigned short &w)
Definition: utf16le.cpp:153
enc_types encoding
Definition: utf16le.cpp:36
wchar_t get_wchar_t()
Definition: utf16le.cpp:178
bom_types bom_type
Definition: utf16le.cpp:37
static const unsigned char utf8table[64]
Definition: utf16le.cpp:41
fstream inputfile
Definition: utf16le.cpp:40
utf_converter(string ifname, string ofname, bom_types ofbom=bom, enc_types enc=detect)
Definition: utf16le.cpp:43
std::streamsize getByte(unsigned char &c)
Definition: utf16le.cpp:139
std::streamsize fill
Definition: utf16le.cpp:39
err_types getError()
Definition: utf16le.cpp:67
void convert2utf16le()
Definition: utf16le.cpp:239
unsigned char index
Definition: utf16le.cpp:38
fstream outputfile
Definition: utf16le.cpp:40
err_types error
Definition: utf16le.cpp:35
enc_types getBOM()
Definition: utf16le.cpp:71
std::streamsize getDWord(wchar_t &d)
Definition: utf16le.cpp:166
int main()
Definition: test.c:6
#define strcasecmp
Definition: fake.h:9
GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint GLdouble GLdouble w2
Definition: glext.h:8308
GLuint buffer
Definition: glext.h:5915
const GLubyte * c
Definition: glext.h:8905
GLuint index
Definition: glext.h:6031
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:6102
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248
#define cout
Definition: iostream.cpp:38
#define cerr
Definition: iostream.cpp:39
#define d
Definition: ke_i.h:81
#define c
Definition: ke_i.h:80
#define argv
Definition: mplay32.c:18
Definition: features.h:417
#define err(...)
#define wchar_t
Definition: wchar.h:102
int ret