ReactOS  0.4.15-dev-499-g1f31905
utf16le.cpp
Go to the documentation of this file.
1 /*
2  * Usage: utf16le inputfile outputfile
3  *
4  * This is a tool and is compiled using the host compiler,
5  * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
6  * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
7  * to utf-16 LE and especially made for automatic conversions of
8  * INF-files from utf-8 to utf-16LE (so we can furthermore
9  * store the INF files in utf-8 for subversion).
10  *
11  * Author: Matthias Kupfer (mkupfer@reactos.org)
12  */
13 
14 #include <fstream>
15 #include <iostream>
16 #include <string.h>
17 
18 //#define DISPLAY_DETECTED_UNICODE
19 
20 using namespace std;
21 
22 #ifdef _MSC_VER
23 #define strcasecmp _stricmp
24 #endif
25 
27 {
28 public:
29  // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
30  // due to ambiguous BOM
31  enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
32  enum err_types { none, iopen, oopen, eof, read, write, decode };
33  enum bom_types { bom, nobom };
34 protected:
38  unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
40  fstream inputfile,outputfile;
41  static const unsigned char utf8table[64];
42 public:
43  utf_converter(string ifname, string ofname, bom_types ofbom = bom, enc_types enc = detect) : error(none), bom_type(ofbom), encoding(enc), fill(0), index(0)
44  {
45  enc_types tmp_enc;
46  inputfile.open(ifname.c_str(), ios::in | ios::binary);
47  if (!inputfile)
48  {
49  error = iopen;
50  return;
51  }
52  outputfile.open(ofname.c_str(), ios::out | ios::binary);
53  if (!outputfile)
54  {
55  error = oopen;
56  return;
57  }
58  tmp_enc = getBOM();
59  if (enc != detect)
60  {
61  if (enc != tmp_enc)
62  cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << endl;
63  }
64  else
65  encoding = tmp_enc;
66  }
68  {
69  return error;
70  }
72  {
73  index = 0;
74  /* first byte can also detect with:
75  if ((buffer[0] & 0x11) || !buffer[0]))
76  valid values are 0xef, 0xff, 0xfe, 0x00
77  */
78  inputfile.read(reinterpret_cast<char*>(&buffer),4);
79  fill = inputfile.gcount();
80  // stupid utf8 bom
81  if ((fill > 2) &&
82  (buffer[0] == 0xef) &&
83  (buffer[1] == 0xbb) &&
84  (buffer[2] == 0xbf))
85  {
86  index += 3;
87  fill -=3;
88 #ifdef DISPLAY_DETECTED_UNICODE
89  cerr << "UTF-8 BOM found" << endl;
90 #endif
91  return utf8;
92  }
93  if ((fill > 1) &&
94  (buffer[0] == 0xfe) &&
95  (buffer[1] == 0xff))
96  {
97  index += 2;
98  fill -= 2;
99 #ifdef DISPLAY_DETECTED_UNICODE
100  cerr << "UTF-16BE BOM found" << endl;
101 #endif
102  return utf16be;
103  }
104  if ((fill > 1) &&
105  (buffer[0] == 0xff) &&
106  (buffer[1] == 0xfe))
107  {
108  if ((fill == 4) &&
109  (buffer[2] == 0x00) &&
110  (buffer[3] == 0x00))
111  {
112  cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << endl;
113  fill = 0;
114  index = 0;
115  return utf32le;
116  }
117  fill -= 2;
118  index += 2;
119 #ifdef DISPLAY_DETECTED_UNICODE
120  cerr << "UTF-16LE BOM found" << endl;
121 #endif
122  return utf16le;
123  }
124  if ((fill == 4) &&
125  (buffer[0] == 0x00) &&
126  (buffer[1] == 0x00) &&
127  (buffer[2] == 0xfe) &&
128  (buffer[3] == 0xff))
129  {
130  fill = 0;
131  index = 0;
132 #ifdef DISPLAY_DETECTED_UNICODE
133  cerr << "UTF-32BE BOM found" << endl;
134 #endif
135  return utf32be;
136  }
137  return utf8; // no valid bom so use utf8 as default
138  }
139  std::streamsize getByte(unsigned char &c)
140  {
141  if (fill)
142  {
143  index %= 4;
144  --fill;
145  c = buffer[index++];
146  return 1;
147  } else
148  {
149  inputfile.read(reinterpret_cast<char*>(&c),1);
150  return inputfile.gcount();
151  }
152  }
153  std::streamsize getWord(unsigned short &w)
154  {
155  unsigned char c[2];
156  if (!getByte(c[0]))
157  return 0;
158  if (!getByte(c[1]))
159  return 1;
160  if (encoding == utf16le)
161  w = c[0] | (c[1] << 8);
162  else
163  w = c[1] | (c[0] << 8);
164  return 2;
165  }
167  {
168  unsigned char c[4];
169  for (int i=0;i<4;i++)
170  if (!getByte(c[i]))
171  return i;
172  if (encoding == utf32le)
173  d = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24);
174  else
175  d = c[3] | (c[2] << 8) | (c[1] << 16) | (c[0] << 24);
176  return 4;
177  }
178  wchar_t get_wchar_t()
179  {
180  wchar_t ret = (wchar_t)-1;
181  switch (encoding)
182  {
183  case detect: // if still unknwon
184  encoding = utf8; // assume utf8 as default
185  case utf8:
186  unsigned char c, tmp;
187  if (!getByte(tmp))
188  return ret;
189  // table for 64 bytes (all 11xxxxxx resp. >=192)
190  // resulting byte is determined:
191  // lower 3 bits: number of following bytes (max.8) 0=error
192  // upper 5 bits: data filled with 0
193  if (tmp & 0x80)
194  {
195  if ((tmp & 0xc0) != 0xc0)
196  {
197  cerr << "UTF-8 Error: invalid data byte" << endl;
198  return ret;
199  }
200  unsigned char i = utf8table[tmp & 0x3f];
201  ret = i >> 3;
202  i &= 7;
203  while (i--)
204  {
205  ret <<= 6;
206  if (!getByte(c))
207  return wchar_t(-1);
208  ret |= c & 0x3f;
209  }
210  return ret;
211  }
212  else
213  return wchar_t(tmp);
214  case utf16le:
215  case utf16be:
216  unsigned short w,w2;
217  if (getWord(w) != 2)
218  return ret;
219  if ((w & 0xfc00) == 0xd800) // high surrogate first
220  {
221  if (getWord(w2) != 2)
222  return ret;
223  if ((w2 & 0xfc00) != 0xdc00)
224  {
225  cerr << "UTF-16 Error: invalid low surrogate" << endl;
226  return ret;
227  }
228  return (((w & 0x3ff) + 0x40) << 10) | (w2 & 0x3ff);
229  }
230  return w;
231  case utf32le:
232  case utf32be:
233  if (getDWord(ret) != 4)
234  return wchar_t (-1);
235  return ret;
236  }
237  return ret;
238  }
240  {
241  unsigned char buffer[2] = { 0xff, 0xfe };
242 
243  if (bom_type == bom)
244  {
245  outputfile.write(reinterpret_cast<char*>(&buffer), 2); // write BOM
246  }
247 
248  wchar_t c = get_wchar_t();
249 
250  while (!inputfile.eof())
251  {
252  buffer[0] = c & 0xff;
253  buffer[1] = (c >> 8) & 0xff; // create utf16-le char
254  outputfile.write(reinterpret_cast<char*>(&buffer),2); // write char
255  c = get_wchar_t();
256  }
257  }
259  {
260  if (inputfile)
261  inputfile.close();
262  if (outputfile)
263  outputfile.close();
264  }
265 };
266 
267 const unsigned char utf_converter::utf8table[64] = {
268 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
269 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
270 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
271 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
272 };
273 
274 
275 int main(int argc, char* argv[])
276 {
278 
279  if (argc < 3)
280  {
281  cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
282  return -1;
283  }
284 
286 
287  if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
288  {
289  bom_type = utf_converter::nobom;
290  }
291 
292  utf_converter conv(argv[1], argv[2], bom_type);
293 
294  if ((err = conv.getError())!=utf_converter::none)
295  {
296  switch (err)
297  {
299  cerr << "Couldn't open input file." << endl;
300  break;
302  cerr << "Couldn't open output file." << endl;
303  break;
304  default:
305  cerr << "Unknown error." << endl;
306  }
307  return -1;
308  }
309  else
310  {
311  conv.convert2utf16le();
312  }
313 
314  return 0;
315 }
static int argc
Definition: ServiceArgs.c:12
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:6102
#define error(str)
Definition: mkdosfs.c:1605
#define strcasecmp
Definition: fake.h:9
static const unsigned char utf8table[64]
Definition: utf16le.cpp:41
GLuint buffer
Definition: glext.h:5915
void convert2utf16le()
Definition: utf16le.cpp:239
Definition: features.h:417
#define argv
Definition: mplay32.c:18
fstream outputfile
Definition: utf16le.cpp:40
err_types error
Definition: utf16le.cpp:35
#define cout
Definition: iostream.cpp:38
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248
basic_ostream< _CharT, _Traits > &_STLP_CALL endl(basic_ostream< _CharT, _Traits > &__os)
Definition: _ostream.h:357
std::streamsize getByte(unsigned char &c)
Definition: utf16le.cpp:139
std::streamsize fill
Definition: utf16le.cpp:39
err_types getError()
Definition: utf16le.cpp:67
GLuint index
Definition: glext.h:6031
_Self & write(const char_type *__s, streamsize __n)
Definition: _ostream.c:430
#define d
Definition: ke_i.h:81
_STLP_DECLSPEC _Stl_aligned_buffer< ostream > cerr
Definition: iostream.cpp:102
_STLP_MOVE_TO_STD_NAMESPACE void fill(_ForwardIter __first, _ForwardIter __last, const _Tp &__val)
Definition: _algobase.h:449
void close()
Definition: _fstream.h:681
enc_types getBOM()
Definition: utf16le.cpp:71
std::streamsize getWord(unsigned short &w)
Definition: utf16le.cpp:153
const GLubyte * c
Definition: glext.h:8905
static FILE * out
Definition: regtests2xml.c:44
const GLuint GLenum const GLvoid * binary
Definition: glext.h:7538
int ret
bool eof() const
Definition: _ios_base.h:173
utf_converter(string ifname, string ofname, bom_types ofbom=bom, enc_types enc=detect)
Definition: utf16le.cpp:43
GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint GLdouble GLdouble w2
Definition: glext.h:8308
#define err(...)
void open(const char *__s, ios_base::openmode __mod=ios_base::in|ios_base::out)
Definition: _fstream.h:674
_Self & read(char_type *__s, streamsize __n)
Definition: _istream.c:783
ed encoding
Definition: write.c:2825
_Check_return_ _CRTIMP int __cdecl __cdecl eof(_In_ int _FileHandle)
static BYTE decode(char c)
Definition: base64_codec.c:97
GLuint in
Definition: glext.h:9616
wchar_t get_wchar_t()
Definition: utf16le.cpp:178
streamsize gcount() const
Definition: _istream.h:125
ptrdiff_t streamsize
Definition: char_traits.h:81
#define c
Definition: ke_i.h:80
#define wchar_t
Definition: wchar.h:102
bom_types bom_type
Definition: utf16le.cpp:37
int main(int argc, char *argv[])
Definition: utf16le.cpp:275
unsigned char index
Definition: utf16le.cpp:38
std::streamsize getDWord(wchar_t &d)
Definition: utf16le.cpp:166
_CRTIMP int __cdecl read(_In_ int _FileHandle, _Out_writes_bytes_(_MaxCharCount) void *_DstBuf, _In_ unsigned int _MaxCharCount)
enc_types encoding
Definition: utf16le.cpp:36