ReactOS  0.4.15-dev-1070-ge1a01de
utf_converter Class Reference
Collaboration diagram for utf_converter:

Public Types

enum  enc_types {
  detect, utf8, utf16le, utf16be,
  utf32le, utf32be
}
 
enum  err_types {
  none, iopen, oopen, eof,
  read, write, decode
}
 
enum  bom_types { bom, nobom }
 

Public Member Functions

 utf_converter (string ifname, string ofname, bom_types ofbom=bom, enc_types enc=detect)
 
err_types getError ()
 
enc_types getBOM ()
 
std::streamsize getByte (unsigned char &c)
 
std::streamsize getWord (unsigned short &w)
 
std::streamsize getDWord (wchar_t &d)
 
wchar_t get_wchar_t ()
 
void convert2utf16le ()
 
 ~utf_converter ()
 

Protected Attributes

err_types error
 
enc_types encoding
 
bom_types bom_type
 
unsigned char buffer [4]
 
unsigned char index
 
std::streamsize fill
 
fstream inputfile
 
fstream outputfile
 

Static Protected Attributes

static const unsigned char utf8table [64]
 

Detailed Description

Definition at line 26 of file utf16le.cpp.

Member Enumeration Documentation

◆ bom_types

Enumerator
bom 
nobom 

Definition at line 33 of file utf16le.cpp.

◆ enc_types

Enumerator
detect 
utf8 
utf16le 
utf16be 
utf32le 
utf32be 

Definition at line 31 of file utf16le.cpp.

◆ err_types

Enumerator
none 
iopen 
oopen 
eof 
read 
write 
decode 

Definition at line 32 of file utf16le.cpp.

Constructor & Destructor Documentation

◆ utf_converter()

utf_converter::utf_converter ( string  ifname,
string  ofname,
bom_types  ofbom = bom,
enc_types  enc = detect 
)
inline

Definition at line 43 of file utf16le.cpp.

43  : error(none), bom_type(ofbom), encoding(enc), fill(0), index(0)
44  {
45  enc_types tmp_enc;
46  inputfile.open(ifname.c_str(), ios::in | ios::binary);
47  if (!inputfile)
48  {
49  error = iopen;
50  return;
51  }
52  outputfile.open(ofname.c_str(), ios::out | ios::binary);
53  if (!outputfile)
54  {
55  error = oopen;
56  return;
57  }
58  tmp_enc = getBOM();
59  if (enc != detect)
60  {
61  if (enc != tmp_enc)
62  cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << endl;
63  }
64  else
65  encoding = tmp_enc;
66  }
fstream outputfile
Definition: utf16le.cpp:40
err_types error
Definition: utf16le.cpp:35
basic_ostream< _CharT, _Traits > &_STLP_CALL endl(basic_ostream< _CharT, _Traits > &__os)
Definition: _ostream.h:357
std::streamsize fill
Definition: utf16le.cpp:39
_STLP_DECLSPEC _Stl_aligned_buffer< ostream > cerr
Definition: iostream.cpp:102
enc_types getBOM()
Definition: utf16le.cpp:71
static FILE * out
Definition: regtests2xml.c:44
const GLuint GLenum const GLvoid * binary
Definition: glext.h:7538
void open(const char *__s, ios_base::openmode __mod=ios_base::in|ios_base::out)
Definition: _fstream.h:674
GLuint in
Definition: glext.h:9616
bom_types bom_type
Definition: utf16le.cpp:37
unsigned char index
Definition: utf16le.cpp:38
fstream inputfile
Definition: utf16le.cpp:40
enc_types encoding
Definition: utf16le.cpp:36

◆ ~utf_converter()

utf_converter::~utf_converter ( )
inline

Definition at line 258 of file utf16le.cpp.

259  {
260  if (inputfile)
261  inputfile.close();
262  if (outputfile)
263  outputfile.close();
264  }
fstream outputfile
Definition: utf16le.cpp:40
void close()
Definition: _fstream.h:681
fstream inputfile
Definition: utf16le.cpp:40

Member Function Documentation

◆ convert2utf16le()

void utf_converter::convert2utf16le ( )
inline

Definition at line 239 of file utf16le.cpp.

240  {
241  unsigned char buffer[2] = { 0xff, 0xfe };
242 
243  if (bom_type == bom)
244  {
245  outputfile.write(reinterpret_cast<char*>(&buffer), 2); // write BOM
246  }
247 
248  wchar_t c = get_wchar_t();
249 
250  while (!inputfile.eof())
251  {
252  buffer[0] = c & 0xff;
253  buffer[1] = (c >> 8) & 0xff; // create utf16-le char
254  outputfile.write(reinterpret_cast<char*>(&buffer),2); // write char
255  c = get_wchar_t();
256  }
257  }
GLuint buffer
Definition: glext.h:5915
fstream outputfile
Definition: utf16le.cpp:40
_Self & write(const char_type *__s, streamsize __n)
Definition: _ostream.c:430
const GLubyte * c
Definition: glext.h:8905
bool eof() const
Definition: _ios_base.h:173
wchar_t get_wchar_t()
Definition: utf16le.cpp:178
bom_types bom_type
Definition: utf16le.cpp:37
fstream inputfile
Definition: utf16le.cpp:40

Referenced by main().

◆ get_wchar_t()

wchar_t utf_converter::get_wchar_t ( )
inline

Definition at line 178 of file utf16le.cpp.

179  {
180  wchar_t ret = (wchar_t)-1;
181  switch (encoding)
182  {
183  case detect: // if still unknown
184  encoding = utf8; // assume utf8 as default
185  case utf8:
186  unsigned char c, tmp;
187  if (!getByte(tmp))
188  return ret;
189  // table for 64 bytes (all 11xxxxxx resp. >=192)
190  // resulting byte is determined:
191  // lower 3 bits: number of following bytes (max.8) 0=error
192  // upper 5 bits: data filled with 0
193  if (tmp & 0x80)
194  {
195  if ((tmp & 0xc0) != 0xc0)
196  {
197  cerr << "UTF-8 Error: invalid data byte" << endl;
198  return ret;
199  }
200  unsigned char i = utf8table[tmp & 0x3f];
201  ret = i >> 3;
202  i &= 7;
203  while (i--)
204  {
205  ret <<= 6;
206  if (!getByte(c))
207  return wchar_t(-1);
208  ret |= c & 0x3f;
209  }
210  return ret;
211  }
212  else
213  return wchar_t(tmp);
214  case utf16le:
215  case utf16be:
216  unsigned short w,w2;
217  if (getWord(w) != 2)
218  return ret;
219  if ((w & 0xfc00) == 0xd800) // high surrogate first
220  {
221  if (getWord(w2) != 2)
222  return ret;
223  if ((w2 & 0xfc00) != 0xdc00)
224  {
225  cerr << "UTF-16 Error: invalid low surrogate" << endl;
226  return ret;
227  }
228  return (((w & 0x3ff) + 0x40) << 10) | (w2 & 0x3ff);
229  }
230  return w;
231  case utf32le:
232  case utf32be:
233  if (getDWord(ret) != 4)
234  return wchar_t (-1);
235  return ret;
236  }
237  return ret;
238  }
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:6102
static const unsigned char utf8table[64]
Definition: utf16le.cpp:41
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248
basic_ostream< _CharT, _Traits > &_STLP_CALL endl(basic_ostream< _CharT, _Traits > &__os)
Definition: _ostream.h:357
std::streamsize getByte(unsigned char &c)
Definition: utf16le.cpp:139
_STLP_DECLSPEC _Stl_aligned_buffer< ostream > cerr
Definition: iostream.cpp:102
std::streamsize getWord(unsigned short &w)
Definition: utf16le.cpp:153
const GLubyte * c
Definition: glext.h:8905
int ret
GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint GLdouble GLdouble w2
Definition: glext.h:8308
#define c
Definition: ke_i.h:80
#define wchar_t
Definition: wchar.h:102
std::streamsize getDWord(wchar_t &d)
Definition: utf16le.cpp:166
enc_types encoding
Definition: utf16le.cpp:36

◆ getBOM()

enc_types utf_converter::getBOM ( )
inline

Definition at line 71 of file utf16le.cpp.

72  {
73  index = 0;
74  /* first byte can also detect with:
75  if ((buffer[0] & 0x11) || !buffer[0]))
76  valid values are 0xef, 0xff, 0xfe, 0x00
77  */
78  inputfile.read(reinterpret_cast<char*>(&buffer),4);
79  fill = inputfile.gcount();
80  // stupid utf8 bom
81  if ((fill > 2) &&
82  (buffer[0] == 0xef) &&
83  (buffer[1] == 0xbb) &&
84  (buffer[2] == 0xbf))
85  {
86  index += 3;
87  fill -=3;
88 #ifdef DISPLAY_DETECTED_UNICODE
89  cerr << "UTF-8 BOM found" << endl;
90 #endif
91  return utf8;
92  }
93  if ((fill > 1) &&
94  (buffer[0] == 0xfe) &&
95  (buffer[1] == 0xff))
96  {
97  index += 2;
98  fill -= 2;
99 #ifdef DISPLAY_DETECTED_UNICODE
100  cerr << "UTF-16BE BOM found" << endl;
101 #endif
102  return utf16be;
103  }
104  if ((fill > 1) &&
105  (buffer[0] == 0xff) &&
106  (buffer[1] == 0xfe))
107  {
108  if ((fill == 4) &&
109  (buffer[2] == 0x00) &&
110  (buffer[3] == 0x00))
111  {
112  cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << endl;
113  fill = 0;
114  index = 0;
115  return utf32le;
116  }
117  fill -= 2;
118  index += 2;
119 #ifdef DISPLAY_DETECTED_UNICODE
120  cerr << "UTF-16LE BOM found" << endl;
121 #endif
122  return utf16le;
123  }
124  if ((fill == 4) &&
125  (buffer[0] == 0x00) &&
126  (buffer[1] == 0x00) &&
127  (buffer[2] == 0xfe) &&
128  (buffer[3] == 0xff))
129  {
130  fill = 0;
131  index = 0;
132 #ifdef DISPLAY_DETECTED_UNICODE
133  cerr << "UTF-32BE BOM found" << endl;
134 #endif
135  return utf32be;
136  }
137  return utf8; // no valid bom so use utf8 as default
138  }
GLuint buffer
Definition: glext.h:5915
basic_ostream< _CharT, _Traits > &_STLP_CALL endl(basic_ostream< _CharT, _Traits > &__os)
Definition: _ostream.h:357
std::streamsize fill
Definition: utf16le.cpp:39
GLuint index
Definition: glext.h:6031
_STLP_DECLSPEC _Stl_aligned_buffer< ostream > cerr
Definition: iostream.cpp:102
_Self & read(char_type *__s, streamsize __n)
Definition: _istream.c:783
streamsize gcount() const
Definition: _istream.h:125
fstream inputfile
Definition: utf16le.cpp:40

◆ getByte()

std::streamsize utf_converter::getByte ( unsigned char &c)
inline

Definition at line 139 of file utf16le.cpp.

140  {
141  if (fill)
142  {
143  index %= 4;
144  --fill;
145  c = buffer[index++];
146  return 1;
147  } else
148  {
149  inputfile.read(reinterpret_cast<char*>(&c),1);
150  return inputfile.gcount();
151  }
152  }
GLuint buffer
Definition: glext.h:5915
std::streamsize fill
Definition: utf16le.cpp:39
GLuint index
Definition: glext.h:6031
const GLubyte * c
Definition: glext.h:8905
_Self & read(char_type *__s, streamsize __n)
Definition: _istream.c:783
streamsize gcount() const
Definition: _istream.h:125
fstream inputfile
Definition: utf16le.cpp:40

◆ getDWord()

std::streamsize utf_converter::getDWord ( wchar_t &d)
inline

Definition at line 166 of file utf16le.cpp.

167  {
168  unsigned char c[4];
169  for (int i=0;i<4;i++)
170  if (!getByte(c[i]))
171  return i;
172  if (encoding == utf32le)
173  d = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24);
174  else
175  d = c[3] | (c[2] << 8) | (c[1] << 16) | (c[0] << 24);
176  return 4;
177  }
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248
std::streamsize getByte(unsigned char &c)
Definition: utf16le.cpp:139
#define d
Definition: ke_i.h:81
const GLubyte * c
Definition: glext.h:8905
enc_types encoding
Definition: utf16le.cpp:36

◆ getError()

err_types utf_converter::getError ( )
inline

Definition at line 67 of file utf16le.cpp.

68  {
69  return error;
70  }
err_types error
Definition: utf16le.cpp:35

Referenced by main().

◆ getWord()

std::streamsize utf_converter::getWord ( unsigned short &w)
inline

Definition at line 153 of file utf16le.cpp.

154  {
155  unsigned char c[2];
156  if (!getByte(c[0]))
157  return 0;
158  if (!getByte(c[1]))
159  return 1;
160  if (encoding == utf16le)
161  w = c[0] | (c[1] << 8);
162  else
163  w = c[1] | (c[0] << 8);
164  return 2;
165  }
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:6102
std::streamsize getByte(unsigned char &c)
Definition: utf16le.cpp:139
const GLubyte * c
Definition: glext.h:8905
enc_types encoding
Definition: utf16le.cpp:36

Member Data Documentation

◆ bom_type

bom_types utf_converter::bom_type
protected

Definition at line 37 of file utf16le.cpp.

◆ buffer

unsigned char utf_converter::buffer[4]
protected

Definition at line 38 of file utf16le.cpp.

◆ encoding

enc_types utf_converter::encoding
protected

Definition at line 36 of file utf16le.cpp.

◆ error

err_types utf_converter::error
protected

Definition at line 35 of file utf16le.cpp.

◆ fill

std::streamsize utf_converter::fill
protected

Definition at line 39 of file utf16le.cpp.

◆ index

unsigned char utf_converter::index
protected

Definition at line 38 of file utf16le.cpp.

◆ inputfile

fstream utf_converter::inputfile
protected

Definition at line 40 of file utf16le.cpp.

◆ outputfile

fstream utf_converter::outputfile
protected

Definition at line 40 of file utf16le.cpp.

◆ utf8table

const unsigned char utf_converter::utf8table
staticprotected
Initial value:
= {
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
}

Definition at line 41 of file utf16le.cpp.


The documentation for this class was generated from the following file: