Home | Info | Community | Development | myReactOS | Contact Us
ReactOS Development > Doxygenutf16le.cpp
Go to the documentation of this file.
/*
 * Usage: utf16le inputfile outputfile
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE and especially made for automatic conversions of
 * INF-files from utf-8 to utf-16LE (so we can furthermore
 * store the INF files in utf-8 for subversion).
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */

#include <fstream>
#include <iostream>
#include <string>

//#define DISPLAY_DETECTED_UNICODE

/*
 * Reads a text file in UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) and writes it
 * back out as UTF-16LE with a leading BOM.
 *
 * The constructor opens both files and probes the input for a BOM; callers
 * check getError() and then call convert2utf16le().
 */
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
protected:
    // Outcome of decoding one code point from the input.
    enum read_status { got_char, end_of_input, bad_sequence };

    err_types error;
    enc_types encoding;
    unsigned char buffer[4], fill, index; // need 4 char buffer for optional BOM handling
    std::fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

    // Reads four bytes and assembles them according to `encoding`.
    // Returns the number of bytes that were available (4 on success);
    // `d` is only written on success.
    int readDWord(unsigned long &d)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
            if (!getByte(c[i]))
                return i;
        // Widen first so that shifting by 24 never touches a sign bit.
        const unsigned long b0 = c[0], b1 = c[1], b2 = c[2], b3 = c[3];
        if (encoding == utf32le)
            d = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
        else
            d = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
        return 4;
    }

    // Decodes the next code point into `cp`.
    // end_of_input: the input ran out (also in the middle of a sequence).
    // bad_sequence: malformed data was consumed; a message was printed.
    read_status readCodePoint(unsigned long &cp)
    {
        switch (encoding)
        {
            case detect: // if still unknown
                encoding = utf8; // assume utf8 as default
                // fall through
            case utf8:
            {
                unsigned char lead, c;
                if (!getByte(lead))
                    return end_of_input;
                if (!(lead & 0x80))
                {
                    cp = lead; // plain 7-bit character
                    return got_char;
                }
                if ((lead & 0xc0) != 0xc0)
                {
                    std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
                    return bad_sequence;
                }
                // table for 64 bytes (all 11xxxxxx resp. >=192)
                // lower 3 bits: number of following bytes
                // upper 5 bits: data bits of the lead byte
                const unsigned char entry = utf8table[lead & 0x3f];
                unsigned char remaining = entry & 7;
                cp = entry >> 3;
                while (remaining--)
                {
                    if (!getByte(c))
                        return end_of_input;
                    cp = (cp << 6) | (c & 0x3f);
                    // Keep oversized values pinned just above the Unicode
                    // range so that long sequences cannot wrap around into
                    // a valid-looking code point.
                    if (cp > 0x10ffffUL)
                        cp = 0x110000UL;
                }
                return got_char;
            }
            case utf16le:
            case utf16be:
            {
                unsigned short w, w2;
                if (getWord(w) != 2)
                    return end_of_input;
                if ((w & 0xfc00) == 0xd800) // high surrogate first
                {
                    if (getWord(w2) != 2)
                        return end_of_input;
                    if ((w2 & 0xfc00) != 0xdc00)
                    {
                        std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
                        return bad_sequence;
                    }
                    cp = static_cast<unsigned long>((((w & 0x3ff) + 0x40) << 10) | (w2 & 0x3ff));
                    return got_char;
                }
                cp = w;
                return got_char;
            }
            case utf32le:
            case utf32be:
                return (readDWord(cp) == 4) ? got_char : end_of_input;
        }
        return end_of_input;
    }

    // Writes one UTF-16 code unit, low byte first.
    void putUnit(unsigned long unit)
    {
        const unsigned char bytes[2] = {
            static_cast<unsigned char>(unit & 0xff),
            static_cast<unsigned char>((unit >> 8) & 0xff)
        };
        outputfile.write(reinterpret_cast<const char*>(bytes), 2);
    }
public:
    // Opens `ifname` for reading and `ofname` for writing and determines the
    // input encoding. With enc == detect the BOM decides (utf8 if there is
    // none); otherwise `enc` is used and a mismatch only produces a warning.
    // On failure getError() returns iopen or oopen.
    utf_converter(const std::string &ifname, const std::string &ofname, enc_types enc = detect)
        : error(none), encoding(enc), fill(0), index(0)
    {
        // Binary mode: the data is handled byte by byte, so the runtime must
        // not translate line endings or stop at a Ctrl-Z on any host.
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types tmp_enc = getBOM();
        if (enc != detect)
        {
            if (enc != tmp_enc)
                std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
        }
        else
            encoding = tmp_enc;
    }

    // Returns the current error state (none if everything went well so far).
    err_types getError() const
    {
        return error;
    }

    // Reads the first (up to) four bytes of the input into `buffer` and
    // classifies them. Bytes that belong to a BOM are skipped; the remaining
    // buffered bytes are handed out by getByte() before the file is read again.
    enc_types getBOM()
    {
        index = 0;
        /* first byte can also detect with:
           if ((buffer[0] & 0x11) || !buffer[0]))
           valid values are 0xef, 0xff, 0xfe, 0x00
        */
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = static_cast<unsigned char>(inputfile.gcount());
        // stupid utf8 bom
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0; // all four bytes belong to the BOM
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0; // all four bytes belong to the BOM
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    // Fetches the next input byte, first from the bytes left over from BOM
    // detection, then from the file. Returns 1 on success, 0 at end of input.
    int getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        // A one-byte read fails exactly when no byte could be extracted; this
        // also covers a stream already at EOF after a short BOM probe.
        if (!inputfile.read(reinterpret_cast<char*>(&c), 1))
            return 0;
        return 1;
    }

    // Reads a 16-bit unit in the byte order given by `encoding`.
    // Returns the number of bytes that were available (2 on success);
    // `w` is only written on success.
    int getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = c[0] | (c[1] << 8);
        else
            w = c[1] | (c[0] << 8);
        return 2;
    }

    // Reads a 32-bit unit in the byte order given by `encoding`.
    // Returns the number of bytes that were available (4 on success);
    // `d` is only written on success.
    int getDWord(wchar_t &d)
    {
        unsigned long value = 0;
        const int got = readDWord(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    // Decodes the next character. Returns wchar_t(-1) at end of input or on
    // malformed data. Note that the result is narrowed to wchar_t, which on
    // hosts with a 16-bit wchar_t cannot hold characters above U+FFFF;
    // convert2utf16le() therefore does not go through this function.
    wchar_t get_wchar_t()
    {
        unsigned long cp = 0;
        if (readCodePoint(cp) != got_char)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    // Converts the whole input and writes it as UTF-16LE, preceded by a BOM.
    // Malformed input and values above U+10FFFF are written as U+FFFD.
    // Sets the error state to `write` if the output could not be written.
    void convert2utf16le()
    {
        putUnit(0xfeff); // write BOM (bytes FF FE)
        for (;;)
        {
            unsigned long cp = 0;
            const read_status status = readCodePoint(cp);
            // Stop on the decoder's own end-of-input report rather than on
            // inputfile.eof(): the BOM probe already sets eofbit for inputs
            // shorter than four bytes while their bytes are still buffered.
            if (status == end_of_input)
                break;
            if (status == bad_sequence)
                cp = 0xfffdUL;
            else if (cp > 0x10ffffUL)
            {
                std::cerr << "UTF Error: code point out of range" << std::endl;
                cp = 0xfffdUL;
            }
            if (cp > 0xffffUL)
            {
                // Outside the BMP: encode as a surrogate pair.
                cp -= 0x10000UL;
                putUnit(0xd800UL | (cp >> 10));
                putUnit(0xdc00UL | (cp & 0x3ffUL));
            }
            else
                putUnit(cp);
        }
        outputfile.flush();
        if (!outputfile)
            error = write;
    }
    // No explicit destructor: the fstream members close their files themselves.
};

// Indexed by (lead byte & 0x3f) for UTF-8 lead bytes 0xC0..0xFF.
// Each entry packs: lower 3 bits = number of continuation bytes,
// upper 5 bits = data bits contributed by the lead byte.
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};


// Entry point: utf16le inputfile outputfile
// Returns 0 on success and -1 on wrong usage or when a file could not be
// opened or written.
int main(int argc, char* argv[])
{
    if (argc < 3)
    {
        std::cout << "usage: " << argv[0] << " inputfile outputfile" << std::endl;
        return -1;
    }
    utf_converter conv(argv[1], argv[2]);
    utf_converter::err_types err = conv.getError();
    if (err == utf_converter::none)
    {
        conv.convert2utf16le();
        err = conv.getError();
    }
    if (err != utf_converter::none)
    {
        switch (err)
        {
            case utf_converter::iopen:
                std::cerr << "Couldn't open input file." << std::endl;
                break;
            case utf_converter::oopen:
                std::cerr << "Couldn't open output file." << std::endl;
                break;
            case utf_converter::write:
                std::cerr << "Couldn't write output file." << std::endl;
                break;
            default:
                std::cerr << "Unknown error." << std::endl;
        }
        return -1;
    }
    return 0;
}

// vim:set ts=4 sw=4:
// Generated on Thu May 24 2012 04:38:12 for ReactOS by
1.7.6.1
|