ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

utf16le.cpp
Go to the documentation of this file.
00001 /*
00002  * Usage: utf16le inputfile outputfile
00003  *
00004  * This is a tool and is compiled using the host compiler,
00005  * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
00006  * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
00007  * to utf-16 LE, made especially for the automatic conversion of
00008  * INF files from utf-8 to utf-16LE (so that we can keep storing
00009  * the INF files in utf-8 in subversion).
00010  *
00011  * Author: Matthias Kupfer (mkupfer@reactos.org)
00012  */
00013 
00014 #include <fstream>
00015 #include <iostream>
00016 
00017 //#define DISPLAY_DETECTED_UNICODE
00018 
00019 using namespace std;
00020 
/**
 * Converts a text file encoded as UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE)
 * into UTF-16LE, preceded by a UTF-16LE byte order mark.
 *
 * Usage: construct with the input and output file names, check getError(),
 * then call convert2utf16le().  Both files are opened in binary mode so that
 * no newline translation can alter the byte stream.
 *
 * Malformed input is reported on stderr, recorded as the `decode` error and
 * replaced by U+FFFD in the output; an incomplete character at the very end
 * of the input is reported and dropped.
 */
class utf_converter
{
  public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
  protected:
    // Outcome of an attempt to decode one code point from the input.
    enum read_status { rs_ok, rs_end, rs_invalid };

    err_types error;
    enc_types encoding;
    // 4 byte ring buffer: filled by the BOM probe and used for push-back.
    // `index` is the read position, `fill` the number of unread bytes.
    unsigned char buffer[4], fill, index;
    std::fstream inputfile,outputfile;
    // Defined outside the class.  The decoder below classifies lead bytes
    // itself (so it can reject invalid ones); the declaration is kept so the
    // existing definition and any derived class keep compiling.
    static const unsigned char utf8table[64];

    /** Reports malformed input, records the error and yields rs_invalid. */
    read_status invalid(const char *message)
    {
        std::cerr << message << std::endl;
        error = decode;
        return rs_invalid;
    }

    /** Reports a character cut off by the end of input; yields rs_end. */
    read_status truncated()
    {
        std::cerr << "UTF Error: incomplete character at end of input" << std::endl;
        error = decode;
        return rs_end;
    }

    /**
     * Pushes one byte back so that the next getByte() returns it again.
     * At most 4 bytes can be pending; bytes must be pushed back in the
     * reverse order of how they were read.
     */
    void ungetByte(unsigned char c)
    {
        index = static_cast<unsigned char>(((index % 4) + 3) % 4); // step back, wrapping
        buffer[index] = c;
        ++fill;
    }

    /**
     * Reads 4 bytes and assembles them according to the current encoding
     * (little endian for utf32le, big endian otherwise).
     * @return number of bytes obtained; d is only written when it is 4.
     */
    int readDWord(unsigned long &d)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
            if (!getByte(c[i]))
                return i;
        // widen before shifting: shifting an int into its sign bit is undefined
        const unsigned long b0 = c[0], b1 = c[1], b2 = c[2], b3 = c[3];
        if (encoding == utf32le)
            d = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
        else
            d = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
        return 4;
    }

    /** Decodes one UTF-8 sequence (1 to 4 bytes). */
    read_status decodeUtf8(unsigned long &cp)
    {
        unsigned char lead;
        if (!getByte(lead))
            return rs_end;
        if (!(lead & 0x80))
        {
            cp = lead; // plain ASCII
            return rs_ok;
        }

        int following;
        if ((lead & 0xe0) == 0xc0)
        {
            following = 1;
            cp = lead & 0x1f;
        }
        else if ((lead & 0xf0) == 0xe0)
        {
            following = 2;
            cp = lead & 0x0f;
        }
        else if ((lead & 0xf8) == 0xf0)
        {
            following = 3;
            cp = lead & 0x07;
        }
        else // stray continuation byte (10xxxxxx) or lead byte 0xf8..0xff
            return invalid("UTF-8 Error: invalid data byte");

        while (following--)
        {
            unsigned char next;
            if (!getByte(next))
                return truncated();
            if ((next & 0xc0) != 0x80)
            {
                // not a continuation byte: it starts the next character,
                // so hand it back instead of swallowing it
                ungetByte(next);
                return invalid("UTF-8 Error: invalid data byte");
            }
            cp = (cp << 6) | (next & 0x3f);
        }
        return rs_ok;
    }

    /** Decodes one UTF-16 unit or surrogate pair. */
    read_status decodeUtf16(unsigned long &cp)
    {
        unsigned short w, w2;
        int got = getWord(w);
        if (got != 2)
            return got ? truncated() : rs_end;
        if ((w & 0xfc00) != 0xd800) // not a high surrogate
        {
            cp = w;
            return rs_ok;
        }
        got = getWord(w2);
        if (got != 2)
            return truncated();
        if ((w2 & 0xfc00) != 0xdc00)
        {
            // hand the unit back (last stream byte first) so it is decoded
            // as the start of the next character
            const unsigned char hi = static_cast<unsigned char>(w2 >> 8);
            const unsigned char lo = static_cast<unsigned char>(w2 & 0xff);
            if (encoding == utf16le)
            {
                ungetByte(hi);
                ungetByte(lo);
            }
            else
            {
                ungetByte(lo);
                ungetByte(hi);
            }
            return invalid("UTF-16 Error: invalid low surrogate");
        }
        // adding 0x40 before the shift equals adding 0x10000 afterwards
        cp = ((static_cast<unsigned long>(w & 0x3ff) + 0x40) << 10) | (w2 & 0x3ff);
        return rs_ok;
    }

    /** Decodes one UTF-32 unit. */
    read_status decodeUtf32(unsigned long &cp)
    {
        const int got = readDWord(cp);
        if (got == 4)
            return rs_ok;
        return got ? truncated() : rs_end;
    }

    /**
     * Decodes the next code point from the input.
     * @return rs_ok with cp set, rs_end when the input is exhausted, or
     *         rs_invalid for malformed data (cp is then unspecified).
     */
    read_status nextCodePoint(unsigned long &cp)
    {
        read_status status = rs_end;
        switch (encoding)
        {
            case detect: // if still unknown
                encoding = utf8; // assume utf8 as default
                // fall through
            case utf8:
                status = decodeUtf8(cp);
                break;
            case utf16le:
            case utf16be:
                status = decodeUtf16(cp);
                break;
            case utf32le:
            case utf32be:
                status = decodeUtf32(cp);
                break;
        }
        // surrogates and values beyond U+10FFFF cannot be written as UTF-16
        if (status == rs_ok &&
            (cp > 0x10ffffUL || (cp >= 0xd800UL && cp <= 0xdfffUL)))
            return invalid("UTF Error: invalid code point");
        return status;
    }

    /**
     * Encodes cp as UTF-16LE into out (room for 4 bytes required).
     * @return number of bytes written: 2, or 4 for a surrogate pair.
     */
    static int encodeUtf16le(unsigned long cp, unsigned char *out)
    {
        if (cp < 0x10000UL)
        {
            out[0] = static_cast<unsigned char>(cp & 0xff);
            out[1] = static_cast<unsigned char>((cp >> 8) & 0xff);
            return 2;
        }
        cp -= 0x10000UL;
        const unsigned long high = 0xd800UL | (cp >> 10);
        const unsigned long low = 0xdc00UL | (cp & 0x3ff);
        out[0] = static_cast<unsigned char>(high & 0xff);
        out[1] = static_cast<unsigned char>(high >> 8);
        out[2] = static_cast<unsigned char>(low & 0xff);
        out[3] = static_cast<unsigned char>(low >> 8);
        return 4;
    }
  public:
    /**
     * Opens both files and probes the input for a byte order mark.
     * @param ifname name of the file to read
     * @param ofname name of the file to create or overwrite
     * @param enc    input encoding; `detect` takes it from the BOM, any
     *               other value is forced (with a warning on mismatch)
     * On failure getError() yields iopen or oopen.
     */
    utf_converter(std::string ifname, std::string ofname, enc_types enc = detect)
        : error(none), encoding(enc), fill(0), index(0)
    {
        // binary mode: text mode would translate newline bytes on some hosts
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types bom_enc = getBOM();
        if (enc == detect)
            encoding = bom_enc;
        else if (enc != bom_enc)
            std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
    }

    /** Returns the most recent error, or `none`. */
    err_types getError()
    {
        return error;
    }

    /**
     * Reads up to 4 bytes from the input and looks for a byte order mark.
     * BOM bytes are consumed; all other bytes read stay buffered for
     * getByte().
     * @return the encoding named by the BOM, utf8 if there is none.
     */
    enc_types getBOM()
    {
        index = 0;
        /* first byte can also detect with:
        if ((buffer[0] & 0x11) || !buffer[0]))
        valid values are 0xef, 0xff, 0xfe, 0x00
        */
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = static_cast<unsigned char>(inputfile.gcount());
        // utf8 bom
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            // ff fe 00 00 is either a UTF-32LE BOM or a UTF-16LE BOM
            // followed by U+0000
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    /**
     * Fetches the next input byte, serving buffered bytes first.
     * @return 1 if c was set, 0 if no byte could be read.
     */
    int getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return static_cast<int>(inputfile.gcount());
    }

    /**
     * Reads one 16 bit unit, little endian for utf16le and big endian
     * otherwise.
     * @return number of bytes obtained; w is only written when it is 2.
     */
    int getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = c[0] | (c[1] << 8);
        else
            w = c[1] | (c[0] << 8);
        return 2;
    }

    /**
     * Reads one 32 bit unit, little endian for utf32le and big endian
     * otherwise.
     * @return number of bytes obtained; d is only written when it is 4.
     */
    int getDWord(wchar_t &d)
    {
        unsigned long value;
        const int got = readDWord(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    /**
     * Decodes the next character of the input.
     * @return the character, or wchar_t(-1) at the end of the input or on
     *         malformed data.  Where wchar_t has only 16 bits, values above
     *         0xffff do not fit and are truncated by the conversion.
     */
    wchar_t get_wchar_t()
    {
        unsigned long cp;
        if (nextCodePoint(cp) != rs_ok)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    /**
     * Writes the BOM and then the whole remaining input as UTF-16LE.
     * Code points above U+FFFF become surrogate pairs and malformed input
     * becomes U+FFFD.  Afterwards getError() yields decode, read or write
     * if such a problem occurred.
     */
    void convert2utf16le()
    {
        const unsigned char bom[2] = {0xff, 0xfe};
        outputfile.write(reinterpret_cast<const char*>(bom), 2); // write BOM

        // Stop on the decoder's own end-of-input result.  The stream's eof
        // flag is already set after the BOM probe of a short file while
        // bytes are still buffered, and it is never set by a failed read.
        unsigned long cp;
        read_status status;
        while ((status = nextCodePoint(cp)) != rs_end)
        {
            if (status == rs_invalid)
                cp = 0xfffdUL; // replacement character
            unsigned char out[4];
            const int length = encodeUtf16le(cp, out);
            outputfile.write(reinterpret_cast<const char*>(out), length);
        }

        outputfile.flush();
        if (inputfile.bad())
            error = read;
        if (!outputfile)
            error = write;
    }

    /** Closes both files. */
    ~utf_converter()
    {
        if (inputfile.is_open())
            inputfile.close();
        if (outputfile.is_open())
            outputfile.close();
    }
};
00253 
00254 const unsigned char utf_converter::utf8table[64] = {
00255 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
00256 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
00257 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
00258 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7 
00259 };
00260 
00261 
00262 int main(int argc, char* argv[])
00263 {
00264     utf_converter::err_types err;
00265     if (argc < 3)
00266     {
00267         cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
00268         return -1;
00269     }
00270     utf_converter conv(argv[1],argv[2]);
00271     if ((err = conv.getError())!=utf_converter::none)
00272     {
00273         switch (err)
00274         {
00275             case utf_converter::iopen:
00276                 cerr << "Couldn't open input file." << endl;
00277                 break;
00278             case utf_converter::oopen:
00279                 cerr << "Couldn't open output file." << endl;
00280                 break;
00281             default:
00282                 cerr << "Unknown error." << endl;
00283         }
00284         return -1;
00285     } else
00286     conv.convert2utf16le();
00287     return 0;
00288 }
00289 
00290 // vim:set ts=4 sw=4:

Generated on Thu May 24 2012 04:38:12 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.