d0/db3/utf16le_8cpp_source.html

/*

 * Usage: utf16le inputfile outputfile

 *

 * This is a tool and is compiled using the host compiler,

 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).

 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)

 * to utf-16 LE and especially made for automatic conversions of

 * INF-files from utf-8 to utf-16LE (so we can furthermore

 * store the INF files in utf-8 for subversion).

 *

 * Author: Matthias Kupfer (mkupfer@reactos.org)

 */


#include <fstream>

#include <iostream>

#include <string.h>


//#define DISPLAY_DETECTED_UNICODE


using namespace std;


#ifdef _MSC_VER

#define strcasecmp _stricmp

#endif


/*
 * Reads a file encoded as UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) and writes
 * its contents to a second file as UTF-16LE, optionally preceded by a BOM.
 *
 * The constructor opens both files and inspects the first (up to) four input
 * bytes for a byte order mark. Bytes that were read during BOM detection but
 * are not part of a BOM stay in a small look-ahead buffer and are handed out
 * by getByte() before any further file access takes place.
 *
 * Both streams are std::fstream members and are therefore closed
 * automatically when the converter is destroyed.
 */
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };

protected:
    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // number of look-ahead bytes still unread in buffer
    std::fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

    // Writes one UTF-16 code unit to the output file, low byte first.
    void writeUnit(unsigned long unit)
    {
        const char bytes[2] = {
            static_cast<char>(unit & 0xff),
            static_cast<char>((unit >> 8) & 0xff)
        };
        outputfile.write(bytes, 2);
    }

    // Decodes one UTF-8 sequence; returns wchar_t(-1) on end of input or
    // when the first byte is a continuation byte.
    // NOTE(review): trailing bytes are not checked for the 10xxxxxx pattern.
    wchar_t decodeUtf8()
    {
        const wchar_t failure = wchar_t(-1);
        unsigned char lead = 0;

        if (!getByte(lead))
            return failure;

        if (!(lead & 0x80))
            return wchar_t(lead); // plain 7 bit character

        if ((lead & 0xc0) != 0xc0)
        {
            std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
            return failure;
        }

        // table for 64 bytes (all 11xxxxxx resp. >=192)
        // lower 3 bits: number of following bytes
        // upper 5 bits: data bits of the lead byte
        const unsigned char entry = utf8table[lead & 0x3f];
        wchar_t ret = entry >> 3;
        unsigned char remaining = entry & 7;

        while (remaining--)
        {
            unsigned char next = 0;
            ret <<= 6;
            if (!getByte(next))
                return failure;
            ret |= next & 0x3f;
        }
        return ret;
    }

    // Decodes one UTF-16 code unit or surrogate pair in the byte order given
    // by encoding; returns wchar_t(-1) on end of input or a bad low surrogate.
    wchar_t decodeUtf16()
    {
        const wchar_t failure = wchar_t(-1);
        unsigned short w = 0, w2 = 0;

        if (getWord(w) != 2)
            return failure;

        if ((w & 0xfc00) != 0xd800)
            return w;

        // high surrogate first, a low surrogate has to follow
        if (getWord(w2) != 2)
            return failure;

        if ((w2 & 0xfc00) != 0xdc00)
        {
            std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
            return failure;
        }
        // adding 0x40 before the shift adds the 0x10000 offset of the pair
        return (((w & 0x3ff) + 0x40) << 10) | (w2 & 0x3ff);
    }

    // Decodes one UTF-32 unit; returns wchar_t(-1) when fewer than four
    // bytes are left.
    wchar_t decodeUtf32()
    {
        wchar_t d = wchar_t(-1);
        if (getDWord(d) != 4)
            return wchar_t(-1);
        return d;
    }

public:
    // Opens ifname for reading and ofname for writing (both binary) and
    // determines the input encoding.
    //   ofbom - whether convert2utf16le() starts the output with a BOM
    //   enc   - input encoding; detect takes whatever getBOM() reports,
    //           any other value is used even if the BOM disagrees (a
    //           warning is printed in that case)
    // On failure to open a file the error state is set to iopen/oopen and
    // nothing else is done; query it with getError().
    utf_converter(const std::string &ifname, const std::string &ofname, bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), index(0), fill(0)
    {
        buffer[0] = buffer[1] = buffer[2] = buffer[3] = 0;

        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }

        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }

        const enc_types detected = getBOM();
        if (enc == detect)
            encoding = detected;
        else if (enc != detected)
            std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
    }

    // Returns the current error state (none if no error was recorded).
    err_types getError() const
    {
        return error;
    }

    // Reads up to four bytes from the input and checks them for a BOM.
    // A recognized BOM is skipped; the remaining bytes stay available
    // through getByte(). Returns the encoding the BOM stands for, or utf8
    // if there is no BOM. FF FE 00 00 is ambiguous (UTF-16LE BOM followed
    // by U+0000, or UTF-32LE BOM) and is treated as UTF-32LE.
    enc_types getBOM()
    {
        index = 0;
        // A short read (input smaller than 4 bytes) leaves eofbit/failbit
        // set on inputfile; fill then holds the number of bytes obtained.
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = inputfile.gcount();

        // stupid utf8 bom
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }

        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }

        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }

        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }

        return utf8; // no valid bom so use utf8 as default
    }

    // Fetches the next input byte into c, serving buffered look-ahead bytes
    // first. Returns the number of bytes delivered (1, or 0 at end of input).
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return inputfile.gcount();
    }

    // Reads two bytes and combines them into w: little endian when encoding
    // is utf16le, big endian otherwise. Returns the number of bytes that
    // could be read; w is only valid when the result is 2.
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;

        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }

    // Reads four bytes and combines them into d: little endian when encoding
    // is utf32le, big endian otherwise. Returns the number of bytes that
    // could be read; d is only valid when the result is 4.
    std::streamsize getDWord(wchar_t &d)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; ++i)
            if (!getByte(c[i]))
                return i;

        // The top byte is widened to unsigned before shifting so that no
        // bit is shifted into the sign bit of an int.
        unsigned long value;
        if (encoding == utf32le)
            value = c[0] | (c[1] << 8) | (c[2] << 16) | (static_cast<unsigned long>(c[3]) << 24);
        else
            value = c[3] | (c[2] << 8) | (c[1] << 16) | (static_cast<unsigned long>(c[0]) << 24);
        d = static_cast<wchar_t>(value);
        return 4;
    }

    // Decodes and returns the next character of the input according to
    // encoding. Returns wchar_t(-1) at end of input and on decoding errors.
    // NOTE(review): where wchar_t is only 16 bits wide, characters above
    // U+FFFF do not fit into the return value.
    wchar_t get_wchar_t()
    {
        switch (encoding)
        {
            case detect: // if still unknown
                encoding = utf8; // assume utf8 as default
                // fall through
            case utf8:
                return decodeUtf8();
            case utf16le:
            case utf16be:
                return decodeUtf16();
            case utf32le:
            case utf32be:
                return decodeUtf32();
        }
        return wchar_t(-1);
    }

    // Converts the whole remaining input to UTF-16LE and writes it to the
    // output file, preceded by FF FE if a BOM was requested. Characters in
    // the range U+10000..U+10FFFF are written as a surrogate pair. A failed
    // decode in the middle of the input is written as code unit 0xFFFF.
    // Afterwards the error state is read (input stream broken) or write
    // (output stream failed) if a stream error occurred.
    void convert2utf16le()
    {
        if (error != none)
            return;

        if (bom_type == bom)
        {
            const char bom_bytes[2] = { '\xff', '\xfe' };
            outputfile.write(bom_bytes, 2); // write BOM
        }

        for (;;)
        {
            const wchar_t c = get_wchar_t();

            // The input is exhausted once decoding fails, the look-ahead
            // buffer is empty and the stream refuses to deliver more. Do not
            // test eof() alone: for inputs shorter than four bytes eofbit is
            // already set by getBOM() while characters are still buffered.
            if (c == wchar_t(-1) && fill == 0 && inputfile.fail())
                break;

            const unsigned long cp = static_cast<unsigned long>(c);
            if (cp >= 0x10000UL && cp <= 0x10FFFFUL)
            {
                const unsigned long v = cp - 0x10000UL;
                writeUnit(0xD800UL | (v >> 10));   // high surrogate
                writeUnit(0xDC00UL | (v & 0x3FF)); // low surrogate
            }
            else
            {
                writeUnit(cp & 0xFFFF); // create utf16-le char
            }
        }

        outputfile.flush();
        if (inputfile.bad())
            error = read;
        else if (!outputfile)
            error = write;
    }

    // Closes both files if they are still open.
    ~utf_converter()
    {
        if (inputfile.is_open())
            inputfile.close();
        if (outputfile.is_open())
            outputfile.close();
    }
};


// Lookup table for UTF-8 lead bytes, indexed by the low six bits of a byte
// of the form 11xxxxxx (i.e. lead byte & 0x3f).
// Each entry packs two values:
//   lower 3 bits: number of bytes that follow the lead byte
//   upper 5 bits: the data bits carried by the lead byte itself
// Index ranges: 0-31 -> 1 following byte, 32-47 -> 2, 48-55 -> 3,
// 56-59 -> 4, 60-61 -> 5, 62 -> 6, 63 -> 7.
const unsigned char utf_converter::utf8table[64] = {

1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,

129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,

2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,

3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7

};


int main(int argc, char* argv[])

{

    utf_converter::err_types err;


    if (argc < 3)

    {

        cout << "usage: " << argv[0] << " inputfile outputfile" << endl;

        return -1;

    }


    utf_converter::bom_types bom_type = utf_converter::bom;


    if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)

    {

        bom_type = utf_converter::nobom;

    }


    utf_converter conv(argv[1], argv[2], bom_type);


    if ((err = conv.getError())!=utf_converter::none)

    {

        switch (err)

        {

            case utf_converter::iopen:

                cerr << "Couldn't open input file." << endl;

                break;

            case utf_converter::oopen:

                cerr << "Couldn't open output file." << endl;

                break;

            default:

                cerr << "Unknown error." << endl;

        }

        return -1;

    }

    else

    {

        conv.convert2utf16le();

    }


    return 0;

}

argc
static int argc
Definition: ServiceArgs.c:12

endl
basic_ostream< _CharT, _Traits > &_STLP_CALL endl(basic_ostream< _CharT, _Traits > &__os)
Definition: _ostream.h:357

basic_fstream< char, char_traits< char > >

basic_fstream::open
void open(const char *__s, ios_base::openmode __mod=ios_base::in|ios_base::out)
Definition: _fstream.h:674

basic_fstream::close
void close()
Definition: _fstream.h:681

basic_istream::gcount
streamsize gcount() const
Definition: _istream.h:125

basic_istream::read
_Self & read(char_type *__s, streamsize __n)
Definition: _istream.c:783

basic_ostream::write
_Self & write(const char_type *__s, streamsize __n)
Definition: _ostream.c:430

ios_base::eof
bool eof() const
Definition: _ios_base.h:173

utf_converter
Definition: utf16le.cpp:27

utf_converter::getWord
std::streamsize getWord(unsigned short &w)
Definition: utf16le.cpp:153

utf_converter::encoding
enc_types encoding
Definition: utf16le.cpp:36

utf_converter::get_wchar_t
wchar_t get_wchar_t()
Definition: utf16le.cpp:178

utf_converter::bom_type
bom_types bom_type
Definition: utf16le.cpp:37

utf_converter::utf8table
static const unsigned char utf8table[64]
Definition: utf16le.cpp:41

utf_converter::bom_types
bom_types
Definition: utf16le.cpp:33

utf_converter::bom
@ bom
Definition: utf16le.cpp:33

utf_converter::nobom
@ nobom
Definition: utf16le.cpp:33

utf_converter::inputfile
fstream inputfile
Definition: utf16le.cpp:40

utf_converter::utf_converter
utf_converter(string ifname, string ofname, bom_types ofbom=bom, enc_types enc=detect)
Definition: utf16le.cpp:43

utf_converter::~utf_converter
~utf_converter()
Definition: utf16le.cpp:258

utf_converter::getByte
std::streamsize getByte(unsigned char &c)
Definition: utf16le.cpp:139

utf_converter::fill
std::streamsize fill
Definition: utf16le.cpp:39

utf_converter::getError
err_types getError()
Definition: utf16le.cpp:67

utf_converter::convert2utf16le
void convert2utf16le()
Definition: utf16le.cpp:239

utf_converter::err_types
err_types
Definition: utf16le.cpp:32

utf_converter::iopen
@ iopen
Definition: utf16le.cpp:32

utf_converter::eof
@ eof
Definition: utf16le.cpp:32

utf_converter::oopen
@ oopen
Definition: utf16le.cpp:32

utf_converter::decode
@ decode
Definition: utf16le.cpp:32

utf_converter::write
@ write
Definition: utf16le.cpp:32

utf_converter::none
@ none
Definition: utf16le.cpp:32

utf_converter::read
@ read
Definition: utf16le.cpp:32

utf_converter::index
unsigned char index
Definition: utf16le.cpp:38

utf_converter::outputfile
fstream outputfile
Definition: utf16le.cpp:40

utf_converter::enc_types
enc_types
Definition: utf16le.cpp:31

utf_converter::utf32le
@ utf32le
Definition: utf16le.cpp:31

utf_converter::utf16le
@ utf16le
Definition: utf16le.cpp:31

utf_converter::utf8
@ utf8
Definition: utf16le.cpp:31

utf_converter::utf16be
@ utf16be
Definition: utf16le.cpp:31

utf_converter::utf32be
@ utf32be
Definition: utf16le.cpp:31

utf_converter::detect
@ detect
Definition: utf16le.cpp:31

utf_converter::error
err_types error
Definition: utf16le.cpp:35

utf_converter::getBOM
enc_types getBOM()
Definition: utf16le.cpp:71

utf_converter::getDWord
std::streamsize getDWord(wchar_t &d)
Definition: utf16le.cpp:166

main
int main()
Definition: test.c:6

strcasecmp
#define strcasecmp
Definition: fake.h:9

w2
GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint GLdouble GLdouble w2
Definition: glext.h:8308

buffer
GLuint buffer
Definition: glext.h:5915

c
const GLubyte * c
Definition: glext.h:8905

index
GLuint index
Definition: glext.h:6031

w
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:6102

i
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248

cout
#define cout
Definition: iostream.cpp:38

cerr
#define cerr
Definition: iostream.cpp:39

d
#define d
Definition: ke_i.h:81

c
#define c
Definition: ke_i.h:80

argv
#define argv
Definition: mplay32.c:18

std
Definition: features.h:417

err
#define err(...)
Definition: reactos_support_code.h:30

wchar_t
#define wchar_t
Definition: wchar.h:102

ret
int ret
Definition: wcstombs-tests.c:31