ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

utf8.c
Go to the documentation of this file.
00001 /*
00002  * UTF-8 support routines
00003  *
00004  * Copyright 2000 Alexandre Julliard
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
00019  */
00020 
00021 #include <string.h>
00022 
00023 #include "wine/unicode.h"
00024 
00025 extern WCHAR compose( const WCHAR *str );
00026 
/*
 * Number of continuation bytes that follow a given lead byte.  The table
 * only covers bytes above 0x7f and is indexed by (byte - 0x80).  A zero
 * entry means the byte does not introduce a multi-byte sequence.
 */
static const char utf8_length[128] =
{
    /* 0x80-0xbf */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xc0-0xdf */
    0,0,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xe0-0xef */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    /* 0xf0-0xff */
    3,3,3,3,3,0,0,0, 0,0,0,0,0,0,0,0
};

/* Bits of the first byte that carry payload, indexed by the utf8_length value */
static const unsigned char utf8_mask[4] =
{
    0x7f,  /* no continuation bytes */
    0x1f,  /* one continuation byte */
    0x0f,  /* two continuation bytes */
    0x07   /* three continuation bytes */
};

/* Smallest Unicode value allowed for a sequence, indexed by the utf8_length value */
static const unsigned int utf8_minval[4] =
{
    0x0,
    0x80,
    0x800,
    0x10000
};
00045 
00046 
00047 /* get the next char value taking surrogates into account */
00048 static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
00049 {
00050     if (src[0] >= 0xd800 && src[0] <= 0xdfff)  /* surrogate pair */
00051     {
00052         if (src[0] > 0xdbff || /* invalid high surrogate */
00053             srclen <= 1 ||     /* missing low surrogate */
00054             src[1] < 0xdc00 || src[1] > 0xdfff) /* invalid low surrogate */
00055             return 0;
00056         return 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
00057     }
00058     return src[0];
00059 }
00060 
00061 /* query necessary dst length for src string */
00062 static inline int get_length_wcs_utf8( int flags, const WCHAR *src, unsigned int srclen )
00063 {
00064     int len;
00065     unsigned int val;
00066 
00067     for (len = 0; srclen; srclen--, src++)
00068     {
00069         if (*src < 0x80)  /* 0x00-0x7f: 1 byte */
00070         {
00071             len++;
00072             continue;
00073         }
00074         if (*src < 0x800)  /* 0x80-0x7ff: 2 bytes */
00075         {
00076             len += 2;
00077             continue;
00078         }
00079         if (!(val = get_surrogate_value( src, srclen )))
00080         {
00081             if (flags & WC_ERR_INVALID_CHARS) return -2;
00082             continue;
00083         }
00084         if (val < 0x10000)  /* 0x800-0xffff: 3 bytes */
00085             len += 3;
00086         else   /* 0x10000-0x10ffff: 4 bytes */
00087         {
00088             len += 4;
00089             src++;
00090             srclen--;
00091         }
00092     }
00093     return len;
00094 }
00095 
00096 /* wide char to UTF-8 string conversion */
00097 /* return -1 on dst buffer overflow, -2 on invalid input char */
00098 int wine_utf8_wcstombs( int flags, const WCHAR *src, int srclen, char *dst, int dstlen )
00099 {
00100     int len;
00101 
00102     if (!dstlen) return get_length_wcs_utf8( flags, src, srclen );
00103 
00104     for (len = dstlen; srclen; srclen--, src++)
00105     {
00106         WCHAR ch = *src;
00107         unsigned int val;
00108 
00109         if (ch < 0x80)  /* 0x00-0x7f: 1 byte */
00110         {
00111             if (!len--) return -1;  /* overflow */
00112             *dst++ = ch;
00113             continue;
00114         }
00115 
00116         if (ch < 0x800)  /* 0x80-0x7ff: 2 bytes */
00117         {
00118             if ((len -= 2) < 0) return -1;  /* overflow */
00119             dst[1] = 0x80 | (ch & 0x3f);
00120             ch >>= 6;
00121             dst[0] = 0xc0 | ch;
00122             dst += 2;
00123             continue;
00124         }
00125 
00126         if (!(val = get_surrogate_value( src, srclen )))
00127         {
00128             if (flags & WC_ERR_INVALID_CHARS) return -2;
00129             continue;
00130         }
00131 
00132         if (val < 0x10000)  /* 0x800-0xffff: 3 bytes */
00133         {
00134             if ((len -= 3) < 0) return -1;  /* overflow */
00135             dst[2] = 0x80 | (val & 0x3f);
00136             val >>= 6;
00137             dst[1] = 0x80 | (val & 0x3f);
00138             val >>= 6;
00139             dst[0] = 0xe0 | val;
00140             dst += 3;
00141         }
00142         else   /* 0x10000-0x10ffff: 4 bytes */
00143         {
00144             if ((len -= 4) < 0) return -1;  /* overflow */
00145             dst[3] = 0x80 | (val & 0x3f);
00146             val >>= 6;
00147             dst[2] = 0x80 | (val & 0x3f);
00148             val >>= 6;
00149             dst[1] = 0x80 | (val & 0x3f);
00150             val >>= 6;
00151             dst[0] = 0xf0 | val;
00152             dst += 4;
00153             src++;
00154             srclen--;
00155         }
00156     }
00157     return dstlen - len;
00158 }
00159 
00160 /* helper for the various utf8 mbstowcs functions */
00161 static inline unsigned int decode_utf8_char( unsigned char ch, const char **str, const char *strend )
00162 {
00163     unsigned int len = utf8_length[ch-0x80];
00164     unsigned int res = ch & utf8_mask[len];
00165     const char *end = *str + len;
00166 
00167     if (end > strend) return ~0;
00168     switch(len)
00169     {
00170     case 3:
00171         if ((ch = end[-3] ^ 0x80) >= 0x40) break;
00172         res = (res << 6) | ch;
00173         (*str)++;
00174     case 2:
00175         if ((ch = end[-2] ^ 0x80) >= 0x40) break;
00176         res = (res << 6) | ch;
00177         (*str)++;
00178     case 1:
00179         if ((ch = end[-1] ^ 0x80) >= 0x40) break;
00180         res = (res << 6) | ch;
00181         (*str)++;
00182         if (res < utf8_minval[len]) break;
00183         return res;
00184     }
00185     return ~0;
00186 }
00187 
00188 /* query necessary dst length for src string with composition */
00189 static inline int get_length_mbs_utf8_compose( int flags, const char *src, int srclen )
00190 {
00191     int ret = 0;
00192     unsigned int res;
00193     WCHAR composed[2];
00194     const char *srcend = src + srclen;
00195 
00196     composed[0] = 0;
00197     while (src < srcend)
00198     {
00199         unsigned char ch = *src++;
00200         if (ch < 0x80)  /* special fast case for 7-bit ASCII */
00201         {
00202             composed[0] = ch;
00203             ret++;
00204             continue;
00205         }
00206         if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
00207         {
00208             if (composed[0])
00209             {
00210                 composed[1] = res;
00211                 if ((composed[0] = compose( composed ))) continue;
00212             }
00213             composed[0] = res;
00214             ret++;
00215         }
00216         else if (res <= 0x10ffff)
00217         {
00218             ret += 2;
00219             composed[0] = 0;  /* no composition for surrogates */
00220         }
00221         else if (flags & MB_ERR_INVALID_CHARS) return -2;  /* bad char */
00222         /* otherwise ignore it */
00223     }
00224     return ret;
00225 }
00226 
00227 /* UTF-8 to wide char string conversion with composition */
00228 /* return -1 on dst buffer overflow, -2 on invalid input char */
00229 static int utf8_mbstowcs_compose( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
00230 {
00231     unsigned int res;
00232     const char *srcend = src + srclen;
00233     WCHAR composed[2];
00234     WCHAR *dstend = dst + dstlen;
00235 
00236     if (!dstlen) return get_length_mbs_utf8_compose( flags, src, srclen );
00237 
00238     composed[0] = 0;
00239     while (src < srcend)
00240     {
00241         unsigned char ch = *src++;
00242         if (ch < 0x80)  /* special fast case for 7-bit ASCII */
00243         {
00244             if (dst >= dstend) return -1;  /* overflow */
00245             *dst++ = composed[0] = ch;
00246             continue;
00247         }
00248         if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
00249         {
00250             if (composed[0])
00251             {
00252                 composed[1] = res;
00253                 if ((composed[0] = compose( composed )))
00254                 {
00255                     dst[-1] = composed[0];
00256                     continue;
00257                 }
00258             }
00259             if (dst >= dstend) return -1;  /* overflow */
00260             *dst++ = composed[0] = res;
00261         }
00262         else if (res <= 0x10ffff) /* we need surrogates */
00263         {
00264             if (dst >= dstend - 1) return -1;  /* overflow */
00265             res -= 0x10000;
00266             *dst++ = 0xd800 | (res >> 10);
00267             *dst++ = 0xdc00 | (res & 0x3ff);
00268             composed[0] = 0;  /* no composition for surrogates */
00269         }
00270         else if (flags & MB_ERR_INVALID_CHARS) return -2;  /* bad char */
00271         /* otherwise ignore it */
00272     }
00273     return dstlen - (dstend - dst);
00274 }
00275 
00276 /* query necessary dst length for src string */
00277 static inline int get_length_mbs_utf8( int flags, const char *src, int srclen )
00278 {
00279     int ret = 0;
00280     unsigned int res;
00281     const char *srcend = src + srclen;
00282 
00283     while (src < srcend)
00284     {
00285         unsigned char ch = *src++;
00286         if (ch < 0x80)  /* special fast case for 7-bit ASCII */
00287         {
00288             ret++;
00289             continue;
00290         }
00291         if ((res = decode_utf8_char( ch, &src, srcend )) <= 0x10ffff)
00292         {
00293             if (res > 0xffff) ret++;
00294             ret++;
00295         }
00296         else if (flags & MB_ERR_INVALID_CHARS) return -2;  /* bad char */
00297         /* otherwise ignore it */
00298     }
00299     return ret;
00300 }
00301 
00302 /* UTF-8 to wide char string conversion */
00303 /* return -1 on dst buffer overflow, -2 on invalid input char */
00304 int wine_utf8_mbstowcs( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
00305 {
00306     unsigned int res;
00307     const char *srcend = src + srclen;
00308     WCHAR *dstend = dst + dstlen;
00309 
00310     if (flags & MB_COMPOSITE) return utf8_mbstowcs_compose( flags, src, srclen, dst, dstlen );
00311 
00312     if (!dstlen) return get_length_mbs_utf8( flags, src, srclen );
00313 
00314     while ((dst < dstend) && (src < srcend))
00315     {
00316         unsigned char ch = *src++;
00317         if (ch < 0x80)  /* special fast case for 7-bit ASCII */
00318         {
00319             *dst++ = ch;
00320             continue;
00321         }
00322         if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
00323         {
00324             *dst++ = res;
00325         }
00326         else if (res <= 0x10ffff)  /* we need surrogates */
00327         {
00328             if (dst == dstend - 1) return -1;  /* overflow */
00329             res -= 0x10000;
00330             *dst++ = 0xd800 | (res >> 10);
00331             *dst++ = 0xdc00 | (res & 0x3ff);
00332         }
00333         else if (flags & MB_ERR_INVALID_CHARS) return -2;  /* bad char */
00334         /* otherwise ignore it */
00335     }
00336     if (src < srcend) return -1;  /* overflow */
00337     return dstlen - (dstend - dst);
00338 }

Generated on Sat May 26 2012 04:34:53 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.