ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

ff_unicode.c
Go to the documentation of this file.
00001 /*****************************************************************************
00002  *  FullFAT - High Performance, Thread-Safe Embedded FAT File-System         *
00003  *  Copyright (C) 2009  James Walmsley (james@worm.me.uk)                    *
00004  *                                                                           *
00005  *  This program is free software: you can redistribute it and/or modify     *
00006  *  it under the terms of the GNU General Public License as published by     *
00007  *  the Free Software Foundation, either version 3 of the License, or        *
00008  *  (at your option) any later version.                                      *
00009  *                                                                           *
00010  *  This program is distributed in the hope that it will be useful,          *
00011  *  but WITHOUT ANY WARRANTY; without even the implied warranty of           *
00012  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
00013  *  GNU General Public License for more details.                             *
00014  *                                                                           *
00015  *  You should have received a copy of the GNU General Public License        *
00016  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.    *
00017  *                                                                           *
00018  *  IMPORTANT NOTICE:                                                        *
00019  *  =================                                                        *
00020  *  Alternative Licensing is available directly from the Copyright holder,   *
00021  *  (James Walmsley). For more information consult LICENSING.TXT to obtain   *
00022  *  a Commercial license.                                                    *
00023  *                                                                           *
00024  *  See RESTRICTIONS.TXT for extra restrictions on the use of FullFAT.       *
00025  *                                                                           *
00026  *  Removing the above notice is illegal and will invalidate this license.   *
00027  *****************************************************************************
00028  *  See http://worm.me.uk/fullfat for more information.                      *
00029  *  Or  http://fullfat.googlecode.com/ for latest releases and the wiki.     *
00030  *****************************************************************************/
00031 
00042 #include "ff_unicode.h"
00043 #include "string.h"
00044 
00045 // UTF-8 Routines
00046 
00047 /*
00048    UCS-4 range (hex.)           UTF-8 octet sequence (binary)
00049    0000 0000-0000 007F   0xxxxxxx
00050    0000 0080-0000 07FF   110xxxxx 10xxxxxx
00051    0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
00052 
00053    0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00054    0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   -- We don't encode these because we won't receive them. (Invalid UNICODE).
00055    0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx                 -- We don't encode these because we won't receive them. (Invalid UNICODE).
00056 */
00057 
00058 FF_T_UINT FF_GetUtf16SequenceLen(FF_T_UINT16 usLeadChar) {
00059     if((usLeadChar & 0xFC00) == 0xD800) {
00060         return 2;
00061     }
00062     return 1;
00063 }
00064 
00065 /*
00066     Returns the number of UTF-8 units read.
00067     Will not exceed ulSize UTF-16 units. (ulSize * 2 bytes).
00068 */
00069 /*
00070    UCS-4 range (hex.)           UTF-8 octet sequence (binary)
00071    0000 0000-0000 007F   0xxxxxxx
00072    0000 0080-0000 07FF   110xxxxx 10xxxxxx
00073    0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
00074 
00075    0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00076    0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   -- We don't encode these because we won't receive them. (Invalid UNICODE).
00077    0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx                 -- We don't encode these because we won't receive them. (Invalid UNICODE).
00078 */
00079 FF_T_SINT32 FF_Utf8ctoUtf16c(FF_T_UINT16 *utf16Dest, const FF_T_UINT8 *utf8Source, FF_T_UINT32 ulSize) {
00080     FF_T_UINT32         ulUtf32char;
00081     FF_T_UINT16         utf16Source = 0;
00082     register FF_T_INT   uiSequenceNumber = 0;
00083 
00084     while((*utf8Source & (0x80 >> (uiSequenceNumber)))) {   // Count number of set bits before a zero.
00085         uiSequenceNumber++;
00086     }
00087 
00088     if(!uiSequenceNumber) {
00089         uiSequenceNumber++;
00090     }
00091 
00092     if(!ulSize) {
00093         return FF_ERR_UNICODE_DEST_TOO_SMALL;       
00094     }
00095 
00096     switch(uiSequenceNumber) {
00097         case 1:
00098                         utf16Source = (FF_T_UINT16) *utf8Source;
00099                         memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16));
00100             //bobtntfullfat *utf16Dest = (FF_T_UINT16) *utf8Source;
00101             break;
00102 
00103         case 2:
00104                         utf16Source =(FF_T_UINT16) ((*utf8Source & 0x1F) << 6) | ((*(utf8Source + 1) & 0x3F));
00105                         memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16));
00106             //bobtntfullfat *utf16Dest = (FF_T_UINT16) ((*utf8Source & 0x1F) << 6) | ((*(utf8Source + 1) & 0x3F));
00107             break;
00108 
00109         case 3:
00110                         utf16Source =(FF_T_UINT16) ((*utf8Source & 0x0F) << 12) | ((*(utf8Source + 1) & 0x3F) << 6) | ((*(utf8Source + 2) & 0x3F));
00111                         memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16));
00112             //bobtntfullfat *utf16Dest = (FF_T_UINT16) ((*utf8Source & 0x0F) << 12) | ((*(utf8Source + 1) & 0x3F) << 6) | ((*(utf8Source + 2) & 0x3F));
00113             break;
00114 
00115         case 4:
00116             // Convert to UTF-32 and then into UTF-16
00117             if(ulSize < 2) {
00118                 return FF_ERR_UNICODE_DEST_TOO_SMALL;
00119             }
00120             ulUtf32char = (FF_T_UINT16) ((*utf8Source & 0x0F) << 18) | ((*(utf8Source + 1) & 0x3F) << 12) | ((*(utf8Source + 2) & 0x3F) << 6) | ((*(utf8Source + 3) & 0x3F));
00121                         
00122                         utf16Source = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800;
00123                         memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16));                        
00124                         utf16Source = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00;
00125                         memcpy(utf16Dest+1,&utf16Source,sizeof(FF_T_UINT16));                                                
00126             //bobtntfullfat *(utf16Dest + 0) = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800;
00127             //bobtntfullfat *(utf16Dest + 1) = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00;
00128             break;
00129 
00130         default:
00131             break;
00132     }
00133 
00134     return uiSequenceNumber;
00135 }
00136 
00137 
00138 /*
00139     Returns the number of UTF-8 units required to encode the UTF-16 sequence.
00140     Will not exceed ulSize UTF-8 units. (ulSize  * 1 bytes).
00141 */
00142 FF_T_SINT32 FF_Utf16ctoUtf8c(FF_T_UINT8 *utf8Dest, const FF_T_UINT16 *utf16Source, FF_T_UINT32 ulSize) {
00143     FF_T_UINT32 ulUtf32char;
00144     FF_T_UINT16 ulUtf16char;
00145 
00146     if(!ulSize) {
00147         return FF_ERR_UNICODE_DEST_TOO_SMALL;
00148     }
00149 
00150         memcpy(&ulUtf16char, utf16Source, sizeof(FF_T_UINT16));
00151     if((/*bobtntfullfat *utf16Source*/ulUtf16char & 0xF800) == 0xD800) {    // A surrogate sequence was encountered. Must transform to UTF32 first.
00152                 ulUtf32char  = ((FF_T_UINT32) (ulUtf16char & 0x003FF) << 10) + 0x10000;
00153         //bobtntfullfat ulUtf32char  = ((FF_T_UINT32) (*(utf16Source + 0) & 0x003FF) << 10) + 0x10000;
00154 
00155                 memcpy(&ulUtf16char, utf16Source + 1, sizeof(FF_T_UINT16));                
00156         if((/*bobtntfullfat *(utf16Source + 1)*/ulUtf16char & 0xFC00) != 0xDC00) {
00157             return FF_ERR_UNICODE_INVALID_SEQUENCE; // Invalid UTF-16 sequence.
00158         }
00159         ulUtf32char |= ((FF_T_UINT32) (/*bobtntfullfat *(utf16Source + 1)*/ulUtf16char & 0x003FF));
00160 
00161     } else {
00162         ulUtf32char = (FF_T_UINT32) /*bobtntfullfat *utf16Source*/ulUtf16char;
00163     }
00164 
00165     // Now convert to the UTF-8 sequence.
00166     if(ulUtf32char < 0x00000080) {  // Single byte UTF-8 sequence.
00167         *(utf8Dest + 0) = (FF_T_UINT8) ulUtf32char;
00168         return 1;
00169     }
00170 
00171     if(ulUtf32char < 0x00000800) {  // Double byte UTF-8 sequence.
00172         if(ulSize < 2) {
00173             return FF_ERR_UNICODE_DEST_TOO_SMALL;
00174         }
00175         *(utf8Dest + 0) = (FF_T_UINT8) (0xC0 | ((ulUtf32char >> 6) & 0x1F));
00176         *(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0) & 0x3F));
00177         return 2;
00178     }
00179 
00180     if(ulUtf32char < 0x00010000) {  // Triple byte UTF-8 sequence.
00181         if(ulSize < 3) {
00182             return FF_ERR_UNICODE_DEST_TOO_SMALL;
00183         }
00184         *(utf8Dest + 0) = (FF_T_UINT8) (0xE0 | ((ulUtf32char >> 12) & 0x0F));
00185         *(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 6 ) & 0x3F));
00186         *(utf8Dest + 2) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0 ) & 0x3F));
00187         return 3;
00188     }
00189 
00190     if(ulUtf32char < 0x00200000) {  // Quadruple byte UTF-8 sequence.
00191         if(ulSize < 4) {
00192             return FF_ERR_UNICODE_DEST_TOO_SMALL;
00193         }
00194         *(utf8Dest + 0) = (FF_T_UINT8) (0xF0 | ((ulUtf32char >> 18) & 0x07));
00195         *(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 12) & 0x3F));
00196         *(utf8Dest + 2) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 6 ) & 0x3F));
00197         *(utf8Dest + 3) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0 ) & 0x3F));
00198         return 4;
00199     }
00200 
00201     return FF_ERR_UNICODE_INVALID_CODE; // Invalid Charachter
00202 }
00203 
00204 
00205 // UTF-16 Support Functions
00206 
00207 // Converts a UTF-32 Charachter into its equivalent UTF-16 sequence.
00208 FF_T_SINT32 FF_Utf32ctoUtf16c(FF_T_UINT16 *utf16Dest, FF_T_UINT32 utf32char, FF_T_UINT32 ulSize) {
00209 
00210     // Check that its a valid UTF-32 wide-char! 
00211 
00212     if(utf32char >= 0xD800 && utf32char <= 0xDFFF) {    // This range is not a valid Unicode code point.
00213         return FF_ERR_UNICODE_INVALID_CODE; // Invalid charachter.
00214     }
00215 
00216     if(utf32char < 0x10000) {
00217         *utf16Dest = (FF_T_UINT16) utf32char; // Simple conversion! Char comes within UTF-16 space (without surrogates).
00218         return 1;
00219     }
00220 
00221     if(ulSize < 2) {
00222         return FF_ERR_UNICODE_DEST_TOO_SMALL;   // Not enough UTF-16 units to record this charachter.
00223     }
00224 
00225     if(utf32char < 0x00200000) {
00226         // Conversion to a UTF-16 Surrogate pair!
00227         //valueImage = utf32char - 0x10000;
00228         
00229         *(utf16Dest + 0) = (FF_T_UINT16) (((utf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800;
00230         *(utf16Dest + 1) = (FF_T_UINT16) (((utf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00;
00231         
00232         return 2;   // Surrogate pair encoded value.
00233     }
00234     
00235     return FF_ERR_UNICODE_INVALID_CODE; // Invalid Charachter
00236 }
00237 
00238 // Converts a UTF-16 sequence into its equivalent UTF-32 code point.
00239 FF_T_SINT32 FF_Utf16ctoUtf32c(FF_T_UINT32 *utf32Dest, const FF_T_UINT16 *utf16Source) {
00240     
00241     if((*utf16Source & 0xFC00) != 0xD800) { // Not a surrogate sequence.
00242         *utf32Dest = (FF_T_UINT32) *utf16Source;
00243         return 1;   // A single UTF-16 item was used to represent the charachter.
00244     }
00245     
00246     *utf32Dest  = ((FF_T_UINT32) (*(utf16Source + 0) & 0x003FF) << 10) + 0x10000;
00247     
00248     if((*(utf16Source + 1) & 0xFC00) != 0xDC00) {
00249         return FF_ERR_UNICODE_INVALID_SEQUENCE; // Invalid UTF-16 sequence.
00250     }
00251     *utf32Dest |= ((FF_T_UINT32) (*(utf16Source + 1) & 0x003FF));
00252     return 2;   // 2 utf-16 units make up the Unicode code-point.
00253 }
00254 
00255 
00256 /*
00257     Returns the total number of UTF-16 items required to represent
00258     the provided UTF-32 string in UTF-16 form.
00259 */
00260 /*
00261 FF_T_UINT FF_Utf32GetUtf16Len(const FF_T_UINT32 *utf32String) {
00262     FF_T_UINT utf16len = 0;
00263 
00264     while(*utf32String) {
00265         if(*utf32String++ <= 0xFFFF) {
00266             utf16len++;
00267         } else {
00268             utf16len += 2;
00269         }
00270     }
00271     
00272     return utf16len;
00273 }*/
00274 
00275 
00276 // String conversions
00277 
00278 FF_T_SINT32 FF_Utf32stoUtf8s(FF_T_UINT8 *Utf8String, FF_T_UINT32 *Utf32String) {
00279     int i = 0,y = 0;
00280 
00281     FF_T_UINT16 utf16buffer[2];
00282 
00283     while(Utf32String[i]) {
00284         // Convert to a UTF16 char.
00285         FF_Utf32ctoUtf16c(utf16buffer, Utf32String[i], 2);
00286         // Now convert the UTF16 to UTF8 sequence.
00287         y += FF_Utf16ctoUtf8c(&Utf8String[y], utf16buffer, 4);
00288         i++;
00289     }
00290 
00291     Utf8String[y] = '\0';
00292 
00293     return 0;
00294 }

Generated on Sun May 27 2012 04:34:09 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.