Home | Info | Community | Development | myReactOS | Contact Us
ReactOS Development > Doxygenff_unicode.c
Go to the documentation of this file.
00001 /***************************************************************************** 00002 * FullFAT - High Performance, Thread-Safe Embedded FAT File-System * 00003 * Copyright (C) 2009 James Walmsley (james@worm.me.uk) * 00004 * * 00005 * This program is free software: you can redistribute it and/or modify * 00006 * it under the terms of the GNU General Public License as published by * 00007 * the Free Software Foundation, either version 3 of the License, or * 00008 * (at your option) any later version. * 00009 * * 00010 * This program is distributed in the hope that it will be useful, * 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 00013 * GNU General Public License for more details. * 00014 * * 00015 * You should have received a copy of the GNU General Public License * 00016 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 00017 * * 00018 * IMPORTANT NOTICE: * 00019 * ================= * 00020 * Alternative Licensing is available directly from the Copyright holder, * 00021 * (James Walmsley). For more information consult LICENSING.TXT to obtain * 00022 * a Commercial license. * 00023 * * 00024 * See RESTRICTIONS.TXT for extra restrictions on the use of FullFAT. * 00025 * * 00026 * Removing the above notice is illegal and will invalidate this license. * 00027 ***************************************************************************** 00028 * See http://worm.me.uk/fullfat for more information. * 00029 * Or http://fullfat.googlecode.com/ for latest releases and the wiki. * 00030 *****************************************************************************/ 00031 00042 #include "ff_unicode.h" 00043 #include "string.h" 00044 00045 // UTF-8 Routines 00046 00047 /* 00048 UCS-4 range (hex.) UTF-8 octet sequence (binary) 00049 0000 0000-0000 007F 0xxxxxxx 00050 0000 0080-0000 07FF 110xxxxx 10xxxxxx 00051 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 00052 00053 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00054 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx -- We don't encode these because we won't receive them. (Invalid UNICODE). 00055 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx -- We don't encode these because we won't receive them. (Invalid UNICODE). 00056 */ 00057 00058 FF_T_UINT FF_GetUtf16SequenceLen(FF_T_UINT16 usLeadChar) { 00059 if((usLeadChar & 0xFC00) == 0xD800) { 00060 return 2; 00061 } 00062 return 1; 00063 } 00064 00065 /* 00066 Returns the number of UTF-8 units read. 00067 Will not exceed ulSize UTF-16 units. (ulSize * 2 bytes). 00068 */ 00069 /* 00070 UCS-4 range (hex.) UTF-8 octet sequence (binary) 00071 0000 0000-0000 007F 0xxxxxxx 00072 0000 0080-0000 07FF 110xxxxx 10xxxxxx 00073 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 00074 00075 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00076 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx -- We don't encode these because we won't receive them. (Invalid UNICODE). 00077 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx -- We don't encode these because we won't receive them. (Invalid UNICODE). 00078 */ 00079 FF_T_SINT32 FF_Utf8ctoUtf16c(FF_T_UINT16 *utf16Dest, const FF_T_UINT8 *utf8Source, FF_T_UINT32 ulSize) { 00080 FF_T_UINT32 ulUtf32char; 00081 FF_T_UINT16 utf16Source = 0; 00082 register FF_T_INT uiSequenceNumber = 0; 00083 00084 while((*utf8Source & (0x80 >> (uiSequenceNumber)))) { // Count number of set bits before a zero. 00085 uiSequenceNumber++; 00086 } 00087 00088 if(!uiSequenceNumber) { 00089 uiSequenceNumber++; 00090 } 00091 00092 if(!ulSize) { 00093 return FF_ERR_UNICODE_DEST_TOO_SMALL; 00094 } 00095 00096 switch(uiSequenceNumber) { 00097 case 1: 00098 utf16Source = (FF_T_UINT16) *utf8Source; 00099 memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16)); 00100 //bobtntfullfat *utf16Dest = (FF_T_UINT16) *utf8Source; 00101 break; 00102 00103 case 2: 00104 utf16Source =(FF_T_UINT16) ((*utf8Source & 0x1F) << 6) | ((*(utf8Source + 1) & 0x3F)); 00105 memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16)); 00106 //bobtntfullfat *utf16Dest = (FF_T_UINT16) ((*utf8Source & 0x1F) << 6) | ((*(utf8Source + 1) & 0x3F)); 00107 break; 00108 00109 case 3: 00110 utf16Source =(FF_T_UINT16) ((*utf8Source & 0x0F) << 12) | ((*(utf8Source + 1) & 0x3F) << 6) | ((*(utf8Source + 2) & 0x3F)); 00111 memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16)); 00112 //bobtntfullfat *utf16Dest = (FF_T_UINT16) ((*utf8Source & 0x0F) << 12) | ((*(utf8Source + 1) & 0x3F) << 6) | ((*(utf8Source + 2) & 0x3F)); 00113 break; 00114 00115 case 4: 00116 // Convert to UTF-32 and then into UTF-16 00117 if(ulSize < 2) { 00118 return FF_ERR_UNICODE_DEST_TOO_SMALL; 00119 } 00120 ulUtf32char = (FF_T_UINT16) ((*utf8Source & 0x0F) << 18) | ((*(utf8Source + 1) & 0x3F) << 12) | ((*(utf8Source + 2) & 0x3F) << 6) | ((*(utf8Source + 3) & 0x3F)); 00121 00122 utf16Source = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800; 00123 memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16)); 00124 utf16Source = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00; 00125 memcpy(utf16Dest+1,&utf16Source,sizeof(FF_T_UINT16)); 00126 //bobtntfullfat *(utf16Dest + 0) = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800; 00127 //bobtntfullfat *(utf16Dest + 1) = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00; 00128 break; 00129 00130 default: 00131 break; 00132 } 00133 00134 return uiSequenceNumber; 00135 } 00136 00137 00138 /* 00139 Returns the number of UTF-8 units required to encode the UTF-16 sequence. 00140 Will not exceed ulSize UTF-8 units. (ulSize * 1 bytes). 00141 */ 00142 FF_T_SINT32 FF_Utf16ctoUtf8c(FF_T_UINT8 *utf8Dest, const FF_T_UINT16 *utf16Source, FF_T_UINT32 ulSize) { 00143 FF_T_UINT32 ulUtf32char; 00144 FF_T_UINT16 ulUtf16char; 00145 00146 if(!ulSize) { 00147 return FF_ERR_UNICODE_DEST_TOO_SMALL; 00148 } 00149 00150 memcpy(&ulUtf16char, utf16Source, sizeof(FF_T_UINT16)); 00151 if((/*bobtntfullfat *utf16Source*/ulUtf16char & 0xF800) == 0xD800) { // A surrogate sequence was encountered. Must transform to UTF32 first. 00152 ulUtf32char = ((FF_T_UINT32) (ulUtf16char & 0x003FF) << 10) + 0x10000; 00153 //bobtntfullfat ulUtf32char = ((FF_T_UINT32) (*(utf16Source + 0) & 0x003FF) << 10) + 0x10000; 00154 00155 memcpy(&ulUtf16char, utf16Source + 1, sizeof(FF_T_UINT16)); 00156 if((/*bobtntfullfat *(utf16Source + 1)*/ulUtf16char & 0xFC00) != 0xDC00) { 00157 return FF_ERR_UNICODE_INVALID_SEQUENCE; // Invalid UTF-16 sequence. 00158 } 00159 ulUtf32char |= ((FF_T_UINT32) (/*bobtntfullfat *(utf16Source + 1)*/ulUtf16char & 0x003FF)); 00160 00161 } else { 00162 ulUtf32char = (FF_T_UINT32) /*bobtntfullfat *utf16Source*/ulUtf16char; 00163 } 00164 00165 // Now convert to the UTF-8 sequence. 00166 if(ulUtf32char < 0x00000080) { // Single byte UTF-8 sequence. 00167 *(utf8Dest + 0) = (FF_T_UINT8) ulUtf32char; 00168 return 1; 00169 } 00170 00171 if(ulUtf32char < 0x00000800) { // Double byte UTF-8 sequence. 00172 if(ulSize < 2) { 00173 return FF_ERR_UNICODE_DEST_TOO_SMALL; 00174 } 00175 *(utf8Dest + 0) = (FF_T_UINT8) (0xC0 | ((ulUtf32char >> 6) & 0x1F)); 00176 *(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0) & 0x3F)); 00177 return 2; 00178 } 00179 00180 if(ulUtf32char < 0x00010000) { // Triple byte UTF-8 sequence. 00181 if(ulSize < 3) { 00182 return FF_ERR_UNICODE_DEST_TOO_SMALL; 00183 } 00184 *(utf8Dest + 0) = (FF_T_UINT8) (0xE0 | ((ulUtf32char >> 12) & 0x0F)); 00185 *(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 6 ) & 0x3F)); 00186 *(utf8Dest + 2) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0 ) & 0x3F)); 00187 return 3; 00188 } 00189 00190 if(ulUtf32char < 0x00200000) { // Quadruple byte UTF-8 sequence. 00191 if(ulSize < 4) { 00192 return FF_ERR_UNICODE_DEST_TOO_SMALL; 00193 } 00194 *(utf8Dest + 0) = (FF_T_UINT8) (0xF0 | ((ulUtf32char >> 18) & 0x07)); 00195 *(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 12) & 0x3F)); 00196 *(utf8Dest + 2) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 6 ) & 0x3F)); 00197 *(utf8Dest + 3) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0 ) & 0x3F)); 00198 return 4; 00199 } 00200 00201 return FF_ERR_UNICODE_INVALID_CODE; // Invalid Charachter 00202 } 00203 00204 00205 // UTF-16 Support Functions 00206 00207 // Converts a UTF-32 Charachter into its equivalent UTF-16 sequence. 00208 FF_T_SINT32 FF_Utf32ctoUtf16c(FF_T_UINT16 *utf16Dest, FF_T_UINT32 utf32char, FF_T_UINT32 ulSize) { 00209 00210 // Check that its a valid UTF-32 wide-char! 00211 00212 if(utf32char >= 0xD800 && utf32char <= 0xDFFF) { // This range is not a valid Unicode code point. 00213 return FF_ERR_UNICODE_INVALID_CODE; // Invalid charachter. 00214 } 00215 00216 if(utf32char < 0x10000) { 00217 *utf16Dest = (FF_T_UINT16) utf32char; // Simple conversion! Char comes within UTF-16 space (without surrogates). 00218 return 1; 00219 } 00220 00221 if(ulSize < 2) { 00222 return FF_ERR_UNICODE_DEST_TOO_SMALL; // Not enough UTF-16 units to record this charachter. 00223 } 00224 00225 if(utf32char < 0x00200000) { 00226 // Conversion to a UTF-16 Surrogate pair! 00227 //valueImage = utf32char - 0x10000; 00228 00229 *(utf16Dest + 0) = (FF_T_UINT16) (((utf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800; 00230 *(utf16Dest + 1) = (FF_T_UINT16) (((utf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00; 00231 00232 return 2; // Surrogate pair encoded value. 00233 } 00234 00235 return FF_ERR_UNICODE_INVALID_CODE; // Invalid Charachter 00236 } 00237 00238 // Converts a UTF-16 sequence into its equivalent UTF-32 code point. 00239 FF_T_SINT32 FF_Utf16ctoUtf32c(FF_T_UINT32 *utf32Dest, const FF_T_UINT16 *utf16Source) { 00240 00241 if((*utf16Source & 0xFC00) != 0xD800) { // Not a surrogate sequence. 00242 *utf32Dest = (FF_T_UINT32) *utf16Source; 00243 return 1; // A single UTF-16 item was used to represent the charachter. 00244 } 00245 00246 *utf32Dest = ((FF_T_UINT32) (*(utf16Source + 0) & 0x003FF) << 10) + 0x10000; 00247 00248 if((*(utf16Source + 1) & 0xFC00) != 0xDC00) { 00249 return FF_ERR_UNICODE_INVALID_SEQUENCE; // Invalid UTF-16 sequence. 00250 } 00251 *utf32Dest |= ((FF_T_UINT32) (*(utf16Source + 1) & 0x003FF)); 00252 return 2; // 2 utf-16 units make up the Unicode code-point. 00253 } 00254 00255 00256 /* 00257 Returns the total number of UTF-16 items required to represent 00258 the provided UTF-32 string in UTF-16 form. 00259 */ 00260 /* 00261 FF_T_UINT FF_Utf32GetUtf16Len(const FF_T_UINT32 *utf32String) { 00262 FF_T_UINT utf16len = 0; 00263 00264 while(*utf32String) { 00265 if(*utf32String++ <= 0xFFFF) { 00266 utf16len++; 00267 } else { 00268 utf16len += 2; 00269 } 00270 } 00271 00272 return utf16len; 00273 }*/ 00274 00275 00276 // String conversions 00277 00278 FF_T_SINT32 FF_Utf32stoUtf8s(FF_T_UINT8 *Utf8String, FF_T_UINT32 *Utf32String) { 00279 int i = 0,y = 0; 00280 00281 FF_T_UINT16 utf16buffer[2]; 00282 00283 while(Utf32String[i]) { 00284 // Convert to a UTF16 char. 00285 FF_Utf32ctoUtf16c(utf16buffer, Utf32String[i], 2); 00286 // Now convert the UTF16 to UTF8 sequence. 00287 y += FF_Utf16ctoUtf8c(&Utf8String[y], utf16buffer, 4); 00288 i++; 00289 } 00290 00291 Utf8String[y] = '\0'; 00292 00293 return 0; 00294 } Generated on Sun May 27 2012 04:34:09 for ReactOS by
1.7.6.1
|