Home | Info | Community | Development | myReactOS | Contact Us
ReactOS Development > Doxygen > sortkey.c
Go to the documentation of this file.
00001 /* 00002 * Unicode sort key generation 00003 * 00004 * Copyright 2003 Dmitry Timoshkov 00005 * 00006 * This library is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2.1 of the License, or (at your option) any later version. 00010 * 00011 * This library is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with this library; if not, write to the Free Software 00018 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA 00019 */ 00020 #include "wine/unicode.h" 00021 00022 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF) 00023 extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen); 00024 extern const unsigned int collation_table[]; 00025 00026 /* 00027 * flags - normalization NORM_* flags 00028 * 00029 * FIXME: 'variable' flag not handled 00030 */ 00031 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen) 00032 { 00033 WCHAR dummy[4]; /* no decomposition is larger than 4 chars */ 00034 int key_len[4]; 00035 char *key_ptr[4]; 00036 const WCHAR *src_save = src; 00037 int srclen_save = srclen; 00038 00039 key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0; 00040 for (; srclen; srclen--, src++) 00041 { 00042 int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/ 00043 dummy[0] = *src; 00044 if (decomposed_len) 00045 { 00046 int i; 00047 for (i = 0; i < decomposed_len; i++) 00048 { 00049 WCHAR wch = dummy[i]; 00050 unsigned int ce; 00051 00052 /* tests show that win2k just ignores NORM_IGNORENONSPACE, 00053 * and skips white space and 
punctuation characters for 00054 * NORM_IGNORESYMBOLS. 00055 */ 00056 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) 00057 continue; 00058 00059 if (flags & NORM_IGNORECASE) wch = tolowerW(wch); 00060 00061 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; 00062 if (ce != (unsigned int)-1) 00063 { 00064 if (ce >> 16) key_len[0] += 2; 00065 if ((ce >> 8) & 0xff) key_len[1]++; 00066 if ((ce >> 4) & 0x0f) key_len[2]++; 00067 if (ce & 1) 00068 { 00069 if (wch >> 8) key_len[3]++; 00070 key_len[3]++; 00071 } 00072 } 00073 else 00074 { 00075 key_len[0] += 2; 00076 if (wch >> 8) key_len[0]++; 00077 if (wch & 0xff) key_len[0]++; 00078 } 00079 } 00080 } 00081 } 00082 00083 if (!dstlen) /* compute length */ 00084 /* 4 * '\1' + 1 * '\0' + key length */ 00085 return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1; 00086 00087 if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1) 00088 return 0; /* overflow */ 00089 00090 src = src_save; 00091 srclen = srclen_save; 00092 00093 key_ptr[0] = dst; 00094 key_ptr[1] = key_ptr[0] + key_len[0] + 1; 00095 key_ptr[2] = key_ptr[1] + key_len[1] + 1; 00096 key_ptr[3] = key_ptr[2] + key_len[2] + 1; 00097 00098 for (; srclen; srclen--, src++) 00099 { 00100 int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/ 00101 dummy[0] = *src; 00102 if (decomposed_len) 00103 { 00104 int i; 00105 for (i = 0; i < decomposed_len; i++) 00106 { 00107 WCHAR wch = dummy[i]; 00108 unsigned int ce; 00109 00110 /* tests show that win2k just ignores NORM_IGNORENONSPACE, 00111 * and skips white space and punctuation characters for 00112 * NORM_IGNORESYMBOLS. 
00113 */ 00114 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) 00115 continue; 00116 00117 if (flags & NORM_IGNORECASE) wch = tolowerW(wch); 00118 00119 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; 00120 if (ce != (unsigned int)-1) 00121 { 00122 WCHAR key; 00123 if ((key = ce >> 16)) 00124 { 00125 *key_ptr[0]++ = key >> 8; 00126 *key_ptr[0]++ = key & 0xff; 00127 } 00128 /* make key 1 start from 2 */ 00129 if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1; 00130 /* make key 2 start from 2 */ 00131 if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1; 00132 /* key 3 is always a character code */ 00133 if (ce & 1) 00134 { 00135 if (wch >> 8) *key_ptr[3]++ = wch >> 8; 00136 if (wch & 0xff) *key_ptr[3]++ = wch & 0xff; 00137 } 00138 } 00139 else 00140 { 00141 *key_ptr[0]++ = 0xff; 00142 *key_ptr[0]++ = 0xfe; 00143 if (wch >> 8) *key_ptr[0]++ = wch >> 8; 00144 if (wch & 0xff) *key_ptr[0]++ = wch & 0xff; 00145 } 00146 } 00147 } 00148 } 00149 00150 *key_ptr[0] = '\1'; 00151 *key_ptr[1] = '\1'; 00152 *key_ptr[2] = '\1'; 00153 *key_ptr[3]++ = '\1'; 00154 *key_ptr[3] = 0; 00155 00156 return key_ptr[3] - dst; 00157 } 00158 00159 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1, 00160 const WCHAR *str2, int len2) 00161 { 00162 unsigned int ce1, ce2; 00163 int ret; 00164 00165 /* 32-bit collation element table format: 00166 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 00167 * case weight - high 4 bit of low 8 bit. 
00168 */ 00169 while (len1 > 0 && len2 > 0) 00170 { 00171 if (flags & NORM_IGNORESYMBOLS) 00172 { 00173 int skip = 0; 00174 /* FIXME: not tested */ 00175 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 00176 { 00177 str1++; 00178 len1--; 00179 skip = 1; 00180 } 00181 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 00182 { 00183 str2++; 00184 len2--; 00185 skip = 1; 00186 } 00187 if (skip) continue; 00188 } 00189 00190 /* hyphen and apostrophe are treated differently depending on 00191 * whether SORT_STRINGSORT specified or not 00192 */ 00193 if (!(flags & SORT_STRINGSORT)) 00194 { 00195 if (*str1 == '-' || *str1 == '\'') 00196 { 00197 if (*str2 != '-' && *str2 != '\'') 00198 { 00199 str1++; 00200 len1--; 00201 continue; 00202 } 00203 } 00204 else if (*str2 == '-' || *str2 == '\'') 00205 { 00206 str2++; 00207 len2--; 00208 continue; 00209 } 00210 } 00211 00212 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 00213 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 00214 00215 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 00216 ret = (ce1 >> 16) - (ce2 >> 16); 00217 else 00218 ret = *str1 - *str2; 00219 00220 if (ret) return ret; 00221 00222 str1++; 00223 str2++; 00224 len1--; 00225 len2--; 00226 } 00227 return len1 - len2; 00228 } 00229 00230 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1, 00231 const WCHAR *str2, int len2) 00232 { 00233 unsigned int ce1, ce2; 00234 int ret; 00235 00236 /* 32-bit collation element table format: 00237 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 00238 * case weight - high 4 bit of low 8 bit. 
00239 */ 00240 while (len1 > 0 && len2 > 0) 00241 { 00242 if (flags & NORM_IGNORESYMBOLS) 00243 { 00244 int skip = 0; 00245 /* FIXME: not tested */ 00246 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 00247 { 00248 str1++; 00249 len1--; 00250 skip = 1; 00251 } 00252 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 00253 { 00254 str2++; 00255 len2--; 00256 skip = 1; 00257 } 00258 if (skip) continue; 00259 } 00260 00261 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 00262 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 00263 00264 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 00265 ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff); 00266 else 00267 ret = *str1 - *str2; 00268 00269 if (ret) return ret; 00270 00271 str1++; 00272 str2++; 00273 len1--; 00274 len2--; 00275 } 00276 return len1 - len2; 00277 } 00278 00279 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1, 00280 const WCHAR *str2, int len2) 00281 { 00282 unsigned int ce1, ce2; 00283 int ret; 00284 00285 /* 32-bit collation element table format: 00286 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 00287 * case weight - high 4 bit of low 8 bit. 
00288 */ 00289 while (len1 > 0 && len2 > 0) 00290 { 00291 if (flags & NORM_IGNORESYMBOLS) 00292 { 00293 int skip = 0; 00294 /* FIXME: not tested */ 00295 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 00296 { 00297 str1++; 00298 len1--; 00299 skip = 1; 00300 } 00301 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 00302 { 00303 str2++; 00304 len2--; 00305 skip = 1; 00306 } 00307 if (skip) continue; 00308 } 00309 00310 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 00311 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 00312 00313 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 00314 ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f); 00315 else 00316 ret = *str1 - *str2; 00317 00318 if (ret) return ret; 00319 00320 str1++; 00321 str2++; 00322 len1--; 00323 len2--; 00324 } 00325 return len1 - len2; 00326 } 00327 00328 static inline int real_length(const WCHAR *str, int len) 00329 { 00330 while (len && !str[len - 1]) len--; 00331 return len; 00332 } 00333 00334 int wine_compare_string(int flags, const WCHAR *str1, int len1, 00335 const WCHAR *str2, int len2) 00336 { 00337 int ret; 00338 00339 len1 = real_length(str1, len1); 00340 len2 = real_length(str2, len2); 00341 00342 ret = compare_unicode_weights(flags, str1, len1, str2, len2); 00343 if (!ret) 00344 { 00345 if (!(flags & NORM_IGNORENONSPACE)) 00346 ret = compare_diacritic_weights(flags, str1, len1, str2, len2); 00347 if (!ret && !(flags & NORM_IGNORECASE)) 00348 ret = compare_case_weights(flags, str1, len1, str2, len2); 00349 } 00350 return ret; 00351 } Generated on Sat May 26 2012 04:23:09 for ReactOS by
1.7.6.1
|