ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

sortkey.c
Go to the documentation of this file.
00001 /*
00002  * Unicode sort key generation
00003  *
00004  * Copyright 2003 Dmitry Timoshkov
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
00019  */
00020 #include "wine/unicode.h"
00021 
00022 extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen);
00023 extern const unsigned int collation_table[];
00024 
00025 /*
00026  * flags - normalization NORM_* flags
00027  *
00028  * FIXME: 'variable' flag not handled
00029  */
00030 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
00031 {
00032     WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
00033     int key_len[4];
00034     char *key_ptr[4];
00035     const WCHAR *src_save = src;
00036     int srclen_save = srclen;
00037 
00038     key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
00039     for (; srclen; srclen--, src++)
00040     {
00041         int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
00042         dummy[0] = *src;
00043         if (decomposed_len)
00044         {
00045             int i;
00046             for (i = 0; i < decomposed_len; i++)
00047             {
00048                 WCHAR wch = dummy[i];
00049                 unsigned int ce;
00050 
00051                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
00052                  * and skips white space and punctuation characters for
00053                  * NORM_IGNORESYMBOLS.
00054                  */
00055                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
00056                     continue;
00057 
00058                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
00059 
00060                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
00061                 if (ce != (unsigned int)-1)
00062                 {
00063                     if (ce >> 16) key_len[0] += 2;
00064                     if ((ce >> 8) & 0xff) key_len[1]++;
00065                     if ((ce >> 4) & 0x0f) key_len[2]++;
00066                     if (ce & 1)
00067                     {
00068                         if (wch >> 8) key_len[3]++;
00069                         key_len[3]++;
00070                     }
00071                 }
00072                 else
00073                 {
00074                     key_len[0] += 2;
00075                     if (wch >> 8) key_len[0]++;
00076                     if (wch & 0xff) key_len[0]++;
00077         }
00078             }
00079         }
00080     }
00081 
00082     if (!dstlen) /* compute length */
00083         /* 4 * '\1' + 1 * '\0' + key length */
00084         return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;
00085 
00086     if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
00087         return 0; /* overflow */
00088 
00089     src = src_save;
00090     srclen = srclen_save;
00091 
00092     key_ptr[0] = dst;
00093     key_ptr[1] = key_ptr[0] + key_len[0] + 1;
00094     key_ptr[2] = key_ptr[1] + key_len[1] + 1;
00095     key_ptr[3] = key_ptr[2] + key_len[2] + 1;
00096 
00097     for (; srclen; srclen--, src++)
00098     {
00099         int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
00100         dummy[0] = *src;
00101         if (decomposed_len)
00102         {
00103             int i;
00104             for (i = 0; i < decomposed_len; i++)
00105             {
00106                 WCHAR wch = dummy[i];
00107                 unsigned int ce;
00108 
00109                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
00110                  * and skips white space and punctuation characters for
00111                  * NORM_IGNORESYMBOLS.
00112                  */
00113                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
00114                     continue;
00115 
00116                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
00117 
00118                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
00119                 if (ce != (unsigned int)-1)
00120                 {
00121                     WCHAR key;
00122                     if ((key = ce >> 16))
00123                     {
00124                         *key_ptr[0]++ = key >> 8;
00125                         *key_ptr[0]++ = key & 0xff;
00126                     }
00127                     /* make key 1 start from 2 */
00128                     if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
00129                     /* make key 2 start from 2 */
00130                     if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
00131                     /* key 3 is always a character code */
00132                     if (ce & 1)
00133                     {
00134                         if (wch >> 8) *key_ptr[3]++ = wch >> 8;
00135                         if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
00136                     }
00137                 }
00138                 else
00139                 {
00140                     *key_ptr[0]++ = 0xff;
00141                     *key_ptr[0]++ = 0xfe;
00142                     if (wch >> 8) *key_ptr[0]++ = wch >> 8;
00143                     if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
00144                 }
00145             }
00146         }
00147     }
00148 
00149     *key_ptr[0] = '\1';
00150     *key_ptr[1] = '\1';
00151     *key_ptr[2] = '\1';
00152     *key_ptr[3]++ = '\1';
00153     *key_ptr[3] = 0;
00154 
00155     return key_ptr[3] - dst;
00156 }
00157 
00158 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
00159                                           const WCHAR *str2, int len2)
00160 {
00161     unsigned int ce1, ce2;
00162     int ret;
00163 
00164     /* 32-bit collation element table format:
00165      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
00166      * case weight - high 4 bit of low 8 bit.
00167      */
00168     while (len1 > 0 && len2 > 0)
00169     {
00170         if (flags & NORM_IGNORESYMBOLS)
00171         {
00172             int skip = 0;
00173             /* FIXME: not tested */
00174             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
00175             {
00176                 str1++;
00177                 len1--;
00178                 skip = 1;
00179             }
00180             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
00181             {
00182                 str2++;
00183                 len2--;
00184                 skip = 1;
00185             }
00186             if (skip) continue;
00187         }
00188 
00189        /* hyphen and apostrophe are treated differently depending on
00190         * whether SORT_STRINGSORT specified or not
00191         */
00192         if (!(flags & SORT_STRINGSORT))
00193         {
00194             if (*str1 == '-' || *str1 == '\'')
00195             {
00196                 if (*str2 != '-' && *str2 != '\'')
00197                 {
00198                     str1++;
00199                     len1--;
00200                     continue;
00201                 }
00202             }
00203             else if (*str2 == '-' || *str2 == '\'')
00204             {
00205                 str2++;
00206                 len2--;
00207                 continue;
00208             }
00209         }
00210 
00211         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
00212         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
00213 
00214         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
00215             ret = (ce1 >> 16) - (ce2 >> 16);
00216         else
00217             ret = *str1 - *str2;
00218 
00219         if (ret) return ret;
00220 
00221         str1++;
00222         str2++;
00223         len1--;
00224         len2--;
00225     }
00226     return len1 - len2;
00227 }
00228 
00229 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
00230                                             const WCHAR *str2, int len2)
00231 {
00232     unsigned int ce1, ce2;
00233     int ret;
00234 
00235     /* 32-bit collation element table format:
00236      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
00237      * case weight - high 4 bit of low 8 bit.
00238      */
00239     while (len1 > 0 && len2 > 0)
00240     {
00241         if (flags & NORM_IGNORESYMBOLS)
00242         {
00243             int skip = 0;
00244             /* FIXME: not tested */
00245             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
00246             {
00247                 str1++;
00248                 len1--;
00249                 skip = 1;
00250             }
00251             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
00252             {
00253                 str2++;
00254                 len2--;
00255                 skip = 1;
00256             }
00257             if (skip) continue;
00258         }
00259 
00260         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
00261         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
00262 
00263         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
00264             ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
00265         else
00266             ret = *str1 - *str2;
00267 
00268         if (ret) return ret;
00269 
00270         str1++;
00271         str2++;
00272         len1--;
00273         len2--;
00274     }
00275     return len1 - len2;
00276 }
00277 
00278 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
00279                                        const WCHAR *str2, int len2)
00280 {
00281     unsigned int ce1, ce2;
00282     int ret;
00283 
00284     /* 32-bit collation element table format:
00285      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
00286      * case weight - high 4 bit of low 8 bit.
00287      */
00288     while (len1 > 0 && len2 > 0)
00289     {
00290         if (flags & NORM_IGNORESYMBOLS)
00291         {
00292             int skip = 0;
00293             /* FIXME: not tested */
00294             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
00295             {
00296                 str1++;
00297                 len1--;
00298                 skip = 1;
00299             }
00300             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
00301             {
00302                 str2++;
00303                 len2--;
00304                 skip = 1;
00305             }
00306             if (skip) continue;
00307         }
00308 
00309         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
00310         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
00311 
00312         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
00313             ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
00314         else
00315             ret = *str1 - *str2;
00316 
00317         if (ret) return ret;
00318 
00319         str1++;
00320         str2++;
00321         len1--;
00322         len2--;
00323     }
00324     return len1 - len2;
00325 }
00326 
00327 static inline int real_length(const WCHAR *str, int len)
00328 {
00329     while (len && !str[len - 1]) len--;
00330     return len;
00331 }
00332 
00333 int wine_compare_string(int flags, const WCHAR *str1, int len1,
00334                         const WCHAR *str2, int len2)
00335 {
00336     int ret;
00337 
00338     len1 = real_length(str1, len1);
00339     len2 = real_length(str2, len2);
00340 
00341     ret = compare_unicode_weights(flags, str1, len1, str2, len2);
00342     if (!ret)
00343     {
00344         if (!(flags & NORM_IGNORENONSPACE))
00345             ret = compare_diacritic_weights(flags, str1, len1, str2, len2);
00346         if (!ret && !(flags & NORM_IGNORECASE))
00347             ret = compare_case_weights(flags, str1, len1, str2, len2);
00348     }
00349     return ret;
00350 }

Generated on Sun May 27 2012 04:24:36 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.