ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

sortkey.c
Go to the documentation of this file.
00001 /*
00002  * Unicode sort key generation
00003  *
00004  * Copyright 2003 Dmitry Timoshkov
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
00019  */
00020 #include "wine/unicode.h"
00021 
00022 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF)
00023 extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen);
00024 extern const unsigned int collation_table[];
00025 
00026 /*
00027  * flags - normalization NORM_* flags
00028  *
00029  * FIXME: 'variable' flag not handled
00030  */
00031 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
00032 {
00033     WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
00034     int key_len[4];
00035     char *key_ptr[4];
00036     const WCHAR *src_save = src;
00037     int srclen_save = srclen;
00038 
00039     key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
00040     for (; srclen; srclen--, src++)
00041     {
00042         int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
00043         dummy[0] = *src;
00044         if (decomposed_len)
00045         {
00046             int i;
00047             for (i = 0; i < decomposed_len; i++)
00048             {
00049                 WCHAR wch = dummy[i];
00050                 unsigned int ce;
00051 
00052                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
00053                  * and skips white space and punctuation characters for
00054                  * NORM_IGNORESYMBOLS.
00055                  */
00056                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
00057                     continue;
00058 
00059                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
00060 
00061                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
00062                 if (ce != (unsigned int)-1)
00063                 {
00064                     if (ce >> 16) key_len[0] += 2;
00065                     if ((ce >> 8) & 0xff) key_len[1]++;
00066                     if ((ce >> 4) & 0x0f) key_len[2]++;
00067                     if (ce & 1)
00068                     {
00069                         if (wch >> 8) key_len[3]++;
00070                         key_len[3]++;
00071                     }
00072                 }
00073                 else
00074                 {
00075                     key_len[0] += 2;
00076                     if (wch >> 8) key_len[0]++;
00077                     if (wch & 0xff) key_len[0]++;
00078         }
00079             }
00080         }
00081     }
00082 
00083     if (!dstlen) /* compute length */
00084         /* 4 * '\1' + 1 * '\0' + key length */
00085         return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;
00086 
00087     if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
00088         return 0; /* overflow */
00089 
00090     src = src_save;
00091     srclen = srclen_save;
00092 
00093     key_ptr[0] = dst;
00094     key_ptr[1] = key_ptr[0] + key_len[0] + 1;
00095     key_ptr[2] = key_ptr[1] + key_len[1] + 1;
00096     key_ptr[3] = key_ptr[2] + key_len[2] + 1;
00097 
00098     for (; srclen; srclen--, src++)
00099     {
00100         int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
00101         dummy[0] = *src;
00102         if (decomposed_len)
00103         {
00104             int i;
00105             for (i = 0; i < decomposed_len; i++)
00106             {
00107                 WCHAR wch = dummy[i];
00108                 unsigned int ce;
00109 
00110                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
00111                  * and skips white space and punctuation characters for
00112                  * NORM_IGNORESYMBOLS.
00113                  */
00114                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
00115                     continue;
00116 
00117                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
00118 
00119                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
00120                 if (ce != (unsigned int)-1)
00121                 {
00122                     WCHAR key;
00123                     if ((key = ce >> 16))
00124                     {
00125                         *key_ptr[0]++ = key >> 8;
00126                         *key_ptr[0]++ = key & 0xff;
00127                     }
00128                     /* make key 1 start from 2 */
00129                     if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
00130                     /* make key 2 start from 2 */
00131                     if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
00132                     /* key 3 is always a character code */
00133                     if (ce & 1)
00134                     {
00135                         if (wch >> 8) *key_ptr[3]++ = wch >> 8;
00136                         if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
00137                     }
00138                 }
00139                 else
00140                 {
00141                     *key_ptr[0]++ = 0xff;
00142                     *key_ptr[0]++ = 0xfe;
00143                     if (wch >> 8) *key_ptr[0]++ = wch >> 8;
00144                     if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
00145                 }
00146             }
00147         }
00148     }
00149 
00150     *key_ptr[0] = '\1';
00151     *key_ptr[1] = '\1';
00152     *key_ptr[2] = '\1';
00153     *key_ptr[3]++ = '\1';
00154     *key_ptr[3] = 0;
00155 
00156     return key_ptr[3] - dst;
00157 }
00158 
00159 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
00160                                           const WCHAR *str2, int len2)
00161 {
00162     unsigned int ce1, ce2;
00163     int ret;
00164 
00165     /* 32-bit collation element table format:
00166      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
00167      * case weight - high 4 bit of low 8 bit.
00168      */
00169     while (len1 > 0 && len2 > 0)
00170     {
00171         if (flags & NORM_IGNORESYMBOLS)
00172         {
00173             int skip = 0;
00174             /* FIXME: not tested */
00175             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
00176             {
00177                 str1++;
00178                 len1--;
00179                 skip = 1;
00180             }
00181             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
00182             {
00183                 str2++;
00184                 len2--;
00185                 skip = 1;
00186             }
00187             if (skip) continue;
00188         }
00189 
00190        /* hyphen and apostrophe are treated differently depending on
00191         * whether SORT_STRINGSORT specified or not
00192         */
00193         if (!(flags & SORT_STRINGSORT))
00194         {
00195             if (*str1 == '-' || *str1 == '\'')
00196             {
00197                 if (*str2 != '-' && *str2 != '\'')
00198                 {
00199                     str1++;
00200                     len1--;
00201                     continue;
00202                 }
00203             }
00204             else if (*str2 == '-' || *str2 == '\'')
00205             {
00206                 str2++;
00207                 len2--;
00208                 continue;
00209             }
00210         }
00211 
00212         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
00213         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
00214 
00215         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
00216             ret = (ce1 >> 16) - (ce2 >> 16);
00217         else
00218             ret = *str1 - *str2;
00219 
00220         if (ret) return ret;
00221 
00222         str1++;
00223         str2++;
00224         len1--;
00225         len2--;
00226     }
00227     return len1 - len2;
00228 }
00229 
00230 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
00231                                             const WCHAR *str2, int len2)
00232 {
00233     unsigned int ce1, ce2;
00234     int ret;
00235 
00236     /* 32-bit collation element table format:
00237      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
00238      * case weight - high 4 bit of low 8 bit.
00239      */
00240     while (len1 > 0 && len2 > 0)
00241     {
00242         if (flags & NORM_IGNORESYMBOLS)
00243         {
00244             int skip = 0;
00245             /* FIXME: not tested */
00246             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
00247             {
00248                 str1++;
00249                 len1--;
00250                 skip = 1;
00251             }
00252             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
00253             {
00254                 str2++;
00255                 len2--;
00256                 skip = 1;
00257             }
00258             if (skip) continue;
00259         }
00260 
00261         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
00262         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
00263 
00264         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
00265             ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
00266         else
00267             ret = *str1 - *str2;
00268 
00269         if (ret) return ret;
00270 
00271         str1++;
00272         str2++;
00273         len1--;
00274         len2--;
00275     }
00276     return len1 - len2;
00277 }
00278 
00279 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
00280                                        const WCHAR *str2, int len2)
00281 {
00282     unsigned int ce1, ce2;
00283     int ret;
00284 
00285     /* 32-bit collation element table format:
00286      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
00287      * case weight - high 4 bit of low 8 bit.
00288      */
00289     while (len1 > 0 && len2 > 0)
00290     {
00291         if (flags & NORM_IGNORESYMBOLS)
00292         {
00293             int skip = 0;
00294             /* FIXME: not tested */
00295             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
00296             {
00297                 str1++;
00298                 len1--;
00299                 skip = 1;
00300             }
00301             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
00302             {
00303                 str2++;
00304                 len2--;
00305                 skip = 1;
00306             }
00307             if (skip) continue;
00308         }
00309 
00310         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
00311         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
00312 
00313         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
00314             ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
00315         else
00316             ret = *str1 - *str2;
00317 
00318         if (ret) return ret;
00319 
00320         str1++;
00321         str2++;
00322         len1--;
00323         len2--;
00324     }
00325     return len1 - len2;
00326 }
00327 
00328 static inline int real_length(const WCHAR *str, int len)
00329 {
00330     while (len && !str[len - 1]) len--;
00331     return len;
00332 }
00333 
00334 int wine_compare_string(int flags, const WCHAR *str1, int len1,
00335                         const WCHAR *str2, int len2)
00336 {
00337     int ret;
00338 
00339     len1 = real_length(str1, len1);
00340     len2 = real_length(str2, len2);
00341 
00342     ret = compare_unicode_weights(flags, str1, len1, str2, len2);
00343     if (!ret)
00344     {
00345         if (!(flags & NORM_IGNORENONSPACE))
00346             ret = compare_diacritic_weights(flags, str1, len1, str2, len2);
00347         if (!ret && !(flags & NORM_IGNORECASE))
00348             ret = compare_case_weights(flags, str1, len1, str2, len2);
00349     }
00350     return ret;
00351 }

Generated on Sat May 26 2012 04:23:09 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.