ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

xmltok.c
Go to the documentation of this file.
00001 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
00002    See the file COPYING for copying permission.
00003 */
00004 
00005 #include <stddef.h>
00006 
00007 #ifdef COMPILED_FROM_DSP
00008 #include "winconfig.h"
00009 #elif defined(MACOS_CLASSIC)
00010 #include "macconfig.h"
00011 #elif defined(__amigaos4__)
00012 #include "amigaconfig.h"
00013 #else
00014 #ifdef HAVE_EXPAT_CONFIG_H
00015 #include <expat_config.h>
00016 #endif
00017 #endif /* ndef COMPILED_FROM_DSP */
00018 
00019 #include "expat_external.h"
00020 #include "internal.h"
00021 #include "xmltok.h"
00022 #include "nametab.h"
00023 
/* IGNORE_SECTION_TOK_VTABLE supplies one extra scanner entry,
   ", PREFIX(ignoreSectionTok)", when XML_DTD is defined and expands to
   nothing otherwise.  VTABLE1 places it right after cdataSectionTok. */
00024 #ifdef XML_DTD
00025 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
00026 #else
00027 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
00028 #endif
00029 
/* VTABLE1 lists the PREFIX()-named tokenizer entry points in the order used
   to initialise the start of an ENCODING: first the brace-enclosed group of
   scanners (prolog, content, CDATA section and, with XML_DTD, ignore
   section), then the brace-enclosed group of literal scanners, then the
   individual helper functions. */
00030 #define VTABLE1 \
00031   { PREFIX(prologTok), PREFIX(contentTok), \
00032     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
00033   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
00034   PREFIX(sameName), \
00035   PREFIX(nameMatchesAscii), \
00036   PREFIX(nameLength), \
00037   PREFIX(skipS), \
00038   PREFIX(getAtts), \
00039   PREFIX(charRefNumber), \
00040   PREFIX(predefinedEntityName), \
00041   PREFIX(updatePosition), \
00042   PREFIX(isPublicId)
00043 
/* VTABLE is VTABLE1 followed by the PREFIX()-named UTF-8 and UTF-16
   converters. */
00044 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
00045 
/* Look up the 16-bit character with high byte hi and low byte lo in
   namingBitmap.  pages[hi] selects a group of eight bitmap words, the top
   three bits of lo select the word inside that group and the bottom five
   bits of lo select the bit.  The result is nonzero when the bit is set.
   The mask is built from an unsigned 1 so that selecting bit 31 never
   shifts into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
00048 
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   The result is nonzero when the character's bit is set in namingBitmap.
   The mask uses an unsigned 1 so that bit 31 can be selected without
   shifting into the sign bit of an int.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
00058 
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  The result is nonzero when the character's bit is set in
   namingBitmap.  The mask uses an unsigned 1 so that bit 31 can be
   selected without shifting into the sign bit of an int.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
00071 
/* Dispatch the naming lookup on the sequence length n: 2 and 3 byte
   sequences are forwarded to UTF8_GET_NAMING2 / UTF8_GET_NAMING3 with p
   viewed as unsigned bytes; any other length yields 0. */
00072 #define UTF8_GET_NAMING(pages, p, n) \
00073   ((n) == 2 \
00074   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
00075   : ((n) == 3 \
00076      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
00077      : 0))
00078 
00079 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
00080    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
00081    with the additional restriction of not allowing the Unicode
00082    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
00083    Implementation details:
00084      (A & 0x80) == 0     means A < 0x80
00085    and
00086      (A & 0xC0) == 0xC0  means A > 0xBF
00087 */
00088 
/* Nonzero when the two bytes at p are not a well-formed 2-byte UTF-8
   sequence: the lead byte is below 0xC2, or the second byte is outside
   0x80..0xBF.  The callers in this file pass a const unsigned char *.
   The argument is fully parenthesized so pointer expressions such as
   (buf + 1) are handled correctly. */
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
00091 
/* Nonzero when the three bytes at p are not an acceptable 3-byte UTF-8
   sequence.  Rejected are:
     - a third byte outside 0x80..0xBF, and for the EF,BF prefix a third
       byte above 0xBD (this excludes EF,BF,BE and EF,BF,BF);
     - after lead byte 0xE0, a second byte outside 0xA0..0xBF;
     - after lead byte 0xED, a second byte outside 0x80..0x9F;
     - after any other lead byte, a second byte outside 0x80..0xBF.
   The callers in this file pass a const unsigned char *.  The argument is
   fully parenthesized so pointer expressions such as (buf + 1) work. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  (((p)[0] == 0xEF && (p)[1] == 0xBF) \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((p)[0] == 0xE0 \
    ? \
    ((p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0) \
    : \
    (((p)[1] & 0x80) == 0 \
     || \
     ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))))
00108 
/* Nonzero when the four bytes at p are not an acceptable 4-byte UTF-8
   sequence.  Rejected are:
     - a third or fourth byte outside 0x80..0xBF;
     - after lead byte 0xF0, a second byte outside 0x90..0xBF;
     - after lead byte 0xF4, a second byte outside 0x80..0x8F;
     - after any other lead byte, a second byte outside 0x80..0xBF.
   The callers in this file pass a const unsigned char *.  The argument is
   fully parenthesized so pointer expressions such as (buf + 1) work. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((p)[0] == 0xF0 \
    ? \
    ((p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0) \
    : \
    (((p)[1] & 0x80) == 0 \
     || \
     ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))))
00121 
00122 static int PTRFASTCALL
00123 isNever(const ENCODING *enc, const char *p)
00124 {
00125   return 0;
00126 }
00127 
00128 static int PTRFASTCALL
00129 utf8_isName2(const ENCODING *enc, const char *p)
00130 {
00131   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
00132 }
00133 
00134 static int PTRFASTCALL
00135 utf8_isName3(const ENCODING *enc, const char *p)
00136 {
00137   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
00138 }
00139 
00140 #define utf8_isName4 isNever
00141 
00142 static int PTRFASTCALL
00143 utf8_isNmstrt2(const ENCODING *enc, const char *p)
00144 {
00145   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
00146 }
00147 
00148 static int PTRFASTCALL
00149 utf8_isNmstrt3(const ENCODING *enc, const char *p)
00150 {
00151   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
00152 }
00153 
00154 #define utf8_isNmstrt4 isNever
00155 
00156 static int PTRFASTCALL
00157 utf8_isInvalid2(const ENCODING *enc, const char *p)
00158 {
00159   return UTF8_INVALID2((const unsigned char *)p);
00160 }
00161 
00162 static int PTRFASTCALL
00163 utf8_isInvalid3(const ENCODING *enc, const char *p)
00164 {
00165   return UTF8_INVALID3((const unsigned char *)p);
00166 }
00167 
00168 static int PTRFASTCALL
00169 utf8_isInvalid4(const ENCODING *enc, const char *p)
00170 {
00171   return UTF8_INVALID4((const unsigned char *)p);
00172 }
00173 
/* An ENCODING extended with a per-byte classification table and the
   character predicates used by the tokenizer macros in this file.  The
   ENCODING is the first member; AS_NORMAL_ENCODING() below casts an
   ENCODING pointer to this type. */
00174 struct normal_encoding {
00175   ENCODING enc;
  /* byte type for each byte value; indexed with an unsigned char */
00176   unsigned char type[256];
00177 #ifdef XML_MIN_SIZE
  /* XML_MIN_SIZE only: per-encoding hooks called by the BYTE_TYPE,
     IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and
     CHAR_MATCHES macros */
00178   int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
00179   int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
00180   int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
00181   int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
00182   int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
00183 #endif /* XML_MIN_SIZE */
  /* name-character tests, called through IS_NAME_CHAR(enc, p, n) */
00184   int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
00185   int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
00186   int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* name-start-character tests, called through IS_NMSTRT_CHAR(enc, p, n) */
00187   int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
00188   int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
00189   int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* malformed-sequence tests, called through IS_INVALID_CHAR(enc, p, n) */
00190   int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
00191   int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
00192   int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
00193 };
00194 
/* Reinterpret an ENCODING pointer as a pointer to its containing
   struct normal_encoding. */
00195 #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
00196 
/* STANDARD_VTABLE(E) produces the initialisers for the XML_MIN_SIZE-only
   members of struct normal_encoding, using functions whose names start with
   the prefix E (note the trailing comma).  Without XML_MIN_SIZE those
   members do not exist and the macro expands to nothing. */
00197 #ifdef XML_MIN_SIZE
00198 
00199 #define STANDARD_VTABLE(E) \
00200  E ## byteType, \
00201  E ## isNameMin, \
00202  E ## isNmstrtMin, \
00203  E ## byteToAscii, \
00204  E ## charMatches,
00205 
00206 #else
00207 
00208 #define STANDARD_VTABLE(E) /* as nothing */
00209 
00210 #endif
00211 
/* NORMAL_VTABLE(E) produces the initialisers for the nine multi-byte
   predicate members (isName2..4, isNmstrt2..4, isInvalid2..4), in
   declaration order, using functions whose names start with the prefix E. */
00212 #define NORMAL_VTABLE(E) \
00213  E ## isName2, \
00214  E ## isName3, \
00215  E ## isName4, \
00216  E ## isNmstrt2, \
00217  E ## isNmstrt3, \
00218  E ## isNmstrt4, \
00219  E ## isInvalid2, \
00220  E ## isInvalid3, \
00221  E ## isInvalid4
00222 
00223 static int FASTCALL checkCharRefNumber(int);
00224 
00225 #include "xmltok_impl.h"
00226 #include "ascii.h"
00227 
/* With XML_MIN_SIZE, the single-byte (sb_) minimum-width name tests are the
   always-false isNever. */
00228 #ifdef XML_MIN_SIZE
00229 #define sb_isNameMin isNever
00230 #define sb_isNmstrtMin isNever
00231 #endif
00232 
/* MINBPC(enc): with XML_MIN_SIZE it is read from the encoding at run time;
   otherwise it is the constant 1 for this instantiation. */
00233 #ifdef XML_MIN_SIZE
00234 #define MINBPC(enc) ((enc)->minBytesPerChar)
00235 #else
00236 /* minimum bytes per character */
00237 #define MINBPC(enc) 1
00238 #endif
00239 
/* Byte type of the single byte at p, read from the type[] table of the
   struct normal_encoding that enc points to. */
00240 #define SB_BYTE_TYPE(enc, p) \
00241   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
00242 
/* BYTE_TYPE: with XML_MIN_SIZE it calls the encoding's byteType hook
   (sb_byteType is the single-byte implementation of that hook); otherwise
   it is the direct table lookup. */
00243 #ifdef XML_MIN_SIZE
00244 static int PTRFASTCALL
00245 sb_byteType(const ENCODING *enc, const char *p)
00246 {
00247   return SB_BYTE_TYPE(enc, p);
00248 }
00249 #define BYTE_TYPE(enc, p) \
00250  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
00251 #else
00252 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
00253 #endif
00254 
/* BYTE_TO_ASCII: with XML_MIN_SIZE it calls the encoding's byteToAscii hook
   (sb_byteToAscii returns the byte at p unchanged); otherwise it reads the
   byte at p directly. */
00255 #ifdef XML_MIN_SIZE
00256 #define BYTE_TO_ASCII(enc, p) \
00257  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
00258 static int PTRFASTCALL
00259 sb_byteToAscii(const ENCODING *enc, const char *p)
00260 {
00261   return *p;
00262 }
00263 #else
00264 #define BYTE_TO_ASCII(enc, p) (*(p))
00265 #endif
00266 
/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name, so
   IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 hook, and likewise
   for the isNmstrt and isInvalid families. */
00267 #define IS_NAME_CHAR(enc, p, n) \
00268  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
00269 #define IS_NMSTRT_CHAR(enc, p, n) \
00270  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
00271 #define IS_INVALID_CHAR(enc, p, n) \
00272  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
00273 
/* Minimum-width name tests: with XML_MIN_SIZE they call the encoding's
   isNameMin / isNmstrtMin hooks; otherwise they are the constant 0 for this
   instantiation. */
00274 #ifdef XML_MIN_SIZE
00275 #define IS_NAME_CHAR_MINBPC(enc, p) \
00276  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
00277 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
00278  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
00279 #else
00280 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
00281 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
00282 #endif
00283 
/* CHAR_MATCHES(enc, p, c): nonzero when the character at p equals the ASCII
   character c.  With XML_MIN_SIZE the comparison goes through the encoding's
   charMatches hook (sb_charMatches is the single-byte implementation);
   otherwise the byte at p is compared directly.  c is parenthesized in the
   expansion so that an expression argument is compared as a whole. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
00296 
/* Instantiate the tokenizer template xmltok_impl.c with every function name
   prefixed by normal_, using the accessor macros defined above, and then
   remove those macros; later sections of this file define them again for
   other instantiations. */
00297 #define PREFIX(ident) normal_ ## ident
00298 #include "xmltok_impl.c"
00299 
00300 #undef MINBPC
00301 #undef BYTE_TYPE
00302 #undef BYTE_TO_ASCII
00303 #undef CHAR_MATCHES
00304 #undef IS_NAME_CHAR
00305 #undef IS_NAME_CHAR_MINBPC
00306 #undef IS_NMSTRT_CHAR
00307 #undef IS_NMSTRT_CHAR_MINBPC
00308 #undef IS_INVALID_CHAR
00309 
/* Marker bits of the first byte of a UTF-8 sequence, by sequence length.
   The converters in this file OR UTF8_cval2/3/4 into the lead byte they
   emit. */
00310 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
00311   UTF8_cval1 = 0x00,
00312   UTF8_cval2 = 0xc0,
00313   UTF8_cval3 = 0xe0,
00314   UTF8_cval4 = 0xf0
00315 };
00316 
00317 static void PTRCALL
00318 utf8_toUtf8(const ENCODING *enc,
00319             const char **fromP, const char *fromLim,
00320             char **toP, const char *toLim)
00321 {
00322   char *to;
00323   const char *from;
00324   if (fromLim - *fromP > toLim - *toP) {
00325     /* Avoid copying partial characters. */
00326     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
00327       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
00328         break;
00329   }
00330   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
00331     *to = *from;
00332   *fromP = from;
00333   *toP = to;
00334 }
00335 
00336 static void PTRCALL
00337 utf8_toUtf16(const ENCODING *enc,
00338              const char **fromP, const char *fromLim,
00339              unsigned short **toP, const unsigned short *toLim)
00340 {
00341   unsigned short *to = *toP;
00342   const char *from = *fromP;
00343   while (from != fromLim && to != toLim) {
00344     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
00345     case BT_LEAD2:
00346       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
00347       from += 2;
00348       break;
00349     case BT_LEAD3:
00350       *to++ = (unsigned short)(((from[0] & 0xf) << 12)
00351                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
00352       from += 3;
00353       break;
00354     case BT_LEAD4:
00355       {
00356         unsigned long n;
00357         if (to + 1 == toLim)
00358           goto after;
00359         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
00360             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
00361         n -= 0x10000;
00362         to[0] = (unsigned short)((n >> 10) | 0xD800);
00363         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
00364         to += 2;
00365         from += 4;
00366       }
00367       break;
00368     default:
00369       *to++ = *from++;
00370       break;
00371     }
00372   }
00373 after:
00374   *fromP = from;
00375   *toP = to;
00376 }
00377 
/* UTF-8 encoding objects.  Each one initialises its ENCODING with the
   normal_ tokenizer entries (VTABLE1), the UTF-8 converters and three
   trailing values (presumably minBytesPerChar, isUtf8 and isUtf16 - confirm
   against the ENCODING declaration), then fills the byte-type table from
   the included tab headers and finally lists the predicate functions.
   The _ns variants exist only with XML_NS.  The non-_ns variants define
   BT_COLON as BT_NMSTRT while the ASCII table is included.  The internal_
   variants include iasciitab.h instead of asciitab.h. */
00378 #ifdef XML_NS
00379 static const struct normal_encoding utf8_encoding_ns = {
00380   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00381   {
00382 #include "asciitab.h"
00383 #include "utf8tab.h"
00384   },
00385   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00386 };
00387 #endif
00388 
00389 static const struct normal_encoding utf8_encoding = {
00390   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00391   {
00392 #define BT_COLON BT_NMSTRT
00393 #include "asciitab.h"
00394 #undef BT_COLON
00395 #include "utf8tab.h"
00396   },
00397   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00398 };
00399 
00400 #ifdef XML_NS
00401 
00402 static const struct normal_encoding internal_utf8_encoding_ns = {
00403   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00404   {
00405 #include "iasciitab.h"
00406 #include "utf8tab.h"
00407   },
00408   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00409 };
00410 
00411 #endif
00412 
00413 static const struct normal_encoding internal_utf8_encoding = {
00414   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00415   {
00416 #define BT_COLON BT_NMSTRT
00417 #include "iasciitab.h"
00418 #undef BT_COLON
00419 #include "utf8tab.h"
00420   },
00421   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00422 };
00423 
00424 static void PTRCALL
00425 latin1_toUtf8(const ENCODING *enc,
00426               const char **fromP, const char *fromLim,
00427               char **toP, const char *toLim)
00428 {
00429   for (;;) {
00430     unsigned char c;
00431     if (*fromP == fromLim)
00432       break;
00433     c = (unsigned char)**fromP;
00434     if (c & 0x80) {
00435       if (toLim - *toP < 2)
00436         break;
00437       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
00438       *(*toP)++ = (char)((c & 0x3f) | 0x80);
00439       (*fromP)++;
00440     }
00441     else {
00442       if (*toP == toLim)
00443         break;
00444       *(*toP)++ = *(*fromP)++;
00445     }
00446   }
00447 }
00448 
00449 static void PTRCALL
00450 latin1_toUtf16(const ENCODING *enc,
00451                const char **fromP, const char *fromLim,
00452                unsigned short **toP, const unsigned short *toLim)
00453 {
00454   while (*fromP != fromLim && *toP != toLim)
00455     *(*toP)++ = (unsigned char)*(*fromP)++;
00456 }
00457 
/* Latin-1 encoding objects: the normal_ tokenizer entries with the Latin-1
   converters, a byte-type table built from asciitab.h plus latin1tab.h, and
   only the STANDARD_VTABLE entries (the multi-byte predicate members are
   left without explicit initialisers).  The _ns variant exists only with
   XML_NS; the plain variant defines BT_COLON as BT_NMSTRT while the ASCII
   table is included. */
00458 #ifdef XML_NS
00459 
00460 static const struct normal_encoding latin1_encoding_ns = {
00461   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00462   {
00463 #include "asciitab.h"
00464 #include "latin1tab.h"
00465   },
00466   STANDARD_VTABLE(sb_)
00467 };
00468 
00469 #endif
00470 
00471 static const struct normal_encoding latin1_encoding = {
00472   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00473   {
00474 #define BT_COLON BT_NMSTRT
00475 #include "asciitab.h"
00476 #undef BT_COLON
00477 #include "latin1tab.h"
00478   },
00479   STANDARD_VTABLE(sb_)
00480 };
00481 
00482 static void PTRCALL
00483 ascii_toUtf8(const ENCODING *enc,
00484              const char **fromP, const char *fromLim,
00485              char **toP, const char *toLim)
00486 {
00487   while (*fromP != fromLim && *toP != toLim)
00488     *(*toP)++ = *(*fromP)++;
00489 }
00490 
/* ASCII encoding objects: the normal_ tokenizer entries with ascii_toUtf8
   and the Latin-1 to UTF-16 converter.  Only the first half of the
   byte-type table is given (asciitab.h); the remaining entries are
   zero-initialised, which the original comment notes equals BT_NONXML.
   The _ns variant exists only with XML_NS; the plain variant defines
   BT_COLON as BT_NMSTRT while the ASCII table is included. */
00491 #ifdef XML_NS
00492 
00493 static const struct normal_encoding ascii_encoding_ns = {
00494   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00495   {
00496 #include "asciitab.h"
00497 /* BT_NONXML == 0 */
00498   },
00499   STANDARD_VTABLE(sb_)
00500 };
00501 
00502 #endif
00503 
00504 static const struct normal_encoding ascii_encoding = {
00505   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00506   {
00507 #define BT_COLON BT_NMSTRT
00508 #include "asciitab.h"
00509 #undef BT_COLON
00510 /* BT_NONXML == 0 */
00511   },
00512   STANDARD_VTABLE(sb_)
00513 };
00514 
00515 static int PTRFASTCALL
00516 unicode_byte_type(char hi, char lo)
00517 {
00518   switch ((unsigned char)hi) {
00519   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
00520     return BT_LEAD4;
00521   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
00522     return BT_TRAIL;
00523   case 0xFF:
00524     switch ((unsigned char)lo) {
00525     case 0xFF:
00526     case 0xFE:
00527       return BT_NONXML;
00528     }
00529     break;
00530   }
00531   return BT_NONASCII;
00532 }
00533 
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   2-byte units to UTF-8.  The byte order comes from the GET_LO / GET_HI
   macros in effect where the macro is expanded.  Per unit:
     - high byte 0 and low byte below 0x80: one output byte;
     - high byte 0x00..0x07 otherwise: two output bytes;
     - high byte 0xD8..0xDB: four output bytes, built from this unit and
       the following one (the input pointer is advanced an extra unit);
     - any other high byte: three output bytes.
   When the output has too little room for the next character, *fromP is set
   to that unit and the function returns; otherwise *fromP ends at the
   position where the loop stopped.
   NOTE(review): the loop steps two bytes at a time and stops only when
   from == fromLim, and the 0xD8..0xDB case reads the following unit without
   comparing against fromLim; callers presumably guarantee an even length
   and complete pairs - confirm. */
00534 #define DEFINE_UTF16_TO_UTF8(E) \
00535 static void  PTRCALL \
00536 E ## toUtf8(const ENCODING *enc, \
00537             const char **fromP, const char *fromLim, \
00538             char **toP, const char *toLim) \
00539 { \
00540   const char *from; \
00541   for (from = *fromP; from != fromLim; from += 2) { \
00542     int plane; \
00543     unsigned char lo2; \
00544     unsigned char lo = GET_LO(from); \
00545     unsigned char hi = GET_HI(from); \
00546     switch (hi) { \
00547     case 0: \
00548       if (lo < 0x80) { \
00549         if (*toP == toLim) { \
00550           *fromP = from; \
00551           return; \
00552         } \
00553         *(*toP)++ = lo; \
00554         break; \
00555       } \
00556       /* fall through */ \
00557     case 0x1: case 0x2: case 0x3: \
00558     case 0x4: case 0x5: case 0x6: case 0x7: \
00559       if (toLim -  *toP < 2) { \
00560         *fromP = from; \
00561         return; \
00562       } \
00563       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
00564       *(*toP)++ = ((lo & 0x3f) | 0x80); \
00565       break; \
00566     default: \
00567       if (toLim -  *toP < 3)  { \
00568         *fromP = from; \
00569         return; \
00570       } \
00571       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
00572       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
00573       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
00574       *(*toP)++ = ((lo & 0x3f) | 0x80); \
00575       break; \
00576     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
00577       if (toLim -  *toP < 4) { \
00578         *fromP = from; \
00579         return; \
00580       } \
00581       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
00582       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
00583       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
00584       from += 2; \
00585       lo2 = GET_LO(from); \
00586       *(*toP)++ = (((lo & 0x3) << 4) \
00587                    | ((GET_HI(from) & 0x3) << 2) \
00588                    | (lo2 >> 6) \
00589                    | 0x80); \
00590       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
00591       break; \
00592     } \
00593   } \
00594   *fromP = from; \
00595 }
00596 
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   2-byte input units into unsigned short output units, combining the bytes
   through GET_HI / GET_LO.  If the input holds more bytes than the output
   can take and the high byte of the last input unit matches
   (hi & 0xF8) == 0xD8, that last unit is excluded from the range first.
   Copying stops when the input or the output is exhausted. */
00597 #define DEFINE_UTF16_TO_UTF16(E) \
00598 static void  PTRCALL \
00599 E ## toUtf16(const ENCODING *enc, \
00600              const char **fromP, const char *fromLim, \
00601              unsigned short **toP, const unsigned short *toLim) \
00602 { \
00603   /* Avoid copying first half only of surrogate */ \
00604   if (fromLim - *fromP > ((toLim - *toP) << 1) \
00605       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
00606     fromLim -= 2; \
00607   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
00608     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
00609 }
00610 
/* Little-endian instantiation: the low byte of a unit is byte 0 and the
   high byte is byte 1.  Generates little2_toUtf8 and little2_toUtf16. */
00611 #define SET2(ptr, ch) \
00612   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
00613 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
00614 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
00615 
00616 DEFINE_UTF16_TO_UTF8(little2_)
00617 DEFINE_UTF16_TO_UTF16(little2_)
00618 
00619 #undef SET2
00620 #undef GET_LO
00621 #undef GET_HI
00622 
/* Big-endian instantiation: the high byte of a unit is byte 0 and the low
   byte is byte 1.  Generates big2_toUtf8 and big2_toUtf16. */
00623 #define SET2(ptr, ch) \
00624   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
00625 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
00626 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
00627 
00628 DEFINE_UTF16_TO_UTF8(big2_)
00629 DEFINE_UTF16_TO_UTF16(big2_)
00630 
00631 #undef SET2
00632 #undef GET_LO
00633 #undef GET_HI
00634 
/* Character accessors for little-endian 2-byte units: (p)[0] is the low
   byte and (p)[1] the high byte.
     LITTLE2_BYTE_TYPE       high byte 0: lookup in the encoding's type[]
                             table; otherwise unicode_byte_type().
     LITTLE2_BYTE_TO_ASCII   the low byte when the high byte is 0, else -1.
     LITTLE2_CHAR_MATCHES    nonzero when the unit equals the character c.
     LITTLE2_IS_*_MINBPC     naming-bitmap lookup of the unit through
                             namePages / nmstrtPages.
   Every macro argument is parenthesized in the expansions so that
   expression arguments are evaluated as a whole. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
00645 
/* With XML_MIN_SIZE the little-endian accessors are wrapped in functions
   (the hooks listed by STANDARD_VTABLE(little2_)) and VTABLE is redefined
   to VTABLE1 plus the little2_ converters.  Without XML_MIN_SIZE the
   tokenizer template xmltok_impl.c is instantiated a second time with the
   little2_ prefix, a fixed character width of 2 and the LITTLE2_ accessor
   macros; the multi-byte name tests are the constant 0. */
00646 #ifdef XML_MIN_SIZE
00647 
00648 static int PTRFASTCALL
00649 little2_byteType(const ENCODING *enc, const char *p)
00650 {
00651   return LITTLE2_BYTE_TYPE(enc, p);
00652 }
00653 
00654 static int PTRFASTCALL
00655 little2_byteToAscii(const ENCODING *enc, const char *p)
00656 {
00657   return LITTLE2_BYTE_TO_ASCII(enc, p);
00658 }
00659 
00660 static int PTRCALL
00661 little2_charMatches(const ENCODING *enc, const char *p, int c)
00662 {
00663   return LITTLE2_CHAR_MATCHES(enc, p, c);
00664 }
00665 
00666 static int PTRFASTCALL
00667 little2_isNameMin(const ENCODING *enc, const char *p)
00668 {
00669   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
00670 }
00671 
00672 static int PTRFASTCALL
00673 little2_isNmstrtMin(const ENCODING *enc, const char *p)
00674 {
00675   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00676 }
00677 
00678 #undef VTABLE
00679 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
00680 
00681 #else /* not XML_MIN_SIZE */
00682 
00683 #undef PREFIX
00684 #define PREFIX(ident) little2_ ## ident
00685 #define MINBPC(enc) 2
00686 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
00687 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
00688 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
00689 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
00690 #define IS_NAME_CHAR(enc, p, n) 0
00691 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
00692 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00693 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00694 
00695 #include "xmltok_impl.c"
00696 
00697 #undef MINBPC
00698 #undef BYTE_TYPE
00699 #undef BYTE_TO_ASCII
00700 #undef CHAR_MATCHES
00701 #undef IS_NAME_CHAR
00702 #undef IS_NAME_CHAR_MINBPC
00703 #undef IS_NMSTRT_CHAR
00704 #undef IS_NMSTRT_CHAR_MINBPC
00705 #undef IS_INVALID_CHAR
00706 
00707 #endif /* not XML_MIN_SIZE */
00708 
/* Little-endian 2-byte encoding objects.  The ENCODING part is VTABLE
   followed by 2, 0 and a flag that is 1 when BYTEORDER == 1234 (always 1
   for the internal_ variants, which are only compiled when
   BYTEORDER != 4321).  NOTE(review): the three trailing values are
   presumably minBytesPerChar, isUtf8 and isUtf16 - confirm against the
   ENCODING declaration.  The byte-type table comes from the ASCII and
   Latin-1 tab headers.  The _ns variants exist only with XML_NS; the
   non-_ns variants define BT_COLON as BT_NMSTRT while the ASCII table is
   included. */
00709 #ifdef XML_NS
00710 
00711 static const struct normal_encoding little2_encoding_ns = {
00712   { VTABLE, 2, 0,
00713 #if BYTEORDER == 1234
00714     1
00715 #else
00716     0
00717 #endif
00718   },
00719   {
00720 #include "asciitab.h"
00721 #include "latin1tab.h"
00722   },
00723   STANDARD_VTABLE(little2_)
00724 };
00725 
00726 #endif
00727 
00728 static const struct normal_encoding little2_encoding = {
00729   { VTABLE, 2, 0,
00730 #if BYTEORDER == 1234
00731     1
00732 #else
00733     0
00734 #endif
00735   },
00736   {
00737 #define BT_COLON BT_NMSTRT
00738 #include "asciitab.h"
00739 #undef BT_COLON
00740 #include "latin1tab.h"
00741   },
00742   STANDARD_VTABLE(little2_)
00743 };
00744 
00745 #if BYTEORDER != 4321
00746 
00747 #ifdef XML_NS
00748 
00749 static const struct normal_encoding internal_little2_encoding_ns = {
00750   { VTABLE, 2, 0, 1 },
00751   {
00752 #include "iasciitab.h"
00753 #include "latin1tab.h"
00754   },
00755   STANDARD_VTABLE(little2_)
00756 };
00757 
00758 #endif
00759 
00760 static const struct normal_encoding internal_little2_encoding = {
00761   { VTABLE, 2, 0, 1 },
00762   {
00763 #define BT_COLON BT_NMSTRT
00764 #include "iasciitab.h"
00765 #undef BT_COLON
00766 #include "latin1tab.h"
00767   },
00768   STANDARD_VTABLE(little2_)
00769 };
00770 
00771 #endif
00772 
00773 
/* Character accessors for big-endian 2-byte units: (p)[0] is the high byte
   and (p)[1] the low byte.
     BIG2_BYTE_TYPE       high byte 0: lookup in the encoding's type[]
                          table; otherwise unicode_byte_type().
     BIG2_BYTE_TO_ASCII   the low byte when the high byte is 0, else -1.
     BIG2_CHAR_MATCHES    nonzero when the unit equals the character c.
     BIG2_IS_*_MINBPC     naming-bitmap lookup of the unit through
                          namePages / nmstrtPages.
   Every macro argument is parenthesized in the expansions so that
   expression arguments are evaluated as a whole. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
00784 
/* With XML_MIN_SIZE the big-endian accessors are wrapped in functions (the
   hooks listed by STANDARD_VTABLE(big2_)) and VTABLE is redefined to
   VTABLE1 plus the big2_ converters.  Without XML_MIN_SIZE the tokenizer
   template xmltok_impl.c is instantiated again with the big2_ prefix, a
   fixed character width of 2 and the BIG2_ accessor macros; the multi-byte
   name tests are the constant 0. */
00785 #ifdef XML_MIN_SIZE
00786 
00787 static int PTRFASTCALL
00788 big2_byteType(const ENCODING *enc, const char *p)
00789 {
00790   return BIG2_BYTE_TYPE(enc, p);
00791 }
00792 
00793 static int PTRFASTCALL
00794 big2_byteToAscii(const ENCODING *enc, const char *p)
00795 {
00796   return BIG2_BYTE_TO_ASCII(enc, p);
00797 }
00798 
00799 static int PTRCALL
00800 big2_charMatches(const ENCODING *enc, const char *p, int c)
00801 {
00802   return BIG2_CHAR_MATCHES(enc, p, c);
00803 }
00804 
00805 static int PTRFASTCALL
00806 big2_isNameMin(const ENCODING *enc, const char *p)
00807 {
00808   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
00809 }
00810 
00811 static int PTRFASTCALL
00812 big2_isNmstrtMin(const ENCODING *enc, const char *p)
00813 {
00814   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00815 }
00816 
00817 #undef VTABLE
00818 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
00819 
00820 #else /* not XML_MIN_SIZE */
00821 
00822 #undef PREFIX
00823 #define PREFIX(ident) big2_ ## ident
00824 #define MINBPC(enc) 2
00825 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
00826 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
00827 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
00828 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
00829 #define IS_NAME_CHAR(enc, p, n) 0
00830 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
00831 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00832 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00833 
00834 #include "xmltok_impl.c"
00835 
00836 #undef MINBPC
00837 #undef BYTE_TYPE
00838 #undef BYTE_TO_ASCII
00839 #undef CHAR_MATCHES
00840 #undef IS_NAME_CHAR
00841 #undef IS_NAME_CHAR_MINBPC
00842 #undef IS_NMSTRT_CHAR
00843 #undef IS_NMSTRT_CHAR_MINBPC
00844 #undef IS_INVALID_CHAR
00845 
00846 #endif /* not XML_MIN_SIZE */
00847 
/* Big-endian 2-byte encoding objects.  The ENCODING part is VTABLE followed
   by 2, 0 and a flag that is 1 when BYTEORDER == 4321 (always 1 for the
   internal_ variants, which are only compiled when BYTEORDER != 1234).
   NOTE(review): the three trailing values are presumably minBytesPerChar,
   isUtf8 and isUtf16 - confirm against the ENCODING declaration.  The
   byte-type table comes from the ASCII and Latin-1 tab headers.  The _ns
   variants exist only with XML_NS; the non-_ns variants define BT_COLON as
   BT_NMSTRT while the ASCII table is included. */
00848 #ifdef XML_NS
00849 
00850 static const struct normal_encoding big2_encoding_ns = {
00851   { VTABLE, 2, 0,
00852 #if BYTEORDER == 4321
00853   1
00854 #else
00855   0
00856 #endif
00857   },
00858   {
00859 #include "asciitab.h"
00860 #include "latin1tab.h"
00861   },
00862   STANDARD_VTABLE(big2_)
00863 };
00864 
00865 #endif
00866 
00867 static const struct normal_encoding big2_encoding = {
00868   { VTABLE, 2, 0,
00869 #if BYTEORDER == 4321
00870   1
00871 #else
00872   0
00873 #endif
00874   },
00875   {
00876 #define BT_COLON BT_NMSTRT
00877 #include "asciitab.h"
00878 #undef BT_COLON
00879 #include "latin1tab.h"
00880   },
00881   STANDARD_VTABLE(big2_)
00882 };
00883 
00884 #if BYTEORDER != 1234
00885 
00886 #ifdef XML_NS
00887 
00888 static const struct normal_encoding internal_big2_encoding_ns = {
00889   { VTABLE, 2, 0, 1 },
00890   {
00891 #include "iasciitab.h"
00892 #include "latin1tab.h"
00893   },
00894   STANDARD_VTABLE(big2_)
00895 };
00896 
00897 #endif
00898 
00899 static const struct normal_encoding internal_big2_encoding = {
00900   { VTABLE, 2, 0, 1 },
00901   {
00902 #define BT_COLON BT_NMSTRT
00903 #include "iasciitab.h"
00904 #undef BT_COLON
00905 #include "latin1tab.h"
00906   },
00907   STANDARD_VTABLE(big2_)
00908 };
00909 
00910 #endif
00911 
00912 #undef PREFIX
00913 
00914 static int FASTCALL
00915 streqci(const char *s1, const char *s2)
00916 {
00917   for (;;) {
00918     char c1 = *s1++;
00919     char c2 = *s2++;
00920     if (ASCII_a <= c1 && c1 <= ASCII_z)
00921       c1 += ASCII_A - ASCII_a;
00922     if (ASCII_a <= c2 && c2 <= ASCII_z)
00923       c2 += ASCII_A - ASCII_a;
00924     if (c1 != c2)
00925       return 0;
00926     if (!c1)
00927       break;
00928   }
00929   return 1;
00930 }
00931 
00932 static void PTRCALL
00933 initUpdatePosition(const ENCODING *enc, const char *ptr,
00934                    const char *end, POSITION *pos)
00935 {
00936   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
00937 }
00938 
00939 static int
00940 toAscii(const ENCODING *enc, const char *ptr, const char *end)
00941 {
00942   char buf[1];
00943   char *p = buf;
00944   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
00945   if (p == buf)
00946     return -1;
00947   else
00948     return buf[0];
00949 }
00950 
00951 static int FASTCALL
00952 isSpace(int c)
00953 {
00954   switch (c) {
00955   case 0x20:
00956   case 0xD:
00957   case 0xA:
00958   case 0x9:
00959     return 1;
00960   }
00961   return 0;
00962 }
00963 
00964 /* Return 1 if there's just optional white space or there's an S
00965    followed by name=val.
   On success with only white space (or empty input) *namePtr is NULL.
   On success with an attribute, *namePtr / *nameEndPtr delimit the name,
   *valPtr points at the first character inside the quotes and *nextTokPtr
   points just past the closing quote.  On failure the function returns 0
   and *nextTokPtr points at the offending position.  Characters are read
   through toAscii(), which yields -1 when nothing could be converted.
00966 */
00967 static int
00968 parsePseudoAttribute(const ENCODING *enc,
00969                      const char *ptr,
00970                      const char *end,
00971                      const char **namePtr,
00972                      const char **nameEndPtr,
00973                      const char **valPtr,
00974                      const char **nextTokPtr)
00975 {
00976   int c;
00977   char open;
00978   if (ptr == end) {
00979     *namePtr = NULL;
00980     return 1;
00981   }
  /* an attribute must be preceded by white space */
00982   if (!isSpace(toAscii(enc, ptr, end))) {
00983     *nextTokPtr = ptr;
00984     return 0;
00985   }
  /* skip the remaining white space */
00986   do {
00987     ptr += enc->minBytesPerChar;
00988   } while (isSpace(toAscii(enc, ptr, end)));
00989   if (ptr == end) {
00990     *namePtr = NULL;
00991     return 1;
00992   }
00993   *namePtr = ptr;
  /* scan the name up to '=' (white space may precede the '=') */
00994   for (;;) {
00995     c = toAscii(enc, ptr, end);
00996     if (c == -1) {
00997       *nextTokPtr = ptr;
00998       return 0;
00999     }
01000     if (c == ASCII_EQUALS) {
01001       *nameEndPtr = ptr;
01002       break;
01003     }
01004     if (isSpace(c)) {
01005       *nameEndPtr = ptr;
01006       do {
01007         ptr += enc->minBytesPerChar;
01008       } while (isSpace(c = toAscii(enc, ptr, end)));
01009       if (c != ASCII_EQUALS) {
01010         *nextTokPtr = ptr;
01011         return 0;
01012       }
01013       break;
01014     }
01015     ptr += enc->minBytesPerChar;
01016   }
  /* ptr is at the '='; reject an empty name */
01017   if (ptr == *namePtr) {
01018     *nextTokPtr = ptr;
01019     return 0;
01020   }
  /* step over '=' and any white space, then require an opening quote */
01021   ptr += enc->minBytesPerChar;
01022   c = toAscii(enc, ptr, end);
01023   while (isSpace(c)) {
01024     ptr += enc->minBytesPerChar;
01025     c = toAscii(enc, ptr, end);
01026   }
01027   if (c != ASCII_QUOT && c != ASCII_APOS) {
01028     *nextTokPtr = ptr;
01029     return 0;
01030   }
01031   open = (char)c;
01032   ptr += enc->minBytesPerChar;
01033   *valPtr = ptr;
  /* the value may contain only ASCII letters, digits, '.', '-' and '_'
     and must end with the same quote character that opened it */
01034   for (;; ptr += enc->minBytesPerChar) {
01035     c = toAscii(enc, ptr, end);
01036     if (c == open)
01037       break;
01038     if (!(ASCII_a <= c && c <= ASCII_z)
01039         && !(ASCII_A <= c && c <= ASCII_Z)
01040         && !(ASCII_0 <= c && c <= ASCII_9)
01041         && c != ASCII_PERIOD
01042         && c != ASCII_MINUS
01043         && c != ASCII_UNDERSCORE) {
01044       *nextTokPtr = ptr;
01045       return 0;
01046     }
01047   }
01048   *nextTokPtr = ptr + enc->minBytesPerChar;
01049   return 1;
01050 }
01051 
/* NUL-terminated keyword strings spelled with the ASCII_* character
   constants: "version", "encoding", "standalone", "yes" and "no". */
01052 static const char KW_version[] = {
01053   ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
01054 };
01055 
01056 static const char KW_encoding[] = {
01057   ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
01058 };
01059 
01060 static const char KW_standalone[] = {
01061   ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
01062   ASCII_n, ASCII_e, '\0'
01063 };
01064 
01065 static const char KW_yes[] = {
01066   ASCII_y, ASCII_e, ASCII_s,  '\0'
01067 };
01068 
01069 static const char KW_no[] = {
01070   ASCII_n, ASCII_o,  '\0'
01071 };
01072 
01073 static int
01074 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
01075                                                  const char *,
01076                                                  const char *),
01077                int isGeneralTextEntity,
01078                const ENCODING *enc,
01079                const char *ptr,
01080                const char *end,
01081                const char **badPtr,
01082                const char **versionPtr,
01083                const char **versionEndPtr,
01084                const char **encodingName,
01085                const ENCODING **encoding,
01086                int *standalone)
01087 {
01088   const char *val = NULL;
01089   const char *name = NULL;
01090   const char *nameEnd = NULL;
01091   ptr += 5 * enc->minBytesPerChar;
01092   end -= 2 * enc->minBytesPerChar;
01093   if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
01094       || !name) {
01095     *badPtr = ptr;
01096     return 0;
01097   }
01098   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
01099     if (!isGeneralTextEntity) {
01100       *badPtr = name;
01101       return 0;
01102     }
01103   }
01104   else {
01105     if (versionPtr)
01106       *versionPtr = val;
01107     if (versionEndPtr)
01108       *versionEndPtr = ptr;
01109     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
01110       *badPtr = ptr;
01111       return 0;
01112     }
01113     if (!name) {
01114       if (isGeneralTextEntity) {
01115         /* a TextDecl must have an EncodingDecl */
01116         *badPtr = ptr;
01117         return 0;
01118       }
01119       return 1;
01120     }
01121   }
01122   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
01123     int c = toAscii(enc, val, end);
01124     if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
01125       *badPtr = val;
01126       return 0;
01127     }
01128     if (encodingName)
01129       *encodingName = val;
01130     if (encoding)
01131       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
01132     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
01133       *badPtr = ptr;
01134       return 0;
01135     }
01136     if (!name)
01137       return 1;
01138   }
01139   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
01140       || isGeneralTextEntity) {
01141     *badPtr = name;
01142     return 0;
01143   }
01144   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
01145     if (standalone)
01146       *standalone = 1;
01147   }
01148   else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
01149     if (standalone)
01150       *standalone = 0;
01151   }
01152   else {
01153     *badPtr = val;
01154     return 0;
01155   }
01156   while (isSpace(toAscii(enc, ptr, end)))
01157     ptr += enc->minBytesPerChar;
01158   if (ptr != end) {
01159     *badPtr = ptr;
01160     return 0;
01161   }
01162   return 1;
01163 }
01164 
01165 static int FASTCALL
01166 checkCharRefNumber(int result)
01167 {
01168   switch (result >> 8) {
01169   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
01170   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
01171     return -1;
01172   case 0:
01173     if (latin1_encoding.type[result] == BT_NONXML)
01174       return -1;
01175     break;
01176   case 0xFF:
01177     if (result == 0xFFFE || result == 0xFFFF)
01178       return -1;
01179     break;
01180   }
01181   return result;
01182 }
01183 
01184 int FASTCALL
01185 XmlUtf8Encode(int c, char *buf)
01186 {
01187   enum {
01188     /* minN is minimum legal resulting value for N byte sequence */
01189     min2 = 0x80,
01190     min3 = 0x800,
01191     min4 = 0x10000
01192   };
01193 
01194   if (c < 0)
01195     return 0;
01196   if (c < min2) {
01197     buf[0] = (char)(c | UTF8_cval1);
01198     return 1;
01199   }
01200   if (c < min3) {
01201     buf[0] = (char)((c >> 6) | UTF8_cval2);
01202     buf[1] = (char)((c & 0x3f) | 0x80);
01203     return 2;
01204   }
01205   if (c < min4) {
01206     buf[0] = (char)((c >> 12) | UTF8_cval3);
01207     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
01208     buf[2] = (char)((c & 0x3f) | 0x80);
01209     return 3;
01210   }
01211   if (c < 0x110000) {
01212     buf[0] = (char)((c >> 18) | UTF8_cval4);
01213     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
01214     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
01215     buf[3] = (char)((c & 0x3f) | 0x80);
01216     return 4;
01217   }
01218   return 0;
01219 }
01220 
01221 int FASTCALL
01222 XmlUtf16Encode(int charNum, unsigned short *buf)
01223 {
01224   if (charNum < 0)
01225     return 0;
01226   if (charNum < 0x10000) {
01227     buf[0] = (unsigned short)charNum;
01228     return 1;
01229   }
01230   if (charNum < 0x110000) {
01231     charNum -= 0x10000;
01232     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
01233     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
01234     return 2;
01235   }
01236   return 0;
01237 }
01238 
01239 struct unknown_encoding {
01240   struct normal_encoding normal;
01241   CONVERTER convert;
01242   void *userData;
01243   unsigned short utf16[256];
01244   char utf8[256][4];
01245 };
01246 
01247 #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
01248 
01249 int
01250 XmlSizeOfUnknownEncoding(void)
01251 {
01252   return sizeof(struct unknown_encoding);
01253 }
01254 
01255 static int PTRFASTCALL
01256 unknown_isName(const ENCODING *enc, const char *p)
01257 {
01258   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01259   int c = uenc->convert(uenc->userData, p);
01260   if (c & ~0xFFFF)
01261     return 0;
01262   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
01263 }
01264 
01265 static int PTRFASTCALL
01266 unknown_isNmstrt(const ENCODING *enc, const char *p)
01267 {
01268   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01269   int c = uenc->convert(uenc->userData, p);
01270   if (c & ~0xFFFF)
01271     return 0;
01272   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
01273 }
01274 
01275 static int PTRFASTCALL
01276 unknown_isInvalid(const ENCODING *enc, const char *p)
01277 {
01278   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01279   int c = uenc->convert(uenc->userData, p);
01280   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
01281 }
01282 
01283 static void PTRCALL
01284 unknown_toUtf8(const ENCODING *enc,
01285                const char **fromP, const char *fromLim,
01286                char **toP, const char *toLim)
01287 {
01288   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01289   char buf[XML_UTF8_ENCODE_MAX];
01290   for (;;) {
01291     const char *utf8;
01292     int n;
01293     if (*fromP == fromLim)
01294       break;
01295     utf8 = uenc->utf8[(unsigned char)**fromP];
01296     n = *utf8++;
01297     if (n == 0) {
01298       int c = uenc->convert(uenc->userData, *fromP);
01299       n = XmlUtf8Encode(c, buf);
01300       if (n > toLim - *toP)
01301         break;
01302       utf8 = buf;
01303       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
01304                  - (BT_LEAD2 - 2));
01305     }
01306     else {
01307       if (n > toLim - *toP)
01308         break;
01309       (*fromP)++;
01310     }
01311     do {
01312       *(*toP)++ = *utf8++;
01313     } while (--n != 0);
01314   }
01315 }
01316 
01317 static void PTRCALL
01318 unknown_toUtf16(const ENCODING *enc,
01319                 const char **fromP, const char *fromLim,
01320                 unsigned short **toP, const unsigned short *toLim)
01321 {
01322   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01323   while (*fromP != fromLim && *toP != toLim) {
01324     unsigned short c = uenc->utf16[(unsigned char)**fromP];
01325     if (c == 0) {
01326       c = (unsigned short)
01327           uenc->convert(uenc->userData, *fromP);
01328       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
01329                  - (BT_LEAD2 - 2));
01330     }
01331     else
01332       (*fromP)++;
01333     *(*toP)++ = c;
01334   }
01335 }
01336 
01337 ENCODING *
01338 XmlInitUnknownEncoding(void *mem,
01339                        int *table,
01340                        CONVERTER convert,
01341                        void *userData)
01342 {
01343   int i;
01344   struct unknown_encoding *e = (struct unknown_encoding *)mem;
01345   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
01346     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
01347   for (i = 0; i < 128; i++)
01348     if (latin1_encoding.type[i] != BT_OTHER
01349         && latin1_encoding.type[i] != BT_NONXML
01350         && table[i] != i)
01351       return 0;
01352   for (i = 0; i < 256; i++) {
01353     int c = table[i];
01354     if (c == -1) {
01355       e->normal.type[i] = BT_MALFORM;
01356       /* This shouldn't really get used. */
01357       e->utf16[i] = 0xFFFF;
01358       e->utf8[i][0] = 1;
01359       e->utf8[i][1] = 0;
01360     }
01361     else if (c < 0) {
01362       if (c < -4)
01363         return 0;
01364       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
01365       e->utf8[i][0] = 0;
01366       e->utf16[i] = 0;
01367     }
01368     else if (c < 0x80) {
01369       if (latin1_encoding.type[c] != BT_OTHER
01370           && latin1_encoding.type[c] != BT_NONXML
01371           && c != i)
01372         return 0;
01373       e->normal.type[i] = latin1_encoding.type[c];
01374       e->utf8[i][0] = 1;
01375       e->utf8[i][1] = (char)c;
01376       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
01377     }
01378     else if (checkCharRefNumber(c) < 0) {
01379       e->normal.type[i] = BT_NONXML;
01380       /* This shouldn't really get used. */
01381       e->utf16[i] = 0xFFFF;
01382       e->utf8[i][0] = 1;
01383       e->utf8[i][1] = 0;
01384     }
01385     else {
01386       if (c > 0xFFFF)
01387         return 0;
01388       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
01389         e->normal.type[i] = BT_NMSTRT;
01390       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
01391         e->normal.type[i] = BT_NAME;
01392       else
01393         e->normal.type[i] = BT_OTHER;
01394       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
01395       e->utf16[i] = (unsigned short)c;
01396     }
01397   }
01398   e->userData = userData;
01399   e->convert = convert;
01400   if (convert) {
01401     e->normal.isName2 = unknown_isName;
01402     e->normal.isName3 = unknown_isName;
01403     e->normal.isName4 = unknown_isName;
01404     e->normal.isNmstrt2 = unknown_isNmstrt;
01405     e->normal.isNmstrt3 = unknown_isNmstrt;
01406     e->normal.isNmstrt4 = unknown_isNmstrt;
01407     e->normal.isInvalid2 = unknown_isInvalid;
01408     e->normal.isInvalid3 = unknown_isInvalid;
01409     e->normal.isInvalid4 = unknown_isInvalid;
01410   }
01411   e->normal.enc.utf8Convert = unknown_toUtf8;
01412   e->normal.enc.utf16Convert = unknown_toUtf16;
01413   return &(e->normal.enc);
01414 }
01415 
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
01429 
01430 static const char KW_ISO_8859_1[] = {
01431   ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
01432   ASCII_MINUS, ASCII_1, '\0'
01433 };
01434 static const char KW_US_ASCII[] = {
01435   ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
01436   '\0'
01437 };
01438 static const char KW_UTF_8[] =  {
01439   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
01440 };
01441 static const char KW_UTF_16[] = {
01442   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
01443 };
01444 static const char KW_UTF_16BE[] = {
01445   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
01446   '\0'
01447 };
01448 static const char KW_UTF_16LE[] = {
01449   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
01450   '\0'
01451 };
01452 
01453 static int FASTCALL
01454 getEncodingIndex(const char *name)
01455 {
01456   static const char * const encodingNames[] = {
01457     KW_ISO_8859_1,
01458     KW_US_ASCII,
01459     KW_UTF_8,
01460     KW_UTF_16,
01461     KW_UTF_16BE,
01462     KW_UTF_16LE,
01463   };
01464   int i;
01465   if (name == NULL)
01466     return NO_ENC;
01467   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
01468     if (streqci(name, encodingNames[i]))
01469       return i;
01470   return UNKNOWN_ENC;
01471 }
01472 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

/* Read the stored encoding index of an INIT_ENCODING. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store encoding index i.  The argument is parenthesized so that the cast
   applies to the whole expression passed in, not just its first operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
01479 
01480 /* This is what detects the encoding.  encodingTable maps from
01481    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
01482    the external (protocol) specified encoding; state is
01483    XML_CONTENT_STATE if we're parsing an external text entity, and
01484    XML_PROLOG_STATE otherwise.
01485 */
01486 
01487 
01488 static int
01489 initScan(const ENCODING * const *encodingTable,
01490          const INIT_ENCODING *enc,
01491          int state,
01492          const char *ptr,
01493          const char *end,
01494          const char **nextTokPtr)
01495 {
01496   const ENCODING **encPtr;
01497 
01498   if (ptr == end)
01499     return XML_TOK_NONE;
01500   encPtr = enc->encPtr;
01501   if (ptr + 1 == end) {
01502     /* only a single byte available for auto-detection */
01503 #ifndef XML_DTD /* FIXME */
01504     /* a well-formed document entity must have more than one byte */
01505     if (state != XML_CONTENT_STATE)
01506       return XML_TOK_PARTIAL;
01507 #endif
01508     /* so we're parsing an external text entity... */
01509     /* if UTF-16 was externally specified, then we need at least 2 bytes */
01510     switch (INIT_ENC_INDEX(enc)) {
01511     case UTF_16_ENC:
01512     case UTF_16LE_ENC:
01513     case UTF_16BE_ENC:
01514       return XML_TOK_PARTIAL;
01515     }
01516     switch ((unsigned char)*ptr) {
01517     case 0xFE:
01518     case 0xFF:
01519     case 0xEF: /* possibly first byte of UTF-8 BOM */
01520       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01521           && state == XML_CONTENT_STATE)
01522         break;
01523       /* fall through */
01524     case 0x00:
01525     case 0x3C:
01526       return XML_TOK_PARTIAL;
01527     }
01528   }
01529   else {
01530     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
01531     case 0xFEFF:
01532       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01533           && state == XML_CONTENT_STATE)
01534         break;
01535       *nextTokPtr = ptr + 2;
01536       *encPtr = encodingTable[UTF_16BE_ENC];
01537       return XML_TOK_BOM;
01538     /* 00 3C is handled in the default case */
01539     case 0x3C00:
01540       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
01541            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
01542           && state == XML_CONTENT_STATE)
01543         break;
01544       *encPtr = encodingTable[UTF_16LE_ENC];
01545       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01546     case 0xFFFE:
01547       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01548           && state == XML_CONTENT_STATE)
01549         break;
01550       *nextTokPtr = ptr + 2;
01551       *encPtr = encodingTable[UTF_16LE_ENC];
01552       return XML_TOK_BOM;
01553     case 0xEFBB:
01554       /* Maybe a UTF-8 BOM (EF BB BF) */
01555       /* If there's an explicitly specified (external) encoding
01556          of ISO-8859-1 or some flavour of UTF-16
01557          and this is an external text entity,
01558          don't look for the BOM,
01559          because it might be a legal data.
01560       */
01561       if (state == XML_CONTENT_STATE) {
01562         int e = INIT_ENC_INDEX(enc);
01563         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
01564             || e == UTF_16LE_ENC || e == UTF_16_ENC)
01565           break;
01566       }
01567       if (ptr + 2 == end)
01568         return XML_TOK_PARTIAL;
01569       if ((unsigned char)ptr[2] == 0xBF) {
01570         *nextTokPtr = ptr + 3;
01571         *encPtr = encodingTable[UTF_8_ENC];
01572         return XML_TOK_BOM;
01573       }
01574       break;
01575     default:
01576       if (ptr[0] == '\0') {
01577         /* 0 isn't a legal data character. Furthermore a document
01578            entity can only start with ASCII characters.  So the only
01579            way this can fail to be big-endian UTF-16 if it it's an
01580            external parsed general entity that's labelled as
01581            UTF-16LE.
01582         */
01583         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
01584           break;
01585         *encPtr = encodingTable[UTF_16BE_ENC];
01586         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01587       }
01588       else if (ptr[1] == '\0') {
01589         /* We could recover here in the case:
01590             - parsing an external entity
01591             - second byte is 0
01592             - no externally specified encoding
01593             - no encoding declaration
01594            by assuming UTF-16LE.  But we don't, because this would mean when
01595            presented just with a single byte, we couldn't reliably determine
01596            whether we needed further bytes.
01597         */
01598         if (state == XML_CONTENT_STATE)
01599           break;
01600         *encPtr = encodingTable[UTF_16LE_ENC];
01601         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01602       }
01603       break;
01604     }
01605   }
01606   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
01607   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01608 }
01609 
01610 
01611 #define NS(x) x
01612 #define ns(x) x
01613 #include "xmltok_ns.c"
01614 #undef NS
01615 #undef ns
01616 
01617 #ifdef XML_NS
01618 
01619 #define NS(x) x ## NS
01620 #define ns(x) x ## _ns
01621 
01622 #include "xmltok_ns.c"
01623 
01624 #undef NS
01625 #undef ns
01626 
01627 ENCODING *
01628 XmlInitUnknownEncodingNS(void *mem,
01629                          int *table,
01630                          CONVERTER convert,
01631                          void *userData)
01632 {
01633   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
01634   if (enc)
01635     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
01636   return enc;
01637 }
01638 
01639 #endif /* XML_NS */

Generated on Sun May 27 2012 04:33:29 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.