ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

HTMLparser.c
Go to the documentation of this file.
00001 /*
00002  * HTMLparser.c : an HTML 4.0 non-verifying parser
00003  *
00004  * See Copyright for the status of this software.
00005  *
00006  * daniel@veillard.com
00007  */
00008 
00009 #define IN_LIBXML
00010 #include "libxml.h"
00011 #ifdef LIBXML_HTML_ENABLED
00012 
00013 #include <string.h>
00014 #ifdef HAVE_CTYPE_H
00015 #include <ctype.h>
00016 #endif
00017 #ifdef HAVE_STDLIB_H
00018 #include <stdlib.h>
00019 #endif
00020 #ifdef HAVE_SYS_STAT_H
00021 #include <sys/stat.h>
00022 #endif
00023 #ifdef HAVE_FCNTL_H
00024 #include <fcntl.h>
00025 #endif
00026 #ifdef HAVE_UNISTD_H
00027 #include <unistd.h>
00028 #endif
00029 #ifdef HAVE_ZLIB_H
00030 #include <zlib.h>
00031 #endif
00032 
00033 #include <libxml/xmlmemory.h>
00034 #include <libxml/tree.h>
00035 #include <libxml/parser.h>
00036 #include <libxml/parserInternals.h>
00037 #include <libxml/xmlerror.h>
00038 #include <libxml/HTMLparser.h>
00039 #include <libxml/HTMLtree.h>
00040 #include <libxml/entities.h>
00041 #include <libxml/encoding.h>
00042 #include <libxml/valid.h>
00043 #include <libxml/xmlIO.h>
00044 #include <libxml/globals.h>
00045 #include <libxml/uri.h>
00046 
00047 #define HTML_MAX_NAMELEN 1000
00048 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
00049 #define HTML_PARSER_BUFFER_SIZE 100
00050 
00051 /* #define DEBUG */
00052 /* #define DEBUG_PUSH */
00053 
00054 static int htmlOmittedDefaultValue = 1;
00055 
00056 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
00057                  xmlChar end, xmlChar  end2, xmlChar end3);
00058 static void htmlParseComment(htmlParserCtxtPtr ctxt);
00059 
00060 /************************************************************************
00061  *                                  *
00062  *      Some factorized error routines              *
00063  *                                  *
00064  ************************************************************************/
00065 
00073 static void
00074 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
00075 {
00076     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
00077         (ctxt->instate == XML_PARSER_EOF))
00078     return;
00079     if (ctxt != NULL) {
00080         ctxt->errNo = XML_ERR_NO_MEMORY;
00081         ctxt->instate = XML_PARSER_EOF;
00082         ctxt->disableSAX = 1;
00083     }
00084     if (extra)
00085         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
00086                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
00087                         NULL, NULL, 0, 0,
00088                         "Memory allocation failed : %s\n", extra);
00089     else
00090         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
00091                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
00092                         NULL, NULL, 0, 0, "Memory allocation failed\n");
00093 }
00094 
00105 static void
00106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
00107              const char *msg, const xmlChar *str1, const xmlChar *str2)
00108 {
00109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
00110         (ctxt->instate == XML_PARSER_EOF))
00111     return;
00112     if (ctxt != NULL)
00113     ctxt->errNo = error;
00114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
00115                     XML_ERR_ERROR, NULL, 0,
00116             (const char *) str1, (const char *) str2,
00117             NULL, 0, 0,
00118             msg, str1, str2);
00119     if (ctxt != NULL)
00120     ctxt->wellFormed = 0;
00121 }
00122 
00132 static void
00133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
00134              const char *msg, int val)
00135 {
00136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
00137         (ctxt->instate == XML_PARSER_EOF))
00138     return;
00139     if (ctxt != NULL)
00140     ctxt->errNo = error;
00141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
00142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
00143             NULL, val, 0, msg, val);
00144     if (ctxt != NULL)
00145     ctxt->wellFormed = 0;
00146 }
00147 
00148 /************************************************************************
00149  *                                  *
00150  *  Parser stacks related functions and macros      *
00151  *                                  *
00152  ************************************************************************/
00153 
00163 static int
00164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
00165 {
00166     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
00167         ctxt->html = 3;
00168     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
00169         ctxt->html = 10;
00170     if (ctxt->nameNr >= ctxt->nameMax) {
00171         ctxt->nameMax *= 2;
00172         ctxt->nameTab = (const xmlChar * *)
00173                          xmlRealloc((xmlChar * *)ctxt->nameTab,
00174                                     ctxt->nameMax *
00175                                     sizeof(ctxt->nameTab[0]));
00176         if (ctxt->nameTab == NULL) {
00177             htmlErrMemory(ctxt, NULL);
00178             return (0);
00179         }
00180     }
00181     ctxt->nameTab[ctxt->nameNr] = value;
00182     ctxt->name = value;
00183     return (ctxt->nameNr++);
00184 }
00193 static const xmlChar *
00194 htmlnamePop(htmlParserCtxtPtr ctxt)
00195 {
00196     const xmlChar *ret;
00197 
00198     if (ctxt->nameNr <= 0)
00199         return (NULL);
00200     ctxt->nameNr--;
00201     if (ctxt->nameNr < 0)
00202         return (NULL);
00203     if (ctxt->nameNr > 0)
00204         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
00205     else
00206         ctxt->name = NULL;
00207     ret = ctxt->nameTab[ctxt->nameNr];
00208     ctxt->nameTab[ctxt->nameNr] = NULL;
00209     return (ret);
00210 }
00211 
00221 static int
00222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
00223 {
00224     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
00225         if (ctxt->nodeInfoMax == 0)
00226                 ctxt->nodeInfoMax = 5;
00227         ctxt->nodeInfoMax *= 2;
00228         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
00229                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
00230                                     ctxt->nodeInfoMax *
00231                                     sizeof(ctxt->nodeInfoTab[0]));
00232         if (ctxt->nodeInfoTab == NULL) {
00233             htmlErrMemory(ctxt, NULL);
00234             return (0);
00235         }
00236     }
00237     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
00238     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
00239     return (ctxt->nodeInfoNr++);
00240 }
00241 
00250 static htmlParserNodeInfo *
00251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
00252 {
00253     if (ctxt->nodeInfoNr <= 0)
00254         return (NULL);
00255     ctxt->nodeInfoNr--;
00256     if (ctxt->nodeInfoNr < 0)
00257         return (NULL);
00258     if (ctxt->nodeInfoNr > 0)
00259         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
00260     else
00261         ctxt->nodeInfo = NULL;
00262     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
00263 }
00264 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */

/* Current input byte, converted to uppercase. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the char counter, the input pointer and the column by val. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

/*
 * SHRINK and GROW expand to a bare "if" statement: use them only as
 * stand-alone statements followed by a semicolon.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
           (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
    xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&                \
         (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
    xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Step over one character that occupies l xmlChars, keeping the line and
 * column counters in sync and clearing any pending token.
 */
#define NEXTL(l) do {                           \
    if (*(ctxt->input->cur) == '\n') {                  \
    ctxt->input->line++; ctxt->input->col = 1;          \
    } else ctxt->input->col++;                      \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;      \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v, encoded on l xmlChars, to buffer b at index i and
 * advance i. Expands to an if/else: use it only as a stand-alone statement.
 */
#define COPY_BUF(l,b,i,v)                       \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);               \
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
00344 
00359 static xmlChar *
00360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
00361     const xmlChar *start, *cur, *end;
00362 
00363     if ((ctxt == NULL) || (ctxt->input == NULL) ||
00364         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
00365         (ctxt->input->buf->encoder != NULL))
00366         return(NULL);
00367     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
00368         return(NULL);
00369 
00370     start = ctxt->input->cur;
00371     end = ctxt->input->end;
00372     /* we also expect the input buffer to be zero terminated */
00373     if (*end != 0)
00374         return(NULL);
00375 
00376     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
00377     if (cur == NULL)
00378         return(NULL);
00379     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
00380     if (cur == NULL)
00381         return(NULL);
00382     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
00383     if (cur == NULL)
00384         return(NULL);
00385     cur += 8;
00386     start = cur;
00387     while (((*cur >= 'A') && (*cur <= 'Z')) ||
00388            ((*cur >= 'a') && (*cur <= 'z')) ||
00389            ((*cur >= '0') && (*cur <= '9')) ||
00390            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
00391            cur++;
00392     if (cur == start)
00393         return(NULL);
00394     return(xmlStrndup(start, cur - start));
00395 }
00396 
00411 static int
00412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
00413     if (ctxt->instate == XML_PARSER_EOF)
00414     return(0);
00415 
00416     if (ctxt->token != 0) {
00417     *len = 0;
00418     return(ctxt->token);
00419     }
00420     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
00421     /*
00422      * We are supposed to handle UTF8, check it's valid
00423      * From rfc2044: encoding of the Unicode values on UTF-8:
00424      *
00425      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
00426      * 0000 0000-0000 007F   0xxxxxxx
00427      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
00428      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
00429      *
00430      * Check for the 0x110000 limit too
00431      */
00432     const unsigned char *cur = ctxt->input->cur;
00433     unsigned char c;
00434     unsigned int val;
00435 
00436     c = *cur;
00437     if (c & 0x80) {
00438         if (cur[1] == 0) {
00439         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
00440                 cur = ctxt->input->cur;
00441             }
00442         if ((cur[1] & 0xc0) != 0x80)
00443         goto encoding_error;
00444         if ((c & 0xe0) == 0xe0) {
00445 
00446         if (cur[2] == 0) {
00447             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
00448                     cur = ctxt->input->cur;
00449                 }
00450         if ((cur[2] & 0xc0) != 0x80)
00451             goto encoding_error;
00452         if ((c & 0xf0) == 0xf0) {
00453             if (cur[3] == 0) {
00454             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
00455                         cur = ctxt->input->cur;
00456                     }
00457             if (((c & 0xf8) != 0xf0) ||
00458             ((cur[3] & 0xc0) != 0x80))
00459             goto encoding_error;
00460             /* 4-byte code */
00461             *len = 4;
00462             val = (cur[0] & 0x7) << 18;
00463             val |= (cur[1] & 0x3f) << 12;
00464             val |= (cur[2] & 0x3f) << 6;
00465             val |= cur[3] & 0x3f;
00466         } else {
00467           /* 3-byte code */
00468             *len = 3;
00469             val = (cur[0] & 0xf) << 12;
00470             val |= (cur[1] & 0x3f) << 6;
00471             val |= cur[2] & 0x3f;
00472         }
00473         } else {
00474           /* 2-byte code */
00475         *len = 2;
00476         val = (cur[0] & 0x1f) << 6;
00477         val |= cur[1] & 0x3f;
00478         }
00479         if (!IS_CHAR(val)) {
00480             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
00481                 "Char 0x%X out of allowed range\n", val);
00482         }
00483         return(val);
00484     } else {
00485             if ((*ctxt->input->cur == 0) &&
00486                 (ctxt->input->cur < ctxt->input->end)) {
00487                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
00488                 "Char 0x%X out of allowed range\n", 0);
00489                 *len = 1;
00490                 return(' ');
00491             }
00492         /* 1-byte code */
00493         *len = 1;
00494         return((int) *ctxt->input->cur);
00495     }
00496     }
00497     /*
00498      * Assume it's a fixed length encoding (1) with
00499      * a compatible encoding for the ASCII set, since
00500      * XML constructs only use < 128 chars
00501      */
00502     *len = 1;
00503     if ((int) *ctxt->input->cur < 0x80)
00504     return((int) *ctxt->input->cur);
00505 
00506     /*
00507      * Humm this is bad, do an automatic flow conversion
00508      */
00509     {
00510         xmlChar * guess;
00511         xmlCharEncodingHandlerPtr handler;
00512 
00513         guess = htmlFindEncoding(ctxt);
00514         if (guess == NULL) {
00515             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
00516         } else {
00517             if (ctxt->input->encoding != NULL)
00518                 xmlFree((xmlChar *) ctxt->input->encoding);
00519             ctxt->input->encoding = guess;
00520             handler = xmlFindCharEncodingHandler((const char *) guess);
00521             if (handler != NULL) {
00522                 xmlSwitchToEncoding(ctxt, handler);
00523             } else {
00524                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
00525                              "Unsupported encoding %s", guess, NULL);
00526             }
00527         }
00528         ctxt->charset = XML_CHAR_ENCODING_UTF8;
00529     }
00530 
00531     return(xmlCurrentChar(ctxt, len));
00532 
00533 encoding_error:
00534     /*
00535      * If we detect an UTF8 error that probably mean that the
00536      * input encoding didn't get properly advertized in the
00537      * declaration header. Report the error and switch the encoding
00538      * to ISO-Latin-1 (if you don't like this policy, just declare the
00539      * encoding !)
00540      */
00541     {
00542         char buffer[150];
00543 
00544     if (ctxt->input->end - ctxt->input->cur >= 4) {
00545         snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
00546                 ctxt->input->cur[0], ctxt->input->cur[1],
00547                 ctxt->input->cur[2], ctxt->input->cur[3]);
00548     } else {
00549         snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
00550     }
00551     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
00552              "Input is not proper UTF-8, indicate encoding !\n",
00553              BAD_CAST buffer, NULL);
00554     }
00555 
00556     ctxt->charset = XML_CHAR_ENCODING_8859_1;
00557     *len = 1;
00558     return((int) *ctxt->input->cur);
00559 }
00560 
00570 static int
00571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
00572     int res = 0;
00573 
00574     while (IS_BLANK_CH(*(ctxt->input->cur))) {
00575     if ((*ctxt->input->cur == 0) &&
00576         (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
00577         xmlPopInput(ctxt);
00578     } else {
00579         if (*(ctxt->input->cur) == '\n') {
00580         ctxt->input->line++; ctxt->input->col = 1;
00581         } else ctxt->input->col++;
00582         ctxt->input->cur++;
00583         ctxt->nbChars++;
00584         if (*ctxt->input->cur == 0)
00585         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
00586     }
00587     res++;
00588     }
00589     return(res);
00590 }
00591 
00592 
00593 
00594 /************************************************************************
00595  *                                  *
00596  *  The list of HTML elements and their properties      *
00597  *                                  *
00598  ************************************************************************/
00599 
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
    , subElements , impliedsubelt , Attributes, userdata
 */

/* Definitions and a couple of vars for HTML Elements */

/*
 * Each list macro expands to a comma separated run of string literals and
 * is paired with an NB_* macro holding its element count. Composite counts
 * are parenthesized so they can be used safely inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of the element names in each content class. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
00646 
00647 
/* ... and for HTML Attributes */

/*
 * Each list macro expands to a comma separated run of attribute names and
 * is paired with an NB_* macro holding its element count.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated lists of the attribute names in each class. */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
00667 
00668 
00669 /* Other declarations that should go inline ... */
00670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
00671     "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
00672     "tabindex", "onfocus", "onblur", NULL } ;
00673 static const char* const target_attr[] = { "target", NULL } ;
00674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
00675 static const char* const alt_attr[] = { "alt", NULL } ;
00676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
00677 static const char* const href_attrs[] = { "href", NULL } ;
00678 static const char* const clear_attrs[] = { "clear", NULL } ;
00679 static const char* const inline_p[] = { INLINE, "p", NULL } ;
00680 
00681 static const char* const flow_param[] = { FLOW, "param", NULL } ;
00682 static const char* const applet_attrs[] = { COREATTRS , "codebase",
00683         "archive", "alt", "name", "height", "width", "align",
00684         "hspace", "vspace", NULL } ;
00685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
00686     "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
00687 static const char* const basefont_attrs[] =
00688     { "id", "size", "color", "face", NULL } ;
00689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
00690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
00691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
00692 static const char* const body_depr[] = { "background", "bgcolor", "text",
00693     "link", "vlink", "alink", NULL } ;
00694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
00695     "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
00696 
00697 
00698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
00699 static const char* const col_elt[] = { "col", NULL } ;
00700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
00701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
00702 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
00703 static const char* const compact_attr[] = { "compact", NULL } ;
00704 static const char* const label_attr[] = { "label", NULL } ;
00705 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
00706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
00707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
00708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
00709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
00710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
00711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
00712 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
00713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
00714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
00715 static const char* const version_attr[] = { "version", NULL } ;
00716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
00717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
00718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
00719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
00720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
00721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
00722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
00723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
00724 static const char* const align_attr[] = { "align", NULL } ;
00725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
00726 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
00727 static const char* const name_attr[] = { "name", NULL } ;
00728 static const char* const action_attr[] = { "action", NULL } ;
00729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
00730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
00731 static const char* const content_attr[] = { "content", NULL } ;
00732 static const char* const type_attr[] = { "type", NULL } ;
00733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
00734 static const char* const object_contents[] = { FLOW, "param", NULL } ;
00735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
00736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
00737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
00738 static const char* const option_elt[] = { "option", NULL } ;
00739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
00740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
00741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
00742 static const char* const width_attr[] = { "width", NULL } ;
00743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
00744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
00745 static const char* const language_attr[] = { "language", NULL } ;
00746 static const char* const select_content[] = { "optgroup", "option", NULL } ;
00747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
00748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
00749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
00750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
00751 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
00752 static const char* const tr_elt[] = { "tr", NULL } ;
00753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
00754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
00755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
00756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
00757 static const char* const tr_contents[] = { "th", "td", NULL } ;
00758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
00759 static const char* const li_elt[] = { "li", NULL } ;
00760 static const char* const ul_depr[] = { "type", "compact", NULL} ;
00761 static const char* const dir_attr[] = { "dir", NULL} ;
00762 
/*
 * DECL casts one of the `const char * const []` name lists to
 * `const char **` so it can be used as an initializer in the table below.
 * NOTE(review): presumably required because the list fields of htmlElemDesc
 * are declared `const char **` - confirm against libxml/HTMLparser.h.
 */
00763 #define DECL (const char**)
00764 
/*
 * Descriptions of the HTML 4.0 elements, one entry per element, listed
 * alphabetically by name.  htmlTagLookup() scans this table linearly and
 * compares the name field case-insensitively.
 *
 * Each entry is: the element name, seven integer flags, a short description
 * string, then a child-element list, a default child name and three
 * attribute lists (any of which may be NULL).
 * NOTE(review): the field order presumably follows the htmlElemDesc
 * declaration (name, startTag, endTag, saveEndTag, empty, depr, dtd,
 * isinline, desc, subelts, defaultsubelt, attrs_opt, attrs_depr, attrs_req)
 * - confirm against libxml/HTMLparser.h.  EMPTY and MODIFIER are macros
 * defined outside this excerpt.
 */
00765 static const htmlElemDesc
00766 html40ElementTable[] = {
00767 { "a",      0, 0, 0, 0, 0, 0, 1, "anchor ",
00768     DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
00769 },
00770 { "abbr",   0, 0, 0, 0, 0, 0, 1, "abbreviated form",
00771     DECL html_inline , NULL , DECL html_attrs, NULL, NULL
00772 },
00773 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
00774     DECL html_inline , NULL , DECL html_attrs, NULL, NULL
00775 },
00776 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
00777     DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
00778 },
00779 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
00780     DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
00781 },
00782 { "area",   0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
00783     EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
00784 },
00785 { "b",      0, 3, 0, 0, 0, 0, 1, "bold text style",
00786     DECL html_inline , NULL , DECL html_attrs, NULL, NULL
00787 },
00788 { "base",   0, 2, 2, 1, 0, 0, 0, "document base uri ",
00789     EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
00790 },
00791 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
00792     EMPTY , NULL , NULL, DECL basefont_attrs, NULL
00793 },
00794 { "bdo",    0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
00795     DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
00796 },
00797 { "big",    0, 3, 0, 0, 0, 0, 1, "large text style",
00798     DECL html_inline , NULL , DECL html_attrs, NULL, NULL
00799 },
00800 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
00801     DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
00802 },
00803 { "body",   1, 1, 0, 0, 0, 0, 0, "document body ",
00804     DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
00805 },
00806 { "br",     0, 2, 2, 1, 0, 0, 1, "forced line break ",
00807     EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
00808 },
00809 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
00810     DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
00811 },
00812 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
00813     DECL html_inline , NULL , DECL html_attrs, NULL, NULL
00814 },
00815 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
00816     DECL html_flow , NULL , NULL, DECL html_attrs, NULL
00817 },
00818 { "cite",   0, 0, 0, 0, 0, 0, 1, "citation",
00819     DECL html_inline , NULL , DECL html_attrs, NULL, NULL
00820 },
00821 { "code",   0, 0, 0, 0, 0, 0, 1, "computer code fragment",
00822     DECL html_inline , NULL , DECL html_attrs, NULL, NULL
00823 },
00824 { "col",    0, 2, 2, 1, 0, 0, 0, "table column ",
00825     EMPTY , NULL , DECL col_attrs , NULL, NULL
00826 },
00827 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
00828     DECL col_elt , "col" , DECL col_attrs , NULL, NULL
00829 },
00830 { "dd",     0, 1, 0, 0, 0, 0, 0, "definition description ",
00831     DECL html_flow , NULL , DECL html_attrs, NULL, NULL
00832 },
00833 { "del",    0, 0, 0, 0, 0, 0, 2, "deleted text ",
00834     DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
00835 },
00836 { "dfn",    0, 0, 0, 0, 0, 0, 1, "instance definition",
00837     DECL html_inline , NULL , DECL html_attrs, NULL, NULL
00838 },
00839 { "dir",    0, 0, 0, 0, 1, 1, 0, "directory list",
00840     DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
00841 },
00842 { "div",    0, 0, 0, 0, 0, 0, 0, "generic language/style container",
00843     DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
00844 },
00845 { "dl",     0, 0, 0, 0, 0, 0, 0, "definition list ",
00846     DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
00847 },
00848 { "dt",     0, 1, 0, 0, 0, 0, 0, "definition term ",
00849     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
00850 },
00851 { "em",     0, 3, 0, 0, 0, 0, 1, "emphasis",
00852     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
00853 },
00854 { "embed",  0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
00855     EMPTY, NULL, DECL embed_attrs, NULL, NULL
00856 },
00857 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
00858     DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
00859 },
00860 { "font",   0, 3, 0, 0, 1, 1, 1, "local change to font ",
00861     DECL html_inline, NULL, NULL, DECL font_attrs, NULL
00862 },
00863 { "form",   0, 0, 0, 0, 0, 0, 0, "interactive form ",
00864     DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
00865 },
00866 { "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
00867     EMPTY, NULL, NULL, DECL frame_attrs, NULL
00868 },
00869 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
00870     DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
00871 },
00872 { "h1",     0, 0, 0, 0, 0, 0, 0, "heading ",
00873     DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
00874 },
00875 { "h2",     0, 0, 0, 0, 0, 0, 0, "heading ",
00876     DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
00877 },
00878 { "h3",     0, 0, 0, 0, 0, 0, 0, "heading ",
00879     DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
00880 },
00881 { "h4",     0, 0, 0, 0, 0, 0, 0, "heading ",
00882     DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
00883 },
00884 { "h5",     0, 0, 0, 0, 0, 0, 0, "heading ",
00885     DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
00886 },
00887 { "h6",     0, 0, 0, 0, 0, 0, 0, "heading ",
00888     DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
00889 },
00890 { "head",   1, 1, 0, 0, 0, 0, 0, "document head ",
00891     DECL head_contents, NULL, DECL head_attrs, NULL, NULL
00892 },
00893 { "hr",     0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
00894     EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
00895 },
00896 { "html",   1, 1, 0, 0, 0, 0, 0, "document root element ",
00897     DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
00898 },
00899 { "i",      0, 3, 0, 0, 0, 0, 1, "italic text style",
00900     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
00901 },
00902 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
00903     DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
00904 },
00905 { "img",    0, 2, 2, 1, 0, 0, 1, "embedded image ",
00906     EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
00907 },
00908 { "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
00909     EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
00910 },
00911 { "ins",    0, 0, 0, 0, 0, 0, 2, "inserted text",
00912     DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
00913 },
00914 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
00915     EMPTY, NULL, NULL, DECL prompt_attrs, NULL
00916 },
00917 { "kbd",    0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
00918     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
00919 },
00920 { "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
00921     DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
00922 },
00923 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
00924     DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
00925 },
00926 { "li",     0, 1, 1, 0, 0, 0, 0, "list item ",
00927     DECL html_flow, NULL, DECL html_attrs, NULL, NULL
00928 },
00929 { "link",   0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
00930     EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
00931 },
00932 { "map",    0, 0, 0, 0, 0, 0, 2, "client-side image map ",
00933     DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
00934 },
00935 { "menu",   0, 0, 0, 0, 1, 1, 0, "menu list ",
00936     DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
00937 },
00938 { "meta",   0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
00939     EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
00940 },
00941 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
00942     DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
00943 },
00944 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
00945     DECL html_flow, "div", DECL html_attrs, NULL, NULL
00946 },
00947 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
00948     DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
00949 },
00950 { "ol",     0, 0, 0, 0, 0, 0, 0, "ordered list ",
00951     DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
00952 },
00953 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
00954     DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
00955 },
00956 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
00957     DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
00958 },
00959 { "p",      0, 1, 0, 0, 0, 0, 0, "paragraph ",
00960     DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
00961 },
00962 { "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
00963     EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
00964 },
00965 { "pre",    0, 0, 0, 0, 0, 0, 0, "preformatted text ",
00966     DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
00967 },
00968 { "q",      0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
00969     DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
00970 },
00971 { "s",      0, 3, 0, 0, 1, 1, 1, "strike-through text style",
00972     DECL html_inline, NULL, NULL, DECL html_attrs, NULL
00973 },
00974 { "samp",   0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
00975     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
00976 },
00977 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
00978     DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
00979 },
00980 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
00981     DECL select_content, NULL, DECL select_attrs, NULL, NULL
00982 },
00983 { "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
00984     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
00985 },
00986 { "span",   0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
00987     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
00988 },
00989 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
00990     DECL html_inline, NULL, NULL, DECL html_attrs, NULL
00991 },
00992 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
00993     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
00994 },
00995 { "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
00996     DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
00997 },
00998 { "sub",    0, 3, 0, 0, 0, 0, 1, "subscript",
00999     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
01000 },
01001 { "sup",    0, 3, 0, 0, 0, 0, 1, "superscript ",
01002     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
01003 },
01004 { "table",  0, 0, 0, 0, 0, 0, 0, "",
01005     DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
01006 },
01007 { "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
01008     DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
01009 },
01010 { "td",     0, 0, 0, 0, 0, 0, 0, "table data cell",
01011     DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
01012 },
01013 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
01014     DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
01015 },
01016 { "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
01017     DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
01018 },
01019 { "th",     0, 1, 0, 0, 0, 0, 0, "table header cell",
01020     DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
01021 },
01022 { "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
01023     DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
01024 },
01025 { "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
01026     DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
01027 },
01028 { "tr",     0, 0, 0, 0, 0, 0, 0, "table row ",
01029     DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
01030 },
01031 { "tt",     0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
01032     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
01033 },
01034 { "u",      0, 3, 0, 0, 1, 1, 1, "underlined text style",
01035     DECL html_inline, NULL, NULL, DECL html_attrs, NULL
01036 },
01037 { "ul",     0, 0, 0, 0, 0, 0, 0, "unordered list ",
01038     DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
01039 },
01040 { "var",    0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
01041     DECL html_inline, NULL, DECL html_attrs, NULL, NULL
01042 }
01043 };
01044 
01045 /*
01046  * start tags that imply the end of current element
01047  */
/*
 * Layout: a flat sequence of NULL-terminated groups, ended by one extra
 * NULL.  The first string of a group is a start tag; the strings after it
 * are the open elements which that start tag closes implicitly (this is how
 * htmlCheckAutoClose() reads it).  htmlInitAutoClose() records the address
 * of the first string of each group in htmlStartCloseIndex[], indexing at
 * most 99 groups.  FONTSTYLE is a macro defined outside this excerpt.
 */
01048 static const char * const htmlStartClose[] = {
01049 "form",     "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
01050         "dl", "ul", "ol", "menu", "dir", "address", "pre",
01051         "listing", "xmp", "head", NULL,
01052 "head",     "p", NULL,
01053 "title",    "p", NULL,
01054 "body",     "head", "style", "link", "title", "p", NULL,
01055 "frameset", "head", "style", "link", "title", "p", NULL,
01056 "li",       "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
01057         "pre", "listing", "xmp", "head", "li", NULL,
01058 "hr",       "p", "head", NULL,
01059 "h1",       "p", "head", NULL,
01060 "h2",       "p", "head", NULL,
01061 "h3",       "p", "head", NULL,
01062 "h4",       "p", "head", NULL,
01063 "h5",       "p", "head", NULL,
01064 "h6",       "p", "head", NULL,
01065 "dir",      "p", "head", NULL,
01066 "address",  "p", "head", "ul", NULL,
01067 "pre",      "p", "head", "ul", NULL,
01068 "listing",  "p", "head", NULL,
01069 "xmp",      "p", "head", NULL,
01070 "blockquote",   "p", "head", NULL,
01071 "dl",       "p", "dt", "menu", "dir", "address", "pre", "listing",
01072         "xmp", "head", NULL,
01073 "dt",       "p", "menu", "dir", "address", "pre", "listing", "xmp",
01074                 "head", "dd", NULL,
01075 "dd",       "p", "menu", "dir", "address", "pre", "listing", "xmp",
01076                 "head", "dt", NULL,
01077 "ul",       "p", "head", "ol", "menu", "dir", "address", "pre",
01078         "listing", "xmp", NULL,
01079 "ol",       "p", "head", "ul", NULL,
01080 "menu",     "p", "head", "ul", NULL,
01081 "p",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
01082 "div",      "p", "head", NULL,
01083 "noscript", "p", "head", NULL,
01084 "center",   "font", "b", "i", "p", "head", NULL,
01085 "a",        "a", NULL,
01086 "caption",  "p", NULL,
01087 "colgroup", "caption", "colgroup", "col", "p", NULL,
01088 "col",      "caption", "col", "p", NULL,
01089 "table",    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
01090         "listing", "xmp", "a", NULL,
01091 "th",       "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
01092 "td",       "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
01093 "tr",       "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
01094 "thead",    "caption", "col", "colgroup", NULL,
01095 "tfoot",    "th", "td", "tr", "caption", "col", "colgroup", "thead",
01096         "tbody", "p", NULL,
01097 "tbody",    "th", "td", "tr", "caption", "col", "colgroup", "thead",
01098         "tfoot", "tbody", "p", NULL,
01099 "optgroup", "option", NULL,
01100 "option",   "option", NULL,
01101 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
01102         "pre", "listing", "xmp", "a", NULL,
01103 NULL
01104 };
01105 
01106 /*
01107  * The list of HTML elements which are supposed not to have
01108  * CDATA content and where a p element will be implied
01109  *
01110  * TODO: extend that list by reading the HTML SGML DTD on
01111  *       implied paragraph
01112  */
/*
 * NULL-terminated.  htmlCheckParagraph() opens an implied <p> when the
 * current element name equals one of these entries.
 */
01113 static const char *const htmlNoContentElements[] = {
01114     "html",
01115     "head",
01116     NULL
01117 };
01118 
/*
 * The list of HTML attributes whose value is of type %Script; (the
 * HTML 4 intrinsic event handlers).
 * NOTE: when adding entries, check htmlIsScriptAttribute(): it rejects any
 *       name that does not start with "on" before consulting this list.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
01144 
01145 /*
01146  * This table is used by the htmlparser to know what to do with
01147  * broken html pages. By assigning different priorities to different
01148  * elements the parser can decide how to handle extra endtags.
01149  * Endtags are only allowed to close elements with lower or equal
01150  * priority.
01151  */
01152 
/* One row of the end-tag priority table. */
01153 typedef struct {
01154     const char *name;   /* element name; NULL only in the final row */
01155     int priority;       /* compared by htmlAutoCloseOnClose() */
01156 } elementPriority;
01157 
/*
 * htmlGetEndPriority() walks the rows in order until the name matches or the
 * NULL-named row is reached, so every element not listed here gets the
 * priority stored in that last row (100).
 */
01158 static const elementPriority htmlEndPriority[] = {
01159     {"div",   150},
01160     {"td",    160},
01161     {"th",    160},
01162     {"tr",    170},
01163     {"thead", 180},
01164     {"tbody", 180},
01165     {"tfoot", 180},
01166     {"table", 190},
01167     {"head",  200},
01168     {"body",  200},
01169     {"html",  220},
01170     {NULL,    100} /* Default priority */
01171 };
01172 
/*
 * Addresses of the first string of each group in htmlStartClose[], filled in
 * by htmlInitAutoClose(); slots past the last indexed group are NULL.
 */
01173 static const char** htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has built htmlStartCloseIndex[]. */
01174 static int htmlStartCloseIndexinitialized = 0;
01175 
01176 /************************************************************************
01177  *                                  *
01178  *  functions to handle HTML specific data          *
01179  *                                  *
01180  ************************************************************************/
01181 
01189 void
01190 htmlInitAutoClose(void) {
01191     int indx, i = 0;
01192 
01193     if (htmlStartCloseIndexinitialized) return;
01194 
01195     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
01196     indx = 0;
01197     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
01198         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
01199     while (htmlStartClose[i] != NULL) i++;
01200     i++;
01201     }
01202     htmlStartCloseIndexinitialized = 1;
01203 }
01204 
01213 const htmlElemDesc *
01214 htmlTagLookup(const xmlChar *tag) {
01215     unsigned int i;
01216 
01217     for (i = 0; i < (sizeof(html40ElementTable) /
01218                      sizeof(html40ElementTable[0]));i++) {
01219         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
01220         return((htmlElemDescPtr) &html40ElementTable[i]);
01221     }
01222     return(NULL);
01223 }
01224 
01231 static int
01232 htmlGetEndPriority (const xmlChar *name) {
01233     int i = 0;
01234 
01235     while ((htmlEndPriority[i].name != NULL) &&
01236        (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
01237     i++;
01238 
01239     return(htmlEndPriority[i].priority);
01240 }
01241 
01242 
01254 static int
01255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
01256 {
01257     int i, indx;
01258     const char **closed = NULL;
01259 
01260     if (htmlStartCloseIndexinitialized == 0)
01261         htmlInitAutoClose();
01262 
01263     /* inefficient, but not a big deal */
01264     for (indx = 0; indx < 100; indx++) {
01265         closed = htmlStartCloseIndex[indx];
01266         if (closed == NULL)
01267             return (0);
01268         if (xmlStrEqual(BAD_CAST * closed, newtag))
01269             break;
01270     }
01271 
01272     i = closed - htmlStartClose;
01273     i++;
01274     while (htmlStartClose[i] != NULL) {
01275         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
01276             return (1);
01277         }
01278         i++;
01279     }
01280     return (0);
01281 }
01282 
01291 static void
01292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
01293 {
01294     const htmlElemDesc *info;
01295     int i, priority;
01296 
01297     priority = htmlGetEndPriority(newtag);
01298 
01299     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
01300 
01301         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
01302             break;
01303         /*
01304          * A missplaced endtag can only close elements with lower
01305          * or equal priority, so if we find an element with higher
01306          * priority before we find an element with
01307          * matching name, we just ignore this endtag
01308          */
01309         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
01310             return;
01311     }
01312     if (i < 0)
01313         return;
01314 
01315     while (!xmlStrEqual(newtag, ctxt->name)) {
01316         info = htmlTagLookup(ctxt->name);
01317         if ((info != NULL) && (info->endTag == 3)) {
01318             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
01319                      "Opening and ending tag mismatch: %s and %s\n",
01320              newtag, ctxt->name);
01321         }
01322         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
01323             ctxt->sax->endElement(ctxt->userData, ctxt->name);
01324     htmlnamePop(ctxt);
01325     }
01326 }
01327 
01334 static void
01335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
01336 {
01337     int i;
01338 
01339     if (ctxt->nameNr == 0)
01340         return;
01341     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
01342         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
01343             ctxt->sax->endElement(ctxt->userData, ctxt->name);
01344     htmlnamePop(ctxt);
01345     }
01346 }
01347 
01360 static void
01361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
01362 {
01363     while ((newtag != NULL) && (ctxt->name != NULL) &&
01364            (htmlCheckAutoClose(newtag, ctxt->name))) {
01365         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
01366             ctxt->sax->endElement(ctxt->userData, ctxt->name);
01367     htmlnamePop(ctxt);
01368     }
01369     if (newtag == NULL) {
01370         htmlAutoCloseOnEnd(ctxt);
01371         return;
01372     }
01373     while ((newtag == NULL) && (ctxt->name != NULL) &&
01374            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
01375             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
01376             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
01377         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
01378             ctxt->sax->endElement(ctxt->userData, ctxt->name);
01379     htmlnamePop(ctxt);
01380     }
01381 }
01382 
01396 int
01397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
01398     htmlNodePtr child;
01399 
01400     if (elem == NULL) return(1);
01401     if (xmlStrEqual(name, elem->name)) return(0);
01402     if (htmlCheckAutoClose(elem->name, name)) return(1);
01403     child = elem->children;
01404     while (child != NULL) {
01405         if (htmlAutoCloseTag(doc, name, child)) return(1);
01406     child = child->next;
01407     }
01408     return(0);
01409 }
01410 
01422 int
01423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
01424     htmlNodePtr child;
01425 
01426     if (elem == NULL) return(1);
01427     child = elem->children;
01428     while (child != NULL) {
01429     if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
01430     child = child->next;
01431     }
01432     return(0);
01433 }
01434 
01444 static void
01445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
01446     int i;
01447 
01448     if (ctxt->options & HTML_PARSE_NOIMPLIED)
01449         return;
01450     if (!htmlOmittedDefaultValue)
01451     return;
01452     if (xmlStrEqual(newtag, BAD_CAST"html"))
01453     return;
01454     if (ctxt->nameNr <= 0) {
01455     htmlnamePush(ctxt, BAD_CAST"html");
01456     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
01457         ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
01458     }
01459     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
01460         return;
01461     if ((ctxt->nameNr <= 1) &&
01462         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
01463      (xmlStrEqual(newtag, BAD_CAST"style")) ||
01464      (xmlStrEqual(newtag, BAD_CAST"meta")) ||
01465      (xmlStrEqual(newtag, BAD_CAST"link")) ||
01466      (xmlStrEqual(newtag, BAD_CAST"title")) ||
01467      (xmlStrEqual(newtag, BAD_CAST"base")))) {
01468         if (ctxt->html >= 3) {
01469             /* we already saw or generated an <head> before */
01470             return;
01471         }
01472         /*
01473          * dropped OBJECT ... i you put it first BODY will be
01474          * assumed !
01475          */
01476         htmlnamePush(ctxt, BAD_CAST"head");
01477         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
01478             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
01479     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
01480            (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
01481            (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
01482         if (ctxt->html >= 10) {
01483             /* we already saw or generated a <body> before */
01484             return;
01485         }
01486     for (i = 0;i < ctxt->nameNr;i++) {
01487         if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
01488         return;
01489         }
01490         if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
01491         return;
01492         }
01493     }
01494 
01495     htmlnamePush(ctxt, BAD_CAST"body");
01496     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
01497         ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
01498     }
01499 }
01500 
01512 static int
01513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
01514     const xmlChar *tag;
01515     int i;
01516 
01517     if (ctxt == NULL)
01518     return(-1);
01519     tag = ctxt->name;
01520     if (tag == NULL) {
01521     htmlAutoClose(ctxt, BAD_CAST"p");
01522     htmlCheckImplied(ctxt, BAD_CAST"p");
01523     htmlnamePush(ctxt, BAD_CAST"p");
01524     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
01525         ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
01526     return(1);
01527     }
01528     if (!htmlOmittedDefaultValue)
01529     return(0);
01530     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
01531     if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
01532         htmlAutoClose(ctxt, BAD_CAST"p");
01533         htmlCheckImplied(ctxt, BAD_CAST"p");
01534         htmlnamePush(ctxt, BAD_CAST"p");
01535         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
01536         ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
01537         return(1);
01538     }
01539     }
01540     return(0);
01541 }
01542 
01551 int
01552 htmlIsScriptAttribute(const xmlChar *name) {
01553     unsigned int i;
01554 
01555     if (name == NULL)
01556       return(0);
01557     /*
01558      * all script attributes start with 'on'
01559      */
01560     if ((name[0] != 'o') || (name[1] != 'n'))
01561       return(0);
01562     for (i = 0;
01563      i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
01564      i++) {
01565     if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
01566         return(1);
01567     }
01568     return(0);
01569 }
01570 
01571 /************************************************************************
01572  *                                  *
01573  *  The list of HTML predefined entities            *
01574  *                                  *
01575  ************************************************************************/
01576 
01577 
01578 static const htmlEntityDesc  html40EntitiesTable[] = {
01579 /*
01580  * the 4 absolute ones, plus apostrophe.
01581  */
01582 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
01583 { 38,   "amp",  "ampersand, U+0026 ISOnum" },
01584 { 39,   "apos", "single quote" },
01585 { 60,   "lt",   "less-than sign, U+003C ISOnum" },
01586 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
01587 
01588 /*
01589  * A bunch still in the 128-255 range
01590  * Replacing them depend really on the charset used.
01591  */
01592 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
01593 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
01594 { 162,  "cent", "cent sign, U+00A2 ISOnum" },
01595 { 163,  "pound","pound sign, U+00A3 ISOnum" },
01596 { 164,  "curren","currency sign, U+00A4 ISOnum" },
01597 { 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
01598 { 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
01599 { 167,  "sect", "section sign, U+00A7 ISOnum" },
01600 { 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
01601 { 169,  "copy", "copyright sign, U+00A9 ISOnum" },
01602 { 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
01603 { 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
01604 { 172,  "not",  "not sign, U+00AC ISOnum" },
01605 { 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
01606 { 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
01607 { 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
01608 { 176,  "deg",  "degree sign, U+00B0 ISOnum" },
01609 { 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
01610 { 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
01611 { 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
01612 { 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
01613 { 181,  "micro","micro sign, U+00B5 ISOnum" },
01614 { 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
01615 { 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
01616 { 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
01617 { 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
01618 { 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
01619 { 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
01620 { 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
01621 { 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
01622 { 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
01623 { 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
01624 { 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
01625 { 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
01626 { 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
01627 { 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
01628 { 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
01629 { 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
01630 { 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
01631 { 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
01632 { 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
01633 { 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
01634 { 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
01635 { 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
01636 { 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
01637 { 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
01638 { 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
01639 { 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
01640 { 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
01641 { 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
01642 { 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
01643 { 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
01644 { 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
01645 { 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
01646 { 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
01647 { 215,  "times","multiplication sign, U+00D7 ISOnum" },
01648 { 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
01649 { 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
01650 { 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
01651 { 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
01652 { 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
01653 { 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
01654 { 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
01655 { 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
01656 { 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
01657 { 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
01658 { 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
01659 { 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
01660 { 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
01661 { 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
01662 { 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
01663 { 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
01664 { 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
01665 { 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
01666 { 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
01667 { 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
01668 { 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
01669 { 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
01670 { 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
01671 { 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
01672 { 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
01673 { 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
01674 { 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
01675 { 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
01676 { 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
01677 { 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
01678 { 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
01679 { 247,  "divide","division sign, U+00F7 ISOnum" },
01680 { 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
01681 { 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
01682 { 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
01683 { 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
01684 { 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
01685 { 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
01686 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
01687 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
01688 
01689 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
01690 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
01691 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
01692 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
01693 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
01694 
01695 /*
01696  * Anything below should really be kept as entities references
01697  */
01698 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
01699 
01700 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
01701 { 732,  "tilde","small tilde, U+02DC ISOdia" },
01702 
01703 { 913,  "Alpha","greek capital letter alpha, U+0391" },
01704 { 914,  "Beta", "greek capital letter beta, U+0392" },
01705 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
01706 { 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
01707 { 917,  "Epsilon","greek capital letter epsilon, U+0395" },
01708 { 918,  "Zeta", "greek capital letter zeta, U+0396" },
01709 { 919,  "Eta",  "greek capital letter eta, U+0397" },
01710 { 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
01711 { 921,  "Iota", "greek capital letter iota, U+0399" },
01712 { 922,  "Kappa","greek capital letter kappa, U+039A" },
01713 { 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
01714 { 924,  "Mu",   "greek capital letter mu, U+039C" },
01715 { 925,  "Nu",   "greek capital letter nu, U+039D" },
01716 { 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
01717 { 927,  "Omicron","greek capital letter omicron, U+039F" },
01718 { 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
01719 { 929,  "Rho",  "greek capital letter rho, U+03A1" },
01720 { 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
01721 { 932,  "Tau",  "greek capital letter tau, U+03A4" },
01722 { 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
01723 { 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
01724 { 935,  "Chi",  "greek capital letter chi, U+03A7" },
01725 { 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
01726 { 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
01727 
01728 { 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
01729 { 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
01730 { 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
01731 { 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
01732 { 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
01733 { 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
01734 { 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
01735 { 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
01736 { 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
01737 { 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
01738 { 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
01739 { 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
01740 { 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
01741 { 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
01742 { 959,  "omicron","greek small letter omicron, U+03BF NEW" },
01743 { 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
01744 { 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
01745 { 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
01746 { 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
01747 { 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
01748 { 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
01749 { 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
01750 { 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
01751 { 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
01752 { 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
01753 { 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
01754 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
01755 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
01756 
01757 { 8194, "ensp", "en space, U+2002 ISOpub" },
01758 { 8195, "emsp", "em space, U+2003 ISOpub" },
01759 { 8201, "thinsp","thin space, U+2009 ISOpub" },
01760 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
01761 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
01762 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
01763 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
01764 { 8211, "ndash","en dash, U+2013 ISOpub" },
01765 { 8212, "mdash","em dash, U+2014 ISOpub" },
01766 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
01767 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
01768 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
01769 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
01770 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
01771 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
01772 { 8224, "dagger","dagger, U+2020 ISOpub" },
01773 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
01774 
01775 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
01776 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
01777 
01778 { 8240, "permil","per mille sign, U+2030 ISOtech" },
01779 
01780 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
01781 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
01782 
01783 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
01784 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
01785 
01786 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
01787 { 8260, "frasl","fraction slash, U+2044 NEW" },
01788 
01789 { 8364, "euro", "euro sign, U+20AC NEW" },
01790 
01791 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
01792 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
01793 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
01794 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
01795 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
01796 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
01797 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
01798 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
01799 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
01800 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
01801 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
01802 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
01803 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
01804 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
01805 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
01806 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
01807 
01808 { 8704, "forall","for all, U+2200 ISOtech" },
01809 { 8706, "part", "partial differential, U+2202 ISOtech" },
01810 { 8707, "exist","there exists, U+2203 ISOtech" },
01811 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
01812 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
01813 { 8712, "isin", "element of, U+2208 ISOtech" },
01814 { 8713, "notin","not an element of, U+2209 ISOtech" },
01815 { 8715, "ni",   "contains as member, U+220B ISOtech" },
01816 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
01817 { 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
01818 { 8722, "minus","minus sign, U+2212 ISOtech" },
01819 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
01820 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
01821 { 8733, "prop", "proportional to, U+221D ISOtech" },
01822 { 8734, "infin","infinity, U+221E ISOtech" },
01823 { 8736, "ang",  "angle, U+2220 ISOamso" },
01824 { 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
01825 { 8744, "or",   "logical or = vee, U+2228 ISOtech" },
01826 { 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
01827 { 8746, "cup",  "union = cup, U+222A ISOtech" },
01828 { 8747, "int",  "integral, U+222B ISOtech" },
01829 { 8756, "there4","therefore, U+2234 ISOtech" },
01830 { 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
01831 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
01832 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
01833 { 8800, "ne",   "not equal to, U+2260 ISOtech" },
01834 { 8801, "equiv","identical to, U+2261 ISOtech" },
01835 { 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
01836 { 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
01837 { 8834, "sub",  "subset of, U+2282 ISOtech" },
01838 { 8835, "sup",  "superset of, U+2283 ISOtech" },
01839 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
01840 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
01841 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
01842 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
01843 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
01844 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
01845 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
01846 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
01847 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
01848 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
01849 { 8971, "rfloor","right floor, U+230B ISOamsc" },
01850 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
01851 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
01852 { 9674, "loz",  "lozenge, U+25CA ISOpub" },
01853 
01854 { 9824, "spades","black spade suit, U+2660 ISOpub" },
01855 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
01856 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
01857 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
01858 
01859 };
01860 
01861 /************************************************************************
01862  *                                  *
01863  *      Commodity functions to handle entities          *
01864  *                                  *
01865  ************************************************************************/
01866 
/*
 * growBuffer:
 * @buffer: name of the xmlChar array variable to enlarge
 *
 * Doubles the capacity of @buffer. The capacity is kept in the companion
 * variable named <buffer>_size, which is updated before reallocating.
 *
 * The macro expands in place and uses a variable called ctxt from the
 * calling scope for error reporting. When the reallocation fails it
 * reports the error, frees the old array and makes the *enclosing
 * function* return NULL. On success @buffer may point to a new location,
 * so callers have to recompute any pointer they kept into the old array.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer,                                \
                                 buffer##_size * sizeof(xmlChar));      \
    if (tmp == NULL) {                                                  \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
}
01881 
01892 const htmlEntityDesc *
01893 htmlEntityLookup(const xmlChar *name) {
01894     unsigned int i;
01895 
01896     for (i = 0;i < (sizeof(html40EntitiesTable)/
01897                     sizeof(html40EntitiesTable[0]));i++) {
01898         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
01899             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
01900     }
01901     }
01902     return(NULL);
01903 }
01904 
01915 const htmlEntityDesc *
01916 htmlEntityValueLookup(unsigned int value) {
01917     unsigned int i;
01918 
01919     for (i = 0;i < (sizeof(html40EntitiesTable)/
01920                     sizeof(html40EntitiesTable[0]));i++) {
01921         if (html40EntitiesTable[i].value >= value) {
01922         if (html40EntitiesTable[i].value > value)
01923         break;
01924             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
01925     }
01926     }
01927     return(NULL);
01928 }
01929 
01945 int
01946 UTF8ToHtml(unsigned char* out, int *outlen,
01947               const unsigned char* in, int *inlen) {
01948     const unsigned char* processed = in;
01949     const unsigned char* outend;
01950     const unsigned char* outstart = out;
01951     const unsigned char* instart = in;
01952     const unsigned char* inend;
01953     unsigned int c, d;
01954     int trailing;
01955 
01956     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
01957     if (in == NULL) {
01958         /*
01959      * initialization nothing to do
01960      */
01961     *outlen = 0;
01962     *inlen = 0;
01963     return(0);
01964     }
01965     inend = in + (*inlen);
01966     outend = out + (*outlen);
01967     while (in < inend) {
01968     d = *in++;
01969     if      (d < 0x80)  { c= d; trailing= 0; }
01970     else if (d < 0xC0) {
01971         /* trailing byte in leading position */
01972         *outlen = out - outstart;
01973         *inlen = processed - instart;
01974         return(-2);
01975         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
01976         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
01977         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
01978     else {
01979         /* no chance for this in Ascii */
01980         *outlen = out - outstart;
01981         *inlen = processed - instart;
01982         return(-2);
01983     }
01984 
01985     if (inend - in < trailing) {
01986         break;
01987     }
01988 
01989     for ( ; trailing; trailing--) {
01990         if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
01991         break;
01992         c <<= 6;
01993         c |= d & 0x3F;
01994     }
01995 
01996     /* assertion: c is a single UTF-4 value */
01997     if (c < 0x80) {
01998         if (out + 1 >= outend)
01999         break;
02000         *out++ = c;
02001     } else {
02002         int len;
02003         const htmlEntityDesc * ent;
02004         const char *cp;
02005         char nbuf[16];
02006 
02007         /*
02008          * Try to lookup a predefined HTML entity for it
02009          */
02010 
02011         ent = htmlEntityValueLookup(c);
02012         if (ent == NULL) {
02013           snprintf(nbuf, sizeof(nbuf), "#%u", c);
02014           cp = nbuf;
02015         }
02016         else
02017           cp = ent->name;
02018         len = strlen(cp);
02019         if (out + 2 + len >= outend)
02020         break;
02021         *out++ = '&';
02022         memcpy(out, cp, len);
02023         out += len;
02024         *out++ = ';';
02025     }
02026     processed = in;
02027     }
02028     *outlen = out - outstart;
02029     *inlen = processed - instart;
02030     return(0);
02031 }
02032 
02049 int
02050 htmlEncodeEntities(unsigned char* out, int *outlen,
02051            const unsigned char* in, int *inlen, int quoteChar) {
02052     const unsigned char* processed = in;
02053     const unsigned char* outend;
02054     const unsigned char* outstart = out;
02055     const unsigned char* instart = in;
02056     const unsigned char* inend;
02057     unsigned int c, d;
02058     int trailing;
02059 
02060     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
02061         return(-1);
02062     outend = out + (*outlen);
02063     inend = in + (*inlen);
02064     while (in < inend) {
02065     d = *in++;
02066     if      (d < 0x80)  { c= d; trailing= 0; }
02067     else if (d < 0xC0) {
02068         /* trailing byte in leading position */
02069         *outlen = out - outstart;
02070         *inlen = processed - instart;
02071         return(-2);
02072         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
02073         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
02074         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
02075     else {
02076         /* no chance for this in Ascii */
02077         *outlen = out - outstart;
02078         *inlen = processed - instart;
02079         return(-2);
02080     }
02081 
02082     if (inend - in < trailing)
02083         break;
02084 
02085     while (trailing--) {
02086         if (((d= *in++) & 0xC0) != 0x80) {
02087         *outlen = out - outstart;
02088         *inlen = processed - instart;
02089         return(-2);
02090         }
02091         c <<= 6;
02092         c |= d & 0x3F;
02093     }
02094 
02095     /* assertion: c is a single UTF-4 value */
02096     if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
02097         (c != '&') && (c != '<') && (c != '>')) {
02098         if (out >= outend)
02099         break;
02100         *out++ = c;
02101     } else {
02102         const htmlEntityDesc * ent;
02103         const char *cp;
02104         char nbuf[16];
02105         int len;
02106 
02107         /*
02108          * Try to lookup a predefined HTML entity for it
02109          */
02110         ent = htmlEntityValueLookup(c);
02111         if (ent == NULL) {
02112         snprintf(nbuf, sizeof(nbuf), "#%u", c);
02113         cp = nbuf;
02114         }
02115         else
02116         cp = ent->name;
02117         len = strlen(cp);
02118         if (out + 2 + len > outend)
02119         break;
02120         *out++ = '&';
02121         memcpy(out, cp, len);
02122         out += len;
02123         *out++ = ';';
02124     }
02125     processed = in;
02126     }
02127     *outlen = out - outstart;
02128     *inlen = processed - instart;
02129     return(0);
02130 }
02131 
02132 /************************************************************************
02133  *                                  *
02134  *      Commodity functions to handle streams           *
02135  *                                  *
02136  ************************************************************************/
02137 
02145 static htmlParserInputPtr
02146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
02147     htmlParserInputPtr input;
02148 
02149     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
02150     if (input == NULL) {
02151         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
02152     return(NULL);
02153     }
02154     memset(input, 0, sizeof(htmlParserInput));
02155     input->filename = NULL;
02156     input->directory = NULL;
02157     input->base = NULL;
02158     input->cur = NULL;
02159     input->buf = NULL;
02160     input->line = 1;
02161     input->col = 1;
02162     input->buf = NULL;
02163     input->free = NULL;
02164     input->version = NULL;
02165     input->consumed = 0;
02166     input->length = 0;
02167     return(input);
02168 }
02169 
02170 
02171 /************************************************************************
02172  *                                  *
02173  *      Commodity functions, cleanup needed ?           *
02174  *                                  *
02175  ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that is avoided here so as
 * not to risk any binary incompatibility.
 */
static const char *allowPCData[] = {
    "a",
    "abbr", "acronym", "address", "applet",
    "b",
    "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i",
    "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p",
    "pre",
    "q",
    "s",
    "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
02190 
02202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
02203     unsigned int i;
02204     int j;
02205     xmlNodePtr lastChild;
02206     xmlDtdPtr dtd;
02207 
02208     for (j = 0;j < len;j++)
02209         if (!(IS_BLANK_CH(str[j]))) return(0);
02210 
02211     if (CUR == 0) return(1);
02212     if (CUR != '<') return(0);
02213     if (ctxt->name == NULL)
02214     return(1);
02215     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
02216     return(1);
02217     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
02218     return(1);
02219 
02220     /* Only strip CDATA children of the body tag for strict HTML DTDs */
02221     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
02222         dtd = xmlGetIntSubset(ctxt->myDoc);
02223         if (dtd != NULL && dtd->ExternalID != NULL) {
02224             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
02225                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
02226                 return(1);
02227         }
02228     }
02229 
02230     if (ctxt->node == NULL) return(0);
02231     lastChild = xmlGetLastChild(ctxt->node);
02232     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
02233     lastChild = lastChild->prev;
02234     if (lastChild == NULL) {
02235         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
02236             (ctxt->node->content != NULL)) return(0);
02237     /* keep ws in constructs like ...<b> </b>...
02238        for all tags "b" allowing PCDATA */
02239     for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
02240         if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
02241         return(0);
02242         }
02243     }
02244     } else if (xmlNodeIsText(lastChild)) {
02245         return(0);
02246     } else {
02247     /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
02248        for all tags "p" allowing PCDATA */
02249     for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
02250         if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
02251         return(0);
02252         }
02253     }
02254     }
02255     return(1);
02256 }
02257 
02268 htmlDocPtr
02269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
02270     xmlDocPtr cur;
02271 
02272     /*
02273      * Allocate a new document and fill the fields.
02274      */
02275     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
02276     if (cur == NULL) {
02277     htmlErrMemory(NULL, "HTML document creation failed\n");
02278     return(NULL);
02279     }
02280     memset(cur, 0, sizeof(xmlDoc));
02281 
02282     cur->type = XML_HTML_DOCUMENT_NODE;
02283     cur->version = NULL;
02284     cur->intSubset = NULL;
02285     cur->doc = cur;
02286     cur->name = NULL;
02287     cur->children = NULL;
02288     cur->extSubset = NULL;
02289     cur->oldNs = NULL;
02290     cur->encoding = NULL;
02291     cur->standalone = 1;
02292     cur->compression = 0;
02293     cur->ids = NULL;
02294     cur->refs = NULL;
02295     cur->_private = NULL;
02296     cur->charset = XML_CHAR_ENCODING_UTF8;
02297     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
02298     if ((ExternalID != NULL) ||
02299     (URI != NULL))
02300     xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
02301     return(cur);
02302 }
02303 
02313 htmlDocPtr
02314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
02315     if ((URI == NULL) && (ExternalID == NULL))
02316     return(htmlNewDocNoDtD(
02317             BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
02318             BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
02319 
02320     return(htmlNewDocNoDtD(URI, ExternalID));
02321 }
02322 
02323 
02324 /************************************************************************
02325  *                                  *
02326  *          The parser itself               *
02327  *  Relates to http://www.w3.org/TR/html40              *
02328  *                                  *
02329  ************************************************************************/
02330 
02331 /************************************************************************
02332  *                                  *
02333  *          The parser itself               *
02334  *                                  *
02335  ************************************************************************/
02336 
02337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
02338 
02349 static const xmlChar *
02350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
02351     int i = 0;
02352     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
02353 
02354     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
02355         (CUR != ':') && (CUR != '.')) return(NULL);
02356 
02357     while ((i < HTML_PARSER_BUFFER_SIZE) &&
02358            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
02359        (CUR == ':') || (CUR == '-') || (CUR == '_') ||
02360            (CUR == '.'))) {
02361     if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
02362         else loc[i] = CUR;
02363     i++;
02364 
02365     NEXT;
02366     }
02367 
02368     return(xmlDictLookup(ctxt->dict, loc, i));
02369 }
02370 
02371 
02383 static const xmlChar *
02384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
02385     int i = 0;
02386     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
02387 
02388     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
02389         (NXT(1) != ':')) return(NULL);
02390 
02391     while ((i < HTML_PARSER_BUFFER_SIZE) &&
02392            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
02393        (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
02394     if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
02395         else loc[i] = NXT(1+i);
02396     i++;
02397     }
02398 
02399     return(xmlDictLookup(ctxt->dict, loc, i));
02400 }
02401 
02402 
02412 static const xmlChar *
02413 htmlParseName(htmlParserCtxtPtr ctxt) {
02414     const xmlChar *in;
02415     const xmlChar *ret;
02416     int count = 0;
02417 
02418     GROW;
02419 
02420     /*
02421      * Accelerator for simple ASCII names
02422      */
02423     in = ctxt->input->cur;
02424     if (((*in >= 0x61) && (*in <= 0x7A)) ||
02425     ((*in >= 0x41) && (*in <= 0x5A)) ||
02426     (*in == '_') || (*in == ':')) {
02427     in++;
02428     while (((*in >= 0x61) && (*in <= 0x7A)) ||
02429            ((*in >= 0x41) && (*in <= 0x5A)) ||
02430            ((*in >= 0x30) && (*in <= 0x39)) ||
02431            (*in == '_') || (*in == '-') ||
02432            (*in == ':') || (*in == '.'))
02433         in++;
02434     if ((*in > 0) && (*in < 0x80)) {
02435         count = in - ctxt->input->cur;
02436         ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
02437         ctxt->input->cur = in;
02438         ctxt->nbChars += count;
02439         ctxt->input->col += count;
02440         return(ret);
02441     }
02442     }
02443     return(htmlParseNameComplex(ctxt));
02444 }
02445 
02446 static const xmlChar *
02447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
02448     int len = 0, l;
02449     int c;
02450     int count = 0;
02451 
02452     /*
02453      * Handler for more complex cases
02454      */
02455     GROW;
02456     c = CUR_CHAR(l);
02457     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
02458     (!IS_LETTER(c) && (c != '_') &&
02459          (c != ':'))) {
02460     return(NULL);
02461     }
02462 
02463     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
02464        ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
02465             (c == '.') || (c == '-') ||
02466         (c == '_') || (c == ':') ||
02467         (IS_COMBINING(c)) ||
02468         (IS_EXTENDER(c)))) {
02469     if (count++ > 100) {
02470         count = 0;
02471         GROW;
02472     }
02473     len += l;
02474     NEXTL(l);
02475     c = CUR_CHAR(l);
02476     }
02477     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
02478 }
02479 
02480 
02492 static xmlChar *
02493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
02494     xmlChar *buffer = NULL;
02495     int buffer_size = 0;
02496     xmlChar *out = NULL;
02497     const xmlChar *name = NULL;
02498     const xmlChar *cur = NULL;
02499     const htmlEntityDesc * ent;
02500 
02501     /*
02502      * allocate a translation buffer.
02503      */
02504     buffer_size = HTML_PARSER_BUFFER_SIZE;
02505     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
02506     if (buffer == NULL) {
02507     htmlErrMemory(ctxt, "buffer allocation failed\n");
02508     return(NULL);
02509     }
02510     out = buffer;
02511 
02512     /*
02513      * Ok loop until we reach one of the ending chars
02514      */
02515     while ((CUR != 0) && (CUR != stop)) {
02516     if ((stop == 0) && (CUR == '>')) break;
02517     if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
02518         if (CUR == '&') {
02519         if (NXT(1) == '#') {
02520         unsigned int c;
02521         int bits;
02522 
02523         c = htmlParseCharRef(ctxt);
02524         if      (c <    0x80)
02525                 { *out++  = c;                bits= -6; }
02526         else if (c <   0x800)
02527                 { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
02528         else if (c < 0x10000)
02529                 { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
02530         else
02531                 { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
02532 
02533         for ( ; bits >= 0; bits-= 6) {
02534             *out++  = ((c >> bits) & 0x3F) | 0x80;
02535         }
02536 
02537         if (out - buffer > buffer_size - 100) {
02538             int indx = out - buffer;
02539 
02540             growBuffer(buffer);
02541             out = &buffer[indx];
02542         }
02543         } else {
02544         ent = htmlParseEntityRef(ctxt, &name);
02545         if (name == NULL) {
02546             *out++ = '&';
02547             if (out - buffer > buffer_size - 100) {
02548             int indx = out - buffer;
02549 
02550             growBuffer(buffer);
02551             out = &buffer[indx];
02552             }
02553         } else if (ent == NULL) {
02554             *out++ = '&';
02555             cur = name;
02556             while (*cur != 0) {
02557             if (out - buffer > buffer_size - 100) {
02558                 int indx = out - buffer;
02559 
02560                 growBuffer(buffer);
02561                 out = &buffer[indx];
02562             }
02563             *out++ = *cur++;
02564             }
02565         } else {
02566             unsigned int c;
02567             int bits;
02568 
02569             if (out - buffer > buffer_size - 100) {
02570             int indx = out - buffer;
02571 
02572             growBuffer(buffer);
02573             out = &buffer[indx];
02574             }
02575             c = ent->value;
02576             if      (c <    0x80)
02577             { *out++  = c;                bits= -6; }
02578             else if (c <   0x800)
02579             { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
02580             else if (c < 0x10000)
02581             { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
02582             else
02583             { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
02584 
02585             for ( ; bits >= 0; bits-= 6) {
02586             *out++  = ((c >> bits) & 0x3F) | 0x80;
02587             }
02588         }
02589         }
02590     } else {
02591         unsigned int c;
02592         int bits, l;
02593 
02594         if (out - buffer > buffer_size - 100) {
02595         int indx = out - buffer;
02596 
02597         growBuffer(buffer);
02598         out = &buffer[indx];
02599         }
02600         c = CUR_CHAR(l);
02601         if      (c <    0x80)
02602             { *out++  = c;                bits= -6; }
02603         else if (c <   0x800)
02604             { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
02605         else if (c < 0x10000)
02606             { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
02607         else
02608             { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
02609 
02610         for ( ; bits >= 0; bits-= 6) {
02611         *out++  = ((c >> bits) & 0x3F) | 0x80;
02612         }
02613         NEXT;
02614     }
02615     }
02616     *out = 0;
02617     return(buffer);
02618 }
02619 
02632 const htmlEntityDesc *
02633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
02634     const xmlChar *name;
02635     const htmlEntityDesc * ent = NULL;
02636 
02637     if (str != NULL) *str = NULL;
02638     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
02639 
02640     if (CUR == '&') {
02641         NEXT;
02642         name = htmlParseName(ctxt);
02643     if (name == NULL) {
02644         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
02645                      "htmlParseEntityRef: no name\n", NULL, NULL);
02646     } else {
02647         GROW;
02648         if (CUR == ';') {
02649             if (str != NULL)
02650             *str = name;
02651 
02652         /*
02653          * Lookup the entity in the table.
02654          */
02655         ent = htmlEntityLookup(name);
02656         if (ent != NULL) /* OK that's ugly !!! */
02657             NEXT;
02658         } else {
02659         htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
02660                      "htmlParseEntityRef: expecting ';'\n",
02661                  NULL, NULL);
02662             if (str != NULL)
02663             *str = name;
02664         }
02665     }
02666     }
02667     return(ent);
02668 }
02669 
02682 static xmlChar *
02683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
02684     xmlChar *ret = NULL;
02685 
02686     if (CUR == '"') {
02687         NEXT;
02688     ret = htmlParseHTMLAttribute(ctxt, '"');
02689         if (CUR != '"') {
02690         htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
02691                      "AttValue: \" expected\n", NULL, NULL);
02692     } else
02693         NEXT;
02694     } else if (CUR == '\'') {
02695         NEXT;
02696     ret = htmlParseHTMLAttribute(ctxt, '\'');
02697         if (CUR != '\'') {
02698         htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
02699                      "AttValue: ' expected\n", NULL, NULL);
02700     } else
02701         NEXT;
02702     } else {
02703         /*
02704      * That's an HTMLism, the attribute value may not be quoted
02705      */
02706     ret = htmlParseHTMLAttribute(ctxt, 0);
02707     if (ret == NULL) {
02708         htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
02709                      "AttValue: no value found\n", NULL, NULL);
02710     }
02711     }
02712     return(ret);
02713 }
02714 
02726 static xmlChar *
02727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
02728     const xmlChar *q;
02729     xmlChar *ret = NULL;
02730 
02731     if (CUR == '"') {
02732         NEXT;
02733     q = CUR_PTR;
02734     while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
02735         NEXT;
02736     if (!IS_CHAR_CH(CUR)) {
02737         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
02738              "Unfinished SystemLiteral\n", NULL, NULL);
02739     } else {
02740         ret = xmlStrndup(q, CUR_PTR - q);
02741         NEXT;
02742         }
02743     } else if (CUR == '\'') {
02744         NEXT;
02745     q = CUR_PTR;
02746     while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
02747         NEXT;
02748     if (!IS_CHAR_CH(CUR)) {
02749         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
02750              "Unfinished SystemLiteral\n", NULL, NULL);
02751     } else {
02752         ret = xmlStrndup(q, CUR_PTR - q);
02753         NEXT;
02754         }
02755     } else {
02756     htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
02757                  " or ' expected\n", NULL, NULL);
02758     }
02759 
02760     return(ret);
02761 }
02762 
02774 static xmlChar *
02775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
02776     const xmlChar *q;
02777     xmlChar *ret = NULL;
02778     /*
02779      * Name ::= (Letter | '_') (NameChar)*
02780      */
02781     if (CUR == '"') {
02782         NEXT;
02783     q = CUR_PTR;
02784     while (IS_PUBIDCHAR_CH(CUR)) NEXT;
02785     if (CUR != '"') {
02786         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
02787                      "Unfinished PubidLiteral\n", NULL, NULL);
02788     } else {
02789         ret = xmlStrndup(q, CUR_PTR - q);
02790         NEXT;
02791     }
02792     } else if (CUR == '\'') {
02793         NEXT;
02794     q = CUR_PTR;
02795     while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
02796         NEXT;
02797     if (CUR != '\'') {
02798         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
02799                      "Unfinished PubidLiteral\n", NULL, NULL);
02800     } else {
02801         ret = xmlStrndup(q, CUR_PTR - q);
02802         NEXT;
02803     }
02804     } else {
02805     htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
02806                  "PubidLiteral \" or ' expected\n", NULL, NULL);
02807     }
02808 
02809     return(ret);
02810 }
02811 
02833 static void
02834 htmlParseScript(htmlParserCtxtPtr ctxt) {
02835     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
02836     int nbchar = 0;
02837     int cur,l;
02838 
02839     SHRINK;
02840     cur = CUR_CHAR(l);
02841     while (IS_CHAR_CH(cur)) {
02842     if ((cur == '<') && (NXT(1) == '/')) {
02843             /*
02844              * One should break here, the specification is clear:
02845              * Authors should therefore escape "</" within the content.
02846              * Escape mechanisms are specific to each scripting or
02847              * style sheet language.
02848              *
02849              * In recovery mode, only break if end tag match the
02850              * current tag, effectively ignoring all tags inside the
02851              * script/style block and treating the entire block as
02852              * CDATA.
02853              */
02854             if (ctxt->recovery) {
02855                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
02856                    xmlStrlen(ctxt->name)) == 0)
02857                 {
02858                     break; /* while */
02859                 } else {
02860             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
02861                  "Element %s embeds close tag\n",
02862                          ctxt->name, NULL);
02863         }
02864             } else {
02865                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
02866                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
02867                 {
02868                     break; /* while */
02869                 }
02870             }
02871     }
02872     COPY_BUF(l,buf,nbchar,cur);
02873     if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
02874         if (ctxt->sax->cdataBlock!= NULL) {
02875         /*
02876          * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
02877          */
02878         ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
02879         } else if (ctxt->sax->characters != NULL) {
02880         ctxt->sax->characters(ctxt->userData, buf, nbchar);
02881         }
02882         nbchar = 0;
02883     }
02884     GROW;
02885     NEXTL(l);
02886     cur = CUR_CHAR(l);
02887     }
02888 
02889     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
02890         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
02891                     "Invalid char in CDATA 0x%X\n", cur);
02892         if (ctxt->input->cur < ctxt->input->end) {
02893             NEXT;
02894         }
02895     }
02896 
02897     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
02898     if (ctxt->sax->cdataBlock!= NULL) {
02899         /*
02900          * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
02901          */
02902         ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
02903     } else if (ctxt->sax->characters != NULL) {
02904         ctxt->sax->characters(ctxt->userData, buf, nbchar);
02905     }
02906     }
02907 }
02908 
02909 
02920 static void
02921 htmlParseCharData(htmlParserCtxtPtr ctxt) {
02922     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
02923     int nbchar = 0;
02924     int cur, l;
02925     int chunk = 0;
02926 
02927     SHRINK;
02928     cur = CUR_CHAR(l);
02929     while (((cur != '<') || (ctxt->token == '<')) &&
02930            ((cur != '&') || (ctxt->token == '&')) &&
02931        (cur != 0)) {
02932     if (!(IS_CHAR(cur))) {
02933         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
02934                     "Invalid char in CDATA 0x%X\n", cur);
02935     } else {
02936         COPY_BUF(l,buf,nbchar,cur);
02937     }
02938     if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
02939         /*
02940          * Ok the segment is to be consumed as chars.
02941          */
02942         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
02943         if (areBlanks(ctxt, buf, nbchar)) {
02944             if (ctxt->sax->ignorableWhitespace != NULL)
02945             ctxt->sax->ignorableWhitespace(ctxt->userData,
02946                                            buf, nbchar);
02947         } else {
02948             htmlCheckParagraph(ctxt);
02949             if (ctxt->sax->characters != NULL)
02950             ctxt->sax->characters(ctxt->userData, buf, nbchar);
02951         }
02952         }
02953         nbchar = 0;
02954     }
02955     NEXTL(l);
02956         chunk++;
02957         if (chunk > HTML_PARSER_BUFFER_SIZE) {
02958             chunk = 0;
02959             SHRINK;
02960             GROW;
02961         }
02962     cur = CUR_CHAR(l);
02963     if (cur == 0) {
02964         SHRINK;
02965         GROW;
02966         cur = CUR_CHAR(l);
02967     }
02968     }
02969     if (nbchar != 0) {
02970         buf[nbchar] = 0;
02971 
02972     /*
02973      * Ok the segment is to be consumed as chars.
02974      */
02975     if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
02976         if (areBlanks(ctxt, buf, nbchar)) {
02977         if (ctxt->sax->ignorableWhitespace != NULL)
02978             ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
02979         } else {
02980         htmlCheckParagraph(ctxt);
02981         if (ctxt->sax->characters != NULL)
02982             ctxt->sax->characters(ctxt->userData, buf, nbchar);
02983         }
02984     }
02985     } else {
02986     /*
02987      * Loop detection
02988      */
02989     if (cur == 0)
02990         ctxt->instate = XML_PARSER_EOF;
02991     }
02992 }
02993 
03011 static xmlChar *
03012 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
03013     xmlChar *URI = NULL;
03014 
03015     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
03016          (UPP(2) == 'S') && (UPP(3) == 'T') &&
03017      (UPP(4) == 'E') && (UPP(5) == 'M')) {
03018         SKIP(6);
03019     if (!IS_BLANK_CH(CUR)) {
03020         htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
03021                      "Space required after 'SYSTEM'\n", NULL, NULL);
03022     }
03023         SKIP_BLANKS;
03024     URI = htmlParseSystemLiteral(ctxt);
03025     if (URI == NULL) {
03026         htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
03027                      "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
03028         }
03029     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
03030            (UPP(2) == 'B') && (UPP(3) == 'L') &&
03031            (UPP(4) == 'I') && (UPP(5) == 'C')) {
03032         SKIP(6);
03033     if (!IS_BLANK_CH(CUR)) {
03034         htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
03035                      "Space required after 'PUBLIC'\n", NULL, NULL);
03036     }
03037         SKIP_BLANKS;
03038     *publicID = htmlParsePubidLiteral(ctxt);
03039     if (*publicID == NULL) {
03040         htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
03041                      "htmlParseExternalID: PUBLIC, no Public Identifier\n",
03042              NULL, NULL);
03043     }
03044         SKIP_BLANKS;
03045         if ((CUR == '"') || (CUR == '\'')) {
03046         URI = htmlParseSystemLiteral(ctxt);
03047     }
03048     }
03049     return(URI);
03050 }
03051 
03060 static void
03061 htmlParsePI(htmlParserCtxtPtr ctxt) {
03062     xmlChar *buf = NULL;
03063     int len = 0;
03064     int size = HTML_PARSER_BUFFER_SIZE;
03065     int cur, l;
03066     const xmlChar *target;
03067     xmlParserInputState state;
03068     int count = 0;
03069 
03070     if ((RAW == '<') && (NXT(1) == '?')) {
03071     state = ctxt->instate;
03072         ctxt->instate = XML_PARSER_PI;
03073     /*
03074      * this is a Processing Instruction.
03075      */
03076     SKIP(2);
03077     SHRINK;
03078 
03079     /*
03080      * Parse the target name and check for special support like
03081      * namespace.
03082      */
03083         target = htmlParseName(ctxt);
03084     if (target != NULL) {
03085         if (RAW == '>') {
03086         SKIP(1);
03087 
03088         /*
03089          * SAX: PI detected.
03090          */
03091         if ((ctxt->sax) && (!ctxt->disableSAX) &&
03092             (ctxt->sax->processingInstruction != NULL))
03093             ctxt->sax->processingInstruction(ctxt->userData,
03094                                              target, NULL);
03095         ctxt->instate = state;
03096         return;
03097         }
03098         buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
03099         if (buf == NULL) {
03100         htmlErrMemory(ctxt, NULL);
03101         ctxt->instate = state;
03102         return;
03103         }
03104         cur = CUR;
03105         if (!IS_BLANK(cur)) {
03106         htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
03107               "ParsePI: PI %s space expected\n", target, NULL);
03108         }
03109             SKIP_BLANKS;
03110         cur = CUR_CHAR(l);
03111         while (IS_CHAR(cur) && (cur != '>')) {
03112         if (len + 5 >= size) {
03113             xmlChar *tmp;
03114 
03115             size *= 2;
03116             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
03117             if (tmp == NULL) {
03118             htmlErrMemory(ctxt, NULL);
03119             xmlFree(buf);
03120             ctxt->instate = state;
03121             return;
03122             }
03123             buf = tmp;
03124         }
03125         count++;
03126         if (count > 50) {
03127             GROW;
03128             count = 0;
03129         }
03130         COPY_BUF(l,buf,len,cur);
03131         NEXTL(l);
03132         cur = CUR_CHAR(l);
03133         if (cur == 0) {
03134             SHRINK;
03135             GROW;
03136             cur = CUR_CHAR(l);
03137         }
03138         }
03139         buf[len] = 0;
03140         if (cur != '>') {
03141         htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
03142               "ParsePI: PI %s never end ...\n", target, NULL);
03143         } else {
03144         SKIP(1);
03145 
03146         /*
03147          * SAX: PI detected.
03148          */
03149         if ((ctxt->sax) && (!ctxt->disableSAX) &&
03150             (ctxt->sax->processingInstruction != NULL))
03151             ctxt->sax->processingInstruction(ctxt->userData,
03152                                              target, buf);
03153         }
03154         xmlFree(buf);
03155     } else {
03156         htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
03157                          "PI is not started correctly", NULL, NULL);
03158     }
03159     ctxt->instate = state;
03160     }
03161 }
03162 
03171 static void
03172 htmlParseComment(htmlParserCtxtPtr ctxt) {
03173     xmlChar *buf = NULL;
03174     int len;
03175     int size = HTML_PARSER_BUFFER_SIZE;
03176     int q, ql;
03177     int r, rl;
03178     int cur, l;
03179     xmlParserInputState state;
03180 
03181     /*
03182      * Check that there is a comment right here.
03183      */
03184     if ((RAW != '<') || (NXT(1) != '!') ||
03185         (NXT(2) != '-') || (NXT(3) != '-')) return;
03186 
03187     state = ctxt->instate;
03188     ctxt->instate = XML_PARSER_COMMENT;
03189     SHRINK;
03190     SKIP(4);
03191     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
03192     if (buf == NULL) {
03193         htmlErrMemory(ctxt, "buffer allocation failed\n");
03194     ctxt->instate = state;
03195     return;
03196     }
03197     q = CUR_CHAR(ql);
03198     NEXTL(ql);
03199     r = CUR_CHAR(rl);
03200     NEXTL(rl);
03201     cur = CUR_CHAR(l);
03202     len = 0;
03203     while (IS_CHAR(cur) &&
03204            ((cur != '>') ||
03205         (r != '-') || (q != '-'))) {
03206     if (len + 5 >= size) {
03207         xmlChar *tmp;
03208 
03209         size *= 2;
03210         tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
03211         if (tmp == NULL) {
03212             xmlFree(buf);
03213             htmlErrMemory(ctxt, "growing buffer failed\n");
03214         ctxt->instate = state;
03215         return;
03216         }
03217         buf = tmp;
03218     }
03219     COPY_BUF(ql,buf,len,q);
03220     q = r;
03221     ql = rl;
03222     r = cur;
03223     rl = l;
03224     NEXTL(l);
03225     cur = CUR_CHAR(l);
03226     if (cur == 0) {
03227         SHRINK;
03228         GROW;
03229         cur = CUR_CHAR(l);
03230     }
03231     }
03232     buf[len] = 0;
03233     if (!IS_CHAR(cur)) {
03234     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
03235                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
03236     xmlFree(buf);
03237     } else {
03238         NEXT;
03239     if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
03240         (!ctxt->disableSAX))
03241         ctxt->sax->comment(ctxt->userData, buf);
03242     xmlFree(buf);
03243     }
03244     ctxt->instate = state;
03245 }
03246 
03258 int
03259 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
03260     int val = 0;
03261 
03262     if ((ctxt == NULL) || (ctxt->input == NULL)) {
03263     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
03264              "htmlParseCharRef: context error\n",
03265              NULL, NULL);
03266         return(0);
03267     }
03268     if ((CUR == '&') && (NXT(1) == '#') &&
03269         ((NXT(2) == 'x') || NXT(2) == 'X')) {
03270     SKIP(3);
03271     while (CUR != ';') {
03272         if ((CUR >= '0') && (CUR <= '9'))
03273             val = val * 16 + (CUR - '0');
03274         else if ((CUR >= 'a') && (CUR <= 'f'))
03275             val = val * 16 + (CUR - 'a') + 10;
03276         else if ((CUR >= 'A') && (CUR <= 'F'))
03277             val = val * 16 + (CUR - 'A') + 10;
03278         else {
03279             htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
03280                      "htmlParseCharRef: missing semicolon\n",
03281                  NULL, NULL);
03282         break;
03283         }
03284         NEXT;
03285     }
03286     if (CUR == ';')
03287         NEXT;
03288     } else if  ((CUR == '&') && (NXT(1) == '#')) {
03289     SKIP(2);
03290     while (CUR != ';') {
03291         if ((CUR >= '0') && (CUR <= '9'))
03292             val = val * 10 + (CUR - '0');
03293         else {
03294             htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
03295                      "htmlParseCharRef: missing semicolon\n",
03296                  NULL, NULL);
03297         break;
03298         }
03299         NEXT;
03300     }
03301     if (CUR == ';')
03302         NEXT;
03303     } else {
03304     htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
03305                  "htmlParseCharRef: invalid value\n", NULL, NULL);
03306     }
03307     /*
03308      * Check the value IS_CHAR ...
03309      */
03310     if (IS_CHAR(val)) {
03311         return(val);
03312     } else {
03313     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
03314             "htmlParseCharRef: invalid xmlChar value %d\n",
03315             val);
03316     }
03317     return(0);
03318 }
03319 
03320 
03331 static void
03332 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
03333     const xmlChar *name;
03334     xmlChar *ExternalID = NULL;
03335     xmlChar *URI = NULL;
03336 
03337     /*
03338      * We know that '<!DOCTYPE' has been detected.
03339      */
03340     SKIP(9);
03341 
03342     SKIP_BLANKS;
03343 
03344     /*
03345      * Parse the DOCTYPE name.
03346      */
03347     name = htmlParseName(ctxt);
03348     if (name == NULL) {
03349     htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
03350                  "htmlParseDocTypeDecl : no DOCTYPE name !\n",
03351              NULL, NULL);
03352     }
03353     /*
03354      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
03355      */
03356 
03357     SKIP_BLANKS;
03358 
03359     /*
03360      * Check for SystemID and ExternalID
03361      */
03362     URI = htmlParseExternalID(ctxt, &ExternalID);
03363     SKIP_BLANKS;
03364 
03365     /*
03366      * We should be at the end of the DOCTYPE declaration.
03367      */
03368     if (CUR != '>') {
03369     htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
03370                  "DOCTYPE improperly terminated\n", NULL, NULL);
03371         /* We shouldn't try to resynchronize ... */
03372     }
03373     NEXT;
03374 
03375     /*
03376      * Create or update the document accordingly to the DOCTYPE
03377      */
03378     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
03379     (!ctxt->disableSAX))
03380     ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
03381 
03382     /*
03383      * Cleanup, since we don't use all those identifiers
03384      */
03385     if (URI != NULL) xmlFree(URI);
03386     if (ExternalID != NULL) xmlFree(ExternalID);
03387 }
03388 
03410 static const xmlChar *
03411 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
03412     const xmlChar *name;
03413     xmlChar *val = NULL;
03414 
03415     *value = NULL;
03416     name = htmlParseHTMLName(ctxt);
03417     if (name == NULL) {
03418     htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
03419                  "error parsing attribute name\n", NULL, NULL);
03420         return(NULL);
03421     }
03422 
03423     /*
03424      * read the value
03425      */
03426     SKIP_BLANKS;
03427     if (CUR == '=') {
03428         NEXT;
03429     SKIP_BLANKS;
03430     val = htmlParseAttValue(ctxt);
03431     }
03432 
03433     *value = val;
03434     return(name);
03435 }
03436 
03447 static void
03448 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
03449     const xmlChar *encoding;
03450 
03451     if ((ctxt == NULL) || (attvalue == NULL))
03452     return;
03453 
03454     /* do not change encoding */
03455     if (ctxt->input->encoding != NULL)
03456         return;
03457 
03458     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
03459     if (encoding != NULL) {
03460     encoding += 8;
03461     } else {
03462     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
03463     if (encoding != NULL)
03464         encoding += 9;
03465     }
03466     if (encoding != NULL) {
03467     xmlCharEncoding enc;
03468     xmlCharEncodingHandlerPtr handler;
03469 
03470     while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
03471 
03472     if (ctxt->input->encoding != NULL)
03473         xmlFree((xmlChar *) ctxt->input->encoding);
03474     ctxt->input->encoding = xmlStrdup(encoding);
03475 
03476     enc = xmlParseCharEncoding((const char *) encoding);
03477     /*
03478      * registered set of known encodings
03479      */
03480     if (enc != XML_CHAR_ENCODING_ERROR) {
03481         if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
03482              (enc == XML_CHAR_ENCODING_UTF16BE) ||
03483          (enc == XML_CHAR_ENCODING_UCS4LE) ||
03484          (enc == XML_CHAR_ENCODING_UCS4BE)) &&
03485         (ctxt->input->buf != NULL) &&
03486         (ctxt->input->buf->encoder == NULL)) {
03487         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
03488                      "htmlCheckEncoding: wrong encoding meta\n",
03489                  NULL, NULL);
03490         } else {
03491         xmlSwitchEncoding(ctxt, enc);
03492         }
03493         ctxt->charset = XML_CHAR_ENCODING_UTF8;
03494     } else {
03495         /*
03496          * fallback for unknown encodings
03497          */
03498         handler = xmlFindCharEncodingHandler((const char *) encoding);
03499         if (handler != NULL) {
03500         xmlSwitchToEncoding(ctxt, handler);
03501         ctxt->charset = XML_CHAR_ENCODING_UTF8;
03502         } else {
03503         ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
03504         }
03505     }
03506 
03507     if ((ctxt->input->buf != NULL) &&
03508         (ctxt->input->buf->encoder != NULL) &&
03509         (ctxt->input->buf->raw != NULL) &&
03510         (ctxt->input->buf->buffer != NULL)) {
03511         int nbchars;
03512         int processed;
03513 
03514         /*
03515          * convert as much as possible to the parser reading buffer.
03516          */
03517         processed = ctxt->input->cur - ctxt->input->base;
03518         xmlBufferShrink(ctxt->input->buf->buffer, processed);
03519         nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
03520                                ctxt->input->buf->buffer,
03521                        ctxt->input->buf->raw);
03522         if (nbchars < 0) {
03523         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
03524                      "htmlCheckEncoding: encoder error\n",
03525                  NULL, NULL);
03526         }
03527         ctxt->input->base =
03528         ctxt->input->cur = ctxt->input->buf->buffer->content;
03529             ctxt->input->end =
03530                           &ctxt->input->base[ctxt->input->buf->buffer->use];
03531     }
03532     }
03533 }
03534 
03542 static void
03543 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
03544     int i;
03545     const xmlChar *att, *value;
03546     int http = 0;
03547     const xmlChar *content = NULL;
03548 
03549     if ((ctxt == NULL) || (atts == NULL))
03550     return;
03551 
03552     i = 0;
03553     att = atts[i++];
03554     while (att != NULL) {
03555     value = atts[i++];
03556     if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
03557      && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
03558         http = 1;
03559     else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
03560         content = value;
03561     att = atts[i++];
03562     }
03563     if ((http) && (content != NULL))
03564     htmlCheckEncoding(ctxt, content);
03565 
03566 }
03567 
03588 static int
03589 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
03590     const xmlChar *name;
03591     const xmlChar *attname;
03592     xmlChar *attvalue;
03593     const xmlChar **atts;
03594     int nbatts = 0;
03595     int maxatts;
03596     int meta = 0;
03597     int i;
03598     int discardtag = 0;
03599 
03600     if (ctxt->instate == XML_PARSER_EOF)
03601         return(-1);
03602     if ((ctxt == NULL) || (ctxt->input == NULL)) {
03603     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
03604              "htmlParseStartTag: context error\n", NULL, NULL);
03605     return -1;
03606     }
03607     if (CUR != '<') return -1;
03608     NEXT;
03609 
03610     atts = ctxt->atts;
03611     maxatts = ctxt->maxatts;
03612 
03613     GROW;
03614     name = htmlParseHTMLName(ctxt);
03615     if (name == NULL) {
03616     htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
03617                  "htmlParseStartTag: invalid element name\n",
03618              NULL, NULL);
03619     /* Dump the bogus tag like browsers do */
03620     while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
03621                (ctxt->instate != XML_PARSER_EOF))
03622         NEXT;
03623         return -1;
03624     }
03625     if (xmlStrEqual(name, BAD_CAST"meta"))
03626     meta = 1;
03627 
03628     /*
03629      * Check for auto-closure of HTML elements.
03630      */
03631     htmlAutoClose(ctxt, name);
03632 
03633     /*
03634      * Check for implied HTML elements.
03635      */
03636     htmlCheckImplied(ctxt, name);
03637 
03638     /*
03639      * Avoid html at any level > 0, head at any level != 1
03640      * or any attempt to recurse body
03641      */
03642     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
03643     htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
03644                  "htmlParseStartTag: misplaced <html> tag\n",
03645              name, NULL);
03646     discardtag = 1;
03647     ctxt->depth++;
03648     }
03649     if ((ctxt->nameNr != 1) &&
03650     (xmlStrEqual(name, BAD_CAST"head"))) {
03651     htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
03652                  "htmlParseStartTag: misplaced <head> tag\n",
03653              name, NULL);
03654     discardtag = 1;
03655     ctxt->depth++;
03656     }
03657     if (xmlStrEqual(name, BAD_CAST"body")) {
03658     int indx;
03659     for (indx = 0;indx < ctxt->nameNr;indx++) {
03660         if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
03661         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
03662                      "htmlParseStartTag: misplaced <body> tag\n",
03663                  name, NULL);
03664         discardtag = 1;
03665         ctxt->depth++;
03666         }
03667     }
03668     }
03669 
03670     /*
03671      * Now parse the attributes, it ends up with the ending
03672      *
03673      * (S Attribute)* S?
03674      */
03675     SKIP_BLANKS;
03676     while ((IS_CHAR_CH(CUR)) &&
03677            (CUR != '>') &&
03678        ((CUR != '/') || (NXT(1) != '>'))) {
03679     long cons = ctxt->nbChars;
03680 
03681     GROW;
03682     attname = htmlParseAttribute(ctxt, &attvalue);
03683         if (attname != NULL) {
03684 
03685         /*
03686          * Well formedness requires at most one declaration of an attribute
03687          */
03688         for (i = 0; i < nbatts;i += 2) {
03689             if (xmlStrEqual(atts[i], attname)) {
03690             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
03691                          "Attribute %s redefined\n", attname, NULL);
03692             if (attvalue != NULL)
03693             xmlFree(attvalue);
03694             goto failed;
03695         }
03696         }
03697 
03698         /*
03699          * Add the pair to atts
03700          */
03701         if (atts == NULL) {
03702             maxatts = 22; /* allow for 10 attrs by default */
03703             atts = (const xmlChar **)
03704                xmlMalloc(maxatts * sizeof(xmlChar *));
03705         if (atts == NULL) {
03706             htmlErrMemory(ctxt, NULL);
03707             if (attvalue != NULL)
03708             xmlFree(attvalue);
03709             goto failed;
03710         }
03711         ctxt->atts = atts;
03712         ctxt->maxatts = maxatts;
03713         } else if (nbatts + 4 > maxatts) {
03714             const xmlChar **n;
03715 
03716             maxatts *= 2;
03717             n = (const xmlChar **) xmlRealloc((void *) atts,
03718                          maxatts * sizeof(const xmlChar *));
03719         if (n == NULL) {
03720             htmlErrMemory(ctxt, NULL);
03721             if (attvalue != NULL)
03722             xmlFree(attvalue);
03723             goto failed;
03724         }
03725         atts = n;
03726         ctxt->atts = atts;
03727         ctxt->maxatts = maxatts;
03728         }
03729         atts[nbatts++] = attname;
03730         atts[nbatts++] = attvalue;
03731         atts[nbatts] = NULL;
03732         atts[nbatts + 1] = NULL;
03733     }
03734     else {
03735         if (attvalue != NULL)
03736             xmlFree(attvalue);
03737         /* Dump the bogus attribute string up to the next blank or
03738          * the end of the tag. */
03739         while ((IS_CHAR_CH(CUR)) &&
03740                !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
03741            ((CUR != '/') || (NXT(1) != '>')))
03742         NEXT;
03743     }
03744 
03745 failed:
03746     SKIP_BLANKS;
03747         if (cons == ctxt->nbChars) {
03748         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
03749                      "htmlParseStartTag: problem parsing attributes\n",
03750              NULL, NULL);
03751         break;
03752     }
03753     }
03754 
03755     /*
03756      * Handle specific association to the META tag
03757      */
03758     if (meta && (nbatts != 0))
03759     htmlCheckMeta(ctxt, atts);
03760 
03761     /*
03762      * SAX: Start of Element !
03763      */
03764     if (!discardtag) {
03765     htmlnamePush(ctxt, name);
03766     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
03767         if (nbatts != 0)
03768         ctxt->sax->startElement(ctxt->userData, name, atts);
03769         else
03770         ctxt->sax->startElement(ctxt->userData, name, NULL);
03771     }
03772     }
03773 
03774     if (atts != NULL) {
03775         for (i = 1;i < nbatts;i += 2) {
03776         if (atts[i] != NULL)
03777         xmlFree((xmlChar *) atts[i]);
03778     }
03779     }
03780 
03781     return(discardtag);
03782 }
03783 
03799 static int
03800 htmlParseEndTag(htmlParserCtxtPtr ctxt)
03801 {
03802     const xmlChar *name;
03803     const xmlChar *oldname;
03804     int i, ret;
03805 
03806     if ((CUR != '<') || (NXT(1) != '/')) {
03807         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
03808                  "htmlParseEndTag: '</' not found\n", NULL, NULL);
03809         return (0);
03810     }
03811     SKIP(2);
03812 
03813     name = htmlParseHTMLName(ctxt);
03814     if (name == NULL)
03815         return (0);
03816     /*
03817      * We should definitely be at the ending "S? '>'" part
03818      */
03819     SKIP_BLANKS;
03820     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
03821         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
03822                  "End tag : expected '>'\n", NULL, NULL);
03823     if (ctxt->recovery) {
03824         /*
03825          * We're not at the ending > !!
03826          * Error, unless in recover mode where we search forwards
03827          * until we find a >
03828          */
03829         while (CUR != '\0' && CUR != '>') NEXT;
03830         NEXT;
03831     }
03832     } else
03833         NEXT;
03834 
03835     /*
03836      * if we ignored misplaced tags in htmlParseStartTag don't pop them
03837      * out now.
03838      */
03839     if ((ctxt->depth > 0) &&
03840         (xmlStrEqual(name, BAD_CAST "html") ||
03841          xmlStrEqual(name, BAD_CAST "body") ||
03842      xmlStrEqual(name, BAD_CAST "head"))) {
03843     ctxt->depth--;
03844     return (0);
03845     }
03846 
03847     /*
03848      * If the name read is not one of the element in the parsing stack
03849      * then return, it's just an error.
03850      */
03851     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
03852         if (xmlStrEqual(name, ctxt->nameTab[i]))
03853             break;
03854     }
03855     if (i < 0) {
03856         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
03857                  "Unexpected end tag : %s\n", name, NULL);
03858         return (0);
03859     }
03860 
03861 
03862     /*
03863      * Check for auto-closure of HTML elements.
03864      */
03865 
03866     htmlAutoCloseOnClose(ctxt, name);
03867 
03868     /*
03869      * Well formedness constraints, opening and closing must match.
03870      * With the exception that the autoclose may have popped stuff out
03871      * of the stack.
03872      */
03873     if (!xmlStrEqual(name, ctxt->name)) {
03874         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
03875             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
03876                      "Opening and ending tag mismatch: %s and %s\n",
03877              name, ctxt->name);
03878         }
03879     }
03880 
03881     /*
03882      * SAX: End of Tag
03883      */
03884     oldname = ctxt->name;
03885     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
03886         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
03887             ctxt->sax->endElement(ctxt->userData, name);
03888         htmlnamePop(ctxt);
03889         ret = 1;
03890     } else {
03891         ret = 0;
03892     }
03893 
03894     return (ret);
03895 }
03896 
03897 
03906 static void
03907 htmlParseReference(htmlParserCtxtPtr ctxt) {
03908     const htmlEntityDesc * ent;
03909     xmlChar out[6];
03910     const xmlChar *name;
03911     if (CUR != '&') return;
03912 
03913     if (NXT(1) == '#') {
03914     unsigned int c;
03915     int bits, i = 0;
03916 
03917     c = htmlParseCharRef(ctxt);
03918     if (c == 0)
03919         return;
03920 
03921         if      (c <    0x80) { out[i++]= c;                bits= -6; }
03922         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
03923         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
03924         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
03925 
03926         for ( ; bits >= 0; bits-= 6) {
03927             out[i++]= ((c >> bits) & 0x3F) | 0x80;
03928         }
03929     out[i] = 0;
03930 
03931     htmlCheckParagraph(ctxt);
03932     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
03933         ctxt->sax->characters(ctxt->userData, out, i);
03934     } else {
03935     ent = htmlParseEntityRef(ctxt, &name);
03936     if (name == NULL) {
03937         htmlCheckParagraph(ctxt);
03938         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
03939             ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
03940         return;
03941     }
03942     if ((ent == NULL) || !(ent->value > 0)) {
03943         htmlCheckParagraph(ctxt);
03944         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
03945         ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
03946         ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
03947         /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
03948         }
03949     } else {
03950         unsigned int c;
03951         int bits, i = 0;
03952 
03953         c = ent->value;
03954         if      (c <    0x80)
03955                 { out[i++]= c;                bits= -6; }
03956         else if (c <   0x800)
03957                 { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
03958         else if (c < 0x10000)
03959                 { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
03960         else
03961                 { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
03962 
03963         for ( ; bits >= 0; bits-= 6) {
03964         out[i++]= ((c >> bits) & 0x3F) | 0x80;
03965         }
03966         out[i] = 0;
03967 
03968         htmlCheckParagraph(ctxt);
03969         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
03970         ctxt->sax->characters(ctxt->userData, out, i);
03971     }
03972     }
03973 }
03974 
03983 static void
03984 htmlParseContent(htmlParserCtxtPtr ctxt) {
03985     xmlChar *currentNode;
03986     int depth;
03987     const xmlChar *name;
03988 
03989     currentNode = xmlStrdup(ctxt->name);
03990     depth = ctxt->nameNr;
03991     while (1) {
03992     long cons = ctxt->nbChars;
03993 
03994         GROW;
03995 
03996         if (ctxt->instate == XML_PARSER_EOF)
03997             break;
03998 
03999     /*
04000      * Our tag or one of it's parent or children is ending.
04001      */
04002         if ((CUR == '<') && (NXT(1) == '/')) {
04003         if (htmlParseEndTag(ctxt) &&
04004         ((currentNode != NULL) || (ctxt->nameNr == 0))) {
04005         if (currentNode != NULL)
04006             xmlFree(currentNode);
04007         return;
04008         }
04009         continue; /* while */
04010         }
04011 
04012     else if ((CUR == '<') &&
04013              ((IS_ASCII_LETTER(NXT(1))) ||
04014           (NXT(1) == '_') || (NXT(1) == ':'))) {
04015         name = htmlParseHTMLName_nonInvasive(ctxt);
04016         if (name == NULL) {
04017             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
04018              "htmlParseStartTag: invalid element name\n",
04019              NULL, NULL);
04020             /* Dump the bogus tag like browsers do */
04021         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
04022                 NEXT;
04023 
04024             if (currentNode != NULL)
04025                 xmlFree(currentNode);
04026             return;
04027         }
04028 
04029         if (ctxt->name != NULL) {
04030             if (htmlCheckAutoClose(name, ctxt->name) == 1) {
04031                 htmlAutoClose(ctxt, name);
04032                 continue;
04033             }
04034         }
04035     }
04036 
04037     /*
04038      * Has this node been popped out during parsing of
04039      * the next element
04040      */
04041         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
04042         (!xmlStrEqual(currentNode, ctxt->name)))
04043          {
04044         if (currentNode != NULL) xmlFree(currentNode);
04045         return;
04046     }
04047 
04048     if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
04049         (xmlStrEqual(currentNode, BAD_CAST"style")))) {
04050         /*
04051          * Handle SCRIPT/STYLE separately
04052          */
04053         htmlParseScript(ctxt);
04054     } else {
04055         /*
04056          * Sometimes DOCTYPE arrives in the middle of the document
04057          */
04058         if ((CUR == '<') && (NXT(1) == '!') &&
04059         (UPP(2) == 'D') && (UPP(3) == 'O') &&
04060         (UPP(4) == 'C') && (UPP(5) == 'T') &&
04061         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
04062         (UPP(8) == 'E')) {
04063         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
04064                      "Misplaced DOCTYPE declaration\n",
04065                  BAD_CAST "DOCTYPE" , NULL);
04066         htmlParseDocTypeDecl(ctxt);
04067         }
04068 
04069         /*
04070          * First case :  a comment
04071          */
04072         if ((CUR == '<') && (NXT(1) == '!') &&
04073         (NXT(2) == '-') && (NXT(3) == '-')) {
04074         htmlParseComment(ctxt);
04075         }
04076 
04077         /*
04078          * Second case : a Processing Instruction.
04079          */
04080         else if ((CUR == '<') && (NXT(1) == '?')) {
04081         htmlParsePI(ctxt);
04082         }
04083 
04084         /*
04085          * Third case :  a sub-element.
04086          */
04087         else if (CUR == '<') {
04088         htmlParseElement(ctxt);
04089         }
04090 
04091         /*
04092          * Fourth case : a reference. If if has not been resolved,
04093          *    parsing returns it's Name, create the node
04094          */
04095         else if (CUR == '&') {
04096         htmlParseReference(ctxt);
04097         }
04098 
04099         /*
04100          * Fifth case : end of the resource
04101          */
04102         else if (CUR == 0) {
04103         htmlAutoCloseOnEnd(ctxt);
04104         break;
04105         }
04106 
04107         /*
04108          * Last case, text. Note that References are handled directly.
04109          */
04110         else {
04111         htmlParseCharData(ctxt);
04112         }
04113 
04114         if (cons == ctxt->nbChars) {
04115         if (ctxt->node != NULL) {
04116             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
04117                          "detected an error in element content\n",
04118                  NULL, NULL);
04119         }
04120         break;
04121         }
04122     }
04123         GROW;
04124     }
04125     if (currentNode != NULL) xmlFree(currentNode);
04126 }
04127 
04140 void
04141 htmlParseElement(htmlParserCtxtPtr ctxt) {
04142     const xmlChar *name;
04143     xmlChar *currentNode = NULL;
04144     const htmlElemDesc * info;
04145     htmlParserNodeInfo node_info;
04146     int failed;
04147     int depth;
04148     const xmlChar *oldptr;
04149 
04150     if ((ctxt == NULL) || (ctxt->input == NULL)) {
04151     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
04152              "htmlParseElement: context error\n", NULL, NULL);
04153     return;
04154     }
04155 
04156     if (ctxt->instate == XML_PARSER_EOF)
04157         return;
04158 
04159     /* Capture start position */
04160     if (ctxt->record_info) {
04161         node_info.begin_pos = ctxt->input->consumed +
04162                           (CUR_PTR - ctxt->input->base);
04163     node_info.begin_line = ctxt->input->line;
04164     }
04165 
04166     failed = htmlParseStartTag(ctxt);
04167     name = ctxt->name;
04168     if ((failed == -1) || (name == NULL)) {
04169     if (CUR == '>')
04170         NEXT;
04171         return;
04172     }
04173 
04174     /*
04175      * Lookup the info for that element.
04176      */
04177     info = htmlTagLookup(name);
04178     if (info == NULL) {
04179     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
04180                  "Tag %s invalid\n", name, NULL);
04181     }
04182 
04183     /*
04184      * Check for an Empty Element labeled the XML/SGML way
04185      */
04186     if ((CUR == '/') && (NXT(1) == '>')) {
04187         SKIP(2);
04188     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
04189         ctxt->sax->endElement(ctxt->userData, name);
04190     htmlnamePop(ctxt);
04191     return;
04192     }
04193 
04194     if (CUR == '>') {
04195         NEXT;
04196     } else {
04197     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
04198                  "Couldn't find end of Start Tag %s\n", name, NULL);
04199 
04200     /*
04201      * end of parsing of this node.
04202      */
04203     if (xmlStrEqual(name, ctxt->name)) {
04204         nodePop(ctxt);
04205         htmlnamePop(ctxt);
04206     }
04207 
04208     /*
04209      * Capture end position and add node
04210      */
04211     if (ctxt->record_info) {
04212        node_info.end_pos = ctxt->input->consumed +
04213                   (CUR_PTR - ctxt->input->base);
04214        node_info.end_line = ctxt->input->line;
04215        node_info.node = ctxt->node;
04216        xmlParserAddNodeInfo(ctxt, &node_info);
04217     }
04218     return;
04219     }
04220 
04221     /*
04222      * Check for an Empty Element from DTD definition
04223      */
04224     if ((info != NULL) && (info->empty)) {
04225     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
04226         ctxt->sax->endElement(ctxt->userData, name);
04227     htmlnamePop(ctxt);
04228     return;
04229     }
04230 
04231     /*
04232      * Parse the content of the element:
04233      */
04234     currentNode = xmlStrdup(ctxt->name);
04235     depth = ctxt->nameNr;
04236     while (IS_CHAR_CH(CUR)) {
04237     oldptr = ctxt->input->cur;
04238     htmlParseContent(ctxt);
04239     if (oldptr==ctxt->input->cur) break;
04240     if (ctxt->nameNr < depth) break;
04241     }
04242 
04243     /*
04244      * Capture end position and add node
04245      */
04246     if ( currentNode != NULL && ctxt->record_info ) {
04247        node_info.end_pos = ctxt->input->consumed +
04248                           (CUR_PTR - ctxt->input->base);
04249        node_info.end_line = ctxt->input->line;
04250        node_info.node = ctxt->node;
04251        xmlParserAddNodeInfo(ctxt, &node_info);
04252     }
04253     if (!IS_CHAR_CH(CUR)) {
04254     htmlAutoCloseOnEnd(ctxt);
04255     }
04256 
04257     if (currentNode != NULL)
04258     xmlFree(currentNode);
04259 }
04260 
04261 static void
04262 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
04263     /*
04264      * Capture end position and add node
04265      */
04266     if ( ctxt->node != NULL && ctxt->record_info ) {
04267        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
04268                                 (CUR_PTR - ctxt->input->base);
04269        ctxt->nodeInfo->end_line = ctxt->input->line;
04270        ctxt->nodeInfo->node = ctxt->node;
04271        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
04272        htmlNodeInfoPop(ctxt);
04273     }
04274     if (!IS_CHAR_CH(CUR)) {
04275        htmlAutoCloseOnEnd(ctxt);
04276     }
04277 }
04278 
04290 static void
04291 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
04292     const xmlChar *name;
04293     const htmlElemDesc * info;
04294     htmlParserNodeInfo node_info;
04295     int failed;
04296 
04297     if ((ctxt == NULL) || (ctxt->input == NULL)) {
04298     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
04299              "htmlParseElementInternal: context error\n", NULL, NULL);
04300     return;
04301     }
04302 
04303     if (ctxt->instate == XML_PARSER_EOF)
04304         return;
04305 
04306     /* Capture start position */
04307     if (ctxt->record_info) {
04308         node_info.begin_pos = ctxt->input->consumed +
04309                           (CUR_PTR - ctxt->input->base);
04310     node_info.begin_line = ctxt->input->line;
04311     }
04312 
04313     failed = htmlParseStartTag(ctxt);
04314     name = ctxt->name;
04315     if ((failed == -1) || (name == NULL)) {
04316     if (CUR == '>')
04317         NEXT;
04318         return;
04319     }
04320 
04321     /*
04322      * Lookup the info for that element.
04323      */
04324     info = htmlTagLookup(name);
04325     if (info == NULL) {
04326     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
04327                  "Tag %s invalid\n", name, NULL);
04328     }
04329 
04330     /*
04331      * Check for an Empty Element labeled the XML/SGML way
04332      */
04333     if ((CUR == '/') && (NXT(1) == '>')) {
04334         SKIP(2);
04335     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
04336         ctxt->sax->endElement(ctxt->userData, name);
04337     htmlnamePop(ctxt);
04338     return;
04339     }
04340 
04341     if (CUR == '>') {
04342         NEXT;
04343     } else {
04344     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
04345                  "Couldn't find end of Start Tag %s\n", name, NULL);
04346 
04347     /*
04348      * end of parsing of this node.
04349      */
04350     if (xmlStrEqual(name, ctxt->name)) {
04351         nodePop(ctxt);
04352         htmlnamePop(ctxt);
04353     }
04354 
04355         if (ctxt->record_info)
04356             htmlNodeInfoPush(ctxt, &node_info);
04357         htmlParserFinishElementParsing(ctxt);
04358     return;
04359     }
04360 
04361     /*
04362      * Check for an Empty Element from DTD definition
04363      */
04364     if ((info != NULL) && (info->empty)) {
04365     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
04366         ctxt->sax->endElement(ctxt->userData, name);
04367     htmlnamePop(ctxt);
04368     return;
04369     }
04370 
04371     if (ctxt->record_info)
04372         htmlNodeInfoPush(ctxt, &node_info);
04373 }
04374 
04383 static void
04384 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
04385     xmlChar *currentNode;
04386     int depth;
04387     const xmlChar *name;
04388 
04389     currentNode = xmlStrdup(ctxt->name);
04390     depth = ctxt->nameNr;
04391     while (1) {
04392     long cons = ctxt->nbChars;
04393 
04394         GROW;
04395 
04396         if (ctxt->instate == XML_PARSER_EOF)
04397             break;
04398 
04399     /*
04400      * Our tag or one of it's parent or children is ending.
04401      */
04402         if ((CUR == '<') && (NXT(1) == '/')) {
04403         if (htmlParseEndTag(ctxt) &&
04404         ((currentNode != NULL) || (ctxt->nameNr == 0))) {
04405         if (currentNode != NULL)
04406             xmlFree(currentNode);
04407 
04408             currentNode = xmlStrdup(ctxt->name);
04409             depth = ctxt->nameNr;
04410         }
04411         continue; /* while */
04412         }
04413 
04414     else if ((CUR == '<') &&
04415              ((IS_ASCII_LETTER(NXT(1))) ||
04416           (NXT(1) == '_') || (NXT(1) == ':'))) {
04417         name = htmlParseHTMLName_nonInvasive(ctxt);
04418         if (name == NULL) {
04419             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
04420              "htmlParseStartTag: invalid element name\n",
04421              NULL, NULL);
04422             /* Dump the bogus tag like browsers do */
04423             while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
04424                 NEXT;
04425 
04426             htmlParserFinishElementParsing(ctxt);
04427             if (currentNode != NULL)
04428                 xmlFree(currentNode);
04429 
04430             currentNode = xmlStrdup(ctxt->name);
04431             depth = ctxt->nameNr;
04432             continue;
04433         }
04434 
04435         if (ctxt->name != NULL) {
04436             if (htmlCheckAutoClose(name, ctxt->name) == 1) {
04437                 htmlAutoClose(ctxt, name);
04438                 continue;
04439             }
04440         }
04441     }
04442 
04443     /*
04444      * Has this node been popped out during parsing of
04445      * the next element
04446      */
04447         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
04448         (!xmlStrEqual(currentNode, ctxt->name)))
04449          {
04450         htmlParserFinishElementParsing(ctxt);
04451         if (currentNode != NULL) xmlFree(currentNode);
04452 
04453         currentNode = xmlStrdup(ctxt->name);
04454         depth = ctxt->nameNr;
04455         continue;
04456     }
04457 
04458     if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
04459         (xmlStrEqual(currentNode, BAD_CAST"style")))) {
04460         /*
04461          * Handle SCRIPT/STYLE separately
04462          */
04463         htmlParseScript(ctxt);
04464     } else {
04465         /*
04466          * Sometimes DOCTYPE arrives in the middle of the document
04467          */
04468         if ((CUR == '<') && (NXT(1) == '!') &&
04469         (UPP(2) == 'D') && (UPP(3) == 'O') &&
04470         (UPP(4) == 'C') && (UPP(5) == 'T') &&
04471         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
04472         (UPP(8) == 'E')) {
04473         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
04474                      "Misplaced DOCTYPE declaration\n",
04475                  BAD_CAST "DOCTYPE" , NULL);
04476         htmlParseDocTypeDecl(ctxt);
04477         }
04478 
04479         /*
04480          * First case :  a comment
04481          */
04482         if ((CUR == '<') && (NXT(1) == '!') &&
04483         (NXT(2) == '-') && (NXT(3) == '-')) {
04484         htmlParseComment(ctxt);
04485         }
04486 
04487         /*
04488          * Second case : a Processing Instruction.
04489          */
04490         else if ((CUR == '<') && (NXT(1) == '?')) {
04491         htmlParsePI(ctxt);
04492         }
04493 
04494         /*
04495          * Third case :  a sub-element.
04496          */
04497         else if (CUR == '<') {
04498         htmlParseElementInternal(ctxt);
04499         if (currentNode != NULL) xmlFree(currentNode);
04500 
04501         currentNode = xmlStrdup(ctxt->name);
04502         depth = ctxt->nameNr;
04503         }
04504 
04505         /*
04506          * Fourth case : a reference. If if has not been resolved,
04507          *    parsing returns it's Name, create the node
04508          */
04509         else if (CUR == '&') {
04510         htmlParseReference(ctxt);
04511         }
04512 
04513         /*
04514          * Fifth case : end of the resource
04515          */
04516         else if (CUR == 0) {
04517         htmlAutoCloseOnEnd(ctxt);
04518         break;
04519         }
04520 
04521         /*
04522          * Last case, text. Note that References are handled directly.
04523          */
04524         else {
04525         htmlParseCharData(ctxt);
04526         }
04527 
04528         if (cons == ctxt->nbChars) {
04529         if (ctxt->node != NULL) {
04530             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
04531                          "detected an error in element content\n",
04532                  NULL, NULL);
04533         }
04534         break;
04535         }
04536     }
04537         GROW;
04538     }
04539     if (currentNode != NULL) xmlFree(currentNode);
04540 }
04541 
04550 void
04551 __htmlParseContent(void *ctxt) {
04552     if (ctxt != NULL)
04553     htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
04554 }
04555 
04567 int
04568 htmlParseDocument(htmlParserCtxtPtr ctxt) {
04569     xmlChar start[4];
04570     xmlCharEncoding enc;
04571     xmlDtdPtr dtd;
04572 
04573     xmlInitParser();
04574 
04575     htmlDefaultSAXHandlerInit();
04576 
04577     if ((ctxt == NULL) || (ctxt->input == NULL)) {
04578     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
04579              "htmlParseDocument: context error\n", NULL, NULL);
04580     return(XML_ERR_INTERNAL_ERROR);
04581     }
04582     ctxt->html = 1;
04583     ctxt->linenumbers = 1;
04584     GROW;
04585     /*
04586      * SAX: beginning of the document processing.
04587      */
04588     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
04589         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
04590 
04591     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
04592         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
04593     /*
04594      * Get the 4 first bytes and decode the charset
04595      * if enc != XML_CHAR_ENCODING_NONE
04596      * plug some encoding conversion routines.
04597      */
04598     start[0] = RAW;
04599     start[1] = NXT(1);
04600     start[2] = NXT(2);
04601     start[3] = NXT(3);
04602     enc = xmlDetectCharEncoding(&start[0], 4);
04603     if (enc != XML_CHAR_ENCODING_NONE) {
04604         xmlSwitchEncoding(ctxt, enc);
04605     }
04606     }
04607 
04608     /*
04609      * Wipe out everything which is before the first '<'
04610      */
04611     SKIP_BLANKS;
04612     if (CUR == 0) {
04613     htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
04614                  "Document is empty\n", NULL, NULL);
04615     }
04616 
04617     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
04618     ctxt->sax->startDocument(ctxt->userData);
04619 
04620 
04621     /*
04622      * Parse possible comments and PIs before any content
04623      */
04624     while (((CUR == '<') && (NXT(1) == '!') &&
04625             (NXT(2) == '-') && (NXT(3) == '-')) ||
04626        ((CUR == '<') && (NXT(1) == '?'))) {
04627         htmlParseComment(ctxt);
04628         htmlParsePI(ctxt);
04629     SKIP_BLANKS;
04630     }
04631 
04632 
04633     /*
04634      * Then possibly doc type declaration(s) and more Misc
04635      * (doctypedecl Misc*)?
04636      */
04637     if ((CUR == '<') && (NXT(1) == '!') &&
04638     (UPP(2) == 'D') && (UPP(3) == 'O') &&
04639     (UPP(4) == 'C') && (UPP(5) == 'T') &&
04640     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
04641     (UPP(8) == 'E')) {
04642     htmlParseDocTypeDecl(ctxt);
04643     }
04644     SKIP_BLANKS;
04645 
04646     /*
04647      * Parse possible comments and PIs before any content
04648      */
04649     while (((CUR == '<') && (NXT(1) == '!') &&
04650             (NXT(2) == '-') && (NXT(3) == '-')) ||
04651        ((CUR == '<') && (NXT(1) == '?'))) {
04652         htmlParseComment(ctxt);
04653         htmlParsePI(ctxt);
04654     SKIP_BLANKS;
04655     }
04656 
04657     /*
04658      * Time to start parsing the tree itself
04659      */
04660     htmlParseContentInternal(ctxt);
04661 
04662     /*
04663      * autoclose
04664      */
04665     if (CUR == 0)
04666     htmlAutoCloseOnEnd(ctxt);
04667 
04668 
04669     /*
04670      * SAX: end of the document processing.
04671      */
04672     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
04673         ctxt->sax->endDocument(ctxt->userData);
04674 
04675     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
04676     dtd = xmlGetIntSubset(ctxt->myDoc);
04677     if (dtd == NULL)
04678         ctxt->myDoc->intSubset =
04679         xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
04680             BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
04681             BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
04682     }
04683     if (! ctxt->wellFormed) return(-1);
04684     return(0);
04685 }
04686 
04687 
04688 /************************************************************************
04689  *                                  *
04690  *          Parser contexts handling            *
04691  *                                  *
04692  ************************************************************************/
04693 
04703 static int
04704 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
04705 {
04706     htmlSAXHandler *sax;
04707 
04708     if (ctxt == NULL) return(-1);
04709     memset(ctxt, 0, sizeof(htmlParserCtxt));
04710 
04711     ctxt->dict = xmlDictCreate();
04712     if (ctxt->dict == NULL) {
04713         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
04714     return(-1);
04715     }
04716     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
04717     if (sax == NULL) {
04718         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
04719     return(-1);
04720     }
04721     else
04722         memset(sax, 0, sizeof(htmlSAXHandler));
04723 
04724     /* Allocate the Input stack */
04725     ctxt->inputTab = (htmlParserInputPtr *)
04726                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
04727     if (ctxt->inputTab == NULL) {
04728         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
04729     ctxt->inputNr = 0;
04730     ctxt->inputMax = 0;
04731     ctxt->input = NULL;
04732     return(-1);
04733     }
04734     ctxt->inputNr = 0;
04735     ctxt->inputMax = 5;
04736     ctxt->input = NULL;
04737     ctxt->version = NULL;
04738     ctxt->encoding = NULL;
04739     ctxt->standalone = -1;
04740     ctxt->instate = XML_PARSER_START;
04741 
04742     /* Allocate the Node stack */
04743     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
04744     if (ctxt->nodeTab == NULL) {
04745         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
04746     ctxt->nodeNr = 0;
04747     ctxt->nodeMax = 0;
04748     ctxt->node = NULL;
04749     ctxt->inputNr = 0;
04750     ctxt->inputMax = 0;
04751     ctxt->input = NULL;
04752     return(-1);
04753     }
04754     ctxt->nodeNr = 0;
04755     ctxt->nodeMax = 10;
04756     ctxt->node = NULL;
04757 
04758     /* Allocate the Name stack */
04759     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
04760     if (ctxt->nameTab == NULL) {
04761         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
04762     ctxt->nameNr = 0;
04763     ctxt->nameMax = 0;
04764     ctxt->name = NULL;
04765     ctxt->nodeNr = 0;
04766     ctxt->nodeMax = 0;
04767     ctxt->node = NULL;
04768     ctxt->inputNr = 0;
04769     ctxt->inputMax = 0;
04770     ctxt->input = NULL;
04771     return(-1);
04772     }
04773     ctxt->nameNr = 0;
04774     ctxt->nameMax = 10;
04775     ctxt->name = NULL;
04776 
04777     ctxt->nodeInfoTab = NULL;
04778     ctxt->nodeInfoNr  = 0;
04779     ctxt->nodeInfoMax = 0;
04780 
04781     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
04782     else {
04783         ctxt->sax = sax;
04784     memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
04785     }
04786     ctxt->userData = ctxt;
04787     ctxt->myDoc = NULL;
04788     ctxt->wellFormed = 1;
04789     ctxt->replaceEntities = 0;
04790     ctxt->linenumbers = xmlLineNumbersDefaultValue;
04791     ctxt->html = 1;
04792     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
04793     ctxt->vctxt.userData = ctxt;
04794     ctxt->vctxt.error = xmlParserValidityError;
04795     ctxt->vctxt.warning = xmlParserValidityWarning;
04796     ctxt->record_info = 0;
04797     ctxt->validate = 0;
04798     ctxt->nbChars = 0;
04799     ctxt->checkIndex = 0;
04800     ctxt->catalogs = NULL;
04801     xmlInitNodeInfoSeq(&ctxt->node_seq);
04802     return(0);
04803 }
04804 
04813 void
04814 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
04815 {
04816     xmlFreeParserCtxt(ctxt);
04817 }
04818 
04827 htmlParserCtxtPtr
04828 htmlNewParserCtxt(void)
04829 {
04830     xmlParserCtxtPtr ctxt;
04831 
04832     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
04833     if (ctxt == NULL) {
04834         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
04835     return(NULL);
04836     }
04837     memset(ctxt, 0, sizeof(xmlParserCtxt));
04838     if (htmlInitParserCtxt(ctxt) < 0) {
04839         htmlFreeParserCtxt(ctxt);
04840     return(NULL);
04841     }
04842     return(ctxt);
04843 }
04844 
04854 htmlParserCtxtPtr
04855 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
04856     xmlParserCtxtPtr ctxt;
04857     xmlParserInputPtr input;
04858     xmlParserInputBufferPtr buf;
04859 
04860     if (buffer == NULL)
04861     return(NULL);
04862     if (size <= 0)
04863     return(NULL);
04864 
04865     ctxt = htmlNewParserCtxt();
04866     if (ctxt == NULL)
04867     return(NULL);
04868 
04869     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
04870     if (buf == NULL) return(NULL);
04871 
04872     input = xmlNewInputStream(ctxt);
04873     if (input == NULL) {
04874     xmlFreeParserCtxt(ctxt);
04875     return(NULL);
04876     }
04877 
04878     input->filename = NULL;
04879     input->buf = buf;
04880     input->base = input->buf->buffer->content;
04881     input->cur = input->buf->buffer->content;
04882     input->end = &input->buf->buffer->content[input->buf->buffer->use];
04883 
04884     inputPush(ctxt, input);
04885     return(ctxt);
04886 }
04887 
04899 static htmlParserCtxtPtr
04900 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
04901     int len;
04902     htmlParserCtxtPtr ctxt;
04903 
04904     if (cur == NULL)
04905     return(NULL);
04906     len = xmlStrlen(cur);
04907     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
04908     if (ctxt == NULL)
04909     return(NULL);
04910 
04911     if (encoding != NULL) {
04912     xmlCharEncoding enc;
04913     xmlCharEncodingHandlerPtr handler;
04914 
04915     if (ctxt->input->encoding != NULL)
04916         xmlFree((xmlChar *) ctxt->input->encoding);
04917     ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
04918 
04919     enc = xmlParseCharEncoding(encoding);
04920     /*
04921      * registered set of known encodings
04922      */
04923     if (enc != XML_CHAR_ENCODING_ERROR) {
04924         xmlSwitchEncoding(ctxt, enc);
04925         if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
04926         htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
04927                      "Unsupported encoding %s\n",
04928                  (const xmlChar *) encoding, NULL);
04929         }
04930     } else {
04931         /*
04932          * fallback for unknown encodings
04933          */
04934         handler = xmlFindCharEncodingHandler((const char *) encoding);
04935         if (handler != NULL) {
04936         xmlSwitchToEncoding(ctxt, handler);
04937         } else {
04938         htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
04939                      "Unsupported encoding %s\n",
04940                  (const xmlChar *) encoding, NULL);
04941         }
04942     }
04943     }
04944     return(ctxt);
04945 }
04946 
04947 #ifdef LIBXML_PUSH_ENABLED
04948 /************************************************************************
04949  *                                  *
04950  *  Progressive parsing interfaces              *
04951  *                                  *
04952  ************************************************************************/
04953 
04972 static int
04973 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
04974                         xmlChar next, xmlChar third, int iscomment,
04975                         int ignoreattrval)
04976 {
04977     int base, len;
04978     htmlParserInputPtr in;
04979     const xmlChar *buf;
04980     int incomment = 0;
04981     int invalue = 0;
04982     char valdellim = 0x0;
04983 
04984     in = ctxt->input;
04985     if (in == NULL)
04986         return (-1);
04987 
04988     base = in->cur - in->base;
04989     if (base < 0)
04990         return (-1);
04991 
04992     if (ctxt->checkIndex > base)
04993         base = ctxt->checkIndex;
04994 
04995     if (in->buf == NULL) {
04996         buf = in->base;
04997         len = in->length;
04998     } else {
04999         buf = in->buf->buffer->content;
05000         len = in->buf->buffer->use;
05001     }
05002 
05003     /* take into account the sequence length */
05004     if (third)
05005         len -= 2;
05006     else if (next)
05007         len--;
05008     for (; base < len; base++) {
05009         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
05010             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
05011                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
05012                 incomment = 1;
05013                 /* do not increment past <! - some people use <!--> */
05014                 base += 2;
05015             }
05016         }
05017         if (ignoreattrval) {
05018             if (buf[base] == '"' || buf[base] == '\'') {
05019                 if (invalue) {
05020                     if (buf[base] == valdellim) {
05021                         invalue = 0;
05022                         continue;
05023                     }
05024                 } else {
05025                     valdellim = buf[base];
05026                     invalue = 1;
05027                     continue;
05028                 }
05029             } else if (invalue) {
05030                 continue;
05031             }
05032         }
05033         if (incomment) {
05034             if (base + 3 > len)
05035                 return (-1);
05036             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
05037                 (buf[base + 2] == '>')) {
05038                 incomment = 0;
05039                 base += 2;
05040             }
05041             continue;
05042         }
05043         if (buf[base] == first) {
05044             if (third != 0) {
05045                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
05046                     continue;
05047             } else if (next != 0) {
05048                 if (buf[base + 1] != next)
05049                     continue;
05050             }
05051             ctxt->checkIndex = 0;
05052 #ifdef DEBUG_PUSH
05053             if (next == 0)
05054                 xmlGenericError(xmlGenericErrorContext,
05055                                 "HPP: lookup '%c' found at %d\n",
05056                                 first, base);
05057             else if (third == 0)
05058                 xmlGenericError(xmlGenericErrorContext,
05059                                 "HPP: lookup '%c%c' found at %d\n",
05060                                 first, next, base);
05061             else
05062                 xmlGenericError(xmlGenericErrorContext,
05063                                 "HPP: lookup '%c%c%c' found at %d\n",
05064                                 first, next, third, base);
05065 #endif
05066             return (base - (in->cur - in->base));
05067         }
05068     }
05069     if ((!incomment) && (!invalue))
05070         ctxt->checkIndex = base;
05071 #ifdef DEBUG_PUSH
05072     if (next == 0)
05073         xmlGenericError(xmlGenericErrorContext,
05074                         "HPP: lookup '%c' failed\n", first);
05075     else if (third == 0)
05076         xmlGenericError(xmlGenericErrorContext,
05077                         "HPP: lookup '%c%c' failed\n", first, next);
05078     else
05079         xmlGenericError(xmlGenericErrorContext,
05080                         "HPP: lookup '%c%c%c' failed\n", first, next,
05081                         third);
05082 #endif
05083     return (-1);
05084 }
05085 
05101 static int
05102 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
05103                      int stopLen)
05104 {
05105     int base, len;
05106     htmlParserInputPtr in;
05107     const xmlChar *buf;
05108     int incomment = 0;
05109     int i;
05110 
05111     in = ctxt->input;
05112     if (in == NULL)
05113         return (-1);
05114 
05115     base = in->cur - in->base;
05116     if (base < 0)
05117         return (-1);
05118 
05119     if (ctxt->checkIndex > base)
05120         base = ctxt->checkIndex;
05121 
05122     if (in->buf == NULL) {
05123         buf = in->base;
05124         len = in->length;
05125     } else {
05126         buf = in->buf->buffer->content;
05127         len = in->buf->buffer->use;
05128     }
05129 
05130     for (; base < len; base++) {
05131         if (!incomment && (base + 4 < len)) {
05132             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
05133                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
05134                 incomment = 1;
05135                 /* do not increment past <! - some people use <!--> */
05136                 base += 2;
05137             }
05138         }
05139         if (incomment) {
05140             if (base + 3 > len)
05141                 return (-1);
05142             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
05143                 (buf[base + 2] == '>')) {
05144                 incomment = 0;
05145                 base += 2;
05146             }
05147             continue;
05148         }
05149         for (i = 0; i < stopLen; ++i) {
05150             if (buf[base] == stop[i]) {
05151                 ctxt->checkIndex = 0;
05152                 return (base - (in->cur - in->base));
05153             }
05154         }
05155     }
05156     ctxt->checkIndex = base;
05157     return (-1);
05158 }
05159 
05169 static int
05170 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
05171     int ret = 0;
05172     htmlParserInputPtr in;
05173     int avail = 0;
05174     xmlChar cur, next;
05175 
05176 #ifdef DEBUG_PUSH
05177     switch (ctxt->instate) {
05178     case XML_PARSER_EOF:
05179         xmlGenericError(xmlGenericErrorContext,
05180             "HPP: try EOF\n"); break;
05181     case XML_PARSER_START:
05182         xmlGenericError(xmlGenericErrorContext,
05183             "HPP: try START\n"); break;
05184     case XML_PARSER_MISC:
05185         xmlGenericError(xmlGenericErrorContext,
05186             "HPP: try MISC\n");break;
05187     case XML_PARSER_COMMENT:
05188         xmlGenericError(xmlGenericErrorContext,
05189             "HPP: try COMMENT\n");break;
05190     case XML_PARSER_PROLOG:
05191         xmlGenericError(xmlGenericErrorContext,
05192             "HPP: try PROLOG\n");break;
05193     case XML_PARSER_START_TAG:
05194         xmlGenericError(xmlGenericErrorContext,
05195             "HPP: try START_TAG\n");break;
05196     case XML_PARSER_CONTENT:
05197         xmlGenericError(xmlGenericErrorContext,
05198             "HPP: try CONTENT\n");break;
05199     case XML_PARSER_CDATA_SECTION:
05200         xmlGenericError(xmlGenericErrorContext,
05201             "HPP: try CDATA_SECTION\n");break;
05202     case XML_PARSER_END_TAG:
05203         xmlGenericError(xmlGenericErrorContext,
05204             "HPP: try END_TAG\n");break;
05205     case XML_PARSER_ENTITY_DECL:
05206         xmlGenericError(xmlGenericErrorContext,
05207             "HPP: try ENTITY_DECL\n");break;
05208     case XML_PARSER_ENTITY_VALUE:
05209         xmlGenericError(xmlGenericErrorContext,
05210             "HPP: try ENTITY_VALUE\n");break;
05211     case XML_PARSER_ATTRIBUTE_VALUE:
05212         xmlGenericError(xmlGenericErrorContext,
05213             "HPP: try ATTRIBUTE_VALUE\n");break;
05214     case XML_PARSER_DTD:
05215         xmlGenericError(xmlGenericErrorContext,
05216             "HPP: try DTD\n");break;
05217     case XML_PARSER_EPILOG:
05218         xmlGenericError(xmlGenericErrorContext,
05219             "HPP: try EPILOG\n");break;
05220     case XML_PARSER_PI:
05221         xmlGenericError(xmlGenericErrorContext,
05222             "HPP: try PI\n");break;
05223     case XML_PARSER_SYSTEM_LITERAL:
05224         xmlGenericError(xmlGenericErrorContext,
05225             "HPP: try SYSTEM_LITERAL\n");break;
05226     }
05227 #endif
05228 
05229     while (1) {
05230 
05231     in = ctxt->input;
05232     if (in == NULL) break;
05233     if (in->buf == NULL)
05234         avail = in->length - (in->cur - in->base);
05235     else
05236         avail = in->buf->buffer->use - (in->cur - in->base);
05237     if ((avail == 0) && (terminate)) {
05238         htmlAutoCloseOnEnd(ctxt);
05239         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
05240         /*
05241          * SAX: end of the document processing.
05242          */
05243         ctxt->instate = XML_PARSER_EOF;
05244         if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
05245             ctxt->sax->endDocument(ctxt->userData);
05246         }
05247     }
05248         if (avail < 1)
05249         goto done;
05250     cur = in->cur[0];
05251     if (cur == 0) {
05252         SKIP(1);
05253         continue;
05254     }
05255 
05256         switch (ctxt->instate) {
05257             case XML_PARSER_EOF:
05258             /*
05259          * Document parsing is done !
05260          */
05261             goto done;
05262             case XML_PARSER_START:
05263             /*
05264          * Very first chars read from the document flow.
05265          */
05266         cur = in->cur[0];
05267         if (IS_BLANK_CH(cur)) {
05268             SKIP_BLANKS;
05269             if (in->buf == NULL)
05270             avail = in->length - (in->cur - in->base);
05271             else
05272             avail = in->buf->buffer->use - (in->cur - in->base);
05273         }
05274         if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
05275             ctxt->sax->setDocumentLocator(ctxt->userData,
05276                           &xmlDefaultSAXLocator);
05277         if ((ctxt->sax) && (ctxt->sax->startDocument) &&
05278                 (!ctxt->disableSAX))
05279             ctxt->sax->startDocument(ctxt->userData);
05280 
05281         cur = in->cur[0];
05282         next = in->cur[1];
05283         if ((cur == '<') && (next == '!') &&
05284             (UPP(2) == 'D') && (UPP(3) == 'O') &&
05285             (UPP(4) == 'C') && (UPP(5) == 'T') &&
05286             (UPP(6) == 'Y') && (UPP(7) == 'P') &&
05287             (UPP(8) == 'E')) {
05288             if ((!terminate) &&
05289                 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05290             goto done;
05291 #ifdef DEBUG_PUSH
05292             xmlGenericError(xmlGenericErrorContext,
05293                 "HPP: Parsing internal subset\n");
05294 #endif
05295             htmlParseDocTypeDecl(ctxt);
05296             ctxt->instate = XML_PARSER_PROLOG;
05297 #ifdef DEBUG_PUSH
05298             xmlGenericError(xmlGenericErrorContext,
05299                 "HPP: entering PROLOG\n");
05300 #endif
05301                 } else {
05302             ctxt->instate = XML_PARSER_MISC;
05303 #ifdef DEBUG_PUSH
05304             xmlGenericError(xmlGenericErrorContext,
05305                 "HPP: entering MISC\n");
05306 #endif
05307         }
05308         break;
05309             case XML_PARSER_MISC:
05310         SKIP_BLANKS;
05311         if (in->buf == NULL)
05312             avail = in->length - (in->cur - in->base);
05313         else
05314             avail = in->buf->buffer->use - (in->cur - in->base);
05315         if (avail < 2)
05316             goto done;
05317         cur = in->cur[0];
05318         next = in->cur[1];
05319             if ((cur == '<') && (next == '!') &&
05320             (in->cur[2] == '-') && (in->cur[3] == '-')) {
05321             if ((!terminate) &&
05322                 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
05323             goto done;
05324 #ifdef DEBUG_PUSH
05325             xmlGenericError(xmlGenericErrorContext,
05326                 "HPP: Parsing Comment\n");
05327 #endif
05328             htmlParseComment(ctxt);
05329             ctxt->instate = XML_PARSER_MISC;
05330             } else if ((cur == '<') && (next == '?')) {
05331             if ((!terminate) &&
05332                 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05333             goto done;
05334 #ifdef DEBUG_PUSH
05335             xmlGenericError(xmlGenericErrorContext,
05336                 "HPP: Parsing PI\n");
05337 #endif
05338             htmlParsePI(ctxt);
05339             ctxt->instate = XML_PARSER_MISC;
05340         } else if ((cur == '<') && (next == '!') &&
05341             (UPP(2) == 'D') && (UPP(3) == 'O') &&
05342             (UPP(4) == 'C') && (UPP(5) == 'T') &&
05343             (UPP(6) == 'Y') && (UPP(7) == 'P') &&
05344             (UPP(8) == 'E')) {
05345             if ((!terminate) &&
05346                 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05347             goto done;
05348 #ifdef DEBUG_PUSH
05349             xmlGenericError(xmlGenericErrorContext,
05350                 "HPP: Parsing internal subset\n");
05351 #endif
05352             htmlParseDocTypeDecl(ctxt);
05353             ctxt->instate = XML_PARSER_PROLOG;
05354 #ifdef DEBUG_PUSH
05355             xmlGenericError(xmlGenericErrorContext,
05356                 "HPP: entering PROLOG\n");
05357 #endif
05358         } else if ((cur == '<') && (next == '!') &&
05359                    (avail < 9)) {
05360             goto done;
05361         } else {
05362             ctxt->instate = XML_PARSER_START_TAG;
05363 #ifdef DEBUG_PUSH
05364             xmlGenericError(xmlGenericErrorContext,
05365                 "HPP: entering START_TAG\n");
05366 #endif
05367         }
05368         break;
05369             case XML_PARSER_PROLOG:
05370         SKIP_BLANKS;
05371         if (in->buf == NULL)
05372             avail = in->length - (in->cur - in->base);
05373         else
05374             avail = in->buf->buffer->use - (in->cur - in->base);
05375         if (avail < 2)
05376             goto done;
05377         cur = in->cur[0];
05378         next = in->cur[1];
05379         if ((cur == '<') && (next == '!') &&
05380             (in->cur[2] == '-') && (in->cur[3] == '-')) {
05381             if ((!terminate) &&
05382                 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
05383             goto done;
05384 #ifdef DEBUG_PUSH
05385             xmlGenericError(xmlGenericErrorContext,
05386                 "HPP: Parsing Comment\n");
05387 #endif
05388             htmlParseComment(ctxt);
05389             ctxt->instate = XML_PARSER_PROLOG;
05390             } else if ((cur == '<') && (next == '?')) {
05391             if ((!terminate) &&
05392                 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05393             goto done;
05394 #ifdef DEBUG_PUSH
05395             xmlGenericError(xmlGenericErrorContext,
05396                 "HPP: Parsing PI\n");
05397 #endif
05398             htmlParsePI(ctxt);
05399             ctxt->instate = XML_PARSER_PROLOG;
05400         } else if ((cur == '<') && (next == '!') &&
05401                    (avail < 4)) {
05402             goto done;
05403         } else {
05404             ctxt->instate = XML_PARSER_START_TAG;
05405 #ifdef DEBUG_PUSH
05406             xmlGenericError(xmlGenericErrorContext,
05407                 "HPP: entering START_TAG\n");
05408 #endif
05409         }
05410         break;
05411             case XML_PARSER_EPILOG:
05412         if (in->buf == NULL)
05413             avail = in->length - (in->cur - in->base);
05414         else
05415             avail = in->buf->buffer->use - (in->cur - in->base);
05416         if (avail < 1)
05417             goto done;
05418         cur = in->cur[0];
05419         if (IS_BLANK_CH(cur)) {
05420             htmlParseCharData(ctxt);
05421             goto done;
05422         }
05423         if (avail < 2)
05424             goto done;
05425         next = in->cur[1];
05426             if ((cur == '<') && (next == '!') &&
05427             (in->cur[2] == '-') && (in->cur[3] == '-')) {
05428             if ((!terminate) &&
05429                 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
05430             goto done;
05431 #ifdef DEBUG_PUSH
05432             xmlGenericError(xmlGenericErrorContext,
05433                 "HPP: Parsing Comment\n");
05434 #endif
05435             htmlParseComment(ctxt);
05436             ctxt->instate = XML_PARSER_EPILOG;
05437             } else if ((cur == '<') && (next == '?')) {
05438             if ((!terminate) &&
05439                 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05440             goto done;
05441 #ifdef DEBUG_PUSH
05442             xmlGenericError(xmlGenericErrorContext,
05443                 "HPP: Parsing PI\n");
05444 #endif
05445             htmlParsePI(ctxt);
05446             ctxt->instate = XML_PARSER_EPILOG;
05447         } else if ((cur == '<') && (next == '!') &&
05448                    (avail < 4)) {
05449             goto done;
05450         } else {
05451             ctxt->errNo = XML_ERR_DOCUMENT_END;
05452             ctxt->wellFormed = 0;
05453             ctxt->instate = XML_PARSER_EOF;
05454 #ifdef DEBUG_PUSH
05455             xmlGenericError(xmlGenericErrorContext,
05456                 "HPP: entering EOF\n");
05457 #endif
05458             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
05459             ctxt->sax->endDocument(ctxt->userData);
05460             goto done;
05461         }
05462         break;
05463             case XML_PARSER_START_TAG: {
05464             const xmlChar *name;
05465         int failed;
05466         const htmlElemDesc * info;
05467 
05468         if (avail < 2)
05469             goto done;
05470         cur = in->cur[0];
05471             if (cur != '<') {
05472             ctxt->instate = XML_PARSER_CONTENT;
05473 #ifdef DEBUG_PUSH
05474             xmlGenericError(xmlGenericErrorContext,
05475                 "HPP: entering CONTENT\n");
05476 #endif
05477             break;
05478         }
05479         if (in->cur[1] == '/') {
05480             ctxt->instate = XML_PARSER_END_TAG;
05481             ctxt->checkIndex = 0;
05482 #ifdef DEBUG_PUSH
05483             xmlGenericError(xmlGenericErrorContext,
05484                 "HPP: entering END_TAG\n");
05485 #endif
05486             break;
05487         }
05488         if ((!terminate) &&
05489             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05490             goto done;
05491 
05492         failed = htmlParseStartTag(ctxt);
05493         name = ctxt->name;
05494         if ((failed == -1) ||
05495             (name == NULL)) {
05496             if (CUR == '>')
05497             NEXT;
05498             break;
05499         }
05500 
05501         /*
05502          * Lookup the info for that element.
05503          */
05504         info = htmlTagLookup(name);
05505         if (info == NULL) {
05506             htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
05507                          "Tag %s invalid\n", name, NULL);
05508         }
05509 
05510         /*
05511          * Check for an Empty Element labeled the XML/SGML way
05512          */
05513         if ((CUR == '/') && (NXT(1) == '>')) {
05514             SKIP(2);
05515             if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
05516             ctxt->sax->endElement(ctxt->userData, name);
05517             htmlnamePop(ctxt);
05518             ctxt->instate = XML_PARSER_CONTENT;
05519 #ifdef DEBUG_PUSH
05520             xmlGenericError(xmlGenericErrorContext,
05521                 "HPP: entering CONTENT\n");
05522 #endif
05523             break;
05524         }
05525 
05526         if (CUR == '>') {
05527             NEXT;
05528         } else {
05529             htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
05530                          "Couldn't find end of Start Tag %s\n",
05531                  name, NULL);
05532 
05533             /*
05534              * end of parsing of this node.
05535              */
05536             if (xmlStrEqual(name, ctxt->name)) {
05537             nodePop(ctxt);
05538             htmlnamePop(ctxt);
05539             }
05540 
05541             ctxt->instate = XML_PARSER_CONTENT;
05542 #ifdef DEBUG_PUSH
05543             xmlGenericError(xmlGenericErrorContext,
05544                 "HPP: entering CONTENT\n");
05545 #endif
05546             break;
05547         }
05548 
05549         /*
05550          * Check for an Empty Element from DTD definition
05551          */
05552         if ((info != NULL) && (info->empty)) {
05553             if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
05554             ctxt->sax->endElement(ctxt->userData, name);
05555             htmlnamePop(ctxt);
05556         }
05557         ctxt->instate = XML_PARSER_CONTENT;
05558 #ifdef DEBUG_PUSH
05559         xmlGenericError(xmlGenericErrorContext,
05560             "HPP: entering CONTENT\n");
05561 #endif
05562                 break;
05563         }
05564             case XML_PARSER_CONTENT: {
05565         long cons;
05566                 /*
05567          * Handle preparsed entities and charRef
05568          */
05569         if (ctxt->token != 0) {
05570             xmlChar chr[2] = { 0 , 0 } ;
05571 
05572             chr[0] = (xmlChar) ctxt->token;
05573             htmlCheckParagraph(ctxt);
05574             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
05575             ctxt->sax->characters(ctxt->userData, chr, 1);
05576             ctxt->token = 0;
05577             ctxt->checkIndex = 0;
05578         }
05579         if ((avail == 1) && (terminate)) {
05580             cur = in->cur[0];
05581             if ((cur != '<') && (cur != '&')) {
05582             if (ctxt->sax != NULL) {
05583                 if (IS_BLANK_CH(cur)) {
05584                 if (ctxt->sax->ignorableWhitespace != NULL)
05585                     ctxt->sax->ignorableWhitespace(
05586                         ctxt->userData, &cur, 1);
05587                 } else {
05588                 htmlCheckParagraph(ctxt);
05589                 if (ctxt->sax->characters != NULL)
05590                     ctxt->sax->characters(
05591                         ctxt->userData, &cur, 1);
05592                 }
05593             }
05594             ctxt->token = 0;
05595             ctxt->checkIndex = 0;
05596             in->cur++;
05597             break;
05598             }
05599         }
05600         if (avail < 2)
05601             goto done;
05602         cur = in->cur[0];
05603         next = in->cur[1];
05604         cons = ctxt->nbChars;
05605         if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
05606             (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
05607             /*
05608              * Handle SCRIPT/STYLE separately
05609              */
05610             if (!terminate) {
05611                 int idx;
05612             xmlChar val;
05613 
05614             idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
05615             if (idx < 0)
05616                 goto done;
05617                 val = in->cur[idx + 2];
05618             if (val == 0) /* bad cut of input */
05619                 goto done;
05620             }
05621             htmlParseScript(ctxt);
05622             if ((cur == '<') && (next == '/')) {
05623             ctxt->instate = XML_PARSER_END_TAG;
05624             ctxt->checkIndex = 0;
05625 #ifdef DEBUG_PUSH
05626             xmlGenericError(xmlGenericErrorContext,
05627                 "HPP: entering END_TAG\n");
05628 #endif
05629             break;
05630             }
05631         } else {
05632             /*
05633              * Sometimes DOCTYPE arrives in the middle of the document
05634              */
05635             if ((cur == '<') && (next == '!') &&
05636             (UPP(2) == 'D') && (UPP(3) == 'O') &&
05637             (UPP(4) == 'C') && (UPP(5) == 'T') &&
05638             (UPP(6) == 'Y') && (UPP(7) == 'P') &&
05639             (UPP(8) == 'E')) {
05640             if ((!terminate) &&
05641                 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05642                 goto done;
05643             htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
05644                          "Misplaced DOCTYPE declaration\n",
05645                      BAD_CAST "DOCTYPE" , NULL);
05646             htmlParseDocTypeDecl(ctxt);
05647             } else if ((cur == '<') && (next == '!') &&
05648             (in->cur[2] == '-') && (in->cur[3] == '-')) {
05649             if ((!terminate) &&
05650                 (htmlParseLookupSequence(
05651                 ctxt, '-', '-', '>', 1, 1) < 0))
05652                 goto done;
05653 #ifdef DEBUG_PUSH
05654             xmlGenericError(xmlGenericErrorContext,
05655                 "HPP: Parsing Comment\n");
05656 #endif
05657             htmlParseComment(ctxt);
05658             ctxt->instate = XML_PARSER_CONTENT;
05659             } else if ((cur == '<') && (next == '?')) {
05660             if ((!terminate) &&
05661                 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05662                 goto done;
05663 #ifdef DEBUG_PUSH
05664             xmlGenericError(xmlGenericErrorContext,
05665                 "HPP: Parsing PI\n");
05666 #endif
05667             htmlParsePI(ctxt);
05668             ctxt->instate = XML_PARSER_CONTENT;
05669             } else if ((cur == '<') && (next == '!') && (avail < 4)) {
05670             goto done;
05671             } else if ((cur == '<') && (next == '/')) {
05672             ctxt->instate = XML_PARSER_END_TAG;
05673             ctxt->checkIndex = 0;
05674 #ifdef DEBUG_PUSH
05675             xmlGenericError(xmlGenericErrorContext,
05676                 "HPP: entering END_TAG\n");
05677 #endif
05678             break;
05679             } else if (cur == '<') {
05680             ctxt->instate = XML_PARSER_START_TAG;
05681             ctxt->checkIndex = 0;
05682 #ifdef DEBUG_PUSH
05683             xmlGenericError(xmlGenericErrorContext,
05684                 "HPP: entering START_TAG\n");
05685 #endif
05686             break;
05687             } else if (cur == '&') {
05688             if ((!terminate) &&
05689                 (htmlParseLookupChars(ctxt,
05690                                                   BAD_CAST "; >/", 4) < 0))
05691                 goto done;
05692 #ifdef DEBUG_PUSH
05693             xmlGenericError(xmlGenericErrorContext,
05694                 "HPP: Parsing Reference\n");
05695 #endif
05696             /* TODO: check generation of subtrees if noent !!! */
05697             htmlParseReference(ctxt);
05698             } else {
05699                 /*
05700              * check that the text sequence is complete
05701              * before handing out the data to the parser
05702              * to avoid problems with erroneous end of
05703              * data detection.
05704              */
05705             if ((!terminate) &&
05706                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
05707                 goto done;
05708             ctxt->checkIndex = 0;
05709 #ifdef DEBUG_PUSH
05710             xmlGenericError(xmlGenericErrorContext,
05711                 "HPP: Parsing char data\n");
05712 #endif
05713             htmlParseCharData(ctxt);
05714             }
05715         }
05716         if (cons == ctxt->nbChars) {
05717             if (ctxt->node != NULL) {
05718             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05719                          "detected an error in element content\n",
05720                      NULL, NULL);
05721             }
05722             NEXT;
05723             break;
05724         }
05725 
05726         break;
05727         }
05728             case XML_PARSER_END_TAG:
05729         if (avail < 2)
05730             goto done;
05731         if ((!terminate) &&
05732             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
05733             goto done;
05734         htmlParseEndTag(ctxt);
05735         if (ctxt->nameNr == 0) {
05736             ctxt->instate = XML_PARSER_EPILOG;
05737         } else {
05738             ctxt->instate = XML_PARSER_CONTENT;
05739         }
05740         ctxt->checkIndex = 0;
05741 #ifdef DEBUG_PUSH
05742         xmlGenericError(xmlGenericErrorContext,
05743             "HPP: entering CONTENT\n");
05744 #endif
05745             break;
05746             case XML_PARSER_CDATA_SECTION:
05747         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05748             "HPP: internal error, state == CDATA\n",
05749                  NULL, NULL);
05750         ctxt->instate = XML_PARSER_CONTENT;
05751         ctxt->checkIndex = 0;
05752 #ifdef DEBUG_PUSH
05753         xmlGenericError(xmlGenericErrorContext,
05754             "HPP: entering CONTENT\n");
05755 #endif
05756         break;
05757             case XML_PARSER_DTD:
05758         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05759             "HPP: internal error, state == DTD\n",
05760                  NULL, NULL);
05761         ctxt->instate = XML_PARSER_CONTENT;
05762         ctxt->checkIndex = 0;
05763 #ifdef DEBUG_PUSH
05764         xmlGenericError(xmlGenericErrorContext,
05765             "HPP: entering CONTENT\n");
05766 #endif
05767         break;
05768             case XML_PARSER_COMMENT:
05769         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05770             "HPP: internal error, state == COMMENT\n",
05771                  NULL, NULL);
05772         ctxt->instate = XML_PARSER_CONTENT;
05773         ctxt->checkIndex = 0;
05774 #ifdef DEBUG_PUSH
05775         xmlGenericError(xmlGenericErrorContext,
05776             "HPP: entering CONTENT\n");
05777 #endif
05778         break;
05779             case XML_PARSER_PI:
05780         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05781             "HPP: internal error, state == PI\n",
05782                  NULL, NULL);
05783         ctxt->instate = XML_PARSER_CONTENT;
05784         ctxt->checkIndex = 0;
05785 #ifdef DEBUG_PUSH
05786         xmlGenericError(xmlGenericErrorContext,
05787             "HPP: entering CONTENT\n");
05788 #endif
05789         break;
05790             case XML_PARSER_ENTITY_DECL:
05791         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05792             "HPP: internal error, state == ENTITY_DECL\n",
05793                  NULL, NULL);
05794         ctxt->instate = XML_PARSER_CONTENT;
05795         ctxt->checkIndex = 0;
05796 #ifdef DEBUG_PUSH
05797         xmlGenericError(xmlGenericErrorContext,
05798             "HPP: entering CONTENT\n");
05799 #endif
05800         break;
05801             case XML_PARSER_ENTITY_VALUE:
05802         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05803             "HPP: internal error, state == ENTITY_VALUE\n",
05804                  NULL, NULL);
05805         ctxt->instate = XML_PARSER_CONTENT;
05806         ctxt->checkIndex = 0;
05807 #ifdef DEBUG_PUSH
05808         xmlGenericError(xmlGenericErrorContext,
05809             "HPP: entering DTD\n");
05810 #endif
05811         break;
05812             case XML_PARSER_ATTRIBUTE_VALUE:
05813         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05814             "HPP: internal error, state == ATTRIBUTE_VALUE\n",
05815                  NULL, NULL);
05816         ctxt->instate = XML_PARSER_START_TAG;
05817         ctxt->checkIndex = 0;
05818 #ifdef DEBUG_PUSH
05819         xmlGenericError(xmlGenericErrorContext,
05820             "HPP: entering START_TAG\n");
05821 #endif
05822         break;
05823         case XML_PARSER_SYSTEM_LITERAL:
05824         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05825             "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
05826                  NULL, NULL);
05827         ctxt->instate = XML_PARSER_CONTENT;
05828         ctxt->checkIndex = 0;
05829 #ifdef DEBUG_PUSH
05830         xmlGenericError(xmlGenericErrorContext,
05831             "HPP: entering CONTENT\n");
05832 #endif
05833         break;
05834         case XML_PARSER_IGNORE:
05835         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05836             "HPP: internal error, state == XML_PARSER_IGNORE\n",
05837                  NULL, NULL);
05838         ctxt->instate = XML_PARSER_CONTENT;
05839         ctxt->checkIndex = 0;
05840 #ifdef DEBUG_PUSH
05841         xmlGenericError(xmlGenericErrorContext,
05842             "HPP: entering CONTENT\n");
05843 #endif
05844         break;
05845         case XML_PARSER_PUBLIC_LITERAL:
05846         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05847             "HPP: internal error, state == XML_PARSER_LITERAL\n",
05848                  NULL, NULL);
05849         ctxt->instate = XML_PARSER_CONTENT;
05850         ctxt->checkIndex = 0;
05851 #ifdef DEBUG_PUSH
05852         xmlGenericError(xmlGenericErrorContext,
05853             "HPP: entering CONTENT\n");
05854 #endif
05855         break;
05856 
05857     }
05858     }
05859 done:
05860     if ((avail == 0) && (terminate)) {
05861     htmlAutoCloseOnEnd(ctxt);
05862     if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
05863         /*
05864          * SAX: end of the document processing.
05865          */
05866         ctxt->instate = XML_PARSER_EOF;
05867         if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
05868         ctxt->sax->endDocument(ctxt->userData);
05869     }
05870     }
05871     if ((ctxt->myDoc != NULL) &&
05872     ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
05873      (ctxt->instate == XML_PARSER_EPILOG))) {
05874     xmlDtdPtr dtd;
05875     dtd = xmlGetIntSubset(ctxt->myDoc);
05876     if (dtd == NULL)
05877         ctxt->myDoc->intSubset =
05878         xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
05879             BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
05880             BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
05881     }
05882 #ifdef DEBUG_PUSH
05883     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
05884 #endif
05885     return(ret);
05886 }
05887 
05899 int
05900 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
05901               int terminate) {
05902     if ((ctxt == NULL) || (ctxt->input == NULL)) {
05903     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
05904              "htmlParseChunk: context error\n", NULL, NULL);
05905     return(XML_ERR_INTERNAL_ERROR);
05906     }
05907     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
05908         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
05909     int base = ctxt->input->base - ctxt->input->buf->buffer->content;
05910     int cur = ctxt->input->cur - ctxt->input->base;
05911     int res;
05912 
05913     res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
05914     if (res < 0) {
05915         ctxt->errNo = XML_PARSER_EOF;
05916         ctxt->disableSAX = 1;
05917         return (XML_PARSER_EOF);
05918     }
05919     ctxt->input->base = ctxt->input->buf->buffer->content + base;
05920     ctxt->input->cur = ctxt->input->base + cur;
05921     ctxt->input->end =
05922       &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
05923 #ifdef DEBUG_PUSH
05924     xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
05925 #endif
05926 
05927 #if 0
05928     if ((terminate) || (ctxt->input->buf->buffer->use > 80))
05929         htmlParseTryOrFinish(ctxt, terminate);
05930 #endif
05931     } else if (ctxt->instate != XML_PARSER_EOF) {
05932     if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
05933         xmlParserInputBufferPtr in = ctxt->input->buf;
05934         if ((in->encoder != NULL) && (in->buffer != NULL) &&
05935             (in->raw != NULL)) {
05936         int nbchars;
05937 
05938         nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
05939         if (nbchars < 0) {
05940             htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
05941                      "encoder error\n", NULL, NULL);
05942             return(XML_ERR_INVALID_ENCODING);
05943         }
05944         }
05945     }
05946     }
05947     htmlParseTryOrFinish(ctxt, terminate);
05948     if (terminate) {
05949     if ((ctxt->instate != XML_PARSER_EOF) &&
05950         (ctxt->instate != XML_PARSER_EPILOG) &&
05951         (ctxt->instate != XML_PARSER_MISC)) {
05952         ctxt->errNo = XML_ERR_DOCUMENT_END;
05953         ctxt->wellFormed = 0;
05954     }
05955     if (ctxt->instate != XML_PARSER_EOF) {
05956         if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
05957         ctxt->sax->endDocument(ctxt->userData);
05958     }
05959     ctxt->instate = XML_PARSER_EOF;
05960     }
05961     return((xmlParserErrors) ctxt->errNo);
05962 }
05963 
05964 /************************************************************************
05965  *                                  *
05966  *          User entry points               *
05967  *                                  *
05968  ************************************************************************/
05969 
05985 htmlParserCtxtPtr
05986 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
05987                          const char *chunk, int size, const char *filename,
05988              xmlCharEncoding enc) {
05989     htmlParserCtxtPtr ctxt;
05990     htmlParserInputPtr inputStream;
05991     xmlParserInputBufferPtr buf;
05992 
05993     xmlInitParser();
05994 
05995     buf = xmlAllocParserInputBuffer(enc);
05996     if (buf == NULL) return(NULL);
05997 
05998     ctxt = htmlNewParserCtxt();
05999     if (ctxt == NULL) {
06000     xmlFreeParserInputBuffer(buf);
06001     return(NULL);
06002     }
06003     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
06004     ctxt->charset=XML_CHAR_ENCODING_UTF8;
06005     if (sax != NULL) {
06006     if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
06007         xmlFree(ctxt->sax);
06008     ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
06009     if (ctxt->sax == NULL) {
06010         xmlFree(buf);
06011         xmlFree(ctxt);
06012         return(NULL);
06013     }
06014     memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
06015     if (user_data != NULL)
06016         ctxt->userData = user_data;
06017     }
06018     if (filename == NULL) {
06019     ctxt->directory = NULL;
06020     } else {
06021         ctxt->directory = xmlParserGetDirectory(filename);
06022     }
06023 
06024     inputStream = htmlNewInputStream(ctxt);
06025     if (inputStream == NULL) {
06026     xmlFreeParserCtxt(ctxt);
06027     xmlFree(buf);
06028     return(NULL);
06029     }
06030 
06031     if (filename == NULL)
06032     inputStream->filename = NULL;
06033     else
06034     inputStream->filename = (char *)
06035         xmlCanonicPath((const xmlChar *) filename);
06036     inputStream->buf = buf;
06037     inputStream->base = inputStream->buf->buffer->content;
06038     inputStream->cur = inputStream->buf->buffer->content;
06039     inputStream->end =
06040     &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
06041 
06042     inputPush(ctxt, inputStream);
06043 
06044     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
06045         (ctxt->input->buf != NULL))  {
06046     int base = ctxt->input->base - ctxt->input->buf->buffer->content;
06047     int cur = ctxt->input->cur - ctxt->input->base;
06048 
06049     xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
06050 
06051     ctxt->input->base = ctxt->input->buf->buffer->content + base;
06052     ctxt->input->cur = ctxt->input->base + cur;
06053     ctxt->input->end =
06054         &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
06055 #ifdef DEBUG_PUSH
06056     xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
06057 #endif
06058     }
06059     ctxt->progressive = 1;
06060 
06061     return(ctxt);
06062 }
06063 #endif /* LIBXML_PUSH_ENABLED */
06064 
06080 htmlDocPtr
06081 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
06082     htmlDocPtr ret;
06083     htmlParserCtxtPtr ctxt;
06084 
06085     xmlInitParser();
06086 
06087     if (cur == NULL) return(NULL);
06088 
06089 
06090     ctxt = htmlCreateDocParserCtxt(cur, encoding);
06091     if (ctxt == NULL) return(NULL);
06092     if (sax != NULL) {
06093         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
06094         ctxt->sax = sax;
06095         ctxt->userData = userData;
06096     }
06097 
06098     htmlParseDocument(ctxt);
06099     ret = ctxt->myDoc;
06100     if (sax != NULL) {
06101     ctxt->sax = NULL;
06102     ctxt->userData = NULL;
06103     }
06104     htmlFreeParserCtxt(ctxt);
06105 
06106     return(ret);
06107 }
06108 
06119 htmlDocPtr
06120 htmlParseDoc(xmlChar *cur, const char *encoding) {
06121     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
06122 }
06123 
06124 
06136 htmlParserCtxtPtr
06137 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
06138 {
06139     htmlParserCtxtPtr ctxt;
06140     htmlParserInputPtr inputStream;
06141     char *canonicFilename;
06142     /* htmlCharEncoding enc; */
06143     xmlChar *content, *content_line = (xmlChar *) "charset=";
06144 
06145     if (filename == NULL)
06146         return(NULL);
06147 
06148     ctxt = htmlNewParserCtxt();
06149     if (ctxt == NULL) {
06150     return(NULL);
06151     }
06152     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
06153     if (canonicFilename == NULL) {
06154 #ifdef LIBXML_SAX1_ENABLED
06155     if (xmlDefaultSAXHandler.error != NULL) {
06156         xmlDefaultSAXHandler.error(NULL, "out of memory\n");
06157     }
06158 #endif
06159     xmlFreeParserCtxt(ctxt);
06160     return(NULL);
06161     }
06162 
06163     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
06164     xmlFree(canonicFilename);
06165     if (inputStream == NULL) {
06166     xmlFreeParserCtxt(ctxt);
06167     return(NULL);
06168     }
06169 
06170     inputPush(ctxt, inputStream);
06171 
06172     /* set encoding */
06173     if (encoding) {
06174         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
06175     if (content) {
06176         strcpy ((char *)content, (char *)content_line);
06177             strcat ((char *)content, (char *)encoding);
06178             htmlCheckEncoding (ctxt, content);
06179         xmlFree (content);
06180     }
06181     }
06182 
06183     return(ctxt);
06184 }
06185 
06202 htmlDocPtr
06203 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
06204                  void *userData) {
06205     htmlDocPtr ret;
06206     htmlParserCtxtPtr ctxt;
06207     htmlSAXHandlerPtr oldsax = NULL;
06208 
06209     xmlInitParser();
06210 
06211     ctxt = htmlCreateFileParserCtxt(filename, encoding);
06212     if (ctxt == NULL) return(NULL);
06213     if (sax != NULL) {
06214     oldsax = ctxt->sax;
06215         ctxt->sax = sax;
06216         ctxt->userData = userData;
06217     }
06218 
06219     htmlParseDocument(ctxt);
06220 
06221     ret = ctxt->myDoc;
06222     if (sax != NULL) {
06223         ctxt->sax = oldsax;
06224         ctxt->userData = NULL;
06225     }
06226     htmlFreeParserCtxt(ctxt);
06227 
06228     return(ret);
06229 }
06230 
06242 htmlDocPtr
06243 htmlParseFile(const char *filename, const char *encoding) {
06244     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
06245 }
06246 
06256 int
06257 htmlHandleOmittedElem(int val) {
06258     int old = htmlOmittedDefaultValue;
06259 
06260     htmlOmittedDefaultValue = val;
06261     return(old);
06262 }
06263 
06274 int
06275 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
06276   const char** p ;
06277 
06278   if ( ! elt || ! parent || ! parent->subelts )
06279     return 0 ;
06280 
06281   for ( p = parent->subelts; *p; ++p )
06282     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
06283       return 1 ;
06284 
06285   return 0 ;
06286 }
06297 htmlStatus
06298 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
06299   if ( ! parent || ! elt )
06300     return HTML_INVALID ;
06301   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
06302     return HTML_INVALID ;
06303 
06304   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
06305 }
06317 htmlStatus
06318 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
06319   const char** p ;
06320 
06321   if ( !elt || ! attr )
06322     return HTML_INVALID ;
06323 
06324   if ( elt->attrs_req )
06325     for ( p = elt->attrs_req; *p; ++p)
06326       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
06327         return HTML_REQUIRED ;
06328 
06329   if ( elt->attrs_opt )
06330     for ( p = elt->attrs_opt; *p; ++p)
06331       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
06332         return HTML_VALID ;
06333 
06334   if ( legacy && elt->attrs_depr )
06335     for ( p = elt->attrs_depr; *p; ++p)
06336       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
06337         return HTML_DEPRECATED ;
06338 
06339   return HTML_INVALID ;
06340 }
06355 htmlStatus
06356 htmlNodeStatus(const htmlNodePtr node, int legacy) {
06357   if ( ! node )
06358     return HTML_INVALID ;
06359 
06360   switch ( node->type ) {
06361     case XML_ELEMENT_NODE:
06362       return legacy
06363     ? ( htmlElementAllowedHere (
06364         htmlTagLookup(node->parent->name) , node->name
06365         ) ? HTML_VALID : HTML_INVALID )
06366     : htmlElementStatusHere(
06367         htmlTagLookup(node->parent->name) ,
06368         htmlTagLookup(node->name) )
06369     ;
06370     case XML_ATTRIBUTE_NODE:
06371       return htmlAttrAllowed(
06372     htmlTagLookup(node->parent->name) , node->name, legacy) ;
06373     default: return HTML_NA ;
06374   }
06375 }
06376 /************************************************************************
06377  *                                  *
06378  *  New set (2.6.0) of simpler and more flexible APIs       *
06379  *                                  *
06380  ************************************************************************/
/*
 * DICT_FREE:
 * @str:  a string
 *
 * Free @str with xmlFree() unless it is NULL or owned by the dictionary
 * held in the local variable `dict`, which must be in scope at the point
 * of use. Wrapped in do { } while (0) so the macro behaves as a single
 * statement, including inside if/else.
 */
#define DICT_FREE(str)                                          \
    do {                                                        \
        if ((str) && ((!dict) ||                                \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
            xmlFree((char *)(str));                             \
    } while (0)
06392 
06399 void
06400 htmlCtxtReset(htmlParserCtxtPtr ctxt)
06401 {
06402     xmlParserInputPtr input;
06403     xmlDictPtr dict;
06404 
06405     if (ctxt == NULL)
06406         return;
06407 
06408     xmlInitParser();
06409     dict = ctxt->dict;
06410 
06411     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
06412         xmlFreeInputStream(input);
06413     }
06414     ctxt->inputNr = 0;
06415     ctxt->input = NULL;
06416 
06417     ctxt->spaceNr = 0;
06418     if (ctxt->spaceTab != NULL) {
06419     ctxt->spaceTab[0] = -1;
06420     ctxt->space = &ctxt->spaceTab[0];
06421     } else {
06422     ctxt->space = NULL;
06423     }
06424 
06425 
06426     ctxt->nodeNr = 0;
06427     ctxt->node = NULL;
06428 
06429     ctxt->nameNr = 0;
06430     ctxt->name = NULL;
06431 
06432     DICT_FREE(ctxt->version);
06433     ctxt->version = NULL;
06434     DICT_FREE(ctxt->encoding);
06435     ctxt->encoding = NULL;
06436     DICT_FREE(ctxt->directory);
06437     ctxt->directory = NULL;
06438     DICT_FREE(ctxt->extSubURI);
06439     ctxt->extSubURI = NULL;
06440     DICT_FREE(ctxt->extSubSystem);
06441     ctxt->extSubSystem = NULL;
06442     if (ctxt->myDoc != NULL)
06443         xmlFreeDoc(ctxt->myDoc);
06444     ctxt->myDoc = NULL;
06445 
06446     ctxt->standalone = -1;
06447     ctxt->hasExternalSubset = 0;
06448     ctxt->hasPErefs = 0;
06449     ctxt->html = 1;
06450     ctxt->external = 0;
06451     ctxt->instate = XML_PARSER_START;
06452     ctxt->token = 0;
06453 
06454     ctxt->wellFormed = 1;
06455     ctxt->nsWellFormed = 1;
06456     ctxt->disableSAX = 0;
06457     ctxt->valid = 1;
06458     ctxt->vctxt.userData = ctxt;
06459     ctxt->vctxt.error = xmlParserValidityError;
06460     ctxt->vctxt.warning = xmlParserValidityWarning;
06461     ctxt->record_info = 0;
06462     ctxt->nbChars = 0;
06463     ctxt->checkIndex = 0;
06464     ctxt->inSubset = 0;
06465     ctxt->errNo = XML_ERR_OK;
06466     ctxt->depth = 0;
06467     ctxt->charset = XML_CHAR_ENCODING_NONE;
06468     ctxt->catalogs = NULL;
06469     xmlInitNodeInfoSeq(&ctxt->node_seq);
06470 
06471     if (ctxt->attsDefault != NULL) {
06472         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
06473         ctxt->attsDefault = NULL;
06474     }
06475     if (ctxt->attsSpecial != NULL) {
06476         xmlHashFree(ctxt->attsSpecial, NULL);
06477         ctxt->attsSpecial = NULL;
06478     }
06479 }
06480 
06491 int
06492 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
06493 {
06494     if (ctxt == NULL)
06495         return(-1);
06496 
06497     if (options & HTML_PARSE_NOWARNING) {
06498         ctxt->sax->warning = NULL;
06499         ctxt->vctxt.warning = NULL;
06500         options -= XML_PARSE_NOWARNING;
06501     ctxt->options |= XML_PARSE_NOWARNING;
06502     }
06503     if (options & HTML_PARSE_NOERROR) {
06504         ctxt->sax->error = NULL;
06505         ctxt->vctxt.error = NULL;
06506         ctxt->sax->fatalError = NULL;
06507         options -= XML_PARSE_NOERROR;
06508     ctxt->options |= XML_PARSE_NOERROR;
06509     }
06510     if (options & HTML_PARSE_PEDANTIC) {
06511         ctxt->pedantic = 1;
06512         options -= XML_PARSE_PEDANTIC;
06513     ctxt->options |= XML_PARSE_PEDANTIC;
06514     } else
06515         ctxt->pedantic = 0;
06516     if (options & XML_PARSE_NOBLANKS) {
06517         ctxt->keepBlanks = 0;
06518         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
06519         options -= XML_PARSE_NOBLANKS;
06520     ctxt->options |= XML_PARSE_NOBLANKS;
06521     } else
06522         ctxt->keepBlanks = 1;
06523     if (options & HTML_PARSE_RECOVER) {
06524         ctxt->recovery = 1;
06525     options -= HTML_PARSE_RECOVER;
06526     } else
06527         ctxt->recovery = 0;
06528     if (options & HTML_PARSE_COMPACT) {
06529     ctxt->options |= HTML_PARSE_COMPACT;
06530         options -= HTML_PARSE_COMPACT;
06531     }
06532     if (options & XML_PARSE_HUGE) {
06533     ctxt->options |= XML_PARSE_HUGE;
06534         options -= XML_PARSE_HUGE;
06535     }
06536     if (options & HTML_PARSE_NODEFDTD) {
06537     ctxt->options |= HTML_PARSE_NODEFDTD;
06538         options -= HTML_PARSE_NODEFDTD;
06539     }
06540     ctxt->dictNames = 0;
06541     return (options);
06542 }
06543 
06556 static htmlDocPtr
06557 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
06558           int options, int reuse)
06559 {
06560     htmlDocPtr ret;
06561 
06562     htmlCtxtUseOptions(ctxt, options);
06563     ctxt->html = 1;
06564     if (encoding != NULL) {
06565         xmlCharEncodingHandlerPtr hdlr;
06566 
06567     hdlr = xmlFindCharEncodingHandler(encoding);
06568     if (hdlr != NULL) {
06569         xmlSwitchToEncoding(ctxt, hdlr);
06570         if (ctxt->input->encoding != NULL)
06571           xmlFree((xmlChar *) ctxt->input->encoding);
06572             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
06573         }
06574     }
06575     if ((URL != NULL) && (ctxt->input != NULL) &&
06576         (ctxt->input->filename == NULL))
06577         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
06578     htmlParseDocument(ctxt);
06579     ret = ctxt->myDoc;
06580     ctxt->myDoc = NULL;
06581     if (!reuse) {
06582         if ((ctxt->dictNames) &&
06583         (ret != NULL) &&
06584         (ret->dict == ctxt->dict))
06585         ctxt->dict = NULL;
06586     xmlFreeParserCtxt(ctxt);
06587     }
06588     return (ret);
06589 }
06590 
06602 htmlDocPtr
06603 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
06604 {
06605     htmlParserCtxtPtr ctxt;
06606 
06607     if (cur == NULL)
06608         return (NULL);
06609 
06610     xmlInitParser();
06611     ctxt = htmlCreateDocParserCtxt(cur, NULL);
06612     if (ctxt == NULL)
06613         return (NULL);
06614     return (htmlDoRead(ctxt, URL, encoding, options, 0));
06615 }
06616 
06627 htmlDocPtr
06628 htmlReadFile(const char *filename, const char *encoding, int options)
06629 {
06630     htmlParserCtxtPtr ctxt;
06631 
06632     xmlInitParser();
06633     ctxt = htmlCreateFileParserCtxt(filename, encoding);
06634     if (ctxt == NULL)
06635         return (NULL);
06636     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
06637 }
06638 
06651 htmlDocPtr
06652 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
06653 {
06654     htmlParserCtxtPtr ctxt;
06655 
06656     xmlInitParser();
06657     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
06658     if (ctxt == NULL)
06659         return (NULL);
06660     htmlDefaultSAXHandlerInit();
06661     if (ctxt->sax != NULL)
06662         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
06663     return (htmlDoRead(ctxt, URL, encoding, options, 0));
06664 }
06665 
06677 htmlDocPtr
06678 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
06679 {
06680     htmlParserCtxtPtr ctxt;
06681     xmlParserInputBufferPtr input;
06682     xmlParserInputPtr stream;
06683 
06684     if (fd < 0)
06685         return (NULL);
06686 
06687     xmlInitParser();
06688     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
06689     if (input == NULL)
06690         return (NULL);
06691     ctxt = xmlNewParserCtxt();
06692     if (ctxt == NULL) {
06693         xmlFreeParserInputBuffer(input);
06694         return (NULL);
06695     }
06696     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
06697     if (stream == NULL) {
06698         xmlFreeParserInputBuffer(input);
06699     xmlFreeParserCtxt(ctxt);
06700         return (NULL);
06701     }
06702     inputPush(ctxt, stream);
06703     return (htmlDoRead(ctxt, URL, encoding, options, 0));
06704 }
06705 
06719 htmlDocPtr
06720 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
06721           void *ioctx, const char *URL, const char *encoding, int options)
06722 {
06723     htmlParserCtxtPtr ctxt;
06724     xmlParserInputBufferPtr input;
06725     xmlParserInputPtr stream;
06726 
06727     if (ioread == NULL)
06728         return (NULL);
06729     xmlInitParser();
06730 
06731     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
06732                                          XML_CHAR_ENCODING_NONE);
06733     if (input == NULL)
06734         return (NULL);
06735     ctxt = htmlNewParserCtxt();
06736     if (ctxt == NULL) {
06737         xmlFreeParserInputBuffer(input);
06738         return (NULL);
06739     }
06740     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
06741     if (stream == NULL) {
06742         xmlFreeParserInputBuffer(input);
06743     xmlFreeParserCtxt(ctxt);
06744         return (NULL);
06745     }
06746     inputPush(ctxt, stream);
06747     return (htmlDoRead(ctxt, URL, encoding, options, 0));
06748 }
06749 
06763 htmlDocPtr
06764 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
06765                const char *URL, const char *encoding, int options)
06766 {
06767     xmlParserInputPtr stream;
06768 
06769     if (cur == NULL)
06770         return (NULL);
06771     if (ctxt == NULL)
06772         return (NULL);
06773 
06774     htmlCtxtReset(ctxt);
06775 
06776     stream = xmlNewStringInputStream(ctxt, cur);
06777     if (stream == NULL) {
06778         return (NULL);
06779     }
06780     inputPush(ctxt, stream);
06781     return (htmlDoRead(ctxt, URL, encoding, options, 1));
06782 }
06783 
06796 htmlDocPtr
06797 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
06798                 const char *encoding, int options)
06799 {
06800     xmlParserInputPtr stream;
06801 
06802     if (filename == NULL)
06803         return (NULL);
06804     if (ctxt == NULL)
06805         return (NULL);
06806 
06807     htmlCtxtReset(ctxt);
06808 
06809     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
06810     if (stream == NULL) {
06811         return (NULL);
06812     }
06813     inputPush(ctxt, stream);
06814     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
06815 }
06816 
06831 htmlDocPtr
06832 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
06833                   const char *URL, const char *encoding, int options)
06834 {
06835     xmlParserInputBufferPtr input;
06836     xmlParserInputPtr stream;
06837 
06838     if (ctxt == NULL)
06839         return (NULL);
06840     if (buffer == NULL)
06841         return (NULL);
06842 
06843     htmlCtxtReset(ctxt);
06844 
06845     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
06846     if (input == NULL) {
06847     return(NULL);
06848     }
06849 
06850     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
06851     if (stream == NULL) {
06852     xmlFreeParserInputBuffer(input);
06853     return(NULL);
06854     }
06855 
06856     inputPush(ctxt, stream);
06857     return (htmlDoRead(ctxt, URL, encoding, options, 1));
06858 }
06859 
06873 htmlDocPtr
06874 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
06875               const char *URL, const char *encoding, int options)
06876 {
06877     xmlParserInputBufferPtr input;
06878     xmlParserInputPtr stream;
06879 
06880     if (fd < 0)
06881         return (NULL);
06882     if (ctxt == NULL)
06883         return (NULL);
06884 
06885     htmlCtxtReset(ctxt);
06886 
06887 
06888     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
06889     if (input == NULL)
06890         return (NULL);
06891     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
06892     if (stream == NULL) {
06893         xmlFreeParserInputBuffer(input);
06894         return (NULL);
06895     }
06896     inputPush(ctxt, stream);
06897     return (htmlDoRead(ctxt, URL, encoding, options, 1));
06898 }
06899 
06915 htmlDocPtr
06916 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
06917               xmlInputCloseCallback ioclose, void *ioctx,
06918           const char *URL,
06919               const char *encoding, int options)
06920 {
06921     xmlParserInputBufferPtr input;
06922     xmlParserInputPtr stream;
06923 
06924     if (ioread == NULL)
06925         return (NULL);
06926     if (ctxt == NULL)
06927         return (NULL);
06928 
06929     htmlCtxtReset(ctxt);
06930 
06931     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
06932                                          XML_CHAR_ENCODING_NONE);
06933     if (input == NULL)
06934         return (NULL);
06935     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
06936     if (stream == NULL) {
06937         xmlFreeParserInputBuffer(input);
06938         return (NULL);
06939     }
06940     inputPush(ctxt, stream);
06941     return (htmlDoRead(ctxt, URL, encoding, options, 1));
06942 }
06943 
06944 #define bottom_HTMLparser
06945 #include "elfgcchack.h"
06946 #endif /* LIBXML_HTML_ENABLED */

Generated on Sat May 26 2012 04:33:18 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.