Home | Info | Community | Development | myReactOS | Contact Us
ReactOS Development > Doxygen > HTMLparser.c
Go to the documentation of this file.
00001 /* 00002 * HTMLparser.c : an HTML 4.0 non-verifying parser 00003 * 00004 * See Copyright for the status of this software. 00005 * 00006 * daniel@veillard.com 00007 */ 00008 00009 #define IN_LIBXML 00010 #include "libxml.h" 00011 #ifdef LIBXML_HTML_ENABLED 00012 00013 #include <string.h> 00014 #ifdef HAVE_CTYPE_H 00015 #include <ctype.h> 00016 #endif 00017 #ifdef HAVE_STDLIB_H 00018 #include <stdlib.h> 00019 #endif 00020 #ifdef HAVE_SYS_STAT_H 00021 #include <sys/stat.h> 00022 #endif 00023 #ifdef HAVE_FCNTL_H 00024 #include <fcntl.h> 00025 #endif 00026 #ifdef HAVE_UNISTD_H 00027 #include <unistd.h> 00028 #endif 00029 #ifdef HAVE_ZLIB_H 00030 #include <zlib.h> 00031 #endif 00032 00033 #include <libxml/xmlmemory.h> 00034 #include <libxml/tree.h> 00035 #include <libxml/parser.h> 00036 #include <libxml/parserInternals.h> 00037 #include <libxml/xmlerror.h> 00038 #include <libxml/HTMLparser.h> 00039 #include <libxml/HTMLtree.h> 00040 #include <libxml/entities.h> 00041 #include <libxml/encoding.h> 00042 #include <libxml/valid.h> 00043 #include <libxml/xmlIO.h> 00044 #include <libxml/globals.h> 00045 #include <libxml/uri.h> 00046 00047 #define HTML_MAX_NAMELEN 1000 00048 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 00049 #define HTML_PARSER_BUFFER_SIZE 100 00050 00051 /* #define DEBUG */ 00052 /* #define DEBUG_PUSH */ 00053 00054 static int htmlOmittedDefaultValue = 1; 00055 00056 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 00057 xmlChar end, xmlChar end2, xmlChar end3); 00058 static void htmlParseComment(htmlParserCtxtPtr ctxt); 00059 00060 /************************************************************************ 00061 * * 00062 * Some factorized error routines * 00063 * * 00064 ************************************************************************/ 00065 00073 static void 00074 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 00075 { 00076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 00077 (ctxt->instate == XML_PARSER_EOF)) 00078 return; 
00079 if (ctxt != NULL) { 00080 ctxt->errNo = XML_ERR_NO_MEMORY; 00081 ctxt->instate = XML_PARSER_EOF; 00082 ctxt->disableSAX = 1; 00083 } 00084 if (extra) 00085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 00086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 00087 NULL, NULL, 0, 0, 00088 "Memory allocation failed : %s\n", extra); 00089 else 00090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 00091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 00092 NULL, NULL, 0, 0, "Memory allocation failed\n"); 00093 } 00094 00105 static void 00106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 00107 const char *msg, const xmlChar *str1, const xmlChar *str2) 00108 { 00109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 00110 (ctxt->instate == XML_PARSER_EOF)) 00111 return; 00112 if (ctxt != NULL) 00113 ctxt->errNo = error; 00114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 00115 XML_ERR_ERROR, NULL, 0, 00116 (const char *) str1, (const char *) str2, 00117 NULL, 0, 0, 00118 msg, str1, str2); 00119 if (ctxt != NULL) 00120 ctxt->wellFormed = 0; 00121 } 00122 00132 static void 00133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 00134 const char *msg, int val) 00135 { 00136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 00137 (ctxt->instate == XML_PARSER_EOF)) 00138 return; 00139 if (ctxt != NULL) 00140 ctxt->errNo = error; 00141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 00142 XML_ERR_ERROR, NULL, 0, NULL, NULL, 00143 NULL, val, 0, msg, val); 00144 if (ctxt != NULL) 00145 ctxt->wellFormed = 0; 00146 } 00147 00148 /************************************************************************ 00149 * * 00150 * Parser stacks related functions and macros * 00151 * * 00152 ************************************************************************/ 00153 00163 static int 00164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 00165 { 00166 if ((ctxt->html < 3) && 
(xmlStrEqual(value, BAD_CAST "head"))) 00167 ctxt->html = 3; 00168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 00169 ctxt->html = 10; 00170 if (ctxt->nameNr >= ctxt->nameMax) { 00171 ctxt->nameMax *= 2; 00172 ctxt->nameTab = (const xmlChar * *) 00173 xmlRealloc((xmlChar * *)ctxt->nameTab, 00174 ctxt->nameMax * 00175 sizeof(ctxt->nameTab[0])); 00176 if (ctxt->nameTab == NULL) { 00177 htmlErrMemory(ctxt, NULL); 00178 return (0); 00179 } 00180 } 00181 ctxt->nameTab[ctxt->nameNr] = value; 00182 ctxt->name = value; 00183 return (ctxt->nameNr++); 00184 } 00193 static const xmlChar * 00194 htmlnamePop(htmlParserCtxtPtr ctxt) 00195 { 00196 const xmlChar *ret; 00197 00198 if (ctxt->nameNr <= 0) 00199 return (NULL); 00200 ctxt->nameNr--; 00201 if (ctxt->nameNr < 0) 00202 return (NULL); 00203 if (ctxt->nameNr > 0) 00204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 00205 else 00206 ctxt->name = NULL; 00207 ret = ctxt->nameTab[ctxt->nameNr]; 00208 ctxt->nameTab[ctxt->nameNr] = NULL; 00209 return (ret); 00210 } 00211 00221 static int 00222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 00223 { 00224 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 00225 if (ctxt->nodeInfoMax == 0) 00226 ctxt->nodeInfoMax = 5; 00227 ctxt->nodeInfoMax *= 2; 00228 ctxt->nodeInfoTab = (htmlParserNodeInfo *) 00229 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 00230 ctxt->nodeInfoMax * 00231 sizeof(ctxt->nodeInfoTab[0])); 00232 if (ctxt->nodeInfoTab == NULL) { 00233 htmlErrMemory(ctxt, NULL); 00234 return (0); 00235 } 00236 } 00237 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 00238 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 00239 return (ctxt->nodeInfoNr++); 00240 } 00241 00250 static htmlParserNodeInfo * 00251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 00252 { 00253 if (ctxt->nodeInfoNr <= 0) 00254 return (NULL); 00255 ctxt->nodeInfoNr--; 00256 if (ctxt->nodeInfoNr < 0) 00257 return (NULL); 00258 if (ctxt->nodeInfoNr > 0) 00259 ctxt->nodeInfo = 
&ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 00260 else 00261 ctxt->nodeInfo = NULL; 00262 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 00263 } 00264 00265 /* 00266 * Macros for accessing the content. Those should be used only by the parser, 00267 * and not exported. 00268 * 00269 * Dirty macros, i.e. one need to make assumption on the context to use them 00270 * 00271 * CUR_PTR return the current pointer to the xmlChar to be parsed. 00272 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled 00273 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled 00274 * in UNICODE mode. This should be used internally by the parser 00275 * only to compare to ASCII values otherwise it would break when 00276 * running with UTF-8 encoding. 00277 * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only 00278 * to compare on ASCII based substring. 00279 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR 00280 * it should be used only to compare on ASCII based substring. 00281 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined 00282 * strings without newlines within the parser. 00283 * 00284 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding 00285 * 00286 * CURRENT Returns the current char value, with the full decoding of 00287 * UTF-8 if we are using this mode. It returns an int. 00288 * NEXT Skip to the next character, this does the proper decoding 00289 * in UTF-8 mode. It also pop-up unfinished entities on the fly. 00290 * NEXTL(l) Skip the current unicode character of l xmlChars long. 
00291 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 00292 */ 00293 00294 #define UPPER (toupper(*ctxt->input->cur)) 00295 00296 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 00297 00298 #define NXT(val) ctxt->input->cur[(val)] 00299 00300 #define UPP(val) (toupper(ctxt->input->cur[(val)])) 00301 00302 #define CUR_PTR ctxt->input->cur 00303 00304 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 00305 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 00306 xmlParserInputShrink(ctxt->input) 00307 00308 #define GROW if ((ctxt->progressive == 0) && \ 00309 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 00310 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 00311 00312 #define CURRENT ((int) (*ctxt->input->cur)) 00313 00314 #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 00315 00316 /* Inported from XML */ 00317 00318 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 00319 #define CUR ((int) (*ctxt->input->cur)) 00320 #define NEXT xmlNextChar(ctxt) 00321 00322 #define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 00323 00324 00325 #define NEXTL(l) do { \ 00326 if (*(ctxt->input->cur) == '\n') { \ 00327 ctxt->input->line++; ctxt->input->col = 1; \ 00328 } else ctxt->input->col++; \ 00329 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 00330 } while (0) 00331 00332 /************ 00333 \ 00334 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 00335 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 00336 ************/ 00337 00338 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 00339 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 00340 00341 #define COPY_BUF(l,b,i,v) \ 00342 if (l == 1) b[i++] = (xmlChar) v; \ 00343 else i += xmlCopyChar(l,&b[i],v) 00344 00359 static xmlChar * 00360 htmlFindEncoding(xmlParserCtxtPtr ctxt) { 00361 const xmlChar *start, *cur, *end; 00362 00363 if ((ctxt == NULL) || (ctxt->input == NULL) || 00364 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 00365 (ctxt->input->buf->encoder != NULL)) 00366 return(NULL); 00367 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 00368 return(NULL); 00369 00370 start = ctxt->input->cur; 00371 end = ctxt->input->end; 00372 /* we also expect the input buffer to be zero terminated */ 00373 if (*end != 0) 00374 return(NULL); 00375 00376 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 00377 if (cur == NULL) 00378 return(NULL); 00379 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 00380 if (cur == NULL) 00381 return(NULL); 00382 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 00383 if (cur == NULL) 00384 return(NULL); 00385 cur += 8; 00386 start = cur; 00387 while (((*cur >= 'A') && (*cur <= 'Z')) || 00388 ((*cur >= 'a') && (*cur <= 'z')) || 00389 ((*cur >= '0') && (*cur <= '9')) || 00390 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 00391 cur++; 00392 if (cur == start) 00393 return(NULL); 00394 return(xmlStrndup(start, cur - start)); 00395 } 00396 00411 static int 00412 htmlCurrentChar(xmlParserCtxtPtr 
ctxt, int *len) { 00413 if (ctxt->instate == XML_PARSER_EOF) 00414 return(0); 00415 00416 if (ctxt->token != 0) { 00417 *len = 0; 00418 return(ctxt->token); 00419 } 00420 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 00421 /* 00422 * We are supposed to handle UTF8, check it's valid 00423 * From rfc2044: encoding of the Unicode values on UTF-8: 00424 * 00425 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 00426 * 0000 0000-0000 007F 0xxxxxxx 00427 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 00428 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 00429 * 00430 * Check for the 0x110000 limit too 00431 */ 00432 const unsigned char *cur = ctxt->input->cur; 00433 unsigned char c; 00434 unsigned int val; 00435 00436 c = *cur; 00437 if (c & 0x80) { 00438 if (cur[1] == 0) { 00439 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 00440 cur = ctxt->input->cur; 00441 } 00442 if ((cur[1] & 0xc0) != 0x80) 00443 goto encoding_error; 00444 if ((c & 0xe0) == 0xe0) { 00445 00446 if (cur[2] == 0) { 00447 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 00448 cur = ctxt->input->cur; 00449 } 00450 if ((cur[2] & 0xc0) != 0x80) 00451 goto encoding_error; 00452 if ((c & 0xf0) == 0xf0) { 00453 if (cur[3] == 0) { 00454 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 00455 cur = ctxt->input->cur; 00456 } 00457 if (((c & 0xf8) != 0xf0) || 00458 ((cur[3] & 0xc0) != 0x80)) 00459 goto encoding_error; 00460 /* 4-byte code */ 00461 *len = 4; 00462 val = (cur[0] & 0x7) << 18; 00463 val |= (cur[1] & 0x3f) << 12; 00464 val |= (cur[2] & 0x3f) << 6; 00465 val |= cur[3] & 0x3f; 00466 } else { 00467 /* 3-byte code */ 00468 *len = 3; 00469 val = (cur[0] & 0xf) << 12; 00470 val |= (cur[1] & 0x3f) << 6; 00471 val |= cur[2] & 0x3f; 00472 } 00473 } else { 00474 /* 2-byte code */ 00475 *len = 2; 00476 val = (cur[0] & 0x1f) << 6; 00477 val |= cur[1] & 0x3f; 00478 } 00479 if (!IS_CHAR(val)) { 00480 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 00481 "Char 0x%X out of allowed range\n", val); 00482 } 00483 return(val); 00484 
} else { 00485 if ((*ctxt->input->cur == 0) && 00486 (ctxt->input->cur < ctxt->input->end)) { 00487 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 00488 "Char 0x%X out of allowed range\n", 0); 00489 *len = 1; 00490 return(' '); 00491 } 00492 /* 1-byte code */ 00493 *len = 1; 00494 return((int) *ctxt->input->cur); 00495 } 00496 } 00497 /* 00498 * Assume it's a fixed length encoding (1) with 00499 * a compatible encoding for the ASCII set, since 00500 * XML constructs only use < 128 chars 00501 */ 00502 *len = 1; 00503 if ((int) *ctxt->input->cur < 0x80) 00504 return((int) *ctxt->input->cur); 00505 00506 /* 00507 * Humm this is bad, do an automatic flow conversion 00508 */ 00509 { 00510 xmlChar * guess; 00511 xmlCharEncodingHandlerPtr handler; 00512 00513 guess = htmlFindEncoding(ctxt); 00514 if (guess == NULL) { 00515 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 00516 } else { 00517 if (ctxt->input->encoding != NULL) 00518 xmlFree((xmlChar *) ctxt->input->encoding); 00519 ctxt->input->encoding = guess; 00520 handler = xmlFindCharEncodingHandler((const char *) guess); 00521 if (handler != NULL) { 00522 xmlSwitchToEncoding(ctxt, handler); 00523 } else { 00524 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 00525 "Unsupported encoding %s", guess, NULL); 00526 } 00527 } 00528 ctxt->charset = XML_CHAR_ENCODING_UTF8; 00529 } 00530 00531 return(xmlCurrentChar(ctxt, len)); 00532 00533 encoding_error: 00534 /* 00535 * If we detect an UTF8 error that probably mean that the 00536 * input encoding didn't get properly advertized in the 00537 * declaration header. Report the error and switch the encoding 00538 * to ISO-Latin-1 (if you don't like this policy, just declare the 00539 * encoding !) 
00540 */ 00541 { 00542 char buffer[150]; 00543 00544 if (ctxt->input->end - ctxt->input->cur >= 4) { 00545 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 00546 ctxt->input->cur[0], ctxt->input->cur[1], 00547 ctxt->input->cur[2], ctxt->input->cur[3]); 00548 } else { 00549 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 00550 } 00551 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 00552 "Input is not proper UTF-8, indicate encoding !\n", 00553 BAD_CAST buffer, NULL); 00554 } 00555 00556 ctxt->charset = XML_CHAR_ENCODING_8859_1; 00557 *len = 1; 00558 return((int) *ctxt->input->cur); 00559 } 00560 00570 static int 00571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 00572 int res = 0; 00573 00574 while (IS_BLANK_CH(*(ctxt->input->cur))) { 00575 if ((*ctxt->input->cur == 0) && 00576 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 00577 xmlPopInput(ctxt); 00578 } else { 00579 if (*(ctxt->input->cur) == '\n') { 00580 ctxt->input->line++; ctxt->input->col = 1; 00581 } else ctxt->input->col++; 00582 ctxt->input->cur++; 00583 ctxt->nbChars++; 00584 if (*ctxt->input->cur == 0) 00585 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 00586 } 00587 res++; 00588 } 00589 return(res); 00590 } 00591 00592 00593 00594 /************************************************************************ 00595 * * 00596 * The list of HTML elements and their properties * 00597 * * 00598 ************************************************************************/ 00599 00600 /* 00601 * Start Tag: 1 means the start tag can be ommited 00602 * End Tag: 1 means the end tag can be ommited 00603 * 2 means it's forbidden (empty elements) 00604 * 3 means the tag is stylistic and should be closed easily 00605 * Depr: this element is deprecated 00606 * DTD: 1 means that this element is valid only in the Loose DTD 00607 * 2 means that this element is valid only in the Frameset DTD 00608 * 00609 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 00610 , subElements 
, impliedsubelt , Attributes, userdata 00611 */ 00612 00613 /* Definitions and a couple of vars for HTML Elements */ 00614 00615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 00616 #define NB_FONTSTYLE 8 00617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 00618 #define NB_PHRASE 10 00619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 00620 #define NB_SPECIAL 16 00621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL 00622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 00623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 00624 #define NB_BLOCK NB_HEADING + NB_LIST + 14 00625 #define FORMCTRL "input", "select", "textarea", "label", "button" 00626 #define NB_FORMCTRL 5 00627 #define PCDATA 00628 #define NB_PCDATA 0 00629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 00630 #define NB_HEADING 6 00631 #define LIST "ul", "ol", "dir", "menu" 00632 #define NB_LIST 4 00633 #define MODIFIER 00634 #define NB_MODIFIER 0 00635 #define FLOW BLOCK,INLINE 00636 #define NB_FLOW NB_BLOCK + NB_INLINE 00637 #define EMPTY NULL 00638 00639 00640 static const char* const html_flow[] = { FLOW, NULL } ; 00641 static const char* const html_inline[] = { INLINE, NULL } ; 00642 00643 /* placeholders: elts with content but no subelements */ 00644 static const char* const html_pcdata[] = { NULL } ; 00645 #define html_cdata html_pcdata 00646 00647 00648 /* ... 
and for HTML Attributes */ 00649 00650 #define COREATTRS "id", "class", "style", "title" 00651 #define NB_COREATTRS 4 00652 #define I18N "lang", "dir" 00653 #define NB_I18N 2 00654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 00655 #define NB_EVENTS 9 00656 #define ATTRS COREATTRS,I18N,EVENTS 00657 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 00658 #define CELLHALIGN "align", "char", "charoff" 00659 #define NB_CELLHALIGN 3 00660 #define CELLVALIGN "valign" 00661 #define NB_CELLVALIGN 1 00662 00663 static const char* const html_attrs[] = { ATTRS, NULL } ; 00664 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 00665 static const char* const core_attrs[] = { COREATTRS, NULL } ; 00666 static const char* const i18n_attrs[] = { I18N, NULL } ; 00667 00668 00669 /* Other declarations that should go inline ... */ 00670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 00671 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 00672 "tabindex", "onfocus", "onblur", NULL } ; 00673 static const char* const target_attr[] = { "target", NULL } ; 00674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 00675 static const char* const alt_attr[] = { "alt", NULL } ; 00676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 00677 static const char* const href_attrs[] = { "href", NULL } ; 00678 static const char* const clear_attrs[] = { "clear", NULL } ; 00679 static const char* const inline_p[] = { INLINE, "p", NULL } ; 00680 00681 static const char* const flow_param[] = { FLOW, "param", NULL } ; 00682 static const char* const applet_attrs[] = { COREATTRS , "codebase", 00683 "archive", "alt", "name", "height", "width", "align", 00684 "hspace", "vspace", NULL } ; 00685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 00686 "tabindex", "accesskey", "onfocus", 
"onblur", NULL } ; 00687 static const char* const basefont_attrs[] = 00688 { "id", "size", "color", "face", NULL } ; 00689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 00690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 00691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 00692 static const char* const body_depr[] = { "background", "bgcolor", "text", 00693 "link", "vlink", "alink", NULL } ; 00694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 00695 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 00696 00697 00698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 00699 static const char* const col_elt[] = { "col", NULL } ; 00700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 00701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 00702 static const char* const dl_contents[] = { "dt", "dd", NULL } ; 00703 static const char* const compact_attr[] = { "compact", NULL } ; 00704 static const char* const label_attr[] = { "label", NULL } ; 00705 static const char* const fieldset_contents[] = { FLOW, "legend" } ; 00706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 00707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 00708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 00709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 00710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 00711 static const char* 
const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; 00712 static const char* const head_attrs[] = { I18N, "profile", NULL } ; 00713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 00714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 00715 static const char* const version_attr[] = { "version", NULL } ; 00716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 00717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 00718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 00719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; 00720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 00721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 00722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 00723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 00724 static const char* const align_attr[] = { "align", NULL } ; 00725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 00726 static const char* const map_contents[] = { BLOCK, "area", NULL } ; 00727 static const char* const name_attr[] = { "name", NULL } ; 00728 static const char* const action_attr[] = { 
"action", NULL } ; 00729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 00730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ; 00731 static const char* const content_attr[] = { "content", NULL } ; 00732 static const char* const type_attr[] = { "type", NULL } ; 00733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 00734 static const char* const object_contents[] = { FLOW, "param", NULL } ; 00735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 00736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 00737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 00738 static const char* const option_elt[] = { "option", NULL } ; 00739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 00740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 00741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 00742 static const char* const width_attr[] = { "width", NULL } ; 00743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 00744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 00745 static const char* const language_attr[] = { "language", NULL } ; 00746 static const char* const select_content[] = { "optgroup", "option", NULL } ; 00747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 00748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 00749 static const char* const table_attrs[] = { ATTRS, "summary", 
"width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 00750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 00751 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 00752 static const char* const tr_elt[] = { "tr", NULL } ; 00753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 00754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 00755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 00756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 00757 static const char* const tr_contents[] = { "th", "td", NULL } ; 00758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 00759 static const char* const li_elt[] = { "li", NULL } ; 00760 static const char* const ul_depr[] = { "type", "compact", NULL} ; 00761 static const char* const dir_attr[] = { "dir", NULL} ; 00762 00763 #define DECL (const char**) 00764 00765 static const htmlElemDesc 00766 html40ElementTable[] = { 00767 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 00768 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 00769 }, 00770 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 00771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 00772 }, 00773 { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 00774 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 00775 }, 00776 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 00777 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 00778 }, 00779 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 00780 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 00781 }, 00782 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 00783 EMPTY , 
NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 00784 }, 00785 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 00786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 00787 }, 00788 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 00789 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs 00790 }, 00791 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 00792 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 00793 }, 00794 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 00795 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 00796 }, 00797 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 00798 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 00799 }, 00800 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 00801 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 00802 }, 00803 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 00804 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 00805 }, 00806 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 00807 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 00808 }, 00809 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 00810 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 00811 }, 00812 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 00813 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 00814 }, 00815 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 00816 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 00817 }, 00818 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 00819 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 00820 }, 00821 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 00822 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 00823 }, 00824 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 00825 EMPTY , NULL , DECL col_attrs , NULL, NULL 00826 }, 00827 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 00828 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 00829 }, 
00830 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 00831 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 00832 }, 00833 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 00834 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 00835 }, 00836 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 00837 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 00838 }, 00839 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 00840 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 00841 }, 00842 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 00843 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 00844 }, 00845 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 00846 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 00847 }, 00848 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 00849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 00850 }, 00851 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 00852 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 00853 }, 00854 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 00855 EMPTY, NULL, DECL embed_attrs, NULL, NULL 00856 }, 00857 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 00858 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 00859 }, 00860 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 00861 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 00862 }, 00863 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 00864 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 00865 }, 00866 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 00867 EMPTY, NULL, NULL, DECL frame_attrs, NULL 00868 }, 00869 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 00870 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 00871 }, 00872 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 00873 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 00874 }, 00875 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading 
", 00876 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 00877 }, 00878 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 00879 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 00880 }, 00881 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 00882 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 00883 }, 00884 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 00885 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 00886 }, 00887 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ", 00888 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 00889 }, 00890 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 00891 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 00892 }, 00893 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 00894 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 00895 }, 00896 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 00897 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 00898 }, 00899 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 00900 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 00901 }, 00902 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 00903 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 00904 }, 00905 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 00906 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 00907 }, 00908 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 00909 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 00910 }, 00911 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 00912 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 00913 }, 00914 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 00915 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 00916 }, 00917 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 00918 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 00919 }, 00920 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 00921 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 00922 }, 
00923 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 00924 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 00925 }, 00926 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 00927 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 00928 }, 00929 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 00930 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 00931 }, 00932 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 00933 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 00934 }, 00935 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", 00936 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 00937 }, 00938 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 00939 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 00940 }, 00941 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 00942 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 00943 }, 00944 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 00945 DECL html_flow, "div", DECL html_attrs, NULL, NULL 00946 }, 00947 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 00948 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 00949 }, 00950 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 00951 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 00952 }, 00953 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 00954 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 00955 }, 00956 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 00957 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 00958 }, 00959 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 00960 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 00961 }, 00962 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 00963 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 00964 }, 00965 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted 
text ", 00966 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 00967 }, 00968 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 00969 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 00970 }, 00971 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 00972 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 00973 }, 00974 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 00975 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 00976 }, 00977 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 00978 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 00979 }, 00980 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ", 00981 DECL select_content, NULL, DECL select_attrs, NULL, NULL 00982 }, 00983 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 00984 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 00985 }, 00986 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 00987 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 00988 }, 00989 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 00990 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 00991 }, 00992 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 00993 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 00994 }, 00995 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 00996 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 00997 }, 00998 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 00999 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 01000 }, 01001 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 01002 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 01003 }, 01004 { "table", 0, 0, 0, 0, 0, 0, 0, "", 01005 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 01006 }, 01007 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 01008 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 01009 }, 01010 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 01011 DECL html_flow, NULL, DECL 
th_td_attr, DECL th_td_depr, NULL 01012 }, 01013 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 01014 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 01015 }, 01016 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 01017 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 01018 }, 01019 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 01020 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 01021 }, 01022 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 01023 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 01024 }, 01025 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 01026 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 01027 }, 01028 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", 01029 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 01030 }, 01031 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 01032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 01033 }, 01034 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 01035 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 01036 }, 01037 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 01038 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 01039 }, 01040 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 01041 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 01042 } 01043 }; 01044 01045 /* 01046 * start tags that imply the end of current element 01047 */ 01048 static const char * const htmlStartClose[] = { 01049 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 01050 "dl", "ul", "ol", "menu", "dir", "address", "pre", 01051 "listing", "xmp", "head", NULL, 01052 "head", "p", NULL, 01053 "title", "p", NULL, 01054 "body", "head", "style", "link", "title", "p", NULL, 01055 "frameset", "head", "style", "link", "title", "p", NULL, 01056 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 01057 "pre", "listing", "xmp", "head", "li", NULL, 01058 "hr", "p", "head", NULL, 
01059 "h1", "p", "head", NULL, 01060 "h2", "p", "head", NULL, 01061 "h3", "p", "head", NULL, 01062 "h4", "p", "head", NULL, 01063 "h5", "p", "head", NULL, 01064 "h6", "p", "head", NULL, 01065 "dir", "p", "head", NULL, 01066 "address", "p", "head", "ul", NULL, 01067 "pre", "p", "head", "ul", NULL, 01068 "listing", "p", "head", NULL, 01069 "xmp", "p", "head", NULL, 01070 "blockquote", "p", "head", NULL, 01071 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 01072 "xmp", "head", NULL, 01073 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 01074 "head", "dd", NULL, 01075 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 01076 "head", "dt", NULL, 01077 "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 01078 "listing", "xmp", NULL, 01079 "ol", "p", "head", "ul", NULL, 01080 "menu", "p", "head", "ul", NULL, 01081 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, 01082 "div", "p", "head", NULL, 01083 "noscript", "p", "head", NULL, 01084 "center", "font", "b", "i", "p", "head", NULL, 01085 "a", "a", NULL, 01086 "caption", "p", NULL, 01087 "colgroup", "caption", "colgroup", "col", "p", NULL, 01088 "col", "caption", "col", "p", NULL, 01089 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 01090 "listing", "xmp", "a", NULL, 01091 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 01092 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 01093 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 01094 "thead", "caption", "col", "colgroup", NULL, 01095 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 01096 "tbody", "p", NULL, 01097 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 01098 "tfoot", "tbody", "p", NULL, 01099 "optgroup", "option", NULL, 01100 "option", "option", NULL, 01101 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 01102 "pre", "listing", "xmp", "a", NULL, 01103 NULL 01104 }; 01105 01106 
/* 01107 * The list of HTML elements which are supposed not to have 01108 * CDATA content and where a p element will be implied 01109 * 01110 * TODO: extend that list by reading the HTML SGML DTD on 01111 * implied paragraph 01112 */ 01113 static const char *const htmlNoContentElements[] = { 01114 "html", 01115 "head", 01116 NULL 01117 }; 01118 01119 /* 01120 * The list of HTML attributes which are of content %Script; 01121 * NOTE: when adding ones, check htmlIsScriptAttribute() since 01122 * it assumes the name starts with 'on' 01123 */ 01124 static const char *const htmlScriptAttributes[] = { 01125 "onclick", 01126 "ondblclick", 01127 "onmousedown", 01128 "onmouseup", 01129 "onmouseover", 01130 "onmousemove", 01131 "onmouseout", 01132 "onkeypress", 01133 "onkeydown", 01134 "onkeyup", 01135 "onload", 01136 "onunload", 01137 "onfocus", 01138 "onblur", 01139 "onsubmit", 01140 "onrest", 01141 "onchange", 01142 "onselect" 01143 }; 01144 01145 /* 01146 * This table is used by the htmlparser to know what to do with 01147 * broken html pages. By assigning different priorities to different 01148 * elements the parser can decide how to handle extra endtags. 01149 * Endtags are only allowed to close elements with lower or equal 01150 * priority. 
01151 */ 01152 01153 typedef struct { 01154 const char *name; 01155 int priority; 01156 } elementPriority; 01157 01158 static const elementPriority htmlEndPriority[] = { 01159 {"div", 150}, 01160 {"td", 160}, 01161 {"th", 160}, 01162 {"tr", 170}, 01163 {"thead", 180}, 01164 {"tbody", 180}, 01165 {"tfoot", 180}, 01166 {"table", 190}, 01167 {"head", 200}, 01168 {"body", 200}, 01169 {"html", 220}, 01170 {NULL, 100} /* Default priority */ 01171 }; 01172 01173 static const char** htmlStartCloseIndex[100]; 01174 static int htmlStartCloseIndexinitialized = 0; 01175 01176 /************************************************************************ 01177 * * 01178 * functions to handle HTML specific data * 01179 * * 01180 ************************************************************************/ 01181 01189 void 01190 htmlInitAutoClose(void) { 01191 int indx, i = 0; 01192 01193 if (htmlStartCloseIndexinitialized) return; 01194 01195 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 01196 indx = 0; 01197 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 01198 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 01199 while (htmlStartClose[i] != NULL) i++; 01200 i++; 01201 } 01202 htmlStartCloseIndexinitialized = 1; 01203 } 01204 01213 const htmlElemDesc * 01214 htmlTagLookup(const xmlChar *tag) { 01215 unsigned int i; 01216 01217 for (i = 0; i < (sizeof(html40ElementTable) / 01218 sizeof(html40ElementTable[0]));i++) { 01219 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 01220 return((htmlElemDescPtr) &html40ElementTable[i]); 01221 } 01222 return(NULL); 01223 } 01224 01231 static int 01232 htmlGetEndPriority (const xmlChar *name) { 01233 int i = 0; 01234 01235 while ((htmlEndPriority[i].name != NULL) && 01236 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 01237 i++; 01238 01239 return(htmlEndPriority[i].priority); 01240 } 01241 01242 01254 static int 01255 htmlCheckAutoClose(const xmlChar * newtag, const 
xmlChar * oldtag) 01256 { 01257 int i, indx; 01258 const char **closed = NULL; 01259 01260 if (htmlStartCloseIndexinitialized == 0) 01261 htmlInitAutoClose(); 01262 01263 /* inefficient, but not a big deal */ 01264 for (indx = 0; indx < 100; indx++) { 01265 closed = htmlStartCloseIndex[indx]; 01266 if (closed == NULL) 01267 return (0); 01268 if (xmlStrEqual(BAD_CAST * closed, newtag)) 01269 break; 01270 } 01271 01272 i = closed - htmlStartClose; 01273 i++; 01274 while (htmlStartClose[i] != NULL) { 01275 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 01276 return (1); 01277 } 01278 i++; 01279 } 01280 return (0); 01281 } 01282 01291 static void 01292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 01293 { 01294 const htmlElemDesc *info; 01295 int i, priority; 01296 01297 priority = htmlGetEndPriority(newtag); 01298 01299 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 01300 01301 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 01302 break; 01303 /* 01304 * A missplaced endtag can only close elements with lower 01305 * or equal priority, so if we find an element with higher 01306 * priority before we find an element with 01307 * matching name, we just ignore this endtag 01308 */ 01309 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 01310 return; 01311 } 01312 if (i < 0) 01313 return; 01314 01315 while (!xmlStrEqual(newtag, ctxt->name)) { 01316 info = htmlTagLookup(ctxt->name); 01317 if ((info != NULL) && (info->endTag == 3)) { 01318 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 01319 "Opening and ending tag mismatch: %s and %s\n", 01320 newtag, ctxt->name); 01321 } 01322 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 01323 ctxt->sax->endElement(ctxt->userData, ctxt->name); 01324 htmlnamePop(ctxt); 01325 } 01326 } 01327 01334 static void 01335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 01336 { 01337 int i; 01338 01339 if (ctxt->nameNr == 0) 01340 return; 01341 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 01342 if ((ctxt->sax != 
NULL) && (ctxt->sax->endElement != NULL)) 01343 ctxt->sax->endElement(ctxt->userData, ctxt->name); 01344 htmlnamePop(ctxt); 01345 } 01346 } 01347 01360 static void 01361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 01362 { 01363 while ((newtag != NULL) && (ctxt->name != NULL) && 01364 (htmlCheckAutoClose(newtag, ctxt->name))) { 01365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 01366 ctxt->sax->endElement(ctxt->userData, ctxt->name); 01367 htmlnamePop(ctxt); 01368 } 01369 if (newtag == NULL) { 01370 htmlAutoCloseOnEnd(ctxt); 01371 return; 01372 } 01373 while ((newtag == NULL) && (ctxt->name != NULL) && 01374 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 01375 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 01376 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 01377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 01378 ctxt->sax->endElement(ctxt->userData, ctxt->name); 01379 htmlnamePop(ctxt); 01380 } 01381 } 01382 01396 int 01397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 01398 htmlNodePtr child; 01399 01400 if (elem == NULL) return(1); 01401 if (xmlStrEqual(name, elem->name)) return(0); 01402 if (htmlCheckAutoClose(elem->name, name)) return(1); 01403 child = elem->children; 01404 while (child != NULL) { 01405 if (htmlAutoCloseTag(doc, name, child)) return(1); 01406 child = child->next; 01407 } 01408 return(0); 01409 } 01410 01422 int 01423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 01424 htmlNodePtr child; 01425 01426 if (elem == NULL) return(1); 01427 child = elem->children; 01428 while (child != NULL) { 01429 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 01430 child = child->next; 01431 } 01432 return(0); 01433 } 01434 01444 static void 01445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 01446 int i; 01447 01448 if (ctxt->options & HTML_PARSE_NOIMPLIED) 01449 return; 01450 if (!htmlOmittedDefaultValue) 01451 return; 01452 if (xmlStrEqual(newtag, 
BAD_CAST"html")) 01453 return; 01454 if (ctxt->nameNr <= 0) { 01455 htmlnamePush(ctxt, BAD_CAST"html"); 01456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 01457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 01458 } 01459 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 01460 return; 01461 if ((ctxt->nameNr <= 1) && 01462 ((xmlStrEqual(newtag, BAD_CAST"script")) || 01463 (xmlStrEqual(newtag, BAD_CAST"style")) || 01464 (xmlStrEqual(newtag, BAD_CAST"meta")) || 01465 (xmlStrEqual(newtag, BAD_CAST"link")) || 01466 (xmlStrEqual(newtag, BAD_CAST"title")) || 01467 (xmlStrEqual(newtag, BAD_CAST"base")))) { 01468 if (ctxt->html >= 3) { 01469 /* we already saw or generated an <head> before */ 01470 return; 01471 } 01472 /* 01473 * dropped OBJECT ... i you put it first BODY will be 01474 * assumed ! 01475 */ 01476 htmlnamePush(ctxt, BAD_CAST"head"); 01477 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 01478 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 01479 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 01480 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 01481 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 01482 if (ctxt->html >= 10) { 01483 /* we already saw or generated a <body> before */ 01484 return; 01485 } 01486 for (i = 0;i < ctxt->nameNr;i++) { 01487 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 01488 return; 01489 } 01490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 01491 return; 01492 } 01493 } 01494 01495 htmlnamePush(ctxt, BAD_CAST"body"); 01496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 01497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 01498 } 01499 } 01500 01512 static int 01513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 01514 const xmlChar *tag; 01515 int i; 01516 01517 if (ctxt == NULL) 01518 return(-1); 01519 tag = ctxt->name; 01520 if (tag == NULL) { 01521 htmlAutoClose(ctxt, BAD_CAST"p"); 01522 
htmlCheckImplied(ctxt, BAD_CAST"p"); 01523 htmlnamePush(ctxt, BAD_CAST"p"); 01524 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 01525 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 01526 return(1); 01527 } 01528 if (!htmlOmittedDefaultValue) 01529 return(0); 01530 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 01531 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 01532 htmlAutoClose(ctxt, BAD_CAST"p"); 01533 htmlCheckImplied(ctxt, BAD_CAST"p"); 01534 htmlnamePush(ctxt, BAD_CAST"p"); 01535 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 01536 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 01537 return(1); 01538 } 01539 } 01540 return(0); 01541 } 01542 01551 int 01552 htmlIsScriptAttribute(const xmlChar *name) { 01553 unsigned int i; 01554 01555 if (name == NULL) 01556 return(0); 01557 /* 01558 * all script attributes start with 'on' 01559 */ 01560 if ((name[0] != 'o') || (name[1] != 'n')) 01561 return(0); 01562 for (i = 0; 01563 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 01564 i++) { 01565 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 01566 return(1); 01567 } 01568 return(0); 01569 } 01570 01571 /************************************************************************ 01572 * * 01573 * The list of HTML predefined entities * 01574 * * 01575 ************************************************************************/ 01576 01577 01578 static const htmlEntityDesc html40EntitiesTable[] = { 01579 /* 01580 * the 4 absolute ones, plus apostrophe. 01581 */ 01582 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 01583 { 38, "amp", "ampersand, U+0026 ISOnum" }, 01584 { 39, "apos", "single quote" }, 01585 { 60, "lt", "less-than sign, U+003C ISOnum" }, 01586 { 62, "gt", "greater-than sign, U+003E ISOnum" }, 01587 01588 /* 01589 * A bunch still in the 128-255 range 01590 * Replacing them depend really on the charset used. 
01591 */ 01592 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 01593 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 01594 { 162, "cent", "cent sign, U+00A2 ISOnum" }, 01595 { 163, "pound","pound sign, U+00A3 ISOnum" }, 01596 { 164, "curren","currency sign, U+00A4 ISOnum" }, 01597 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 01598 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 01599 { 167, "sect", "section sign, U+00A7 ISOnum" }, 01600 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 01601 { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 01602 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 01603 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 01604 { 172, "not", "not sign, U+00AC ISOnum" }, 01605 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 01606 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 01607 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 01608 { 176, "deg", "degree sign, U+00B0 ISOnum" }, 01609 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 01610 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 01611 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 01612 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 01613 { 181, "micro","micro sign, U+00B5 ISOnum" }, 01614 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 01615 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" }, 01616 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 01617 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 01618 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 01619 { 187, "raquo","right-pointing double angle quotation mark right pointing 
guillemet, U+00BB ISOnum" }, 01620 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 01621 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 01622 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 01623 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 01624 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 01625 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 01626 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 01627 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 01628 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 01629 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 01630 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 01631 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 01632 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 01633 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 01634 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 01635 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 01636 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 01637 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 01638 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 01639 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" }, 01640 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 01641 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 01642 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 01643 { 211, "Oacute","latin capital letter O 
with acute, U+00D3 ISOlat1" }, 01644 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 01645 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 01646 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 01647 { 215, "times","multiplication sign, U+00D7 ISOnum" }, 01648 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 01649 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 01650 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 01651 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 01652 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 01653 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 01654 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 01655 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 01656 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 01657 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 01658 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 01659 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 01660 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 01661 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 01662 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 01663 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 01664 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" }, 01665 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 01666 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 01667 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 01668 { 236, "igrave","latin 
small letter i with grave, U+00EC ISOlat1" }, 01669 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 01670 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 01671 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 01672 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 01673 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 01674 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 01675 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 01676 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 01677 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 01678 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 01679 { 247, "divide","division sign, U+00F7 ISOnum" }, 01680 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 01681 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 01682 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 01683 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 01684 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 01685 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 01686 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 01687 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 01688 01689 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 01690 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 01691 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 01692 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, 01693 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 01694 01695 /* 01696 * Anything below should really be kept as entities references 01697 */ 01698 { 402, "fnof", "latin small f with 
hook = function = florin, U+0192 ISOtech" }, 01699 01700 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 01701 { 732, "tilde","small tilde, U+02DC ISOdia" }, 01702 01703 { 913, "Alpha","greek capital letter alpha, U+0391" }, 01704 { 914, "Beta", "greek capital letter beta, U+0392" }, 01705 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 01706 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 01707 { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 01708 { 918, "Zeta", "greek capital letter zeta, U+0396" }, 01709 { 919, "Eta", "greek capital letter eta, U+0397" }, 01710 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 01711 { 921, "Iota", "greek capital letter iota, U+0399" }, 01712 { 922, "Kappa","greek capital letter kappa, U+039A" }, 01713 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 01714 { 924, "Mu", "greek capital letter mu, U+039C" }, 01715 { 925, "Nu", "greek capital letter nu, U+039D" }, 01716 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 01717 { 927, "Omicron","greek capital letter omicron, U+039F" }, 01718 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 01719 { 929, "Rho", "greek capital letter rho, U+03A1" }, 01720 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 01721 { 932, "Tau", "greek capital letter tau, U+03A4" }, 01722 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 01723 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 01724 { 935, "Chi", "greek capital letter chi, U+03A7" }, 01725 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 01726 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 01727 01728 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" }, 01729 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 01730 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 01731 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 01732 { 949, 
"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 01733 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 01734 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 01735 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 01736 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 01737 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 01738 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 01739 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 01740 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 01741 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 01742 { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 01743 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 01744 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 01745 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 01746 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 01747 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 01748 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 01749 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 01750 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 01751 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 01752 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 01753 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 01754 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 01755 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 01756 01757 { 8194, "ensp", "en space, U+2002 ISOpub" }, 01758 { 8195, "emsp", "em space, U+2003 ISOpub" }, 01759 { 8201, "thinsp","thin space, U+2009 ISOpub" }, 01760 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, 01761 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 01762 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 01763 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 
01764 { 8211, "ndash","en dash, U+2013 ISOpub" }, 01765 { 8212, "mdash","em dash, U+2014 ISOpub" }, 01766 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 01767 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 01768 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 01769 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 01770 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 01771 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 01772 { 8224, "dagger","dagger, U+2020 ISOpub" }, 01773 { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 01774 01775 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 01776 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 01777 01778 { 8240, "permil","per mille sign, U+2030 ISOtech" }, 01779 01780 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 01781 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 01782 01783 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 01784 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 01785 01786 { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 01787 { 8260, "frasl","fraction slash, U+2044 NEW" }, 01788 01789 { 8364, "euro", "euro sign, U+20AC NEW" }, 01790 01791 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 01792 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 01793 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 01794 { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 01795 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 01796 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 01797 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 01798 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 01799 { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 01800 
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 01801 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 01802 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 01803 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 01804 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 01805 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 01806 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 01807 01808 { 8704, "forall","for all, U+2200 ISOtech" }, 01809 { 8706, "part", "partial differential, U+2202 ISOtech" }, 01810 { 8707, "exist","there exists, U+2203 ISOtech" }, 01811 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 01812 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 01813 { 8712, "isin", "element of, U+2208 ISOtech" }, 01814 { 8713, "notin","not an element of, U+2209 ISOtech" }, 01815 { 8715, "ni", "contains as member, U+220B ISOtech" }, 01816 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 01817 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 01818 { 8722, "minus","minus sign, U+2212 ISOtech" }, 01819 { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 01820 { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 01821 { 8733, "prop", "proportional to, U+221D ISOtech" }, 01822 { 8734, "infin","infinity, U+221E ISOtech" }, 01823 { 8736, "ang", "angle, U+2220 ISOamso" }, 01824 { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 01825 { 8744, "or", "logical or = vee, U+2228 ISOtech" }, 01826 { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 01827 { 8746, "cup", "union = cup, U+222A ISOtech" }, 01828 { 8747, "int", "integral, U+222B ISOtech" }, 01829 { 8756, "there4","therefore, U+2234 ISOtech" }, 01830 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" }, 01831 { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 01832 { 8776, "asymp","almost equal to = asymptotic to, 
U+2248 ISOamsr" }, 01833 { 8800, "ne", "not equal to, U+2260 ISOtech" }, 01834 { 8801, "equiv","identical to, U+2261 ISOtech" }, 01835 { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 01836 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 01837 { 8834, "sub", "subset of, U+2282 ISOtech" }, 01838 { 8835, "sup", "superset of, U+2283 ISOtech" }, 01839 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 01840 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 01841 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 01842 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 01843 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 01844 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 01845 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 01846 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 01847 { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 01848 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 01849 { 8971, "rfloor","right floor, U+230B ISOamsc" }, 01850 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 01851 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 01852 { 9674, "loz", "lozenge, U+25CA ISOpub" }, 01853 01854 { 9824, "spades","black spade suit, U+2660 ISOpub" }, 01855 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 01856 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 01857 { 9830, "diams","black diamond suit, U+2666 ISOpub" }, 01858 01859 }; 01860 01861 /************************************************************************ 01862 * * 01863 * Commodity functions to handle entities * 01864 * * 01865 ************************************************************************/ 01866 01867 /* 01868 * Macro used to grow the current buffer. 
01869 */ 01870 #define growBuffer(buffer) { \ 01871 xmlChar *tmp; \ 01872 buffer##_size *= 2; \ 01873 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 01874 if (tmp == NULL) { \ 01875 htmlErrMemory(ctxt, "growing buffer\n"); \ 01876 xmlFree(buffer); \ 01877 return(NULL); \ 01878 } \ 01879 buffer = tmp; \ 01880 } 01881 01892 const htmlEntityDesc * 01893 htmlEntityLookup(const xmlChar *name) { 01894 unsigned int i; 01895 01896 for (i = 0;i < (sizeof(html40EntitiesTable)/ 01897 sizeof(html40EntitiesTable[0]));i++) { 01898 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 01899 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 01900 } 01901 } 01902 return(NULL); 01903 } 01904 01915 const htmlEntityDesc * 01916 htmlEntityValueLookup(unsigned int value) { 01917 unsigned int i; 01918 01919 for (i = 0;i < (sizeof(html40EntitiesTable)/ 01920 sizeof(html40EntitiesTable[0]));i++) { 01921 if (html40EntitiesTable[i].value >= value) { 01922 if (html40EntitiesTable[i].value > value) 01923 break; 01924 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 01925 } 01926 } 01927 return(NULL); 01928 } 01929 01945 int 01946 UTF8ToHtml(unsigned char* out, int *outlen, 01947 const unsigned char* in, int *inlen) { 01948 const unsigned char* processed = in; 01949 const unsigned char* outend; 01950 const unsigned char* outstart = out; 01951 const unsigned char* instart = in; 01952 const unsigned char* inend; 01953 unsigned int c, d; 01954 int trailing; 01955 01956 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 01957 if (in == NULL) { 01958 /* 01959 * initialization nothing to do 01960 */ 01961 *outlen = 0; 01962 *inlen = 0; 01963 return(0); 01964 } 01965 inend = in + (*inlen); 01966 outend = out + (*outlen); 01967 while (in < inend) { 01968 d = *in++; 01969 if (d < 0x80) { c= d; trailing= 0; } 01970 else if (d < 0xC0) { 01971 /* trailing byte in leading position */ 01972 *outlen = out - outstart; 01973 *inlen = processed - 
instart; 01974 return(-2); 01975 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 01976 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 01977 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 01978 else { 01979 /* no chance for this in Ascii */ 01980 *outlen = out - outstart; 01981 *inlen = processed - instart; 01982 return(-2); 01983 } 01984 01985 if (inend - in < trailing) { 01986 break; 01987 } 01988 01989 for ( ; trailing; trailing--) { 01990 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 01991 break; 01992 c <<= 6; 01993 c |= d & 0x3F; 01994 } 01995 01996 /* assertion: c is a single UTF-4 value */ 01997 if (c < 0x80) { 01998 if (out + 1 >= outend) 01999 break; 02000 *out++ = c; 02001 } else { 02002 int len; 02003 const htmlEntityDesc * ent; 02004 const char *cp; 02005 char nbuf[16]; 02006 02007 /* 02008 * Try to lookup a predefined HTML entity for it 02009 */ 02010 02011 ent = htmlEntityValueLookup(c); 02012 if (ent == NULL) { 02013 snprintf(nbuf, sizeof(nbuf), "#%u", c); 02014 cp = nbuf; 02015 } 02016 else 02017 cp = ent->name; 02018 len = strlen(cp); 02019 if (out + 2 + len >= outend) 02020 break; 02021 *out++ = '&'; 02022 memcpy(out, cp, len); 02023 out += len; 02024 *out++ = ';'; 02025 } 02026 processed = in; 02027 } 02028 *outlen = out - outstart; 02029 *inlen = processed - instart; 02030 return(0); 02031 } 02032 02049 int 02050 htmlEncodeEntities(unsigned char* out, int *outlen, 02051 const unsigned char* in, int *inlen, int quoteChar) { 02052 const unsigned char* processed = in; 02053 const unsigned char* outend; 02054 const unsigned char* outstart = out; 02055 const unsigned char* instart = in; 02056 const unsigned char* inend; 02057 unsigned int c, d; 02058 int trailing; 02059 02060 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 02061 return(-1); 02062 outend = out + (*outlen); 02063 inend = in + (*inlen); 02064 while (in < inend) { 02065 d = *in++; 02066 if (d < 0x80) { c= d; trailing= 0; } 02067 else if (d < 0xC0) { 
02068 /* trailing byte in leading position */ 02069 *outlen = out - outstart; 02070 *inlen = processed - instart; 02071 return(-2); 02072 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 02073 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 02074 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 02075 else { 02076 /* no chance for this in Ascii */ 02077 *outlen = out - outstart; 02078 *inlen = processed - instart; 02079 return(-2); 02080 } 02081 02082 if (inend - in < trailing) 02083 break; 02084 02085 while (trailing--) { 02086 if (((d= *in++) & 0xC0) != 0x80) { 02087 *outlen = out - outstart; 02088 *inlen = processed - instart; 02089 return(-2); 02090 } 02091 c <<= 6; 02092 c |= d & 0x3F; 02093 } 02094 02095 /* assertion: c is a single UTF-4 value */ 02096 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 02097 (c != '&') && (c != '<') && (c != '>')) { 02098 if (out >= outend) 02099 break; 02100 *out++ = c; 02101 } else { 02102 const htmlEntityDesc * ent; 02103 const char *cp; 02104 char nbuf[16]; 02105 int len; 02106 02107 /* 02108 * Try to lookup a predefined HTML entity for it 02109 */ 02110 ent = htmlEntityValueLookup(c); 02111 if (ent == NULL) { 02112 snprintf(nbuf, sizeof(nbuf), "#%u", c); 02113 cp = nbuf; 02114 } 02115 else 02116 cp = ent->name; 02117 len = strlen(cp); 02118 if (out + 2 + len > outend) 02119 break; 02120 *out++ = '&'; 02121 memcpy(out, cp, len); 02122 out += len; 02123 *out++ = ';'; 02124 } 02125 processed = in; 02126 } 02127 *outlen = out - outstart; 02128 *inlen = processed - instart; 02129 return(0); 02130 } 02131 02132 /************************************************************************ 02133 * * 02134 * Commodity functions to handle streams * 02135 * * 02136 ************************************************************************/ 02137 02145 static htmlParserInputPtr 02146 htmlNewInputStream(htmlParserCtxtPtr ctxt) { 02147 htmlParserInputPtr input; 02148 02149 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 
02150 if (input == NULL) { 02151 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 02152 return(NULL); 02153 } 02154 memset(input, 0, sizeof(htmlParserInput)); 02155 input->filename = NULL; 02156 input->directory = NULL; 02157 input->base = NULL; 02158 input->cur = NULL; 02159 input->buf = NULL; 02160 input->line = 1; 02161 input->col = 1; 02162 input->buf = NULL; 02163 input->free = NULL; 02164 input->version = NULL; 02165 input->consumed = 0; 02166 input->length = 0; 02167 return(input); 02168 } 02169 02170 02171 /************************************************************************ 02172 * * 02173 * Commodity functions, cleanup needed ? * 02174 * * 02175 ************************************************************************/ 02176 /* 02177 * all tags allowing pc data from the html 4.01 loose dtd 02178 * NOTE: it might be more apropriate to integrate this information 02179 * into the html40ElementTable array but I don't want to risk any 02180 * binary incomptibility 02181 */ 02182 static const char *allowPCData[] = { 02183 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 02184 "blockquote", "body", "button", "caption", "center", "cite", "code", 02185 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 02186 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 02187 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 02188 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 02189 }; 02190 02202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 02203 unsigned int i; 02204 int j; 02205 xmlNodePtr lastChild; 02206 xmlDtdPtr dtd; 02207 02208 for (j = 0;j < len;j++) 02209 if (!(IS_BLANK_CH(str[j]))) return(0); 02210 02211 if (CUR == 0) return(1); 02212 if (CUR != '<') return(0); 02213 if (ctxt->name == NULL) 02214 return(1); 02215 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 02216 return(1); 02217 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 
02218 return(1); 02219 02220 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 02221 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 02222 dtd = xmlGetIntSubset(ctxt->myDoc); 02223 if (dtd != NULL && dtd->ExternalID != NULL) { 02224 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 02225 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 02226 return(1); 02227 } 02228 } 02229 02230 if (ctxt->node == NULL) return(0); 02231 lastChild = xmlGetLastChild(ctxt->node); 02232 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 02233 lastChild = lastChild->prev; 02234 if (lastChild == NULL) { 02235 if ((ctxt->node->type != XML_ELEMENT_NODE) && 02236 (ctxt->node->content != NULL)) return(0); 02237 /* keep ws in constructs like ...<b> </b>... 02238 for all tags "b" allowing PCDATA */ 02239 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 02240 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 02241 return(0); 02242 } 02243 } 02244 } else if (xmlNodeIsText(lastChild)) { 02245 return(0); 02246 } else { 02247 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 02248 for all tags "p" allowing PCDATA */ 02249 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 02250 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 02251 return(0); 02252 } 02253 } 02254 } 02255 return(1); 02256 } 02257 02268 htmlDocPtr 02269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 02270 xmlDocPtr cur; 02271 02272 /* 02273 * Allocate a new document and fill the fields. 
02274 */ 02275 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 02276 if (cur == NULL) { 02277 htmlErrMemory(NULL, "HTML document creation failed\n"); 02278 return(NULL); 02279 } 02280 memset(cur, 0, sizeof(xmlDoc)); 02281 02282 cur->type = XML_HTML_DOCUMENT_NODE; 02283 cur->version = NULL; 02284 cur->intSubset = NULL; 02285 cur->doc = cur; 02286 cur->name = NULL; 02287 cur->children = NULL; 02288 cur->extSubset = NULL; 02289 cur->oldNs = NULL; 02290 cur->encoding = NULL; 02291 cur->standalone = 1; 02292 cur->compression = 0; 02293 cur->ids = NULL; 02294 cur->refs = NULL; 02295 cur->_private = NULL; 02296 cur->charset = XML_CHAR_ENCODING_UTF8; 02297 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 02298 if ((ExternalID != NULL) || 02299 (URI != NULL)) 02300 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 02301 return(cur); 02302 } 02303 02313 htmlDocPtr 02314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 02315 if ((URI == NULL) && (ExternalID == NULL)) 02316 return(htmlNewDocNoDtD( 02317 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 02318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 02319 02320 return(htmlNewDocNoDtD(URI, ExternalID)); 02321 } 02322 02323 02324 /************************************************************************ 02325 * * 02326 * The parser itself * 02327 * Relates to http://www.w3.org/TR/html40 * 02328 * * 02329 ************************************************************************/ 02330 02331 /************************************************************************ 02332 * * 02333 * The parser itself * 02334 * * 02335 ************************************************************************/ 02336 02337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 02338 02349 static const xmlChar * 02350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 02351 int i = 0; 02352 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 02353 02354 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 02355 (CUR != ':') && (CUR != 
'.')) return(NULL); 02356 02357 while ((i < HTML_PARSER_BUFFER_SIZE) && 02358 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 02359 (CUR == ':') || (CUR == '-') || (CUR == '_') || 02360 (CUR == '.'))) { 02361 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 02362 else loc[i] = CUR; 02363 i++; 02364 02365 NEXT; 02366 } 02367 02368 return(xmlDictLookup(ctxt->dict, loc, i)); 02369 } 02370 02371 02383 static const xmlChar * 02384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 02385 int i = 0; 02386 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 02387 02388 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 02389 (NXT(1) != ':')) return(NULL); 02390 02391 while ((i < HTML_PARSER_BUFFER_SIZE) && 02392 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 02393 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 02394 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 02395 else loc[i] = NXT(1+i); 02396 i++; 02397 } 02398 02399 return(xmlDictLookup(ctxt->dict, loc, i)); 02400 } 02401 02402 02412 static const xmlChar * 02413 htmlParseName(htmlParserCtxtPtr ctxt) { 02414 const xmlChar *in; 02415 const xmlChar *ret; 02416 int count = 0; 02417 02418 GROW; 02419 02420 /* 02421 * Accelerator for simple ASCII names 02422 */ 02423 in = ctxt->input->cur; 02424 if (((*in >= 0x61) && (*in <= 0x7A)) || 02425 ((*in >= 0x41) && (*in <= 0x5A)) || 02426 (*in == '_') || (*in == ':')) { 02427 in++; 02428 while (((*in >= 0x61) && (*in <= 0x7A)) || 02429 ((*in >= 0x41) && (*in <= 0x5A)) || 02430 ((*in >= 0x30) && (*in <= 0x39)) || 02431 (*in == '_') || (*in == '-') || 02432 (*in == ':') || (*in == '.')) 02433 in++; 02434 if ((*in > 0) && (*in < 0x80)) { 02435 count = in - ctxt->input->cur; 02436 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 02437 ctxt->input->cur = in; 02438 ctxt->nbChars += count; 02439 ctxt->input->col += count; 02440 return(ret); 02441 } 02442 } 02443 return(htmlParseNameComplex(ctxt)); 02444 } 02445 02446 
static const xmlChar * 02447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 02448 int len = 0, l; 02449 int c; 02450 int count = 0; 02451 02452 /* 02453 * Handler for more complex cases 02454 */ 02455 GROW; 02456 c = CUR_CHAR(l); 02457 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 02458 (!IS_LETTER(c) && (c != '_') && 02459 (c != ':'))) { 02460 return(NULL); 02461 } 02462 02463 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 02464 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 02465 (c == '.') || (c == '-') || 02466 (c == '_') || (c == ':') || 02467 (IS_COMBINING(c)) || 02468 (IS_EXTENDER(c)))) { 02469 if (count++ > 100) { 02470 count = 0; 02471 GROW; 02472 } 02473 len += l; 02474 NEXTL(l); 02475 c = CUR_CHAR(l); 02476 } 02477 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 02478 } 02479 02480 02492 static xmlChar * 02493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 02494 xmlChar *buffer = NULL; 02495 int buffer_size = 0; 02496 xmlChar *out = NULL; 02497 const xmlChar *name = NULL; 02498 const xmlChar *cur = NULL; 02499 const htmlEntityDesc * ent; 02500 02501 /* 02502 * allocate a translation buffer. 
02503 */ 02504 buffer_size = HTML_PARSER_BUFFER_SIZE; 02505 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 02506 if (buffer == NULL) { 02507 htmlErrMemory(ctxt, "buffer allocation failed\n"); 02508 return(NULL); 02509 } 02510 out = buffer; 02511 02512 /* 02513 * Ok loop until we reach one of the ending chars 02514 */ 02515 while ((CUR != 0) && (CUR != stop)) { 02516 if ((stop == 0) && (CUR == '>')) break; 02517 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 02518 if (CUR == '&') { 02519 if (NXT(1) == '#') { 02520 unsigned int c; 02521 int bits; 02522 02523 c = htmlParseCharRef(ctxt); 02524 if (c < 0x80) 02525 { *out++ = c; bits= -6; } 02526 else if (c < 0x800) 02527 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 02528 else if (c < 0x10000) 02529 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 02530 else 02531 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 02532 02533 for ( ; bits >= 0; bits-= 6) { 02534 *out++ = ((c >> bits) & 0x3F) | 0x80; 02535 } 02536 02537 if (out - buffer > buffer_size - 100) { 02538 int indx = out - buffer; 02539 02540 growBuffer(buffer); 02541 out = &buffer[indx]; 02542 } 02543 } else { 02544 ent = htmlParseEntityRef(ctxt, &name); 02545 if (name == NULL) { 02546 *out++ = '&'; 02547 if (out - buffer > buffer_size - 100) { 02548 int indx = out - buffer; 02549 02550 growBuffer(buffer); 02551 out = &buffer[indx]; 02552 } 02553 } else if (ent == NULL) { 02554 *out++ = '&'; 02555 cur = name; 02556 while (*cur != 0) { 02557 if (out - buffer > buffer_size - 100) { 02558 int indx = out - buffer; 02559 02560 growBuffer(buffer); 02561 out = &buffer[indx]; 02562 } 02563 *out++ = *cur++; 02564 } 02565 } else { 02566 unsigned int c; 02567 int bits; 02568 02569 if (out - buffer > buffer_size - 100) { 02570 int indx = out - buffer; 02571 02572 growBuffer(buffer); 02573 out = &buffer[indx]; 02574 } 02575 c = ent->value; 02576 if (c < 0x80) 02577 { *out++ = c; bits= -6; } 02578 else if (c < 0x800) 02579 { *out++ =((c >> 6) & 0x1F) | 
0xC0; bits= 0; } 02580 else if (c < 0x10000) 02581 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 02582 else 02583 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 02584 02585 for ( ; bits >= 0; bits-= 6) { 02586 *out++ = ((c >> bits) & 0x3F) | 0x80; 02587 } 02588 } 02589 } 02590 } else { 02591 unsigned int c; 02592 int bits, l; 02593 02594 if (out - buffer > buffer_size - 100) { 02595 int indx = out - buffer; 02596 02597 growBuffer(buffer); 02598 out = &buffer[indx]; 02599 } 02600 c = CUR_CHAR(l); 02601 if (c < 0x80) 02602 { *out++ = c; bits= -6; } 02603 else if (c < 0x800) 02604 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 02605 else if (c < 0x10000) 02606 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 02607 else 02608 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 02609 02610 for ( ; bits >= 0; bits-= 6) { 02611 *out++ = ((c >> bits) & 0x3F) | 0x80; 02612 } 02613 NEXT; 02614 } 02615 } 02616 *out = 0; 02617 return(buffer); 02618 } 02619 02632 const htmlEntityDesc * 02633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 02634 const xmlChar *name; 02635 const htmlEntityDesc * ent = NULL; 02636 02637 if (str != NULL) *str = NULL; 02638 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 02639 02640 if (CUR == '&') { 02641 NEXT; 02642 name = htmlParseName(ctxt); 02643 if (name == NULL) { 02644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 02645 "htmlParseEntityRef: no name\n", NULL, NULL); 02646 } else { 02647 GROW; 02648 if (CUR == ';') { 02649 if (str != NULL) 02650 *str = name; 02651 02652 /* 02653 * Lookup the entity in the table. 02654 */ 02655 ent = htmlEntityLookup(name); 02656 if (ent != NULL) /* OK that's ugly !!! 
*/ 02657 NEXT; 02658 } else { 02659 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 02660 "htmlParseEntityRef: expecting ';'\n", 02661 NULL, NULL); 02662 if (str != NULL) 02663 *str = name; 02664 } 02665 } 02666 } 02667 return(ent); 02668 } 02669 02682 static xmlChar * 02683 htmlParseAttValue(htmlParserCtxtPtr ctxt) { 02684 xmlChar *ret = NULL; 02685 02686 if (CUR == '"') { 02687 NEXT; 02688 ret = htmlParseHTMLAttribute(ctxt, '"'); 02689 if (CUR != '"') { 02690 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 02691 "AttValue: \" expected\n", NULL, NULL); 02692 } else 02693 NEXT; 02694 } else if (CUR == '\'') { 02695 NEXT; 02696 ret = htmlParseHTMLAttribute(ctxt, '\''); 02697 if (CUR != '\'') { 02698 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 02699 "AttValue: ' expected\n", NULL, NULL); 02700 } else 02701 NEXT; 02702 } else { 02703 /* 02704 * That's an HTMLism, the attribute value may not be quoted 02705 */ 02706 ret = htmlParseHTMLAttribute(ctxt, 0); 02707 if (ret == NULL) { 02708 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 02709 "AttValue: no value found\n", NULL, NULL); 02710 } 02711 } 02712 return(ret); 02713 } 02714 02726 static xmlChar * 02727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 02728 const xmlChar *q; 02729 xmlChar *ret = NULL; 02730 02731 if (CUR == '"') { 02732 NEXT; 02733 q = CUR_PTR; 02734 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 02735 NEXT; 02736 if (!IS_CHAR_CH(CUR)) { 02737 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 02738 "Unfinished SystemLiteral\n", NULL, NULL); 02739 } else { 02740 ret = xmlStrndup(q, CUR_PTR - q); 02741 NEXT; 02742 } 02743 } else if (CUR == '\'') { 02744 NEXT; 02745 q = CUR_PTR; 02746 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 02747 NEXT; 02748 if (!IS_CHAR_CH(CUR)) { 02749 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 02750 "Unfinished SystemLiteral\n", NULL, NULL); 02751 } else { 02752 ret = xmlStrndup(q, CUR_PTR - q); 02753 NEXT; 02754 } 02755 } else { 02756 htmlParseErr(ctxt, 
XML_ERR_LITERAL_NOT_STARTED, 02757 " or ' expected\n", NULL, NULL); 02758 } 02759 02760 return(ret); 02761 } 02762 02774 static xmlChar * 02775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 02776 const xmlChar *q; 02777 xmlChar *ret = NULL; 02778 /* 02779 * Name ::= (Letter | '_') (NameChar)* 02780 */ 02781 if (CUR == '"') { 02782 NEXT; 02783 q = CUR_PTR; 02784 while (IS_PUBIDCHAR_CH(CUR)) NEXT; 02785 if (CUR != '"') { 02786 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 02787 "Unfinished PubidLiteral\n", NULL, NULL); 02788 } else { 02789 ret = xmlStrndup(q, CUR_PTR - q); 02790 NEXT; 02791 } 02792 } else if (CUR == '\'') { 02793 NEXT; 02794 q = CUR_PTR; 02795 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 02796 NEXT; 02797 if (CUR != '\'') { 02798 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 02799 "Unfinished PubidLiteral\n", NULL, NULL); 02800 } else { 02801 ret = xmlStrndup(q, CUR_PTR - q); 02802 NEXT; 02803 } 02804 } else { 02805 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 02806 "PubidLiteral \" or ' expected\n", NULL, NULL); 02807 } 02808 02809 return(ret); 02810 } 02811 02833 static void 02834 htmlParseScript(htmlParserCtxtPtr ctxt) { 02835 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 02836 int nbchar = 0; 02837 int cur,l; 02838 02839 SHRINK; 02840 cur = CUR_CHAR(l); 02841 while (IS_CHAR_CH(cur)) { 02842 if ((cur == '<') && (NXT(1) == '/')) { 02843 /* 02844 * One should break here, the specification is clear: 02845 * Authors should therefore escape "</" within the content. 02846 * Escape mechanisms are specific to each scripting or 02847 * style sheet language. 02848 * 02849 * In recovery mode, only break if end tag match the 02850 * current tag, effectively ignoring all tags inside the 02851 * script/style block and treating the entire block as 02852 * CDATA. 
02853 */ 02854 if (ctxt->recovery) { 02855 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 02856 xmlStrlen(ctxt->name)) == 0) 02857 { 02858 break; /* while */ 02859 } else { 02860 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 02861 "Element %s embeds close tag\n", 02862 ctxt->name, NULL); 02863 } 02864 } else { 02865 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 02866 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 02867 { 02868 break; /* while */ 02869 } 02870 } 02871 } 02872 COPY_BUF(l,buf,nbchar,cur); 02873 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 02874 if (ctxt->sax->cdataBlock!= NULL) { 02875 /* 02876 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 02877 */ 02878 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 02879 } else if (ctxt->sax->characters != NULL) { 02880 ctxt->sax->characters(ctxt->userData, buf, nbchar); 02881 } 02882 nbchar = 0; 02883 } 02884 GROW; 02885 NEXTL(l); 02886 cur = CUR_CHAR(l); 02887 } 02888 02889 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 02890 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 02891 "Invalid char in CDATA 0x%X\n", cur); 02892 if (ctxt->input->cur < ctxt->input->end) { 02893 NEXT; 02894 } 02895 } 02896 02897 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 02898 if (ctxt->sax->cdataBlock!= NULL) { 02899 /* 02900 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 02901 */ 02902 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 02903 } else if (ctxt->sax->characters != NULL) { 02904 ctxt->sax->characters(ctxt->userData, buf, nbchar); 02905 } 02906 } 02907 } 02908 02909 02920 static void 02921 htmlParseCharData(htmlParserCtxtPtr ctxt) { 02922 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 02923 int nbchar = 0; 02924 int cur, l; 02925 int chunk = 0; 02926 02927 SHRINK; 02928 cur = CUR_CHAR(l); 02929 while (((cur != '<') || (ctxt->token == '<')) && 02930 ((cur != '&') || (ctxt->token == '&')) && 02931 (cur != 0)) { 02932 if (!(IS_CHAR(cur))) { 02933 
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 02934 "Invalid char in CDATA 0x%X\n", cur); 02935 } else { 02936 COPY_BUF(l,buf,nbchar,cur); 02937 } 02938 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 02939 /* 02940 * Ok the segment is to be consumed as chars. 02941 */ 02942 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 02943 if (areBlanks(ctxt, buf, nbchar)) { 02944 if (ctxt->sax->ignorableWhitespace != NULL) 02945 ctxt->sax->ignorableWhitespace(ctxt->userData, 02946 buf, nbchar); 02947 } else { 02948 htmlCheckParagraph(ctxt); 02949 if (ctxt->sax->characters != NULL) 02950 ctxt->sax->characters(ctxt->userData, buf, nbchar); 02951 } 02952 } 02953 nbchar = 0; 02954 } 02955 NEXTL(l); 02956 chunk++; 02957 if (chunk > HTML_PARSER_BUFFER_SIZE) { 02958 chunk = 0; 02959 SHRINK; 02960 GROW; 02961 } 02962 cur = CUR_CHAR(l); 02963 if (cur == 0) { 02964 SHRINK; 02965 GROW; 02966 cur = CUR_CHAR(l); 02967 } 02968 } 02969 if (nbchar != 0) { 02970 buf[nbchar] = 0; 02971 02972 /* 02973 * Ok the segment is to be consumed as chars. 
02974 */ 02975 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 02976 if (areBlanks(ctxt, buf, nbchar)) { 02977 if (ctxt->sax->ignorableWhitespace != NULL) 02978 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); 02979 } else { 02980 htmlCheckParagraph(ctxt); 02981 if (ctxt->sax->characters != NULL) 02982 ctxt->sax->characters(ctxt->userData, buf, nbchar); 02983 } 02984 } 02985 } else { 02986 /* 02987 * Loop detection 02988 */ 02989 if (cur == 0) 02990 ctxt->instate = XML_PARSER_EOF; 02991 } 02992 } 02993 03011 static xmlChar * 03012 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 03013 xmlChar *URI = NULL; 03014 03015 if ((UPPER == 'S') && (UPP(1) == 'Y') && 03016 (UPP(2) == 'S') && (UPP(3) == 'T') && 03017 (UPP(4) == 'E') && (UPP(5) == 'M')) { 03018 SKIP(6); 03019 if (!IS_BLANK_CH(CUR)) { 03020 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 03021 "Space required after 'SYSTEM'\n", NULL, NULL); 03022 } 03023 SKIP_BLANKS; 03024 URI = htmlParseSystemLiteral(ctxt); 03025 if (URI == NULL) { 03026 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 03027 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 03028 } 03029 } else if ((UPPER == 'P') && (UPP(1) == 'U') && 03030 (UPP(2) == 'B') && (UPP(3) == 'L') && 03031 (UPP(4) == 'I') && (UPP(5) == 'C')) { 03032 SKIP(6); 03033 if (!IS_BLANK_CH(CUR)) { 03034 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 03035 "Space required after 'PUBLIC'\n", NULL, NULL); 03036 } 03037 SKIP_BLANKS; 03038 *publicID = htmlParsePubidLiteral(ctxt); 03039 if (*publicID == NULL) { 03040 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 03041 "htmlParseExternalID: PUBLIC, no Public Identifier\n", 03042 NULL, NULL); 03043 } 03044 SKIP_BLANKS; 03045 if ((CUR == '"') || (CUR == '\'')) { 03046 URI = htmlParseSystemLiteral(ctxt); 03047 } 03048 } 03049 return(URI); 03050 } 03051 03060 static void 03061 htmlParsePI(htmlParserCtxtPtr ctxt) { 03062 xmlChar *buf = NULL; 03063 int len = 0; 03064 int size = HTML_PARSER_BUFFER_SIZE; 03065 int cur, l; 
03066 const xmlChar *target; 03067 xmlParserInputState state; 03068 int count = 0; 03069 03070 if ((RAW == '<') && (NXT(1) == '?')) { 03071 state = ctxt->instate; 03072 ctxt->instate = XML_PARSER_PI; 03073 /* 03074 * this is a Processing Instruction. 03075 */ 03076 SKIP(2); 03077 SHRINK; 03078 03079 /* 03080 * Parse the target name and check for special support like 03081 * namespace. 03082 */ 03083 target = htmlParseName(ctxt); 03084 if (target != NULL) { 03085 if (RAW == '>') { 03086 SKIP(1); 03087 03088 /* 03089 * SAX: PI detected. 03090 */ 03091 if ((ctxt->sax) && (!ctxt->disableSAX) && 03092 (ctxt->sax->processingInstruction != NULL)) 03093 ctxt->sax->processingInstruction(ctxt->userData, 03094 target, NULL); 03095 ctxt->instate = state; 03096 return; 03097 } 03098 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 03099 if (buf == NULL) { 03100 htmlErrMemory(ctxt, NULL); 03101 ctxt->instate = state; 03102 return; 03103 } 03104 cur = CUR; 03105 if (!IS_BLANK(cur)) { 03106 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 03107 "ParsePI: PI %s space expected\n", target, NULL); 03108 } 03109 SKIP_BLANKS; 03110 cur = CUR_CHAR(l); 03111 while (IS_CHAR(cur) && (cur != '>')) { 03112 if (len + 5 >= size) { 03113 xmlChar *tmp; 03114 03115 size *= 2; 03116 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 03117 if (tmp == NULL) { 03118 htmlErrMemory(ctxt, NULL); 03119 xmlFree(buf); 03120 ctxt->instate = state; 03121 return; 03122 } 03123 buf = tmp; 03124 } 03125 count++; 03126 if (count > 50) { 03127 GROW; 03128 count = 0; 03129 } 03130 COPY_BUF(l,buf,len,cur); 03131 NEXTL(l); 03132 cur = CUR_CHAR(l); 03133 if (cur == 0) { 03134 SHRINK; 03135 GROW; 03136 cur = CUR_CHAR(l); 03137 } 03138 } 03139 buf[len] = 0; 03140 if (cur != '>') { 03141 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 03142 "ParsePI: PI %s never end ...\n", target, NULL); 03143 } else { 03144 SKIP(1); 03145 03146 /* 03147 * SAX: PI detected. 
03148 */ 03149 if ((ctxt->sax) && (!ctxt->disableSAX) && 03150 (ctxt->sax->processingInstruction != NULL)) 03151 ctxt->sax->processingInstruction(ctxt->userData, 03152 target, buf); 03153 } 03154 xmlFree(buf); 03155 } else { 03156 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 03157 "PI is not started correctly", NULL, NULL); 03158 } 03159 ctxt->instate = state; 03160 } 03161 } 03162 03171 static void 03172 htmlParseComment(htmlParserCtxtPtr ctxt) { 03173 xmlChar *buf = NULL; 03174 int len; 03175 int size = HTML_PARSER_BUFFER_SIZE; 03176 int q, ql; 03177 int r, rl; 03178 int cur, l; 03179 xmlParserInputState state; 03180 03181 /* 03182 * Check that there is a comment right here. 03183 */ 03184 if ((RAW != '<') || (NXT(1) != '!') || 03185 (NXT(2) != '-') || (NXT(3) != '-')) return; 03186 03187 state = ctxt->instate; 03188 ctxt->instate = XML_PARSER_COMMENT; 03189 SHRINK; 03190 SKIP(4); 03191 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 03192 if (buf == NULL) { 03193 htmlErrMemory(ctxt, "buffer allocation failed\n"); 03194 ctxt->instate = state; 03195 return; 03196 } 03197 q = CUR_CHAR(ql); 03198 NEXTL(ql); 03199 r = CUR_CHAR(rl); 03200 NEXTL(rl); 03201 cur = CUR_CHAR(l); 03202 len = 0; 03203 while (IS_CHAR(cur) && 03204 ((cur != '>') || 03205 (r != '-') || (q != '-'))) { 03206 if (len + 5 >= size) { 03207 xmlChar *tmp; 03208 03209 size *= 2; 03210 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 03211 if (tmp == NULL) { 03212 xmlFree(buf); 03213 htmlErrMemory(ctxt, "growing buffer failed\n"); 03214 ctxt->instate = state; 03215 return; 03216 } 03217 buf = tmp; 03218 } 03219 COPY_BUF(ql,buf,len,q); 03220 q = r; 03221 ql = rl; 03222 r = cur; 03223 rl = l; 03224 NEXTL(l); 03225 cur = CUR_CHAR(l); 03226 if (cur == 0) { 03227 SHRINK; 03228 GROW; 03229 cur = CUR_CHAR(l); 03230 } 03231 } 03232 buf[len] = 0; 03233 if (!IS_CHAR(cur)) { 03234 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 03235 "Comment not terminated \n<!--%.50s\n", buf, NULL); 03236 
xmlFree(buf); 03237 } else { 03238 NEXT; 03239 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 03240 (!ctxt->disableSAX)) 03241 ctxt->sax->comment(ctxt->userData, buf); 03242 xmlFree(buf); 03243 } 03244 ctxt->instate = state; 03245 } 03246
/**
 * htmlParseCharRef:
 * @ctxt:  an HTML parser context
 *
 * Parse a numeric character reference at the current input position:
 *
 *   '&#' [0-9]+ ';'   |   '&#x' [0-9a-fA-F]+ ';'     ('x' or 'X')
 *
 * Digits are consumed up to the first ';' (which is also consumed) or up
 * to the first character that is not a valid digit, in which case an
 * error is reported and that character is left in the input.
 *
 * The accumulator stops growing once it has left the Unicode range
 * (>= 0x110000); the remaining digits are still consumed.  This keeps an
 * arbitrarily long digit run from overflowing the signed int, which would
 * be undefined behaviour.
 *
 * Returns the referenced code point, or 0 if the context is unusable, the
 * input is not a character reference, or the value is not a valid char.
 */
int
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
    int val = 0;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                     "htmlParseCharRef: context error\n",
                     NULL, NULL);
        return(0);
    }
    if ((CUR == '&') && (NXT(1) == '#') &&
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
        SKIP(3);
        while (CUR != ';') {
            if ((CUR >= '0') && (CUR <= '9')) {
                if (val < 0x110000)
                    val = val * 16 + (CUR - '0');
            } else if ((CUR >= 'a') && (CUR <= 'f')) {
                if (val < 0x110000)
                    val = val * 16 + (CUR - 'a') + 10;
            } else if ((CUR >= 'A') && (CUR <= 'F')) {
                if (val < 0x110000)
                    val = val * 16 + (CUR - 'A') + 10;
            } else {
                htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
                             "htmlParseCharRef: missing semicolon\n",
                             NULL, NULL);
                break;
            }
            NEXT;
        }
        if (CUR == ';')
            NEXT;
    } else if ((CUR == '&') && (NXT(1) == '#')) {
        SKIP(2);
        while (CUR != ';') {
            if ((CUR >= '0') && (CUR <= '9')) {
                if (val < 0x110000)
                    val = val * 10 + (CUR - '0');
            } else {
                htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
                             "htmlParseCharRef: missing semicolon\n",
                             NULL, NULL);
                break;
            }
            NEXT;
        }
        if (CUR == ';')
            NEXT;
    } else {
        htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
                     "htmlParseCharRef: invalid value\n", NULL, NULL);
    }
    /*
     * Check the value IS_CHAR ...
     */
    if (IS_CHAR(val)) {
        return(val);
    } else if (val >= 0x110000) {
        /* The accumulator was capped, so val is not the written number. */
        htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
                     "htmlParseCharRef: value too large\n", NULL, NULL);
    } else {
        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                        "htmlParseCharRef: invalid xmlChar value %d\n",
                        val);
    }
    return(0);
}
03319 03320 03331 static void 03332 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 03333 const xmlChar *name; 03334 xmlChar *ExternalID = NULL; 03335 xmlChar *URI = NULL; 03336 03337 /* 03338 * We know that '<!DOCTYPE' has been detected. 03339 */ 03340 SKIP(9); 03341 03342 SKIP_BLANKS; 03343 03344 /* 03345 * Parse the DOCTYPE name. 03346 */ 03347 name = htmlParseName(ctxt); 03348 if (name == NULL) { 03349 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 03350 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 03351 NULL, NULL); 03352 } 03353 /* 03354 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 03355 */ 03356 03357 SKIP_BLANKS; 03358 03359 /* 03360 * Check for SystemID and ExternalID 03361 */ 03362 URI = htmlParseExternalID(ctxt, &ExternalID); 03363 SKIP_BLANKS; 03364 03365 /* 03366 * We should be at the end of the DOCTYPE declaration. 03367 */ 03368 if (CUR != '>') { 03369 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 03370 "DOCTYPE improperly terminated\n", NULL, NULL); 03371 /* We shouldn't try to resynchronize ...
*/ 03372 } 03373 NEXT; 03374 03375 /* 03376 * Create or update the document accordingly to the DOCTYPE 03377 */ 03378 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 03379 (!ctxt->disableSAX)) 03380 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 03381 03382 /* 03383 * Cleanup, since we don't use all those identifiers 03384 */ 03385 if (URI != NULL) xmlFree(URI); 03386 if (ExternalID != NULL) xmlFree(ExternalID); 03387 } 03388 03410 static const xmlChar * 03411 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 03412 const xmlChar *name; 03413 xmlChar *val = NULL; 03414 03415 *value = NULL; 03416 name = htmlParseHTMLName(ctxt); 03417 if (name == NULL) { 03418 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 03419 "error parsing attribute name\n", NULL, NULL); 03420 return(NULL); 03421 } 03422 03423 /* 03424 * read the value 03425 */ 03426 SKIP_BLANKS; 03427 if (CUR == '=') { 03428 NEXT; 03429 SKIP_BLANKS; 03430 val = htmlParseAttValue(ctxt); 03431 } 03432 03433 *value = val; 03434 return(name); 03435 } 03436 03447 static void 03448 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 03449 const xmlChar *encoding; 03450 03451 if ((ctxt == NULL) || (attvalue == NULL)) 03452 return; 03453 03454 /* do not change encoding */ 03455 if (ctxt->input->encoding != NULL) 03456 return; 03457 03458 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); 03459 if (encoding != NULL) { 03460 encoding += 8; 03461 } else { 03462 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); 03463 if (encoding != NULL) 03464 encoding += 9; 03465 } 03466 if (encoding != NULL) { 03467 xmlCharEncoding enc; 03468 xmlCharEncodingHandlerPtr handler; 03469 03470 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 03471 03472 if (ctxt->input->encoding != NULL) 03473 xmlFree((xmlChar *) ctxt->input->encoding); 03474 ctxt->input->encoding = xmlStrdup(encoding); 03475 03476 enc = xmlParseCharEncoding((const char *) encoding); 03477 /* 
03478 * registered set of known encodings 03479 */ 03480 if (enc != XML_CHAR_ENCODING_ERROR) { 03481 if (((enc == XML_CHAR_ENCODING_UTF16LE) || 03482 (enc == XML_CHAR_ENCODING_UTF16BE) || 03483 (enc == XML_CHAR_ENCODING_UCS4LE) || 03484 (enc == XML_CHAR_ENCODING_UCS4BE)) && 03485 (ctxt->input->buf != NULL) && 03486 (ctxt->input->buf->encoder == NULL)) { 03487 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 03488 "htmlCheckEncoding: wrong encoding meta\n", 03489 NULL, NULL); 03490 } else { 03491 xmlSwitchEncoding(ctxt, enc); 03492 } 03493 ctxt->charset = XML_CHAR_ENCODING_UTF8; 03494 } else { 03495 /* 03496 * fallback for unknown encodings 03497 */ 03498 handler = xmlFindCharEncodingHandler((const char *) encoding); 03499 if (handler != NULL) { 03500 xmlSwitchToEncoding(ctxt, handler); 03501 ctxt->charset = XML_CHAR_ENCODING_UTF8; 03502 } else { 03503 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; 03504 } 03505 } 03506 03507 if ((ctxt->input->buf != NULL) && 03508 (ctxt->input->buf->encoder != NULL) && 03509 (ctxt->input->buf->raw != NULL) && 03510 (ctxt->input->buf->buffer != NULL)) { 03511 int nbchars; 03512 int processed; 03513 03514 /* 03515 * convert as much as possible to the parser reading buffer. 
03516 */ 03517 processed = ctxt->input->cur - ctxt->input->base; 03518 xmlBufferShrink(ctxt->input->buf->buffer, processed); 03519 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, 03520 ctxt->input->buf->buffer, 03521 ctxt->input->buf->raw); 03522 if (nbchars < 0) { 03523 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 03524 "htmlCheckEncoding: encoder error\n", 03525 NULL, NULL); 03526 } 03527 ctxt->input->base = 03528 ctxt->input->cur = ctxt->input->buf->buffer->content; 03529 ctxt->input->end = 03530 &ctxt->input->base[ctxt->input->buf->buffer->use]; 03531 } 03532 } 03533 } 03534 03542 static void 03543 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 03544 int i; 03545 const xmlChar *att, *value; 03546 int http = 0; 03547 const xmlChar *content = NULL; 03548 03549 if ((ctxt == NULL) || (atts == NULL)) 03550 return; 03551 03552 i = 0; 03553 att = atts[i++]; 03554 while (att != NULL) { 03555 value = atts[i++]; 03556 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 03557 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 03558 http = 1; 03559 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 03560 content = value; 03561 att = atts[i++]; 03562 } 03563 if ((http) && (content != NULL)) 03564 htmlCheckEncoding(ctxt, content); 03565 03566 } 03567 03588 static int 03589 htmlParseStartTag(htmlParserCtxtPtr ctxt) { 03590 const xmlChar *name; 03591 const xmlChar *attname; 03592 xmlChar *attvalue; 03593 const xmlChar **atts; 03594 int nbatts = 0; 03595 int maxatts; 03596 int meta = 0; 03597 int i; 03598 int discardtag = 0; 03599 03600 if (ctxt->instate == XML_PARSER_EOF) 03601 return(-1); 03602 if ((ctxt == NULL) || (ctxt->input == NULL)) { 03603 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 03604 "htmlParseStartTag: context error\n", NULL, NULL); 03605 return -1; 03606 } 03607 if (CUR != '<') return -1; 03608 NEXT; 03609 03610 atts = ctxt->atts; 03611 maxatts = ctxt->maxatts; 03612 03613 GROW; 03614 name = 
htmlParseHTMLName(ctxt); 03615 if (name == NULL) { 03616 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 03617 "htmlParseStartTag: invalid element name\n", 03618 NULL, NULL); 03619 /* Dump the bogus tag like browsers do */ 03620 while ((IS_CHAR_CH(CUR)) && (CUR != '>') && 03621 (ctxt->instate != XML_PARSER_EOF)) 03622 NEXT; 03623 return -1; 03624 } 03625 if (xmlStrEqual(name, BAD_CAST"meta")) 03626 meta = 1; 03627 03628 /* 03629 * Check for auto-closure of HTML elements. 03630 */ 03631 htmlAutoClose(ctxt, name); 03632 03633 /* 03634 * Check for implied HTML elements. 03635 */ 03636 htmlCheckImplied(ctxt, name); 03637 03638 /* 03639 * Avoid html at any level > 0, head at any level != 1 03640 * or any attempt to recurse body 03641 */ 03642 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 03643 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 03644 "htmlParseStartTag: misplaced <html> tag\n", 03645 name, NULL); 03646 discardtag = 1; 03647 ctxt->depth++; 03648 } 03649 if ((ctxt->nameNr != 1) && 03650 (xmlStrEqual(name, BAD_CAST"head"))) { 03651 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 03652 "htmlParseStartTag: misplaced <head> tag\n", 03653 name, NULL); 03654 discardtag = 1; 03655 ctxt->depth++; 03656 } 03657 if (xmlStrEqual(name, BAD_CAST"body")) { 03658 int indx; 03659 for (indx = 0;indx < ctxt->nameNr;indx++) { 03660 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 03661 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 03662 "htmlParseStartTag: misplaced <body> tag\n", 03663 name, NULL); 03664 discardtag = 1; 03665 ctxt->depth++; 03666 } 03667 } 03668 } 03669 03670 /* 03671 * Now parse the attributes, it ends up with the ending 03672 * 03673 * (S Attribute)* S? 
03674 */ 03675 SKIP_BLANKS; 03676 while ((IS_CHAR_CH(CUR)) && 03677 (CUR != '>') && 03678 ((CUR != '/') || (NXT(1) != '>'))) { 03679 long cons = ctxt->nbChars; 03680 03681 GROW; 03682 attname = htmlParseAttribute(ctxt, &attvalue); 03683 if (attname != NULL) { 03684 03685 /* 03686 * Well formedness requires at most one declaration of an attribute 03687 */ 03688 for (i = 0; i < nbatts;i += 2) { 03689 if (xmlStrEqual(atts[i], attname)) { 03690 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 03691 "Attribute %s redefined\n", attname, NULL); 03692 if (attvalue != NULL) 03693 xmlFree(attvalue); 03694 goto failed; 03695 } 03696 } 03697 03698 /* 03699 * Add the pair to atts 03700 */ 03701 if (atts == NULL) { 03702 maxatts = 22; /* allow for 10 attrs by default */ 03703 atts = (const xmlChar **) 03704 xmlMalloc(maxatts * sizeof(xmlChar *)); 03705 if (atts == NULL) { 03706 htmlErrMemory(ctxt, NULL); 03707 if (attvalue != NULL) 03708 xmlFree(attvalue); 03709 goto failed; 03710 } 03711 ctxt->atts = atts; 03712 ctxt->maxatts = maxatts; 03713 } else if (nbatts + 4 > maxatts) { 03714 const xmlChar **n; 03715 03716 maxatts *= 2; 03717 n = (const xmlChar **) xmlRealloc((void *) atts, 03718 maxatts * sizeof(const xmlChar *)); 03719 if (n == NULL) { 03720 htmlErrMemory(ctxt, NULL); 03721 if (attvalue != NULL) 03722 xmlFree(attvalue); 03723 goto failed; 03724 } 03725 atts = n; 03726 ctxt->atts = atts; 03727 ctxt->maxatts = maxatts; 03728 } 03729 atts[nbatts++] = attname; 03730 atts[nbatts++] = attvalue; 03731 atts[nbatts] = NULL; 03732 atts[nbatts + 1] = NULL; 03733 } 03734 else { 03735 if (attvalue != NULL) 03736 xmlFree(attvalue); 03737 /* Dump the bogus attribute string up to the next blank or 03738 * the end of the tag. 
*/ 03739 while ((IS_CHAR_CH(CUR)) && 03740 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 03741 ((CUR != '/') || (NXT(1) != '>'))) 03742 NEXT; 03743 } 03744 03745 failed: 03746 SKIP_BLANKS; 03747 if (cons == ctxt->nbChars) { 03748 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 03749 "htmlParseStartTag: problem parsing attributes\n", 03750 NULL, NULL); 03751 break; 03752 } 03753 } 03754 03755 /* 03756 * Handle specific association to the META tag 03757 */ 03758 if (meta && (nbatts != 0)) 03759 htmlCheckMeta(ctxt, atts); 03760 03761 /* 03762 * SAX: Start of Element ! 03763 */ 03764 if (!discardtag) { 03765 htmlnamePush(ctxt, name); 03766 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 03767 if (nbatts != 0) 03768 ctxt->sax->startElement(ctxt->userData, name, atts); 03769 else 03770 ctxt->sax->startElement(ctxt->userData, name, NULL); 03771 } 03772 } 03773 03774 if (atts != NULL) { 03775 for (i = 1;i < nbatts;i += 2) { 03776 if (atts[i] != NULL) 03777 xmlFree((xmlChar *) atts[i]); 03778 } 03779 } 03780 03781 return(discardtag); 03782 } 03783 03799 static int 03800 htmlParseEndTag(htmlParserCtxtPtr ctxt) 03801 { 03802 const xmlChar *name; 03803 const xmlChar *oldname; 03804 int i, ret; 03805 03806 if ((CUR != '<') || (NXT(1) != '/')) { 03807 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 03808 "htmlParseEndTag: '</' not found\n", NULL, NULL); 03809 return (0); 03810 } 03811 SKIP(2); 03812 03813 name = htmlParseHTMLName(ctxt); 03814 if (name == NULL) 03815 return (0); 03816 /* 03817 * We should definitely be at the ending "S? '>'" part 03818 */ 03819 SKIP_BLANKS; 03820 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 03821 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 03822 "End tag : expected '>'\n", NULL, NULL); 03823 if (ctxt->recovery) { 03824 /* 03825 * We're not at the ending > !! 
03826 * Error, unless in recover mode where we search forwards 03827 * until we find a > 03828 */ 03829 while (CUR != '\0' && CUR != '>') NEXT; 03830 NEXT; 03831 } 03832 } else 03833 NEXT; 03834 03835 /* 03836 * if we ignored misplaced tags in htmlParseStartTag don't pop them 03837 * out now. 03838 */ 03839 if ((ctxt->depth > 0) && 03840 (xmlStrEqual(name, BAD_CAST "html") || 03841 xmlStrEqual(name, BAD_CAST "body") || 03842 xmlStrEqual(name, BAD_CAST "head"))) { 03843 ctxt->depth--; 03844 return (0); 03845 } 03846 03847 /* 03848 * If the name read is not one of the element in the parsing stack 03849 * then return, it's just an error. 03850 */ 03851 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 03852 if (xmlStrEqual(name, ctxt->nameTab[i])) 03853 break; 03854 } 03855 if (i < 0) { 03856 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 03857 "Unexpected end tag : %s\n", name, NULL); 03858 return (0); 03859 } 03860 03861 03862 /* 03863 * Check for auto-closure of HTML elements. 03864 */ 03865 03866 htmlAutoCloseOnClose(ctxt, name); 03867 03868 /* 03869 * Well formedness constraints, opening and closing must match. 03870 * With the exception that the autoclose may have popped stuff out 03871 * of the stack. 
03872 */ 03873 if (!xmlStrEqual(name, ctxt->name)) { 03874 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 03875 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 03876 "Opening and ending tag mismatch: %s and %s\n", 03877 name, ctxt->name); 03878 } 03879 } 03880 03881 /* 03882 * SAX: End of Tag 03883 */ 03884 oldname = ctxt->name; 03885 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 03886 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 03887 ctxt->sax->endElement(ctxt->userData, name); 03888 htmlnamePop(ctxt); 03889 ret = 1; 03890 } else { 03891 ret = 0; 03892 } 03893 03894 return (ret); 03895 } 03896 03897
/*
 * htmlEmitUTF8Char:
 * @ctxt:  an HTML parser context
 * @c:  the code point to deliver
 *
 * Encode @c as a 1 to 4 byte UTF-8 sequence and pass it to the SAX
 * characters callback, after htmlCheckParagraph() has run.
 */
static void
htmlEmitUTF8Char(htmlParserCtxtPtr ctxt, unsigned int c) {
    xmlChar out[6];
    int bits, i = 0;

    /* Lead byte; bits is the shift of the first continuation byte. */
    if      (c <    0x80) { out[i++] = c;                         bits = -6; }
    else if (c <   0x800) { out[i++] = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
    else if (c < 0x10000) { out[i++] = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
    else                  { out[i++] = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

    for ( ; bits >= 0; bits -= 6) {
        out[i++] = ((c >> bits) & 0x3F) | 0x80;
    }
    out[i] = 0;

    htmlCheckParagraph(ctxt);
    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
        ctxt->sax->characters(ctxt->userData, out, i);
}

/**
 * htmlParseReference:
 * @ctxt:  an HTML parser context
 *
 * Handle a reference starting at '&' and deliver the result through the
 * SAX characters callback:
 *  - numeric reference ("&#..."): the code point returned by
 *    htmlParseCharRef(), UTF-8 encoded; nothing is emitted when it
 *    returns 0;
 *  - entity reference with no name parsed: a literal "&";
 *  - entity name that is unknown or has no positive value: "&" followed
 *    by the name as read (no ';' is emitted);
 *  - known entity: its value, UTF-8 encoded.
 *
 * Does nothing if the current character is not '&'.
 */
static void
htmlParseReference(htmlParserCtxtPtr ctxt) {
    const htmlEntityDesc * ent;
    const xmlChar *name;
    unsigned int c;

    if (CUR != '&') return;

    if (NXT(1) == '#') {
        c = htmlParseCharRef(ctxt);
        if (c == 0)
            return;
        htmlEmitUTF8Char(ctxt, c);
        return;
    }

    ent = htmlParseEntityRef(ctxt, &name);
    if (name == NULL) {
        htmlCheckParagraph(ctxt);
        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
            ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
        return;
    }
    if ((ent == NULL) || !(ent->value > 0)) {
        htmlCheckParagraph(ctxt);
        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
            ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
            ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
            /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
        }
    } else {
        c = ent->value;
        htmlEmitUTF8Char(ctxt, c);
    }
}
03974 03983 static void 03984 htmlParseContent(htmlParserCtxtPtr ctxt) { 03985 xmlChar *currentNode; 03986 int depth; 03987 const xmlChar *name; 03988 03989 currentNode = xmlStrdup(ctxt->name); 03990 depth = ctxt->nameNr; 03991 while (1) { 03992 long cons = ctxt->nbChars; 03993 03994 GROW; 03995 03996 if (ctxt->instate == XML_PARSER_EOF) 03997 break; 03998 03999 /* 04000 * Our tag or one of it's parent or children is ending.
04001 */ 04002 if ((CUR == '<') && (NXT(1) == '/')) { 04003 if (htmlParseEndTag(ctxt) && 04004 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 04005 if (currentNode != NULL) 04006 xmlFree(currentNode); 04007 return; 04008 } 04009 continue; /* while */ 04010 } 04011 04012 else if ((CUR == '<') && 04013 ((IS_ASCII_LETTER(NXT(1))) || 04014 (NXT(1) == '_') || (NXT(1) == ':'))) { 04015 name = htmlParseHTMLName_nonInvasive(ctxt); 04016 if (name == NULL) { 04017 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 04018 "htmlParseStartTag: invalid element name\n", 04019 NULL, NULL); 04020 /* Dump the bogus tag like browsers do */ 04021 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 04022 NEXT; 04023 04024 if (currentNode != NULL) 04025 xmlFree(currentNode); 04026 return; 04027 } 04028 04029 if (ctxt->name != NULL) { 04030 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 04031 htmlAutoClose(ctxt, name); 04032 continue; 04033 } 04034 } 04035 } 04036 04037 /* 04038 * Has this node been popped out during parsing of 04039 * the next element 04040 */ 04041 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 04042 (!xmlStrEqual(currentNode, ctxt->name))) 04043 { 04044 if (currentNode != NULL) xmlFree(currentNode); 04045 return; 04046 } 04047 04048 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 04049 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 04050 /* 04051 * Handle SCRIPT/STYLE separately 04052 */ 04053 htmlParseScript(ctxt); 04054 } else { 04055 /* 04056 * Sometimes DOCTYPE arrives in the middle of the document 04057 */ 04058 if ((CUR == '<') && (NXT(1) == '!') && 04059 (UPP(2) == 'D') && (UPP(3) == 'O') && 04060 (UPP(4) == 'C') && (UPP(5) == 'T') && 04061 (UPP(6) == 'Y') && (UPP(7) == 'P') && 04062 (UPP(8) == 'E')) { 04063 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 04064 "Misplaced DOCTYPE declaration\n", 04065 BAD_CAST "DOCTYPE" , NULL); 04066 htmlParseDocTypeDecl(ctxt); 04067 } 04068 04069 /* 04070 * First case : a comment 04071 */ 04072 if ((CUR == '<') && 
(NXT(1) == '!') && 04073 (NXT(2) == '-') && (NXT(3) == '-')) { 04074 htmlParseComment(ctxt); 04075 } 04076 04077 /* 04078 * Second case : a Processing Instruction. 04079 */ 04080 else if ((CUR == '<') && (NXT(1) == '?')) { 04081 htmlParsePI(ctxt); 04082 } 04083 04084 /* 04085 * Third case : a sub-element. 04086 */ 04087 else if (CUR == '<') { 04088 htmlParseElement(ctxt); 04089 } 04090 04091 /* 04092 * Fourth case : a reference. If if has not been resolved, 04093 * parsing returns it's Name, create the node 04094 */ 04095 else if (CUR == '&') { 04096 htmlParseReference(ctxt); 04097 } 04098 04099 /* 04100 * Fifth case : end of the resource 04101 */ 04102 else if (CUR == 0) { 04103 htmlAutoCloseOnEnd(ctxt); 04104 break; 04105 } 04106 04107 /* 04108 * Last case, text. Note that References are handled directly. 04109 */ 04110 else { 04111 htmlParseCharData(ctxt); 04112 } 04113 04114 if (cons == ctxt->nbChars) { 04115 if (ctxt->node != NULL) { 04116 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 04117 "detected an error in element content\n", 04118 NULL, NULL); 04119 } 04120 break; 04121 } 04122 } 04123 GROW; 04124 } 04125 if (currentNode != NULL) xmlFree(currentNode); 04126 } 04127 04140 void 04141 htmlParseElement(htmlParserCtxtPtr ctxt) { 04142 const xmlChar *name; 04143 xmlChar *currentNode = NULL; 04144 const htmlElemDesc * info; 04145 htmlParserNodeInfo node_info; 04146 int failed; 04147 int depth; 04148 const xmlChar *oldptr; 04149 04150 if ((ctxt == NULL) || (ctxt->input == NULL)) { 04151 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 04152 "htmlParseElement: context error\n", NULL, NULL); 04153 return; 04154 } 04155 04156 if (ctxt->instate == XML_PARSER_EOF) 04157 return; 04158 04159 /* Capture start position */ 04160 if (ctxt->record_info) { 04161 node_info.begin_pos = ctxt->input->consumed + 04162 (CUR_PTR - ctxt->input->base); 04163 node_info.begin_line = ctxt->input->line; 04164 } 04165 04166 failed = htmlParseStartTag(ctxt); 04167 name = ctxt->name; 04168 if 
((failed == -1) || (name == NULL)) { 04169 if (CUR == '>') 04170 NEXT; 04171 return; 04172 } 04173 04174 /* 04175 * Lookup the info for that element. 04176 */ 04177 info = htmlTagLookup(name); 04178 if (info == NULL) { 04179 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 04180 "Tag %s invalid\n", name, NULL); 04181 } 04182 04183 /* 04184 * Check for an Empty Element labeled the XML/SGML way 04185 */ 04186 if ((CUR == '/') && (NXT(1) == '>')) { 04187 SKIP(2); 04188 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 04189 ctxt->sax->endElement(ctxt->userData, name); 04190 htmlnamePop(ctxt); 04191 return; 04192 } 04193 04194 if (CUR == '>') { 04195 NEXT; 04196 } else { 04197 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 04198 "Couldn't find end of Start Tag %s\n", name, NULL); 04199 04200 /* 04201 * end of parsing of this node. 04202 */ 04203 if (xmlStrEqual(name, ctxt->name)) { 04204 nodePop(ctxt); 04205 htmlnamePop(ctxt); 04206 } 04207 04208 /* 04209 * Capture end position and add node 04210 */ 04211 if (ctxt->record_info) { 04212 node_info.end_pos = ctxt->input->consumed + 04213 (CUR_PTR - ctxt->input->base); 04214 node_info.end_line = ctxt->input->line; 04215 node_info.node = ctxt->node; 04216 xmlParserAddNodeInfo(ctxt, &node_info); 04217 } 04218 return; 04219 } 04220 04221 /* 04222 * Check for an Empty Element from DTD definition 04223 */ 04224 if ((info != NULL) && (info->empty)) { 04225 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 04226 ctxt->sax->endElement(ctxt->userData, name); 04227 htmlnamePop(ctxt); 04228 return; 04229 } 04230 04231 /* 04232 * Parse the content of the element: 04233 */ 04234 currentNode = xmlStrdup(ctxt->name); 04235 depth = ctxt->nameNr; 04236 while (IS_CHAR_CH(CUR)) { 04237 oldptr = ctxt->input->cur; 04238 htmlParseContent(ctxt); 04239 if (oldptr==ctxt->input->cur) break; 04240 if (ctxt->nameNr < depth) break; 04241 } 04242 04243 /* 04244 * Capture end position and add node 04245 */ 04246 if ( currentNode != NULL && 
ctxt->record_info ) { 04247 node_info.end_pos = ctxt->input->consumed + 04248 (CUR_PTR - ctxt->input->base); 04249 node_info.end_line = ctxt->input->line; 04250 node_info.node = ctxt->node; 04251 xmlParserAddNodeInfo(ctxt, &node_info); 04252 } 04253 if (!IS_CHAR_CH(CUR)) { 04254 htmlAutoCloseOnEnd(ctxt); 04255 } 04256 04257 if (currentNode != NULL) 04258 xmlFree(currentNode); 04259 } 04260 04261 static void 04262 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { 04263 /* 04264 * Capture end position and add node 04265 */ 04266 if ( ctxt->node != NULL && ctxt->record_info ) { 04267 ctxt->nodeInfo->end_pos = ctxt->input->consumed + 04268 (CUR_PTR - ctxt->input->base); 04269 ctxt->nodeInfo->end_line = ctxt->input->line; 04270 ctxt->nodeInfo->node = ctxt->node; 04271 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); 04272 htmlNodeInfoPop(ctxt); 04273 } 04274 if (!IS_CHAR_CH(CUR)) { 04275 htmlAutoCloseOnEnd(ctxt); 04276 } 04277 } 04278 04290 static void 04291 htmlParseElementInternal(htmlParserCtxtPtr ctxt) { 04292 const xmlChar *name; 04293 const htmlElemDesc * info; 04294 htmlParserNodeInfo node_info; 04295 int failed; 04296 04297 if ((ctxt == NULL) || (ctxt->input == NULL)) { 04298 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 04299 "htmlParseElementInternal: context error\n", NULL, NULL); 04300 return; 04301 } 04302 04303 if (ctxt->instate == XML_PARSER_EOF) 04304 return; 04305 04306 /* Capture start position */ 04307 if (ctxt->record_info) { 04308 node_info.begin_pos = ctxt->input->consumed + 04309 (CUR_PTR - ctxt->input->base); 04310 node_info.begin_line = ctxt->input->line; 04311 } 04312 04313 failed = htmlParseStartTag(ctxt); 04314 name = ctxt->name; 04315 if ((failed == -1) || (name == NULL)) { 04316 if (CUR == '>') 04317 NEXT; 04318 return; 04319 } 04320 04321 /* 04322 * Lookup the info for that element. 
04323 */ 04324 info = htmlTagLookup(name); 04325 if (info == NULL) { 04326 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 04327 "Tag %s invalid\n", name, NULL); 04328 } 04329 04330 /* 04331 * Check for an Empty Element labeled the XML/SGML way 04332 */ 04333 if ((CUR == '/') && (NXT(1) == '>')) { 04334 SKIP(2); 04335 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 04336 ctxt->sax->endElement(ctxt->userData, name); 04337 htmlnamePop(ctxt); 04338 return; 04339 } 04340 04341 if (CUR == '>') { 04342 NEXT; 04343 } else { 04344 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 04345 "Couldn't find end of Start Tag %s\n", name, NULL); 04346 04347 /* 04348 * end of parsing of this node. 04349 */ 04350 if (xmlStrEqual(name, ctxt->name)) { 04351 nodePop(ctxt); 04352 htmlnamePop(ctxt); 04353 } 04354 04355 if (ctxt->record_info) 04356 htmlNodeInfoPush(ctxt, &node_info); 04357 htmlParserFinishElementParsing(ctxt); 04358 return; 04359 } 04360 04361 /* 04362 * Check for an Empty Element from DTD definition 04363 */ 04364 if ((info != NULL) && (info->empty)) { 04365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 04366 ctxt->sax->endElement(ctxt->userData, name); 04367 htmlnamePop(ctxt); 04368 return; 04369 } 04370 04371 if (ctxt->record_info) 04372 htmlNodeInfoPush(ctxt, &node_info); 04373 } 04374 04383 static void 04384 htmlParseContentInternal(htmlParserCtxtPtr ctxt) { 04385 xmlChar *currentNode; 04386 int depth; 04387 const xmlChar *name; 04388 04389 currentNode = xmlStrdup(ctxt->name); 04390 depth = ctxt->nameNr; 04391 while (1) { 04392 long cons = ctxt->nbChars; 04393 04394 GROW; 04395 04396 if (ctxt->instate == XML_PARSER_EOF) 04397 break; 04398 04399 /* 04400 * Our tag or one of it's parent or children is ending. 
04401 */ 04402 if ((CUR == '<') && (NXT(1) == '/')) { 04403 if (htmlParseEndTag(ctxt) && 04404 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 04405 if (currentNode != NULL) 04406 xmlFree(currentNode); 04407 04408 currentNode = xmlStrdup(ctxt->name); 04409 depth = ctxt->nameNr; 04410 } 04411 continue; /* while */ 04412 } 04413 04414 else if ((CUR == '<') && 04415 ((IS_ASCII_LETTER(NXT(1))) || 04416 (NXT(1) == '_') || (NXT(1) == ':'))) { 04417 name = htmlParseHTMLName_nonInvasive(ctxt); 04418 if (name == NULL) { 04419 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 04420 "htmlParseStartTag: invalid element name\n", 04421 NULL, NULL); 04422 /* Dump the bogus tag like browsers do */ 04423 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 04424 NEXT; 04425 04426 htmlParserFinishElementParsing(ctxt); 04427 if (currentNode != NULL) 04428 xmlFree(currentNode); 04429 04430 currentNode = xmlStrdup(ctxt->name); 04431 depth = ctxt->nameNr; 04432 continue; 04433 } 04434 04435 if (ctxt->name != NULL) { 04436 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 04437 htmlAutoClose(ctxt, name); 04438 continue; 04439 } 04440 } 04441 } 04442 04443 /* 04444 * Has this node been popped out during parsing of 04445 * the next element 04446 */ 04447 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 04448 (!xmlStrEqual(currentNode, ctxt->name))) 04449 { 04450 htmlParserFinishElementParsing(ctxt); 04451 if (currentNode != NULL) xmlFree(currentNode); 04452 04453 currentNode = xmlStrdup(ctxt->name); 04454 depth = ctxt->nameNr; 04455 continue; 04456 } 04457 04458 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 04459 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 04460 /* 04461 * Handle SCRIPT/STYLE separately 04462 */ 04463 htmlParseScript(ctxt); 04464 } else { 04465 /* 04466 * Sometimes DOCTYPE arrives in the middle of the document 04467 */ 04468 if ((CUR == '<') && (NXT(1) == '!') && 04469 (UPP(2) == 'D') && (UPP(3) == 'O') && 04470 (UPP(4) == 'C') && (UPP(5) == 'T') && 04471 
(UPP(6) == 'Y') && (UPP(7) == 'P') && 04472 (UPP(8) == 'E')) { 04473 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 04474 "Misplaced DOCTYPE declaration\n", 04475 BAD_CAST "DOCTYPE" , NULL); 04476 htmlParseDocTypeDecl(ctxt); 04477 } 04478 04479 /* 04480 * First case : a comment 04481 */ 04482 if ((CUR == '<') && (NXT(1) == '!') && 04483 (NXT(2) == '-') && (NXT(3) == '-')) { 04484 htmlParseComment(ctxt); 04485 } 04486 04487 /* 04488 * Second case : a Processing Instruction. 04489 */ 04490 else if ((CUR == '<') && (NXT(1) == '?')) { 04491 htmlParsePI(ctxt); 04492 } 04493 04494 /* 04495 * Third case : a sub-element. 04496 */ 04497 else if (CUR == '<') { 04498 htmlParseElementInternal(ctxt); 04499 if (currentNode != NULL) xmlFree(currentNode); 04500 04501 currentNode = xmlStrdup(ctxt->name); 04502 depth = ctxt->nameNr; 04503 } 04504 04505 /* 04506 * Fourth case : a reference. If if has not been resolved, 04507 * parsing returns it's Name, create the node 04508 */ 04509 else if (CUR == '&') { 04510 htmlParseReference(ctxt); 04511 } 04512 04513 /* 04514 * Fifth case : end of the resource 04515 */ 04516 else if (CUR == 0) { 04517 htmlAutoCloseOnEnd(ctxt); 04518 break; 04519 } 04520 04521 /* 04522 * Last case, text. Note that References are handled directly. 
04523 */ 04524 else { 04525 htmlParseCharData(ctxt); 04526 } 04527 04528 if (cons == ctxt->nbChars) { 04529 if (ctxt->node != NULL) { 04530 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 04531 "detected an error in element content\n", 04532 NULL, NULL); 04533 } 04534 break; 04535 } 04536 } 04537 GROW; 04538 } 04539 if (currentNode != NULL) xmlFree(currentNode); 04540 } 04541 04550 void 04551 __htmlParseContent(void *ctxt) { 04552 if (ctxt != NULL) 04553 htmlParseContentInternal((htmlParserCtxtPtr) ctxt); 04554 } 04555 04567 int 04568 htmlParseDocument(htmlParserCtxtPtr ctxt) { 04569 xmlChar start[4]; 04570 xmlCharEncoding enc; 04571 xmlDtdPtr dtd; 04572 04573 xmlInitParser(); 04574 04575 htmlDefaultSAXHandlerInit(); 04576 04577 if ((ctxt == NULL) || (ctxt->input == NULL)) { 04578 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 04579 "htmlParseDocument: context error\n", NULL, NULL); 04580 return(XML_ERR_INTERNAL_ERROR); 04581 } 04582 ctxt->html = 1; 04583 ctxt->linenumbers = 1; 04584 GROW; 04585 /* 04586 * SAX: beginning of the document processing. 04587 */ 04588 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 04589 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 04590 04591 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && 04592 ((ctxt->input->end - ctxt->input->cur) >= 4)) { 04593 /* 04594 * Get the 4 first bytes and decode the charset 04595 * if enc != XML_CHAR_ENCODING_NONE 04596 * plug some encoding conversion routines. 
04597 */ 04598 start[0] = RAW; 04599 start[1] = NXT(1); 04600 start[2] = NXT(2); 04601 start[3] = NXT(3); 04602 enc = xmlDetectCharEncoding(&start[0], 4); 04603 if (enc != XML_CHAR_ENCODING_NONE) { 04604 xmlSwitchEncoding(ctxt, enc); 04605 } 04606 } 04607 04608 /* 04609 * Wipe out everything which is before the first '<' 04610 */ 04611 SKIP_BLANKS; 04612 if (CUR == 0) { 04613 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 04614 "Document is empty\n", NULL, NULL); 04615 } 04616 04617 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 04618 ctxt->sax->startDocument(ctxt->userData); 04619 04620 04621 /* 04622 * Parse possible comments and PIs before any content 04623 */ 04624 while (((CUR == '<') && (NXT(1) == '!') && 04625 (NXT(2) == '-') && (NXT(3) == '-')) || 04626 ((CUR == '<') && (NXT(1) == '?'))) { 04627 htmlParseComment(ctxt); 04628 htmlParsePI(ctxt); 04629 SKIP_BLANKS; 04630 } 04631 04632 04633 /* 04634 * Then possibly doc type declaration(s) and more Misc 04635 * (doctypedecl Misc*)? 04636 */ 04637 if ((CUR == '<') && (NXT(1) == '!') && 04638 (UPP(2) == 'D') && (UPP(3) == 'O') && 04639 (UPP(4) == 'C') && (UPP(5) == 'T') && 04640 (UPP(6) == 'Y') && (UPP(7) == 'P') && 04641 (UPP(8) == 'E')) { 04642 htmlParseDocTypeDecl(ctxt); 04643 } 04644 SKIP_BLANKS; 04645 04646 /* 04647 * Parse possible comments and PIs before any content 04648 */ 04649 while (((CUR == '<') && (NXT(1) == '!') && 04650 (NXT(2) == '-') && (NXT(3) == '-')) || 04651 ((CUR == '<') && (NXT(1) == '?'))) { 04652 htmlParseComment(ctxt); 04653 htmlParsePI(ctxt); 04654 SKIP_BLANKS; 04655 } 04656 04657 /* 04658 * Time to start parsing the tree itself 04659 */ 04660 htmlParseContentInternal(ctxt); 04661 04662 /* 04663 * autoclose 04664 */ 04665 if (CUR == 0) 04666 htmlAutoCloseOnEnd(ctxt); 04667 04668 04669 /* 04670 * SAX: end of the document processing. 
04671 */ 04672 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 04673 ctxt->sax->endDocument(ctxt->userData); 04674 04675 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) { 04676 dtd = xmlGetIntSubset(ctxt->myDoc); 04677 if (dtd == NULL) 04678 ctxt->myDoc->intSubset = 04679 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 04680 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 04681 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 04682 } 04683 if (! ctxt->wellFormed) return(-1); 04684 return(0); 04685 } 04686 04687 04688 /************************************************************************ 04689 * * 04690 * Parser contexts handling * 04691 * * 04692 ************************************************************************/ 04693 04703 static int 04704 htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 04705 { 04706 htmlSAXHandler *sax; 04707 04708 if (ctxt == NULL) return(-1); 04709 memset(ctxt, 0, sizeof(htmlParserCtxt)); 04710 04711 ctxt->dict = xmlDictCreate(); 04712 if (ctxt->dict == NULL) { 04713 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 04714 return(-1); 04715 } 04716 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 04717 if (sax == NULL) { 04718 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 04719 return(-1); 04720 } 04721 else 04722 memset(sax, 0, sizeof(htmlSAXHandler)); 04723 04724 /* Allocate the Input stack */ 04725 ctxt->inputTab = (htmlParserInputPtr *) 04726 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 04727 if (ctxt->inputTab == NULL) { 04728 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 04729 ctxt->inputNr = 0; 04730 ctxt->inputMax = 0; 04731 ctxt->input = NULL; 04732 return(-1); 04733 } 04734 ctxt->inputNr = 0; 04735 ctxt->inputMax = 5; 04736 ctxt->input = NULL; 04737 ctxt->version = NULL; 04738 ctxt->encoding = NULL; 04739 ctxt->standalone = -1; 04740 ctxt->instate = XML_PARSER_START; 04741 04742 /* Allocate the Node stack */ 04743 ctxt->nodeTab = (htmlNodePtr *) 
xmlMalloc(10 * sizeof(htmlNodePtr)); 04744 if (ctxt->nodeTab == NULL) { 04745 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 04746 ctxt->nodeNr = 0; 04747 ctxt->nodeMax = 0; 04748 ctxt->node = NULL; 04749 ctxt->inputNr = 0; 04750 ctxt->inputMax = 0; 04751 ctxt->input = NULL; 04752 return(-1); 04753 } 04754 ctxt->nodeNr = 0; 04755 ctxt->nodeMax = 10; 04756 ctxt->node = NULL; 04757 04758 /* Allocate the Name stack */ 04759 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 04760 if (ctxt->nameTab == NULL) { 04761 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 04762 ctxt->nameNr = 0; 04763 ctxt->nameMax = 0; 04764 ctxt->name = NULL; 04765 ctxt->nodeNr = 0; 04766 ctxt->nodeMax = 0; 04767 ctxt->node = NULL; 04768 ctxt->inputNr = 0; 04769 ctxt->inputMax = 0; 04770 ctxt->input = NULL; 04771 return(-1); 04772 } 04773 ctxt->nameNr = 0; 04774 ctxt->nameMax = 10; 04775 ctxt->name = NULL; 04776 04777 ctxt->nodeInfoTab = NULL; 04778 ctxt->nodeInfoNr = 0; 04779 ctxt->nodeInfoMax = 0; 04780 04781 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 04782 else { 04783 ctxt->sax = sax; 04784 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 04785 } 04786 ctxt->userData = ctxt; 04787 ctxt->myDoc = NULL; 04788 ctxt->wellFormed = 1; 04789 ctxt->replaceEntities = 0; 04790 ctxt->linenumbers = xmlLineNumbersDefaultValue; 04791 ctxt->html = 1; 04792 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 04793 ctxt->vctxt.userData = ctxt; 04794 ctxt->vctxt.error = xmlParserValidityError; 04795 ctxt->vctxt.warning = xmlParserValidityWarning; 04796 ctxt->record_info = 0; 04797 ctxt->validate = 0; 04798 ctxt->nbChars = 0; 04799 ctxt->checkIndex = 0; 04800 ctxt->catalogs = NULL; 04801 xmlInitNodeInfoSeq(&ctxt->node_seq); 04802 return(0); 04803 } 04804 04813 void 04814 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 04815 { 04816 xmlFreeParserCtxt(ctxt); 04817 } 04818 04827 htmlParserCtxtPtr 04828 htmlNewParserCtxt(void) 04829 { 
04830 xmlParserCtxtPtr ctxt; 04831 04832 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 04833 if (ctxt == NULL) { 04834 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 04835 return(NULL); 04836 } 04837 memset(ctxt, 0, sizeof(xmlParserCtxt)); 04838 if (htmlInitParserCtxt(ctxt) < 0) { 04839 htmlFreeParserCtxt(ctxt); 04840 return(NULL); 04841 } 04842 return(ctxt); 04843 } 04844 04854 htmlParserCtxtPtr 04855 htmlCreateMemoryParserCtxt(const char *buffer, int size) { 04856 xmlParserCtxtPtr ctxt; 04857 xmlParserInputPtr input; 04858 xmlParserInputBufferPtr buf; 04859 04860 if (buffer == NULL) 04861 return(NULL); 04862 if (size <= 0) 04863 return(NULL); 04864 04865 ctxt = htmlNewParserCtxt(); 04866 if (ctxt == NULL) 04867 return(NULL); 04868 04869 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 04870 if (buf == NULL) return(NULL); 04871 04872 input = xmlNewInputStream(ctxt); 04873 if (input == NULL) { 04874 xmlFreeParserCtxt(ctxt); 04875 return(NULL); 04876 } 04877 04878 input->filename = NULL; 04879 input->buf = buf; 04880 input->base = input->buf->buffer->content; 04881 input->cur = input->buf->buffer->content; 04882 input->end = &input->buf->buffer->content[input->buf->buffer->use]; 04883 04884 inputPush(ctxt, input); 04885 return(ctxt); 04886 } 04887 04899 static htmlParserCtxtPtr 04900 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { 04901 int len; 04902 htmlParserCtxtPtr ctxt; 04903 04904 if (cur == NULL) 04905 return(NULL); 04906 len = xmlStrlen(cur); 04907 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); 04908 if (ctxt == NULL) 04909 return(NULL); 04910 04911 if (encoding != NULL) { 04912 xmlCharEncoding enc; 04913 xmlCharEncodingHandlerPtr handler; 04914 04915 if (ctxt->input->encoding != NULL) 04916 xmlFree((xmlChar *) ctxt->input->encoding); 04917 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 04918 04919 enc = xmlParseCharEncoding(encoding); 04920 /* 04921 * registered set of 
known encodings 04922 */ 04923 if (enc != XML_CHAR_ENCODING_ERROR) { 04924 xmlSwitchEncoding(ctxt, enc); 04925 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 04926 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 04927 "Unsupported encoding %s\n", 04928 (const xmlChar *) encoding, NULL); 04929 } 04930 } else { 04931 /* 04932 * fallback for unknown encodings 04933 */ 04934 handler = xmlFindCharEncodingHandler((const char *) encoding); 04935 if (handler != NULL) { 04936 xmlSwitchToEncoding(ctxt, handler); 04937 } else { 04938 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 04939 "Unsupported encoding %s\n", 04940 (const xmlChar *) encoding, NULL); 04941 } 04942 } 04943 } 04944 return(ctxt); 04945 } 04946 04947 #ifdef LIBXML_PUSH_ENABLED 04948 /************************************************************************ 04949 * * 04950 * Progressive parsing interfaces * 04951 * * 04952 ************************************************************************/ 04953 04972 static int 04973 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, 04974 xmlChar next, xmlChar third, int iscomment, 04975 int ignoreattrval) 04976 { 04977 int base, len; 04978 htmlParserInputPtr in; 04979 const xmlChar *buf; 04980 int incomment = 0; 04981 int invalue = 0; 04982 char valdellim = 0x0; 04983 04984 in = ctxt->input; 04985 if (in == NULL) 04986 return (-1); 04987 04988 base = in->cur - in->base; 04989 if (base < 0) 04990 return (-1); 04991 04992 if (ctxt->checkIndex > base) 04993 base = ctxt->checkIndex; 04994 04995 if (in->buf == NULL) { 04996 buf = in->base; 04997 len = in->length; 04998 } else { 04999 buf = in->buf->buffer->content; 05000 len = in->buf->buffer->use; 05001 } 05002 05003 /* take into account the sequence length */ 05004 if (third) 05005 len -= 2; 05006 else if (next) 05007 len--; 05008 for (; base < len; base++) { 05009 if ((!incomment) && (base + 4 < len) && (!iscomment)) { 05010 if ((buf[base] == '<') && (buf[base + 1] == '!') && 05011 (buf[base + 2] == 
'-') && (buf[base + 3] == '-')) { 05012 incomment = 1; 05013 /* do not increment past <! - some people use <!--> */ 05014 base += 2; 05015 } 05016 } 05017 if (ignoreattrval) { 05018 if (buf[base] == '"' || buf[base] == '\'') { 05019 if (invalue) { 05020 if (buf[base] == valdellim) { 05021 invalue = 0; 05022 continue; 05023 } 05024 } else { 05025 valdellim = buf[base]; 05026 invalue = 1; 05027 continue; 05028 } 05029 } else if (invalue) { 05030 continue; 05031 } 05032 } 05033 if (incomment) { 05034 if (base + 3 > len) 05035 return (-1); 05036 if ((buf[base] == '-') && (buf[base + 1] == '-') && 05037 (buf[base + 2] == '>')) { 05038 incomment = 0; 05039 base += 2; 05040 } 05041 continue; 05042 } 05043 if (buf[base] == first) { 05044 if (third != 0) { 05045 if ((buf[base + 1] != next) || (buf[base + 2] != third)) 05046 continue; 05047 } else if (next != 0) { 05048 if (buf[base + 1] != next) 05049 continue; 05050 } 05051 ctxt->checkIndex = 0; 05052 #ifdef DEBUG_PUSH 05053 if (next == 0) 05054 xmlGenericError(xmlGenericErrorContext, 05055 "HPP: lookup '%c' found at %d\n", 05056 first, base); 05057 else if (third == 0) 05058 xmlGenericError(xmlGenericErrorContext, 05059 "HPP: lookup '%c%c' found at %d\n", 05060 first, next, base); 05061 else 05062 xmlGenericError(xmlGenericErrorContext, 05063 "HPP: lookup '%c%c%c' found at %d\n", 05064 first, next, third, base); 05065 #endif 05066 return (base - (in->cur - in->base)); 05067 } 05068 } 05069 if ((!incomment) && (!invalue)) 05070 ctxt->checkIndex = base; 05071 #ifdef DEBUG_PUSH 05072 if (next == 0) 05073 xmlGenericError(xmlGenericErrorContext, 05074 "HPP: lookup '%c' failed\n", first); 05075 else if (third == 0) 05076 xmlGenericError(xmlGenericErrorContext, 05077 "HPP: lookup '%c%c' failed\n", first, next); 05078 else 05079 xmlGenericError(xmlGenericErrorContext, 05080 "HPP: lookup '%c%c%c' failed\n", first, next, 05081 third); 05082 #endif 05083 return (-1); 05084 } 05085 05101 static int 05102 
htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, 05103 int stopLen) 05104 { 05105 int base, len; 05106 htmlParserInputPtr in; 05107 const xmlChar *buf; 05108 int incomment = 0; 05109 int i; 05110 05111 in = ctxt->input; 05112 if (in == NULL) 05113 return (-1); 05114 05115 base = in->cur - in->base; 05116 if (base < 0) 05117 return (-1); 05118 05119 if (ctxt->checkIndex > base) 05120 base = ctxt->checkIndex; 05121 05122 if (in->buf == NULL) { 05123 buf = in->base; 05124 len = in->length; 05125 } else { 05126 buf = in->buf->buffer->content; 05127 len = in->buf->buffer->use; 05128 } 05129 05130 for (; base < len; base++) { 05131 if (!incomment && (base + 4 < len)) { 05132 if ((buf[base] == '<') && (buf[base + 1] == '!') && 05133 (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 05134 incomment = 1; 05135 /* do not increment past <! - some people use <!--> */ 05136 base += 2; 05137 } 05138 } 05139 if (incomment) { 05140 if (base + 3 > len) 05141 return (-1); 05142 if ((buf[base] == '-') && (buf[base + 1] == '-') && 05143 (buf[base + 2] == '>')) { 05144 incomment = 0; 05145 base += 2; 05146 } 05147 continue; 05148 } 05149 for (i = 0; i < stopLen; ++i) { 05150 if (buf[base] == stop[i]) { 05151 ctxt->checkIndex = 0; 05152 return (base - (in->cur - in->base)); 05153 } 05154 } 05155 } 05156 ctxt->checkIndex = base; 05157 return (-1); 05158 } 05159 05169 static int 05170 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { 05171 int ret = 0; 05172 htmlParserInputPtr in; 05173 int avail = 0; 05174 xmlChar cur, next; 05175 05176 #ifdef DEBUG_PUSH 05177 switch (ctxt->instate) { 05178 case XML_PARSER_EOF: 05179 xmlGenericError(xmlGenericErrorContext, 05180 "HPP: try EOF\n"); break; 05181 case XML_PARSER_START: 05182 xmlGenericError(xmlGenericErrorContext, 05183 "HPP: try START\n"); break; 05184 case XML_PARSER_MISC: 05185 xmlGenericError(xmlGenericErrorContext, 05186 "HPP: try MISC\n");break; 05187 case XML_PARSER_COMMENT: 05188 
xmlGenericError(xmlGenericErrorContext, 05189 "HPP: try COMMENT\n");break; 05190 case XML_PARSER_PROLOG: 05191 xmlGenericError(xmlGenericErrorContext, 05192 "HPP: try PROLOG\n");break; 05193 case XML_PARSER_START_TAG: 05194 xmlGenericError(xmlGenericErrorContext, 05195 "HPP: try START_TAG\n");break; 05196 case XML_PARSER_CONTENT: 05197 xmlGenericError(xmlGenericErrorContext, 05198 "HPP: try CONTENT\n");break; 05199 case XML_PARSER_CDATA_SECTION: 05200 xmlGenericError(xmlGenericErrorContext, 05201 "HPP: try CDATA_SECTION\n");break; 05202 case XML_PARSER_END_TAG: 05203 xmlGenericError(xmlGenericErrorContext, 05204 "HPP: try END_TAG\n");break; 05205 case XML_PARSER_ENTITY_DECL: 05206 xmlGenericError(xmlGenericErrorContext, 05207 "HPP: try ENTITY_DECL\n");break; 05208 case XML_PARSER_ENTITY_VALUE: 05209 xmlGenericError(xmlGenericErrorContext, 05210 "HPP: try ENTITY_VALUE\n");break; 05211 case XML_PARSER_ATTRIBUTE_VALUE: 05212 xmlGenericError(xmlGenericErrorContext, 05213 "HPP: try ATTRIBUTE_VALUE\n");break; 05214 case XML_PARSER_DTD: 05215 xmlGenericError(xmlGenericErrorContext, 05216 "HPP: try DTD\n");break; 05217 case XML_PARSER_EPILOG: 05218 xmlGenericError(xmlGenericErrorContext, 05219 "HPP: try EPILOG\n");break; 05220 case XML_PARSER_PI: 05221 xmlGenericError(xmlGenericErrorContext, 05222 "HPP: try PI\n");break; 05223 case XML_PARSER_SYSTEM_LITERAL: 05224 xmlGenericError(xmlGenericErrorContext, 05225 "HPP: try SYSTEM_LITERAL\n");break; 05226 } 05227 #endif 05228 05229 while (1) { 05230 05231 in = ctxt->input; 05232 if (in == NULL) break; 05233 if (in->buf == NULL) 05234 avail = in->length - (in->cur - in->base); 05235 else 05236 avail = in->buf->buffer->use - (in->cur - in->base); 05237 if ((avail == 0) && (terminate)) { 05238 htmlAutoCloseOnEnd(ctxt); 05239 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 05240 /* 05241 * SAX: end of the document processing. 
05242 */ 05243 ctxt->instate = XML_PARSER_EOF; 05244 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 05245 ctxt->sax->endDocument(ctxt->userData); 05246 } 05247 } 05248 if (avail < 1) 05249 goto done; 05250 cur = in->cur[0]; 05251 if (cur == 0) { 05252 SKIP(1); 05253 continue; 05254 } 05255 05256 switch (ctxt->instate) { 05257 case XML_PARSER_EOF: 05258 /* 05259 * Document parsing is done ! 05260 */ 05261 goto done; 05262 case XML_PARSER_START: 05263 /* 05264 * Very first chars read from the document flow. 05265 */ 05266 cur = in->cur[0]; 05267 if (IS_BLANK_CH(cur)) { 05268 SKIP_BLANKS; 05269 if (in->buf == NULL) 05270 avail = in->length - (in->cur - in->base); 05271 else 05272 avail = in->buf->buffer->use - (in->cur - in->base); 05273 } 05274 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 05275 ctxt->sax->setDocumentLocator(ctxt->userData, 05276 &xmlDefaultSAXLocator); 05277 if ((ctxt->sax) && (ctxt->sax->startDocument) && 05278 (!ctxt->disableSAX)) 05279 ctxt->sax->startDocument(ctxt->userData); 05280 05281 cur = in->cur[0]; 05282 next = in->cur[1]; 05283 if ((cur == '<') && (next == '!') && 05284 (UPP(2) == 'D') && (UPP(3) == 'O') && 05285 (UPP(4) == 'C') && (UPP(5) == 'T') && 05286 (UPP(6) == 'Y') && (UPP(7) == 'P') && 05287 (UPP(8) == 'E')) { 05288 if ((!terminate) && 05289 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 05290 goto done; 05291 #ifdef DEBUG_PUSH 05292 xmlGenericError(xmlGenericErrorContext, 05293 "HPP: Parsing internal subset\n"); 05294 #endif 05295 htmlParseDocTypeDecl(ctxt); 05296 ctxt->instate = XML_PARSER_PROLOG; 05297 #ifdef DEBUG_PUSH 05298 xmlGenericError(xmlGenericErrorContext, 05299 "HPP: entering PROLOG\n"); 05300 #endif 05301 } else { 05302 ctxt->instate = XML_PARSER_MISC; 05303 #ifdef DEBUG_PUSH 05304 xmlGenericError(xmlGenericErrorContext, 05305 "HPP: entering MISC\n"); 05306 #endif 05307 } 05308 break; 05309 case XML_PARSER_MISC: 05310 SKIP_BLANKS; 05311 if (in->buf == NULL) 05312 avail = in->length - (in->cur - 
in->base); 05313 else 05314 avail = in->buf->buffer->use - (in->cur - in->base); 05315 if (avail < 2) 05316 goto done; 05317 cur = in->cur[0]; 05318 next = in->cur[1]; 05319 if ((cur == '<') && (next == '!') && 05320 (in->cur[2] == '-') && (in->cur[3] == '-')) { 05321 if ((!terminate) && 05322 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 05323 goto done; 05324 #ifdef DEBUG_PUSH 05325 xmlGenericError(xmlGenericErrorContext, 05326 "HPP: Parsing Comment\n"); 05327 #endif 05328 htmlParseComment(ctxt); 05329 ctxt->instate = XML_PARSER_MISC; 05330 } else if ((cur == '<') && (next == '?')) { 05331 if ((!terminate) && 05332 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 05333 goto done; 05334 #ifdef DEBUG_PUSH 05335 xmlGenericError(xmlGenericErrorContext, 05336 "HPP: Parsing PI\n"); 05337 #endif 05338 htmlParsePI(ctxt); 05339 ctxt->instate = XML_PARSER_MISC; 05340 } else if ((cur == '<') && (next == '!') && 05341 (UPP(2) == 'D') && (UPP(3) == 'O') && 05342 (UPP(4) == 'C') && (UPP(5) == 'T') && 05343 (UPP(6) == 'Y') && (UPP(7) == 'P') && 05344 (UPP(8) == 'E')) { 05345 if ((!terminate) && 05346 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 05347 goto done; 05348 #ifdef DEBUG_PUSH 05349 xmlGenericError(xmlGenericErrorContext, 05350 "HPP: Parsing internal subset\n"); 05351 #endif 05352 htmlParseDocTypeDecl(ctxt); 05353 ctxt->instate = XML_PARSER_PROLOG; 05354 #ifdef DEBUG_PUSH 05355 xmlGenericError(xmlGenericErrorContext, 05356 "HPP: entering PROLOG\n"); 05357 #endif 05358 } else if ((cur == '<') && (next == '!') && 05359 (avail < 9)) { 05360 goto done; 05361 } else { 05362 ctxt->instate = XML_PARSER_START_TAG; 05363 #ifdef DEBUG_PUSH 05364 xmlGenericError(xmlGenericErrorContext, 05365 "HPP: entering START_TAG\n"); 05366 #endif 05367 } 05368 break; 05369 case XML_PARSER_PROLOG: 05370 SKIP_BLANKS; 05371 if (in->buf == NULL) 05372 avail = in->length - (in->cur - in->base); 05373 else 05374 avail = in->buf->buffer->use - (in->cur - in->base); 05375 if 
(avail < 2) 05376 goto done; 05377 cur = in->cur[0]; 05378 next = in->cur[1]; 05379 if ((cur == '<') && (next == '!') && 05380 (in->cur[2] == '-') && (in->cur[3] == '-')) { 05381 if ((!terminate) && 05382 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 05383 goto done; 05384 #ifdef DEBUG_PUSH 05385 xmlGenericError(xmlGenericErrorContext, 05386 "HPP: Parsing Comment\n"); 05387 #endif 05388 htmlParseComment(ctxt); 05389 ctxt->instate = XML_PARSER_PROLOG; 05390 } else if ((cur == '<') && (next == '?')) { 05391 if ((!terminate) && 05392 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 05393 goto done; 05394 #ifdef DEBUG_PUSH 05395 xmlGenericError(xmlGenericErrorContext, 05396 "HPP: Parsing PI\n"); 05397 #endif 05398 htmlParsePI(ctxt); 05399 ctxt->instate = XML_PARSER_PROLOG; 05400 } else if ((cur == '<') && (next == '!') && 05401 (avail < 4)) { 05402 goto done; 05403 } else { 05404 ctxt->instate = XML_PARSER_START_TAG; 05405 #ifdef DEBUG_PUSH 05406 xmlGenericError(xmlGenericErrorContext, 05407 "HPP: entering START_TAG\n"); 05408 #endif 05409 } 05410 break; 05411 case XML_PARSER_EPILOG: 05412 if (in->buf == NULL) 05413 avail = in->length - (in->cur - in->base); 05414 else 05415 avail = in->buf->buffer->use - (in->cur - in->base); 05416 if (avail < 1) 05417 goto done; 05418 cur = in->cur[0]; 05419 if (IS_BLANK_CH(cur)) { 05420 htmlParseCharData(ctxt); 05421 goto done; 05422 } 05423 if (avail < 2) 05424 goto done; 05425 next = in->cur[1]; 05426 if ((cur == '<') && (next == '!') && 05427 (in->cur[2] == '-') && (in->cur[3] == '-')) { 05428 if ((!terminate) && 05429 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 05430 goto done; 05431 #ifdef DEBUG_PUSH 05432 xmlGenericError(xmlGenericErrorContext, 05433 "HPP: Parsing Comment\n"); 05434 #endif 05435 htmlParseComment(ctxt); 05436 ctxt->instate = XML_PARSER_EPILOG; 05437 } else if ((cur == '<') && (next == '?')) { 05438 if ((!terminate) && 05439 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 
05440 goto done; 05441 #ifdef DEBUG_PUSH 05442 xmlGenericError(xmlGenericErrorContext, 05443 "HPP: Parsing PI\n"); 05444 #endif 05445 htmlParsePI(ctxt); 05446 ctxt->instate = XML_PARSER_EPILOG; 05447 } else if ((cur == '<') && (next == '!') && 05448 (avail < 4)) { 05449 goto done; 05450 } else { 05451 ctxt->errNo = XML_ERR_DOCUMENT_END; 05452 ctxt->wellFormed = 0; 05453 ctxt->instate = XML_PARSER_EOF; 05454 #ifdef DEBUG_PUSH 05455 xmlGenericError(xmlGenericErrorContext, 05456 "HPP: entering EOF\n"); 05457 #endif 05458 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 05459 ctxt->sax->endDocument(ctxt->userData); 05460 goto done; 05461 } 05462 break; 05463 case XML_PARSER_START_TAG: { 05464 const xmlChar *name; 05465 int failed; 05466 const htmlElemDesc * info; 05467 05468 if (avail < 2) 05469 goto done; 05470 cur = in->cur[0]; 05471 if (cur != '<') { 05472 ctxt->instate = XML_PARSER_CONTENT; 05473 #ifdef DEBUG_PUSH 05474 xmlGenericError(xmlGenericErrorContext, 05475 "HPP: entering CONTENT\n"); 05476 #endif 05477 break; 05478 } 05479 if (in->cur[1] == '/') { 05480 ctxt->instate = XML_PARSER_END_TAG; 05481 ctxt->checkIndex = 0; 05482 #ifdef DEBUG_PUSH 05483 xmlGenericError(xmlGenericErrorContext, 05484 "HPP: entering END_TAG\n"); 05485 #endif 05486 break; 05487 } 05488 if ((!terminate) && 05489 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 05490 goto done; 05491 05492 failed = htmlParseStartTag(ctxt); 05493 name = ctxt->name; 05494 if ((failed == -1) || 05495 (name == NULL)) { 05496 if (CUR == '>') 05497 NEXT; 05498 break; 05499 } 05500 05501 /* 05502 * Lookup the info for that element. 
05503 */ 05504 info = htmlTagLookup(name); 05505 if (info == NULL) { 05506 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 05507 "Tag %s invalid\n", name, NULL); 05508 } 05509 05510 /* 05511 * Check for an Empty Element labeled the XML/SGML way 05512 */ 05513 if ((CUR == '/') && (NXT(1) == '>')) { 05514 SKIP(2); 05515 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 05516 ctxt->sax->endElement(ctxt->userData, name); 05517 htmlnamePop(ctxt); 05518 ctxt->instate = XML_PARSER_CONTENT; 05519 #ifdef DEBUG_PUSH 05520 xmlGenericError(xmlGenericErrorContext, 05521 "HPP: entering CONTENT\n"); 05522 #endif 05523 break; 05524 } 05525 05526 if (CUR == '>') { 05527 NEXT; 05528 } else { 05529 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 05530 "Couldn't find end of Start Tag %s\n", 05531 name, NULL); 05532 05533 /* 05534 * end of parsing of this node. 05535 */ 05536 if (xmlStrEqual(name, ctxt->name)) { 05537 nodePop(ctxt); 05538 htmlnamePop(ctxt); 05539 } 05540 05541 ctxt->instate = XML_PARSER_CONTENT; 05542 #ifdef DEBUG_PUSH 05543 xmlGenericError(xmlGenericErrorContext, 05544 "HPP: entering CONTENT\n"); 05545 #endif 05546 break; 05547 } 05548 05549 /* 05550 * Check for an Empty Element from DTD definition 05551 */ 05552 if ((info != NULL) && (info->empty)) { 05553 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 05554 ctxt->sax->endElement(ctxt->userData, name); 05555 htmlnamePop(ctxt); 05556 } 05557 ctxt->instate = XML_PARSER_CONTENT; 05558 #ifdef DEBUG_PUSH 05559 xmlGenericError(xmlGenericErrorContext, 05560 "HPP: entering CONTENT\n"); 05561 #endif 05562 break; 05563 } 05564 case XML_PARSER_CONTENT: { 05565 long cons; 05566 /* 05567 * Handle preparsed entities and charRef 05568 */ 05569 if (ctxt->token != 0) { 05570 xmlChar chr[2] = { 0 , 0 } ; 05571 05572 chr[0] = (xmlChar) ctxt->token; 05573 htmlCheckParagraph(ctxt); 05574 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 05575 ctxt->sax->characters(ctxt->userData, chr, 1); 05576 ctxt->token = 0; 05577 
ctxt->checkIndex = 0; 05578 } 05579 if ((avail == 1) && (terminate)) { 05580 cur = in->cur[0]; 05581 if ((cur != '<') && (cur != '&')) { 05582 if (ctxt->sax != NULL) { 05583 if (IS_BLANK_CH(cur)) { 05584 if (ctxt->sax->ignorableWhitespace != NULL) 05585 ctxt->sax->ignorableWhitespace( 05586 ctxt->userData, &cur, 1); 05587 } else { 05588 htmlCheckParagraph(ctxt); 05589 if (ctxt->sax->characters != NULL) 05590 ctxt->sax->characters( 05591 ctxt->userData, &cur, 1); 05592 } 05593 } 05594 ctxt->token = 0; 05595 ctxt->checkIndex = 0; 05596 in->cur++; 05597 break; 05598 } 05599 } 05600 if (avail < 2) 05601 goto done; 05602 cur = in->cur[0]; 05603 next = in->cur[1]; 05604 cons = ctxt->nbChars; 05605 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 05606 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 05607 /* 05608 * Handle SCRIPT/STYLE separately 05609 */ 05610 if (!terminate) { 05611 int idx; 05612 xmlChar val; 05613 05614 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); 05615 if (idx < 0) 05616 goto done; 05617 val = in->cur[idx + 2]; 05618 if (val == 0) /* bad cut of input */ 05619 goto done; 05620 } 05621 htmlParseScript(ctxt); 05622 if ((cur == '<') && (next == '/')) { 05623 ctxt->instate = XML_PARSER_END_TAG; 05624 ctxt->checkIndex = 0; 05625 #ifdef DEBUG_PUSH 05626 xmlGenericError(xmlGenericErrorContext, 05627 "HPP: entering END_TAG\n"); 05628 #endif 05629 break; 05630 } 05631 } else { 05632 /* 05633 * Sometimes DOCTYPE arrives in the middle of the document 05634 */ 05635 if ((cur == '<') && (next == '!') && 05636 (UPP(2) == 'D') && (UPP(3) == 'O') && 05637 (UPP(4) == 'C') && (UPP(5) == 'T') && 05638 (UPP(6) == 'Y') && (UPP(7) == 'P') && 05639 (UPP(8) == 'E')) { 05640 if ((!terminate) && 05641 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 05642 goto done; 05643 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 05644 "Misplaced DOCTYPE declaration\n", 05645 BAD_CAST "DOCTYPE" , NULL); 05646 htmlParseDocTypeDecl(ctxt); 05647 } else if ((cur == '<') && (next 
== '!') && 05648 (in->cur[2] == '-') && (in->cur[3] == '-')) { 05649 if ((!terminate) && 05650 (htmlParseLookupSequence( 05651 ctxt, '-', '-', '>', 1, 1) < 0)) 05652 goto done; 05653 #ifdef DEBUG_PUSH 05654 xmlGenericError(xmlGenericErrorContext, 05655 "HPP: Parsing Comment\n"); 05656 #endif 05657 htmlParseComment(ctxt); 05658 ctxt->instate = XML_PARSER_CONTENT; 05659 } else if ((cur == '<') && (next == '?')) { 05660 if ((!terminate) && 05661 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 05662 goto done; 05663 #ifdef DEBUG_PUSH 05664 xmlGenericError(xmlGenericErrorContext, 05665 "HPP: Parsing PI\n"); 05666 #endif 05667 htmlParsePI(ctxt); 05668 ctxt->instate = XML_PARSER_CONTENT; 05669 } else if ((cur == '<') && (next == '!') && (avail < 4)) { 05670 goto done; 05671 } else if ((cur == '<') && (next == '/')) { 05672 ctxt->instate = XML_PARSER_END_TAG; 05673 ctxt->checkIndex = 0; 05674 #ifdef DEBUG_PUSH 05675 xmlGenericError(xmlGenericErrorContext, 05676 "HPP: entering END_TAG\n"); 05677 #endif 05678 break; 05679 } else if (cur == '<') { 05680 ctxt->instate = XML_PARSER_START_TAG; 05681 ctxt->checkIndex = 0; 05682 #ifdef DEBUG_PUSH 05683 xmlGenericError(xmlGenericErrorContext, 05684 "HPP: entering START_TAG\n"); 05685 #endif 05686 break; 05687 } else if (cur == '&') { 05688 if ((!terminate) && 05689 (htmlParseLookupChars(ctxt, 05690 BAD_CAST "; >/", 4) < 0)) 05691 goto done; 05692 #ifdef DEBUG_PUSH 05693 xmlGenericError(xmlGenericErrorContext, 05694 "HPP: Parsing Reference\n"); 05695 #endif 05696 /* TODO: check generation of subtrees if noent !!! */ 05697 htmlParseReference(ctxt); 05698 } else { 05699 /* 05700 * check that the text sequence is complete 05701 * before handing out the data to the parser 05702 * to avoid problems with erroneous end of 05703 * data detection. 
05704 */ 05705 if ((!terminate) && 05706 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) 05707 goto done; 05708 ctxt->checkIndex = 0; 05709 #ifdef DEBUG_PUSH 05710 xmlGenericError(xmlGenericErrorContext, 05711 "HPP: Parsing char data\n"); 05712 #endif 05713 htmlParseCharData(ctxt); 05714 } 05715 } 05716 if (cons == ctxt->nbChars) { 05717 if (ctxt->node != NULL) { 05718 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05719 "detected an error in element content\n", 05720 NULL, NULL); 05721 } 05722 NEXT; 05723 break; 05724 } 05725 05726 break; 05727 } 05728 case XML_PARSER_END_TAG: 05729 if (avail < 2) 05730 goto done; 05731 if ((!terminate) && 05732 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 05733 goto done; 05734 htmlParseEndTag(ctxt); 05735 if (ctxt->nameNr == 0) { 05736 ctxt->instate = XML_PARSER_EPILOG; 05737 } else { 05738 ctxt->instate = XML_PARSER_CONTENT; 05739 } 05740 ctxt->checkIndex = 0; 05741 #ifdef DEBUG_PUSH 05742 xmlGenericError(xmlGenericErrorContext, 05743 "HPP: entering CONTENT\n"); 05744 #endif 05745 break; 05746 case XML_PARSER_CDATA_SECTION: 05747 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05748 "HPP: internal error, state == CDATA\n", 05749 NULL, NULL); 05750 ctxt->instate = XML_PARSER_CONTENT; 05751 ctxt->checkIndex = 0; 05752 #ifdef DEBUG_PUSH 05753 xmlGenericError(xmlGenericErrorContext, 05754 "HPP: entering CONTENT\n"); 05755 #endif 05756 break; 05757 case XML_PARSER_DTD: 05758 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05759 "HPP: internal error, state == DTD\n", 05760 NULL, NULL); 05761 ctxt->instate = XML_PARSER_CONTENT; 05762 ctxt->checkIndex = 0; 05763 #ifdef DEBUG_PUSH 05764 xmlGenericError(xmlGenericErrorContext, 05765 "HPP: entering CONTENT\n"); 05766 #endif 05767 break; 05768 case XML_PARSER_COMMENT: 05769 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05770 "HPP: internal error, state == COMMENT\n", 05771 NULL, NULL); 05772 ctxt->instate = XML_PARSER_CONTENT; 05773 ctxt->checkIndex = 0; 05774 #ifdef DEBUG_PUSH 05775 
xmlGenericError(xmlGenericErrorContext, 05776 "HPP: entering CONTENT\n"); 05777 #endif 05778 break; 05779 case XML_PARSER_PI: 05780 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05781 "HPP: internal error, state == PI\n", 05782 NULL, NULL); 05783 ctxt->instate = XML_PARSER_CONTENT; 05784 ctxt->checkIndex = 0; 05785 #ifdef DEBUG_PUSH 05786 xmlGenericError(xmlGenericErrorContext, 05787 "HPP: entering CONTENT\n"); 05788 #endif 05789 break; 05790 case XML_PARSER_ENTITY_DECL: 05791 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05792 "HPP: internal error, state == ENTITY_DECL\n", 05793 NULL, NULL); 05794 ctxt->instate = XML_PARSER_CONTENT; 05795 ctxt->checkIndex = 0; 05796 #ifdef DEBUG_PUSH 05797 xmlGenericError(xmlGenericErrorContext, 05798 "HPP: entering CONTENT\n"); 05799 #endif 05800 break; 05801 case XML_PARSER_ENTITY_VALUE: 05802 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05803 "HPP: internal error, state == ENTITY_VALUE\n", 05804 NULL, NULL); 05805 ctxt->instate = XML_PARSER_CONTENT; 05806 ctxt->checkIndex = 0; 05807 #ifdef DEBUG_PUSH 05808 xmlGenericError(xmlGenericErrorContext, 05809 "HPP: entering DTD\n"); 05810 #endif 05811 break; 05812 case XML_PARSER_ATTRIBUTE_VALUE: 05813 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05814 "HPP: internal error, state == ATTRIBUTE_VALUE\n", 05815 NULL, NULL); 05816 ctxt->instate = XML_PARSER_START_TAG; 05817 ctxt->checkIndex = 0; 05818 #ifdef DEBUG_PUSH 05819 xmlGenericError(xmlGenericErrorContext, 05820 "HPP: entering START_TAG\n"); 05821 #endif 05822 break; 05823 case XML_PARSER_SYSTEM_LITERAL: 05824 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05825 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", 05826 NULL, NULL); 05827 ctxt->instate = XML_PARSER_CONTENT; 05828 ctxt->checkIndex = 0; 05829 #ifdef DEBUG_PUSH 05830 xmlGenericError(xmlGenericErrorContext, 05831 "HPP: entering CONTENT\n"); 05832 #endif 05833 break; 05834 case XML_PARSER_IGNORE: 05835 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05836 "HPP: internal error, 
state == XML_PARSER_IGNORE\n", 05837 NULL, NULL); 05838 ctxt->instate = XML_PARSER_CONTENT; 05839 ctxt->checkIndex = 0; 05840 #ifdef DEBUG_PUSH 05841 xmlGenericError(xmlGenericErrorContext, 05842 "HPP: entering CONTENT\n"); 05843 #endif 05844 break; 05845 case XML_PARSER_PUBLIC_LITERAL: 05846 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05847 "HPP: internal error, state == XML_PARSER_LITERAL\n", 05848 NULL, NULL); 05849 ctxt->instate = XML_PARSER_CONTENT; 05850 ctxt->checkIndex = 0; 05851 #ifdef DEBUG_PUSH 05852 xmlGenericError(xmlGenericErrorContext, 05853 "HPP: entering CONTENT\n"); 05854 #endif 05855 break; 05856 05857 } 05858 } 05859 done: 05860 if ((avail == 0) && (terminate)) { 05861 htmlAutoCloseOnEnd(ctxt); 05862 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 05863 /* 05864 * SAX: end of the document processing. 05865 */ 05866 ctxt->instate = XML_PARSER_EOF; 05867 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 05868 ctxt->sax->endDocument(ctxt->userData); 05869 } 05870 } 05871 if ((ctxt->myDoc != NULL) && 05872 ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 05873 (ctxt->instate == XML_PARSER_EPILOG))) { 05874 xmlDtdPtr dtd; 05875 dtd = xmlGetIntSubset(ctxt->myDoc); 05876 if (dtd == NULL) 05877 ctxt->myDoc->intSubset = 05878 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 05879 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 05880 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 05881 } 05882 #ifdef DEBUG_PUSH 05883 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 05884 #endif 05885 return(ret); 05886 } 05887 05899 int 05900 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 05901 int terminate) { 05902 if ((ctxt == NULL) || (ctxt->input == NULL)) { 05903 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 05904 "htmlParseChunk: context error\n", NULL, NULL); 05905 return(XML_ERR_INTERNAL_ERROR); 05906 } 05907 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 05908 (ctxt->input->buf != 
NULL) && (ctxt->instate != XML_PARSER_EOF)) { 05909 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 05910 int cur = ctxt->input->cur - ctxt->input->base; 05911 int res; 05912 05913 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 05914 if (res < 0) { 05915 ctxt->errNo = XML_PARSER_EOF; 05916 ctxt->disableSAX = 1; 05917 return (XML_PARSER_EOF); 05918 } 05919 ctxt->input->base = ctxt->input->buf->buffer->content + base; 05920 ctxt->input->cur = ctxt->input->base + cur; 05921 ctxt->input->end = 05922 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 05923 #ifdef DEBUG_PUSH 05924 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 05925 #endif 05926 05927 #if 0 05928 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 05929 htmlParseTryOrFinish(ctxt, terminate); 05930 #endif 05931 } else if (ctxt->instate != XML_PARSER_EOF) { 05932 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 05933 xmlParserInputBufferPtr in = ctxt->input->buf; 05934 if ((in->encoder != NULL) && (in->buffer != NULL) && 05935 (in->raw != NULL)) { 05936 int nbchars; 05937 05938 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 05939 if (nbchars < 0) { 05940 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 05941 "encoder error\n", NULL, NULL); 05942 return(XML_ERR_INVALID_ENCODING); 05943 } 05944 } 05945 } 05946 } 05947 htmlParseTryOrFinish(ctxt, terminate); 05948 if (terminate) { 05949 if ((ctxt->instate != XML_PARSER_EOF) && 05950 (ctxt->instate != XML_PARSER_EPILOG) && 05951 (ctxt->instate != XML_PARSER_MISC)) { 05952 ctxt->errNo = XML_ERR_DOCUMENT_END; 05953 ctxt->wellFormed = 0; 05954 } 05955 if (ctxt->instate != XML_PARSER_EOF) { 05956 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 05957 ctxt->sax->endDocument(ctxt->userData); 05958 } 05959 ctxt->instate = XML_PARSER_EOF; 05960 } 05961 return((xmlParserErrors) ctxt->errNo); 05962 } 05963 05964 
/************************************************************************ 05965 * * 05966 * User entry points * 05967 * * 05968 ************************************************************************/ 05969 05985 htmlParserCtxtPtr 05986 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 05987 const char *chunk, int size, const char *filename, 05988 xmlCharEncoding enc) { 05989 htmlParserCtxtPtr ctxt; 05990 htmlParserInputPtr inputStream; 05991 xmlParserInputBufferPtr buf; 05992 05993 xmlInitParser(); 05994 05995 buf = xmlAllocParserInputBuffer(enc); 05996 if (buf == NULL) return(NULL); 05997 05998 ctxt = htmlNewParserCtxt(); 05999 if (ctxt == NULL) { 06000 xmlFreeParserInputBuffer(buf); 06001 return(NULL); 06002 } 06003 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 06004 ctxt->charset=XML_CHAR_ENCODING_UTF8; 06005 if (sax != NULL) { 06006 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 06007 xmlFree(ctxt->sax); 06008 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 06009 if (ctxt->sax == NULL) { 06010 xmlFree(buf); 06011 xmlFree(ctxt); 06012 return(NULL); 06013 } 06014 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 06015 if (user_data != NULL) 06016 ctxt->userData = user_data; 06017 } 06018 if (filename == NULL) { 06019 ctxt->directory = NULL; 06020 } else { 06021 ctxt->directory = xmlParserGetDirectory(filename); 06022 } 06023 06024 inputStream = htmlNewInputStream(ctxt); 06025 if (inputStream == NULL) { 06026 xmlFreeParserCtxt(ctxt); 06027 xmlFree(buf); 06028 return(NULL); 06029 } 06030 06031 if (filename == NULL) 06032 inputStream->filename = NULL; 06033 else 06034 inputStream->filename = (char *) 06035 xmlCanonicPath((const xmlChar *) filename); 06036 inputStream->buf = buf; 06037 inputStream->base = inputStream->buf->buffer->content; 06038 inputStream->cur = inputStream->buf->buffer->content; 06039 inputStream->end = 06040 &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 06041 06042 
inputPush(ctxt, inputStream); 06043 06044 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 06045 (ctxt->input->buf != NULL)) { 06046 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 06047 int cur = ctxt->input->cur - ctxt->input->base; 06048 06049 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 06050 06051 ctxt->input->base = ctxt->input->buf->buffer->content + base; 06052 ctxt->input->cur = ctxt->input->base + cur; 06053 ctxt->input->end = 06054 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 06055 #ifdef DEBUG_PUSH 06056 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 06057 #endif 06058 } 06059 ctxt->progressive = 1; 06060 06061 return(ctxt); 06062 } 06063 #endif /* LIBXML_PUSH_ENABLED */ 06064 06080 htmlDocPtr 06081 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 06082 htmlDocPtr ret; 06083 htmlParserCtxtPtr ctxt; 06084 06085 xmlInitParser(); 06086 06087 if (cur == NULL) return(NULL); 06088 06089 06090 ctxt = htmlCreateDocParserCtxt(cur, encoding); 06091 if (ctxt == NULL) return(NULL); 06092 if (sax != NULL) { 06093 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 06094 ctxt->sax = sax; 06095 ctxt->userData = userData; 06096 } 06097 06098 htmlParseDocument(ctxt); 06099 ret = ctxt->myDoc; 06100 if (sax != NULL) { 06101 ctxt->sax = NULL; 06102 ctxt->userData = NULL; 06103 } 06104 htmlFreeParserCtxt(ctxt); 06105 06106 return(ret); 06107 } 06108 06119 htmlDocPtr 06120 htmlParseDoc(xmlChar *cur, const char *encoding) { 06121 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 06122 } 06123 06124 06136 htmlParserCtxtPtr 06137 htmlCreateFileParserCtxt(const char *filename, const char *encoding) 06138 { 06139 htmlParserCtxtPtr ctxt; 06140 htmlParserInputPtr inputStream; 06141 char *canonicFilename; 06142 /* htmlCharEncoding enc; */ 06143 xmlChar *content, *content_line = (xmlChar *) "charset="; 06144 06145 if (filename == NULL) 06146 return(NULL); 06147 
06148 ctxt = htmlNewParserCtxt(); 06149 if (ctxt == NULL) { 06150 return(NULL); 06151 } 06152 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 06153 if (canonicFilename == NULL) { 06154 #ifdef LIBXML_SAX1_ENABLED 06155 if (xmlDefaultSAXHandler.error != NULL) { 06156 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 06157 } 06158 #endif 06159 xmlFreeParserCtxt(ctxt); 06160 return(NULL); 06161 } 06162 06163 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 06164 xmlFree(canonicFilename); 06165 if (inputStream == NULL) { 06166 xmlFreeParserCtxt(ctxt); 06167 return(NULL); 06168 } 06169 06170 inputPush(ctxt, inputStream); 06171 06172 /* set encoding */ 06173 if (encoding) { 06174 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 06175 if (content) { 06176 strcpy ((char *)content, (char *)content_line); 06177 strcat ((char *)content, (char *)encoding); 06178 htmlCheckEncoding (ctxt, content); 06179 xmlFree (content); 06180 } 06181 } 06182 06183 return(ctxt); 06184 } 06185 06202 htmlDocPtr 06203 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 06204 void *userData) { 06205 htmlDocPtr ret; 06206 htmlParserCtxtPtr ctxt; 06207 htmlSAXHandlerPtr oldsax = NULL; 06208 06209 xmlInitParser(); 06210 06211 ctxt = htmlCreateFileParserCtxt(filename, encoding); 06212 if (ctxt == NULL) return(NULL); 06213 if (sax != NULL) { 06214 oldsax = ctxt->sax; 06215 ctxt->sax = sax; 06216 ctxt->userData = userData; 06217 } 06218 06219 htmlParseDocument(ctxt); 06220 06221 ret = ctxt->myDoc; 06222 if (sax != NULL) { 06223 ctxt->sax = oldsax; 06224 ctxt->userData = NULL; 06225 } 06226 htmlFreeParserCtxt(ctxt); 06227 06228 return(ret); 06229 } 06230 06242 htmlDocPtr 06243 htmlParseFile(const char *filename, const char *encoding) { 06244 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 06245 } 06246 06256 int 06257 htmlHandleOmittedElem(int val) { 06258 int old = htmlOmittedDefaultValue; 
06259 06260 htmlOmittedDefaultValue = val; 06261 return(old); 06262 } 06263 06274 int 06275 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 06276 const char** p ; 06277 06278 if ( ! elt || ! parent || ! parent->subelts ) 06279 return 0 ; 06280 06281 for ( p = parent->subelts; *p; ++p ) 06282 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 06283 return 1 ; 06284 06285 return 0 ; 06286 } 06297 htmlStatus 06298 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 06299 if ( ! parent || ! elt ) 06300 return HTML_INVALID ; 06301 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 06302 return HTML_INVALID ; 06303 06304 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ; 06305 } 06317 htmlStatus 06318 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 06319 const char** p ; 06320 06321 if ( !elt || ! attr ) 06322 return HTML_INVALID ; 06323 06324 if ( elt->attrs_req ) 06325 for ( p = elt->attrs_req; *p; ++p) 06326 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 06327 return HTML_REQUIRED ; 06328 06329 if ( elt->attrs_opt ) 06330 for ( p = elt->attrs_opt; *p; ++p) 06331 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 06332 return HTML_VALID ; 06333 06334 if ( legacy && elt->attrs_depr ) 06335 for ( p = elt->attrs_depr; *p; ++p) 06336 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 06337 return HTML_DEPRECATED ; 06338 06339 return HTML_INVALID ; 06340 } 06355 htmlStatus 06356 htmlNodeStatus(const htmlNodePtr node, int legacy) { 06357 if ( ! node ) 06358 return HTML_INVALID ; 06359 06360 switch ( node->type ) { 06361 case XML_ELEMENT_NODE: 06362 return legacy 06363 ? ( htmlElementAllowedHere ( 06364 htmlTagLookup(node->parent->name) , node->name 06365 ) ? 
HTML_VALID : HTML_INVALID ) 06366 : htmlElementStatusHere( 06367 htmlTagLookup(node->parent->name) , 06368 htmlTagLookup(node->name) ) 06369 ; 06370 case XML_ATTRIBUTE_NODE: 06371 return htmlAttrAllowed( 06372 htmlTagLookup(node->parent->name) , node->name, legacy) ; 06373 default: return HTML_NA ; 06374 } 06375 } 06376 /************************************************************************ 06377 * * 06378 * New set (2.6.0) of simpler and more flexible APIs * 06379 * * 06380 ************************************************************************/ 06388 #define DICT_FREE(str) \ 06389 if ((str) && ((!dict) || \ 06390 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 06391 xmlFree((char *)(str)); 06392 06399 void 06400 htmlCtxtReset(htmlParserCtxtPtr ctxt) 06401 { 06402 xmlParserInputPtr input; 06403 xmlDictPtr dict; 06404 06405 if (ctxt == NULL) 06406 return; 06407 06408 xmlInitParser(); 06409 dict = ctxt->dict; 06410 06411 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 06412 xmlFreeInputStream(input); 06413 } 06414 ctxt->inputNr = 0; 06415 ctxt->input = NULL; 06416 06417 ctxt->spaceNr = 0; 06418 if (ctxt->spaceTab != NULL) { 06419 ctxt->spaceTab[0] = -1; 06420 ctxt->space = &ctxt->spaceTab[0]; 06421 } else { 06422 ctxt->space = NULL; 06423 } 06424 06425 06426 ctxt->nodeNr = 0; 06427 ctxt->node = NULL; 06428 06429 ctxt->nameNr = 0; 06430 ctxt->name = NULL; 06431 06432 DICT_FREE(ctxt->version); 06433 ctxt->version = NULL; 06434 DICT_FREE(ctxt->encoding); 06435 ctxt->encoding = NULL; 06436 DICT_FREE(ctxt->directory); 06437 ctxt->directory = NULL; 06438 DICT_FREE(ctxt->extSubURI); 06439 ctxt->extSubURI = NULL; 06440 DICT_FREE(ctxt->extSubSystem); 06441 ctxt->extSubSystem = NULL; 06442 if (ctxt->myDoc != NULL) 06443 xmlFreeDoc(ctxt->myDoc); 06444 ctxt->myDoc = NULL; 06445 06446 ctxt->standalone = -1; 06447 ctxt->hasExternalSubset = 0; 06448 ctxt->hasPErefs = 0; 06449 ctxt->html = 1; 06450 ctxt->external = 0; 06451 ctxt->instate = XML_PARSER_START; 
06452 ctxt->token = 0; 06453 06454 ctxt->wellFormed = 1; 06455 ctxt->nsWellFormed = 1; 06456 ctxt->disableSAX = 0; 06457 ctxt->valid = 1; 06458 ctxt->vctxt.userData = ctxt; 06459 ctxt->vctxt.error = xmlParserValidityError; 06460 ctxt->vctxt.warning = xmlParserValidityWarning; 06461 ctxt->record_info = 0; 06462 ctxt->nbChars = 0; 06463 ctxt->checkIndex = 0; 06464 ctxt->inSubset = 0; 06465 ctxt->errNo = XML_ERR_OK; 06466 ctxt->depth = 0; 06467 ctxt->charset = XML_CHAR_ENCODING_NONE; 06468 ctxt->catalogs = NULL; 06469 xmlInitNodeInfoSeq(&ctxt->node_seq); 06470 06471 if (ctxt->attsDefault != NULL) { 06472 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 06473 ctxt->attsDefault = NULL; 06474 } 06475 if (ctxt->attsSpecial != NULL) { 06476 xmlHashFree(ctxt->attsSpecial, NULL); 06477 ctxt->attsSpecial = NULL; 06478 } 06479 } 06480 06491 int 06492 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 06493 { 06494 if (ctxt == NULL) 06495 return(-1); 06496 06497 if (options & HTML_PARSE_NOWARNING) { 06498 ctxt->sax->warning = NULL; 06499 ctxt->vctxt.warning = NULL; 06500 options -= XML_PARSE_NOWARNING; 06501 ctxt->options |= XML_PARSE_NOWARNING; 06502 } 06503 if (options & HTML_PARSE_NOERROR) { 06504 ctxt->sax->error = NULL; 06505 ctxt->vctxt.error = NULL; 06506 ctxt->sax->fatalError = NULL; 06507 options -= XML_PARSE_NOERROR; 06508 ctxt->options |= XML_PARSE_NOERROR; 06509 } 06510 if (options & HTML_PARSE_PEDANTIC) { 06511 ctxt->pedantic = 1; 06512 options -= XML_PARSE_PEDANTIC; 06513 ctxt->options |= XML_PARSE_PEDANTIC; 06514 } else 06515 ctxt->pedantic = 0; 06516 if (options & XML_PARSE_NOBLANKS) { 06517 ctxt->keepBlanks = 0; 06518 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 06519 options -= XML_PARSE_NOBLANKS; 06520 ctxt->options |= XML_PARSE_NOBLANKS; 06521 } else 06522 ctxt->keepBlanks = 1; 06523 if (options & HTML_PARSE_RECOVER) { 06524 ctxt->recovery = 1; 06525 options -= HTML_PARSE_RECOVER; 06526 } else 06527 ctxt->recovery = 0; 06528 
if (options & HTML_PARSE_COMPACT) { 06529 ctxt->options |= HTML_PARSE_COMPACT; 06530 options -= HTML_PARSE_COMPACT; 06531 } 06532 if (options & XML_PARSE_HUGE) { 06533 ctxt->options |= XML_PARSE_HUGE; 06534 options -= XML_PARSE_HUGE; 06535 } 06536 if (options & HTML_PARSE_NODEFDTD) { 06537 ctxt->options |= HTML_PARSE_NODEFDTD; 06538 options -= HTML_PARSE_NODEFDTD; 06539 } 06540 ctxt->dictNames = 0; 06541 return (options); 06542 } 06543 06556 static htmlDocPtr 06557 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 06558 int options, int reuse) 06559 { 06560 htmlDocPtr ret; 06561 06562 htmlCtxtUseOptions(ctxt, options); 06563 ctxt->html = 1; 06564 if (encoding != NULL) { 06565 xmlCharEncodingHandlerPtr hdlr; 06566 06567 hdlr = xmlFindCharEncodingHandler(encoding); 06568 if (hdlr != NULL) { 06569 xmlSwitchToEncoding(ctxt, hdlr); 06570 if (ctxt->input->encoding != NULL) 06571 xmlFree((xmlChar *) ctxt->input->encoding); 06572 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); 06573 } 06574 } 06575 if ((URL != NULL) && (ctxt->input != NULL) && 06576 (ctxt->input->filename == NULL)) 06577 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 06578 htmlParseDocument(ctxt); 06579 ret = ctxt->myDoc; 06580 ctxt->myDoc = NULL; 06581 if (!reuse) { 06582 if ((ctxt->dictNames) && 06583 (ret != NULL) && 06584 (ret->dict == ctxt->dict)) 06585 ctxt->dict = NULL; 06586 xmlFreeParserCtxt(ctxt); 06587 } 06588 return (ret); 06589 } 06590 06602 htmlDocPtr 06603 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 06604 { 06605 htmlParserCtxtPtr ctxt; 06606 06607 if (cur == NULL) 06608 return (NULL); 06609 06610 xmlInitParser(); 06611 ctxt = htmlCreateDocParserCtxt(cur, NULL); 06612 if (ctxt == NULL) 06613 return (NULL); 06614 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 06615 } 06616 06627 htmlDocPtr 06628 htmlReadFile(const char *filename, const char *encoding, int options) 06629 { 06630 
htmlParserCtxtPtr ctxt; 06631 06632 xmlInitParser(); 06633 ctxt = htmlCreateFileParserCtxt(filename, encoding); 06634 if (ctxt == NULL) 06635 return (NULL); 06636 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 06637 } 06638 06651 htmlDocPtr 06652 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 06653 { 06654 htmlParserCtxtPtr ctxt; 06655 06656 xmlInitParser(); 06657 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 06658 if (ctxt == NULL) 06659 return (NULL); 06660 htmlDefaultSAXHandlerInit(); 06661 if (ctxt->sax != NULL) 06662 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 06663 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 06664 } 06665 06677 htmlDocPtr 06678 htmlReadFd(int fd, const char *URL, const char *encoding, int options) 06679 { 06680 htmlParserCtxtPtr ctxt; 06681 xmlParserInputBufferPtr input; 06682 xmlParserInputPtr stream; 06683 06684 if (fd < 0) 06685 return (NULL); 06686 06687 xmlInitParser(); 06688 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 06689 if (input == NULL) 06690 return (NULL); 06691 ctxt = xmlNewParserCtxt(); 06692 if (ctxt == NULL) { 06693 xmlFreeParserInputBuffer(input); 06694 return (NULL); 06695 } 06696 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 06697 if (stream == NULL) { 06698 xmlFreeParserInputBuffer(input); 06699 xmlFreeParserCtxt(ctxt); 06700 return (NULL); 06701 } 06702 inputPush(ctxt, stream); 06703 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 06704 } 06705 06719 htmlDocPtr 06720 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 06721 void *ioctx, const char *URL, const char *encoding, int options) 06722 { 06723 htmlParserCtxtPtr ctxt; 06724 xmlParserInputBufferPtr input; 06725 xmlParserInputPtr stream; 06726 06727 if (ioread == NULL) 06728 return (NULL); 06729 xmlInitParser(); 06730 06731 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 06732 
XML_CHAR_ENCODING_NONE); 06733 if (input == NULL) 06734 return (NULL); 06735 ctxt = htmlNewParserCtxt(); 06736 if (ctxt == NULL) { 06737 xmlFreeParserInputBuffer(input); 06738 return (NULL); 06739 } 06740 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 06741 if (stream == NULL) { 06742 xmlFreeParserInputBuffer(input); 06743 xmlFreeParserCtxt(ctxt); 06744 return (NULL); 06745 } 06746 inputPush(ctxt, stream); 06747 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 06748 } 06749 06763 htmlDocPtr 06764 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 06765 const char *URL, const char *encoding, int options) 06766 { 06767 xmlParserInputPtr stream; 06768 06769 if (cur == NULL) 06770 return (NULL); 06771 if (ctxt == NULL) 06772 return (NULL); 06773 06774 htmlCtxtReset(ctxt); 06775 06776 stream = xmlNewStringInputStream(ctxt, cur); 06777 if (stream == NULL) { 06778 return (NULL); 06779 } 06780 inputPush(ctxt, stream); 06781 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 06782 } 06783 06796 htmlDocPtr 06797 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 06798 const char *encoding, int options) 06799 { 06800 xmlParserInputPtr stream; 06801 06802 if (filename == NULL) 06803 return (NULL); 06804 if (ctxt == NULL) 06805 return (NULL); 06806 06807 htmlCtxtReset(ctxt); 06808 06809 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 06810 if (stream == NULL) { 06811 return (NULL); 06812 } 06813 inputPush(ctxt, stream); 06814 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 06815 } 06816 06831 htmlDocPtr 06832 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 06833 const char *URL, const char *encoding, int options) 06834 { 06835 xmlParserInputBufferPtr input; 06836 xmlParserInputPtr stream; 06837 06838 if (ctxt == NULL) 06839 return (NULL); 06840 if (buffer == NULL) 06841 return (NULL); 06842 06843 htmlCtxtReset(ctxt); 06844 06845 input = xmlParserInputBufferCreateMem(buffer, size, 
XML_CHAR_ENCODING_NONE); 06846 if (input == NULL) { 06847 return(NULL); 06848 } 06849 06850 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 06851 if (stream == NULL) { 06852 xmlFreeParserInputBuffer(input); 06853 return(NULL); 06854 } 06855 06856 inputPush(ctxt, stream); 06857 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 06858 } 06859 06873 htmlDocPtr 06874 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 06875 const char *URL, const char *encoding, int options) 06876 { 06877 xmlParserInputBufferPtr input; 06878 xmlParserInputPtr stream; 06879 06880 if (fd < 0) 06881 return (NULL); 06882 if (ctxt == NULL) 06883 return (NULL); 06884 06885 htmlCtxtReset(ctxt); 06886 06887 06888 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 06889 if (input == NULL) 06890 return (NULL); 06891 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 06892 if (stream == NULL) { 06893 xmlFreeParserInputBuffer(input); 06894 return (NULL); 06895 } 06896 inputPush(ctxt, stream); 06897 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 06898 } 06899 06915 htmlDocPtr 06916 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 06917 xmlInputCloseCallback ioclose, void *ioctx, 06918 const char *URL, 06919 const char *encoding, int options) 06920 { 06921 xmlParserInputBufferPtr input; 06922 xmlParserInputPtr stream; 06923 06924 if (ioread == NULL) 06925 return (NULL); 06926 if (ctxt == NULL) 06927 return (NULL); 06928 06929 htmlCtxtReset(ctxt); 06930 06931 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 06932 XML_CHAR_ENCODING_NONE); 06933 if (input == NULL) 06934 return (NULL); 06935 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 06936 if (stream == NULL) { 06937 xmlFreeParserInputBuffer(input); 06938 return (NULL); 06939 } 06940 inputPush(ctxt, stream); 06941 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 06942 } 06943 06944 #define bottom_HTMLparser 06945 #include "elfgcchack.h" 
06946 #endif /* LIBXML_HTML_ENABLED */ Generated on Sat May 26 2012 04:33:18 for ReactOS by
1.7.6.1
|