ReactOS  0.4.15-dev-3303-g1ade494
HTMLparser.c
Go to the documentation of this file.
1 /*
2  * HTMLparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12 
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef LIBXML_ZLIB_ENABLED
30 #include <zlib.h>
31 #endif
32 
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46 
47 #include "buf.h"
48 #include "enc.h"
49 
50 #define HTML_MAX_NAMELEN 1000
51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
52 #define HTML_PARSER_BUFFER_SIZE 100
53 
54 /* #define DEBUG */
55 /* #define DEBUG_PUSH */
56 
57 static int htmlOmittedDefaultValue = 1;
58 
59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60  xmlChar end, xmlChar end2, xmlChar end3);
61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
62 
63 /************************************************************************
64  * *
65  * Some factorized error routines *
66  * *
67  ************************************************************************/
68 
76 static void
77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78 {
79  if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80  (ctxt->instate == XML_PARSER_EOF))
81  return;
82  if (ctxt != NULL) {
83  ctxt->errNo = XML_ERR_NO_MEMORY;
84  ctxt->instate = XML_PARSER_EOF;
85  ctxt->disableSAX = 1;
86  }
87  if (extra)
88  __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
90  NULL, NULL, 0, 0,
91  "Memory allocation failed : %s\n", extra);
92  else
93  __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
95  NULL, NULL, 0, 0, "Memory allocation failed\n");
96 }
97 
108 static void LIBXML_ATTR_FORMAT(3,0)
109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110  const char *msg, const xmlChar *str1, const xmlChar *str2)
111 {
112  if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113  (ctxt->instate == XML_PARSER_EOF))
114  return;
115  if (ctxt != NULL)
116  ctxt->errNo = error;
117  __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118  XML_ERR_ERROR, NULL, 0,
119  (const char *) str1, (const char *) str2,
120  NULL, 0, 0,
121  msg, str1, str2);
122  if (ctxt != NULL)
123  ctxt->wellFormed = 0;
124 }
125 
135 static void LIBXML_ATTR_FORMAT(3,0)
136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137  const char *msg, int val)
138 {
139  if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140  (ctxt->instate == XML_PARSER_EOF))
141  return;
142  if (ctxt != NULL)
143  ctxt->errNo = error;
144  __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145  XML_ERR_ERROR, NULL, 0, NULL, NULL,
146  NULL, val, 0, msg, val);
147  if (ctxt != NULL)
148  ctxt->wellFormed = 0;
149 }
150 
151 /************************************************************************
152  * *
153  * Parser stacks related functions and macros *
154  * *
155  ************************************************************************/
156 
166 static int
167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168 {
169  if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170  ctxt->html = 3;
171  if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172  ctxt->html = 10;
173  if (ctxt->nameNr >= ctxt->nameMax) {
174  ctxt->nameMax *= 2;
175  ctxt->nameTab = (const xmlChar * *)
176  xmlRealloc((xmlChar * *)ctxt->nameTab,
177  ctxt->nameMax *
178  sizeof(ctxt->nameTab[0]));
179  if (ctxt->nameTab == NULL) {
180  htmlErrMemory(ctxt, NULL);
181  return (0);
182  }
183  }
184  ctxt->nameTab[ctxt->nameNr] = value;
185  ctxt->name = value;
186  return (ctxt->nameNr++);
187 }
196 static const xmlChar *
197 htmlnamePop(htmlParserCtxtPtr ctxt)
198 {
199  const xmlChar *ret;
200 
201  if (ctxt->nameNr <= 0)
202  return (NULL);
203  ctxt->nameNr--;
204  if (ctxt->nameNr < 0)
205  return (NULL);
206  if (ctxt->nameNr > 0)
207  ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208  else
209  ctxt->name = NULL;
210  ret = ctxt->nameTab[ctxt->nameNr];
211  ctxt->nameTab[ctxt->nameNr] = NULL;
212  return (ret);
213 }
214 
224 static int
225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226 {
227  if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228  if (ctxt->nodeInfoMax == 0)
229  ctxt->nodeInfoMax = 5;
230  ctxt->nodeInfoMax *= 2;
231  ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232  xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233  ctxt->nodeInfoMax *
234  sizeof(ctxt->nodeInfoTab[0]));
235  if (ctxt->nodeInfoTab == NULL) {
236  htmlErrMemory(ctxt, NULL);
237  return (0);
238  }
239  }
240  ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241  ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242  return (ctxt->nodeInfoNr++);
243 }
244 
253 static htmlParserNodeInfo *
254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255 {
256  if (ctxt->nodeInfoNr <= 0)
257  return (NULL);
258  ctxt->nodeInfoNr--;
259  if (ctxt->nodeInfoNr < 0)
260  return (NULL);
261  if (ctxt->nodeInfoNr > 0)
262  ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263  else
264  ctxt->nodeInfo = NULL;
265  return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266 }
267 
268 /*
269  * Macros for accessing the content. Those should be used only by the parser,
270  * and not exported.
271  *
272  * Dirty macros, i.e. one needs to make assumptions about the context to use them
273  *
274  * CUR_PTR return the current pointer to the xmlChar to be parsed.
275  * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
276  * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
277  * in UNICODE mode. This should be used internally by the parser
278  * only to compare to ASCII values otherwise it would break when
279  * running with UTF-8 encoding.
280  * NXT(n) returns the n'th next xmlChar. Same as CUR, it should be used only
281  * to compare against ASCII-based substrings.
282  * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
283  * it should be used only to compare on ASCII based substring.
284  * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
285  * strings without newlines within the parser.
286  *
287  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
288  *
289  * CURRENT Returns the current char value, with the full decoding of
290  * UTF-8 if we are using this mode. It returns an int.
291  * NEXT Skip to the next character, this does the proper decoding
292  * in UTF-8 mode. It also pop-up unfinished entities on the fly.
293  * NEXTL(l) Skip the current unicode character of l xmlChars long.
294  * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
295  */
296 
/*
 * All macros below expect a variable named `ctxt` (the parser context) to
 * be in scope at the point of use.
 *
 * Expression macros are fully parenthesized so they can be embedded in
 * larger expressions. SHRINK, GROW and COPY_BUF expand to a bare `if`
 * statement: use them only as a complete statement terminated by `;`, and
 * never as the unbraced body of an `if` that has its own `else`.
 */

/* Current byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input cursor and the column counter by `val` bytes. */
#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

/* Byte located `val` positions after the cursor. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/* Step over a character of `l` bytes, maintaining line/column counters. */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* Append character `v` of encoded length `l` to buffer `b` at index `i`. */
#define COPY_BUF(l,b,i,v) \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v))
348 
363 static xmlChar *
364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365  const xmlChar *start, *cur, *end;
366 
367  if ((ctxt == NULL) || (ctxt->input == NULL) ||
368  (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369  (ctxt->input->buf->encoder != NULL))
370  return(NULL);
371  if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372  return(NULL);
373 
374  start = ctxt->input->cur;
375  end = ctxt->input->end;
376  /* we also expect the input buffer to be zero terminated */
377  if (*end != 0)
378  return(NULL);
379 
380  cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381  if (cur == NULL)
382  return(NULL);
383  cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384  if (cur == NULL)
385  return(NULL);
386  cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387  if (cur == NULL)
388  return(NULL);
389  cur += 8;
390  start = cur;
391  while (((*cur >= 'A') && (*cur <= 'Z')) ||
392  ((*cur >= 'a') && (*cur <= 'z')) ||
393  ((*cur >= '0') && (*cur <= '9')) ||
394  (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395  cur++;
396  if (cur == start)
397  return(NULL);
398  return(xmlStrndup(start, cur - start));
399 }
400 
/*
 * htmlCurrentChar:
 * @ctxt:  the HTML parser context
 * @len:  output, number of input bytes making up the returned character
 *
 * Return the character at the current input position.
 *
 * - Returns 0 without touching *len when the parser is in XML_PARSER_EOF.
 * - Returns ctxt->token with *len = 0 when a token is pending.
 * - When ctxt->charset is not UTF-8, bytes below 0x80 are returned as-is
 *   with *len = 1; for other bytes htmlFindEncoding() is consulted first.
 * - Otherwise the input is decoded as UTF-8 (1 to 4 bytes), rejecting bad
 *   continuation bytes and overlong forms.
 * - A zero byte located before the end of the input is reported as
 *   XML_ERR_INVALID_CHAR and replaced by a space.
 * - On a UTF-8 decoding error XML_ERR_INVALID_ENCODING is reported and the
 *   raw current byte is returned with *len = 1.
 *
 * NOTE(review): the line numbers embedded in this listing skip in several
 * places inside this function (429->431, 452->454, 464->466, 470->472,
 * 490->492, 498->500, 505->507, 586->588); statements appear to be missing
 * from this copy. Restore them from the pristine source before building.
 */
415 static int
416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417  const unsigned char *cur;
418  unsigned char c;
419  unsigned int val;
420 
421  if (ctxt->instate == XML_PARSER_EOF)
422  return(0);
423 
424  if (ctxt->token != 0) {
425  *len = 0;
426  return(ctxt->token);
427  }
428  if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
429  xmlChar * guess;
/* NOTE(review): `handler`, used below, has no visible declaration here. */
431 
432  /*
433  * Assume it's a fixed length encoding (1) with
434  * a compatible encoding for the ASCII set, since
435  * HTML constructs only use < 128 chars
436  */
437  if ((int) *ctxt->input->cur < 0x80) {
438  *len = 1;
439  if ((*ctxt->input->cur == 0) &&
440  (ctxt->input->cur < ctxt->input->end)) {
441  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442  "Char 0x%X out of allowed range\n", 0);
443  return(' ');
444  }
445  return((int) *ctxt->input->cur);
446  }
447 
448  /*
449  * Humm this is bad, do an automatic flow conversion
450  */
451  guess = htmlFindEncoding(ctxt);
452  if (guess == NULL) {
/* NOTE(review): the body of this branch is not visible in this copy. */
454  } else {
455  if (ctxt->input->encoding != NULL)
456  xmlFree((xmlChar *) ctxt->input->encoding);
/* The input takes ownership of the string returned by htmlFindEncoding(). */
457  ctxt->input->encoding = guess;
458  handler = xmlFindCharEncodingHandler((const char *) guess);
459  if (handler != NULL) {
460  /*
461  * Don't use UTF-8 encoder which isn't required and
462  * can produce invalid UTF-8.
463  */
464  if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
/* NOTE(review): the statement guarded by the `if` above is not visible. */
466  } else {
467  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468  "Unsupported encoding %s", guess, NULL);
469  }
470  }
472  }
473 
474  /*
475  * We are supposed to handle UTF8, check it's valid
476  * From rfc2044: encoding of the Unicode values on UTF-8:
477  *
478  * UCS-4 range (hex.) UTF-8 octet sequence (binary)
479  * 0000 0000-0000 007F 0xxxxxxx
480  * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
481  * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
482  *
483  * Check for the 0x110000 limit too
484  */
485  cur = ctxt->input->cur;
486  c = *cur;
487  if (c & 0x80) {
/* A lead byte must have bit 0x40 set; 10xxxxxx is a continuation byte. */
488  if ((c & 0x40) == 0)
489  goto encoding_error;
490  if (cur[1] == 0) {
/* NOTE(review): a statement before the reload of `cur` is not visible
 * here, nor in the two similar blocks for cur[2] and cur[3] below. */
492  cur = ctxt->input->cur;
493  }
494  if ((cur[1] & 0xc0) != 0x80)
495  goto encoding_error;
496  if ((c & 0xe0) == 0xe0) {
497 
498  if (cur[2] == 0) {
500  cur = ctxt->input->cur;
501  }
502  if ((cur[2] & 0xc0) != 0x80)
503  goto encoding_error;
504  if ((c & 0xf0) == 0xf0) {
505  if (cur[3] == 0) {
507  cur = ctxt->input->cur;
508  }
509  if (((c & 0xf8) != 0xf0) ||
510  ((cur[3] & 0xc0) != 0x80))
511  goto encoding_error;
512  /* 4-byte code */
513  *len = 4;
514  val = (cur[0] & 0x7) << 18;
515  val |= (cur[1] & 0x3f) << 12;
516  val |= (cur[2] & 0x3f) << 6;
517  val |= cur[3] & 0x3f;
/* Values below 0x10000 fit in fewer bytes: reject the overlong form. */
518  if (val < 0x10000)
519  goto encoding_error;
520  } else {
521  /* 3-byte code */
522  *len = 3;
523  val = (cur[0] & 0xf) << 12;
524  val |= (cur[1] & 0x3f) << 6;
525  val |= cur[2] & 0x3f;
/* Values below 0x800 fit in fewer bytes: reject the overlong form. */
526  if (val < 0x800)
527  goto encoding_error;
528  }
529  } else {
530  /* 2-byte code */
531  *len = 2;
532  val = (cur[0] & 0x1f) << 6;
533  val |= cur[1] & 0x3f;
/* Values below 0x80 fit in one byte: reject the overlong form. */
534  if (val < 0x80)
535  goto encoding_error;
536  }
/* An out-of-range value is reported but still returned to the caller. */
537  if (!IS_CHAR(val)) {
538  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539  "Char 0x%X out of allowed range\n", val);
540  }
541  return(val);
542  } else {
543  if ((*ctxt->input->cur == 0) &&
544  (ctxt->input->cur < ctxt->input->end)) {
545  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546  "Char 0x%X out of allowed range\n", 0);
547  *len = 1;
548  return(' ');
549  }
550  /* 1-byte code */
551  *len = 1;
552  return((int) *ctxt->input->cur);
553  }
554 
555 encoding_error:
556  /*
557  * If we detect an UTF8 error that probably mean that the
558  * input encoding didn't get properly advertised in the
559  * declaration header. Report the error and switch the encoding
560  * to ISO-Latin-1 (if you don't like this policy, just declare the
561  * encoding !)
562  */
563  {
564  char buffer[150];
565 
/* Dump four bytes when that many remain in the input, else only one. */
566  if (ctxt->input->end - ctxt->input->cur >= 4) {
567  snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568  ctxt->input->cur[0], ctxt->input->cur[1],
569  ctxt->input->cur[2], ctxt->input->cur[3]);
570  } else {
571  snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572  }
573  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574  "Input is not proper UTF-8, indicate encoding !\n",
575  BAD_CAST buffer, NULL);
576  }
577 
578  /*
579  * Don't switch encodings twice. Note that if there's an encoder, we
580  * shouldn't receive invalid UTF-8 anyway.
581  *
582  * Note that if ctxt->input->buf == NULL, switching encodings is
583  * impossible, see Gitlab issue #34.
584  */
585  if ((ctxt->input->buf != NULL) &&
586  (ctxt->input->buf->encoder == NULL))
/* NOTE(review): the statement guarded by the `if` above is not visible;
 * as written, the guard would apply to `*len = 1` instead. */
588  *len = 1;
589  return((int) *ctxt->input->cur);
590 }
591 
/*
 * htmlSkipBlankChars:
 * @ctxt:  the HTML parser context
 *
 * Advance the input cursor while the current byte satisfies IS_BLANK_CH,
 * keeping the line and column counters of the input up to date.
 *
 * Returns the number of loop iterations performed (one per byte stepped
 * over, or per xmlPopInput() call).
 */
601 static int
602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603  int res = 0;
604 
605  while (IS_BLANK_CH(*(ctxt->input->cur))) {
/* On a zero byte, try to read more input; pop the input if none comes. */
606  if ((*ctxt->input->cur == 0) &&
607  (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608  xmlPopInput(ctxt);
609  } else {
/* A newline starts a new line at column 1; any other byte adds a column. */
610  if (*(ctxt->input->cur) == '\n') {
611  ctxt->input->line++; ctxt->input->col = 1;
612  } else ctxt->input->col++;
613  ctxt->input->cur++;
614  if (*ctxt->input->cur == 0)
/* NOTE(review): the embedded numbering skips 615 and the `if` above has no
 * visible body; a statement appears to be missing from this copy. */
616  }
617  res++;
618  }
619  return(res);
620 }
621 
622 
623 
624 /************************************************************************
625  * *
626  * The list of HTML elements and their properties *
627  * *
628  ************************************************************************/
629 
630 /*
631  * Start Tag: 1 means the start tag can be omitted
632  * End Tag: 1 means the end tag can be omitted
633  * 2 means it's forbidden (empty elements)
634  * 3 means the tag is stylistic and should be closed easily
635  * Depr: this element is deprecated
636  * DTD: 1 means that this element is valid only in the Loose DTD
637  * 2 means that this element is valid only in the Frameset DTD
638  *
639  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
640  , subElements , impliedsubelt , Attributes, userdata
641  */
642 
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each element class is a comma-separated list of element names meant to
 * be spliced into an array initializer; the matching NB_* macro is the
 * number of names in that list. The NB_* sums are parenthesized so they
 * keep their value inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of the elements in the FLOW and INLINE classes. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
676 
677 
/*
 * ... and for HTML Attributes.
 *
 * Each attribute class is a comma-separated list of attribute names meant
 * to be spliced into an array initializer; the matching NB_* macro is the
 * number of names in that list.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute lists shared by many element descriptions. */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
697 
698 
699 /* Other declarations that should go inline ... */
700 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
701  "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
702  "tabindex", "onfocus", "onblur", NULL } ;
703 static const char* const target_attr[] = { "target", NULL } ;
704 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
705 static const char* const alt_attr[] = { "alt", NULL } ;
706 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
707 static const char* const href_attrs[] = { "href", NULL } ;
708 static const char* const clear_attrs[] = { "clear", NULL } ;
709 static const char* const inline_p[] = { INLINE, "p", NULL } ;
710 
711 static const char* const flow_param[] = { FLOW, "param", NULL } ;
712 static const char* const applet_attrs[] = { COREATTRS , "codebase",
713  "archive", "alt", "name", "height", "width", "align",
714  "hspace", "vspace", NULL } ;
715 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
716  "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
717 static const char* const basefont_attrs[] =
718  { "id", "size", "color", "face", NULL } ;
719 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
720 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
721 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
722 static const char* const body_depr[] = { "background", "bgcolor", "text",
723  "link", "vlink", "alink", NULL } ;
724 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
725  "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
726 
727 
728 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
729 static const char* const col_elt[] = { "col", NULL } ;
730 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
731 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
732 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
733 static const char* const compact_attr[] = { "compact", NULL } ;
734 static const char* const label_attr[] = { "label", NULL } ;
735 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
736 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
737 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
738 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
739 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
740 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
741 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
742 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
743 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
744 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
745 static const char* const version_attr[] = { "version", NULL } ;
746 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
747 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
748 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
749 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
750 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
751 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
752 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
753 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
754 static const char* const align_attr[] = { "align", NULL } ;
755 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
756 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
757 static const char* const name_attr[] = { "name", NULL } ;
758 static const char* const action_attr[] = { "action", NULL } ;
759 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
760 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
761 static const char* const content_attr[] = { "content", NULL } ;
762 static const char* const type_attr[] = { "type", NULL } ;
763 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
764 static const char* const object_contents[] = { FLOW, "param", NULL } ;
765 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
766 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
767 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
768 static const char* const option_elt[] = { "option", NULL } ;
769 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
770 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
771 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
772 static const char* const width_attr[] = { "width", NULL } ;
773 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
774 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
775 static const char* const language_attr[] = { "language", NULL } ;
776 static const char* const select_content[] = { "optgroup", "option", NULL } ;
777 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
778 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
779 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
780 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
781 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
782 static const char* const tr_elt[] = { "tr", NULL } ;
783 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
784 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
785 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
786 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
787 static const char* const tr_contents[] = { "th", "td", NULL } ;
788 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
789 static const char* const li_elt[] = { "li", NULL } ;
790 static const char* const ul_depr[] = { "type", "compact", NULL} ;
791 static const char* const dir_attr[] = { "dir", NULL} ;
792 
793 #define DECL (const char**)
794 
795 static const htmlElemDesc
796 html40ElementTable[] = {
797 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
798  DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
799 },
800 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802 },
803 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
804  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805 },
806 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
807  DECL inline_p , NULL , DECL html_attrs, NULL, NULL
808 },
809 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
810  DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
811 },
812 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813  EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
814 },
815 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
816  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817 },
818 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
819  EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
820 },
821 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
822  EMPTY , NULL , NULL, DECL basefont_attrs, NULL
823 },
824 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825  DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
826 },
827 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
828  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829 },
830 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
831  DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
832 },
833 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
834  DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
835 },
836 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
837  EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
838 },
839 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
840  DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
841 },
842 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
843  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
844 },
845 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846  DECL html_flow , NULL , NULL, DECL html_attrs, NULL
847 },
848 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
849  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
850 },
851 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853 },
854 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
855  EMPTY , NULL , DECL col_attrs , NULL, NULL
856 },
857 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
858  DECL col_elt , "col" , DECL col_attrs , NULL, NULL
859 },
860 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
861  DECL html_flow , NULL , DECL html_attrs, NULL, NULL
862 },
863 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
864  DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
865 },
866 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
867  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
868 },
869 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
870  DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
871 },
872 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873  DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
874 },
875 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
876  DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
877 },
878 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
879  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
880 },
881 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
882  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883 },
884 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885  EMPTY, NULL, DECL embed_attrs, NULL, NULL
886 },
887 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
888  DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
889 },
890 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
891  DECL html_inline, NULL, NULL, DECL font_attrs, NULL
892 },
893 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
894  DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
895 },
896 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897  EMPTY, NULL, NULL, DECL frame_attrs, NULL
898 },
899 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900  DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
901 },
902 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
903  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904 },
905 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
906  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907 },
908 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
909  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910 },
911 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
912  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913 },
914 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
915  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916 },
917 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
918  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919 },
920 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
921  DECL head_contents, NULL, DECL head_attrs, NULL, NULL
922 },
923 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924  EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
925 },
926 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
927  DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
928 },
929 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
930  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931 },
932 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933  DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
934 },
935 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
936  EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
937 },
938 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
939  EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
940 },
941 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
942  DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
943 },
944 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945  EMPTY, NULL, NULL, DECL prompt_attrs, NULL
946 },
947 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949 },
950 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
951  DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
952 },
953 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954  DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
955 },
956 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
957  DECL html_flow, NULL, DECL html_attrs, NULL, NULL
958 },
959 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960  EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
961 },
962 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963  DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
964 },
965 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
966  DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
967 },
968 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969  EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
970 },
971 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972  DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
973 },
974 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975  DECL html_flow, "div", DECL html_attrs, NULL, NULL
976 },
977 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978  DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
979 },
980 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
981  DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
982 },
983 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
984  DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
985 },
986 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987  DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
988 },
989 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
990  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
991 },
992 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
993  EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
994 },
995 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996  DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
997 },
998 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999  DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000 },
1001 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003 },
1004 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006 },
1007 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
1008  DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009 },
1010 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
1011  DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012 },
1013 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1014  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021 },
1022 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024 },
1025 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1026  DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027 },
1028 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1029  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030 },
1031 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1032  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1035  DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036 },
1037 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1038  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1041  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042 },
1043 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044  DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045 },
1046 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1047  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048 },
1049 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1050  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051 },
1052 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1053  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054 },
1055 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1056  DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057 },
1058 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1059  DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060 },
1061 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063 },
1064 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066 },
1067 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068  DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069 },
1070 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072 }
1073 };
1074 
/*
 * One auto-close rule, stored as a pair of element names.
 * htmlCheckAutoClose() builds a key of this type from the currently
 * open element (oldTag) and the element about to be opened (newTag).
 */
1075 typedef struct {
1076  const char *oldTag; /* name of the element that is currently open */
1077  const char *newTag; /* name of the start tag that implies closing oldTag */
1078 } htmlStartCloseEntry;
1079 
1080 /*
1081  * start tags that imply the end of current element
1082  */
/*
 * Each entry reads { oldTag, newTag }.
 *
 * This array is searched with bsearch() by htmlCheckAutoClose(), using
 * htmlCompareStartClose() as comparator, so the entries must stay sorted
 * by oldTag first and newTag second, both in strcmp() order.
 */
1083 static const htmlStartCloseEntry htmlStartClose[] = {
1084  { "a", "a" },
1085  { "a", "fieldset" },
1086  { "a", "table" },
1087  { "a", "td" },
1088  { "a", "th" },
1089  { "address", "dd" },
1090  { "address", "dl" },
1091  { "address", "dt" },
1092  { "address", "form" },
1093  { "address", "li" },
1094  { "address", "ul" },
1095  { "b", "center" },
1096  { "b", "p" },
1097  { "b", "td" },
1098  { "b", "th" },
1099  { "big", "p" },
1100  { "caption", "col" },
1101  { "caption", "colgroup" },
1102  { "caption", "tbody" },
1103  { "caption", "tfoot" },
1104  { "caption", "thead" },
1105  { "caption", "tr" },
1106  { "col", "col" },
1107  { "col", "colgroup" },
1108  { "col", "tbody" },
1109  { "col", "tfoot" },
1110  { "col", "thead" },
1111  { "col", "tr" },
1112  { "colgroup", "colgroup" },
1113  { "colgroup", "tbody" },
1114  { "colgroup", "tfoot" },
1115  { "colgroup", "thead" },
1116  { "colgroup", "tr" },
1117  { "dd", "dt" },
1118  { "dir", "dd" },
1119  { "dir", "dl" },
1120  { "dir", "dt" },
1121  { "dir", "form" },
1122  { "dir", "ul" },
1123  { "dl", "form" },
1124  { "dl", "li" },
1125  { "dt", "dd" },
1126  { "dt", "dl" },
1127  { "font", "center" },
1128  { "font", "td" },
1129  { "font", "th" },
1130  { "form", "form" },
1131  { "h1", "fieldset" },
1132  { "h1", "form" },
1133  { "h1", "li" },
1134  { "h1", "p" },
1135  { "h1", "table" },
1136  { "h2", "fieldset" },
1137  { "h2", "form" },
1138  { "h2", "li" },
1139  { "h2", "p" },
1140  { "h2", "table" },
1141  { "h3", "fieldset" },
1142  { "h3", "form" },
1143  { "h3", "li" },
1144  { "h3", "p" },
1145  { "h3", "table" },
1146  { "h4", "fieldset" },
1147  { "h4", "form" },
1148  { "h4", "li" },
1149  { "h4", "p" },
1150  { "h4", "table" },
1151  { "h5", "fieldset" },
1152  { "h5", "form" },
1153  { "h5", "li" },
1154  { "h5", "p" },
1155  { "h5", "table" },
1156  { "h6", "fieldset" },
1157  { "h6", "form" },
1158  { "h6", "li" },
1159  { "h6", "p" },
1160  { "h6", "table" },
1161  { "head", "a" },
1162  { "head", "abbr" },
1163  { "head", "acronym" },
1164  { "head", "address" },
1165  { "head", "b" },
1166  { "head", "bdo" },
1167  { "head", "big" },
1168  { "head", "blockquote" },
1169  { "head", "body" },
1170  { "head", "br" },
1171  { "head", "center" },
1172  { "head", "cite" },
1173  { "head", "code" },
1174  { "head", "dd" },
1175  { "head", "dfn" },
1176  { "head", "dir" },
1177  { "head", "div" },
1178  { "head", "dl" },
1179  { "head", "dt" },
1180  { "head", "em" },
1181  { "head", "fieldset" },
1182  { "head", "font" },
1183  { "head", "form" },
1184  { "head", "frameset" },
1185  { "head", "h1" },
1186  { "head", "h2" },
1187  { "head", "h3" },
1188  { "head", "h4" },
1189  { "head", "h5" },
1190  { "head", "h6" },
1191  { "head", "hr" },
1192  { "head", "i" },
1193  { "head", "iframe" },
1194  { "head", "img" },
1195  { "head", "kbd" },
1196  { "head", "li" },
1197  { "head", "listing" },
1198  { "head", "map" },
1199  { "head", "menu" },
1200  { "head", "ol" },
1201  { "head", "p" },
1202  { "head", "pre" },
1203  { "head", "q" },
1204  { "head", "s" },
1205  { "head", "samp" },
1206  { "head", "small" },
1207  { "head", "span" },
1208  { "head", "strike" },
1209  { "head", "strong" },
1210  { "head", "sub" },
1211  { "head", "sup" },
1212  { "head", "table" },
1213  { "head", "tt" },
1214  { "head", "u" },
1215  { "head", "ul" },
1216  { "head", "var" },
1217  { "head", "xmp" },
1218  { "hr", "form" },
1219  { "i", "center" },
1220  { "i", "p" },
1221  { "i", "td" },
1222  { "i", "th" },
1223  { "legend", "fieldset" },
1224  { "li", "li" },
1225  { "link", "body" },
1226  { "link", "frameset" },
1227  { "listing", "dd" },
1228  { "listing", "dl" },
1229  { "listing", "dt" },
1230  { "listing", "fieldset" },
1231  { "listing", "form" },
1232  { "listing", "li" },
1233  { "listing", "table" },
1234  { "listing", "ul" },
1235  { "menu", "dd" },
1236  { "menu", "dl" },
1237  { "menu", "dt" },
1238  { "menu", "form" },
1239  { "menu", "ul" },
1240  { "ol", "form" },
1241  { "ol", "ul" },
1242  { "option", "optgroup" },
1243  { "option", "option" },
1244  { "p", "address" },
1245  { "p", "blockquote" },
1246  { "p", "body" },
1247  { "p", "caption" },
1248  { "p", "center" },
1249  { "p", "col" },
1250  { "p", "colgroup" },
1251  { "p", "dd" },
1252  { "p", "dir" },
1253  { "p", "div" },
1254  { "p", "dl" },
1255  { "p", "dt" },
1256  { "p", "fieldset" },
1257  { "p", "form" },
1258  { "p", "frameset" },
1259  { "p", "h1" },
1260  { "p", "h2" },
1261  { "p", "h3" },
1262  { "p", "h4" },
1263  { "p", "h5" },
1264  { "p", "h6" },
1265  { "p", "head" },
1266  { "p", "hr" },
1267  { "p", "li" },
1268  { "p", "listing" },
1269  { "p", "menu" },
1270  { "p", "ol" },
1271  { "p", "p" },
1272  { "p", "pre" },
1273  { "p", "table" },
1274  { "p", "tbody" },
1275  { "p", "td" },
1276  { "p", "tfoot" },
1277  { "p", "th" },
1278  { "p", "title" },
1279  { "p", "tr" },
1280  { "p", "ul" },
1281  { "p", "xmp" },
1282  { "pre", "dd" },
1283  { "pre", "dl" },
1284  { "pre", "dt" },
1285  { "pre", "fieldset" },
1286  { "pre", "form" },
1287  { "pre", "li" },
1288  { "pre", "table" },
1289  { "pre", "ul" },
1290  { "s", "p" },
1291  { "script", "noscript" },
1292  { "small", "p" },
1293  { "span", "td" },
1294  { "span", "th" },
1295  { "strike", "p" },
1296  { "style", "body" },
1297  { "style", "frameset" },
1298  { "tbody", "tbody" },
1299  { "tbody", "tfoot" },
1300  { "td", "tbody" },
1301  { "td", "td" },
1302  { "td", "tfoot" },
1303  { "td", "th" },
1304  { "td", "tr" },
1305  { "tfoot", "tbody" },
1306  { "th", "tbody" },
1307  { "th", "td" },
1308  { "th", "tfoot" },
1309  { "th", "th" },
1310  { "th", "tr" },
1311  { "thead", "tbody" },
1312  { "thead", "tfoot" },
1313  { "title", "body" },
1314  { "title", "frameset" },
1315  { "tr", "tbody" },
1316  { "tr", "tfoot" },
1317  { "tr", "tr" },
1318  { "tt", "p" },
1319  { "u", "p" },
1320  { "u", "td" },
1321  { "u", "th" },
1322  { "ul", "address" },
1323  { "ul", "form" },
1324  { "ul", "menu" },
1325  { "ul", "ol" },
1326  { "ul", "pre" },
1327  { "xmp", "dd" },
1328  { "xmp", "dl" },
1329  { "xmp", "dt" },
1330  { "xmp", "fieldset" },
1331  { "xmp", "form" },
1332  { "xmp", "li" },
1333  { "xmp", "table" },
1334  { "xmp", "ul" }
1335 };
1336 
1337 /*
1338  * The list of HTML elements which are supposed not to have
1339  * CDATA content and where a p element will be implied
1340  *
1341  * TODO: extend that list by reading the HTML SGML DTD on
1342  * implied paragraph
1343  */
/*
 * NULL-terminated array: htmlCheckParagraph() walks it until it reaches
 * the NULL entry and inserts an implied <p> when the current element
 * name matches one of these names.
 */
1344 static const char *const htmlNoContentElements[] = {
1345  "html",
1346  "head",
1347  NULL
1348 };
1349 
1350 /*
1351  * The list of HTML attributes which are of content %Script;
1352  * NOTE: when adding ones, check htmlIsScriptAttribute() since
1353  * it assumes the name starts with 'on'
1354  */
/*
 * Not NULL-terminated: htmlIsScriptAttribute() derives the number of
 * entries from sizeof() and compares each one with xmlStrEqual().
 */
1355 static const char *const htmlScriptAttributes[] = {
1356  "onclick",
1357  "ondblclick",
1358  "onmousedown",
1359  "onmouseup",
1360  "onmouseover",
1361  "onmousemove",
1362  "onmouseout",
1363  "onkeypress",
1364  "onkeydown",
1365  "onkeyup",
1366  "onload",
1367  "onunload",
1368  "onfocus",
1369  "onblur",
1370  "onsubmit",
1371  "onreset",
1372  "onchange",
1373  "onselect"
1374 };
1375 
1376 /*
1377  * This table is used by the htmlparser to know what to do with
1378  * broken html pages. By assigning different priorities to different
1379  * elements the parser can decide how to handle extra endtags.
1380  * Endtags are only allowed to close elements with lower or equal
1381  * priority.
1382  */
1383 
/*
 * Associates an element name with the priority that
 * htmlAutoCloseOnClose() compares when it handles an end tag.
 */
1384 typedef struct {
1385  const char *name; /* element name; NULL in the terminating default entry */
1386  int priority; /* priority value returned by htmlGetEndPriority() */
1387 } elementPriority;
1388 
/*
 * Scanned linearly by htmlGetEndPriority(). The last entry has a NULL
 * name and stops the scan; its priority is the value returned for any
 * element name that is not listed here.
 */
1389 static const elementPriority htmlEndPriority[] = {
1390  {"div", 150},
1391  {"td", 160},
1392  {"th", 160},
1393  {"tr", 170},
1394  {"thead", 180},
1395  {"tbody", 180},
1396  {"tfoot", 180},
1397  {"table", 190},
1398  {"head", 200},
1399  {"body", 200},
1400  {"html", 220},
1401  {NULL, 100} /* Default priority */
1402 };
1403 
1404 /************************************************************************
1405  * *
1406  * functions to handle HTML specific data *
1407  * *
1408  ************************************************************************/
1409 
/*
 * htmlInitAutoClose:
 *
 * Does nothing: the body is empty. The auto-close rules are held in the
 * constant htmlStartClose table, which needs no runtime setup.
 * NOTE(review): presumably kept only so existing callers still link;
 * confirm against the public header.
 */
1415 void
1416 htmlInitAutoClose(void) {
1417 }
1418 
1419 static int
1420 htmlCompareTags(const void *key, const void *member) {
1421  const xmlChar *tag = (const xmlChar *) key;
1422  const htmlElemDesc *desc = (const htmlElemDesc *) member;
1423 
1424  return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1425 }
1426 
1435 const htmlElemDesc *
1436 htmlTagLookup(const xmlChar *tag) {
1437  if (tag == NULL)
1438  return(NULL);
1439 
1440  return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441  sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442  sizeof(htmlElemDesc), htmlCompareTags));
1443 }
1444 
1451 static int
1452 htmlGetEndPriority (const xmlChar *name) {
1453  int i = 0;
1454 
1455  while ((htmlEndPriority[i].name != NULL) &&
1456  (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457  i++;
1458 
1459  return(htmlEndPriority[i].priority);
1460 }
1461 
1462 
1463 static int
1464 htmlCompareStartClose(const void *vkey, const void *member) {
1465  const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466  const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467  int ret;
1468 
1469  ret = strcmp(key->oldTag, entry->oldTag);
1470  if (ret == 0)
1471  ret = strcmp(key->newTag, entry->newTag);
1472 
1473  return(ret);
1474 }
1475 
1486 static int
1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488 {
1489  htmlStartCloseEntry key;
1490  void *res;
1491 
1492  key.oldTag = (const char *) oldtag;
1493  key.newTag = (const char *) newtag;
1494  res = bsearch(&key, htmlStartClose,
1495  sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496  sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497  return(res != NULL);
1498 }
1499 
1508 static void
1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510 {
1511  const htmlElemDesc *info;
1512  int i, priority;
1513 
1514  priority = htmlGetEndPriority(newtag);
1515 
1516  for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517 
1518  if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519  break;
1520  /*
1521  * A misplaced endtag can only close elements with lower
1522  * or equal priority, so if we find an element with higher
1523  * priority before we find an element with
1524  * matching name, we just ignore this endtag
1525  */
1526  if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527  return;
1528  }
1529  if (i < 0)
1530  return;
1531 
1532  while (!xmlStrEqual(newtag, ctxt->name)) {
1533  info = htmlTagLookup(ctxt->name);
1534  if ((info != NULL) && (info->endTag == 3)) {
1535  htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536  "Opening and ending tag mismatch: %s and %s\n",
1537  newtag, ctxt->name);
1538  }
1539  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540  ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541  htmlnamePop(ctxt);
1542  }
1543 }
1544 
1551 static void
1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553 {
1554  int i;
1555 
1556  if (ctxt->nameNr == 0)
1557  return;
1558  for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560  ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561  htmlnamePop(ctxt);
1562  }
1563 }
1564 
1577 static void
1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579 {
1580  while ((newtag != NULL) && (ctxt->name != NULL) &&
1581  (htmlCheckAutoClose(newtag, ctxt->name))) {
1582  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583  ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584  htmlnamePop(ctxt);
1585  }
1586  if (newtag == NULL) {
1587  htmlAutoCloseOnEnd(ctxt);
1588  return;
1589  }
1590  while ((newtag == NULL) && (ctxt->name != NULL) &&
1591  ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592  (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593  (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595  ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596  htmlnamePop(ctxt);
1597  }
1598 }
1599 
1613 int
1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615  htmlNodePtr child;
1616 
1617  if (elem == NULL) return(1);
1618  if (xmlStrEqual(name, elem->name)) return(0);
1619  if (htmlCheckAutoClose(elem->name, name)) return(1);
1620  child = elem->children;
1621  while (child != NULL) {
1622  if (htmlAutoCloseTag(doc, name, child)) return(1);
1623  child = child->next;
1624  }
1625  return(0);
1626 }
1627 
1639 int
1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641  htmlNodePtr child;
1642 
1643  if (elem == NULL) return(1);
1644  child = elem->children;
1645  while (child != NULL) {
1646  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647  child = child->next;
1648  }
1649  return(0);
1650 }
1651 
1661 static void
1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663  int i;
1664 
1665  if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666  return;
1667  if (!htmlOmittedDefaultValue)
1668  return;
1669  if (xmlStrEqual(newtag, BAD_CAST"html"))
1670  return;
1671  if (ctxt->nameNr <= 0) {
1672  htmlnamePush(ctxt, BAD_CAST"html");
1673  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674  ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675  }
1676  if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677  return;
1678  if ((ctxt->nameNr <= 1) &&
1679  ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680  (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681  (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682  (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683  (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684  (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685  if (ctxt->html >= 3) {
1686  /* we already saw or generated an <head> before */
1687  return;
1688  }
1689  /*
1690  * dropped OBJECT ... i you put it first BODY will be
1691  * assumed !
1692  */
1693  htmlnamePush(ctxt, BAD_CAST"head");
1694  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695  ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696  } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697  (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698  (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699  if (ctxt->html >= 10) {
1700  /* we already saw or generated a <body> before */
1701  return;
1702  }
1703  for (i = 0;i < ctxt->nameNr;i++) {
1704  if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705  return;
1706  }
1707  if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708  return;
1709  }
1710  }
1711 
1712  htmlnamePush(ctxt, BAD_CAST"body");
1713  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714  ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715  }
1716 }
1717 
1729 static int
1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731  const xmlChar *tag;
1732  int i;
1733 
1734  if (ctxt == NULL)
1735  return(-1);
1736  tag = ctxt->name;
1737  if (tag == NULL) {
1738  htmlAutoClose(ctxt, BAD_CAST"p");
1739  htmlCheckImplied(ctxt, BAD_CAST"p");
1740  htmlnamePush(ctxt, BAD_CAST"p");
1741  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742  ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743  return(1);
1744  }
1745  if (!htmlOmittedDefaultValue)
1746  return(0);
1747  for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748  if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749  htmlAutoClose(ctxt, BAD_CAST"p");
1750  htmlCheckImplied(ctxt, BAD_CAST"p");
1751  htmlnamePush(ctxt, BAD_CAST"p");
1752  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753  ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754  return(1);
1755  }
1756  }
1757  return(0);
1758 }
1759 
1768 int
1769 htmlIsScriptAttribute(const xmlChar *name) {
1770  unsigned int i;
1771 
1772  if (name == NULL)
1773  return(0);
1774  /*
1775  * all script attributes start with 'on'
1776  */
1777  if ((name[0] != 'o') || (name[1] != 'n'))
1778  return(0);
1779  for (i = 0;
1780  i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781  i++) {
1782  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783  return(1);
1784  }
1785  return(0);
1786 }
1787 
1788 /************************************************************************
1789  * *
1790  * The list of HTML predefined entities *
1791  * *
1792  ************************************************************************/
1793 
1794 
1795 static const htmlEntityDesc html40EntitiesTable[] = {
1796 /*
1797  * the 4 absolute ones, plus apostrophe.
1798  */
1799 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1800 { 38, "amp", "ampersand, U+0026 ISOnum" },
1801 { 39, "apos", "single quote" },
1802 { 60, "lt", "less-than sign, U+003C ISOnum" },
1803 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1804 
1805 /*
1806  * A bunch still in the 128-255 range
1807  * Replacing them depend really on the charset used.
1808  */
1809 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1810 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1811 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1812 { 163, "pound","pound sign, U+00A3 ISOnum" },
1813 { 164, "curren","currency sign, U+00A4 ISOnum" },
1814 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1815 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1816 { 167, "sect", "section sign, U+00A7 ISOnum" },
1817 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1818 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1819 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1820 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1821 { 172, "not", "not sign, U+00AC ISOnum" },
1822 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1823 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1824 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1825 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1826 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1827 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1828 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1829 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1830 { 181, "micro","micro sign, U+00B5 ISOnum" },
1831 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1832 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1833 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1834 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1835 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1836 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1837 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1838 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1839 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1840 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1841 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1842 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1843 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1844 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1845 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1846 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1847 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1848 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1849 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1850 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1851 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1852 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1853 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1854 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1855 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1856 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1857 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1858 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1859 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1860 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1861 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1862 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1863 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1864 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1865 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1866 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1867 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1868 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1869 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1870 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1871 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1872 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1873 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1874 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1875 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1876 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1877 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1878 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1879 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1880 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1881 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1882 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1883 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1884 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1885 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1886 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1887 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1888 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1889 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1890 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1891 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1892 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1893 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1894 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1895 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1896 { 247, "divide","division sign, U+00F7 ISOnum" },
1897 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1898 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1899 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1900 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1901 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1902 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1903 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1904 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1905 
1906 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1907 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1908 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1909 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1910 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1911 
1912 /*
1913  * Anything below should really be kept as entities references
1914  */
1915 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1916 
1917 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1918 { 732, "tilde","small tilde, U+02DC ISOdia" },
1919 
1920 { 913, "Alpha","greek capital letter alpha, U+0391" },
1921 { 914, "Beta", "greek capital letter beta, U+0392" },
1922 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1923 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1924 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1925 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1926 { 919, "Eta", "greek capital letter eta, U+0397" },
1927 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1928 { 921, "Iota", "greek capital letter iota, U+0399" },
1929 { 922, "Kappa","greek capital letter kappa, U+039A" },
1930 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1931 { 924, "Mu", "greek capital letter mu, U+039C" },
1932 { 925, "Nu", "greek capital letter nu, U+039D" },
1933 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1934 { 927, "Omicron","greek capital letter omicron, U+039F" },
1935 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1936 { 929, "Rho", "greek capital letter rho, U+03A1" },
1937 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1938 { 932, "Tau", "greek capital letter tau, U+03A4" },
1939 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1940 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1941 { 935, "Chi", "greek capital letter chi, U+03A7" },
1942 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1943 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1944 
1945 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1946 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1947 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1948 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1949 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1950 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1951 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1952 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1953 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1954 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1955 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1956 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1957 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1958 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1959 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1960 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1961 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1962 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1963 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1964 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1965 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1966 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1967 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1968 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1969 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1970 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1971 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1972 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1973 
1974 { 8194, "ensp", "en space, U+2002 ISOpub" },
1975 { 8195, "emsp", "em space, U+2003 ISOpub" },
1976 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1977 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1978 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1979 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1980 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1981 { 8211, "ndash","en dash, U+2013 ISOpub" },
1982 { 8212, "mdash","em dash, U+2014 ISOpub" },
1983 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1984 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1985 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1986 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1987 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1988 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1989 { 8224, "dagger","dagger, U+2020 ISOpub" },
1990 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1991 
1992 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1993 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1994 
1995 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1996 
1997 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1998 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1999 
2000 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
2001 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
2002 
2003 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
2004 { 8260, "frasl","fraction slash, U+2044 NEW" },
2005 
2006 { 8364, "euro", "euro sign, U+20AC NEW" },
2007 
2008 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
2009 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
2010 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
2011 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
2012 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2013 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
2014 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
2015 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
2016 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
2017 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
2018 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2019 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
2020 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
2021 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
2022 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
2023 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2024 
2025 { 8704, "forall","for all, U+2200 ISOtech" },
2026 { 8706, "part", "partial differential, U+2202 ISOtech" },
2027 { 8707, "exist","there exists, U+2203 ISOtech" },
2028 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2029 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2030 { 8712, "isin", "element of, U+2208 ISOtech" },
2031 { 8713, "notin","not an element of, U+2209 ISOtech" },
2032 { 8715, "ni", "contains as member, U+220B ISOtech" },
2033 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2034 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
2035 { 8722, "minus","minus sign, U+2212 ISOtech" },
2036 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2037 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
2038 { 8733, "prop", "proportional to, U+221D ISOtech" },
2039 { 8734, "infin","infinity, U+221E ISOtech" },
2040 { 8736, "ang", "angle, U+2220 ISOamso" },
2041 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
2042 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
2043 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
2044 { 8746, "cup", "union = cup, U+222A ISOtech" },
2045 { 8747, "int", "integral, U+222B ISOtech" },
2046 { 8756, "there4","therefore, U+2234 ISOtech" },
2047 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
2048 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2049 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2050 { 8800, "ne", "not equal to, U+2260 ISOtech" },
2051 { 8801, "equiv","identical to, U+2261 ISOtech" },
2052 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
2053 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2054 { 8834, "sub", "subset of, U+2282 ISOtech" },
2055 { 8835, "sup", "superset of, U+2283 ISOtech" },
2056 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2057 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2058 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2059 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2060 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2061 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2062 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2063 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2064 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2065 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2066 { 8971, "rfloor","right floor, U+230B ISOamsc" },
2067 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2068 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2069 { 9674, "loz", "lozenge, U+25CA ISOpub" },
2070 
2071 { 9824, "spades","black spade suit, U+2660 ISOpub" },
2072 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2073 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2074 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
2075 
2076 };
2077 
2078 /************************************************************************
2079  * *
2080  * Commodity functions to handle entities *
2081  * *
2082  ************************************************************************/
2083 
/*
 * growBuffer:
 *
 * Doubles buffer##_size and reallocates `buffer` to hold that many
 * xmlChar. The expansion site must have a variable named `ctxt` in
 * scope and must be inside a function returning a pointer: when the
 * reallocation fails the macro reports the error through
 * htmlErrMemory(), frees the old buffer and returns NULL from the
 * enclosing function.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
2098 
2109 const htmlEntityDesc *
2110 htmlEntityLookup(const xmlChar *name) {
2111  unsigned int i;
2112 
2113  for (i = 0;i < (sizeof(html40EntitiesTable)/
2114  sizeof(html40EntitiesTable[0]));i++) {
2115  if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116  return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117  }
2118  }
2119  return(NULL);
2120 }
2121 
2132 const htmlEntityDesc *
2133 htmlEntityValueLookup(unsigned int value) {
2134  unsigned int i;
2135 
2136  for (i = 0;i < (sizeof(html40EntitiesTable)/
2137  sizeof(html40EntitiesTable[0]));i++) {
2138  if (html40EntitiesTable[i].value >= value) {
2139  if (html40EntitiesTable[i].value > value)
2140  break;
2141  return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142  }
2143  }
2144  return(NULL);
2145 }
2146 
2162 int
2163 UTF8ToHtml(unsigned char* out, int *outlen,
2164  const unsigned char* in, int *inlen) {
2165  const unsigned char* processed = in;
2166  const unsigned char* outend;
2167  const unsigned char* outstart = out;
2168  const unsigned char* instart = in;
2169  const unsigned char* inend;
2170  unsigned int c, d;
2171  int trailing;
2172 
2173  if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174  if (in == NULL) {
2175  /*
2176  * initialization nothing to do
2177  */
2178  *outlen = 0;
2179  *inlen = 0;
2180  return(0);
2181  }
2182  inend = in + (*inlen);
2183  outend = out + (*outlen);
2184  while (in < inend) {
2185  d = *in++;
2186  if (d < 0x80) { c= d; trailing= 0; }
2187  else if (d < 0xC0) {
2188  /* trailing byte in leading position */
2189  *outlen = out - outstart;
2190  *inlen = processed - instart;
2191  return(-2);
2192  } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2193  else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2194  else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2195  else {
2196  /* no chance for this in Ascii */
2197  *outlen = out - outstart;
2198  *inlen = processed - instart;
2199  return(-2);
2200  }
2201 
2202  if (inend - in < trailing) {
2203  break;
2204  }
2205 
2206  for ( ; trailing; trailing--) {
2207  if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208  break;
2209  c <<= 6;
2210  c |= d & 0x3F;
2211  }
2212 
2213  /* assertion: c is a single UTF-4 value */
2214  if (c < 0x80) {
2215  if (out + 1 >= outend)
2216  break;
2217  *out++ = c;
2218  } else {
2219  int len;
2220  const htmlEntityDesc * ent;
2221  const char *cp;
2222  char nbuf[16];
2223 
2224  /*
2225  * Try to lookup a predefined HTML entity for it
2226  */
2227 
2228  ent = htmlEntityValueLookup(c);
2229  if (ent == NULL) {
2230  snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231  cp = nbuf;
2232  }
2233  else
2234  cp = ent->name;
2235  len = strlen(cp);
2236  if (out + 2 + len >= outend)
2237  break;
2238  *out++ = '&';
2239  memcpy(out, cp, len);
2240  out += len;
2241  *out++ = ';';
2242  }
2243  processed = in;
2244  }
2245  *outlen = out - outstart;
2246  *inlen = processed - instart;
2247  return(0);
2248 }
2249 
2266 int
2267 htmlEncodeEntities(unsigned char* out, int *outlen,
2268  const unsigned char* in, int *inlen, int quoteChar) {
2269  const unsigned char* processed = in;
2270  const unsigned char* outend;
2271  const unsigned char* outstart = out;
2272  const unsigned char* instart = in;
2273  const unsigned char* inend;
2274  unsigned int c, d;
2275  int trailing;
2276 
2277  if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278  return(-1);
2279  outend = out + (*outlen);
2280  inend = in + (*inlen);
2281  while (in < inend) {
2282  d = *in++;
2283  if (d < 0x80) { c= d; trailing= 0; }
2284  else if (d < 0xC0) {
2285  /* trailing byte in leading position */
2286  *outlen = out - outstart;
2287  *inlen = processed - instart;
2288  return(-2);
2289  } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2290  else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2291  else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2292  else {
2293  /* no chance for this in Ascii */
2294  *outlen = out - outstart;
2295  *inlen = processed - instart;
2296  return(-2);
2297  }
2298 
2299  if (inend - in < trailing)
2300  break;
2301 
2302  while (trailing--) {
2303  if (((d= *in++) & 0xC0) != 0x80) {
2304  *outlen = out - outstart;
2305  *inlen = processed - instart;
2306  return(-2);
2307  }
2308  c <<= 6;
2309  c |= d & 0x3F;
2310  }
2311 
2312  /* assertion: c is a single UTF-4 value */
2313  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314  (c != '&') && (c != '<') && (c != '>')) {
2315  if (out >= outend)
2316  break;
2317  *out++ = c;
2318  } else {
2319  const htmlEntityDesc * ent;
2320  const char *cp;
2321  char nbuf[16];
2322  int len;
2323 
2324  /*
2325  * Try to lookup a predefined HTML entity for it
2326  */
2327  ent = htmlEntityValueLookup(c);
2328  if (ent == NULL) {
2329  snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330  cp = nbuf;
2331  }
2332  else
2333  cp = ent->name;
2334  len = strlen(cp);
2335  if (out + 2 + len > outend)
2336  break;
2337  *out++ = '&';
2338  memcpy(out, cp, len);
2339  out += len;
2340  *out++ = ';';
2341  }
2342  processed = in;
2343  }
2344  *outlen = out - outstart;
2345  *inlen = processed - instart;
2346  return(0);
2347 }
2348 
2349 /************************************************************************
2350  * *
2351  * Commodity functions to handle streams *
2352  * *
2353  ************************************************************************/
2354 
2355 #ifdef LIBXML_PUSH_ENABLED
2356 
2363 static htmlParserInputPtr
2364 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2365  htmlParserInputPtr input;
2366 
2367  input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2368  if (input == NULL) {
2369  htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2370  return(NULL);
2371  }
2372  memset(input, 0, sizeof(htmlParserInput));
2373  input->filename = NULL;
2374  input->directory = NULL;
2375  input->base = NULL;
2376  input->cur = NULL;
2377  input->buf = NULL;
2378  input->line = 1;
2379  input->col = 1;
2380  input->buf = NULL;
2381  input->free = NULL;
2382  input->version = NULL;
2383  input->consumed = 0;
2384  input->length = 0;
2385  return(input);
2386 }
2387 #endif
2388 
2389 
2390 /************************************************************************
2391  * *
2392  * Commodity functions, cleanup needed ? *
2393  * *
2394  ************************************************************************/
/*
 * Names of all elements allowing PCDATA in the HTML 4.01 loose DTD.
 * areBlanks() scans this list to decide whether whitespace next to
 * one of these elements has to be kept.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2409 
2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422  unsigned int i;
2423  int j;
2424  xmlNodePtr lastChild;
2425  xmlDtdPtr dtd;
2426 
2427  for (j = 0;j < len;j++)
2428  if (!(IS_BLANK_CH(str[j]))) return(0);
2429 
2430  if (CUR == 0) return(1);
2431  if (CUR != '<') return(0);
2432  if (ctxt->name == NULL)
2433  return(1);
2434  if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435  return(1);
2436  if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437  return(1);
2438 
2439  /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440  if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441  dtd = xmlGetIntSubset(ctxt->myDoc);
2442  if (dtd != NULL && dtd->ExternalID != NULL) {
2443  if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444  !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445  return(1);
2446  }
2447  }
2448 
2449  if (ctxt->node == NULL) return(0);
2450  lastChild = xmlGetLastChild(ctxt->node);
2451  while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452  lastChild = lastChild->prev;
2453  if (lastChild == NULL) {
2454  if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455  (ctxt->node->content != NULL)) return(0);
2456  /* keep ws in constructs like ...<b> </b>...
2457  for all tags "b" allowing PCDATA */
2458  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459  if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460  return(0);
2461  }
2462  }
2463  } else if (xmlNodeIsText(lastChild)) {
2464  return(0);
2465  } else {
2466  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467  for all tags "p" allowing PCDATA */
2468  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469  if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470  return(0);
2471  }
2472  }
2473  }
2474  return(1);
2475 }
2476 
2487 htmlDocPtr
2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489  xmlDocPtr cur;
2490 
2491  /*
2492  * Allocate a new document and fill the fields.
2493  */
2494  cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495  if (cur == NULL) {
2496  htmlErrMemory(NULL, "HTML document creation failed\n");
2497  return(NULL);
2498  }
2499  memset(cur, 0, sizeof(xmlDoc));
2500 
2501  cur->type = XML_HTML_DOCUMENT_NODE;
2502  cur->version = NULL;
2503  cur->intSubset = NULL;
2504  cur->doc = cur;
2505  cur->name = NULL;
2506  cur->children = NULL;
2507  cur->extSubset = NULL;
2508  cur->oldNs = NULL;
2509  cur->encoding = NULL;
2510  cur->standalone = 1;
2511  cur->compression = 0;
2512  cur->ids = NULL;
2513  cur->refs = NULL;
2514  cur->_private = NULL;
2515  cur->charset = XML_CHAR_ENCODING_UTF8;
2516  cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517  if ((ExternalID != NULL) ||
2518  (URI != NULL))
2519  xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520  return(cur);
2521 }
2522 
2532 htmlDocPtr
2533 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2534  if ((URI == NULL) && (ExternalID == NULL))
2535  return(htmlNewDocNoDtD(
2536  BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2537  BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2538 
2539  return(htmlNewDocNoDtD(URI, ExternalID));
2540 }
2541 
2542 
2543 /************************************************************************
2544  * *
2545  * The parser itself *
2546  * Relates to http://www.w3.org/TR/html40 *
2547  * *
2548  ************************************************************************/
2549 
2550 /************************************************************************
2551  * *
2552  * The parser itself *
2553  * *
2554  ************************************************************************/
2555 
2556 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2557 
2568 static const xmlChar *
2569 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2570  int i = 0;
2571  xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2572 
2573  if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2574  (CUR != ':') && (CUR != '.')) return(NULL);
2575 
2576  while ((i < HTML_PARSER_BUFFER_SIZE) &&
2577  ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2578  (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2579  (CUR == '.'))) {
2580  if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2581  else loc[i] = CUR;
2582  i++;
2583 
2584  NEXT;
2585  }
2586 
2587  return(xmlDictLookup(ctxt->dict, loc, i));
2588 }
2589 
2590 
2602 static const xmlChar *
2603 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2604  int i = 0;
2605  xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2606 
2607  if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2608  (NXT(1) != ':')) return(NULL);
2609 
2610  while ((i < HTML_PARSER_BUFFER_SIZE) &&
2611  ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2612  (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2613  if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2614  else loc[i] = NXT(1+i);
2615  i++;
2616  }
2617 
2618  return(xmlDictLookup(ctxt->dict, loc, i));
2619 }
2620 
2621 
2631 static const xmlChar *
2632 htmlParseName(htmlParserCtxtPtr ctxt) {
2633  const xmlChar *in;
2634  const xmlChar *ret;
2635  int count = 0;
2636 
2637  GROW;
2638 
2639  /*
2640  * Accelerator for simple ASCII names
2641  */
2642  in = ctxt->input->cur;
2643  if (((*in >= 0x61) && (*in <= 0x7A)) ||
2644  ((*in >= 0x41) && (*in <= 0x5A)) ||
2645  (*in == '_') || (*in == ':')) {
2646  in++;
2647  while (((*in >= 0x61) && (*in <= 0x7A)) ||
2648  ((*in >= 0x41) && (*in <= 0x5A)) ||
2649  ((*in >= 0x30) && (*in <= 0x39)) ||
2650  (*in == '_') || (*in == '-') ||
2651  (*in == ':') || (*in == '.'))
2652  in++;
2653 
2654  if (in == ctxt->input->end)
2655  return(NULL);
2656 
2657  if ((*in > 0) && (*in < 0x80)) {
2658  count = in - ctxt->input->cur;
2659  ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2660  ctxt->input->cur = in;
2661  ctxt->input->col += count;
2662  return(ret);
2663  }
2664  }
2665  return(htmlParseNameComplex(ctxt));
2666 }
2667 
2668 static const xmlChar *
2669 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2670  int len = 0, l;
2671  int c;
2672  int count = 0;
2673  const xmlChar *base = ctxt->input->base;
2674 
2675  /*
2676  * Handler for more complex cases
2677  */
2678  GROW;
2679  c = CUR_CHAR(l);
2680  if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2681  (!IS_LETTER(c) && (c != '_') &&
2682  (c != ':'))) {
2683  return(NULL);
2684  }
2685 
2686  while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2687  ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2688  (c == '.') || (c == '-') ||
2689  (c == '_') || (c == ':') ||
2690  (IS_COMBINING(c)) ||
2691  (IS_EXTENDER(c)))) {
2692  if (count++ > 100) {
2693  count = 0;
2694  GROW;
2695  }
2696  len += l;
2697  NEXTL(l);
2698  c = CUR_CHAR(l);
2699  if (ctxt->input->base != base) {
2700  /*
2701  * We changed encoding from an unknown encoding
2702  * Input buffer changed location, so we better start again
2703  */
2704  return(htmlParseNameComplex(ctxt));
2705  }
2706  }
2707 
2708  if (ctxt->input->cur - ctxt->input->base < len) {
2709  /* Sanity check */
2710  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2711  "unexpected change of input buffer", NULL, NULL);
2712  return (NULL);
2713  }
2714 
2715  return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2716 }
2717 
2718 
2730 static xmlChar *
2731 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2732  xmlChar *buffer = NULL;
2733  int buffer_size = 0;
2734  xmlChar *out = NULL;
2735  const xmlChar *name = NULL;
2736  const xmlChar *cur = NULL;
2737  const htmlEntityDesc * ent;
2738 
2739  /*
2740  * allocate a translation buffer.
2741  */
2742  buffer_size = HTML_PARSER_BUFFER_SIZE;
2744  if (buffer == NULL) {
2745  htmlErrMemory(ctxt, "buffer allocation failed\n");
2746  return(NULL);
2747  }
2748  out = buffer;
2749 
2750  /*
2751  * Ok loop until we reach one of the ending chars
2752  */
2753  while ((CUR != 0) && (CUR != stop)) {
2754  if ((stop == 0) && (CUR == '>')) break;
2755  if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2756  if (CUR == '&') {
2757  if (NXT(1) == '#') {
2758  unsigned int c;
2759  int bits;
2760 
2761  c = htmlParseCharRef(ctxt);
2762  if (c < 0x80)
2763  { *out++ = c; bits= -6; }
2764  else if (c < 0x800)
2765  { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2766  else if (c < 0x10000)
2767  { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2768  else
2769  { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2770 
2771  for ( ; bits >= 0; bits-= 6) {
2772  *out++ = ((c >> bits) & 0x3F) | 0x80;
2773  }
2774 
2775  if (out - buffer > buffer_size - 100) {
2776  int indx = out - buffer;
2777 
2778  growBuffer(buffer);
2779  out = &buffer[indx];
2780  }
2781  } else {
2782  ent = htmlParseEntityRef(ctxt, &name);
2783  if (name == NULL) {
2784  *out++ = '&';
2785  if (out - buffer > buffer_size - 100) {
2786  int indx = out - buffer;
2787 
2788  growBuffer(buffer);
2789  out = &buffer[indx];
2790  }
2791  } else if (ent == NULL) {
2792  *out++ = '&';
2793  cur = name;
2794  while (*cur != 0) {
2795  if (out - buffer > buffer_size - 100) {
2796  int indx = out - buffer;
2797 
2798  growBuffer(buffer);
2799  out = &buffer[indx];
2800  }
2801  *out++ = *cur++;
2802  }
2803  } else {
2804  unsigned int c;
2805  int bits;
2806 
2807  if (out - buffer > buffer_size - 100) {
2808  int indx = out - buffer;
2809 
2810  growBuffer(buffer);
2811  out = &buffer[indx];
2812  }
2813  c = ent->value;
2814  if (c < 0x80)
2815  { *out++ = c; bits= -6; }
2816  else if (c < 0x800)
2817  { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2818  else if (c < 0x10000)
2819  { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2820  else
2821  { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2822 
2823  for ( ; bits >= 0; bits-= 6) {
2824  *out++ = ((c >> bits) & 0x3F) | 0x80;
2825  }
2826  }
2827  }
2828  } else {
2829  unsigned int c;
2830  int bits, l;
2831 
2832  if (out - buffer > buffer_size - 100) {
2833  int indx = out - buffer;
2834 
2835  growBuffer(buffer);
2836  out = &buffer[indx];
2837  }
2838  c = CUR_CHAR(l);
2839  if (c < 0x80)
2840  { *out++ = c; bits= -6; }
2841  else if (c < 0x800)
2842  { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2843  else if (c < 0x10000)
2844  { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2845  else
2846  { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2847 
2848  for ( ; bits >= 0; bits-= 6) {
2849  *out++ = ((c >> bits) & 0x3F) | 0x80;
2850  }
2851  NEXT;
2852  }
2853  }
2854  *out = 0;
2855  return(buffer);
2856 }
2857 
2870 const htmlEntityDesc *
2871 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2872  const xmlChar *name;
2873  const htmlEntityDesc * ent = NULL;
2874 
2875  if (str != NULL) *str = NULL;
2876  if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2877 
2878  if (CUR == '&') {
2879  NEXT;
2880  name = htmlParseName(ctxt);
2881  if (name == NULL) {
2882  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2883  "htmlParseEntityRef: no name\n", NULL, NULL);
2884  } else {
2885  GROW;
2886  if (CUR == ';') {
2887  if (str != NULL)
2888  *str = name;
2889 
2890  /*
2891  * Lookup the entity in the table.
2892  */
2893  ent = htmlEntityLookup(name);
2894  if (ent != NULL) /* OK that's ugly !!! */
2895  NEXT;
2896  } else {
2897  htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2898  "htmlParseEntityRef: expecting ';'\n",
2899  NULL, NULL);
2900  if (str != NULL)
2901  *str = name;
2902  }
2903  }
2904  }
2905  return(ent);
2906 }
2907 
2920 static xmlChar *
2921 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2922  xmlChar *ret = NULL;
2923 
2924  if (CUR == '"') {
2925  NEXT;
2926  ret = htmlParseHTMLAttribute(ctxt, '"');
2927  if (CUR != '"') {
2928  htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2929  "AttValue: \" expected\n", NULL, NULL);
2930  } else
2931  NEXT;
2932  } else if (CUR == '\'') {
2933  NEXT;
2934  ret = htmlParseHTMLAttribute(ctxt, '\'');
2935  if (CUR != '\'') {
2936  htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2937  "AttValue: ' expected\n", NULL, NULL);
2938  } else
2939  NEXT;
2940  } else {
2941  /*
2942  * That's an HTMLism, the attribute value may not be quoted
2943  */
2944  ret = htmlParseHTMLAttribute(ctxt, 0);
2945  if (ret == NULL) {
2946  htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2947  "AttValue: no value found\n", NULL, NULL);
2948  }
2949  }
2950  return(ret);
2951 }
2952 
2964 static xmlChar *
2965 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2966  size_t len = 0, startPosition = 0;
2967  int err = 0;
2968  int quote;
2969  xmlChar *ret = NULL;
2970 
2971  if ((CUR != '"') && (CUR != '\'')) {
2972  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2973  "SystemLiteral \" or ' expected\n", NULL, NULL);
2974  return(NULL);
2975  }
2976  quote = CUR;
2977  NEXT;
2978 
2979  if (CUR_PTR < BASE_PTR)
2980  return(ret);
2981  startPosition = CUR_PTR - BASE_PTR;
2982 
2983  while ((CUR != 0) && (CUR != quote)) {
2984  /* TODO: Handle UTF-8 */
2985  if (!IS_CHAR_CH(CUR)) {
2986  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2987  "Invalid char in SystemLiteral 0x%X\n", CUR);
2988  err = 1;
2989  }
2990  NEXT;
2991  len++;
2992  }
2993  if (CUR != quote) {
2994  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2995  "Unfinished SystemLiteral\n", NULL, NULL);
2996  } else {
2997  NEXT;
2998  if (err == 0)
2999  ret = xmlStrndup((BASE_PTR+startPosition), len);
3000  }
3001 
3002  return(ret);
3003 }
3004 
3016 static xmlChar *
3017 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3018  size_t len = 0, startPosition = 0;
3019  int err = 0;
3020  int quote;
3021  xmlChar *ret = NULL;
3022 
3023  if ((CUR != '"') && (CUR != '\'')) {
3024  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3025  "PubidLiteral \" or ' expected\n", NULL, NULL);
3026  return(NULL);
3027  }
3028  quote = CUR;
3029  NEXT;
3030 
3031  /*
3032  * Name ::= (Letter | '_') (NameChar)*
3033  */
3034  if (CUR_PTR < BASE_PTR)
3035  return(ret);
3036  startPosition = CUR_PTR - BASE_PTR;
3037 
3038  while ((CUR != 0) && (CUR != quote)) {
3039  if (!IS_PUBIDCHAR_CH(CUR)) {
3040  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3041  "Invalid char in PubidLiteral 0x%X\n", CUR);
3042  err = 1;
3043  }
3044  len++;
3045  NEXT;
3046  }
3047 
3048  if (CUR != '"') {
3049  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3050  "Unfinished PubidLiteral\n", NULL, NULL);
3051  } else {
3052  NEXT;
3053  if (err == 0)
3054  ret = xmlStrndup((BASE_PTR + startPosition), len);
3055  }
3056 
3057  return(ret);
3058 }
3059 
3081 static void
3082 htmlParseScript(htmlParserCtxtPtr ctxt) {
3083  xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3084  int nbchar = 0;
3085  int cur,l;
3086 
3087  SHRINK;
3088  cur = CUR_CHAR(l);
3089  while (cur != 0) {
3090  if ((cur == '<') && (NXT(1) == '/')) {
3091  /*
3092  * One should break here, the specification is clear:
3093  * Authors should therefore escape "</" within the content.
3094  * Escape mechanisms are specific to each scripting or
3095  * style sheet language.
3096  *
3097  * In recovery mode, only break if end tag match the
3098  * current tag, effectively ignoring all tags inside the
3099  * script/style block and treating the entire block as
3100  * CDATA.
3101  */
3102  if (ctxt->recovery) {
3103  if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3104  xmlStrlen(ctxt->name)) == 0)
3105  {
3106  break; /* while */
3107  } else {
3108  htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3109  "Element %s embeds close tag\n",
3110  ctxt->name, NULL);
3111  }
3112  } else {
3113  if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3114  ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3115  {
3116  break; /* while */
3117  }
3118  }
3119  }
3120  if (IS_CHAR(cur)) {
3121  COPY_BUF(l,buf,nbchar,cur);
3122  } else {
3123  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3124  "Invalid char in CDATA 0x%X\n", cur);
3125  }
3126  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3127  buf[nbchar] = 0;
3128  if (ctxt->sax->cdataBlock!= NULL) {
3129  /*
3130  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3131  */
3132  ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3133  } else if (ctxt->sax->characters != NULL) {
3134  ctxt->sax->characters(ctxt->userData, buf, nbchar);
3135  }
3136  nbchar = 0;
3137  }
3138  GROW;
3139  NEXTL(l);
3140  cur = CUR_CHAR(l);
3141  }
3142 
3143  if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144  buf[nbchar] = 0;
3145  if (ctxt->sax->cdataBlock!= NULL) {
3146  /*
3147  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3148  */
3149  ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3150  } else if (ctxt->sax->characters != NULL) {
3151  ctxt->sax->characters(ctxt->userData, buf, nbchar);
3152  }
3153  }
3154 }
3155 
3156 
3168 static void
3169 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3170  xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3171  int nbchar = 0;
3172  int cur, l;
3173  int chunk = 0;
3174 
3175  if (readahead)
3176  buf[nbchar++] = readahead;
3177 
3178  SHRINK;
3179  cur = CUR_CHAR(l);
3180  while (((cur != '<') || (ctxt->token == '<')) &&
3181  ((cur != '&') || (ctxt->token == '&')) &&
3182  (cur != 0)) {
3183  if (!(IS_CHAR(cur))) {
3184  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3185  "Invalid char in CDATA 0x%X\n", cur);
3186  } else {
3187  COPY_BUF(l,buf,nbchar,cur);
3188  }
3189  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3190  buf[nbchar] = 0;
3191 
3192  /*
3193  * Ok the segment is to be consumed as chars.
3194  */
3195  if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3196  if (areBlanks(ctxt, buf, nbchar)) {
3197  if (ctxt->keepBlanks) {
3198  if (ctxt->sax->characters != NULL)
3199  ctxt->sax->characters(ctxt->userData, buf, nbchar);
3200  } else {
3201  if (ctxt->sax->ignorableWhitespace != NULL)
3202  ctxt->sax->ignorableWhitespace(ctxt->userData,
3203  buf, nbchar);
3204  }
3205  } else {
3206  htmlCheckParagraph(ctxt);
3207  if (ctxt->sax->characters != NULL)
3208  ctxt->sax->characters(ctxt->userData, buf, nbchar);
3209  }
3210  }
3211  nbchar = 0;
3212  }
3213  NEXTL(l);
3214  chunk++;
3215  if (chunk > HTML_PARSER_BUFFER_SIZE) {
3216  chunk = 0;
3217  SHRINK;
3218  GROW;
3219  }
3220  cur = CUR_CHAR(l);
3221  if (cur == 0) {
3222  SHRINK;
3223  GROW;
3224  cur = CUR_CHAR(l);
3225  }
3226  }
3227  if (nbchar != 0) {
3228  buf[nbchar] = 0;
3229 
3230  /*
3231  * Ok the segment is to be consumed as chars.
3232  */
3233  if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3234  if (areBlanks(ctxt, buf, nbchar)) {
3235  if (ctxt->keepBlanks) {
3236  if (ctxt->sax->characters != NULL)
3237  ctxt->sax->characters(ctxt->userData, buf, nbchar);
3238  } else {
3239  if (ctxt->sax->ignorableWhitespace != NULL)
3240  ctxt->sax->ignorableWhitespace(ctxt->userData,
3241  buf, nbchar);
3242  }
3243  } else {
3244  htmlCheckParagraph(ctxt);
3245  if (ctxt->sax->characters != NULL)
3246  ctxt->sax->characters(ctxt->userData, buf, nbchar);
3247  }
3248  }
3249  } else {
3250  /*
3251  * Loop detection
3252  */
3253  if (cur == 0)
3254  ctxt->instate = XML_PARSER_EOF;
3255  }
3256 }
3257 
/*
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 *
 * Parse a run of character data.  This is htmlParseCharDataInternal()
 * called with readahead set to 0, i.e. with no character placed in front
 * of the parsed text.
 */
3268 static void
3269 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3270  htmlParseCharDataInternal(ctxt, 0);
3271 }
3272 
3290 static xmlChar *
3291 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3292  xmlChar *URI = NULL;
3293 
3294  if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3295  (UPP(2) == 'S') && (UPP(3) == 'T') &&
3296  (UPP(4) == 'E') && (UPP(5) == 'M')) {
3297  SKIP(6);
3298  if (!IS_BLANK_CH(CUR)) {
3299  htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3300  "Space required after 'SYSTEM'\n", NULL, NULL);
3301  }
3302  SKIP_BLANKS;
3303  URI = htmlParseSystemLiteral(ctxt);
3304  if (URI == NULL) {
3305  htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3306  "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3307  }
3308  } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3309  (UPP(2) == 'B') && (UPP(3) == 'L') &&
3310  (UPP(4) == 'I') && (UPP(5) == 'C')) {
3311  SKIP(6);
3312  if (!IS_BLANK_CH(CUR)) {
3313  htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3314  "Space required after 'PUBLIC'\n", NULL, NULL);
3315  }
3316  SKIP_BLANKS;
3317  *publicID = htmlParsePubidLiteral(ctxt);
3318  if (*publicID == NULL) {
3319  htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3320  "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3321  NULL, NULL);
3322  }
3323  SKIP_BLANKS;
3324  if ((CUR == '"') || (CUR == '\'')) {
3325  URI = htmlParseSystemLiteral(ctxt);
3326  }
3327  }
3328  return(URI);
3329 }
3330 
3339 static void
3340 htmlParsePI(htmlParserCtxtPtr ctxt) {
3341  xmlChar *buf = NULL;
3342  int len = 0;
3343  int size = HTML_PARSER_BUFFER_SIZE;
3344  int cur, l;
3345  const xmlChar *target;
3347  int count = 0;
3348 
3349  if ((RAW == '<') && (NXT(1) == '?')) {
3350  state = ctxt->instate;
3351  ctxt->instate = XML_PARSER_PI;
3352  /*
3353  * this is a Processing Instruction.
3354  */
3355  SKIP(2);
3356  SHRINK;
3357 
3358  /*
3359  * Parse the target name and check for special support like
3360  * namespace.
3361  */
3362  target = htmlParseName(ctxt);
3363  if (target != NULL) {
3364  if (RAW == '>') {
3365  SKIP(1);
3366 
3367  /*
3368  * SAX: PI detected.
3369  */
3370  if ((ctxt->sax) && (!ctxt->disableSAX) &&
3371  (ctxt->sax->processingInstruction != NULL))
3372  ctxt->sax->processingInstruction(ctxt->userData,
3373  target, NULL);
3374  ctxt->instate = state;
3375  return;
3376  }
3377  buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3378  if (buf == NULL) {
3379  htmlErrMemory(ctxt, NULL);
3380  ctxt->instate = state;
3381  return;
3382  }
3383  cur = CUR;
3384  if (!IS_BLANK(cur)) {
3385  htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3386  "ParsePI: PI %s space expected\n", target, NULL);
3387  }
3388  SKIP_BLANKS;
3389  cur = CUR_CHAR(l);
3390  while ((cur != 0) && (cur != '>')) {
3391  if (len + 5 >= size) {
3392  xmlChar *tmp;
3393 
3394  size *= 2;
3395  tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3396  if (tmp == NULL) {
3397  htmlErrMemory(ctxt, NULL);
3398  xmlFree(buf);
3399  ctxt->instate = state;
3400  return;
3401  }
3402  buf = tmp;
3403  }
3404  count++;
3405  if (count > 50) {
3406  GROW;
3407  count = 0;
3408  }
3409  if (IS_CHAR(cur)) {
3410  COPY_BUF(l,buf,len,cur);
3411  } else {
3412  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3413  "Invalid char in processing instruction "
3414  "0x%X\n", cur);
3415  }
3416  NEXTL(l);
3417  cur = CUR_CHAR(l);
3418  if (cur == 0) {
3419  SHRINK;
3420  GROW;
3421  cur = CUR_CHAR(l);
3422  }
3423  }
3424  buf[len] = 0;
3425  if (cur != '>') {
3426  htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3427  "ParsePI: PI %s never end ...\n", target, NULL);
3428  } else {
3429  SKIP(1);
3430 
3431  /*
3432  * SAX: PI detected.
3433  */
3434  if ((ctxt->sax) && (!ctxt->disableSAX) &&
3435  (ctxt->sax->processingInstruction != NULL))
3436  ctxt->sax->processingInstruction(ctxt->userData,
3437  target, buf);
3438  }
3439  xmlFree(buf);
3440  } else {
3441  htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3442  "PI is not started correctly", NULL, NULL);
3443  }
3444  ctxt->instate = state;
3445  }
3446 }
3447 
3456 static void
3457 htmlParseComment(htmlParserCtxtPtr ctxt) {
3458  xmlChar *buf = NULL;
3459  int len;
3460  int size = HTML_PARSER_BUFFER_SIZE;
3461  int q, ql;
3462  int r, rl;
3463  int cur, l;
3464  int next, nl;
3466 
3467  /*
3468  * Check that there is a comment right here.
3469  */
3470  if ((RAW != '<') || (NXT(1) != '!') ||
3471  (NXT(2) != '-') || (NXT(3) != '-')) return;
3472 
3473  state = ctxt->instate;
3474  ctxt->instate = XML_PARSER_COMMENT;
3475  SHRINK;
3476  SKIP(4);
3477  buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3478  if (buf == NULL) {
3479  htmlErrMemory(ctxt, "buffer allocation failed\n");
3480  ctxt->instate = state;
3481  return;
3482  }
3483  len = 0;
3484  buf[len] = 0;
3485  q = CUR_CHAR(ql);
3486  if (q == 0)
3487  goto unfinished;
3488  NEXTL(ql);
3489  r = CUR_CHAR(rl);
3490  if (r == 0)
3491  goto unfinished;
3492  NEXTL(rl);
3493  cur = CUR_CHAR(l);
3494  while ((cur != 0) &&
3495  ((cur != '>') ||
3496  (r != '-') || (q != '-'))) {
3497  NEXTL(l);
3498  next = CUR_CHAR(nl);
3499  if (next == 0) {
3500  SHRINK;
3501  GROW;
3502  next = CUR_CHAR(nl);
3503  }
3504 
3505  if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3506  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3507  "Comment incorrectly closed by '--!>'", NULL, NULL);
3508  cur = '>';
3509  break;
3510  }
3511 
3512  if (len + 5 >= size) {
3513  xmlChar *tmp;
3514 
3515  size *= 2;
3516  tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3517  if (tmp == NULL) {
3518  xmlFree(buf);
3519  htmlErrMemory(ctxt, "growing buffer failed\n");
3520  ctxt->instate = state;
3521  return;
3522  }
3523  buf = tmp;
3524  }
3525  if (IS_CHAR(q)) {
3526  COPY_BUF(ql,buf,len,q);
3527  } else {
3528  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3529  "Invalid char in comment 0x%X\n", q);
3530  }
3531 
3532  q = r;
3533  ql = rl;
3534  r = cur;
3535  rl = l;
3536  cur = next;
3537  l = nl;
3538  }
3539  buf[len] = 0;
3540  if (cur == '>') {
3541  NEXT;
3542  if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3543  (!ctxt->disableSAX))
3544  ctxt->sax->comment(ctxt->userData, buf);
3545  xmlFree(buf);
3546  ctxt->instate = state;
3547  return;
3548  }
3549 
3550 unfinished:
3551  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3552  "Comment not terminated \n<!--%.50s\n", buf, NULL);
3553  xmlFree(buf);
3554 }
3555 
3567 int
3568 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3569  int val = 0;
3570 
3571  if ((ctxt == NULL) || (ctxt->input == NULL)) {
3572  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3573  "htmlParseCharRef: context error\n",
3574  NULL, NULL);
3575  return(0);
3576  }
3577  if ((CUR == '&') && (NXT(1) == '#') &&
3578  ((NXT(2) == 'x') || NXT(2) == 'X')) {
3579  SKIP(3);
3580  while (CUR != ';') {
3581  if ((CUR >= '0') && (CUR <= '9')) {
3582  if (val < 0x110000)
3583  val = val * 16 + (CUR - '0');
3584  } else if ((CUR >= 'a') && (CUR <= 'f')) {
3585  if (val < 0x110000)
3586  val = val * 16 + (CUR - 'a') + 10;
3587  } else if ((CUR >= 'A') && (CUR <= 'F')) {
3588  if (val < 0x110000)
3589  val = val * 16 + (CUR - 'A') + 10;
3590  } else {
3591  htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3592  "htmlParseCharRef: missing semicolon\n",
3593  NULL, NULL);
3594  break;
3595  }
3596  NEXT;
3597  }
3598  if (CUR == ';')
3599  NEXT;
3600  } else if ((CUR == '&') && (NXT(1) == '#')) {
3601  SKIP(2);
3602  while (CUR != ';') {
3603  if ((CUR >= '0') && (CUR <= '9')) {
3604  if (val < 0x110000)
3605  val = val * 10 + (CUR - '0');
3606  } else {
3607  htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3608  "htmlParseCharRef: missing semicolon\n",
3609  NULL, NULL);
3610  break;
3611  }
3612  NEXT;
3613  }
3614  if (CUR == ';')
3615  NEXT;
3616  } else {
3617  htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3618  "htmlParseCharRef: invalid value\n", NULL, NULL);
3619  }
3620  /*
3621  * Check the value IS_CHAR ...
3622  */
3623  if (IS_CHAR(val)) {
3624  return(val);
3625  } else if (val >= 0x110000) {
3626  htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3627  "htmlParseCharRef: value too large\n", NULL, NULL);
3628  } else {
3629  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3630  "htmlParseCharRef: invalid xmlChar value %d\n",
3631  val);
3632  }
3633  return(0);
3634 }
3635 
3636 
3647 static void
3648 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3649  const xmlChar *name;
3650  xmlChar *ExternalID = NULL;
3651  xmlChar *URI = NULL;
3652 
3653  /*
3654  * We know that '<!DOCTYPE' has been detected.
3655  */
3656  SKIP(9);
3657 
3658  SKIP_BLANKS;
3659 
3660  /*
3661  * Parse the DOCTYPE name.
3662  */
3663  name = htmlParseName(ctxt);
3664  if (name == NULL) {
3665  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3666  "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3667  NULL, NULL);
3668  }
3669  /*
3670  * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3671  */
3672 
3673  SKIP_BLANKS;
3674 
3675  /*
3676  * Check for SystemID and ExternalID
3677  */
3678  URI = htmlParseExternalID(ctxt, &ExternalID);
3679  SKIP_BLANKS;
3680 
3681  /*
3682  * We should be at the end of the DOCTYPE declaration.
3683  */
3684  if (CUR != '>') {
3685  htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3686  "DOCTYPE improperly terminated\n", NULL, NULL);
3687  /* Ignore bogus content */
3688  while ((CUR != 0) && (CUR != '>'))
3689  NEXT;
3690  }
3691  if (CUR == '>')
3692  NEXT;
3693 
3694  /*
3695  * Create or update the document accordingly to the DOCTYPE
3696  */
3697  if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3698  (!ctxt->disableSAX))
3699  ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3700 
3701  /*
3702  * Cleanup, since we don't use all those identifiers
3703  */
3704  if (URI != NULL) xmlFree(URI);
3705  if (ExternalID != NULL) xmlFree(ExternalID);
3706 }
3707 
3729 static const xmlChar *
3730 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3731  const xmlChar *name;
3732  xmlChar *val = NULL;
3733 
3734  *value = NULL;
3735  name = htmlParseHTMLName(ctxt);
3736  if (name == NULL) {
3737  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3738  "error parsing attribute name\n", NULL, NULL);
3739  return(NULL);
3740  }
3741 
3742  /*
3743  * read the value
3744  */
3745  SKIP_BLANKS;
3746  if (CUR == '=') {
3747  NEXT;
3748  SKIP_BLANKS;
3749  val = htmlParseAttValue(ctxt);
3750  }
3751 
3752  *value = val;
3753  return(name);
3754 }
3755 
3766 static void
3767 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3768 
3769  if ((ctxt == NULL) || (encoding == NULL) ||
3770  (ctxt->options & HTML_PARSE_IGNORE_ENC))
3771  return;
3772 
3773  /* do not change encoding */
3774  if (ctxt->input->encoding != NULL)
3775  return;
3776 
3777  if (encoding != NULL) {
3778  xmlCharEncoding enc;
3780 
3781  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3782 
3783  if (ctxt->input->encoding != NULL)
3784  xmlFree((xmlChar *) ctxt->input->encoding);
3785  ctxt->input->encoding = xmlStrdup(encoding);
3786 
3787  enc = xmlParseCharEncoding((const char *) encoding);
3788  /*
3789  * registered set of known encodings
3790  */
3791  if (enc != XML_CHAR_ENCODING_ERROR) {
3792  if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3793  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3794  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3795  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3796  (ctxt->input->buf != NULL) &&
3797  (ctxt->input->buf->encoder == NULL)) {
3798  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3799  "htmlCheckEncoding: wrong encoding meta\n",
3800  NULL, NULL);
3801  } else {
3802  xmlSwitchEncoding(ctxt, enc);
3803  }
3804  ctxt->charset = XML_CHAR_ENCODING_UTF8;
3805  } else {
3806  /*
3807  * fallback for unknown encodings
3808  */
3809  handler = xmlFindCharEncodingHandler((const char *) encoding);
3810  if (handler != NULL) {
3812  ctxt->charset = XML_CHAR_ENCODING_UTF8;
3813  } else {
3814  htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3815  "htmlCheckEncoding: unknown encoding %s\n",
3816  encoding, NULL);
3817  }
3818  }
3819 
3820  if ((ctxt->input->buf != NULL) &&
3821  (ctxt->input->buf->encoder != NULL) &&
3822  (ctxt->input->buf->raw != NULL) &&
3823  (ctxt->input->buf->buffer != NULL)) {
3824  int nbchars;
3825  int processed;
3826 
3827  /*
3828  * convert as much as possible to the parser reading buffer.
3829  */
3830  processed = ctxt->input->cur - ctxt->input->base;
3831  xmlBufShrink(ctxt->input->buf->buffer, processed);
3832  nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3833  xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3834  if (nbchars < 0) {
3835  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3836  "htmlCheckEncoding: encoder error\n",
3837  NULL, NULL);
3838  }
3839  }
3840  }
3841 }
3842 
3853 static void
3854 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3855  const xmlChar *encoding;
3856 
3857  if (!attvalue)
3858  return;
3859 
3860  encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3861  if (encoding != NULL) {
3862  encoding += 7;
3863  }
3864  /*
3865  * skip blank
3866  */
3867  if (encoding && IS_BLANK_CH(*encoding))
3868  encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3869  if (encoding && *encoding == '=') {
3870  encoding ++;
3871  htmlCheckEncodingDirect(ctxt, encoding);
3872  }
3873 }
3874 
3882 static void
3883 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3884  int i;
3885  const xmlChar *att, *value;
3886  int http = 0;
3887  const xmlChar *content = NULL;
3888 
3889  if ((ctxt == NULL) || (atts == NULL))
3890  return;
3891 
3892  i = 0;
3893  att = atts[i++];
3894  while (att != NULL) {
3895  value = atts[i++];
3896  if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3897  && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3898  http = 1;
3899  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3900  htmlCheckEncodingDirect(ctxt, value);
3901  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3902  content = value;
3903  att = atts[i++];
3904  }
3905  if ((http) && (content != NULL))
3906  htmlCheckEncoding(ctxt, content);
3907 
3908 }
3909 
/*
 * htmlParseStartTag:
 * @ctxt:  an HTML parser context
 *
 * Parse a start tag: the element name followed by its attributes.  The
 * leading '<' is consumed here; the closing '>' or "/>" is left in the
 * input.
 *
 * Unless the tag is discarded, the element name is pushed with
 * htmlnamePush() and the SAX startElement callback is invoked with the
 * collected attributes (NULL when there are none).  A tag is discarded
 * when it is an <html> inside any open element, a <head> while the
 * number of open elements is not 1, or a <body> while another body is
 * already open; ctxt->depth is incremented for each discarded tag.
 *
 * Returns -1 if the context is unusable, the parser is at EOF, the input
 * is not at '<' or no element name could be parsed; otherwise the value
 * of discardtag (1 if the tag was discarded, 0 if not).
 */
3930 static int
3931 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3932  const xmlChar *name;
3933  const xmlChar *attname;
3934  xmlChar *attvalue;
3935  const xmlChar **atts;
3936  int nbatts = 0;
3937  int maxatts;
3938  int meta = 0;
3939  int i;
3940  int discardtag = 0;
3941 
3942  if ((ctxt == NULL) || (ctxt->input == NULL)) {
3943  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3944  "htmlParseStartTag: context error\n", NULL, NULL);
3945  return -1;
3946  }
3947  if (ctxt->instate == XML_PARSER_EOF)
3948  return(-1);
3949  if (CUR != '<') return -1;
3950  NEXT;
3951 
/* The attribute array is kept on the context and reused from call to call. */
3952  atts = ctxt->atts;
3953  maxatts = ctxt->maxatts;
3954 
3955  GROW;
3956  name = htmlParseHTMLName(ctxt);
3957  if (name == NULL) {
3958  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3959  "htmlParseStartTag: invalid element name\n",
3960  NULL, NULL);
3961  /* if recover preserve text on classic misconstructs */
/* The '<' consumed above is passed as read-ahead so that it is emitted as text. */
3962  if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3963  (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3964  htmlParseCharDataInternal(ctxt, '<');
3965  return(-1);
3966  }
3967 
3968 
3969  /* Dump the bogus tag like browsers do */
3970  while ((CUR != 0) && (CUR != '>') &&
3971  (ctxt->instate != XML_PARSER_EOF))
3972  NEXT;
3973  return -1;
3974  }
3975  if (xmlStrEqual(name, BAD_CAST"meta"))
3976  meta = 1;
3977 
3978  /*
3979  * Check for auto-closure of HTML elements.
3980  */
3981  htmlAutoClose(ctxt, name);
3982 
3983  /*
3984  * Check for implied HTML elements.
3985  */
3986  htmlCheckImplied(ctxt, name);
3987 
3988  /*
3989  * Avoid html at any level > 0, head at any level != 1
3990  * or any attempt to recurse body
3991  */
3992  if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3993  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3994  "htmlParseStartTag: misplaced <html> tag\n",
3995  name, NULL);
3996  discardtag = 1;
3997  ctxt->depth++;
3998  }
3999  if ((ctxt->nameNr != 1) &&
4000  (xmlStrEqual(name, BAD_CAST"head"))) {
4001  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4002  "htmlParseStartTag: misplaced <head> tag\n",
4003  name, NULL);
4004  discardtag = 1;
4005  ctxt->depth++;
4006  }
4007  if (xmlStrEqual(name, BAD_CAST"body")) {
4008  int indx;
/* The scan does not stop at the first hit: depth grows once per open body. */
4009  for (indx = 0;indx < ctxt->nameNr;indx++) {
4010  if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4011  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4012  "htmlParseStartTag: misplaced <body> tag\n",
4013  name, NULL);
4014  discardtag = 1;
4015  ctxt->depth++;
4016  }
4017  }
4018  }
4019 
4020  /*
4021  * Now parse the attributes, it ends up with the ending
4022  *
4023  * (S Attribute)* S?
4024  */
4025  SKIP_BLANKS;
4026  while ((CUR != 0) &&
4027  (CUR != '>') &&
4028  ((CUR != '/') || (NXT(1) != '>'))) {
4029  GROW;
4030  attname = htmlParseAttribute(ctxt, &attvalue);
4031  if (attname != NULL) {
4032 
4033  /*
4034  * Well formedness requires at most one declaration of an attribute
4035  */
/* Names are stored at even indices, hence the step of 2; a duplicate is dropped. */
4036  for (i = 0; i < nbatts;i += 2) {
4037  if (xmlStrEqual(atts[i], attname)) {
4038  htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4039  "Attribute %s redefined\n", attname, NULL);
4040  if (attvalue != NULL)
4041  xmlFree(attvalue);
4042  goto failed;
4043  }
4044  }
4045 
4046  /*
4047  * Add the pair to atts
4048  */
4049  if (atts == NULL) {
4050  maxatts = 22; /* allow for 10 attrs by default */
4051  atts = (const xmlChar **)
4052  xmlMalloc(maxatts * sizeof(xmlChar *));
4053  if (atts == NULL) {
4054  htmlErrMemory(ctxt, NULL);
4055  if (attvalue != NULL)
4056  xmlFree(attvalue);
4057  goto failed;
4058  }
4059  ctxt->atts = atts;
4060  ctxt->maxatts = maxatts;
/* Room is needed for the new name/value pair plus the two NULL terminators. */
4061  } else if (nbatts + 4 > maxatts) {
4062  const xmlChar **n;
4063 
4064  maxatts *= 2;
4065  n = (const xmlChar **) xmlRealloc((void *) atts,
4066  maxatts * sizeof(const xmlChar *));
4067  if (n == NULL) {
4068  htmlErrMemory(ctxt, NULL);
4069  if (attvalue != NULL)
4070  xmlFree(attvalue);
4071  goto failed;
4072  }
4073  atts = n;
4074  ctxt->atts = atts;
4075  ctxt->maxatts = maxatts;
4076  }
4077  atts[nbatts++] = attname;
4078  atts[nbatts++] = attvalue;
4079  atts[nbatts] = NULL;
4080  atts[nbatts + 1] = NULL;
4081  }
4082  else {
4083  if (attvalue != NULL)
4084  xmlFree(attvalue);
4085  /* Dump the bogus attribute string up to the next blank or
4086  * the end of the tag. */
4087  while ((CUR != 0) &&
4088  !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4089  ((CUR != '/') || (NXT(1) != '>')))
4090  NEXT;
4091  }
4092 
/* Reached normally and by goto: this attribute is done, go on with the next one. */
4093 failed:
4094  SKIP_BLANKS;
4095  }
4096 
4097  /*
4098  * Handle specific association to the META tag
4099  */
4100  if (meta && (nbatts != 0))
4101  htmlCheckMeta(ctxt, atts);
4102 
4103  /*
4104  * SAX: Start of Element !
4105  */
4106  if (!discardtag) {
4107  htmlnamePush(ctxt, name);
4108  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4109  if (nbatts != 0)
4110  ctxt->sax->startElement(ctxt->userData, name, atts);
4111  else
4112  ctxt->sax->startElement(ctxt->userData, name, NULL);
4113  }
4114  }
4115 
/* Only the values (odd indices) are freed here; the names are left alone. */
4116  if (atts != NULL) {
4117  for (i = 1;i < nbatts;i += 2) {
4118  if (atts[i] != NULL)
4119  xmlFree((xmlChar *) atts[i]);
4120  }
4121  }
4122 
4123  return(discardtag);
4124 }
4125 
4141 static int
4142 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4143 {
4144  const xmlChar *name;
4145  const xmlChar *oldname;
4146  int i, ret;
4147 
4148  if ((CUR != '<') || (NXT(1) != '/')) {
4149  htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4150  "htmlParseEndTag: '</' not found\n", NULL, NULL);
4151  return (0);
4152  }
4153  SKIP(2);
4154 
4155  name = htmlParseHTMLName(ctxt);
4156  if (name == NULL)
4157  return (0);
4158  /*
4159  * We should definitely be at the ending "S? '>'" part
4160  */
4161  SKIP_BLANKS;
4162  if (CUR != '>') {
4163  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4164  "End tag : expected '>'\n", NULL, NULL);
4165  /* Skip to next '>' */
4166  while ((CUR != 0) && (CUR != '>'))
4167  NEXT;
4168  }
4169  if (CUR == '>')
4170  NEXT;
4171 
4172  /*
4173  * if we ignored misplaced tags in htmlParseStartTag don't pop them
4174  * out now.
4175  */
4176  if ((ctxt->depth > 0) &&
4177  (xmlStrEqual(name, BAD_CAST "html") ||
4178  xmlStrEqual(name, BAD_CAST "body") ||
4179  xmlStrEqual(name, BAD_CAST "head"))) {
4180  ctxt->depth--;
4181  return (0);
4182  }
4183 
4184  /*
4185  * If the name read is not one of the element in the parsing stack
4186  * then return, it's just an error.
4187  */
4188  for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4189  if (xmlStrEqual(name, ctxt->nameTab[i]))
4190  break;
4191  }
4192  if (i < 0) {
4193  htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4194  "Unexpected end tag : %s\n", name, NULL);
4195  return (0);
4196  }
4197 
4198 
4199  /*
4200  * Check for auto-closure of HTML elements.
4201  */
4202 
4203  htmlAutoCloseOnClose(ctxt, name);
4204 
4205  /*
4206  * Well formedness constraints, opening and closing must match.
4207  * With the exception that the autoclose may have popped stuff out
4208  * of the stack.
4209  */
4210  if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4211  htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4212  "Opening and ending tag mismatch: %s and %s\n",
4213  name, ctxt->name);
4214  }
4215 
4216  /*
4217  * SAX: End of Tag
4218  */
4219  oldname = ctxt->name;
4220  if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4221  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4222  ctxt->sax->endElement(ctxt->userData, name);
4223  htmlNodeInfoPop(ctxt);
4224  htmlnamePop(ctxt);
4225  ret = 1;
4226  } else {
4227  ret = 0;
4228  }
4229 
4230  return (ret);
4231 }
4232 
4233 
4242 static void
4243 htmlParseReference(htmlParserCtxtPtr ctxt) {
4244  const htmlEntityDesc * ent;
4245  xmlChar out[6];
4246  const xmlChar *name;
4247  if (CUR != '&') return;
4248 
4249  if (NXT(1) == '#') {
4250  unsigned int c;
4251  int bits, i = 0;
4252 
4253  c = htmlParseCharRef(ctxt);
4254  if (c == 0)
4255  return;
4256 
4257  if (c < 0x80) { out[i++]= c; bits= -6; }
4258  else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4259  else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4260  else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4261 
4262  for ( ; bits >= 0; bits-= 6) {
4263  out[i++]= ((c >> bits) & 0x3F) | 0x80;
4264  }
4265  out[i] = 0;
4266 
4267  htmlCheckParagraph(ctxt);
4268  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4269  ctxt->sax->characters(ctxt->userData, out, i);
4270  } else {
4271  ent = htmlParseEntityRef(ctxt, &name);
4272  if (name == NULL) {
4273  htmlCheckParagraph(ctxt);
4274  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4275  ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4276  return;
4277  }
4278  if ((ent == NULL) || !(ent->value > 0)) {
4279  htmlCheckParagraph(ctxt);
4280  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4281  ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4282  ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4283  /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4284  }
4285  } else {
4286  unsigned int c;
4287  int bits, i = 0;
4288 
4289  c = ent->value;
4290  if (c < 0x80)
4291  { out[i++]= c; bits= -6; }
4292  else if (c < 0x800)
4293  { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4294  else if (c < 0x10000)
4295  { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4296  else
4297  { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4298 
4299  for ( ; bits >= 0; bits-= 6) {
4300  out[i++]= ((c >> bits) & 0x3F) | 0x80;
4301  }
4302  out[i] = 0;
4303 
4304  htmlCheckParagraph(ctxt);
4305  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4306  ctxt->sax->characters(ctxt->userData, out, i);
4307  }
4308  }
4309 }
4310 
/*
 * htmlParseContent:
 * @ctxt:  an HTML parser context
 *
 * Parse the content of the element that is current on entry.  The loop
 * dispatches on the upcoming input: end tags, a misplaced DOCTYPE,
 * comments, processing instructions, sub-elements, references and
 * character data.  Content of script and style elements is handed to
 * htmlParseScript() instead.
 *
 * The function returns when an end tag pops an element (provided an
 * element name was recorded on entry or the stack became empty), when
 * the element recorded on entry is no longer the current one, when a
 * start tag name cannot be read, at the end of input, or when the parser
 * reaches the EOF state.
 */
4319 static void
4320 htmlParseContent(htmlParserCtxtPtr ctxt) {
4321  xmlChar *currentNode;
4322  int depth;
4323  const xmlChar *name;
4324 
/* Private copy of the entry element's name and the stack depth at entry; */
/* both are compared against the live context further down. */
4325  currentNode = xmlStrdup(ctxt->name);
4326  depth = ctxt->nameNr;
4327  while (1) {
4328  GROW;
4329 
4330  if (ctxt->instate == XML_PARSER_EOF)
4331  break;
4332 
4333  /*
4334  * Our tag or one of it's parent or children is ending.
4335  */
4336  if ((CUR == '<') && (NXT(1) == '/')) {
4337  if (htmlParseEndTag(ctxt) &&
4338  ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4339  if (currentNode != NULL)
4340  xmlFree(currentNode);
4341  return;
4342  }
4343  continue; /* while */
4344  }
4345 
/* A start tag is coming: read its name and auto-close the current element */
/* first if htmlCheckAutoClose() says so, then re-evaluate from the top. */
/* NOTE(review): presumably htmlParseHTMLName_nonInvasive() leaves the input */
/* position untouched - confirm against its definition. */
4346  else if ((CUR == '<') &&
4347  ((IS_ASCII_LETTER(NXT(1))) ||
4348  (NXT(1) == '_') || (NXT(1) == ':'))) {
4349  name = htmlParseHTMLName_nonInvasive(ctxt);
4350  if (name == NULL) {
4351  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4352  "htmlParseStartTag: invalid element name\n",
4353  NULL, NULL);
4354  /* Dump the bogus tag like browsers do */
4355  while ((CUR != 0) && (CUR != '>'))
4356  NEXT;
4357 
4358  if (currentNode != NULL)
4359  xmlFree(currentNode);
4360  return;
4361  }
4362 
4363  if (ctxt->name != NULL) {
4364  if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4365  htmlAutoClose(ctxt, name);
4366  continue;
4367  }
4368  }
4369  }
4370 
4371  /*
4372  * Has this node been popped out during parsing of
4373  * the next element
4374  */
4375  if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4376  (!xmlStrEqual(currentNode, ctxt->name)))
4377  {
4378  if (currentNode != NULL) xmlFree(currentNode);
4379  return;
4380  }
4381 
4382  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4383  (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4384  /*
4385  * Handle SCRIPT/STYLE separately
4386  */
4387  htmlParseScript(ctxt);
4388  } else {
4389  /*
4390  * Sometimes DOCTYPE arrives in the middle of the document
4391  */
4392  if ((CUR == '<') && (NXT(1) == '!') &&
4393  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4394  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4395  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4396  (UPP(8) == 'E')) {
4397  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4398  "Misplaced DOCTYPE declaration\n",
4399  BAD_CAST "DOCTYPE" , NULL);
4400  htmlParseDocTypeDecl(ctxt);
4401  }
4402 
/* This chain is a separate "if" from the DOCTYPE test above, so whatever */
/* follows a misplaced DOCTYPE is dispatched in the same iteration. */
4403  /*
4404  * First case : a comment
4405  */
4406  if ((CUR == '<') && (NXT(1) == '!') &&
4407  (NXT(2) == '-') && (NXT(3) == '-')) {
4408  htmlParseComment(ctxt);
4409  }
4410 
4411  /*
4412  * Second case : a Processing Instruction.
4413  */
4414  else if ((CUR == '<') && (NXT(1) == '?')) {
4415  htmlParsePI(ctxt);
4416  }
4417 
4418  /*
4419  * Third case : a sub-element.
4420  */
4421  else if (CUR == '<') {
4422  htmlParseElement(ctxt);
4423  }
4424 
4425  /*
4426  * Fourth case : a reference. If if has not been resolved,
4427  * parsing returns it's Name, create the node
4428  */
4429  else if (CUR == '&') {
4430  htmlParseReference(ctxt);
4431  }
4432 
4433  /*
4434  * Fifth case : end of the resource
4435  */
4436  else if (CUR == 0) {
4437  htmlAutoCloseOnEnd(ctxt);
4438  break;
4439  }
4440 
4441  /*
4442  * Last case, text. Note that References are handled directly.
4443  */
4444  else {
4445  htmlParseCharData(ctxt);
4446  }
4447  }
4448  GROW;
4449  }
4450  if (currentNode != NULL) xmlFree(currentNode);
4451 }
4452 
4465 void
4466 htmlParseElement(htmlParserCtxtPtr ctxt) {
4467  const xmlChar *name;
4468  xmlChar *currentNode = NULL;
4469  const htmlElemDesc * info;
4470  htmlParserNodeInfo node_info;
4471  int failed;
4472  int depth;
4473  const xmlChar *oldptr;
4474 
4475  if ((ctxt == NULL) || (ctxt->input == NULL)) {
4476  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4477  "htmlParseElement: context error\n", NULL, NULL);
4478  return;
4479  }
4480 
4481  if (ctxt->instate == XML_PARSER_EOF)
4482  return;
4483 
4484  /* Capture start position */
4485  if (ctxt->record_info) {
4486  node_info.begin_pos = ctxt->input->consumed +
4487  (CUR_PTR - ctxt->input->base);
4488  node_info.begin_line = ctxt->input->line;
4489  }
4490 
4491  failed = htmlParseStartTag(ctxt);
4492  name = ctxt->name;
4493  if ((failed == -1) || (name == NULL)) {
4494  if (CUR == '>')
4495  NEXT;
4496  return;
4497  }
4498 
4499  /*
4500  * Lookup the info for that element.
4501  */
4502  info = htmlTagLookup(name);
4503  if (info == NULL) {
4504  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4505  "Tag %s invalid\n", name, NULL);
4506  }
4507 
4508  /*
4509  * Check for an Empty Element labeled the XML/SGML way
4510  */
4511  if ((CUR == '/') && (NXT(1) == '>')) {
4512  SKIP(2);
4513  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4514  ctxt->sax->endElement(ctxt->userData, name);
4515  htmlnamePop(ctxt);
4516  return;
4517  }
4518 
4519  if (CUR == '>') {
4520  NEXT;
4521  } else {
4522  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4523  "Couldn't find end of Start Tag %s\n", name, NULL);
4524 
4525  /*
4526  * end of parsing of this node.
4527  */
4528  if (xmlStrEqual(name, ctxt->name)) {
4529  nodePop(ctxt);
4530  htmlnamePop(ctxt);
4531  }
4532 
4533  /*
4534  * Capture end position and add node
4535  */
4536  if (ctxt->record_info) {
4537  node_info.end_pos = ctxt->input->consumed +
4538  (CUR_PTR - ctxt->input->base);
4539  node_info.end_line = ctxt->input->line;
4540  node_info.node = ctxt->node;
4541  xmlParserAddNodeInfo(ctxt, &node_info);
4542  }
4543  return;
4544  }
4545 
4546  /*
4547  * Check for an Empty Element from DTD definition
4548  */
4549  if ((info != NULL) && (info->empty)) {
4550  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4551  ctxt->sax->endElement(ctxt->userData, name);
4552  htmlnamePop(ctxt);
4553  return;
4554  }
4555 
4556  /*
4557  * Parse the content of the element:
4558  */
4559  currentNode = xmlStrdup(ctxt->name);
4560  depth = ctxt->nameNr;
4561  while (CUR != 0) {
4562  oldptr = ctxt->input->cur;
4563  htmlParseContent(ctxt);
4564  if (oldptr==ctxt->input->cur) break;
4565  if (ctxt->nameNr < depth) break;
4566  }
4567 
4568  /*
4569  * Capture end position and add node
4570  */
4571  if ( currentNode != NULL && ctxt->record_info ) {
4572  node_info.end_pos = ctxt->input->consumed +
4573  (CUR_PTR - ctxt->input->base);
4574  node_info.end_line = ctxt->input->line;
4575  node_info.node = ctxt->node;
4576  xmlParserAddNodeInfo(ctxt, &node_info);
4577  }
4578  if (CUR == 0) {
4579  htmlAutoCloseOnEnd(ctxt);
4580  }
4581 
4582  if (currentNode != NULL)
4583  xmlFree(currentNode);
4584 }
4585 
4586 static void
4587 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4588  /*
4589  * Capture end position and add node
4590  */
4591  if ( ctxt->node != NULL && ctxt->record_info ) {
4592  ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4593  (CUR_PTR - ctxt->input->base);
4594  ctxt->nodeInfo->end_line = ctxt->input->line;
4595  ctxt->nodeInfo->node = ctxt->node;
4596  xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4597  htmlNodeInfoPop(ctxt);
4598  }
4599  if (CUR == 0) {
4600  htmlAutoCloseOnEnd(ctxt);
4601  }
4602 }
4603 
4615 static void
4616 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4617  const xmlChar *name;
4618  const htmlElemDesc * info;
4619  htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4620  int failed;
4621 
4622  if ((ctxt == NULL) || (ctxt->input == NULL)) {
4623  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4624  "htmlParseElementInternal: context error\n", NULL, NULL);
4625  return;
4626  }
4627 
4628  if (ctxt->instate == XML_PARSER_EOF)
4629  return;
4630 
4631  /* Capture start position */
4632  if (ctxt->record_info) {
4633  node_info.begin_pos = ctxt->input->consumed +
4634  (CUR_PTR - ctxt->input->base);
4635  node_info.begin_line = ctxt->input->line;
4636  }
4637 
4638  failed = htmlParseStartTag(ctxt);
4639  name = ctxt->name;
4640  if ((failed == -1) || (name == NULL)) {
4641  if (CUR == '>')
4642  NEXT;
4643  return;
4644  }
4645 
4646  /*
4647  * Lookup the info for that element.
4648  */
4649  info = htmlTagLookup(name);
4650  if (info == NULL) {
4651  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4652  "Tag %s invalid\n", name, NULL);
4653  }
4654 
4655  /*
4656  * Check for an Empty Element labeled the XML/SGML way
4657  */
4658  if ((CUR == '/') && (NXT(1) == '>')) {
4659  SKIP(2);
4660  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4661  ctxt->sax->endElement(ctxt->userData, name);
4662  htmlnamePop(ctxt);
4663  return;
4664  }
4665 
4666  if (CUR == '>') {
4667  NEXT;
4668  } else {
4669  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4670  "Couldn't find end of Start Tag %s\n", name, NULL);
4671 
4672  /*
4673  * end of parsing of this node.
4674  */
4675  if (xmlStrEqual(name, ctxt->name)) {
4676  nodePop(ctxt);
4677  htmlnamePop(ctxt);
4678  }
4679 
4680  if (ctxt->record_info)
4681  htmlNodeInfoPush(ctxt, &node_info);
4682  htmlParserFinishElementParsing(ctxt);
4683  return;
4684  }
4685 
4686  /*
4687  * Check for an Empty Element from DTD definition
4688  */
4689  if ((info != NULL) && (info->empty)) {
4690  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4691  ctxt->sax->endElement(ctxt->userData, name);
4692  htmlnamePop(ctxt);
4693  return;
4694  }
4695 
4696  if (ctxt->record_info)
4697  htmlNodeInfoPush(ctxt, &node_info);
4698 }
4699 
4708 static void
4709 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4710  xmlChar *currentNode;
4711  int depth;
4712  const xmlChar *name;
4713 
4714  currentNode = xmlStrdup(ctxt->name);
4715  depth = ctxt->nameNr;
4716  while (1) {
4717  GROW;
4718 
4719  if (ctxt->instate == XML_PARSER_EOF)
4720  break;
4721 
4722  /*
4723  * Our tag or one of it's parent or children is ending.
4724  */
4725  if ((CUR == '<') && (NXT(1) == '/')) {
4726  if (htmlParseEndTag(ctxt) &&
4727  ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4728  if (currentNode != NULL)
4729  xmlFree(currentNode);
4730 
4731  currentNode = xmlStrdup(ctxt->name);
4732  depth = ctxt->nameNr;
4733  }
4734  continue; /* while */
4735  }
4736 
4737  else if ((CUR == '<') &&
4738  ((IS_ASCII_LETTER(NXT(1))) ||
4739  (NXT(1) == '_') || (NXT(1) == ':'))) {
4740  name = htmlParseHTMLName_nonInvasive(ctxt);
4741  if (name == NULL) {
4742  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4743  "htmlParseStartTag: invalid element name\n",
4744  NULL, NULL);
4745  /* Dump the bogus tag like browsers do */
4746  while ((CUR == 0) && (CUR != '>'))
4747  NEXT;
4748 
4749  htmlParserFinishElementParsing(ctxt);
4750  if (currentNode != NULL)
4751  xmlFree(currentNode);
4752 
4753  currentNode = xmlStrdup(ctxt->name);
4754  depth = ctxt->nameNr;
4755  continue;
4756  }
4757 
4758  if (ctxt->name != NULL) {
4759  if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4760  htmlAutoClose(ctxt, name);
4761  continue;
4762  }
4763  }
4764  }
4765 
4766  /*
4767  * Has this node been popped out during parsing of
4768  * the next element
4769  */
4770  if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4771  (!xmlStrEqual(currentNode, ctxt->name)))
4772  {
4773  htmlParserFinishElementParsing(ctxt);
4774  if (currentNode != NULL) xmlFree(currentNode);
4775 
4776  currentNode = xmlStrdup(ctxt->name);
4777  depth = ctxt->nameNr;
4778  continue;
4779  }
4780 
4781  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4782  (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4783  /*
4784  * Handle SCRIPT/STYLE separately
4785  */
4786  htmlParseScript(ctxt);
4787  } else {
4788  /*
4789  * Sometimes DOCTYPE arrives in the middle of the document
4790  */
4791  if ((CUR == '<') && (NXT(1) == '!') &&
4792  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4793  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4794  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4795  (UPP(8) == 'E')) {
4796  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4797  "Misplaced DOCTYPE declaration\n",
4798  BAD_CAST "DOCTYPE" , NULL);
4799  htmlParseDocTypeDecl(ctxt);
4800  }
4801 
4802  /*
4803  * First case : a comment
4804  */
4805  if ((CUR == '<') && (NXT(1) == '!') &&
4806  (NXT(2) == '-') && (NXT(3) == '-')) {
4807  htmlParseComment(ctxt);
4808  }
4809 
4810  /*
4811  * Second case : a Processing Instruction.
4812  */
4813  else if ((CUR == '<') && (NXT(1) == '?')) {
4814  htmlParsePI(ctxt);
4815  }
4816 
4817  /*
4818  * Third case : a sub-element.
4819  */
4820  else if (CUR == '<') {
4821  htmlParseElementInternal(ctxt);
4822  if (currentNode != NULL) xmlFree(currentNode);
4823 
4824  currentNode = xmlStrdup(ctxt->name);
4825  depth = ctxt->nameNr;
4826  }
4827 
4828  /*
4829  * Fourth case : a reference. If if has not been resolved,
4830  * parsing returns it's Name, create the node
4831  */
4832  else if (CUR == '&') {
4833  htmlParseReference(ctxt);
4834  }
4835 
4836  /*
4837  * Fifth case : end of the resource
4838  */
4839  else if (CUR == 0) {
4840  htmlAutoCloseOnEnd(ctxt);
4841  break;
4842  }
4843 
4844  /*
4845  * Last case, text. Note that References are handled directly.
4846  */
4847  else {
4848  htmlParseCharData(ctxt);
4849  }
4850  }
4851  GROW;
4852  }
4853  if (currentNode != NULL) xmlFree(currentNode);
4854 }
4855 
4864 void
4865 __htmlParseContent(void *ctxt) {
4866  if (ctxt != NULL)
4867  htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4868 }
4869 
4881 int
4882 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4883  xmlChar start[4];
4884  xmlCharEncoding enc;
4885  xmlDtdPtr dtd;
4886 
4887  xmlInitParser();
4888 
4889  htmlDefaultSAXHandlerInit();
4890 
4891  if ((ctxt == NULL) || (ctxt->input == NULL)) {
4892  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4893  "htmlParseDocument: context error\n", NULL, NULL);
4894  return(XML_ERR_INTERNAL_ERROR);
4895  }
4896  ctxt->html = 1;
4897  ctxt->linenumbers = 1;
4898  GROW;
4899  /*
4900  * SAX: beginning of the document processing.
4901  */
4902  if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4903  ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4904 
4905  if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4906  ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4907  /*
4908  * Get the 4 first bytes and decode the charset
4909  * if enc != XML_CHAR_ENCODING_NONE
4910  * plug some encoding conversion routines.
4911  */
4912  start[0] = RAW;
4913  start[1] = NXT(1);
4914  start[2] = NXT(2);
4915  start[3] = NXT(3);
4916  enc = xmlDetectCharEncoding(&start[0], 4);
4917  if (enc != XML_CHAR_ENCODING_NONE) {
4918  xmlSwitchEncoding(ctxt, enc);
4919  }
4920  }
4921 
4922  /*
4923  * Wipe out everything which is before the first '<'
4924  */
4925  SKIP_BLANKS;
4926  if (CUR == 0) {
4927  htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4928  "Document is empty\n", NULL, NULL);
4929  }
4930 
4931  if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4932  ctxt->sax->startDocument(ctxt->userData);
4933 
4934 
4935  /*
4936  * Parse possible comments and PIs before any content
4937  */
4938  while (((CUR == '<') && (NXT(1) == '!') &&
4939  (NXT(2) == '-') && (NXT(3) == '-')) ||
4940  ((CUR == '<') && (NXT(1) == '?'))) {
4941  htmlParseComment(ctxt);
4942  htmlParsePI(ctxt);
4943  SKIP_BLANKS;
4944  }
4945 
4946 
4947  /*
4948  * Then possibly doc type declaration(s) and more Misc
4949  * (doctypedecl Misc*)?
4950  */
4951  if ((CUR == '<') && (NXT(1) == '!') &&
4952  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4953  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4954  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4955  (UPP(8) == 'E')) {
4956  htmlParseDocTypeDecl(ctxt);
4957  }
4958  SKIP_BLANKS;
4959 
4960  /*
4961  * Parse possible comments and PIs before any content
4962  */
4963  while (((CUR == '<') && (NXT(1) == '!') &&
4964  (NXT(2) == '-') && (NXT(3) == '-')) ||
4965  ((CUR == '<') && (NXT(1) == '?'))) {
4966  htmlParseComment(ctxt);
4967  htmlParsePI(ctxt);
4968  SKIP_BLANKS;
4969  }
4970 
4971  /*
4972  * Time to start parsing the tree itself
4973  */
4974  htmlParseContentInternal(ctxt);
4975 
4976  /*
4977  * autoclose
4978  */
4979  if (CUR == 0)
4980  htmlAutoCloseOnEnd(ctxt);
4981 
4982 
4983  /*
4984  * SAX: end of the document processing.
4985  */
4986  if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4987  ctxt->sax->endDocument(ctxt->userData);
4988 
4989  if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4990  dtd = xmlGetIntSubset(ctxt->myDoc);
4991  if (dtd == NULL)
4992  ctxt->myDoc->intSubset =
4993  xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4994  BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4995  BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4996  }
4997  if (! ctxt->wellFormed) return(-1);
4998  return(0);
4999 }
5000 
5001 
5002 /************************************************************************
5003  * *
5004  * Parser contexts handling *
5005  * *
5006  ************************************************************************/
5007 
5017 static int
5018 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5019 {
5020  htmlSAXHandler *sax;
5021 
5022  if (ctxt == NULL) return(-1);
5023  memset(ctxt, 0, sizeof(htmlParserCtxt));
5024 
5025  ctxt->dict = xmlDictCreate();
5026  if (ctxt->dict == NULL) {
5027  htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5028  return(-1);
5029  }
5030  sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5031  if (sax == NULL) {
5032  htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5033  return(-1);
5034  }
5035  else
5036  memset(sax, 0, sizeof(htmlSAXHandler));
5037 
5038  /* Allocate the Input stack */
5039  ctxt->inputTab = (htmlParserInputPtr *)
5040  xmlMalloc(5 * sizeof(htmlParserInputPtr));
5041  if (ctxt->inputTab == NULL) {
5042  htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5043  ctxt->inputNr = 0;
5044  ctxt->inputMax = 0;
5045  ctxt->input = NULL;
5046  return(-1);
5047  }
5048  ctxt->inputNr = 0;
5049  ctxt->inputMax = 5;
5050  ctxt->input = NULL;
5051  ctxt->version = NULL;
5052  ctxt->encoding = NULL;
5053  ctxt->standalone = -1;
5054  ctxt->instate = XML_PARSER_START;
5055 
5056  /* Allocate the Node stack */
5057  ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5058  if (ctxt->nodeTab == NULL) {
5059  htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5060  ctxt->nodeNr = 0;
5061  ctxt->nodeMax = 0;
5062  ctxt->node = NULL;
5063  ctxt->inputNr = 0;
5064  ctxt->inputMax = 0;
5065  ctxt->input = NULL;
5066  return(-1);
5067  }
5068  ctxt->nodeNr = 0;
5069  ctxt->nodeMax = 10;
5070  ctxt->node = NULL;
5071 
5072  /* Allocate the Name stack */
5073  ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5074  if (ctxt->nameTab == NULL) {
5075  htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5076  ctxt->nameNr = 0;
5077  ctxt->nameMax = 0;
5078  ctxt->name = NULL;
5079  ctxt->nodeNr = 0;
5080  ctxt->nodeMax = 0;
5081  ctxt->node = NULL;
5082  ctxt->inputNr = 0;
5083  ctxt->inputMax = 0;
5084  ctxt->input = NULL;
5085  return(-1);
5086  }
5087  ctxt->nameNr = 0;
5088  ctxt->nameMax = 10;
5089  ctxt->name = NULL;
5090 
5091  ctxt->nodeInfoTab = NULL;
5092  ctxt->nodeInfoNr = 0;
5093  ctxt->nodeInfoMax = 0;
5094 
5095  if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5096  else {
5097  ctxt->sax = sax;
5098  memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5099  }
5100  ctxt->userData = ctxt;
5101  ctxt->myDoc = NULL;
5102  ctxt->wellFormed = 1;
5103  ctxt->replaceEntities = 0;
5104  ctxt->linenumbers = xmlLineNumbersDefaultValue;
5105  ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5106  ctxt->html = 1;
5107  ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
5108  ctxt->vctxt.userData = ctxt;
5109  ctxt->vctxt.error = xmlParserValidityError;
5110  ctxt->vctxt.warning = xmlParserValidityWarning;
5111  ctxt->record_info = 0;
5112  ctxt->validate = 0;
5113  ctxt->checkIndex = 0;
5114  ctxt->catalogs = NULL;
5115  xmlInitNodeInfoSeq(&ctxt->node_seq);
5116  return(0);
5117 }
5118 
5127 void
5128 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5129 {
5130  xmlFreeParserCtxt(ctxt);
5131 }
5132 
5141 htmlParserCtxtPtr
5142 htmlNewParserCtxt(void)
5143 {
5144  xmlParserCtxtPtr ctxt;
5145 
5146  ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5147  if (ctxt == NULL) {
5148  htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5149  return(NULL);
5150  }
5151  memset(ctxt, 0, sizeof(xmlParserCtxt));
5152  if (htmlInitParserCtxt(ctxt) < 0) {
5153  htmlFreeParserCtxt(ctxt);
5154  return(NULL);
5155  }
5156  return(ctxt);
5157 }
5158 
5168 htmlParserCtxtPtr
5169 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5170  xmlParserCtxtPtr ctxt;
5173 
5174  if (buffer == NULL)
5175  return(NULL);
5176  if (size <= 0)
5177  return(NULL);
5178 
5179  ctxt = htmlNewParserCtxt();
5180  if (ctxt == NULL)
5181  return(NULL);
5182 
5184  if (buf == NULL) return(NULL);
5185 
5186  input = xmlNewInputStream(ctxt);
5187  if (input == NULL) {
5188  xmlFreeParserCtxt(ctxt);
5189  return(NULL);
5190  }
5191 
5192  input->filename = NULL;
5193  input->buf = buf;
5194  xmlBufResetInput(buf->buffer, input);
5195 
5196  inputPush(ctxt, input);
5197  return(ctxt);
5198 }
5199 
5211 static htmlParserCtxtPtr
5212 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5213  int len;
5214  htmlParserCtxtPtr ctxt;
5215 
5216  if (cur == NULL)
5217  return(NULL);
5218  len = xmlStrlen(cur);
5219  ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5220  if (ctxt == NULL)
5221  return(NULL);
5222 
5223  if (encoding != NULL) {
5224  xmlCharEncoding enc;
5226 
5227  if (ctxt->input->encoding != NULL)
5228  xmlFree((xmlChar *) ctxt->input->encoding);
5229  ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5230 
5232  /*
5233  * registered set of known encodings
5234  */
5235  if (enc != XML_CHAR_ENCODING_ERROR) {
5236  xmlSwitchEncoding(ctxt, enc);
5237  if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5238  htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5239  "Unsupported encoding %s\n",
5240  (const xmlChar *) encoding, NULL);
5241  }
5242  } else {
5243  /*
5244  * fallback for unknown encodings
5245  */
5246  handler = xmlFindCharEncodingHandler((const char *) encoding);
5247  if (handler != NULL) {
5249  } else {
5250  htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5251  "Unsupported encoding %s\n",
5252  (const xmlChar *) encoding, NULL);
5253  }
5254  }
5255  }
5256  return(ctxt);
5257 }
5258 
5259 #ifdef LIBXML_PUSH_ENABLED
5260 /************************************************************************
5261  * *
5262  * Progressive parsing interfaces *
5263  * *
5264  ************************************************************************/
5265 
5284 static int
5285 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5286  xmlChar next, xmlChar third, int ignoreattrval)
5287 {
5288  int base, len;
5289  htmlParserInputPtr in;
5290  const xmlChar *buf;
5291  int invalue = 0;
5292  char valdellim = 0x0;
5293 
5294  in = ctxt->input;
5295  if (in == NULL)
5296  return (-1);
5297 
5298  base = in->cur - in->base;
5299  if (base < 0)
5300  return (-1);
5301 
5302  if (ctxt->checkIndex > base) {
5303  base = ctxt->checkIndex;
5304  /* Abuse hasPErefs member to restore current state. */
5305  invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5306  }
5307 
5308  if (in->buf == NULL) {
5309  buf = in->base;
5310  len = in->length;
5311  } else {
5312  buf = xmlBufContent(in->buf->buffer);
5313  len = xmlBufUse(in->buf->buffer);
5314  }
5315 
5316  /* take into account the sequence length */
5317  if (third)
5318  len -= 2;
5319  else if (next)
5320  len--;
5321  for (; base < len; base++) {
5322  if (ignoreattrval) {
5323  if (buf[base] == '"' || buf[base] == '\'') {
5324  if (invalue) {
5325  if (buf[base] == valdellim) {
5326  invalue = 0;
5327  continue;
5328  }
5329  } else {
5330  valdellim = buf[base];
5331  invalue = 1;
5332  continue;
5333  }
5334  } else if (invalue) {
5335  continue;
5336  }
5337  }
5338  if (buf[base] == first) {
5339  if (third != 0) {
5340  if ((buf[base + 1] != next) || (buf[base + 2] != third))
5341  continue;
5342  } else if (next != 0) {
5343  if (buf[base + 1] != next)
5344  continue;
5345  }
5346  ctxt->checkIndex = 0;
5347 #ifdef DEBUG_PUSH
5348  if (next == 0)
5350  "HPP: lookup '%c' found at %d\n",
5351  first, base);
5352  else if (third == 0)
5354  "HPP: lookup '%c%c' found at %d\n",
5355  first, next, base);
5356  else
5358  "HPP: lookup '%c%c%c' found at %d\n",
5359  first, next, third, base);
5360 #endif
5361  return (base - (in->cur - in->base));
5362  }
5363  }
5364  ctxt->checkIndex = base;
5365  /* Abuse hasPErefs member to track current state. */
5366  if (invalue)
5367  ctxt->hasPErefs |= 1;
5368  else
5369  ctxt->hasPErefs &= ~1;
5370 #ifdef DEBUG_PUSH
5371  if (next == 0)
5373  "HPP: lookup '%c' failed\n", first);
5374  else if (third == 0)
5376  "HPP: lookup '%c%c' failed\n", first, next);
5377  else
5379  "HPP: lookup '%c%c%c' failed\n", first, next,
5380  third);
5381 #endif
5382  return (-1);
5383 }
5384 
5399 static int
5400 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5401 {
5402  int mark = 0;
5403  int cur = CUR_PTR - BASE_PTR;
5404 
5405  while (mark >= 0) {
5406  mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5407  if ((mark < 0) ||
5408  (NXT(mark+2) == '>') ||
5409  ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5410  return mark;
5411  }
5412  ctxt->checkIndex = cur + mark + 1;
5413  }
5414  return mark;
5415 }
5416 
5417 
5427 static int
5428 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5429  int ret = 0;
5430  htmlParserInputPtr in;
5431  ptrdiff_t avail = 0;
5432  xmlChar cur, next;
5433 
5434  htmlParserNodeInfo node_info;
5435 
5436 #ifdef DEBUG_PUSH
5437  switch (ctxt->instate) {
5438  case XML_PARSER_EOF: