11#ifdef LIBXML_HTML_ENABLED
/*
 * Size constants for the HTML parser.
 * NOTE(review): the use sites are not visible in this part of the file;
 * presumably a cap on name length and two scratch-buffer sizes - confirm.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100
/*
 * File-local switch, initialised to 1. htmlCheckImplied() and
 * htmlCheckParagraph() both test it with `if (!htmlOmittedDefaultValue)`.
 */
static int htmlOmittedDefaultValue = 1;
43xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt,
int len,
45static void htmlParseComment(htmlParserCtxtPtr ctxt);
75 "Memory allocation failed : %s\n",
extra);
79 NULL,
NULL, 0, 0,
"Memory allocation failed\n");
96 if ((ctxt !=
NULL) && (ctxt->disableSAX != 0) &&
103 (
const char *) str1, (
const char *) str2,
107 ctxt->wellFormed = 0;
123 if ((ctxt !=
NULL) && (ctxt->disableSAX != 0) &&
132 ctxt->wellFormed = 0;
151htmlnamePush(htmlParserCtxtPtr ctxt,
const xmlChar *
value)
157 if (ctxt->nameNr >= ctxt->nameMax) {
159 ctxt->nameTab = (
const xmlChar * *)
162 sizeof(ctxt->nameTab[0]));
163 if (ctxt->nameTab ==
NULL) {
164 htmlErrMemory(ctxt,
NULL);
168 ctxt->nameTab[ctxt->nameNr] =
value;
170 return (ctxt->nameNr++);
181htmlnamePop(htmlParserCtxtPtr ctxt)
185 if (ctxt->nameNr <= 0)
188 if (ctxt->nameNr < 0)
190 if (ctxt->nameNr > 0)
191 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
194 ret = ctxt->nameTab[ctxt->nameNr];
195 ctxt->nameTab[ctxt->nameNr] =
NULL;
209htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *
value)
211 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
212 if (ctxt->nodeInfoMax == 0)
213 ctxt->nodeInfoMax = 5;
214 ctxt->nodeInfoMax *= 2;
215 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
216 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
218 sizeof(ctxt->nodeInfoTab[0]));
219 if (ctxt->nodeInfoTab ==
NULL) {
220 htmlErrMemory(ctxt,
NULL);
224 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *
value;
225 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
226 return (ctxt->nodeInfoNr++);
237static htmlParserNodeInfo *
238htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
240 if (ctxt->nodeInfoNr <= 0)
243 if (ctxt->nodeInfoNr < 0)
245 if (ctxt->nodeInfoNr > 0)
246 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
248 ctxt->nodeInfo =
NULL;
249 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
/*
 * Input-cursor helper macros. Every one of them expects a variable named
 * `ctxt` to be in scope and reads or updates ctxt->input.
 *
 *   UPPER / UPP(val)  upper-cased byte at the cursor / at offset val
 *   SKIP(val)         advance both the cursor and the column by val
 *   NXT(val)          byte at offset val from the cursor
 *   CUR_PTR/BASE_PTR  the cursor pointer / the buffer base pointer
 *   CUR, CURRENT      byte at the cursor, converted to int
 *   RAW               -1 while ctxt->token is non-zero, else the cursor byte
 */
#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->input->cur += (val), ctxt->input->col += (val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * SHRINK and GROW are wrapped in do { } while (0) so that each behaves as a
 * single statement: written as a bare `if`, an `else` following
 * `if (cond) SHRINK;` would silently attach to the macro's own `if`.
 */
#define SHRINK                                                               \
    do {                                                                     \
        if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&      \
            (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))         \
            xmlParserInputShrink(ctxt->input);                               \
    } while (0)

#define GROW                                                                 \
    do {                                                                     \
        if ((ctxt->progressive == 0) &&                                      \
            (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))             \
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                    \
    } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/*
 * NEXTL(l): step the cursor forward by l bytes.
 * A '\n' at the cursor bumps the line counter and resets the column to 1;
 * any other byte just increments the column. ctxt->token is cleared.
 * The body is a closed do { } while (0) so the macro is one statement and
 * does not run into the following #define.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l);                           \
  } while (0)
/* Current character via htmlCurrentChar(); its length is stored into l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Same, but reading from string s through xmlStringCurrentChar(). */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * COPY_BUF(l,b,i,v): append value v to buffer b at index i and advance i.
 * When l is 1 the value is stored as a single xmlChar; otherwise
 * xmlCopyChar() writes it and its return value is added to i.
 * Arguments are parenthesised and the if/else is enclosed in
 * do { } while (0) so the macro is safe inside an unbraced if/else.
 * Note that i is still evaluated more than once.
 */
#define COPY_BUF(l,b,i,v)                                               \
    do {                                                                \
        if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                       \
        else (i) += xmlCopyChar((l), &(b)[(i)], (v));                   \
    } while (0)
375 while (((*
cur >=
'A') && (*
cur <=
'Z')) ||
376 ((*
cur >=
'a') && (*
cur <=
'z')) ||
377 ((*
cur >=
'0') && (*
cur <=
'9')) ||
378 (*
cur ==
'-') || (*
cur ==
'_') || (*
cur ==
':') || (*
cur ==
'/'))
401 const unsigned char *
cur;
408 if (ctxt->
token != 0) {
421 if ((
int) *ctxt->
input->
cur < 0x80) {
426 "Char 0x%X out of allowed range\n", 0);
435 guess = htmlFindEncoding(ctxt);
452 "Unsupported encoding %s", guess,
NULL);
478 if ((
cur[1] & 0xc0) != 0x80)
480 if ((
c & 0xe0) == 0xe0) {
486 if ((
cur[2] & 0xc0) != 0x80)
488 if ((
c & 0xf0) == 0xf0) {
493 if (((
c & 0xf8) != 0xf0) ||
494 ((
cur[3] & 0xc0) != 0x80))
498 val = (
cur[0] & 0x7) << 18;
499 val |= (
cur[1] & 0x3f) << 12;
500 val |= (
cur[2] & 0x3f) << 6;
507 val = (
cur[0] & 0xf) << 12;
508 val |= (
cur[1] & 0x3f) << 6;
516 val = (
cur[0] & 0x1f) << 6;
523 "Char 0x%X out of allowed range\n",
val);
530 "Char 0x%X out of allowed range\n", 0);
558 "Input is not proper UTF-8, indicate encoding !\n",
/*
 * Element-name classes used to build the content tables below.
 * Each class macro expands to a comma-separated list of string literals;
 * the NB_* companions give the number of entries. Composite counts are
 * parenthesised so they stay correct inside larger expressions
 * (for example `2 * NB_BLOCK`).
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
655static const char*
const html_flow[] = { FLOW,
NULL } ;
656static const char*
const html_inline[] = {
INLINE,
NULL } ;
659static const char*
const html_pcdata[] = {
NULL } ;
660#define html_cdata html_pcdata
/*
 * Attribute-name classes. Each class macro expands to a comma-separated
 * list of string literals; NB_* is the matching entry count.
 * NB_ATTRS sums the three classes that make up ATTRS and is parenthesised
 * so it can be used safely inside larger expressions.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
678static const char*
const html_attrs[] = { ATTRS,
NULL } ;
679static const char*
const core_i18n_attrs[] = { COREATTRS, I18N,
NULL } ;
680static const char*
const core_attrs[] = { COREATTRS,
NULL } ;
681static const char*
const i18n_attrs[] = { I18N,
NULL } ;
685static const char*
const a_attrs[] = { ATTRS,
"charset",
"type",
"name",
686 "href",
"hreflang",
"rel",
"rev",
"accesskey",
"shape",
"coords",
687 "tabindex",
"onfocus",
"onblur",
NULL } ;
/* Short NULL-terminated name lists made only of literals. */
static const char *const target_attr[] = { "target", NULL };
static const char *const rows_cols_attr[] = { "rows", "cols", NULL };
static const char *const alt_attr[] = { "alt", NULL };
static const char *const src_alt_attrs[] = { "src", "alt", NULL };
static const char *const href_attrs[] = { "href", NULL };
static const char *const clear_attrs[] = { "clear", NULL };
694static const char*
const inline_p[] = {
INLINE,
"p",
NULL } ;
696static const char*
const flow_param[] = { FLOW,
"param",
NULL } ;
697static const char*
const applet_attrs[] = { COREATTRS ,
"codebase",
698 "archive",
"alt",
"name",
"height",
"width",
"align",
699 "hspace",
"vspace",
NULL } ;
700static const char*
const area_attrs[] = {
"shape",
"coords",
"href",
"nohref",
701 "tabindex",
"accesskey",
"onfocus",
"onblur",
NULL } ;
702static const char*
const basefont_attrs[] =
703 {
"id",
"size",
"color",
"face",
NULL } ;
704static const char*
const quote_attrs[] = { ATTRS,
"cite",
NULL } ;
705static const char*
const body_contents[] = { FLOW,
"ins",
"del",
NULL } ;
706static const char*
const body_attrs[] = { ATTRS,
"onload",
"onunload",
NULL } ;
707static const char*
const body_depr[] = {
"background",
"bgcolor",
"text",
708 "link",
"vlink",
"alink",
NULL } ;
709static const char*
const button_attrs[] = { ATTRS,
"name",
"value",
"type",
710 "disabled",
"tabindex",
"accesskey",
"onfocus",
"onblur",
NULL } ;
713static const char*
const col_attrs[] = { ATTRS,
"span",
"width", CELLHALIGN, CELLVALIGN,
NULL } ;
714static const char*
const col_elt[] = {
"col",
NULL } ;
715static const char*
const edit_attrs[] = { ATTRS,
"datetime",
"cite",
NULL } ;
716static const char*
const compact_attrs[] = { ATTRS,
"compact",
NULL } ;
717static const char*
const dl_contents[] = {
"dt",
"dd",
NULL } ;
718static const char*
const compact_attr[] = {
"compact",
NULL } ;
719static const char*
const label_attr[] = {
"label",
NULL } ;
720static const char*
const fieldset_contents[] = { FLOW,
"legend" } ;
721static const char*
const font_attrs[] = { COREATTRS, I18N,
"size",
"color",
"face" ,
NULL } ;
722static const char*
const form_contents[] = { HEADING,
LIST,
INLINE,
"pre",
"p",
"div",
"center",
"noscript",
"noframes",
"blockquote",
"isindex",
"hr",
"table",
"fieldset",
"address",
NULL } ;
723static const char*
const form_attrs[] = { ATTRS,
"method",
"enctype",
"accept",
"name",
"onsubmit",
"onreset",
"accept-charset",
NULL } ;
724static const char*
const frame_attrs[] = { COREATTRS,
"longdesc",
"name",
"src",
"frameborder",
"marginwidth",
"marginheight",
"noresize",
"scrolling" ,
NULL } ;
725static const char*
const frameset_attrs[] = { COREATTRS,
"rows",
"cols",
"onload",
"onunload",
NULL } ;
726static const char*
const frameset_contents[] = {
"frameset",
"frame",
"noframes",
NULL } ;
727static const char*
const head_attrs[] = { I18N,
"profile",
NULL } ;
728static const char*
const head_contents[] = {
"title",
"isindex",
"base",
"script",
"style",
"meta",
"link",
"object",
NULL } ;
729static const char*
const hr_depr[] = {
"align",
"noshade",
"size",
"width",
NULL } ;
730static const char*
const version_attr[] = {
"version",
NULL } ;
731static const char*
const html_content[] = {
"head",
"body",
"frameset",
NULL } ;
732static const char*
const iframe_attrs[] = { COREATTRS,
"longdesc",
"name",
"src",
"frameborder",
"marginwidth",
"marginheight",
"scrolling",
"align",
"height",
"width",
NULL } ;
733static const char*
const img_attrs[] = { ATTRS,
"longdesc",
"name",
"height",
"width",
"usemap",
"ismap",
NULL } ;
734static const char*
const embed_attrs[] = { COREATTRS,
"align",
"alt",
"border",
"code",
"codebase",
"frameborder",
"height",
"hidden",
"hspace",
"name",
"palette",
"pluginspace",
"pluginurl",
"src",
"type",
"units",
"vspace",
"width",
NULL } ;
735static const char*
const input_attrs[] = { ATTRS,
"type",
"name",
"value",
"checked",
"disabled",
"readonly",
"size",
"maxlength",
"src",
"alt",
"usemap",
"ismap",
"tabindex",
"accesskey",
"onfocus",
"onblur",
"onselect",
"onchange",
"accept",
NULL } ;
736static const char*
const prompt_attrs[] = { COREATTRS, I18N,
"prompt",
NULL } ;
737static const char*
const label_attrs[] = { ATTRS,
"for",
"accesskey",
"onfocus",
"onblur",
NULL } ;
738static const char*
const legend_attrs[] = { ATTRS,
"accesskey",
NULL } ;
739static const char*
const align_attr[] = {
"align",
NULL } ;
740static const char*
const link_attrs[] = { ATTRS,
"charset",
"href",
"hreflang",
"type",
"rel",
"rev",
"media",
NULL } ;
741static const char*
const map_contents[] = {
BLOCK,
"area",
NULL } ;
742static const char*
const name_attr[] = {
"name",
NULL } ;
743static const char*
const action_attr[] = {
"action",
NULL } ;
744static const char*
const blockli_elt[] = {
BLOCK,
"li",
NULL } ;
745static const char*
const meta_attrs[] = { I18N,
"http-equiv",
"name",
"scheme",
"charset",
NULL } ;
746static const char*
const content_attr[] = {
"content",
NULL } ;
747static const char*
const type_attr[] = {
"type",
NULL } ;
748static const char*
const noframes_content[] = {
"body", FLOW MODIFIER,
NULL } ;
749static const char*
const object_contents[] = { FLOW,
"param",
NULL } ;
750static const char*
const object_attrs[] = { ATTRS,
"declare",
"classid",
"codebase",
"data",
"type",
"codetype",
"archive",
"standby",
"height",
"width",
"usemap",
"name",
"tabindex",
NULL } ;
751static const char*
const object_depr[] = {
"align",
"border",
"hspace",
"vspace",
NULL } ;
752static const char*
const ol_attrs[] = {
"type",
"compact",
"start",
NULL} ;
753static const char*
const option_elt[] = {
"option",
NULL } ;
754static const char*
const optgroup_attrs[] = { ATTRS,
"disabled",
NULL } ;
755static const char*
const option_attrs[] = { ATTRS,
"disabled",
"label",
"selected",
"value",
NULL } ;
756static const char*
const param_attrs[] = {
"id",
"value",
"valuetype",
"type",
NULL } ;
757static const char*
const width_attr[] = {
"width",
NULL } ;
758static const char*
const pre_content[] = { PHRASE,
"tt",
"i",
"b",
"u",
"s",
"strike",
"a",
"br",
"script",
"map",
"q",
"span",
"bdo",
"iframe",
NULL } ;
759static const char*
const script_attrs[] = {
"charset",
"src",
"defer",
"event",
"for",
NULL } ;
760static const char*
const language_attr[] = {
"language",
NULL } ;
761static const char*
const select_content[] = {
"optgroup",
"option",
NULL } ;
762static const char*
const select_attrs[] = { ATTRS,
"name",
"size",
"multiple",
"disabled",
"tabindex",
"onfocus",
"onblur",
"onchange",
NULL } ;
763static const char*
const style_attrs[] = { I18N,
"media",
"title",
NULL } ;
764static const char*
const table_attrs[] = { ATTRS,
"summary",
"width",
"border",
"frame",
"rules",
"cellspacing",
"cellpadding",
"datapagesize",
NULL } ;
765static const char*
const table_depr[] = {
"align",
"bgcolor",
NULL } ;
766static const char*
const table_contents[] = {
"caption",
"col",
"colgroup",
"thead",
"tfoot",
"tbody",
"tr",
NULL} ;
767static const char*
const tr_elt[] = {
"tr",
NULL } ;
768static const char*
const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN,
NULL} ;
769static const char*
const th_td_depr[] = {
"nowrap",
"bgcolor",
"width",
"height",
NULL } ;
770static const char*
const th_td_attr[] = { ATTRS,
"abbr",
"axis",
"headers",
"scope",
"rowspan",
"colspan", CELLHALIGN, CELLVALIGN,
NULL } ;
771static const char*
const textarea_attrs[] = { ATTRS,
"name",
"disabled",
"readonly",
"tabindex",
"accesskey",
"onfocus",
"onblur",
"onselect",
"onchange",
NULL } ;
772static const char*
const tr_contents[] = {
"th",
"td",
NULL } ;
773static const char*
const bgcolor_attr[] = {
"bgcolor",
NULL } ;
774static const char*
const li_elt[] = {
"li",
NULL } ;
775static const char*
const ul_depr[] = {
"type",
"compact",
NULL} ;
776static const char*
const dir_attr[] = {
"dir",
NULL} ;
778#define DECL (const char**)
780static const htmlElemDesc
781html40ElementTable[] = {
782{
"a", 0, 0, 0, 0, 0, 0, 1,
"anchor ",
783 DECL html_inline ,
NULL , DECL a_attrs , DECL target_attr,
NULL
785{
"abbr", 0, 0, 0, 0, 0, 0, 1,
"abbreviated form",
788{
"acronym", 0, 0, 0, 0, 0, 0, 1,
"",
791{
"address", 0, 0, 0, 0, 0, 0, 0,
"information on author ",
794{
"applet", 0, 0, 0, 0, 1, 1, 2,
"java applet ",
797{
"area", 0, 2, 2, 1, 0, 0, 0,
"client-side image map area ",
798 EMPTY ,
NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
800{
"b", 0, 3, 0, 0, 0, 0, 1,
"bold text style",
803{
"base", 0, 2, 2, 1, 0, 0, 0,
"document base uri ",
806{
"basefont", 0, 2, 2, 1, 1, 1, 1,
"base font size " ,
809{
"bdo", 0, 0, 0, 0, 0, 0, 1,
"i18n bidi over-ride ",
810 DECL html_inline ,
NULL , DECL core_i18n_attrs,
NULL, DECL dir_attr
812{
"big", 0, 3, 0, 0, 0, 0, 1,
"large text style",
815{
"blockquote", 0, 0, 0, 0, 0, 0, 0,
"long quotation ",
818{
"body", 1, 1, 0, 0, 0, 0, 0,
"document body ",
819 DECL body_contents ,
"div" , DECL body_attrs, DECL body_depr,
NULL
821{
"br", 0, 2, 2, 1, 0, 0, 1,
"forced line break ",
824{
"button", 0, 0, 0, 0, 0, 0, 2,
"push button ",
825 DECL html_flow MODIFIER ,
NULL , DECL button_attrs,
NULL,
NULL
827{
"caption", 0, 0, 0, 0, 0, 0, 0,
"table caption ",
830{
"center", 0, 3, 0, 0, 1, 1, 0,
"shorthand for div align=center ",
833{
"cite", 0, 0, 0, 0, 0, 0, 1,
"citation",
836{
"code", 0, 0, 0, 0, 0, 0, 1,
"computer code fragment",
839{
"col", 0, 2, 2, 1, 0, 0, 0,
"table column ",
842{
"colgroup", 0, 1, 0, 0, 0, 0, 0,
"table column group ",
843 DECL col_elt ,
"col" , DECL col_attrs ,
NULL,
NULL
845{
"dd", 0, 1, 0, 0, 0, 0, 0,
"definition description ",
848{
"del", 0, 0, 0, 0, 0, 0, 2,
"deleted text ",
851{
"dfn", 0, 0, 0, 0, 0, 0, 1,
"instance definition",
854{
"dir", 0, 0, 0, 0, 1, 1, 0,
"directory list",
855 DECL blockli_elt,
"li" ,
NULL, DECL compact_attrs,
NULL
857{
"div", 0, 0, 0, 0, 0, 0, 0,
"generic language/style container",
858 DECL html_flow,
NULL, DECL html_attrs, DECL align_attr,
NULL
860{
"dl", 0, 0, 0, 0, 0, 0, 0,
"definition list ",
861 DECL dl_contents ,
"dd" , DECL html_attrs, DECL compact_attr,
NULL
863{
"dt", 0, 1, 0, 0, 0, 0, 0,
"definition term ",
866{
"em", 0, 3, 0, 0, 0, 0, 1,
"emphasis",
869{
"embed", 0, 1, 0, 0, 1, 1, 1,
"generic embedded object ",
872{
"fieldset", 0, 0, 0, 0, 0, 0, 0,
"form control group ",
873 DECL fieldset_contents ,
NULL, DECL html_attrs,
NULL,
NULL
875{
"font", 0, 3, 0, 0, 1, 1, 1,
"local change to font ",
878{
"form", 0, 0, 0, 0, 0, 0, 0,
"interactive form ",
879 DECL form_contents,
"fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
881{
"frame", 0, 2, 2, 1, 0, 2, 0,
"subwindow " ,
884{
"frameset", 0, 0, 0, 0, 0, 2, 0,
"window subdivision" ,
885 DECL frameset_contents,
"noframes" ,
NULL , DECL frameset_attrs,
NULL
887{
"h1", 0, 0, 0, 0, 0, 0, 0,
"heading ",
888 DECL html_inline,
NULL, DECL html_attrs, DECL align_attr,
NULL
890{
"h2", 0, 0, 0, 0, 0, 0, 0,
"heading ",
891 DECL html_inline,
NULL, DECL html_attrs, DECL align_attr,
NULL
893{
"h3", 0, 0, 0, 0, 0, 0, 0,
"heading ",
894 DECL html_inline,
NULL, DECL html_attrs, DECL align_attr,
NULL
896{
"h4", 0, 0, 0, 0, 0, 0, 0,
"heading ",
897 DECL html_inline,
NULL, DECL html_attrs, DECL align_attr,
NULL
899{
"h5", 0, 0, 0, 0, 0, 0, 0,
"heading ",
900 DECL html_inline,
NULL, DECL html_attrs, DECL align_attr,
NULL
902{
"h6", 0, 0, 0, 0, 0, 0, 0,
"heading ",
903 DECL html_inline,
NULL, DECL html_attrs, DECL align_attr,
NULL
905{
"head", 1, 1, 0, 0, 0, 0, 0,
"document head ",
908{
"hr", 0, 2, 2, 1, 0, 0, 0,
"horizontal rule " ,
911{
"html", 1, 1, 0, 0, 0, 0, 0,
"document root element ",
912 DECL html_content ,
NULL , DECL i18n_attrs, DECL version_attr,
NULL
914{
"i", 0, 3, 0, 0, 0, 0, 1,
"italic text style",
917{
"iframe", 0, 0, 0, 0, 0, 1, 2,
"inline subwindow ",
920{
"img", 0, 2, 2, 1, 0, 0, 1,
"embedded image ",
921 EMPTY,
NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
923{
"input", 0, 2, 2, 1, 0, 0, 1,
"form control ",
926{
"ins", 0, 0, 0, 0, 0, 0, 2,
"inserted text",
929{
"isindex", 0, 2, 2, 1, 1, 1, 0,
"single line prompt ",
932{
"kbd", 0, 0, 0, 0, 0, 0, 1,
"text to be entered by the user",
935{
"label", 0, 0, 0, 0, 0, 0, 1,
"form field label text ",
936 DECL html_inline MODIFIER,
NULL, DECL label_attrs ,
NULL,
NULL
938{
"legend", 0, 0, 0, 0, 0, 0, 0,
"fieldset legend ",
939 DECL html_inline,
NULL, DECL legend_attrs , DECL align_attr,
NULL
941{
"li", 0, 1, 1, 0, 0, 0, 0,
"list item ",
944{
"link", 0, 2, 2, 1, 0, 0, 0,
"a media-independent link ",
947{
"map", 0, 0, 0, 0, 0, 0, 2,
"client-side image map ",
948 DECL map_contents ,
NULL, DECL html_attrs ,
NULL, DECL name_attr
950{
"menu", 0, 0, 0, 0, 1, 1, 0,
"menu list ",
953{
"meta", 0, 2, 2, 1, 0, 0, 0,
"generic metainformation ",
956{
"noframes", 0, 0, 0, 0, 0, 2, 0,
"alternate content container for non frame-based rendering ",
957 DECL noframes_content,
"body" , DECL html_attrs,
NULL,
NULL
959{
"noscript", 0, 0, 0, 0, 0, 0, 0,
"alternate content container for non script-based rendering ",
960 DECL html_flow,
"div", DECL html_attrs,
NULL,
NULL
962{
"object", 0, 0, 0, 0, 0, 0, 2,
"generic embedded object ",
963 DECL object_contents ,
"div" , DECL object_attrs, DECL object_depr,
NULL
965{
"ol", 0, 0, 0, 0, 0, 0, 0,
"ordered list ",
966 DECL li_elt ,
"li" , DECL html_attrs, DECL ol_attrs,
NULL
968{
"optgroup", 0, 0, 0, 0, 0, 0, 0,
"option group ",
969 DECL option_elt ,
"option", DECL optgroup_attrs,
NULL, DECL label_attr
971{
"option", 0, 1, 0, 0, 0, 0, 0,
"selectable choice " ,
974{
"p", 0, 1, 0, 0, 0, 0, 0,
"paragraph ",
975 DECL html_inline,
NULL, DECL html_attrs, DECL align_attr,
NULL
977{
"param", 0, 2, 2, 1, 0, 0, 0,
"named property value ",
980{
"pre", 0, 0, 0, 0, 0, 0, 0,
"preformatted text ",
981 DECL pre_content,
NULL, DECL html_attrs, DECL width_attr,
NULL
983{
"q", 0, 0, 0, 0, 0, 0, 1,
"short inline quotation ",
986{
"s", 0, 3, 0, 0, 1, 1, 1,
"strike-through text style",
989{
"samp", 0, 0, 0, 0, 0, 0, 1,
"sample program output, scripts, etc.",
992{
"script", 0, 0, 0, 0, 0, 0, 2,
"script statements ",
993 DECL html_cdata,
NULL, DECL script_attrs, DECL language_attr, DECL type_attr
995{
"select", 0, 0, 0, 0, 0, 0, 1,
"option selector ",
996 DECL select_content,
NULL, DECL select_attrs,
NULL,
NULL
998{
"small", 0, 3, 0, 0, 0, 0, 1,
"small text style",
1001{
"span", 0, 0, 0, 0, 0, 0, 1,
"generic language/style container ",
1004{
"strike", 0, 3, 0, 0, 1, 1, 1,
"strike-through text",
1007{
"strong", 0, 3, 0, 0, 0, 0, 1,
"strong emphasis",
1010{
"style", 0, 0, 0, 0, 0, 0, 0,
"style info ",
1011 DECL html_cdata,
NULL, DECL style_attrs,
NULL, DECL type_attr
1013{
"sub", 0, 3, 0, 0, 0, 0, 1,
"subscript",
1016{
"sup", 0, 3, 0, 0, 0, 0, 1,
"superscript ",
1019{
"table", 0, 0, 0, 0, 0, 0, 0,
"",
1020 DECL table_contents ,
"tr" , DECL table_attrs , DECL table_depr,
NULL
1022{
"tbody", 1, 0, 0, 0, 0, 0, 0,
"table body ",
1023 DECL tr_elt ,
"tr" , DECL talign_attrs,
NULL,
NULL
1025{
"td", 0, 0, 0, 0, 0, 0, 0,
"table data cell",
1026 DECL html_flow,
NULL, DECL th_td_attr, DECL th_td_depr,
NULL
1028{
"textarea", 0, 0, 0, 0, 0, 0, 1,
"multi-line text field ",
1029 DECL html_pcdata,
NULL, DECL textarea_attrs,
NULL, DECL rows_cols_attr
1031{
"tfoot", 0, 1, 0, 0, 0, 0, 0,
"table footer ",
1032 DECL tr_elt ,
"tr" , DECL talign_attrs,
NULL,
NULL
1034{
"th", 0, 1, 0, 0, 0, 0, 0,
"table header cell",
1035 DECL html_flow,
NULL, DECL th_td_attr, DECL th_td_depr,
NULL
1037{
"thead", 0, 1, 0, 0, 0, 0, 0,
"table header ",
1038 DECL tr_elt ,
"tr" , DECL talign_attrs,
NULL,
NULL
1040{
"title", 0, 0, 0, 0, 0, 0, 0,
"document title ",
1043{
"tr", 0, 0, 0, 0, 0, 0, 0,
"table row ",
1044 DECL tr_contents ,
"td" , DECL talign_attrs, DECL bgcolor_attr,
NULL
1046{
"tt", 0, 3, 0, 0, 0, 0, 1,
"teletype or monospaced text style",
1049{
"u", 0, 3, 0, 0, 1, 1, 1,
"underlined text style",
1052{
"ul", 0, 0, 0, 0, 0, 0, 0,
"unordered list ",
1053 DECL li_elt ,
"li" , DECL html_attrs, DECL ul_depr,
NULL
1055{
"var", 0, 0, 0, 0, 0, 0, 1,
"instance of a variable or program argument",
1063} htmlStartCloseEntry;
1068static const htmlStartCloseEntry htmlStartClose[] = {
1070 {
"a",
"fieldset" },
1074 {
"address",
"dd" },
1075 {
"address",
"dl" },
1076 {
"address",
"dt" },
1077 {
"address",
"form" },
1078 {
"address",
"li" },
1079 {
"address",
"ul" },
1085 {
"caption",
"col" },
1086 {
"caption",
"colgroup" },
1087 {
"caption",
"tbody" },
1088 {
"caption",
"tfoot" },
1089 {
"caption",
"thead" },
1090 {
"caption",
"tr" },
1092 {
"col",
"colgroup" },
1097 {
"colgroup",
"colgroup" },
1098 {
"colgroup",
"tbody" },
1099 {
"colgroup",
"tfoot" },
1100 {
"colgroup",
"thead" },
1101 {
"colgroup",
"tr" },
1112 {
"font",
"center" },
1116 {
"h1",
"fieldset" },
1121 {
"h2",
"fieldset" },
1126 {
"h3",
"fieldset" },
1131 {
"h4",
"fieldset" },
1136 {
"h5",
"fieldset" },
1141 {
"h6",
"fieldset" },
1148 {
"head",
"acronym" },
1149 {
"head",
"address" },
1153 {
"head",
"blockquote" },
1156 {
"head",
"center" },
1166 {
"head",
"fieldset" },
1169 {
"head",
"frameset" },
1178 {
"head",
"iframe" },
1182 {
"head",
"listing" },
1191 {
"head",
"small" },
1193 {
"head",
"strike" },
1194 {
"head",
"strong" },
1197 {
"head",
"table" },
1208 {
"legend",
"fieldset" },
1211 {
"link",
"frameset" },
1212 {
"listing",
"dd" },
1213 {
"listing",
"dl" },
1214 {
"listing",
"dt" },
1215 {
"listing",
"fieldset" },
1216 {
"listing",
"form" },
1217 {
"listing",
"li" },
1218 {
"listing",
"table" },
1219 {
"listing",
"ul" },
1227 {
"option",
"optgroup" },
1228 {
"option",
"option" },
1230 {
"p",
"blockquote" },
1235 {
"p",
"colgroup" },
1241 {
"p",
"fieldset" },
1243 {
"p",
"frameset" },
1270 {
"pre",
"fieldset" },
1276 {
"script",
"noscript" },
1281 {
"style",
"body" },
1282 {
"style",
"frameset" },
1283 {
"tbody",
"tbody" },
1284 {
"tbody",
"tfoot" },
1290 {
"tfoot",
"tbody" },
1296 {
"thead",
"tbody" },
1297 {
"thead",
"tfoot" },
1298 {
"title",
"body" },
1299 {
"title",
"frameset" },
1307 {
"ul",
"address" },
1315 {
"xmp",
"fieldset" },
1329static const char *
const htmlNoContentElements[] = {
1340static const char *
const htmlScriptAttributes[] = {
1374static const elementPriority htmlEndPriority[] = {
1404htmlInitAutoClose(
void) {
1408htmlCompareTags(
const void *
key,
const void *
member) {
1410 const htmlElemDesc *
desc = (
const htmlElemDesc *)
member;
1428 return((
const htmlElemDesc *)
bsearch(
tag, html40ElementTable,
1429 sizeof(html40ElementTable) /
sizeof(htmlElemDesc),
1430 sizeof(htmlElemDesc), htmlCompareTags));
1443 while ((htmlEndPriority[
i].
name !=
NULL) &&
1452htmlCompareStartClose(
const void *vkey,
const void *
member) {
1453 const htmlStartCloseEntry *
key = (
const htmlStartCloseEntry *) vkey;
1454 const htmlStartCloseEntry *
entry = (
const htmlStartCloseEntry *)
member;
1475htmlCheckAutoClose(
const xmlChar * newtag,
const xmlChar * oldtag)
1477 htmlStartCloseEntry
key;
1480 key.oldTag = (
const char *) oldtag;
1481 key.newTag = (
const char *) newtag;
1483 sizeof(htmlStartClose) /
sizeof(htmlStartCloseEntry),
1484 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,
const xmlChar * newtag)
1499 const htmlElemDesc *
info;
1502 priority = htmlGetEndPriority(newtag);
1504 for (
i = (ctxt->nameNr - 1);
i >= 0;
i--) {
1514 if (htmlGetEndPriority(ctxt->nameTab[
i]) >
priority)
1521 info = htmlTagLookup(ctxt->name);
1524 "Opening and ending tag mismatch: %s and %s\n",
1525 newtag, ctxt->name);
1527 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
1528 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1540htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1544 if (ctxt->nameNr == 0)
1546 for (
i = (ctxt->nameNr - 1);
i >= 0;
i--) {
1547 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
1548 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1566htmlAutoClose(htmlParserCtxtPtr ctxt,
const xmlChar * newtag)
1568 while ((newtag !=
NULL) && (ctxt->name !=
NULL) &&
1569 (htmlCheckAutoClose(newtag, ctxt->name))) {
1570 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
1571 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1574 if (newtag ==
NULL) {
1575 htmlAutoCloseOnEnd(ctxt);
1578 while ((newtag ==
NULL) && (ctxt->name !=
NULL) &&
1582 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
1583 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1602htmlAutoCloseTag(htmlDocPtr doc,
const xmlChar *
name, htmlNodePtr
elem) {
1607 if (htmlCheckAutoClose(
elem->name,
name))
return(1);
1610 if (htmlAutoCloseTag(doc,
name,
child))
return(1);
1628htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr
elem) {
1634 if (htmlAutoCloseTag(doc,
elem->name,
child))
return(1);
1650htmlCheckImplied(htmlParserCtxtPtr ctxt,
const xmlChar *newtag) {
1653 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1655 if (!htmlOmittedDefaultValue)
1659 if (ctxt->nameNr <= 0) {
1660 htmlnamePush(ctxt,
BAD_CAST"html");
1661 if ((ctxt->sax !=
NULL) && (ctxt->sax->startElement !=
NULL))
1662 ctxt->sax->startElement(ctxt->userData,
BAD_CAST"html",
NULL);
1666 if ((ctxt->nameNr <= 1) &&
1673 if (ctxt->html >= 3) {
1681 htmlnamePush(ctxt,
BAD_CAST"head");
1682 if ((ctxt->sax !=
NULL) && (ctxt->sax->startElement !=
NULL))
1683 ctxt->sax->startElement(ctxt->userData,
BAD_CAST"head",
NULL);
1687 if (ctxt->html >= 10) {
1691 for (
i = 0;
i < ctxt->nameNr;
i++) {
1700 htmlnamePush(ctxt,
BAD_CAST"body");
1701 if ((ctxt->sax !=
NULL) && (ctxt->sax->startElement !=
NULL))
1702 ctxt->sax->startElement(ctxt->userData,
BAD_CAST"body",
NULL);
1718htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1727 htmlCheckImplied(ctxt,
BAD_CAST"p");
1729 if ((ctxt->sax !=
NULL) && (ctxt->sax->startElement !=
NULL))
1730 ctxt->sax->startElement(ctxt->userData,
BAD_CAST"p",
NULL);
1733 if (!htmlOmittedDefaultValue)
1735 for (
i = 0; htmlNoContentElements[
i] !=
NULL;
i++) {
1738 htmlCheckImplied(ctxt,
BAD_CAST"p");
1740 if ((ctxt->sax !=
NULL) && (ctxt->sax->startElement !=
NULL))
1741 ctxt->sax->startElement(ctxt->userData,
BAD_CAST"p",
NULL);
1765 if ((
name[0] !=
'o') || (
name[1] !=
'n'))
1768 i <
sizeof(htmlScriptAttributes)/
sizeof(htmlScriptAttributes[0]);
1783static const htmlEntityDesc html40EntitiesTable[] = {
1787{ 34,
"quot",
"quotation mark = APL quote, U+0022 ISOnum" },
1788{ 38,
"amp",
"ampersand, U+0026 ISOnum" },
1789{ 39,
"apos",
"single quote" },
1790{ 60,
"lt",
"less-than sign, U+003C ISOnum" },
1791{ 62,
"gt",
"greater-than sign, U+003E ISOnum" },
1797{ 160,
"nbsp",
"no-break space = non-breaking space, U+00A0 ISOnum" },
1798{ 161,
"iexcl",
"inverted exclamation mark, U+00A1 ISOnum" },
1799{ 162,
"cent",
"cent sign, U+00A2 ISOnum" },
1800{ 163,
"pound",
"pound sign, U+00A3 ISOnum" },
1801{ 164,
"curren",
"currency sign, U+00A4 ISOnum" },
1802{ 165,
"yen",
"yen sign = yuan sign, U+00A5 ISOnum" },
1803{ 166,
"brvbar",
"broken bar = broken vertical bar, U+00A6 ISOnum" },
1804{ 167,
"sect",
"section sign, U+00A7 ISOnum" },
1805{ 168,
"uml",
"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1806{ 169,
"copy",
"copyright sign, U+00A9 ISOnum" },
1807{ 170,
"ordf",
"feminine ordinal indicator, U+00AA ISOnum" },
1808{ 171,
"laquo",
"left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1809{ 172,
"not",
"not sign, U+00AC ISOnum" },
1810{ 173,
"shy",
"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1811{ 174,
"reg",
"registered sign = registered trade mark sign, U+00AE ISOnum" },
1812{ 175,
"macr",
"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1813{ 176,
"deg",
"degree sign, U+00B0 ISOnum" },
1814{ 177,
"plusmn",
"plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1815{ 178,
"sup2",
"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1816{ 179,
"sup3",
"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1817{ 180,
"acute",
"acute accent = spacing acute, U+00B4 ISOdia" },
1818{ 181,
"micro",
"micro sign, U+00B5 ISOnum" },
1819{ 182,
"para",
"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1820{ 183,
"middot",
"middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1821{ 184,
"cedil",
"cedilla = spacing cedilla, U+00B8 ISOdia" },
1822{ 185,
"sup1",
"superscript one = superscript digit one, U+00B9 ISOnum" },
1823{ 186,
"ordm",
"masculine ordinal indicator, U+00BA ISOnum" },
1824{ 187,
"raquo",
"right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1825{ 188,
"frac14",
"vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1826{ 189,
"frac12",
"vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1827{ 190,
"frac34",
"vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1828{ 191,
"iquest",
"inverted question mark = turned question mark, U+00BF ISOnum" },
1829{ 192,
"Agrave",
"latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1830{ 193,
"Aacute",
"latin capital letter A with acute, U+00C1 ISOlat1" },
1831{ 194,
"Acirc",
"latin capital letter A with circumflex, U+00C2 ISOlat1" },
1832{ 195,
"Atilde",
"latin capital letter A with tilde, U+00C3 ISOlat1" },
1833{ 196,
"Auml",
"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1834{ 197,
"Aring",
"latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1835{ 198,
"AElig",
"latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1836{ 199,
"Ccedil",
"latin capital letter C with cedilla, U+00C7 ISOlat1" },
1837{ 200,
"Egrave",
"latin capital letter E with grave, U+00C8 ISOlat1" },
1838{ 201,
"Eacute",
"latin capital letter E with acute, U+00C9 ISOlat1" },
1839{ 202,
"Ecirc",
"latin capital letter E with circumflex, U+00CA ISOlat1" },
1840{ 203,
"Euml",
"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1841{ 204,
"Igrave",
"latin capital letter I with grave, U+00CC ISOlat1" },
1842{ 205,
"Iacute",
"latin capital letter I with acute, U+00CD ISOlat1" },
1843{ 206,
"Icirc",
"latin capital letter I with circumflex, U+00CE ISOlat1" },
1844{ 207,
"Iuml",
"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1845{ 208,
"ETH",
"latin capital letter ETH, U+00D0 ISOlat1" },
1846{ 209,
"Ntilde",
"latin capital letter N with tilde, U+00D1 ISOlat1" },
1847{ 210,
"Ograve",
"latin capital letter O with grave, U+00D2 ISOlat1" },
1848{ 211,
"Oacute",
"latin capital letter O with acute, U+00D3 ISOlat1" },
1849{ 212,
"Ocirc",
"latin capital letter O with circumflex, U+00D4 ISOlat1" },
1850{ 213,
"Otilde",
"latin capital letter O with tilde, U+00D5 ISOlat1" },
1851{ 214,
"Ouml",
"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1852{ 215,
"times",
"multiplication sign, U+00D7 ISOnum" },
1853{ 216,
"Oslash",
"latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1854{ 217,
"Ugrave",
"latin capital letter U with grave, U+00D9 ISOlat1" },
1855{ 218,
"Uacute",
"latin capital letter U with acute, U+00DA ISOlat1" },
1856{ 219,
"Ucirc",
"latin capital letter U with circumflex, U+00DB ISOlat1" },
1857{ 220,
"Uuml",
"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1858{ 221,
"Yacute",
"latin capital letter Y with acute, U+00DD ISOlat1" },
1859{ 222,
"THORN",
"latin capital letter THORN, U+00DE ISOlat1" },
1860{ 223,
"szlig",
"latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1861{ 224,
"agrave",
"latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1862{ 225,
"aacute",
"latin small letter a with acute, U+00E1 ISOlat1" },
1863{ 226,
"acirc",
"latin small letter a with circumflex, U+00E2 ISOlat1" },
1864{ 227,
"atilde",
"latin small letter a with tilde, U+00E3 ISOlat1" },
1865{ 228,
"auml",
"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1866{ 229,
"aring",
"latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1867{ 230,
"aelig",
"latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1868{ 231,
"ccedil",
"latin small letter c with cedilla, U+00E7 ISOlat1" },
1869{ 232,
"egrave",
"latin small letter e with grave, U+00E8 ISOlat1" },
1870{ 233,
"eacute",
"latin small letter e with acute, U+00E9 ISOlat1" },
1871{ 234,
"ecirc",
"latin small letter e with circumflex, U+00EA ISOlat1" },
1872{ 235,
"euml",
"latin small letter e with diaeresis, U+00EB ISOlat1" },
1873{ 236,
"igrave",
"latin small letter i with grave, U+00EC ISOlat1" },
1874{ 237,
"iacute",
"latin small letter i with acute, U+00ED ISOlat1" },
1875{ 238,
"icirc",
"latin small letter i with circumflex, U+00EE ISOlat1" },
1876{ 239,
"iuml",
"latin small letter i with diaeresis, U+00EF ISOlat1" },
1877{ 240,
"eth",
"latin small letter eth, U+00F0 ISOlat1" },
1878{ 241,
"ntilde",
"latin small letter n with tilde, U+00F1 ISOlat1" },
1879{ 242,
"ograve",
"latin small letter o with grave, U+00F2 ISOlat1" },
1880{ 243,
"oacute",
"latin small letter o with acute, U+00F3 ISOlat1" },
1881{ 244,
"ocirc",
"latin small letter o with circumflex, U+00F4 ISOlat1" },
1882{ 245,
"otilde",
"latin small letter o with tilde, U+00F5 ISOlat1" },
1883{ 246,
"ouml",
"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1884{ 247,
"divide",
"division sign, U+00F7 ISOnum" },
1885{ 248,
"oslash",
"latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1886{ 249,
"ugrave",
"latin small letter u with grave, U+00F9 ISOlat1" },
1887{ 250,
"uacute",
"latin small letter u with acute, U+00FA ISOlat1" },
1888{ 251,
"ucirc",
"latin small letter u with circumflex, U+00FB ISOlat1" },
1889{ 252,
"uuml",
"latin small letter u with diaeresis, U+00FC ISOlat1" },
1890{ 253,
"yacute",
"latin small letter y with acute, U+00FD ISOlat1" },
1891{ 254,
"thorn",
"latin small letter thorn with, U+00FE ISOlat1" },
1892{ 255,
"yuml",
"latin small letter y with diaeresis, U+00FF ISOlat1" },
1894{ 338,
"OElig",
"latin capital ligature OE, U+0152 ISOlat2" },
1895{ 339,
"oelig",
"latin small ligature oe, U+0153 ISOlat2" },
1896{ 352,
"Scaron",
"latin capital letter S with caron, U+0160 ISOlat2" },
1897{ 353,
"scaron",
"latin small letter s with caron, U+0161 ISOlat2" },
1898{ 376,
"Yuml",
"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1903{ 402,
"fnof",
"latin small f with hook = function = florin, U+0192 ISOtech" },
1905{ 710,
"circ",
"modifier letter circumflex accent, U+02C6 ISOpub" },
1906{ 732,
"tilde",
"small tilde, U+02DC ISOdia" },
1908{ 913,
"Alpha",
"greek capital letter alpha, U+0391" },
1909{ 914,
"Beta",
"greek capital letter beta, U+0392" },
1910{ 915,
"Gamma",
"greek capital letter gamma, U+0393 ISOgrk3" },
1911{ 916,
"Delta",
"greek capital letter delta, U+0394 ISOgrk3" },
1912{ 917,
"Epsilon",
"greek capital letter epsilon, U+0395" },
1913{ 918,
"Zeta",
"greek capital letter zeta, U+0396" },
1914{ 919,
"Eta",
"greek capital letter eta, U+0397" },
1915{ 920,
"Theta",
"greek capital letter theta, U+0398 ISOgrk3" },
1916{ 921,
"Iota",
"greek capital letter iota, U+0399" },
1917{ 922,
"Kappa",
"greek capital letter kappa, U+039A" },
1918{ 923,
"Lambda",
"greek capital letter lambda, U+039B ISOgrk3" },
1919{ 924,
"Mu",
"greek capital letter mu, U+039C" },
1920{ 925,
"Nu",
"greek capital letter nu, U+039D" },
1921{ 926,
"Xi",
"greek capital letter xi, U+039E ISOgrk3" },
1922{ 927,
"Omicron",
"greek capital letter omicron, U+039F" },
1923{ 928,
"Pi",
"greek capital letter pi, U+03A0 ISOgrk3" },
1924{ 929,
"Rho",
"greek capital letter rho, U+03A1" },
1925{ 931,
"Sigma",
"greek capital letter sigma, U+03A3 ISOgrk3" },
1926{ 932,
"Tau",
"greek capital letter tau, U+03A4" },
1927{ 933,
"Upsilon",
"greek capital letter upsilon, U+03A5 ISOgrk3" },
1928{ 934,
"Phi",
"greek capital letter phi, U+03A6 ISOgrk3" },
1929{ 935,
"Chi",
"greek capital letter chi, U+03A7" },
1930{ 936,
"Psi",
"greek capital letter psi, U+03A8 ISOgrk3" },
1931{ 937,
"Omega",
"greek capital letter omega, U+03A9 ISOgrk3" },
1933{ 945,
"alpha",
"greek small letter alpha, U+03B1 ISOgrk3" },
1934{ 946,
"beta",
"greek small letter beta, U+03B2 ISOgrk3" },
1935{ 947,
"gamma",
"greek small letter gamma, U+03B3 ISOgrk3" },
1936{ 948,
"delta",
"greek small letter delta, U+03B4 ISOgrk3" },
1937{ 949,
"epsilon",
"greek small letter epsilon, U+03B5 ISOgrk3" },
1938{ 950,
"zeta",
"greek small letter zeta, U+03B6 ISOgrk3" },
1939{ 951,
"eta",
"greek small letter eta, U+03B7 ISOgrk3" },
1940{ 952,
"theta",
"greek small letter theta, U+03B8 ISOgrk3" },
1941{ 953,
"iota",
"greek small letter iota, U+03B9 ISOgrk3" },
1942{ 954,
"kappa",
"greek small letter kappa, U+03BA ISOgrk3" },
1943{ 955,
"lambda",
"greek small letter lambda, U+03BB ISOgrk3" },
1944{ 956,
"mu",
"greek small letter mu, U+03BC ISOgrk3" },
1945{ 957,
"nu",
"greek small letter nu, U+03BD ISOgrk3" },
1946{ 958,
"xi",
"greek small letter xi, U+03BE ISOgrk3" },
1947{ 959,
"omicron",
"greek small letter omicron, U+03BF NEW" },
1948{ 960,
"pi",
"greek small letter pi, U+03C0 ISOgrk3" },
1949{ 961,
"rho",
"greek small letter rho, U+03C1 ISOgrk3" },
1950{ 962,
"sigmaf",
"greek small letter final sigma, U+03C2 ISOgrk3" },
1951{ 963,
"sigma",
"greek small letter sigma, U+03C3 ISOgrk3" },
1952{ 964,
"tau",
"greek small letter tau, U+03C4 ISOgrk3" },
1953{ 965,
"upsilon",
"greek small letter upsilon, U+03C5 ISOgrk3" },
1954{ 966,
"phi",
"greek small letter phi, U+03C6 ISOgrk3" },
1955{ 967,
"chi",
"greek small letter chi, U+03C7 ISOgrk3" },
1956{ 968,
"psi",
"greek small letter psi, U+03C8 ISOgrk3" },
1957{ 969,
"omega",
"greek small letter omega, U+03C9 ISOgrk3" },
1958{ 977,
"thetasym",
"greek small letter theta symbol, U+03D1 NEW" },
1959{ 978,
"upsih",
"greek upsilon with hook symbol, U+03D2 NEW" },
1960{ 982,
"piv",
"greek pi symbol, U+03D6 ISOgrk3" },
1962{ 8194,
"ensp",
"en space, U+2002 ISOpub" },
1963{ 8195,
"emsp",
"em space, U+2003 ISOpub" },
1964{ 8201,
"thinsp",
"thin space, U+2009 ISOpub" },
1965{ 8204,
"zwnj",
"zero width non-joiner, U+200C NEW RFC 2070" },
1966{ 8205,
"zwj",
"zero width joiner, U+200D NEW RFC 2070" },
1967{ 8206,
"lrm",
"left-to-right mark, U+200E NEW RFC 2070" },
1968{ 8207,
"rlm",
"right-to-left mark, U+200F NEW RFC 2070" },
1969{ 8211,
"ndash",
"en dash, U+2013 ISOpub" },
1970{ 8212,
"mdash",
"em dash, U+2014 ISOpub" },
1971{ 8216,
"lsquo",
"left single quotation mark, U+2018 ISOnum" },
1972{ 8217,
"rsquo",
"right single quotation mark, U+2019 ISOnum" },
1973{ 8218,
"sbquo",
"single low-9 quotation mark, U+201A NEW" },
1974{ 8220,
"ldquo",
"left double quotation mark, U+201C ISOnum" },
1975{ 8221,
"rdquo",
"right double quotation mark, U+201D ISOnum" },
1976{ 8222,
"bdquo",
"double low-9 quotation mark, U+201E NEW" },
1977{ 8224,
"dagger",
"dagger, U+2020 ISOpub" },
1978{ 8225,
"Dagger",
"double dagger, U+2021 ISOpub" },
1980{ 8226,
"bull",
"bullet = black small circle, U+2022 ISOpub" },
1981{ 8230,
"hellip",
"horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1983{ 8240,
"permil",
"per mille sign, U+2030 ISOtech" },
1985{ 8242,
"prime",
"prime = minutes = feet, U+2032 ISOtech" },
1986{ 8243,
"Prime",
"double prime = seconds = inches, U+2033 ISOtech" },
1988{ 8249,
"lsaquo",
"single left-pointing angle quotation mark, U+2039 ISO proposed" },
1989{ 8250,
"rsaquo",
"single right-pointing angle quotation mark, U+203A ISO proposed" },
1991{ 8254,
"oline",
"overline = spacing overscore, U+203E NEW" },
1992{ 8260,
"frasl",
"fraction slash, U+2044 NEW" },
1994{ 8364,
"euro",
"euro sign, U+20AC NEW" },
1996{ 8465,
"image",
"blackletter capital I = imaginary part, U+2111 ISOamso" },
1997{ 8472,
"weierp",
"script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1998{ 8476,
"real",
"blackletter capital R = real part symbol, U+211C ISOamso" },
1999{ 8482,
"trade",
"trade mark sign, U+2122 ISOnum" },
2000{ 8501,
"alefsym",
"alef symbol = first transfinite cardinal, U+2135 NEW" },
2001{ 8592,
"larr",
"leftwards arrow, U+2190 ISOnum" },
2002{ 8593,
"uarr",
"upwards arrow, U+2191 ISOnum" },
2003{ 8594,
"rarr",
"rightwards arrow, U+2192 ISOnum" },
2004{ 8595,
"darr",
"downwards arrow, U+2193 ISOnum" },
2005{ 8596,
"harr",
"left right arrow, U+2194 ISOamsa" },
2006{ 8629,
"crarr",
"downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2007{ 8656,
"lArr",
"leftwards double arrow, U+21D0 ISOtech" },
2008{ 8657,
"uArr",
"upwards double arrow, U+21D1 ISOamsa" },
2009{ 8658,
"rArr",
"rightwards double arrow, U+21D2 ISOtech" },
2010{ 8659,
"dArr",
"downwards double arrow, U+21D3 ISOamsa" },
2011{ 8660,
"hArr",
"left right double arrow, U+21D4 ISOamsa" },
2013{ 8704,
"forall",
"for all, U+2200 ISOtech" },
2014{ 8706,
"part",
"partial differential, U+2202 ISOtech" },
2015{ 8707,
"exist",
"there exists, U+2203 ISOtech" },
2016{ 8709,
"empty",
"empty set = null set = diameter, U+2205 ISOamso" },
2017{ 8711,
"nabla",
"nabla = backward difference, U+2207 ISOtech" },
2018{ 8712,
"isin",
"element of, U+2208 ISOtech" },
2019{ 8713,
"notin",
"not an element of, U+2209 ISOtech" },
2020{ 8715,
"ni",
"contains as member, U+220B ISOtech" },
2021{ 8719,
"prod",
"n-ary product = product sign, U+220F ISOamsb" },
2022{ 8721,
"sum",
"n-ary summation, U+2211 ISOamsb" },
2023{ 8722,
"minus",
"minus sign, U+2212 ISOtech" },
2024{ 8727,
"lowast",
"asterisk operator, U+2217 ISOtech" },
2025{ 8730,
"radic",
"square root = radical sign, U+221A ISOtech" },
2026{ 8733,
"prop",
"proportional to, U+221D ISOtech" },
2027{ 8734,
"infin",
"infinity, U+221E ISOtech" },
2028{ 8736,
"ang",
"angle, U+2220 ISOamso" },
2029{ 8743,
"and",
"logical and = wedge, U+2227 ISOtech" },
2030{ 8744,
"or",
"logical or = vee, U+2228 ISOtech" },
2031{ 8745,
"cap",
"intersection = cap, U+2229 ISOtech" },
2032{ 8746,
"cup",
"union = cup, U+222A ISOtech" },
2033{ 8747,
"int",
"integral, U+222B ISOtech" },
2034{ 8756,
"there4",
"therefore, U+2234 ISOtech" },
2035{ 8764,
"sim",
"tilde operator = varies with = similar to, U+223C ISOtech" },
2036{ 8773,
"cong",
"approximately equal to, U+2245 ISOtech" },
2037{ 8776,
"asymp",
"almost equal to = asymptotic to, U+2248 ISOamsr" },
2038{ 8800,
"ne",
"not equal to, U+2260 ISOtech" },
2039{ 8801,
"equiv",
"identical to, U+2261 ISOtech" },
2040{ 8804,
"le",
"less-than or equal to, U+2264 ISOtech" },
2041{ 8805,
"ge",
"greater-than or equal to, U+2265 ISOtech" },
2042{ 8834,
"sub",
"subset of, U+2282 ISOtech" },
2043{ 8835,
"sup",
"superset of, U+2283 ISOtech" },
2044{ 8836,
"nsub",
"not a subset of, U+2284 ISOamsn" },
2045{ 8838,
"sube",
"subset of or equal to, U+2286 ISOtech" },
2046{ 8839,
"supe",
"superset of or equal to, U+2287 ISOtech" },
2047{ 8853,
"oplus",
"circled plus = direct sum, U+2295 ISOamsb" },
2048{ 8855,
"otimes",
"circled times = vector product, U+2297 ISOamsb" },
2049{ 8869,
"perp",
"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2050{ 8901,
"sdot",
"dot operator, U+22C5 ISOamsb" },
2051{ 8968,
"lceil",
"left ceiling = apl upstile, U+2308 ISOamsc" },
2052{ 8969,
"rceil",
"right ceiling, U+2309 ISOamsc" },
2053{ 8970,
"lfloor",
"left floor = apl downstile, U+230A ISOamsc" },
2054{ 8971,
"rfloor",
"right floor, U+230B ISOamsc" },
2055{ 9001,
"lang",
"left-pointing angle bracket = bra, U+2329 ISOtech" },
2056{ 9002,
"rang",
"right-pointing angle bracket = ket, U+232A ISOtech" },
2057{ 9674,
"loz",
"lozenge, U+25CA ISOpub" },
2059{ 9824,
"spades",
"black spade suit, U+2660 ISOpub" },
2060{ 9827,
"clubs",
"black club suit = shamrock, U+2663 ISOpub" },
2061{ 9829,
"hearts",
"black heart suit = valentine, U+2665 ISOpub" },
2062{ 9830,
"diams",
"black diamond suit, U+2666 ISOpub" },
2075#define growBuffer(buffer) { \
2077 buffer##_size *= 2; \
2078 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
2079 if (tmp == NULL) { \
2080 htmlErrMemory(ctxt, "growing buffer\n"); \
2097const htmlEntityDesc *
2101 for (
i = 0;
i < (
sizeof(html40EntitiesTable)/
2102 sizeof(html40EntitiesTable[0]));
i++) {
2104 return((htmlEntityDescPtr) &html40EntitiesTable[
i]);
2120const htmlEntityDesc *
2121htmlEntityValueLookup(
unsigned int value) {
2124 for (
i = 0;
i < (
sizeof(html40EntitiesTable)/
2125 sizeof(html40EntitiesTable[0]));
i++) {
2129 return((htmlEntityDescPtr) &html40EntitiesTable[
i]);
2151UTF8ToHtml(
unsigned char*
out,
int *outlen,
2152 const unsigned char*
in,
int *inlen) {
2154 const unsigned char* outend;
2155 const unsigned char* outstart =
out;
2156 const unsigned char* instart =
in;
2157 const unsigned char* inend;
2170 inend =
in + (*inlen);
2171 outend =
out + (*outlen);
2172 while (
in < inend) {
2174 if (
d < 0x80) {
c=
d; trailing= 0; }
2175 else if (
d < 0xC0) {
2177 *outlen =
out - outstart;
2180 }
else if (
d < 0xE0) {
c=
d & 0x1F; trailing= 1; }
2181 else if (
d < 0xF0) {
c=
d & 0x0F; trailing= 2; }
2182 else if (
d < 0xF8) {
c=
d & 0x07; trailing= 3; }
2185 *outlen =
out - outstart;
2190 if (inend -
in < trailing) {
2194 for ( ; trailing; trailing--) {
2195 if ((
in >= inend) || (((
d= *
in++) & 0xC0) != 0x80))
2203 if (
out + 1 >= outend)
2208 const htmlEntityDesc * ent;
2216 ent = htmlEntityValueLookup(
c);
2224 if (
out + 2 +
len >= outend)
2233 *outlen =
out - outstart;
2255htmlEncodeEntities(
unsigned char*
out,
int *outlen,
2256 const unsigned char*
in,
int *inlen,
int quoteChar) {
2258 const unsigned char* outend;
2259 const unsigned char* outstart =
out;
2260 const unsigned char* instart =
in;
2261 const unsigned char* inend;
2267 outend =
out + (*outlen);
2268 inend =
in + (*inlen);
2269 while (
in < inend) {
2271 if (
d < 0x80) {
c=
d; trailing= 0; }
2272 else if (
d < 0xC0) {
2274 *outlen =
out - outstart;
2277 }
else if (
d < 0xE0) {
c=
d & 0x1F; trailing= 1; }
2278 else if (
d < 0xF0) {
c=
d & 0x0F; trailing= 2; }
2279 else if (
d < 0xF8) {
c=
d & 0x07; trailing= 3; }
2282 *outlen =
out - outstart;
2287 if (inend -
in < trailing)
2290 while (trailing--) {
2291 if (((
d= *
in++) & 0xC0) != 0x80) {
2292 *outlen =
out - outstart;
2301 if ((
c < 0x80) && (
c != (
unsigned int) quoteChar) &&
2302 (
c !=
'&') && (
c !=
'<') && (
c !=
'>')) {
2307 const htmlEntityDesc * ent;
2315 ent = htmlEntityValueLookup(
c);
2323 if (
out + 2 +
len > outend)
2332 *outlen =
out - outstart;
2343#ifdef LIBXML_PUSH_ENABLED
2351static htmlParserInputPtr
2352htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2353 htmlParserInputPtr
input;
2357 htmlErrMemory(ctxt,
"couldn't allocate a new input stream\n");
2371 input->consumed = 0;
2389static const char *allowPCData[] = {
2390 "a",
"abbr",
"acronym",
"address",
"applet",
"b",
"bdo",
"big",
2391 "blockquote",
"body",
"button",
"caption",
"center",
"cite",
"code",
2392 "dd",
"del",
"dfn",
"div",
"dt",
"em",
"font",
"form",
"h1",
"h2",
2393 "h3",
"h4",
"h5",
"h6",
"i",
"iframe",
"ins",
"kbd",
"label",
"legend",
2394 "li",
"noframes",
"noscript",
"object",
"p",
"pre",
"q",
"s",
"samp",
2395 "small",
"span",
"strike",
"strong",
"td",
"th",
"tt",
"u",
"var"
2418 if (
CUR == 0)
return(1);
2419 if (
CUR !=
'<')
return(0);
2420 if (ctxt->name ==
NULL)
2437 if (ctxt->node ==
NULL)
return(0);
2440 lastChild = lastChild->
prev;
2441 if (lastChild ==
NULL) {
2443 (ctxt->node->content !=
NULL))
return(0);
2446 for (
i = 0;
i <
sizeof(allowPCData)/
sizeof(allowPCData[0]);
i++ ) {
2456 for (
i = 0;
i <
sizeof(allowPCData)/
sizeof(allowPCData[0]);
i++ ) {
2484 htmlErrMemory(
NULL,
"HTML document creation failed\n");
2498 cur->standalone = 1;
2499 cur->compression = 0;
2505 if ((ExternalID !=
NULL) ||
2524 if ((URI ==
NULL) && (ExternalID ==
NULL))
2525 return(htmlNewDocNoDtD(
2526 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2527 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2529 return(htmlNewDocNoDtD(URI, ExternalID));
2549htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2553 "Incorrectly opened comment\n",
NULL,
NULL);
2574htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2576 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2581 while ((
i < HTML_PARSER_BUFFER_SIZE) &&
2583 (
CUR ==
':') || (
CUR ==
'-') || (
CUR ==
'_') ||
2585 if ((
CUR >=
'A') && (
CUR <=
'Z')) loc[
i] =
CUR + 0x20;
2608htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2610 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2613 (
NXT(1) !=
':'))
return(
NULL);
2615 while ((
i < HTML_PARSER_BUFFER_SIZE) &&
2617 (
NXT(1+
i) ==
':') || (
NXT(1+
i) ==
'-') || (
NXT(1+
i) ==
'_'))) {
2618 if ((
NXT(1+
i) >=
'A') && (
NXT(1+
i) <=
'Z')) loc[
i] =
NXT(1+
i) + 0x20;
2619 else loc[
i] =
NXT(1+
i);
2637htmlParseName(htmlParserCtxtPtr ctxt) {
2647 in = ctxt->input->cur;
2648 if (((*
in >= 0x61) && (*
in <= 0x7A)) ||
2649 ((*
in >= 0x41) && (*
in <= 0x5A)) ||
2650 (*
in ==
'_') || (*
in ==
':')) {
2652 while (((*
in >= 0x61) && (*
in <= 0x7A)) ||
2653 ((*
in >= 0x41) && (*
in <= 0x5A)) ||
2654 ((*
in >= 0x30) && (*
in <= 0x39)) ||
2655 (*
in ==
'_') || (*
in ==
'-') ||
2656 (*
in ==
':') || (*
in ==
'.'))
2659 if (
in == ctxt->input->end)
2662 if ((*
in > 0) && (*
in < 0x80)) {
2663 count =
in - ctxt->input->cur;
2665 ctxt->input->cur =
in;
2666 ctxt->input->col +=
count;
2670 return(htmlParseNameComplex(ctxt));
2685 if ((
c ==
' ') || (
c ==
'>') || (
c ==
'/') ||
2691 while ((
c !=
' ') && (
c !=
'>') && (
c !=
'/') &&
2693 (
c ==
'.') || (
c ==
'-') ||
2694 (
c ==
'_') || (
c ==
':') ||
2697 if (
count++ > 100) {
2709 return(htmlParseNameComplex(ctxt));
2716 "unexpected change of input buffer",
NULL,
NULL);
2736htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,
const xmlChar stop) {
2742 const htmlEntityDesc * ent;
2750 htmlErrMemory(ctxt,
"buffer allocation failed\n");
2758 while ((
CUR != 0) && (
CUR != stop)) {
2759 if ((stop == 0) && (
CUR ==
'>'))
break;
2762 if (
NXT(1) ==
'#') {
2766 c = htmlParseCharRef(ctxt);
2770 { *
out++ =((
c >> 6) & 0x1F) | 0xC0;
bits= 0; }
2771 else if (
c < 0x10000)
2772 { *
out++ =((
c >> 12) & 0x0F) | 0xE0;
bits= 6; }
2774 { *
out++ =((
c >> 18) & 0x07) | 0xF0;
bits= 12; }
2777 *
out++ = ((
c >>
bits) & 0x3F) | 0x80;
2787 ent = htmlParseEntityRef(ctxt, &
name);
2796 }
else if (ent ==
NULL) {
2822 { *
out++ =((
c >> 6) & 0x1F) | 0xC0;
bits= 0; }
2823 else if (
c < 0x10000)
2824 { *
out++ =((
c >> 12) & 0x0F) | 0xE0;
bits= 6; }
2826 { *
out++ =((
c >> 18) & 0x07) | 0xF0;
bits= 12; }
2829 *
out++ = ((
c >>
bits) & 0x3F) | 0x80;
2847 { *
out++ =((
c >> 6) & 0x1F) | 0xC0;
bits= 0; }
2848 else if (
c < 0x10000)
2849 { *
out++ =((
c >> 12) & 0x0F) | 0xE0;
bits= 6; }
2851 { *
out++ =((
c >> 18) & 0x07) | 0xF0;
bits= 12; }
2854 *
out++ = ((
c >>
bits) & 0x3F) | 0x80;
2875const htmlEntityDesc *
2876htmlParseEntityRef(htmlParserCtxtPtr ctxt,
const xmlChar **
str) {
2878 const htmlEntityDesc * ent =
NULL;
2881 if ((ctxt ==
NULL) || (ctxt->input ==
NULL))
return(
NULL);
2885 name = htmlParseName(ctxt);
2888 "htmlParseEntityRef: no name\n",
NULL,
NULL);
2898 ent = htmlEntityLookup(
name);
2903 "htmlParseEntityRef: expecting ';'\n",
2926htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2931 ret = htmlParseHTMLAttribute(ctxt,
'"');
2934 "AttValue: \" expected\n",
NULL,
NULL);
2937 }
else if (
CUR ==
'\'') {
2939 ret = htmlParseHTMLAttribute(ctxt,
'\'');
2942 "AttValue: ' expected\n",
NULL,
NULL);
2949 ret = htmlParseHTMLAttribute(ctxt, 0);
2952 "AttValue: no value found\n",
NULL,
NULL);
2970htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2971 size_t len = 0, startPosition = 0;
2976 if ((
CUR !=
'"') && (
CUR !=
'\'')) {
2978 "SystemLiteral \" or ' expected\n",
NULL,
NULL);
2992 "Invalid char in SystemLiteral 0x%X\n",
CUR);
3000 "Unfinished SystemLiteral\n",
NULL,
NULL);
3022htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3023 size_t len = 0, startPosition = 0;
3028 if ((
CUR !=
'"') && (
CUR !=
'\'')) {
3030 "PubidLiteral \" or ' expected\n",
NULL,
NULL);
3046 "Invalid char in PubidLiteral 0x%X\n",
CUR);
3055 "Unfinished PubidLiteral\n",
NULL,
NULL);
3087htmlParseScript(htmlParserCtxtPtr ctxt) {
3088 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3095 if ((
cur ==
'<') && (
NXT(1) ==
'/')) {
3107 if (ctxt->recovery) {
3114 "Element %s embeds close tag\n",
3118 if (((
NXT(2) >=
'A') && (
NXT(2) <=
'Z')) ||
3119 ((
NXT(2) >=
'a') && (
NXT(2) <=
'z')))
3129 "Invalid char in CDATA 0x%X\n",
cur);
3131 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3133 if (ctxt->sax->cdataBlock!=
NULL) {
3137 ctxt->sax->cdataBlock(ctxt->userData,
buf, nbchar);
3138 }
else if (ctxt->sax->characters !=
NULL) {
3139 ctxt->sax->characters(ctxt->userData,
buf, nbchar);
3148 if ((nbchar != 0) && (ctxt->sax !=
NULL) && (!ctxt->disableSAX)) {
3150 if (ctxt->sax->cdataBlock!=
NULL) {
3154 ctxt->sax->cdataBlock(ctxt->userData,
buf, nbchar);
3155 }
else if (ctxt->sax->characters !=
NULL) {
3156 ctxt->sax->characters(ctxt->userData,
buf, nbchar);
3174htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,
int readahead) {
3175 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3181 buf[nbchar++] = readahead;
3185 while (((
cur !=
'<') || (ctxt->token ==
'<')) &&
3186 ((
cur !=
'&') || (ctxt->token ==
'&')) &&
3190 "Invalid char in CDATA 0x%X\n",
cur);
3194 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3200 if ((ctxt->sax !=
NULL) && (!ctxt->disableSAX)) {
3202 if (ctxt->keepBlanks) {
3203 if (ctxt->sax->characters !=
NULL)
3204 ctxt->sax->characters(ctxt->userData,
buf, nbchar);
3206 if (ctxt->sax->ignorableWhitespace !=
NULL)
3207 ctxt->sax->ignorableWhitespace(ctxt->userData,
3211 htmlCheckParagraph(ctxt);
3212 if (ctxt->sax->characters !=
NULL)
3213 ctxt->sax->characters(ctxt->userData,
buf, nbchar);
3220 if (
chunk > HTML_PARSER_BUFFER_SIZE) {
3238 if ((ctxt->sax !=
NULL) && (!ctxt->disableSAX)) {
3240 if (ctxt->keepBlanks) {
3241 if (ctxt->sax->characters !=
NULL)
3242 ctxt->sax->characters(ctxt->userData,
buf, nbchar);
3244 if (ctxt->sax->ignorableWhitespace !=
NULL)
3245 ctxt->sax->ignorableWhitespace(ctxt->userData,
3249 htmlCheckParagraph(ctxt);
3250 if (ctxt->sax->characters !=
NULL)
3251 ctxt->sax->characters(ctxt->userData,
buf, nbchar);
3274htmlParseCharData(htmlParserCtxtPtr ctxt) {
3275 htmlParseCharDataInternal(ctxt, 0);
3296htmlParseExternalID(htmlParserCtxtPtr ctxt,
xmlChar **publicID) {
3299 if ((UPPER ==
'S') && (UPP(1) ==
'Y') &&
3300 (UPP(2) ==
'S') && (UPP(3) ==
'T') &&
3301 (UPP(4) ==
'E') && (UPP(5) ==
'M')) {
3305 "Space required after 'SYSTEM'\n",
NULL,
NULL);
3308 URI = htmlParseSystemLiteral(ctxt);
3311 "htmlParseExternalID: SYSTEM, no URI\n",
NULL,
NULL);
3313 }
else if ((UPPER ==
'P') && (UPP(1) ==
'U') &&
3314 (UPP(2) ==
'B') && (UPP(3) ==
'L') &&
3315 (UPP(4) ==
'I') && (UPP(5) ==
'C')) {
3319 "Space required after 'PUBLIC'\n",
NULL,
NULL);
3322 *publicID = htmlParsePubidLiteral(ctxt);
3323 if (*publicID ==
NULL) {
3325 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3329 if ((
CUR ==
'"') || (
CUR ==
'\'')) {
3330 URI = htmlParseSystemLiteral(ctxt);
3345htmlParsePI(htmlParserCtxtPtr ctxt) {
3348 int size = HTML_PARSER_BUFFER_SIZE;
3354 if ((
RAW ==
'<') && (
NXT(1) ==
'?')) {
3355 state = ctxt->instate;
3367 target = htmlParseName(ctxt);
3375 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3376 (ctxt->sax->processingInstruction !=
NULL))
3377 ctxt->sax->processingInstruction(ctxt->userData,
3379 ctxt->instate =
state;
3384 htmlErrMemory(ctxt,
NULL);
3385 ctxt->instate =
state;
3391 "ParsePI: PI %s space expected\n",
target,
NULL);
3395 while ((
cur != 0) && (
cur !=
'>')) {
3402 htmlErrMemory(ctxt,
NULL);
3404 ctxt->instate =
state;
3418 "Invalid char in processing instruction "
3432 "ParsePI: PI %s never end ...\n",
target,
NULL);
3439 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3440 (ctxt->sax->processingInstruction !=
NULL))
3441 ctxt->sax->processingInstruction(ctxt->userData,
3447 "PI is not started correctly",
NULL,
NULL);
3449 ctxt->instate =
state;
3462htmlParseComment(htmlParserCtxtPtr ctxt) {
3465 int size = HTML_PARSER_BUFFER_SIZE;
3475 if ((
RAW !=
'<') || (
NXT(1) !=
'!') ||
3476 (
NXT(2) !=
'-') || (
NXT(3) !=
'-'))
return;
3478 state = ctxt->instate;
3484 htmlErrMemory(ctxt,
"buffer allocation failed\n");
3485 ctxt->instate =
state;
3502 if (
q ==
'-' &&
r ==
'>') {
3509 while ((
cur != 0) &&
3511 (
r !=
'-') || (
q !=
'-'))) {
3520 if ((
q ==
'-') && (
r ==
'-') && (
cur ==
'!') && (
next ==
'>')) {
3522 "Comment incorrectly closed by '--!>'",
NULL,
NULL);
3534 htmlErrMemory(ctxt,
"growing buffer failed\n");
3535 ctxt->instate =
state;
3544 "Invalid char in comment 0x%X\n",
q);
3558 if ((ctxt->sax !=
NULL) && (ctxt->sax->comment !=
NULL) &&
3559 (!ctxt->disableSAX))
3560 ctxt->sax->comment(ctxt->userData,
buf);
3562 ctxt->instate =
state;
3568 "Comment not terminated \n<!--%.50s\n",
buf,
NULL);
3584htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3587 if ((ctxt ==
NULL) || (ctxt->input ==
NULL)) {
3589 "htmlParseCharRef: context error\n",
3593 if ((
CUR ==
'&') && (
NXT(1) ==
'#') &&
3594 ((
NXT(2) ==
'x') ||
NXT(2) ==
'X')) {
3596 while (
CUR !=
';') {
3597 if ((
CUR >=
'0') && (
CUR <=
'9')) {
3600 }
else if ((
CUR >=
'a') && (
CUR <=
'f')) {
3603 }
else if ((
CUR >=
'A') && (
CUR <=
'F')) {
3608 "htmlParseCharRef: missing semicolon\n",
3616 }
else if ((
CUR ==
'&') && (
NXT(1) ==
'#')) {
3618 while (
CUR !=
';') {
3619 if ((
CUR >=
'0') && (
CUR <=
'9')) {
3624 "htmlParseCharRef: missing semicolon\n",
3634 "htmlParseCharRef: invalid value\n",
NULL,
NULL);
3641 }
else if (
val >= 0x110000) {
3643 "htmlParseCharRef: value too large\n",
NULL,
NULL);
3646 "htmlParseCharRef: invalid xmlChar value %d\n",
3664htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3679 name = htmlParseName(ctxt);
3682 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3694 URI = htmlParseExternalID(ctxt, &ExternalID);
3702 "DOCTYPE improperly terminated\n",
NULL,
NULL);
3704 while ((
CUR != 0) && (
CUR !=
'>'))
3713 if ((ctxt->sax !=
NULL) && (ctxt->sax->internalSubset !=
NULL) &&
3714 (!ctxt->disableSAX))
3715 ctxt->sax->internalSubset(ctxt->userData,
name, ExternalID, URI);
3746htmlParseAttribute(htmlParserCtxtPtr ctxt,
xmlChar **
value) {
3751 name = htmlParseHTMLName(ctxt);
3754 "error parsing attribute name\n",
NULL,
NULL);
3765 val = htmlParseAttValue(ctxt);
/*
 * htmlCheckEncodingDirect: act on an explicitly given encoding name for the
 * current input.
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
3783htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,
const xmlChar *
encoding) {
/* Part of a condition that tests the HTML_PARSE_IGNORE_ENC option. */
3786 (ctxt->options & HTML_PARSE_IGNORE_ENC))
/* An encoding already recorded on the input is checked for twice
 * (the actions taken are not shown in this listing). */
3790 if (ctxt->input->encoding !=
NULL)
3799 if (ctxt->input->encoding !=
NULL)
/* Case: an input buffer exists but has no encoder attached. */
3812 (ctxt->input->buf !=
NULL) &&
3813 (ctxt->input->buf->encoder ==
NULL)) {
3815 "htmlCheckEncoding: wrong encoding meta\n",
3831 "htmlCheckEncoding: unknown encoding %s\n",
/* Case: buffer, encoder, raw buffer and decoded buffer are all present. */
3836 if ((ctxt->input->buf !=
NULL) &&
3837 (ctxt->input->buf->encoder !=
NULL) &&
3838 (ctxt->input->buf->raw !=
NULL) &&
3839 (ctxt->input->buf->buffer !=
NULL)) {
/* Offset of the current position from the input base. */
3846 processed = ctxt->input->cur - ctxt->input->base;
3852 "htmlCheckEncoding: encoder error\n",
/*
 * htmlCheckEncoding: takes an attribute value and forwards an encoding to
 * htmlCheckEncodingDirect.
 * NOTE: how `encoding` is derived from `attvalue` is not shown in this
 * listing.
 */
3870htmlCheckEncoding(htmlParserCtxtPtr ctxt,
const xmlChar *attvalue) {
3887 htmlCheckEncodingDirect(ctxt,
encoding);
/*
 * htmlCheckMeta: walk an attribute array and pass encoding information on
 * to the encoding checkers.
 * NOTE: only part of the function body appears in this listing; the
 * conditions selecting each call are not visible here.
 */
3899htmlCheckMeta(htmlParserCtxtPtr ctxt,
const xmlChar **
atts) {
/* Iterate until a NULL attribute entry is reached. */
3910 while (att !=
NULL) {
/* A directly usable encoding name goes to htmlCheckEncodingDirect. */
3916 htmlCheckEncodingDirect(ctxt,
value);
/* A `content` value goes through htmlCheckEncoding. */
3922 htmlCheckEncoding(ctxt,
content);
/*
 * htmlParseStartTag: parse a start tag, collect its attributes and emit the
 * SAX startElement event.
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
3947htmlParseStartTag(htmlParserCtxtPtr ctxt) {
/* A missing context or a context without input is a context error. */
3958 if ((ctxt ==
NULL) || (ctxt->input ==
NULL)) {
3960 "htmlParseStartTag: context error\n",
NULL,
NULL);
/* Not positioned on '<': return -1. */
3965 if (
CUR !=
'<')
return -1;
/* Work on a local copy of the attribute array capacity. */
3969 maxatts = ctxt->maxatts;
3972 name = htmlParseHTMLName(ctxt);
3975 "htmlParseStartTag: invalid element name\n",
/* Loop bounded by end of input and '>' (further conditions and the loop
 * body are not shown in this listing). */
3978 while ((
CUR != 0) && (
CUR !=
'>') &&
3989 htmlAutoClose(ctxt,
name);
3994 htmlCheckImplied(ctxt,
name);
/* Diagnostics for html, head and body tags found in the wrong place. */
4002 "htmlParseStartTag: misplaced <html> tag\n",
4007 if ((ctxt->nameNr != 1) &&
4010 "htmlParseStartTag: misplaced <head> tag\n",
/* Scan the stack of open element names. */
4017 for (indx = 0;indx < ctxt->nameNr;indx++) {
4020 "htmlParseStartTag: misplaced <body> tag\n",
/* Attribute loop: bounded by end of input and the "/>" sequence
 * (a further condition on the missing line is not shown). */
4034 while ((
CUR != 0) &&
4036 ((
CUR !=
'/') || (
NXT(1) !=
'>'))) {
4038 attname = htmlParseAttribute(ctxt, &attvalue);
4039 if (attname !=
NULL) {
/* Attribute names sit at even indices of atts; step by 2 to visit them. */
4044 for (
i = 0;
i < nbatts;
i += 2) {
4047 "Attribute %s redefined\n", attname,
NULL);
4048 if (attvalue !=
NULL)
/* Allocation failure path for the attribute array. */
4062 htmlErrMemory(ctxt,
NULL);
4063 if (attvalue !=
NULL)
4068 ctxt->maxatts = maxatts;
/* Capacity check before appending another name/value pair. */
4069 }
else if (nbatts + 4 > maxatts) {
4074 maxatts *
sizeof(
const xmlChar *));
4076 htmlErrMemory(ctxt,
NULL);
4077 if (attvalue !=
NULL)
4083 ctxt->maxatts = maxatts;
/* Append the pair: name first, value second. */
4085 atts[nbatts++] = attname;
4086 atts[nbatts++] = attvalue;
4091 if (attvalue !=
NULL)
/* Second loop with the same end-of-input and "/>" bounds (body not shown). */
4095 while ((
CUR != 0) &&
4097 ((
CUR !=
'/') || (
NXT(1) !=
'>')))
/* When the `meta` flag is set and attributes exist, inspect them. */
4108 if (meta && (nbatts != 0))
4109 htmlCheckMeta(ctxt,
atts);
/* Record the element as open, then notify the SAX handler, either with
 * the attribute array or with NULL (selecting condition not shown). */
4115 htmlnamePush(ctxt,
name);
4116 if ((ctxt->sax !=
NULL) && (ctxt->sax->startElement !=
NULL)) {
4118 ctxt->sax->startElement(ctxt->userData,
name,
atts);
4120 ctxt->sax->startElement(ctxt->userData,
name,
NULL);
/* Attribute values sit at odd indices of atts; step by 2 to visit them. */
4125 for (
i = 1;
i < nbatts;
i += 2) {
/*
 * htmlParseEndTag: parse an end tag ("</name>") and emit the SAX endElement
 * event.
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
4150htmlParseEndTag(htmlParserCtxtPtr ctxt)
/* The input must be positioned on "</". */
4156 if ((
CUR !=
'<') || (
NXT(1) !=
'/')) {
4158 "htmlParseEndTag: '</' not found\n",
NULL,
NULL);
4163 name = htmlParseHTMLName(ctxt);
4172 "End tag : expected '>'\n",
NULL,
NULL);
/* Loop bounded by end of input and '>' (loop body not shown). */
4174 while ((
CUR != 0) && (
CUR !=
'>'))
4184 if ((ctxt->depth > 0) &&
/* Walk the stack of open element names from the innermost outwards. */
4196 for (
i = (ctxt->nameNr - 1);
i >= 0;
i--) {
4202 "Unexpected end tag : %s\n",
name,
NULL);
4211 htmlAutoCloseOnClose(ctxt,
name);
4220 "Opening and ending tag mismatch: %s and %s\n",
/* Remember the current element name before closing it. */
4227 oldname = ctxt->name;
4229 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
4230 ctxt->sax->endElement(ctxt->userData,
name);
4231 htmlNodeInfoPop(ctxt);
/*
 * htmlParseReference: handle a reference starting with '&' in content,
 * either a numeric character reference or a named entity, and deliver the
 * result through the SAX characters callback.
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
4251htmlParseReference(htmlParserCtxtPtr ctxt) {
4252 const htmlEntityDesc * ent;
/* Nothing to do unless positioned on '&'. */
4255 if (
CUR !=
'&')
return;
/* "&#" introduces a numeric character reference. */
4257 if (
NXT(1) ==
'#') {
4261 c = htmlParseCharRef(ctxt);
/* Write the lead byte of the multi-byte encoding of c into out[] and set
 * `bits` to the shift for the remaining bytes: 0xC0 form below 0x800,
 * 0xE0 form below 0x10000, 0xF0 form otherwise. */
4266 else if (
c < 0x800) {
out[
i++]=((
c >> 6) & 0x1F) | 0xC0;
bits= 0; }
4267 else if (
c < 0x10000) {
out[
i++]=((
c >> 12) & 0x0F) | 0xE0;
bits= 6; }
4268 else {
out[
i++]=((
c >> 18) & 0x07) | 0xF0;
bits= 12; }
/* Deliver the i encoded bytes as character data. */
4275 htmlCheckParagraph(ctxt);
4276 if ((ctxt->sax !=
NULL) && (ctxt->sax->characters !=
NULL))
4277 ctxt->sax->characters(ctxt->userData,
out,
i);
/* Named entity: look it up; `name` receives the parsed name. */
4279 ent = htmlParseEntityRef(ctxt, &
name);
/* Emit a literal "&" as character data (selecting condition not shown). */
4281 htmlCheckParagraph(ctxt);
4282 if ((ctxt->sax !=
NULL) && (ctxt->sax->characters !=
NULL))
4283 ctxt->sax->characters(ctxt->userData,
BAD_CAST "&", 1);
/* No entity descriptor, or one without a positive value: emit "&" as
 * character data. */
4286 if ((ent ==
NULL) || !(ent->value > 0)) {
4287 htmlCheckParagraph(ctxt);
4288 if ((ctxt->sax !=
NULL) && (ctxt->sax->characters !=
NULL)) {
4289 ctxt->sax->characters(ctxt->userData,
BAD_CAST "&", 1);
/* Same lead-byte encoding as in the numeric branch above. */
4301 {
out[
i++]=((
c >> 6) & 0x1F) | 0xC0;
bits= 0; }
4302 else if (
c < 0x10000)
4303 {
out[
i++]=((
c >> 12) & 0x0F) | 0xE0;
bits= 6; }
4305 {
out[
i++]=((
c >> 18) & 0x07) | 0xF0;
bits= 12; }
/* Deliver the i encoded bytes as character data. */
4312 htmlCheckParagraph(ctxt);
4313 if ((ctxt->sax !=
NULL) && (ctxt->sax->characters !=
NULL))
4314 ctxt->sax->characters(ctxt->userData,
out,
i);
/*
 * htmlParseContent: dispatch on the current input characters to parse
 * element content (end tags, start tags, declarations, comments,
 * references, character data).
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
4328htmlParseContent(htmlParserCtxtPtr ctxt) {
/* Remember how many elements were open on entry. */
4334 depth = ctxt->nameNr;
/* "</": an end tag. */
4344 if ((
CUR ==
'<') && (
NXT(1) ==
'/')) {
4345 if (htmlParseEndTag(ctxt) &&
4346 ((currentNode !=
NULL) || (ctxt->nameNr == 0))) {
4347 if (currentNode !=
NULL)
/* '<' followed by a name-start character (the visible part of the test
 * covers '_' and ':'): peek at the element name without consuming it. */
4354 else if ((
CUR ==
'<') &&
4356 (
NXT(1) ==
'_') || (
NXT(1) ==
':'))) {
4357 name = htmlParseHTMLName_nonInvasive(ctxt);
4360 "htmlParseStartTag: invalid element name\n",
/* Loop bounded by end of input and '>' (loop body not shown). */
4363 while ((
CUR != 0) && (
CUR !=
'>'))
4366 if (currentNode !=
NULL)
/* If the upcoming element auto-closes the current one, close it now. */
4371 if (ctxt->name !=
NULL) {
4372 if (htmlCheckAutoClose(
name, ctxt->
name) == 1) {
4373 htmlAutoClose(ctxt,
name);
4383 if ((ctxt->nameNr > 0) && (
depth >= ctxt->nameNr) &&
4395 htmlParseScript(ctxt);
/* "<!": DOCTYPE (matched case-insensitively via UPP), comment "<!--",
 * or otherwise a bogus comment. */
4398 else if ((
CUR ==
'<') && (
NXT(1) ==
'!')) {
4402 if ((UPP(2) ==
'D') && (UPP(3) ==
'O') &&
4403 (UPP(4) ==
'C') && (UPP(5) ==
'T') &&
4404 (UPP(6) ==
'Y') && (UPP(7) ==
'P') &&
4407 "Misplaced DOCTYPE declaration\n",
4409 htmlParseDocTypeDecl(ctxt);
4414 else if ((
NXT(2) ==
'-') && (
NXT(3) ==
'-')) {
4415 htmlParseComment(ctxt);
4418 htmlSkipBogusComment(ctxt);
/* "<?" branch (its handling is not shown in this listing). */
4425 else if ((
CUR ==
'<') && (
NXT(1) ==
'?')) {
4433 htmlParseElement(ctxt);
/* A lone '<' is reported as literal character data. */
4435 else if (
CUR ==
'<') {
4436 if ((ctxt->sax !=
NULL) && (!ctxt->disableSAX) &&
4437 (ctxt->sax->characters !=
NULL))
4438 ctxt->sax->characters(ctxt->userData,
BAD_CAST "<", 1);
/* '&': a character or entity reference. */
4446 else if (
CUR ==
'&') {
4447 htmlParseReference(ctxt);
/* End of input: auto-close what is still open. */
4453 else if (
CUR == 0) {
4454 htmlAutoCloseOnEnd(ctxt);
/* Anything else is character data. */
4462 htmlParseCharData(ctxt);
/*
 * htmlParseElement: parse one element: start tag, content, and the
 * bookkeeping of node position information.
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
4482htmlParseElement(htmlParserCtxtPtr ctxt) {
4485 const htmlElemDesc *
info;
4486 htmlParserNodeInfo node_info;
/* A missing context or a context without input is a context error. */
4491 if ((ctxt ==
NULL) || (ctxt->input ==
NULL)) {
4493 "htmlParseElement: context error\n",
NULL,
NULL);
/* When position recording is on, capture where the element begins. */
4501 if (ctxt->record_info) {
4502 node_info.begin_pos = ctxt->input->consumed +
4503 (
CUR_PTR - ctxt->input->base);
4504 node_info.begin_line = ctxt->input->line;
4507 failed = htmlParseStartTag(ctxt);
/* Start tag parsing failed or produced no element name. */
4509 if ((failed == -1) || (
name ==
NULL)) {
/* "/>" after the start tag: the element is closed immediately. */
4527 if ((
CUR ==
'/') && (
NXT(1) ==
'>')) {
4529 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
4530 ctxt->sax->endElement(ctxt->userData,
name);
4539 "Couldn't find end of Start Tag %s\n",
name,
NULL);
/* When position recording is on, capture where the element ends. */
4552 if (ctxt->record_info) {
4553 node_info.end_pos = ctxt->input->consumed +
4554 (
CUR_PTR - ctxt->input->base);
4555 node_info.end_line = ctxt->input->line;
4556 node_info.node = ctxt->node;
4566 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
4567 ctxt->sax->endElement(ctxt->userData,
name);
/* Parse content repeatedly; stop when no input was consumed or when the
 * open-element depth dropped below its value before the loop. */
4576 depth = ctxt->nameNr;
4578 oldptr = ctxt->input->cur;
4579 htmlParseContent(ctxt);
4580 if (oldptr==ctxt->input->cur)
break;
4581 if (ctxt->nameNr <
depth)
break;
/* Capture the end position once content has been parsed. */
4587 if ( currentNode !=
NULL && ctxt->record_info ) {
4588 node_info.end_pos = ctxt->input->consumed +
4589 (
CUR_PTR - ctxt->input->base);
4590 node_info.end_line = ctxt->input->line;
4591 node_info.node = ctxt->node;
4595 htmlAutoCloseOnEnd(ctxt);
4598 if (currentNode !=
NULL)
/*
 * htmlParserFinishElementParsing: complete the position record of the
 * current node and pop it from the node-info stack.
 * NOTE: only part of the function body appears in this listing; the
 * conditions guarding the last two calls are not visible here.
 */
4603htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
/* Only when a current node exists and position recording is on. */
4607 if ( ctxt->node !=
NULL && ctxt->record_info ) {
/* End position = bytes already consumed + offset within the input. */
4608 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4609 (
CUR_PTR - ctxt->input->base);
4610 ctxt->nodeInfo->end_line = ctxt->input->line;
4611 ctxt->nodeInfo->node = ctxt->node;
4613 htmlNodeInfoPop(ctxt);
4616 htmlAutoCloseOnEnd(ctxt);
/*
 * htmlParseElementInternal: parse the start of one element and push its
 * position record; content parsing is not invoked in the visible lines.
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
4632htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4634 const htmlElemDesc *
info;
/* Zero-initialised so it can be pushed even if only partly filled in. */
4635 htmlParserNodeInfo node_info = {
NULL, 0, 0, 0, 0 };
/* A missing context or a context without input is a context error. */
4638 if ((ctxt ==
NULL) || (ctxt->input ==
NULL)) {
4640 "htmlParseElementInternal: context error\n",
NULL,
NULL);
/* When position recording is on, capture where the element begins. */
4648 if (ctxt->record_info) {
4649 node_info.begin_pos = ctxt->input->consumed +
4650 (
CUR_PTR - ctxt->input->base);
4651 node_info.begin_line = ctxt->input->line;
4654 failed = htmlParseStartTag(ctxt);
/* Start tag parsing failed or produced no element name. */
4656 if ((failed == -1) || (
name ==
NULL)) {
/* "/>" after the start tag: the element is closed immediately. */
4674 if ((
CUR ==
'/') && (
NXT(1) ==
'>')) {
4676 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
4677 ctxt->sax->endElement(ctxt->userData,
name);
4686 "Couldn't find end of Start Tag %s\n",
name,
NULL);
/* Push the record and finish the element straight away. */
4696 if (ctxt->record_info)
4697 htmlNodeInfoPush(ctxt, &node_info);
4698 htmlParserFinishElementParsing(ctxt);
4706 if ((ctxt->sax !=
NULL) && (ctxt->sax->endElement !=
NULL))
4707 ctxt->sax->endElement(ctxt->userData,
name);
/* Push the record; this path does not finish the element in the
 * visible lines. */
4712 if (ctxt->record_info)
4713 htmlNodeInfoPush(ctxt, &node_info);
/*
 * htmlParseContentInternal: dispatch on the current input characters to
 * parse content (end tags, start tags, declarations, comments, references,
 * character data), re-reading the open-element depth after each construct
 * that can change it.
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
4725htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
/* Remember how many elements are currently open. */
4731 depth = ctxt->nameNr;
/* "</": an end tag. */
4741 if ((
CUR ==
'<') && (
NXT(1) ==
'/')) {
4742 if (htmlParseEndTag(ctxt) &&
4743 ((currentNode !=
NULL) || (ctxt->nameNr == 0))) {
4744 if (currentNode !=
NULL)
4748 depth = ctxt->nameNr;
/* '<' followed by a name-start character (the visible part of the test
 * covers '_' and ':'): peek at the element name without consuming it. */
4753 else if ((
CUR ==
'<') &&
4755 (
NXT(1) ==
'_') || (
NXT(1) ==
':'))) {
4756 name = htmlParseHTMLName_nonInvasive(ctxt);
4759 "htmlParseStartTag: invalid element name\n",
/* Skip the invalid tag: loop while input remains and '>' has not been
 * reached. The first test must be "!= 0"; with "== 0" the two tests
 * could only both hold at end of input, so the tag was never skipped. */
4762 while ((
CUR != 0) && (
CUR !=
'>'))
4765 htmlParserFinishElementParsing(ctxt);
4766 if (currentNode !=
NULL)
4770 depth = ctxt->nameNr;
/* If the upcoming element auto-closes the current one, close it now. */
4774 if (ctxt->name !=
NULL) {
4775 if (htmlCheckAutoClose(
name, ctxt->
name) == 1) {
4776 htmlAutoClose(ctxt,
name);
4786 if ((ctxt->nameNr > 0) && (
depth >= ctxt->nameNr) &&
4789 htmlParserFinishElementParsing(ctxt);
4793 depth = ctxt->nameNr;
4802 htmlParseScript(ctxt);
/* "<!": DOCTYPE (matched case-insensitively via UPP), comment "<!--",
 * or otherwise a bogus comment. */
4805 else if ((
CUR ==
'<') && (
NXT(1) ==
'!')) {
4809 if ((UPP(2) ==
'D') && (UPP(3) ==
'O') &&
4810 (UPP(4) ==
'C') && (UPP(5) ==
'T') &&
4811 (UPP(6) ==
'Y') && (UPP(7) ==
'P') &&
4814 "Misplaced DOCTYPE declaration\n",
4816 htmlParseDocTypeDecl(ctxt);
4821 else if ((
NXT(2) ==
'-') && (
NXT(3) ==
'-')) {
4822 htmlParseComment(ctxt);
4825 htmlSkipBogusComment(ctxt);
/* "<?" branch (its handling is not shown in this listing). */
4832 else if ((
CUR ==
'<') && (
NXT(1) ==
'?')) {
4840 htmlParseElementInternal(ctxt);
4844 depth = ctxt->nameNr;
/* A lone '<' is reported as literal character data. */
4846 else if (
CUR ==
'<') {
4847 if ((ctxt->sax !=
NULL) && (!ctxt->disableSAX) &&
4848 (ctxt->sax->characters !=
NULL))
4849 ctxt->sax->characters(ctxt->userData,
BAD_CAST "<", 1);
/* '&': a character or entity reference. */
4857 else if (
CUR ==
'&') {
4858 htmlParseReference(ctxt);
/* End of input: auto-close what is still open. */
4864 else if (
CUR == 0) {
4865 htmlAutoCloseOnEnd(ctxt);
/* Anything else is character data. */
4873 htmlParseCharData(ctxt);
/*
 * __htmlParseContent: wrapper taking an untyped context pointer; casts it
 * to htmlParserCtxtPtr and runs htmlParseContentInternal on it.
 */
4889__htmlParseContent(
void *ctxt) {
4891 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
/*
 * htmlParseDocument: parse a whole HTML document: leading comments and
 * processing instructions, an optional DOCTYPE, the content, then the
 * end-of-document handling. Returns -1 when the document is not well formed.
 * NOTE: only part of the function body appears in this listing; the
 * comments below describe the visible lines only.
 */
4906htmlParseDocument(htmlParserCtxtPtr ctxt) {
4913 htmlDefaultSAXHandlerInit();
/* A missing context or a context without input is a context error. */
4915 if ((ctxt ==
NULL) || (ctxt->input ==
NULL)) {
4917 "htmlParseDocument: context error\n",
NULL,
NULL);
4921 ctxt->linenumbers = 1;
4926 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
/* Part of a condition requiring at least 4 bytes of available input. */
4930 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4952 "Document is empty\n",
NULL,
NULL);
/* Announce the start of the document unless SAX is disabled. */
4955 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4956 ctxt->sax->startDocument(ctxt->userData);
/* Consume leading "<!--" comments and "<?" constructs before the DOCTYPE. */
4962 while (((
CUR ==
'<') && (
NXT(1) ==
'!') &&
4963 (
NXT(2) ==
'-') && (
NXT(3) ==
'-')) ||
4964 ((
CUR ==
'<') && (
NXT(1) ==
'?'))) {
4965 htmlParseComment(ctxt);
/* "<!DOCTYP..." matched case-insensitively via UPP (the rest of the test
 * is not shown in this listing). */
4975 if ((
CUR ==
'<') && (
NXT(1) ==
'!') &&
4976 (UPP(2) ==
'D') && (UPP(3) ==
'O') &&
4977 (UPP(4) ==
'C') && (UPP(5) ==
'T') &&
4978 (UPP(6) ==
'Y') && (UPP(7) ==
'P') &&
4980 htmlParseDocTypeDecl(ctxt);
/* Consume comments and "<?" constructs again after the DOCTYPE. */
4987 while (((
CUR ==
'<') && (
NXT(1) ==
'!') &&
4988 (
NXT(2) ==
'-') && (
NXT(3) ==
'-')) ||
4989 ((
CUR ==
'<') && (
NXT(1) ==
'?'))) {
4990 htmlParseComment(ctxt);
/* Parse the document content. */
4998 htmlParseContentInternal(ctxt);
5004 htmlAutoCloseOnEnd(ctxt);
/* Announce the end of the document. */
5010 if ((ctxt->sax) && (ctxt->sax->endDocument !=
NULL))
5011 ctxt->sax->endDocument(ctxt->userData);
/* Unless HTML_PARSE_NODEFDTD is set, and a document tree exists, an
 * internal subset with the HTML 4.0 Transitional identifiers is assigned
 * (the remaining conditions are not shown in this listing). */
5013 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc !=
NULL)) {
5016 ctxt->myDoc->intSubset =
5018 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5019 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
/* A document that is not well formed yields -1. */
5021 if (! ctxt->wellFormed)
return(-1);
5042htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5044 htmlSAXHandler *
sax;
5046 if (ctxt ==
NULL)
return(-1);
5047 memset(ctxt, 0,
sizeof(htmlParserCtxt));
5050 if (ctxt->dict ==
NULL) {
5051 htmlErrMemory(
NULL,
"htmlInitParserCtxt: out of memory\n");