This module parses an HTML document and creates its XML tree representation. It is supposed to handle the wild HTML the real world uses.
It can be used to parse a wild HTML document and output it as a valid XHTML document (well, if you are lucky):
echo loadHtml("mydirty.html")
Every tag in the resulting tree is in lower case.
Note: The resulting PXmlNode already uses the clientData field, so that field cannot be used by clients of this library.
Types
THtmlTag = enum tagUnknown, ## unknown HTML element tagA, ## the HTML ``a`` element tagAbbr, ## the deprecated HTML ``abbr`` element tagAcronym, ## the HTML ``acronym`` element tagAddress, ## the HTML ``address`` element tagApplet, ## the deprecated HTML ``applet`` element tagArea, ## the HTML ``area`` element tagB, ## the HTML ``b`` element tagBase, ## the HTML ``base`` element tagBdo, ## the deprecated HTML ``bdo`` element tagBasefont, ## the deprecated HTML ``basefont`` element tagBig, ## the HTML ``big`` element tagBlockquote, ## the HTML ``blockquote`` element tagBody, ## the HTML ``body`` element tagBr, ## the HTML ``br`` element tagButton, ## the HTML ``button`` element tagCaption, ## the HTML ``caption`` element tagCenter, ## the deprecated HTML ``center`` element tagCite, ## the HTML ``cite`` element tagCode, ## the HTML ``code`` element tagCol, ## the HTML ``col`` element tagColgroup, ## the HTML ``colgroup`` element tagDd, ## the HTML ``dd`` element tagDel, ## the HTML ``del`` element tagDfn, ## the HTML ``dfn`` element tagDiv, ## the HTML ``div`` element tagDir, ## the deprecated HTML ``dir`` element tagDl, ## the HTML ``dl`` element tagDt, ## the HTML ``dt`` element tagEm, ## the HTML ``em`` element tagFieldset, ## the HTML ``fieldset`` element tagFont, ## the deprecated HTML ``font`` element tagForm, ## the HTML ``form`` element tagFrame, ## the HTML ``frame`` element tagFrameset, ## the deprecated HTML ``frameset`` element tagH1, ## the HTML ``h1`` element tagH2, ## the HTML ``h2`` element tagH3, ## the HTML ``h3`` element tagH4, ## the HTML ``h4`` element tagH5, ## the HTML ``h5`` element tagH6, ## the HTML ``h6`` element tagHead, ## the HTML ``head`` element tagHtml, ## the HTML ``html`` element tagHr, ## the HTML ``hr`` element tagI, ## the HTML ``i`` element tagIframe, ## the deprecated HTML ``iframe`` element tagImg, ## the HTML ``img`` element tagInput, ## the HTML ``input`` element tagIns, ## the HTML ``ins`` element tagIsindex, ## the 
deprecated HTML ``isindex`` element tagKbd, ## the HTML ``kbd`` element tagLabel, ## the HTML ``label`` element tagLegend, ## the HTML ``legend`` element tagLi, ## the HTML ``li`` element tagLink, ## the HTML ``link`` element tagMap, ## the HTML ``map`` element tagMenu, ## the deprecated HTML ``menu`` element tagMeta, ## the HTML ``meta`` element tagNobr, ## the deprecated HTML ``nobr`` element tagNoframes, ## the deprecated HTML ``noframes`` element tagNoscript, ## the HTML ``noscript`` element tagObject, ## the HTML ``object`` element tagOl, ## the HTML ``ol`` element tagOptgroup, ## the HTML ``optgroup`` element tagOption, ## the HTML ``option`` element tagP, ## the HTML ``p`` element tagParam, ## the HTML ``param`` element tagPre, ## the HTML ``pre`` element tagQ, ## the HTML ``q`` element tagS, ## the deprecated HTML ``s`` element tagSamp, ## the HTML ``samp`` element tagScript, ## the HTML ``script`` element tagSelect, ## the HTML ``select`` element tagSmall, ## the HTML ``small`` element tagSpan, ## the HTML ``span`` element tagStrike, ## the deprecated HTML ``strike`` element tagStrong, ## the HTML ``strong`` element tagStyle, ## the HTML ``style`` element tagSub, ## the HTML ``sub`` element tagSup, ## the HTML ``sup`` element tagTable, ## the HTML ``table`` element tagTbody, ## the HTML ``tbody`` element tagTd, ## the HTML ``td`` element tagTextarea, ## the HTML ``textarea`` element tagTfoot, ## the HTML ``tfoot`` element tagTh, ## the HTML ``th`` element tagThead, ## the HTML ``thead`` element tagTitle, ## the HTML ``title`` element tagTr, ## the HTML ``tr`` element tagTt, ## the HTML ``tt`` element tagU, ## the deprecated HTML ``u`` element tagUl, ## the HTML ``ul`` element tagVar ## the HTML ``var`` element
- list of all supported HTML tags; the order will always be alphabetical
Consts
tagToStr = ["a", "abbr", "acronym", "address", "applet", "area", "b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button", "caption", "center", "cite", "code", "col", "colgroup", "dd", "del", "dfn", "div", "dir", "dl", "dt", "em", "fieldset", "font", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "html", "hr", "i", "iframe", "img", "input", "ins", "isindex", "kbd", "label", "legend", "li", "link", "map", "menu", "meta", "nobr", "noframes", "noscript", "object", "ol", "optgroup", "option", "p", "param", "pre", "q", "s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt", "u", "ul", "var"]
InlineTags = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont, tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn, tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd, tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect, tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt, tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS, tagStrike}
BlockTags = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv, tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4, tagH5, tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes, tagNoscript, tagOl, tagP, tagPre, tagTable, tagUl, tagCenter, tagDir, tagIsindex, tagMenu, tagNoframes}
SingleTags = {tagArea, tagBase, tagBasefont, tagBr, tagCol, tagFrame, tagHr, tagImg, tagIsindex, tagLink, tagMeta, tagParam}
Procs
proc htmlTag(n: PXmlNode): THtmlTag {.raises: [], tags: [].}
- gets n's tag as a THtmlTag.
proc htmlTag(s: string): THtmlTag {.raises: [], tags: [].}
- converts s to a THtmlTag. If s is no HTML tag, tagUnknown is returned.
proc entityToUtf8(entity: string): string {.raises: [], tags: [].}
- converts an HTML entity name like ``&Uuml;`` to its UTF-8 equivalent. "" is returned if the entity name is unknown. The HTML parser already converts entities to UTF-8.
proc parseHtml(s: PStream; filename: string; errors: var seq[string]): PXmlNode {. raises: [E_Base, EInvalidValue], tags: [FReadIO, TEffect].}
- parses the HTML from stream s and returns a PXmlNode. Every parsing error that occurred is added to the errors sequence.
proc parseHtml(s: PStream): PXmlNode {.raises: [E_Base, EInvalidValue], tags: [FReadIO, TEffect].}
- parses the HTML from stream s and returns a PXmlNode. All parsing errors are ignored.
proc loadHtml(path: string; errors: var seq[string]): PXmlNode {. raises: [EOutOfMemory, EIO, E_Base, EInvalidValue], tags: [FReadIO, TEffect].}
- Loads and parses HTML from the file specified by path, and returns a PXmlNode. Every parsing error that occurred is added to the errors sequence.
proc loadHtml(path: string): PXmlNode {.raises: [EOutOfMemory, EIO, E_Base, EInvalidValue], tags: [FReadIO, TEffect].}
- Loads and parses HTML from file specified by path, and returns a PXmlNode. All parsing errors are ignored.