// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;

namespace HtmlToXamlDemo
{
    /// <summary>
    ///     Maintains static information about HTML structure: which element names are
    ///     inline, block, empty or merely "openable", which elements may be closed
    ///     implicitly, and the table of named character entities.
    ///     Can be used by HtmlParser to check conditions under which an element starts or ends, etc.
    /// </summary>
    /// <remarks>
    ///     Element-name sets are compared ordinally and case-insensitively, so the
    ///     predicates do not depend on the current culture. Entity names are
    ///     case-sensitive ("Prime" and "prime" are different characters).
    /// </remarks>
    internal class HtmlSchema
    {
        // ---------------------------------------------------------------------
        //
        // Constructors
        //
        // ---------------------------------------------------------------------

        #region Constructors

        /// <summary>
        ///     Static constructor; populates every lookup table of the schema
        ///     (e.g. _htmlEmptyElements, _htmlCharacterEntities, ...).
        /// </summary>
        static HtmlSchema()
        {
            // element categories
            InitializeInlineElements();
            InitializeBlockElements();
            InitializeOtherOpenableElements();

            // empty (no-scope) elements
            InitializeEmptyElements();

            // elements closing when the outer element ends
            InitializeElementsClosingOnParentElementEnd();

            // elements that close when a new element starts
            InitializeElementsClosingOnNewElementStart();

            // named character entities
            InitializeHtmlCharacterEntities();
        }

        #endregion Constructors

        // ---------------------------------------------------------------------
        //
        // Internal Methods
        //
        // ---------------------------------------------------------------------

        #region Internal Methods

        /// <summary>
        ///     Returns true when xmlElementName corresponds to an empty (no-scope) element.
        /// </summary>
        /// <param name="xmlElementName">
        ///     Element name to test; case is ignored. A null name yields false.
        /// </param>
        internal static bool IsEmptyElement(string xmlElementName)
            => _htmlEmptyElements.Contains(xmlElementName);

        /// <summary>
        ///     Returns true if xmlElementName represents a block formatting element.
        ///     It is used in an algorithm of transferring inline elements over block elements
        ///     in HtmlParser.
        /// </summary>
        /// <param name="xmlElementName">
        ///     Element name to test; case is ignored. A null name yields false.
        /// </param>
        internal static bool IsBlockElement(string xmlElementName)
            => _htmlBlockElements.Contains(xmlElementName);

        /// <summary>
        ///     Returns true if xmlElementName represents an inline formatting element.
        /// </summary>
        /// <param name="xmlElementName">
        ///     Element name to test; case is ignored. A null name yields false.
        /// </param>
        internal static bool IsInlineElement(string xmlElementName)
            => _htmlInlineElements.Contains(xmlElementName);

        /// <summary>
        ///     Returns true for known HTML elements that the parser is allowed to open
        ///     and add to the tree it produces, but that should not act as inline,
        ///     block or no-scope elements.
        /// </summary>
        /// <param name="xmlElementName">
        ///     Element name to test; case is ignored. A null name yields false.
        /// </param>
        internal static bool IsKnownOpenableElement(string xmlElementName)
            => _htmlOtherOpenableElements.Contains(xmlElementName);

        /// <summary>
        ///     Returns true when xmlElementName closes when the outer element closes.
        ///     This is true of elements with optional end tags.
        /// </summary>
        /// <param name="xmlElementName">
        ///     Element name to test; case is ignored. A null name yields false.
        /// </param>
        internal static bool ClosesOnParentElementEnd(string xmlElementName)
            => _htmlElementsClosingOnParentElementEnd.Contains(xmlElementName);

        /// <summary>
        ///     Returns true if the current element closes when the new element, whose name
        ///     has just been read, starts.
        /// </summary>
        /// <param name="currentElementName">
        ///     Name of the currently open element; expected to be lower-case already
        ///     (checked by a debug assertion). Names without an implicit-close rule yield false.
        /// </param>
        /// <param name="nextElementName">
        ///     Name of the element that is about to start.
        /// </param>
        internal static bool ClosesOnNextElementStart(string currentElementName, string nextElementName)
        {
            Debug.Assert(currentElementName == currentElementName.ToLowerInvariant());

            switch (currentElementName)
            {
                case "colgroup":
                    // NOTE(review): "colgroup" itself is not in the block list, so a new
                    // colgroup does not close the current one - confirm this is intended.
                    return _htmlElementsClosingColgroup.Contains(nextElementName) && IsBlockElement(nextElementName);
                case "dd":
                    return _htmlElementsClosingDd.Contains(nextElementName) && IsBlockElement(nextElementName);
                case "dt":
                    return _htmlElementsClosingDt.Contains(nextElementName) && IsBlockElement(nextElementName);
                case "li":
                    return _htmlElementsClosingLi.Contains(nextElementName);
                case "p":
                    // a paragraph is closed by any block element
                    return IsBlockElement(nextElementName);
                case "tbody":
                    return _htmlElementsClosingTbody.Contains(nextElementName);
                case "tfoot":
                    return _htmlElementsClosingTfoot.Contains(nextElementName);
                case "thead":
                    return _htmlElementsClosingThead.Contains(nextElementName);
                case "tr":
                    return _htmlElementsClosingTr.Contains(nextElementName);
                case "td":
                    return _htmlElementsClosingTd.Contains(nextElementName);
                case "th":
                    return _htmlElementsClosingTh.Contains(nextElementName);
                default:
                    return false;
            }
        }

        /// <summary>
        ///     Returns true if the string passed as argument is an HTML entity name.
        /// </summary>
        /// <param name="entityName">
        ///     String to be tested. Entity names are case-sensitive, so the string is
        ///     not converted to lower-case. A null name yields false.
        /// </param>
        internal static bool IsEntity(string entityName)
            => entityName != null && _htmlCharacterEntities.ContainsKey(entityName);

        /// <summary>
        ///     Returns the character represented by the entity name passed as an argument
        ///     if it is listed in _htmlCharacterEntities; returns the character value 0 otherwise.
        /// </summary>
        /// <param name="entityName">
        ///     Case-sensitive entity name whose character value is desired.
        /// </param>
        internal static char EntityCharacterValue(string entityName)
        {
            char character;
            if (entityName != null && _htmlCharacterEntities.TryGetValue(entityName, out character))
            {
                return character;
            }

            return (char) 0;
        }

        #endregion Internal Methods

        // ---------------------------------------------------------------------
        //
        // Private Methods
        //
        // ---------------------------------------------------------------------

        #region Private Methods

        /// <summary>
        ///     Builds a set of element names that is compared ordinally, ignoring case.
        /// </summary>
        private static HashSet<string> CreateElementNameSet(params string[] names)
            => new HashSet<string>(names, StringComparer.OrdinalIgnoreCase);

        /// <summary>
        ///     Initializes _htmlInlineElements with the names treated as inline formatting elements.
        /// </summary>
        private static void InitializeInlineElements()
        {
            _htmlInlineElements = CreateElementNameSet(
                "a",
                "abbr",
                "acronym",
                "address",
                "b",
                "bdo",
                "big",
                "button",
                "code",
                "del", // deleted text
                "dfn",
                "em",
                "font",
                "i",
                "ins", // inserted text
                "kbd", // text to be entered by a user
                "label",
                "legend",
                "q", // short inline quotation
                "s", // strike-through text style
                "samp", // specifies a code sample
                "small",
                "span",
                "strike",
                "strong",
                "sub",
                "sup",
                "u",
                "var"); // indicates an instance of a program variable
        }

        /// <summary>
        ///     Initializes _htmlBlockElements with the names treated as block elements.
        /// </summary>
        /// <remarks>
        ///     The list reflects how the parser treats elements rather than the strict
        ///     specification; the original author's notes flag an entry as
        ///     "not a block according to XHTML spec".
        /// </remarks>
        private static void InitializeBlockElements()
        {
            _htmlBlockElements = CreateElementNameSet(
                "blockquote",
                "body",
                "caption",
                "center",
                "cite",
                "dd",
                "dir", // treat as UL element
                "div",
                "dl",
                "dt",
                "form",
                "h1",
                "h2",
                "h3",
                "h4",
                "h5",
                "h6",
                "html",
                "li",
                "menu", // treat as UL element
                "ol",
                "p",
                "pre",
                "table",
                "tbody",
                "td",
                "textarea",
                "tfoot",
                "th",
                "thead",
                "tr",
                "tt", // renders text in a fixed-width font
                "ul");
        }

        /// <summary>
        ///     Initializes _htmlEmptyElements with empty elements in HTML 4 spec at
        ///     http://www.w3.org/TR/REC-html40/index/elements.html
        /// </summary>
        private static void InitializeEmptyElements()
        {
            // Empty (no-scope) elements: elements not requiring closing tags
            // and not accepting any content.
            _htmlEmptyElements = CreateElementNameSet(
                "area",
                "base",
                "basefont",
                "br",
                "col",
                "frame",
                "hr",
                "img",
                "input",
                "isindex",
                "link",
                "meta",
                "param");
        }

        /// <summary>
        ///     Initializes _htmlOtherOpenableElements: known HTML elements that the parser
        ///     may open and add to its tree, but that do not act as inline, block or
        ///     no-scope elements.
        /// </summary>
        private static void InitializeOtherOpenableElements()
        {
            // "form" is intentionally absent here: it is treated as a block element.
            _htmlOtherOpenableElements = CreateElementNameSet(
                "applet",
                "base",
                "basefont",
                "colgroup",
                "fieldset",
                "frameset",
                "head",
                "iframe",
                "map",
                "noframes",
                "noscript",
                "object",
                "optgroup",
                "option",
                "script",
                "select",
                "style",
                "title");
        }

        /// <summary>
        ///     Initializes _htmlElementsClosingOnParentElementEnd with the list of HTML 4
        ///     elements for which closing tags are optional. We assume that any such
        ///     element closes when its outer element (in which it is nested) does.
        /// </summary>
        private static void InitializeElementsClosingOnParentElementEnd()
        {
            _htmlElementsClosingOnParentElementEnd = CreateElementNameSet(
                "body",
                "colgroup",
                "dd",
                "dt",
                "head",
                "html",
                "li",
                "p",
                "tbody",
                "td",
                "tfoot",
                "thead",
                "th",
                "tr");
        }

        /// <summary>
        ///     Initializes, for each element with an optional end tag, the set of element
        ///     names whose start implicitly closes it.
        /// </summary>
        private static void InitializeElementsClosingOnNewElementStart()
        {
            _htmlElementsClosingColgroup = CreateElementNameSet("colgroup", "tr", "thead", "tfoot", "tbody");

            // TODO: dd may end in other cases as well - if a new "p" starts, etc.
            // TODO: these are the basic "legal" cases but there may be more recovery
            _htmlElementsClosingDd = CreateElementNameSet("dd", "dt");

            // A dt is closed by a following dd or dt, mirroring the dd rule above.
            // TODO: dt may end in other cases as well - if a new "p" starts, etc.
            // TODO: these are the basic "legal" cases but there may be more recovery
            _htmlElementsClosingDt = CreateElementNameSet("dd", "dt");

            // TODO: more complex recovery
            _htmlElementsClosingLi = CreateElementNameSet("li");

            // TODO: more complex recovery
            _htmlElementsClosingTbody = CreateElementNameSet("tbody", "thead", "tfoot");

            // NOTE: tr should not really close on a new thead
            // because if there are rows before a thead, it is assumed to be in tbody, whose start tag is optional
            // and thead can't come after tbody
            // however, if we do encounter this, it's probably best to end the row and ignore the thead or treat
            // it as part of the table
            // TODO: more complex recovery
            _htmlElementsClosingTr = CreateElementNameSet("thead", "tfoot", "tbody", "tr");

            // TODO: more complex recovery
            _htmlElementsClosingTd = CreateElementNameSet("td", "th", "tr", "tbody", "tfoot", "thead");

            // TODO: more complex recovery
            _htmlElementsClosingTh = CreateElementNameSet("td", "th", "tr", "tbody", "tfoot", "thead");

            // TODO: more complex recovery
            _htmlElementsClosingThead = CreateElementNameSet("tbody", "tfoot");

            // although thead comes before tfoot, we add it because if it is found the tfoot should close
            // and some recovery processing be done on the thead
            // TODO: more complex recovery
            _htmlElementsClosingTfoot = CreateElementNameSet("tbody", "thead");
        }

        /// <summary>
        ///     Initializes _htmlCharacterEntities with the character corresponding to each
        ///     HTML 4 entity name. Keys are case-sensitive; values are the decimal code
        ///     points given by the HTML 4 specification.
        /// </summary>
        private static void InitializeHtmlCharacterEntities()
        {
            _htmlCharacterEntities = new Dictionary<string, char>(StringComparer.Ordinal)
            {
                ["Aacute"] = (char) 193,
                ["aacute"] = (char) 225,
                ["Acirc"] = (char) 194,
                ["acirc"] = (char) 226,
                ["acute"] = (char) 180,
                ["AElig"] = (char) 198,
                ["aelig"] = (char) 230,
                ["Agrave"] = (char) 192,
                ["agrave"] = (char) 224,
                ["alefsym"] = (char) 8501,
                ["Alpha"] = (char) 913,
                ["alpha"] = (char) 945,
                ["amp"] = (char) 38,
                ["and"] = (char) 8743,
                ["ang"] = (char) 8736,
                ["Aring"] = (char) 197,
                ["aring"] = (char) 229,
                ["asymp"] = (char) 8776,
                ["Atilde"] = (char) 195,
                ["atilde"] = (char) 227,
                ["Auml"] = (char) 196,
                ["auml"] = (char) 228,
                ["bdquo"] = (char) 8222,
                ["Beta"] = (char) 914,
                ["beta"] = (char) 946,
                ["brvbar"] = (char) 166,
                ["bull"] = (char) 8226,
                ["cap"] = (char) 8745,
                ["Ccedil"] = (char) 199,
                ["ccedil"] = (char) 231,
                ["cedil"] = (char) 184,
                ["cent"] = (char) 162,
                ["Chi"] = (char) 935,
                ["chi"] = (char) 967,
                ["circ"] = (char) 710,
                ["clubs"] = (char) 9827,
                ["cong"] = (char) 8773,
                ["copy"] = (char) 169,
                ["crarr"] = (char) 8629,
                ["cup"] = (char) 8746,
                ["curren"] = (char) 164,
                ["dagger"] = (char) 8224,
                ["Dagger"] = (char) 8225,
                ["darr"] = (char) 8595,
                ["dArr"] = (char) 8659,
                ["deg"] = (char) 176,
                ["Delta"] = (char) 916,
                ["delta"] = (char) 948,
                ["diams"] = (char) 9830,
                ["divide"] = (char) 247,
                ["Eacute"] = (char) 201,
                ["eacute"] = (char) 233,
                ["Ecirc"] = (char) 202,
                ["ecirc"] = (char) 234,
                ["Egrave"] = (char) 200,
                ["egrave"] = (char) 232,
                ["empty"] = (char) 8709,
                ["emsp"] = (char) 8195,
                ["ensp"] = (char) 8194,
                ["Epsilon"] = (char) 917,
                ["epsilon"] = (char) 949,
                ["equiv"] = (char) 8801,
                ["Eta"] = (char) 919,
                ["eta"] = (char) 951,
                ["ETH"] = (char) 208,
                ["eth"] = (char) 240,
                ["Euml"] = (char) 203,
                ["euml"] = (char) 235,
                ["euro"] = (char) 8364,
                ["exist"] = (char) 8707,
                ["fnof"] = (char) 402,
                ["forall"] = (char) 8704,
                ["frac12"] = (char) 189,
                ["frac14"] = (char) 188,
                ["frac34"] = (char) 190,
                ["frasl"] = (char) 8260,
                ["Gamma"] = (char) 915,
                ["gamma"] = (char) 947,
                ["ge"] = (char) 8805,
                ["gt"] = (char) 62,
                ["harr"] = (char) 8596,
                ["hArr"] = (char) 8660,
                ["hearts"] = (char) 9829,
                ["hellip"] = (char) 8230,
                ["Iacute"] = (char) 205,
                ["iacute"] = (char) 237,
                ["Icirc"] = (char) 206,
                ["icirc"] = (char) 238,
                ["iexcl"] = (char) 161,
                ["Igrave"] = (char) 204,
                ["igrave"] = (char) 236,
                ["image"] = (char) 8465,
                ["infin"] = (char) 8734,
                ["int"] = (char) 8747,
                ["Iota"] = (char) 921,
                ["iota"] = (char) 953,
                ["iquest"] = (char) 191,
                ["isin"] = (char) 8712,
                ["Iuml"] = (char) 207,
                ["iuml"] = (char) 239,
                ["Kappa"] = (char) 922,
                ["kappa"] = (char) 954,
                ["Lambda"] = (char) 923,
                ["lambda"] = (char) 955,
                ["lang"] = (char) 9001,
                ["laquo"] = (char) 171,
                ["larr"] = (char) 8592,
                ["lArr"] = (char) 8656,
                ["lceil"] = (char) 8968,
                ["ldquo"] = (char) 8220,
                ["le"] = (char) 8804,
                ["lfloor"] = (char) 8970,
                ["lowast"] = (char) 8727,
                ["loz"] = (char) 9674,
                ["lrm"] = (char) 8206,
                ["lsaquo"] = (char) 8249,
                ["lsquo"] = (char) 8216,
                ["lt"] = (char) 60,
                ["macr"] = (char) 175,
                ["mdash"] = (char) 8212,
                ["micro"] = (char) 181,
                ["middot"] = (char) 183,
                ["minus"] = (char) 8722,
                ["Mu"] = (char) 924,
                ["mu"] = (char) 956,
                ["nabla"] = (char) 8711,
                ["nbsp"] = (char) 160,
                ["ndash"] = (char) 8211,
                ["ne"] = (char) 8800,
                ["ni"] = (char) 8715,
                ["not"] = (char) 172,
                ["notin"] = (char) 8713,
                ["nsub"] = (char) 8836,
                ["Ntilde"] = (char) 209,
                ["ntilde"] = (char) 241,
                ["Nu"] = (char) 925,
                ["nu"] = (char) 957,
                ["Oacute"] = (char) 211,
                ["oacute"] = (char) 243,
                ["Ocirc"] = (char) 212,
                ["ocirc"] = (char) 244,
                ["OElig"] = (char) 338,
                ["oelig"] = (char) 339,
                ["Ograve"] = (char) 210,
                ["ograve"] = (char) 242,
                ["oline"] = (char) 8254,
                ["Omega"] = (char) 937,
                ["omega"] = (char) 969,
                ["Omicron"] = (char) 927,
                ["omicron"] = (char) 959,
                ["oplus"] = (char) 8853,
                ["or"] = (char) 8744,
                ["ordf"] = (char) 170,
                ["ordm"] = (char) 186,
                ["Oslash"] = (char) 216,
                ["oslash"] = (char) 248,
                ["Otilde"] = (char) 213,
                ["otilde"] = (char) 245,
                ["otimes"] = (char) 8855,
                ["Ouml"] = (char) 214,
                ["ouml"] = (char) 246,
                ["para"] = (char) 182,
                ["part"] = (char) 8706,
                ["permil"] = (char) 8240,
                ["perp"] = (char) 8869,
                ["Phi"] = (char) 934,
                ["phi"] = (char) 966,
                ["Pi"] = (char) 928,
                ["pi"] = (char) 960,
                ["piv"] = (char) 982,
                ["plusmn"] = (char) 177,
                ["pound"] = (char) 163,
                ["prime"] = (char) 8242,
                ["Prime"] = (char) 8243,
                ["prod"] = (char) 8719,
                ["prop"] = (char) 8733,
                ["Psi"] = (char) 936,
                ["psi"] = (char) 968,
                ["quot"] = (char) 34,
                ["radic"] = (char) 8730,
                ["rang"] = (char) 9002,
                ["raquo"] = (char) 187,
                ["rarr"] = (char) 8594,
                ["rArr"] = (char) 8658,
                ["rceil"] = (char) 8969,
                ["rdquo"] = (char) 8221,
                ["real"] = (char) 8476,
                ["reg"] = (char) 174,
                ["rfloor"] = (char) 8971,
                ["Rho"] = (char) 929,
                ["rho"] = (char) 961,
                ["rlm"] = (char) 8207,
                ["rsaquo"] = (char) 8250,
                ["rsquo"] = (char) 8217,
                ["sbquo"] = (char) 8218,
                ["Scaron"] = (char) 352,
                ["scaron"] = (char) 353,
                ["sdot"] = (char) 8901,
                ["sect"] = (char) 167,
                ["shy"] = (char) 173,
                ["Sigma"] = (char) 931,
                ["sigma"] = (char) 963,
                ["sigmaf"] = (char) 962,
                ["sim"] = (char) 8764,
                ["spades"] = (char) 9824,
                ["sub"] = (char) 8834,
                ["sube"] = (char) 8838,
                ["sum"] = (char) 8721,
                ["sup"] = (char) 8835,
                ["sup1"] = (char) 185,
                ["sup2"] = (char) 178,
                ["sup3"] = (char) 179,
                ["supe"] = (char) 8839,
                ["szlig"] = (char) 223,
                ["Tau"] = (char) 932,
                ["tau"] = (char) 964,
                ["there4"] = (char) 8756,
                ["Theta"] = (char) 920,
                ["theta"] = (char) 952,
                ["thetasym"] = (char) 977,
                ["thinsp"] = (char) 8201,
                ["THORN"] = (char) 222,
                ["thorn"] = (char) 254,
                ["tilde"] = (char) 732,
                ["times"] = (char) 215,
                ["trade"] = (char) 8482,
                ["Uacute"] = (char) 218,
                ["uacute"] = (char) 250,
                ["uarr"] = (char) 8593,
                ["uArr"] = (char) 8657,
                ["Ucirc"] = (char) 219,
                ["ucirc"] = (char) 251,
                ["Ugrave"] = (char) 217,
                ["ugrave"] = (char) 249,
                ["uml"] = (char) 168,
                ["upsih"] = (char) 978,
                ["Upsilon"] = (char) 933,
                ["upsilon"] = (char) 965,
                ["Uuml"] = (char) 220,
                ["uuml"] = (char) 252,
                ["weierp"] = (char) 8472,
                ["Xi"] = (char) 926,
                ["xi"] = (char) 958,
                ["Yacute"] = (char) 221,
                ["yacute"] = (char) 253,
                ["yen"] = (char) 165,
                ["Yuml"] = (char) 376,
                ["yuml"] = (char) 255,
                ["Zeta"] = (char) 918,
                ["zeta"] = (char) 950,
                ["zwj"] = (char) 8205,
                ["zwnj"] = (char) 8204
            };
        }

        #endregion Private Methods

        // ---------------------------------------------------------------------
        //
        // Private Fields
        //
        // ---------------------------------------------------------------------

        #region Private Fields

        // html element names, held in hash sets so that membership tests do not scan a list
        private static HashSet<string> _htmlInlineElements;
        private static HashSet<string> _htmlBlockElements;
        private static HashSet<string> _htmlOtherOpenableElements;

        // html empty element names
        private static HashSet<string> _htmlEmptyElements;

        // names of html elements for which closing tags are optional, and close when the outer nested element closes
        private static HashSet<string> _htmlElementsClosingOnParentElementEnd;

        // names of elements that close certain optional closing tag elements when they start

        // names of elements closing the colgroup element
        private static HashSet<string> _htmlElementsClosingColgroup;

        // names of elements closing the dd element
        private static HashSet<string> _htmlElementsClosingDd;

        // names of elements closing the dt element
        private static HashSet<string> _htmlElementsClosingDt;

        // names of elements closing the li element
        private static HashSet<string> _htmlElementsClosingLi;

        // names of elements closing the tbody element
        private static HashSet<string> _htmlElementsClosingTbody;

        // names of elements closing the td element
        private static HashSet<string> _htmlElementsClosingTd;

        // names of elements closing the tfoot element
        private static HashSet<string> _htmlElementsClosingTfoot;

        // names of elements closing the thead element
        private static HashSet<string> _htmlElementsClosingThead;

        // names of elements closing the th element
        private static HashSet<string> _htmlElementsClosingTh;

        // names of elements closing the tr element
        private static HashSet<string> _htmlElementsClosingTr;

        // html character entities, keyed by case-sensitive entity name
        private static Dictionary<string, char> _htmlCharacterEntities;

        #endregion Private Fields
    }
}