Initial population

This commit is contained in:
Jon
2026-03-07 19:22:22 -06:00
parent 647f55feb9
commit cae1a3ec46
108 changed files with 28484 additions and 0 deletions

View File

@@ -0,0 +1,255 @@
// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Xml;
namespace HtmlToXamlDemo
{
internal class CssStylesheet
{
// Style definitions in discovery order. Allocated lazily by AddStyleDefinition,
// so it stays null until the first definition is added.
private List<StyleDefinition> _styleDefinitions;

/// <summary>
/// Creates a stylesheet from the STYLE elements found under <paramref name="htmlElement"/>.
/// </summary>
/// <param name="htmlElement">Root of the html tree to scan; when null nothing is scanned.</param>
public CssStylesheet(XmlElement htmlElement)
{
    if (htmlElement == null)
    {
        return;
    }

    DiscoverStyleDefinitions(htmlElement);
}
/// <summary>
/// Recursively traverses an html tree, discovers STYLE elements and records their rules
/// in the style definition table for further cascading style application.
/// </summary>
/// <param name="htmlElement">
/// Element to scan. A LINK element is ignored, a STYLE element is parsed, and any other
/// element has its child elements scanned recursively.
/// </param>
public void DiscoverStyleDefinitions(XmlElement htmlElement)
{
    // Tag names are compared ordinally (ignoring case) so that matching does not depend on
    // the casing rules of the current culture.
    if (string.Equals(htmlElement.LocalName, "link", StringComparison.OrdinalIgnoreCase))
    {
        // Add LINK elements processing for included stylesheets
        // <LINK href="http://sc.msn.com/global/css/ptnr/orange.css" type=text/css \r\nrel=stylesheet>
        return;
    }

    if (!string.Equals(htmlElement.LocalName, "style", StringComparison.OrdinalIgnoreCase))
    {
        // This is not a STYLE element. Recurse into it
        for (var htmlChildNode = htmlElement.FirstChild;
            htmlChildNode != null;
            htmlChildNode = htmlChildNode.NextSibling)
        {
            if (htmlChildNode is XmlElement)
            {
                DiscoverStyleDefinitions((XmlElement) htmlChildNode);
            }
        }

        return;
    }

    // Add style definitions from this style.
    // Collect all text from this style definition (text nodes and xml comments alike)
    var stylesheetBuffer = new StringBuilder();
    for (var htmlChildNode = htmlElement.FirstChild;
        htmlChildNode != null;
        htmlChildNode = htmlChildNode.NextSibling)
    {
        if (htmlChildNode is XmlText || htmlChildNode is XmlComment)
        {
            stylesheetBuffer.Append(RemoveComments(htmlChildNode.Value));
        }
    }

    // CssStylesheet has the following syntactical structure:
    //     @import declaration;
    //     selector { definition }
    // where "selector" is one of: ".classname", "tagname"
    // It can contain comments in the following form: /*...*/
    var nextCharacterIndex = 0;
    while (nextCharacterIndex < stylesheetBuffer.Length)
    {
        // Extract selector
        var selectorStart = nextCharacterIndex;
        while (nextCharacterIndex < stylesheetBuffer.Length && stylesheetBuffer[nextCharacterIndex] != '{')
        {
            // Skip declaration directive starting from @ (everything up to the next ';')
            if (stylesheetBuffer[nextCharacterIndex] == '@')
            {
                while (nextCharacterIndex < stylesheetBuffer.Length &&
                       stylesheetBuffer[nextCharacterIndex] != ';')
                {
                    nextCharacterIndex++;
                }

                selectorStart = nextCharacterIndex + 1;
            }

            nextCharacterIndex++;
        }

        if (nextCharacterIndex >= stylesheetBuffer.Length)
        {
            // No opening brace left: the remaining text is not a rule.
            break;
        }

        // Extract definition. definitionStart is the index of '{'.
        var definitionStart = nextCharacterIndex;
        while (nextCharacterIndex < stylesheetBuffer.Length && stylesheetBuffer[nextCharacterIndex] != '}')
        {
            nextCharacterIndex++;
        }

        // nextCharacterIndex is now at '}' (or at the end of the buffer for an unterminated
        // rule), so the definition is everything strictly between the two positions.
        var definitionLength = nextCharacterIndex - definitionStart - 1;
        if (definitionLength > 0)
        {
            AddStyleDefinition(
                stylesheetBuffer.ToString(selectorStart, definitionStart - selectorStart),
                stylesheetBuffer.ToString(definitionStart + 1, definitionLength));
        }

        // Skip closing brace
        if (nextCharacterIndex < stylesheetBuffer.Length)
        {
            Debug.Assert(stylesheetBuffer[nextCharacterIndex] == '}');
            nextCharacterIndex++;
        }
    }
}
/// <summary>
/// Returns <paramref name="text"/> with every c-style comment (/*...*/) replaced by a single space.
/// An unterminated comment discards everything from its opening marker to the end of the text.
/// </summary>
private string RemoveComments(string text)
{
    var cleaned = new StringBuilder(text.Length);
    var position = 0;
    while (true)
    {
        var opening = text.IndexOf("/*", position, StringComparison.Ordinal);
        if (opening < 0)
        {
            // No further comment: keep the tail unchanged.
            cleaned.Append(text, position, text.Length - position);
            break;
        }

        cleaned.Append(text, position, opening - position);
        var closing = text.IndexOf("*/", opening + 2, StringComparison.Ordinal);
        if (closing < 0)
        {
            // Comment never closes: drop the remainder.
            break;
        }

        cleaned.Append(' ');
        position = closing + 2;
    }

    return cleaned.ToString();
}
/// <summary>
/// Registers <paramref name="definition"/> for every simple selector in the comma-separated
/// <paramref name="selector"/> list. Blank selectors and blank definitions are ignored.
/// </summary>
/// <param name="selector">One selector or a comma-separated group of selectors.</param>
/// <param name="definition">Declaration text, i.e. what stands between '{' and '}'.</param>
public void AddStyleDefinition(string selector, string definition)
{
    // Normalize parameter values. Lower-casing uses the invariant culture so the stored
    // selectors do not depend on the casing rules of the current culture.
    selector = selector.Trim().ToLowerInvariant();
    definition = definition.Trim().ToLowerInvariant();
    if (selector.Length == 0 || definition.Length == 0)
    {
        return;
    }

    if (_styleDefinitions == null)
    {
        _styleDefinitions = new List<StyleDefinition>();
    }

    foreach (var selectorPart in selector.Split(','))
    {
        var simpleSelector = selectorPart.Trim();
        if (simpleSelector.Length > 0)
        {
            _styleDefinitions.Add(new StyleDefinition(simpleSelector, definition));
        }
    }
}
/// <summary>
/// Finds the style definition that applies to the innermost element of
/// <paramref name="sourceContext"/>.
/// </summary>
/// <param name="elementName">Name of the element; expected to equal the LocalName of the last context entry.</param>
/// <param name="sourceContext">Chain of elements whose last entry is the element being styled.</param>
/// <returns>
/// The definition of the most recently added matching selector, or null when nothing matches,
/// no definitions exist, or the context is null or empty.
/// </returns>
/// <remarks>
/// Only the last space-separated level of each selector is compared with the element; any
/// preceding levels of the selector are not checked.
/// </remarks>
public string GetStyle(string elementName, List<XmlElement> sourceContext)
{
    Debug.Assert(sourceContext != null && sourceContext.Count > 0);
    if (_styleDefinitions == null || sourceContext == null || sourceContext.Count == 0)
    {
        return null;
    }

    var element = sourceContext[sourceContext.Count - 1];
    Debug.Assert(elementName == element.LocalName);

    // Walk backwards so that definitions added later take precedence.
    for (var i = _styleDefinitions.Count - 1; i >= 0; i--)
    {
        var selector = _styleDefinitions[i].Selector;

        // Text after the last space is the innermost selector level.
        var selectorLevel = selector.Substring(selector.LastIndexOf(' ') + 1).Trim();
        if (MatchSelectorLevel(selectorLevel, element))
        {
            return _styleDefinitions[i].Definition;
        }
    }

    return null;
}
/// <summary>
/// Tests one selector level of the form "tag", "tag.class", ".class", "tag#id" or "#id"
/// against <paramref name="xmlElement"/>.
/// </summary>
/// <remarks>
/// When the level contains a '.', everything after the first '.' is taken as the class,
/// even if it contains a '#'. Attribute values are compared as whole strings.
/// </remarks>
/// <returns>True when every part present in the selector level equals the element's value.</returns>
private bool MatchSelectorLevel(string selectorLevel, XmlElement xmlElement)
{
    if (selectorLevel.Length == 0)
    {
        return false;
    }

    // A class qualifier wins over an id qualifier; the id is only looked for without a '.'.
    var dotPosition = selectorLevel.IndexOf('.');
    var poundPosition = dotPosition >= 0 ? -1 : selectorLevel.IndexOf('#');
    var splitPosition = dotPosition >= 0 ? dotPosition : poundPosition;

    string requiredTag;
    string qualifier = null;
    if (splitPosition < 0)
    {
        requiredTag = selectorLevel;
    }
    else
    {
        requiredTag = splitPosition > 0 ? selectorLevel.Substring(0, splitPosition) : null;
        qualifier = selectorLevel.Substring(splitPosition + 1);
    }

    var requiredClass = dotPosition >= 0 ? qualifier : null;
    var requiredId = poundPosition >= 0 ? qualifier : null;

    return (requiredTag == null || requiredTag == xmlElement.LocalName)
           && (requiredId == null || HtmlToXamlConverter.GetAttribute(xmlElement, "id") == requiredId)
           && (requiredClass == null || HtmlToXamlConverter.GetAttribute(xmlElement, "class") == requiredClass);
}
/// <summary>
/// Immutable pair of one simple selector and the declaration text registered for it.
/// </summary>
private class StyleDefinition
{
    /// <summary>Simple selector this definition was registered for.</summary>
    public readonly string Selector;

    /// <summary>Declaration text that belongs to <see cref="Selector"/>.</summary>
    public readonly string Definition;

    public StyleDefinition(string selector, string definition)
    {
        Definition = definition;
        Selector = selector;
    }
}
}
}

View File

@@ -0,0 +1,844 @@
// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Xml;
namespace HtmlToXamlDemo
{
// DependencyProperty
// TextElement
internal static class HtmlCssParser
{
// Lower-case color names accepted by ParseCssColor; a match is returned as the color value itself.
private static readonly string[] Colors =
{
"aliceblue", "antiquewhite", "aqua", "aquamarine", "azure", "beige", "bisque", "black", "blanchedalmond",
"blue", "blueviolet", "brown", "burlywood", "cadetblue", "chartreuse", "chocolate", "coral",
"cornflowerblue", "cornsilk", "crimson", "cyan", "darkblue", "darkcyan", "darkgoldenrod", "darkgray",
"darkgreen", "darkkhaki", "darkmagenta", "darkolivegreen", "darkorange", "darkorchid", "darkred",
"darksalmon", "darkseagreen", "darkslateblue", "darkslategray", "darkturquoise", "darkviolet", "deeppink",
"deepskyblue", "dimgray", "dodgerblue", "firebrick", "floralwhite", "forestgreen", "fuchsia", "gainsboro",
"ghostwhite", "gold", "goldenrod", "gray", "green", "greenyellow", "honeydew", "hotpink", "indianred",
"indigo", "ivory", "khaki", "lavender", "lavenderblush", "lawngreen", "lemonchiffon", "lightblue",
"lightcoral",
"lightcyan", "lightgoldenrodyellow", "lightgreen", "lightgrey", "lightpink", "lightsalmon", "lightseagreen",
"lightskyblue", "lightslategray", "lightsteelblue", "lightyellow", "lime", "limegreen", "linen", "magenta",
"maroon", "mediumaquamarine", "mediumblue", "mediumorchid", "mediumpurple", "mediumseagreen",
"mediumslateblue",
"mediumspringgreen", "mediumturquoise", "mediumvioletred", "midnightblue", "mintcream", "mistyrose",
"moccasin",
"navajowhite", "navy", "oldlace", "olive", "olivedrab", "orange", "orangered", "orchid", "palegoldenrod",
"palegreen", "paleturquoise", "palevioletred", "papayawhip", "peachpuff", "peru", "pink", "plum",
"powderblue",
"purple", "red", "rosybrown", "royalblue", "saddlebrown", "salmon", "sandybrown", "seagreen", "seashell",
"sienna", "silver", "skyblue", "slateblue", "slategray", "snow", "springgreen", "steelblue", "tan", "teal",
"thistle", "tomato", "turquoise", "violet", "wheat", "white", "whitesmoke", "yellow", "yellowgreen"
};
// Lower-case system color names accepted by ParseCssColor; any match is currently reported as "black".
private static readonly string[] SystemColors =
{
"activeborder", "activecaption", "appworkspace", "background", "buttonface", "buttonhighlight",
"buttonshadow",
"buttontext", "captiontext", "graytext", "highlight", "highlighttext", "inactiveborder", "inactivecaption",
"inactivecaptiontext", "infobackground", "infotext", "menu", "menutext", "scrollbar", "threeddarkshadow",
"threedface", "threedhighlight", "threedlightshadow", "threedshadow", "window", "windowframe", "windowtext"
};
// .................................................................
//
// Parsing CSS font Property
//
// .................................................................
// CSS has five font properties: font-family, font-style, font-variant, font-weight, font-size.
// An aggregated "font" property lets you specify in one action all the five in combination
// with additional line-height property.
//
// font-family: [<family-name>,]* [<family-name> | <generic-family>]
// generic-family: serif | sans-serif | monospace | cursive | fantasy
// The list of families sets priorities to choose fonts;
// Quotes not allowed around generic-family names
// font-style: normal | italic | oblique
// font-variant: normal | small-caps
// font-weight: normal | bold | bolder | lighter | 100 ... 900 |
// Default is "normal", normal==400
// font-size: <absolute-size> | <relative-size> | <length> | <percentage>
// absolute-size: xx-small | x-small | small | medium | large | x-large | xx-large
// relative-size: larger | smaller
// length: <point> | <pica> | <ex> | <em> | <points> | <millimeters> | <centimeters> | <inches>
// Default: medium
// font: [ <font-style> || <font-variant> || <font-weight> ]? <font-size> [ / <line-height> ]? <font-family>
//
// The tables below are the keyword lists handed to ParseWordEnumeration, which tries the
// entries of a table in the order they are listed.
private static readonly string[] FontGenericFamilies =
{
"serif", "sans-serif", "monospace", "cursive", "fantasy"
};
private static readonly string[] FontStyles = {"normal", "italic", "oblique"};
private static readonly string[] FontVariants = {"normal", "small-caps"};
private static readonly string[] FontWeights =
{
"normal", "bold", "bolder", "lighter", "100", "200", "300",
"400", "500", "600", "700", "800", "900"
};
// NOTE(review): FontAbsoluteSizes and FontRelativeSizes are not referenced in the visible part
// of this file - confirm whether keyword font sizes are meant to be supported.
private static readonly string[] FontAbsoluteSizes =
{
"xx-small", "x-small", "small", "medium", "large",
"x-large", "xx-large"
};
private static readonly string[] FontRelativeSizes = {"larger", "smaller"};
// Unit suffixes recognized by ParseCssSize; a number without a unit is treated as "px" there.
private static readonly string[] FontSizeUnits = {"px", "mm", "cm", "in", "pt", "pc", "em", "ex", "%"};
// .................................................................
//
// Parsing CSS list-style Property
//
// .................................................................
// list-style: [ <list-style-type> || <list-style-position> || <list-style-image> ]
private static readonly string[] ListStyleTypes =
{
"disc", "circle", "square", "decimal", "lower-roman",
"upper-roman", "lower-alpha", "upper-alpha", "none"
};
private static readonly string[] ListStylePositions = {"inside", "outside"};
// .................................................................
//
// Parsing CSS text-decorations Property
//
// .................................................................
// ParseCssTextDecoration relies on "none" being the first entry: it resets every entry
// from index 1 onwards to "false" before parsing.
private static readonly string[] TextDecorations = {"none", "underline", "overline", "line-through", "blink"};
// .................................................................
//
// Parsing CSS text-transform Property
//
// .................................................................
private static readonly string[] TextTransforms = {"none", "capitalize", "uppercase", "lowercase"};
// .................................................................
//
// Parsing CSS text-align Property
//
// .................................................................
private static readonly string[] TextAligns = {"left", "right", "center", "justify"};
// .................................................................
//
// Parsing CSS vertical-align Property
//
// .................................................................
private static readonly string[] VerticalAligns =
{
"baseline", "sub", "super", "top", "text-top", "middle",
"bottom", "text-bottom"
};
// .................................................................
//
// Parsing CSS float Property
//
// .................................................................
private static readonly string[] Floats = {"left", "right", "none"};
// .................................................................
//
// Parsing CSS clear Property
//
// .................................................................
private static readonly string[] Clears = {"none", "left", "right", "both"};
// .................................................................
//
// Parsing CSS border-style Properties
//
// .................................................................
private static readonly string[] BorderStyles =
{
"none", "dotted", "dashed", "solid", "double", "groove",
"ridge", "inset", "outset"
};
// .................................................................
//
// What are these definitions doing here:
//
// .................................................................
// NOTE(review): _blocks is not referenced in the visible part of this file; the "display"
// case of GetElementPropertiesFromCssAttributes is still unimplemented - presumably these
// are its intended values, confirm before removing.
private static string[] _blocks = {"block", "inline", "list-item", "none"};
// .................................................................
//
// Processing CSS Attributes
//
// .................................................................
/// <summary>
/// Collects the css declarations that apply to <paramref name="htmlElement"/> - first those
/// found in <paramref name="stylesheet"/>, then those of its inline "style" attribute - and
/// stores the parsed values in <paramref name="localProperties"/>.
/// </summary>
/// <param name="htmlElement">Element whose inline style attribute is read.</param>
/// <param name="elementName">Element name passed on to the stylesheet lookup.</param>
/// <param name="stylesheet">Stylesheet queried for a matching style definition.</param>
/// <param name="localProperties">Table receiving the parsed property values, keyed by property name.</param>
/// <param name="sourceContext">Element chain passed on to the stylesheet lookup.</param>
/// <remarks>
/// A declaration is only processed when it splits into exactly one name and one value at ':',
/// so values that themselves contain ':' are skipped.
/// </remarks>
internal static void GetElementPropertiesFromCssAttributes(XmlElement htmlElement, string elementName,
    CssStylesheet stylesheet, Hashtable localProperties, List<XmlElement> sourceContext)
{
    var styleFromStylesheet = stylesheet.GetStyle(elementName, sourceContext);
    var styleInline = HtmlToXamlConverter.GetAttribute(htmlElement, "style");

    // Combine styles from stylesheet and from inline attribute.
    // The order is important - the latter styles will override the former.
    var style = styleFromStylesheet;
    if (styleInline != null)
    {
        style = style == null ? styleInline : style + ";" + styleInline;
    }

    if (style == null)
    {
        return;
    }

    // Apply local style to current formatting properties
    foreach (var declaration in style.Split(';'))
    {
        var styleNameValue = declaration.Split(':');
        if (styleNameValue.Length != 2)
        {
            continue;
        }

        // Invariant lower-casing keeps the property-name switch below independent of the
        // current culture's casing rules.
        var styleName = styleNameValue[0].Trim().ToLowerInvariant();
        var styleValue = HtmlToXamlConverter.UnQuote(styleNameValue[1].Trim()).ToLowerInvariant();
        var nextIndex = 0;
        switch (styleName)
        {
            case "font":
                ParseCssFont(styleValue, localProperties);
                break;
            case "font-family":
                ParseCssFontFamily(styleValue, ref nextIndex, localProperties);
                break;
            case "font-style":
                ParseCssFontStyle(styleValue, ref nextIndex, localProperties);
                break;
            case "font-weight":
                ParseCssFontWeight(styleValue, ref nextIndex, localProperties);
                break;
            case "font-variant":
                ParseCssFontVariant(styleValue, ref nextIndex, localProperties);
                break;

            // Sizes stored under the css property name; negative values are stored as "0".
            case "font-size":
            case "line-height":
            case "width":
            case "height":
            case "margin-top":
            case "margin-right":
            case "margin-bottom":
            case "margin-left":
            case "padding-top":
            case "padding-right":
            case "padding-bottom":
            case "padding-left":
                ParseCssSize(styleValue, ref nextIndex, localProperties, styleName,
                    /*mustBeNonNegative:*/true);
                break;
            case "text-indent":
                ParseCssSize(styleValue, ref nextIndex, localProperties, "text-indent",
                    /*mustBeNonNegative:*/false);
                break;

            case "color":
            case "background-color":
                ParseCssColor(styleValue, ref nextIndex, localProperties, styleName);
                break;
            case "background":
                // TODO: need to parse composite background property
                ParseCssBackground(styleValue, ref nextIndex, localProperties);
                break;

            case "text-decoration":
                ParseCssTextDecoration(styleValue, ref nextIndex, localProperties);
                break;
            case "text-transform":
                ParseCssTextTransform(styleValue, ref nextIndex, localProperties);
                break;
            case "text-align":
                ParseCssTextAlign(styleValue, ref nextIndex, localProperties);
                break;
            case "vertical-align":
                ParseCssVerticalAlign(styleValue, ref nextIndex, localProperties);
                break;

            // Four-sided properties: top/right/bottom/left
            case "margin":
            case "padding":
            case "border-style":
            case "border-width":
            case "border-color":
                ParseCssRectangleProperty(styleValue, ref nextIndex, localProperties, styleName);
                break;
            case "border":
                ParseCssBorder(styleValue, ref nextIndex, localProperties);
                break;

            case "float":
                ParseCssFloat(styleValue, ref nextIndex, localProperties);
                break;
            case "clear":
                ParseCssClear(styleValue, ref nextIndex, localProperties);
                break;

            // Recognized but not converted yet.
            case "word-spacing":
            case "letter-spacing":
            case "display":
            case "border-top":
            case "border-right":
            case "border-left":
            case "border-bottom":
            // NOTE: css names for elementary border styles have side indications in the middle (top/bottom/left/right)
            // In our internal notation we intentionally put them at the end - to unify processing in ParseCssRectangleProperty method
            case "border-top-style":
            case "border-right-style":
            case "border-left-style":
            case "border-bottom-style":
            case "border-top-color":
            case "border-right-color":
            case "border-left-color":
            case "border-bottom-color":
            case "border-top-width":
            case "border-right-width":
            case "border-left-width":
            case "border-bottom-width":
                break;
            default:
                break;
        }
    }
}
// .................................................................
//
// Parsing CSS - Lexical Helpers
//
// .................................................................
// Advances nextIndex to the first non-whitespace character at or after its current
// position, or to the end of styleValue when only whitespace remains.
private static void ParseWhiteSpace(string styleValue, ref int nextIndex)
{
    for (; nextIndex < styleValue.Length; nextIndex++)
    {
        if (!char.IsWhiteSpace(styleValue[nextIndex]))
        {
            return;
        }
    }
}
// Skips whitespace, then checks whether styleValue continues with the given word at nextIndex.
// The match is rejected when the character right after the word is a letter or a digit,
// i.e. when the word is only the beginning of a longer token.
// On success nextIndex is advanced past the word and true is returned; otherwise false is
// returned and nextIndex stays where the whitespace skipping left it.
private static bool ParseWord(string word, string styleValue, ref int nextIndex)
{
    ParseWhiteSpace(styleValue, ref nextIndex);

    var offset = 0;
    while (offset < word.Length)
    {
        var position = nextIndex + offset;
        if (position >= styleValue.Length || styleValue[position] != word[offset])
        {
            return false;
        }

        offset++;
    }

    var following = nextIndex + word.Length;
    var wordContinues = following < styleValue.Length && char.IsLetterOrDigit(styleValue[following]);
    if (wordContinues)
    {
        return false;
    }

    nextIndex = following;
    return true;
}
// Tries the given words one after another, in array order, at the current position.
// Returns the first word that matches, with nextIndex advanced past it,
// or null when none of the words matches.
private static string ParseWordEnumeration(string[] words, string styleValue, ref int nextIndex)
{
    for (var i = 0; i < words.Length; i++)
    {
        var candidate = words[i];
        if (ParseWord(candidate, styleValue, ref nextIndex))
        {
            return candidate;
        }
    }

    return null;
}
// Same lookup as the string-returning overload, but stores a matched word in
// localProperties under attributeName. Nothing is stored when no word matches.
private static void ParseWordEnumeration(string[] words, string styleValue, ref int nextIndex,
    Hashtable localProperties, string attributeName)
{
    var matchedWord = ParseWordEnumeration(words, styleValue, ref nextIndex);
    if (matchedWord == null)
    {
        return;
    }

    localProperties[attributeName] = matchedWord;
}
// Parses a css size at the current position: an optional minus sign, digits (with optional
// '.' characters) and an optional unit from FontSizeUnits. A missing unit defaults to "px".
// Returns the size as number + unit, "0" when the value is negative and mustBeNonNegative
// is set, or null when no size starts here. When null is returned nextIndex is left right
// after any leading whitespace - in particular a lone '-' is not consumed.
private static string ParseCssSize(string styleValue, ref int nextIndex, bool mustBeNonNegative)
{
    ParseWhiteSpace(styleValue, ref nextIndex);
    var startIndex = nextIndex;

    // Scan with a private cursor so that a failed attempt does not move nextIndex.
    var scanIndex = startIndex;

    // Parse optional minus sign
    var isNegative = scanIndex < styleValue.Length && styleValue[scanIndex] == '-';
    if (isNegative)
    {
        scanIndex++;
    }

    if (scanIndex >= styleValue.Length || !char.IsDigit(styleValue[scanIndex]))
    {
        return null;
    }

    while (scanIndex < styleValue.Length &&
           (char.IsDigit(styleValue[scanIndex]) || styleValue[scanIndex] == '.'))
    {
        scanIndex++;
    }

    nextIndex = scanIndex;
    var number = styleValue.Substring(startIndex, nextIndex - startIndex);
    var unit = ParseWordEnumeration(FontSizeUnits, styleValue, ref nextIndex) ?? "px";
    if (mustBeNonNegative && isNegative)
    {
        return "0";
    }

    return number + unit;
}
// Parses a css size at the current position and, when one is found, stores it in
// localValues under propertyName. Nothing is stored when no size is found.
private static void ParseCssSize(string styleValue, ref int nextIndex, Hashtable localValues,
    string propertyName,
    bool mustBeNonNegative)
{
    var size = ParseCssSize(styleValue, ref nextIndex, mustBeNonNegative);
    if (size == null)
    {
        return;
    }

    localValues[propertyName] = size;
}
// Parses a css color at the current position and returns it, or null when no color is found.
// Supported forms:
//   #FF5B1A              - returned as written
//   rgb(255,91,26)       - returned as "#RRGGBB"
//   rgb(100%,53.5%,10%)  - returned as "#RRGGBB"
//   black | silver | ... - named colors, returned as written
//   system color names   - currently all reported as "black"
// Not handled yet: transparent (for background-color).
private static string ParseCssColor(string styleValue, ref int nextIndex)
{
    ParseWhiteSpace(styleValue, ref nextIndex);
    if (nextIndex >= styleValue.Length)
    {
        return null;
    }

    var startIndex = nextIndex;
    var character = styleValue[nextIndex];
    if (character == '#')
    {
        nextIndex++;
        while (nextIndex < styleValue.Length && IsHexDigit(styleValue[nextIndex]))
        {
            nextIndex++;
        }

        // A lone '#' without digits is not a color.
        return nextIndex > startIndex + 1
            ? styleValue.Substring(startIndex, nextIndex - startIndex)
            : null;
    }

    // string.Compare looks at no more than the characters that are available, so values
    // shorter than "rgb(" are handled without an out-of-range access.
    if (string.Compare(styleValue, nextIndex, "rgb(", 0, 4, System.StringComparison.OrdinalIgnoreCase) == 0)
    {
        var argumentsStart = nextIndex + 4;
        var closeIndex = styleValue.IndexOf(')', argumentsStart);
        var argumentsEnd = closeIndex < 0 ? styleValue.Length : closeIndex;

        // Consume the whole function, including ')' when present, even if it is malformed.
        nextIndex = closeIndex < 0 ? styleValue.Length : closeIndex + 1;
        return ConvertRgbArguments(styleValue.Substring(argumentsStart, argumentsEnd - argumentsStart));
    }

    if (char.IsLetter(character))
    {
        var color = ParseWordEnumeration(Colors, styleValue, ref nextIndex);
        if (color == null && ParseWordEnumeration(SystemColors, styleValue, ref nextIndex) != null)
        {
            // Implement smarter system color conversions into real colors
            color = "black";
        }

        return color;
    }

    return null;
}

// True for the characters 0-9, a-f and A-F.
private static bool IsHexDigit(char character)
{
    return ('0' <= character && character <= '9')
           || ('a' <= character && character <= 'f')
           || ('A' <= character && character <= 'F');
}

// Converts the comma-separated arguments of rgb(...) into "#RRGGBB".
// Each argument is a number (0..255) or a percentage (0%..100%); values outside the range
// are clamped. Returns null when there are not exactly three numeric arguments.
private static string ConvertRgbArguments(string arguments)
{
    var components = arguments.Split(',');
    if (components.Length != 3)
    {
        return null;
    }

    var hexColor = "#";
    foreach (var component in components)
    {
        var text = component.Trim();
        var isPercentage = text.EndsWith("%", System.StringComparison.Ordinal);
        if (isPercentage)
        {
            text = text.Substring(0, text.Length - 1);
        }

        double number;
        if (!double.TryParse(text, System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture, out number) || double.IsNaN(number))
        {
            return null;
        }

        if (isPercentage)
        {
            number = number * 255.0 / 100.0;
        }

        var channel = (int) System.Math.Round(System.Math.Max(0.0, System.Math.Min(255.0, number)));
        hexColor += channel.ToString("X2", System.Globalization.CultureInfo.InvariantCulture);
    }

    return hexColor;
}
// Parses a css color at the current position and, when one is found, stores it in
// localValues under propertyName. Nothing is stored when no color is found.
private static void ParseCssColor(string styleValue, ref int nextIndex, Hashtable localValues,
    string propertyName)
{
    var parsedColor = ParseCssColor(styleValue, ref nextIndex);
    if (parsedColor == null)
    {
        return;
    }

    localValues[propertyName] = parsedColor;
}
// Parses the value of the aggregated css "font" property. The parts are read in a fixed
// order - style, variant, weight, size, optional "/ line-height", family - and each part
// that is present is stored in localProperties under its own property name.
private static void ParseCssFont(string styleValue, Hashtable localProperties)
{
    var position = 0;
    ParseCssFontStyle(styleValue, ref position, localProperties);
    ParseCssFontVariant(styleValue, ref position, localProperties);
    ParseCssFontWeight(styleValue, ref position, localProperties);
    ParseCssSize(styleValue, ref position, localProperties, "font-size", /*mustBeNonNegative:*/true);

    // A '/' after the font size introduces the line height.
    ParseWhiteSpace(styleValue, ref position);
    var hasLineHeight = position < styleValue.Length && styleValue[position] == '/';
    if (hasLineHeight)
    {
        position++;
        ParseCssSize(styleValue, ref position, localProperties, "line-height", /*mustBeNonNegative:*/true);
    }

    ParseCssFontFamily(styleValue, ref position, localProperties);
}
// Stores a FontStyles keyword found at the current position as "font-style".
private static void ParseCssFontStyle(string styleValue, ref int nextIndex, Hashtable localProperties) =>
    ParseWordEnumeration(FontStyles, styleValue, ref nextIndex, localProperties, "font-style");

// Stores a FontVariants keyword found at the current position as "font-variant".
private static void ParseCssFontVariant(string styleValue, ref int nextIndex, Hashtable localProperties) =>
    ParseWordEnumeration(FontVariants, styleValue, ref nextIndex, localProperties, "font-variant");

// Stores a FontWeights keyword found at the current position as "font-weight".
private static void ParseCssFontWeight(string styleValue, ref int nextIndex, Hashtable localProperties) =>
    ParseWordEnumeration(FontWeights, styleValue, ref nextIndex, localProperties, "font-weight");
// Parses a css font-family list: generic family keywords, quoted names and unquoted names,
// separated by commas. The whole list is consumed, but only the first non-empty name is
// stored (without its quotes) in localProperties["font-family"]; a decision is still needed
// on what to do with the remaining names.
private static void ParseCssFontFamily(string styleValue, ref int nextIndex, Hashtable localProperties)
{
    string firstFontFamily = null;
    while (nextIndex < styleValue.Length)
    {
        // Try generic-family (this also skips leading whitespace)
        var fontFamily = ParseWordEnumeration(FontGenericFamilies, styleValue, ref nextIndex);
        if (fontFamily == null && nextIndex < styleValue.Length)
        {
            var character = styleValue[nextIndex];
            if (character == '"' || character == '\'')
            {
                // Quoted font family name: everything up to the matching quote
                nextIndex++;
                var startIndex = nextIndex;
                while (nextIndex < styleValue.Length && styleValue[nextIndex] != character)
                {
                    nextIndex++;
                }

                fontFamily = styleValue.Substring(startIndex, nextIndex - startIndex);

                // Step over the closing quote so it is not taken for the start of another name.
                if (nextIndex < styleValue.Length)
                {
                    nextIndex++;
                }
            }
            else
            {
                // Unquoted font family name: everything up to the next ',' or ';'
                var startIndex = nextIndex;
                while (nextIndex < styleValue.Length && styleValue[nextIndex] != ',' &&
                       styleValue[nextIndex] != ';')
                {
                    nextIndex++;
                }

                fontFamily = styleValue.Substring(startIndex, nextIndex - startIndex).Trim();
                if (fontFamily.Length == 0)
                {
                    fontFamily = null;
                }
            }
        }

        ParseWhiteSpace(styleValue, ref nextIndex);
        if (nextIndex < styleValue.Length && styleValue[nextIndex] == ',')
        {
            nextIndex++;
        }

        if (fontFamily == null)
        {
            // Nothing recognizable at this position: stop parsing the list.
            break;
        }

        // An empty quoted name ("") is skipped rather than stored.
        if (firstFontFamily == null && fontFamily.Length > 0)
        {
            firstFontFamily = fontFamily;
        }
    }

    if (firstFontFamily != null)
    {
        localProperties["font-family"] = firstFontFamily;
    }
}
// Parses the value of the css "list-style" property: any sequence of list-style-type,
// list-style-position and list-style-image values. Each recognized value is stored in
// localProperties under its own property name; parsing stops at the first value that is
// not recognized.
private static void ParseCssListStyle(string styleValue, Hashtable localProperties)
{
    var position = 0;
    while (position < styleValue.Length)
    {
        var type = ParseCssListStyleType(styleValue, ref position);
        if (type != null)
        {
            localProperties["list-style-type"] = type;
            continue;
        }

        var placement = ParseCssListStylePosition(styleValue, ref position);
        if (placement != null)
        {
            localProperties["list-style-position"] = placement;
            continue;
        }

        var image = ParseCssListStyleImage(styleValue, ref position);
        if (image == null)
        {
            // TODO: Process unrecognized list style value
            break;
        }

        localProperties["list-style-image"] = image;
    }
}
// Returns the ListStyleTypes keyword found at the current position, or null.
private static string ParseCssListStyleType(string styleValue, ref int nextIndex)
{
    return ParseWordEnumeration(ListStyleTypes, styleValue, ref nextIndex);
}

// Returns the ListStylePositions keyword found at the current position, or null.
private static string ParseCssListStylePosition(string styleValue, ref int nextIndex)
{
    return ParseWordEnumeration(ListStylePositions, styleValue, ref nextIndex);
}

// List style images are not parsed: always returns null and leaves nextIndex untouched.
private static string ParseCssListStyleImage(string styleValue, ref int nextIndex)
{
    return null;
}
// Parses a css text-decoration value. Every decoration after the first TextDecorations
// entry is first stored as "false" under "text-decoration-<name>"; each decoration listed
// in styleValue is then stored as "true". Parsing stops at "none" or at the first token
// that is not a known decoration.
private static void ParseCssTextDecoration(string styleValue, ref int nextIndex, Hashtable localProperties)
{
    // Set default text-decorations:none; (index 0, "none", has no flag of its own)
    for (var index = TextDecorations.Length - 1; index >= 1; index--)
    {
        localProperties["text-decoration-" + TextDecorations[index]] = "false";
    }

    // Parse list of decorations values
    while (nextIndex < styleValue.Length)
    {
        var decoration = ParseWordEnumeration(TextDecorations, styleValue, ref nextIndex);
        var isRealDecoration = decoration != null && decoration != "none";
        if (!isRealDecoration)
        {
            break;
        }

        localProperties["text-decoration-" + decoration] = "true";
    }
}
// Stores a TextTransforms keyword found at the current position as "text-transform".
private static void ParseCssTextTransform(string styleValue, ref int nextIndex, Hashtable localProperties) =>
    ParseWordEnumeration(TextTransforms, styleValue, ref nextIndex, localProperties, "text-transform");

// Stores a TextAligns keyword found at the current position as "text-align".
private static void ParseCssTextAlign(string styleValue, ref int nextIndex, Hashtable localProperties) =>
    ParseWordEnumeration(TextAligns, styleValue, ref nextIndex, localProperties, "text-align");

// Stores a VerticalAligns keyword found at the current position as "vertical-align".
// TODO: Parse percentage value for vertical-align style
private static void ParseCssVerticalAlign(string styleValue, ref int nextIndex, Hashtable localProperties) =>
    ParseWordEnumeration(VerticalAligns, styleValue, ref nextIndex, localProperties, "vertical-align");

// Stores a Floats keyword found at the current position as "float".
private static void ParseCssFloat(string styleValue, ref int nextIndex, Hashtable localProperties) =>
    ParseWordEnumeration(Floats, styleValue, ref nextIndex, localProperties, "float");

// Stores a Clears keyword found at the current position as "clear".
private static void ParseCssClear(string styleValue, ref int nextIndex, Hashtable localProperties) =>
    ParseWordEnumeration(Clears, styleValue, ref nextIndex, localProperties, "clear");
// .................................................................
//
// Parsing CSS margin and padding Properties
//
// .................................................................
// Generic parser for the four-valued properties margin, padding, border-width, border-style
// and border-color. Values are stored under propertyName + "-top" / "-right" / "-bottom" /
// "-left". Returns true when at least one value was parsed.
private static bool ParseCssRectangleProperty(string styleValue, ref int nextIndex, Hashtable localProperties,
    string propertyName)
{
    // CSS Spec:
    // If only one value is set, then the value applies to all four sides;
    // If two or three values are set, then missing value(s) are taken from the opposite side(s).
    // The order they are applied is: top/right/bottom/left
    Debug.Assert(propertyName == "margin" || propertyName == "padding" || propertyName == "border-width" ||
                 propertyName == "border-style" || propertyName == "border-color");

    // First value: applies to all four sides.
    var value = ParseCssRectangleValue(styleValue, ref nextIndex, propertyName);
    if (value == null)
    {
        return false;
    }

    localProperties[propertyName + "-top"] = value;
    localProperties[propertyName + "-bottom"] = value;
    localProperties[propertyName + "-right"] = value;
    localProperties[propertyName + "-left"] = value;

    // Second value: right and left.
    value = ParseCssRectangleValue(styleValue, ref nextIndex, propertyName);
    if (value == null)
    {
        return true;
    }

    localProperties[propertyName + "-right"] = value;
    localProperties[propertyName + "-left"] = value;

    // Third value: bottom.
    value = ParseCssRectangleValue(styleValue, ref nextIndex, propertyName);
    if (value == null)
    {
        return true;
    }

    localProperties[propertyName + "-bottom"] = value;

    // Fourth value: left.
    value = ParseCssRectangleValue(styleValue, ref nextIndex, propertyName);
    if (value != null)
    {
        localProperties[propertyName + "-left"] = value;
    }

    return true;
}

// Parses one value of a four-valued property: a color for border-color, a border style
// keyword for border-style, and a non-negative size for everything else.
private static string ParseCssRectangleValue(string styleValue, ref int nextIndex, string propertyName)
{
    if (propertyName == "border-color")
    {
        return ParseCssColor(styleValue, ref nextIndex);
    }

    if (propertyName == "border-style")
    {
        return ParseCssBorderStyle(styleValue, ref nextIndex);
    }

    return ParseCssSize(styleValue, ref nextIndex, /*mustBeNonNegative:*/true);
}
// .................................................................
//
// Parsing CSS border Properties
//
// .................................................................
// border: [ <border-width> || <border-style> || <border-color> ]
// Repeatedly tries a width, then a style, then a color at the current position, until a
// round recognizes none of them. Values are stored by ParseCssRectangleProperty.
private static void ParseCssBorder(string styleValue, ref int nextIndex, Hashtable localProperties)
{
    bool parsedSomething;
    do
    {
        // Short-circuiting matters: a later kind is only tried when the earlier ones fail.
        parsedSomething =
            ParseCssRectangleProperty(styleValue, ref nextIndex, localProperties, "border-width") ||
            ParseCssRectangleProperty(styleValue, ref nextIndex, localProperties, "border-style") ||
            ParseCssRectangleProperty(styleValue, ref nextIndex, localProperties, "border-color");
    } while (parsedSomething);
}

// Returns the BorderStyles keyword found at the current position, or null.
private static string ParseCssBorderStyle(string styleValue, ref int nextIndex)
{
    return ParseWordEnumeration(BorderStyles, styleValue, ref nextIndex);
}
// .................................................................
//
// Pasring CSS Background Properties
//
// .................................................................
// Placeholder for parsing the CSS "background" shorthand.
// The body is currently empty: styleValue is not read, nextIndex is left
// unchanged and nothing is added to localValues.
private static void ParseCssBackground(string styleValue, ref int nextIndex, Hashtable localValues)
{
// Implement parsing background attribute
}
}
}

View File

@@ -0,0 +1,26 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Xml;
namespace HtmlToXamlDemo
{
/// <summary>
/// An <see cref="XmlTextWriter"/> that escapes text with
/// <see cref="WebUtility.HtmlEncode(string)"/> and writes the encoded result
/// as raw markup instead of passing it through the base WriteString.
/// </summary>
public class HtmlEncodedTextWriter : XmlTextWriter
{
    /// <summary>Creates a writer that emits its markup to <paramref name="w"/>.</summary>
    public HtmlEncodedTextWriter(TextWriter w)
        : base(w)
    {
    }

    /// <summary>
    /// HTML-encodes <paramref name="text"/> and hands the encoded string to WriteRaw.
    /// </summary>
    public override void WriteString(string text) => WriteRaw(WebUtility.HtmlEncode(text));
}
}

View File

@@ -0,0 +1,577 @@
// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Xml;
namespace HtmlToXamlDemo
{
/// <summary>
/// HtmlFromXamlConverter is a static class that takes a XAML string
/// and converts it into HTML
/// </summary>
internal static class HtmlFromXamlConverter
{
// ---------------------------------------------------------------------
//
// Internal Methods
//
// ---------------------------------------------------------------------
#region Internal Methods
/// <summary>
/// Main entry point for the Xaml-to-Html converter.
/// Converts a xaml string into an html string.
/// </summary>
/// <param name="xamlString">
/// Xaml string to convert.
/// </param>
/// <returns>
/// Html string produced from the source xaml, or an empty string when
/// WriteFlowDocument reports that nothing could be converted.
/// </returns>
internal static string ConvertXamlToHtml(string xamlString)
{
    var htmlStringBuilder = new StringBuilder(100);

    // The reader and the writer chain are IDisposable; release them
    // deterministically, including when the conversion throws.
    using (var xamlReader = new XmlTextReader(new StringReader(xamlString)))
    using (var htmlWriter = new HtmlEncodedTextWriter(new StringWriter(htmlStringBuilder)))
    {
        if (!WriteFlowDocument(xamlReader, htmlWriter))
        {
            return "";
        }

        // Capture the result before the writer is closed so the returned
        // string is exactly what WriteFlowDocument produced.
        return htmlStringBuilder.ToString();
    }
}
#endregion Internal Methods
// ---------------------------------------------------------------------
//
// Private Methods
//
// ---------------------------------------------------------------------
#region Private Methods
/// <summary>
/// Converts the root level element of the XAML (expected to be FlowDocument)
/// into an html element wrapping a body element.
/// </summary>
/// <param name="xamlReader">
/// XmlTextReader for the source xaml; the first significant token is read here.
/// </param>
/// <param name="htmlWriter">
/// XmlTextWriter producing the resulting html.
/// </param>
/// <returns>
/// False when the reader yields no token or the first token is not a
/// FlowDocument element; true after the html has been written.
/// </returns>
private static bool WriteFlowDocument(XmlTextReader xamlReader, XmlTextWriter htmlWriter)
{
    // Empty xaml, or a root that is not <FlowDocument>: nothing to convert
    var rootIsFlowDocument = ReadNextToken(xamlReader)
                             && xamlReader.NodeType == XmlNodeType.Element
                             && xamlReader.Name == "FlowDocument";
    if (!rootIsFlowDocument)
    {
        return false;
    }

    // One shared buffer collects css properties for the inline style attribute;
    // it is re-initialized for every element level.
    var styleBuffer = new StringBuilder();

    htmlWriter.WriteStartElement("html");
    htmlWriter.WriteStartElement("body");

    WriteFormattingProperties(xamlReader, htmlWriter, styleBuffer);
    WriteElementContent(xamlReader, htmlWriter, styleBuffer);

    htmlWriter.WriteEndElement(); // body
    htmlWriter.WriteEndElement(); // html
    return true;
}
/// <summary>
/// Reads attributes of the current xaml element and converts
/// them into appropriate html attributes or css styles.
/// </summary>
/// <param name="xamlReader">
/// XmlTextReader which is expected to be at XmlNodeType.Element
/// (opening element tag) position.
/// The reader is moved back to the element once the attributes have been read.
/// </param>
/// <param name="htmlWriter">
/// XmlTextWriter for output html, which is expected to be in
/// after WriteStartElement state. Only colspan/rowspan are written to it directly.
/// </param>
/// <param name="inlineStyle">
/// String builder for collecting css properties for inline STYLE attribute.
/// It is emptied first and then filled with the css for this element.
/// </param>
private static void WriteFormattingProperties(XmlTextReader xamlReader, XmlTextWriter htmlWriter,
    StringBuilder inlineStyle)
{
    Debug.Assert(xamlReader.NodeType == XmlNodeType.Element);

    // Clear string builder for the inline style
    inlineStyle.Length = 0;
    if (!xamlReader.HasAttributes)
    {
        return;
    }

    var borderSet = false;
    while (xamlReader.MoveToNextAttribute())
    {
        string css = null;
        switch (xamlReader.Name)
        {
            // Character formatting properties
            // -------------------------------
            case "Background":
                css = "background-color:" + ParseXamlColor(xamlReader.Value) + ";";
                break;
            case "FontFamily":
                css = "font-family:" + xamlReader.Value + ";";
                break;
            case "FontStyle":
                // Invariant casing: a culture-sensitive ToLower() turns "Italic"
                // into a dotless-i spelling under e.g. the Turkish culture.
                css = "font-style:" + xamlReader.Value.ToLowerInvariant() + ";";
                break;
            case "FontWeight":
                css = "font-weight:" + xamlReader.Value.ToLowerInvariant() + ";";
                break;
            case "FontSize":
                css = "font-size:" + xamlReader.Value + ";";
                break;
            case "Foreground":
                css = "color:" + ParseXamlColor(xamlReader.Value) + ";";
                break;
            case "TextDecorations":
                css = "text-decoration:" + ConvertXamlTextDecorations(xamlReader.Value) + ";";
                break;

            // Paragraph formatting properties
            // -------------------------------
            case "Padding":
                css = "padding:" + ParseXamlThickness(xamlReader.Value) + ";";
                break;
            case "Margin":
                css = "margin:" + ParseXamlThickness(xamlReader.Value) + ";";
                break;
            case "BorderThickness":
                css = "border-width:" + ParseXamlThickness(xamlReader.Value) + ";";
                borderSet = true;
                break;
            case "BorderBrush":
                css = "border-color:" + ParseXamlColor(xamlReader.Value) + ";";
                borderSet = true;
                break;
            case "TextIndent":
                css = "text-indent:" + xamlReader.Value + ";";
                break;
            case "TextAlignment":
                css = "text-align:" + xamlReader.Value + ";";
                break;

            // Table attributes
            // ----------------
            case "Width":
                css = "width:" + xamlReader.Value + ";";
                break;
            case "ColumnSpan":
                htmlWriter.WriteAttributeString("colspan", xamlReader.Value);
                break;
            case "RowSpan":
                htmlWriter.WriteAttributeString("rowspan", xamlReader.Value);
                break;

            // Every other attribute produces no output. That includes the known
            // but unconverted ones: FontStretch, TextEffects, Emphasis,
            // StandardLigatures, Variants, Capitals, Fraction, LineHeight,
            // IsKeptTogether, IsKeptWithNext, ColumnBreakBefore, PageBreakBefore,
            // FlowDirection.
        }

        if (css != null)
        {
            inlineStyle.Append(css);
        }
    }

    if (borderSet)
    {
        // A width or color alone does not draw a border in html; force a solid one
        inlineStyle.Append("border-style:solid;mso-element:para-border-div;");
    }

    // Return the xamlReader back to element level
    xamlReader.MoveToElement();
    Debug.Assert(xamlReader.NodeType == XmlNodeType.Element);
}

/// <summary>
/// Maps the value of a xaml TextDecorations attribute to a css text-decoration value.
/// </summary>
/// <param name="xamlDecorations">
/// Attribute value; decoration names are matched case-insensitively and may be
/// separated by commas and/or spaces.
/// </param>
/// <returns>
/// The recognized decorations joined by spaces (Underline, Strikethrough and
/// OverLine map to underline, line-through and overline); "none" when only None
/// was given; otherwise "underline", which is what was always emitted before.
/// </returns>
private static string ConvertXamlTextDecorations(string xamlDecorations)
{
    var cssDecorations = new StringBuilder();
    var sawNone = false;
    foreach (var name in xamlDecorations.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries))
    {
        string cssName;
        switch (name.ToLowerInvariant())
        {
            case "underline":
                cssName = "underline";
                break;
            case "strikethrough":
                cssName = "line-through";
                break;
            case "overline":
                cssName = "overline";
                break;
            case "none":
                sawNone = true;
                continue;
            default:
                // No css counterpart (e.g. Baseline) or not a decoration name
                continue;
        }

        if (cssDecorations.Length > 0)
        {
            cssDecorations.Append(' ');
        }
        cssDecorations.Append(cssName);
    }

    if (cssDecorations.Length > 0)
    {
        return cssDecorations.ToString();
    }
    return sawNone ? "none" : "underline";
}
/// <summary>
/// Converts a xaml color value into a css color value by dropping the
/// alpha (transparency) component of hexadecimal colors.
/// </summary>
/// <param name="color">
/// Xaml color: a name or a hexadecimal value starting with '#'.
/// </param>
/// <returns>
/// "#RRGGBB" for "#AARRGGBB" and "#RGB" for "#ARGB"; any other value
/// (including "#RRGGBB", "#RGB" and color names) is returned unchanged.
/// </returns>
private static string ParseXamlColor(string color)
{
    if (color.StartsWith("#", StringComparison.Ordinal))
    {
        // Only the forms that actually carry an alpha component are shortened.
        // Stripping two characters unconditionally corrupted "#RRGGBB"/"#RGB"
        // and threw on values shorter than three characters.
        switch (color.Length)
        {
            case 9: // #AARRGGBB
                return "#" + color.Substring(3);
            case 5: // #ARGB
                return "#" + color.Substring(2);
        }
    }
    return color;
}
/// <summary>
/// Converts a xaml thickness value into the value of a css box shorthand
/// such as margin, padding or border-width.
/// </summary>
/// <param name="thickness">
/// Xaml thickness with one, two or four components separated by commas or,
/// when no comma is present, by whitespace.
/// </param>
/// <returns>
/// A single component is returned exactly as given. Two components
/// ("horizontal,vertical") become "vertical horizontal"; four components
/// ("left,top,right,bottom") become "top right bottom left". Any other count
/// yields the first component only. In the multi-component forms each value
/// is rounded up to a whole number, and a component that is not a number
/// becomes "1".
/// </returns>
private static string ParseXamlThickness(string thickness)
{
    var values = thickness.Split(',');
    if (values.Length == 1)
    {
        // No commas: the components may be separated by whitespace instead.
        // Without this, "1 2 3 4" was emitted verbatim, which css reads in
        // top-right-bottom-left order rather than xaml's left-top-right-bottom.
        var spaceSeparated = thickness.Split(new[] { ' ', '\t', '\r', '\n' },
            StringSplitOptions.RemoveEmptyEntries);
        if (spaceSeparated.Length > 1)
        {
            values = spaceSeparated;
        }
    }

    for (var i = 0; i < values.Length; i++)
    {
        if (double.TryParse(values[i], NumberStyles.Any, CultureInfo.InvariantCulture, out double value))
        {
            values[i] = Math.Ceiling(value).ToString(CultureInfo.InvariantCulture);
        }
        else
        {
            values[i] = "1";
        }
    }

    switch (values.Length)
    {
        case 1:
            // Uniform thickness is passed through untouched
            return thickness;
        case 2:
            return values[1] + " " + values[0];
        case 4:
            return values[1] + " " + values[2] + " " + values[3] + " " + values[0];
        default:
            return values[0];
    }
}
/// <summary>
/// Reads the content of the current xaml element and converts it to html:
/// child elements are converted recursively, comments and text are copied,
/// and the pending inline style is written as a "style" attribute just
/// before the first piece of content (or at once for an empty element).
/// </summary>
/// <param name="xamlReader">
/// XmlTextReader which is expected to be at XmlNodeType.Element
/// (opening element tag) position.
/// </param>
/// <param name="htmlWriter">
/// May be null, in which case we are skipping the xaml element
/// without producing any output to html.
/// </param>
/// <param name="inlineStyle">
/// StringBuilder used for collecting css properties for inline STYLE attribute.
/// It is only read when htmlWriter is not null.
/// </param>
private static void WriteElementContent(XmlTextReader xamlReader, XmlTextWriter htmlWriter,
    StringBuilder inlineStyle)
{
    Debug.Assert(xamlReader.NodeType == XmlNodeType.Element);

    if (xamlReader.IsEmptyElement)
    {
        // No content will follow, so the collected style has to be written now
        if (htmlWriter != null)
        {
            WritePendingStyleAttribute(htmlWriter, inlineStyle);
        }
        return;
    }

    var elementContentStarted = false;
    while (ReadNextToken(xamlReader) && xamlReader.NodeType != XmlNodeType.EndElement)
    {
        switch (xamlReader.NodeType)
        {
            case XmlNodeType.Element:
                if (xamlReader.Name.Contains("."))
                {
                    // Property in element notation; it may add to inlineStyle
                    AddComplexProperty(xamlReader, inlineStyle);
                }
                else
                {
                    if (htmlWriter != null && !elementContentStarted)
                    {
                        WritePendingStyleAttribute(htmlWriter, inlineStyle);
                    }
                    elementContentStarted = true;
                    WriteElement(xamlReader, htmlWriter, inlineStyle);
                }
                Debug.Assert(xamlReader.NodeType == XmlNodeType.EndElement ||
                             xamlReader.NodeType == XmlNodeType.Element && xamlReader.IsEmptyElement);
                break;

            case XmlNodeType.Comment:
                if (htmlWriter != null)
                {
                    if (!elementContentStarted)
                    {
                        WritePendingStyleAttribute(htmlWriter, inlineStyle);
                    }
                    htmlWriter.WriteComment(xamlReader.Value);
                }
                elementContentStarted = true;
                break;

            case XmlNodeType.CDATA:
            case XmlNodeType.Text:
            case XmlNodeType.SignificantWhitespace:
                if (htmlWriter != null)
                {
                    if (!elementContentStarted)
                    {
                        WritePendingStyleAttribute(htmlWriter, inlineStyle);
                    }
                    htmlWriter.WriteString(xamlReader.Value);
                }
                elementContentStarted = true;
                break;
        }
    }

    Debug.Assert(xamlReader.NodeType == XmlNodeType.EndElement);
}

/// <summary>
/// Writes the collected css as a lower-case "style" attribute (the empty-element
/// path used to write it as "STYLE") and empties the buffer. Does nothing when
/// no css has been collected.
/// </summary>
private static void WritePendingStyleAttribute(XmlTextWriter htmlWriter, StringBuilder inlineStyle)
{
    if (inlineStyle.Length > 0)
    {
        htmlWriter.WriteAttributeString("style", inlineStyle.ToString());
        inlineStyle.Remove(0, inlineStyle.Length);
    }
}
/// <summary>
/// Handles a property written in element notation. A property whose name ends
/// with ".TextDecorations" adds "text-decoration:underline;" to the inline
/// style; the property element itself is then skipped without html output.
/// </summary>
/// <param name="xamlReader">
/// On entry this XmlTextReader must be on Element start tag;
/// on exit - on EndElement tag.
/// </param>
/// <param name="inlineStyle">
/// StringBuilder containing a value for STYLE attribute; may be null,
/// in which case nothing is collected.
/// </param>
private static void AddComplexProperty(XmlTextReader xamlReader, StringBuilder inlineStyle)
{
    Debug.Assert(xamlReader.NodeType == XmlNodeType.Element);

    if (xamlReader.Name.EndsWith(".TextDecorations"))
    {
        inlineStyle?.Append("text-decoration:underline;");
    }

    // Consume the property element in skipping mode (no writer, no style buffer)
    WriteElementContent(xamlReader, null, null);
}
/// <summary>
/// Converts a xaml element into the matching html element, or skips it
/// (together with its content) when there is no match.
/// </summary>
/// <param name="xamlReader">
/// On entry this XmlTextReader must be on Element start tag;
/// on exit - on EndElement tag.
/// </param>
/// <param name="htmlWriter">
/// May be null, in which case we are skipping xaml content
/// without producing any html output
/// </param>
/// <param name="inlineStyle">
/// StringBuilder used for collecting css properties for inline STYLE attributes on every level.
/// </param>
private static void WriteElement(XmlTextReader xamlReader, XmlTextWriter htmlWriter, StringBuilder inlineStyle)
{
    Debug.Assert(xamlReader.NodeType == XmlNodeType.Element);

    // Stays null in skipping mode and for xaml elements without an html counterpart
    string htmlTag = null;
    if (htmlWriter != null)
    {
        switch (xamlReader.Name)
        {
            case "Run":
            case "Span":
            case "InlineUIContainer":
                htmlTag = "span";
                break;
            case "Bold":
                htmlTag = "b";
                break;
            case "Italic":
                htmlTag = "i";
                break;
            case "Paragraph":
                htmlTag = "p";
                break;
            case "BlockUIContainer":
            case "Section":
                htmlTag = "div";
                break;
            case "Table":
                htmlTag = "table";
                break;
            case "TableColumn":
                htmlTag = "col";
                break;
            case "TableRowGroup":
                htmlTag = "tbody";
                break;
            case "TableRow":
                htmlTag = "tr";
                break;
            case "TableCell":
                htmlTag = "td";
                break;
            case "ListItem":
                htmlTag = "li";
                break;
            case "List":
                // Bullet-like (or absent) marker styles give an unordered list
                switch (xamlReader.GetAttribute("MarkerStyle"))
                {
                    case null:
                    case "None":
                    case "Disc":
                    case "Circle":
                    case "Square":
                    case "Box":
                        htmlTag = "ul";
                        break;
                    default:
                        htmlTag = "ol";
                        break;
                }
                break;
        }
    }

    if (htmlTag == null)
    {
        // Recurse into the xaml element without any output
        WriteElementContent(xamlReader, null, null);
        return;
    }

    htmlWriter.WriteStartElement(htmlTag);
    WriteFormattingProperties(xamlReader, htmlWriter, inlineStyle);
    WriteElementContent(xamlReader, htmlWriter, inlineStyle);
    htmlWriter.WriteEndElement();
}
// Reader advance helpers
// ----------------------
/// <summary>
/// Advances xamlReader to the next node the converter cares about,
/// skipping everything else.
/// </summary>
/// <param name="xamlReader">
/// XmlReader the tokens are being read from.
/// </param>
/// <returns>
/// True if new token is available; false if end of stream reached.
/// </returns>
private static bool ReadNextToken(XmlReader xamlReader)
{
    while (xamlReader.Read())
    {
        Debug.Assert(xamlReader.ReadState == ReadState.Interactive,
            "Reader is expected to be in Interactive state (" + xamlReader.ReadState + ")");

        var nodeType = xamlReader.NodeType;

        // Markup, text and comments are always reported to the caller
        var isReported = nodeType == XmlNodeType.Element
                         || nodeType == XmlNodeType.EndElement
                         || nodeType == XmlNodeType.None
                         || nodeType == XmlNodeType.CDATA
                         || nodeType == XmlNodeType.Text
                         || nodeType == XmlNodeType.SignificantWhitespace
                         || nodeType == XmlNodeType.Comment;

        // Plain whitespace is reported only while xml:space="preserve" is in effect
        if (!isReported && nodeType == XmlNodeType.Whitespace)
        {
            isReported = xamlReader.XmlSpace == XmlSpace.Preserve;
        }

        if (isReported)
        {
            return true;
        }

        // Everything else is ignored: entity references (not resolved for now),
        // processing instructions, document type, xml declaration, ...
    }

    return false;
}
#endregion Private Methods
// ---------------------------------------------------------------------
//
// Private Fields
//
// ---------------------------------------------------------------------
#region Private Fields
#endregion Private Fields
}
}

View File

@@ -0,0 +1,745 @@
// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Diagnostics;
using System.IO;
using System.Text;
namespace HtmlToXamlDemo
{
/// <summary>
/// lexical analyzer class
/// recognizes tokens as groups of characters separated by arbitrary amounts of whitespace
/// also classifies tokens according to type
/// </summary>
internal class HtmlLexicalAnalyzer
{
// ---------------------------------------------------------------------
//
// Constructors
//
// ---------------------------------------------------------------------
#region Constructors
/// <summary>
/// Creates an analyzer over the given text: sets up the string reader and the
/// token buffer, gives the character and token state its starting values and
/// reads ahead so that NextCharacter already holds the first input character.
/// </summary>
/// <param name="inputTextString">
/// text string to be parsed for xml content
/// </param>
internal HtmlLexicalAnalyzer(string inputTextString)
{
    _inputStringReader = new StringReader(inputTextString);

    // Token state
    _nextToken = new StringBuilder(100);
    NextTokenType = HtmlTokenType.Text;
    _ignoreNextWhitespace = true;

    // Character state before anything has been consumed
    _previousCharacter = ' ';
    NextCharacter = ' ';
    _nextCharacterCode = 0;

    // Prime the one-character lookahead ...
    _lookAheadCharacterCode = _inputStringReader.Read();
    _lookAheadCharacter = (char) _lookAheadCharacterCode;

    // ... and shift it into NextCharacter so the property has a real value
    GetNextCharacter();
}
#endregion Constructors
// ---------------------------------------------------------------------
//
// Internal methods
//
// ---------------------------------------------------------------------
#region Internal Methods
/// <summary>
/// Reads the next token found outside of a tag and records its type in
/// NextTokenType and its text in the token buffer. The token is one of:
/// Eof, OpeningTagStart, ClosingTagStart, Comment or Text (skipped CDATA
/// blocks and unknown directives are reported as empty Text).
/// </summary>
internal void GetNextContentToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);
    _nextToken.Length = 0;

    if (IsAtEndOfStream)
    {
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    if (IsAtTagStart)
    {
        GetNextCharacter();
        if (NextCharacter != '/')
        {
            NextTokenType = HtmlTokenType.OpeningTagStart;
            _nextToken.Append("<");
            _ignoreNextWhitespace = true; // Whitespaces after opening tags are insignificant
        }
        else
        {
            _nextToken.Append("</");
            NextTokenType = HtmlTokenType.ClosingTagStart;
            // step over the slash
            GetNextCharacter();
            _ignoreNextWhitespace = false; // Whitespaces after closing tags are significant
        }
        return;
    }

    if (IsAtDirectiveStart)
    {
        // Move onto the '!' and let the following character pick the reader
        GetNextCharacter();
        switch (_lookAheadCharacter)
        {
            case '[':
                // cdata
                ReadDynamicContent();
                break;
            case '-':
                ReadComment();
                break;
            default:
                // something like DOCTYPE: skip till the next tag ender
                ReadUnknownDirective();
                break;
        }
        return;
    }

    // Plain text: collect characters until a tag, a directive or the end of input
    NextTokenType = HtmlTokenType.Text;
    while (!(IsAtTagStart || IsAtEndOfStream || IsAtDirectiveStart))
    {
        var atProcessingDirective =
            NextCharacter == '<' && !IsNextCharacterEntity && _lookAheadCharacter == '?';
        if (atProcessingDirective)
        {
            // ignore processing directive
            SkipProcessingDirective();
            continue;
        }

        if (NextCharacter > ' ')
        {
            _nextToken.Append(NextCharacter);
            _ignoreNextWhitespace = false;
        }
        else
        {
            // A run of whitespace/control characters collapses into one space,
            // and into nothing at all while whitespace is being ignored
            if (!_ignoreNextWhitespace)
            {
                _nextToken.Append(' ');
            }
            _ignoreNextWhitespace = true;
        }

        GetNextCharacter();
    }
}
/// <summary>
/// Unconditionally returns a token which is one of: TagEnd, EmptyTagEnd, Name, Atom or EndOfStream
/// Does not guarantee token reader advancing.
/// </summary>
/// <remarks>
/// Leading whitespace is skipped first. If only whitespace was left in the
/// input, the token is Eof.
/// </remarks>
internal void GetNextTagToken()
{
    _nextToken.Length = 0;
    if (IsAtEndOfStream)
    {
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    SkipWhiteSpace();
    if (IsAtEndOfStream)
    {
        // Skipping whitespace may have consumed the rest of the input (e.g. "<p  ").
        // Without this check the end-of-stream marker was reported as an Atom
        // and the following GetNextCharacter call threw InvalidOperationException.
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    if (NextCharacter == '>' && !IsNextCharacterEntity)
    {
        // &gt; should not end a tag, so make sure it's not an entity
        NextTokenType = HtmlTokenType.TagEnd;
        _nextToken.Append('>');
        GetNextCharacter();
        // Note: _ignoreNextWhitespace must be set appropriately on tag start processing
    }
    else if (NextCharacter == '/' && _lookAheadCharacter == '>')
    {
        // closing of an empty tag
        NextTokenType = HtmlTokenType.EmptyTagEnd;
        _nextToken.Append("/>");
        GetNextCharacter();
        GetNextCharacter();
        _ignoreNextWhitespace = false; // Whitespace after no-scope tags is significant
    }
    else if (IsGoodForNameStart(NextCharacter))
    {
        NextTokenType = HtmlTokenType.Name;
        // starts a name
        // we allow character entities here
        // we do not throw exceptions here if end of stream is encountered
        // just stop and return whatever is in the token
        while (IsGoodForName(NextCharacter) && !IsAtEndOfStream)
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }
    }
    else
    {
        // Unexpected type of token for a tag. Report one character as Atom, expecting that HtmlParser will ignore it.
        NextTokenType = HtmlTokenType.Atom;
        _nextToken.Append(NextCharacter);
        GetNextCharacter();
    }
}
/// <summary>
/// Always produces an EqualSign token with the text "=", whether or not
/// the input really has an equal sign at this position. A real equal sign
/// (after optional whitespace) is consumed; otherwise only the whitespace is.
/// </summary>
internal void GetNextEqualSignToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);

    NextTokenType = HtmlTokenType.EqualSign;
    _nextToken.Length = 0;
    _nextToken.Append('=');

    SkipWhiteSpace();
    if (NextCharacter != '=')
    {
        return;
    }

    // '=' is not in the list of entities, so no need to check for entities here
    GetNextCharacter();
}
/// <summary>
/// Always produces an Atom token holding an attribute value, which may be empty.
/// A value that starts with a quote runs to the matching quote or the end of
/// input; an unquoted value runs to the next whitespace, '>' or the end of input.
/// Does not guarantee token reader advancing.
/// </summary>
internal void GetNextAtomToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);
    _nextToken.Length = 0;
    SkipWhiteSpace();
    NextTokenType = HtmlTokenType.Atom;

    var startsWithQuote = (NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity;
    if (!startsWithQuote)
    {
        while (!(IsAtEndOfStream || char.IsWhiteSpace(NextCharacter) || NextCharacter == '>'))
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }
        return;
    }

    var quote = NextCharacter;
    GetNextCharacter();

    // Take everything up to the closing quote; a quote that came from an
    // entity is part of the value and does not close it
    while (!IsAtEndOfStream && (NextCharacter != quote || IsNextCharacterEntity))
    {
        _nextToken.Append(NextCharacter);
        GetNextCharacter();
    }

    if (NextCharacter == quote)
    {
        // step over the closing quote
        GetNextCharacter();
    }

    // NOTE: our recovery here is different from IE's
    // IE keeps reading until it finds a closing quote or end of file
    // if end of file, it treats current value as text
    // if it finds a closing quote at any point within the text, it eats everything between the quotes
    // TODO: Suggestion:
    // however, we could stop when we encounter end of file or an angle bracket of any kind
    // and assume there was a quote there
    // so the attribute value may be meaningless but it is never treated as text
}
#endregion Internal Methods
// ---------------------------------------------------------------------
//
// Internal Properties
//
// ---------------------------------------------------------------------
#region Internal Properties
/// <summary>
/// Type of the token produced by the most recent GetNext*Token call.
/// It can only be set from inside this class.
/// </summary>
internal HtmlTokenType NextTokenType { get; private set; }
/// <summary>
/// Text of the current token: a snapshot of the internal token buffer.
/// </summary>
internal string NextToken => _nextToken.ToString();
#endregion Internal Properties
// ---------------------------------------------------------------------
//
// Private methods
//
// ---------------------------------------------------------------------
#region Private Methods
/// <summary>
/// Advances a reading position by one character code
/// and reads the next available character from a stream.
/// This character becomes available as NextCharacter property.
/// A character entity starting here (numeric "&amp;#DDDD;" or named
/// "&amp;name;") is decoded into the single character it stands for and
/// IsNextCharacterEntity is set.
/// </summary>
/// <remarks>
/// Throws InvalidOperationException if attempted to be called on EndOfStream
/// condition.
/// An ampersand followed by '#' or a letter that does not turn out to be a
/// well-formed, known entity is dropped together with the characters read for
/// it; reading continues with the character that follows.
/// </remarks>
private void GetNextCharacter()
{
    if (_nextCharacterCode == -1)
    {
        throw new InvalidOperationException("GetNextCharacter method called at the end of a stream");
    }

    _previousCharacter = NextCharacter;
    NextCharacter = _lookAheadCharacter;
    _nextCharacterCode = _lookAheadCharacterCode;
    // next character not an entity as of now
    IsNextCharacterEntity = false;
    ReadLookAheadCharacter();

    if (NextCharacter == '&')
    {
        if (_lookAheadCharacter == '#')
        {
            // numeric entity - parse digits - &#DDDDD;
            var entityCode = 0;
            ReadLookAheadCharacter();
            // largest numeric entity is 7 characters
            for (var i = 0; i < 7 && char.IsDigit(_lookAheadCharacter); i++)
            {
                entityCode = 10*entityCode + (_lookAheadCharacterCode - '0');
                ReadLookAheadCharacter();
            }

            if (_lookAheadCharacter == ';')
            {
                // correct format - advance
                ReadLookAheadCharacter();
                // A code that does not fit into a char becomes '?'
                // (a plain cast would silently keep only the low 16 bits)
                NextCharacter = entityCode <= char.MaxValue ? (char) entityCode : '?';
                _nextCharacterCode = NextCharacter;
                // as far as we are concerned, this is an entity
                IsNextCharacterEntity = true;
            }
            else
            {
                // not an entity, set next character to the current lookahead character
                // we would have eaten up some digits
                NextCharacter = _lookAheadCharacter;
                _nextCharacterCode = _lookAheadCharacterCode;
                ReadLookAheadCharacter();
                IsNextCharacterEntity = false;
            }
        }
        else if (char.IsLetter(_lookAheadCharacter))
        {
            // entity is written as a string
            var entity = "";
            // maximum length of string entities is 10 characters
            for (var i = 0;
                i < 10 && (char.IsLetter(_lookAheadCharacter) || char.IsDigit(_lookAheadCharacter));
                i++)
            {
                entity += _lookAheadCharacter;
                ReadLookAheadCharacter();
            }

            if (_lookAheadCharacter == ';')
            {
                // advance
                ReadLookAheadCharacter();
                if (HtmlSchema.IsEntity(entity))
                {
                    NextCharacter = HtmlSchema.EntityCharacterValue(entity);
                    _nextCharacterCode = NextCharacter;
                    IsNextCharacterEntity = true;
                }
                else
                {
                    // just skip the whole thing - invalid entity
                    // move on to the next character
                    NextCharacter = _lookAheadCharacter;
                    _nextCharacterCode = _lookAheadCharacterCode;
                    ReadLookAheadCharacter();
                    // not an entity
                    IsNextCharacterEntity = false;
                }
            }
            else
            {
                // skip whatever we read after the ampersand
                // set next character and move on
                NextCharacter = _lookAheadCharacter;
                // Keep the code in step with the character, as every other branch does.
                // Otherwise input ending in e.g. "&T" left the code at '&', so
                // IsAtEndOfStream stayed false and the end-of-stream marker
                // was handed out as if it were a real character.
                _nextCharacterCode = _lookAheadCharacterCode;
                ReadLookAheadCharacter();
                IsNextCharacterEntity = false;
            }
        }
    }
}
// Pulls one more character from the input into the lookahead pair
// (_lookAheadCharacterCode / _lookAheadCharacter). Once the lookahead code
// is -1 the reader is not touched again.
private void ReadLookAheadCharacter()
{
    if (_lookAheadCharacterCode == -1)
    {
        return;
    }

    var code = _inputStringReader.Read();
    _lookAheadCharacterCode = code;
    _lookAheadCharacter = (char) code;
}
/// <summary>
/// skips whitespace in the input string, together with any processing
/// instructions, comments, directives and CDATA blocks found on the way
/// leaves the first non-whitespace character available in the NextCharacter property
/// this may be the end-of-file character, it performs no checking
/// </summary>
private void SkipWhiteSpace()
{
    // TODO: handle character entities while processing comments, cdata, and directives
    // TODO: SUGGESTION: we could check if lookahead and previous characters are entities also
    while (true)
    {
        if (NextCharacter == '<' && (_lookAheadCharacter == '?' || _lookAheadCharacter == '!'))
        {
            GetNextCharacter();
            if (_lookAheadCharacter == '[')
            {
                // Skip CDATA block and DTDs(?)
                while (!IsAtEndOfStream &&
                       !(_previousCharacter == ']' && NextCharacter == ']' && _lookAheadCharacter == '>'))
                {
                    GetNextCharacter();
                }

                // The loop stops ON the second ']' with '>' still in the lookahead.
                // Step onto the '>' so that the check below consumes it; before,
                // that check could never succeed and "]>" was left in the input.
                if (!IsAtEndOfStream)
                {
                    GetNextCharacter();
                }
            }
            else
            {
                // Skip processing instruction, comments
                while (!IsAtEndOfStream && NextCharacter != '>')
                {
                    GetNextCharacter();
                }
            }

            // Move past the terminating '>'
            if (NextCharacter == '>')
            {
                GetNextCharacter();
            }
        }

        if (!char.IsWhiteSpace(NextCharacter))
        {
            break;
        }
        GetNextCharacter();
    }
}
/// <summary>
/// tells whether a name may begin with the given character:
/// an underscore or any letter qualifies
/// </summary>
/// <param name="character">
/// character value to be checked
/// </param>
/// <returns>
/// true if the character can be the first character in a name
/// false otherwise
/// </returns>
private bool IsGoodForNameStart(char character)
{
    if (character == '_')
    {
        return true;
    }
    return char.IsLetter(character);
}
/// <summary>
/// tells whether the given character may appear inside a name after its
/// first character: '.', '-', ':', digits, anything accepted by
/// IsGoodForNameStart, and whatever IsCombiningCharacter and IsExtender accept
/// </summary>
/// <param name="character">
/// character to be checked for validity in a name
/// </param>
/// <returns>
/// true if the character can be a valid part of a name
/// </returns>
private bool IsGoodForName(char character)
{
    switch (character)
    {
        case '.':
        case '-':
        case ':':
            return true;
    }

    return IsGoodForNameStart(character)
           || char.IsDigit(character)
           || IsCombiningCharacter(character)
           || IsExtender(character);
}
/// <summary>
/// identifies a character as being a combining character, permitted in a name
/// TODO: placeholder that currently rejects every character; to be replaced with
/// comparisons against the list of combining characters in the XML documentation
/// </summary>
/// <param name="character">
/// character to be checked
/// </param>
/// <returns>
/// always false for now
/// </returns>
private bool IsCombiningCharacter(char character)
{
    return false;
}
/// <summary>
/// identifies a character as being an extender, permitted in a name
/// TODO: placeholder that currently rejects every character; to be replaced with
/// comparisons against the list of extenders in the XML documentation
/// </summary>
/// <param name="character">
/// character to be checked
/// </param>
/// <returns>
/// always false for now
/// </returns>
private bool IsExtender(char character)
{
    return false;
}
/// <summary>
/// skips dynamic content (CDATA included) that starts with "&lt;![" and runs
/// up to the first "]&gt;"; the result is reported as an empty Text token
/// </summary>
private void ReadDynamicContent()
{
    // verify that we are at dynamic content, which may include CDATA
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '[');

    // Let's treat this as empty text
    _nextToken.Length = 0;
    NextTokenType = HtmlTokenType.Text;

    // Two steps forward: onto the '[' and then onto the first character after it
    GetNextCharacter();
    GetNextCharacter();

    // NOTE: 10/12/2004: some directives start with "<![" and simply end with "]>",
    // so scanning stops at the first "]>" rather than at "]]>". Consequently
    // neither CDATA nor anything else inside "<![ ... ]>" can contain "]>".
    while (!IsAtEndOfStream && !(NextCharacter == ']' && _lookAheadCharacter == '>'))
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // first onto the closing '>' ...
    GetNextCharacter();
    // ... then past it
    GetNextCharacter();
}
/// <summary>
/// reads a comment starting with "&lt;!-" into a Comment token
/// NOTE: 10/06/2004: the comment is taken to end at "--&gt;" or at "!&gt;",
/// because in practice many html pages do not use the full comment
/// specifying conventions
/// a comment that is never closed runs to the end of the input
/// </summary>
private void ReadComment()
{
    // verify that we are at a comment
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '-');

    // Initialize a token
    NextTokenType = HtmlTokenType.Comment;
    _nextToken.Length = 0;

    // advance to the next character, so that to be at the start of comment value
    GetNextCharacter(); // get first '-'
    if (!IsAtEndOfStream)
    {
        GetNextCharacter(); // get second '-'
    }
    if (!IsAtEndOfStream)
    {
        GetNextCharacter(); // get first character of comment content
    }

    while (true)
    {
        // Read text until end of comment
        // Note that in many actual html pages comments end with "!>" (while xml standard is "-->")
        while (!IsAtEndOfStream &&
               !(NextCharacter == '-' && _lookAheadCharacter == '-' ||
                 NextCharacter == '!' && _lookAheadCharacter == '>'))
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }

        if (IsAtEndOfStream)
        {
            // Unterminated comment: keep what was collected. Advancing any
            // further would make GetNextCharacter throw InvalidOperationException.
            break;
        }

        // Finish comment reading
        GetNextCharacter();
        if (_previousCharacter == '-' && NextCharacter == '-' && _lookAheadCharacter == '>')
        {
            // Standard comment end. Eat it and exit the loop
            GetNextCharacter(); // get '>'
            break;
        }
        if (_previousCharacter == '!' && NextCharacter == '>')
        {
            // Nonstandard but possible comment end - '!>'. Exit the loop
            break;
        }

        // Not an end. Save character and continue reading
        _nextToken.Append(_previousCharacter);
    }

    // Read end of comment combination
    if (NextCharacter == '>')
    {
        GetNextCharacter();
    }
}
/// <summary>
/// skips a directive that starts with "&lt;!" but is neither a comment nor
/// CDATA (DOCTYPE and the like, which we do not presently support)
/// everything up to and including the next "&gt;" is discarded and the
/// result is reported as an empty Text token
/// </summary>
private void ReadUnknownDirective()
{
    // verify that we are at an unknown directive
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' &&
                 !(_lookAheadCharacter == '-' || _lookAheadCharacter == '['));

    // Let's treat this as empty text
    _nextToken.Length = 0;
    NextTokenType = HtmlTokenType.Text;

    // step off the '!'
    GetNextCharacter();

    // run to the first '>' that does not come from an entity
    while (!IsAtEndOfStream && (NextCharacter != '>' || IsNextCharacterEntity))
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // advance past the tag end
    GetNextCharacter();
}
/// <summary>
/// skips a processing directive that starts with "&lt;?" and ends with "?&gt;"
/// NOTE: 10/14/2004: IE also ends processing directives with "/&gt;", so that
/// ending is recognized as well
/// </summary>
private void SkipProcessingDirective()
{
    // verify that we are at a processing directive
    Debug.Assert(NextCharacter == '<' && _lookAheadCharacter == '?');

    // Two steps forward: onto the '?' and then onto the first character of the directive
    GetNextCharacter();
    GetNextCharacter();

    // we don't need to check for entities here because '?' is not an entity
    // and even though > is an entity there is no entity processing when reading lookahead character
    while (!IsAtEndOfStream)
    {
        var atDirectiveEnd = (NextCharacter == '?' || NextCharacter == '/') && _lookAheadCharacter == '>';
        if (atDirectiveEnd)
        {
            break;
        }
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // first onto the closing '>' ...
    GetNextCharacter();
    // ... then past it
    GetNextCharacter();
}
#endregion Private Methods
// ---------------------------------------------------------------------
//
// Private Properties
//
// ---------------------------------------------------------------------
#region Private Properties
// Character currently under examination (read from the input, not yet part of a token).
private char NextCharacter { get; set; }

// True once the current character code is -1, i.e. the input is exhausted.
private bool IsAtEndOfStream
{
    get { return _nextCharacterCode == -1; }
}

// True when a non-entity '<' is followed by '/' or by a character that may start a name.
private bool IsAtTagStart
{
    get
    {
        return NextCharacter == '<' &&
               (_lookAheadCharacter == '/' || IsGoodForNameStart(_lookAheadCharacter)) &&
               !IsNextCharacterEntity;
    }
}

// True when positioned on a non-entity '>' or on the '/' of a "/>" pair.
private bool IsAtTagEnd
{
    get
    {
        return (NextCharacter == '>' || (NextCharacter == '/' && _lookAheadCharacter == '>')) &&
               !IsNextCharacterEntity;
    }
}

// True when positioned on a non-entity '<' that is immediately followed by '!'.
private bool IsAtDirectiveStart
{
    get { return NextCharacter == '<' && _lookAheadCharacter == '!' && !IsNextCharacterEntity; }
}

// Flags whether the current character is an entity; the markup checks above all
// require this to be false before treating '<' or '>' as markup.
private bool IsNextCharacterEntity { get; set; }
#endregion Private Properties
// ---------------------------------------------------------------------
//
// Private Fields
//
// ---------------------------------------------------------------------
#region Private Fields
// string reader which will move over input text
private readonly StringReader _inputStringReader;
// next character code read from input that is not yet part of any token
// and the character it represents.
// A code of -1 means the end of the input has been reached (see IsAtEndOfStream).
private int _nextCharacterCode;
// code of the character that follows the current one (one character of look-ahead)
// NOTE(review): presumably -1 at end of input, like _nextCharacterCode - confirm
private int _lookAheadCharacterCode;
// look-ahead character itself; consulted to recognize two-character sequences
// such as "</", "/>", "<!" and "<?"
private char _lookAheadCharacter;
// character that preceded the current one; used while reading comments and directives
private char _previousCharacter;
// NOTE(review): presumably tells the reader to drop the next whitespace it meets;
// the code using it is outside this part of the file - confirm
private bool _ignoreNextWhitespace;
// store token and type in local variables before copying them to output parameters
private readonly StringBuilder _nextToken;
#endregion Private Fields
}
}

View File

@@ -0,0 +1,539 @@
// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
// StringBuilder
// important TODOS:
// TODO 1. Start tags: The ParseXmlElement function has been modified to be called after both the
// angle bracket < and element name have been read, instead of just the < bracket and some valid name character,
// previously the case. This change was made so that elements with optional closing tags could read a new
// element's start tag and decide whether they were required to close. However, there is a question of whether to
// handle this in the parser or lexical analyzer. It is currently handled in the parser - the lexical analyzer still
// recognizes a start tag opener as a '<' + valid name start char; it is the parser that reads the actual name.
// this is correct behavior assuming that the name is a valid html name, because the lexical analyzer should not know anything
// about optional closing tags, etc. UPDATED: 10/13/2004: I am updating this to read the whole start tag of something
// that is not an HTML, treat it as empty, and add it to the tree. That way the converter will know it's there, but
// it will have no content. We could also partially recover by trying to look up and match names if they are similar
// TODO 2. Invalid element names: However, it might make sense to give the lexical analyzer the ability to identify
// a valid html element name and not return something as a start tag otherwise. For example, if we type <good>, should
// the lexical analyzer return that it has found the start of an element when this is not the case in HTML? But this will
// require implementing a lookahead token in the lexical analyzer so that it can treat an invalid element name as text. One
// character of lookahead will not be enough.
// TODO 3. Attributes: The attribute recovery is poor when reading attribute values in quotes - if no closing quotes are found,
// the lexical analyzer just keeps reading and if it eventually reaches the end of file, it would have just skipped everything.
// There are a couple of ways to deal with this: 1) stop reading attributes when we encounter a '>' character - this doesn't allow
// the '>' character to be used in attribute values, but it can still be used as an entity. 2) Maintain a HTML-specific list
// of attributes and their values that each html element can take, and if we find correct attribute names and values for an
// element we use them regardless of the quotes, this way we could just ignore something invalid. One more option: 3) Read ahead
// in the quoted value and if we find an end of file, we can return to where we were and process as text. However this requires
// a lot of lookahead and a resettable reader.
// TODO 4: elements with optional closing tags: For elements with optional closing tags, we always close the element if we find
// that one of its ancestors has closed. This condition may be too broad and we should develop a better heuristic. We should also
// improve the heuristics for closing certain elements when the next element starts
// TODO 5. Nesting: Support for unbalanced nesting, e.g. <b> <i> </b> </i>: this is not presently supported. To support it we may need
// to maintain two xml elements, one the element that represents what has already been read and another represents what we are presently reading.
// Then if we encounter an unbalanced nesting tag we could close the element that was supposed to close, save the current element
// and store it in the list of already-read content, and then open a new element to which all tags that are currently open
// can be applied. Is there a better way to do this? Should we do it at all?
// TODO 6. Elements with optional starting tags: there are 4 such elements in the HTML 4 specification - html, tbody, body and head.
// The current recovery doesn't do anything for any of these elements except the html element, because it's not critical - head
// and body elements can be contained within html element, and tbody is contained within table. To extend this for XHTML
// extensions, and to recover in case other elements are missing start tags, we would need to insert an extra recursive call
// to ParseXmlElement for the missing start tag. It is suggested to do this by giving ParseXmlElement an argument that specifies
// a name to use. If this argument is null, it assumes its name is the next token from the lexical analyzer and continues
// exactly as it does now. However, if the argument contains a valid html element name then it takes that value as its name
// and continues as before. This way, if the next token is the element that should actually be its child, it will see
// the name in the next step and initiate a recursive call. We would also need to add some logic in the loop for when a start tag
// is found - if the start tag is not compatible with current context and indicates that a start tag has been missed, then we
// can initiate the extra recursive call and give it the name of the missed start tag. The issues are when to insert this logic,
// and if we want to support it over multiple missing start tags. If we insert it at the time a start tag is read in element
// text, then we can support only one missing start tag, since the extra call will read the next start tag and make a recursive
// call without checking the context. This is a conceptual problem, and the check should be made just before a recursive call,
// with the choice being whether we should supply an element name as argument, or leave it as NULL and read from the input
// TODO 7: Context: Is it appropriate to keep track of context here? For example, should we only expect td, tr elements when
// reading a table and ignore them otherwise? This may be too much of a load on the parser, I think it's better if the converter
// deals with it
namespace HtmlToXamlDemo
{
/// <summary>
/// HtmlParser class accepts a string of possibly badly formed Html, parses it and returns a string
/// of well-formed Html that is as close to the original string in content as possible
/// </summary>
internal class HtmlParser
{
// ---------------------------------------------------------------------
//
// Constructors
//
// ---------------------------------------------------------------------
#region Constructors
/// <summary>
/// Creates a parser over the given input: sets up the output document, the two element
/// stacks and the lexical analyzer, then primes the analyzer with its first content token.
/// </summary>
/// <param name="inputString">
/// string to be parsed into well-formed Html
/// </param>
private HtmlParser(string inputString)
{
    // Bookkeeping for elements that are open / waiting to be opened
    _openedElements = new Stack<XmlElement>();
    _pendingInlineElements = new Stack<XmlElement>();

    // Every element of the result tree is created from this document
    _document = new XmlDocument();

    // Tokenizer over the input; fetch the first token right away (text is expected)
    _htmlLexicalAnalyzer = new HtmlLexicalAnalyzer(inputString);
    _htmlLexicalAnalyzer.GetNextContentToken();
}
#endregion Constructors
// ---------------------------------------------------------------------
//
// Internal Methods
//
// ---------------------------------------------------------------------
#region Internal Methods
/// <summary>
/// Parses a string of possibly badly-formed Html into an element tree, using a
/// parser instance created for this one call.
/// </summary>
/// <param name="htmlString">
/// Input string of possibly badly-formed Html to be parsed into well-formed Html
/// </param>
/// <returns>
/// XmlElement that is the root of the parsed tree
/// </returns>
internal static XmlElement ParseHtml(string htmlString)
    => new HtmlParser(htmlString).ParseHtmlContent();
// .....................................................................
//
// Html Header on Clipboard
//
// .....................................................................
// Html header structure.
// Each offset is written as a zero-padded 10-digit number ("D10" in the format string below):
// Version:1.0
// StartHTML:0000000000
// EndHTML:0000000000
// StartFragment:0000000000
// EndFragment:0000000000
// StartSelection:0000000000
// EndSelection:0000000000
// Format arguments {0}..{5} are, in order: StartHTML, EndHTML, StartFragment, EndFragment,
// StartSelection, EndSelection.
internal const string HtmlHeader =
"Version:1.0\r\nStartHTML:{0:D10}\r\nEndHTML:{1:D10}\r\nStartFragment:{2:D10}\r\nEndFragment:{3:D10}\r\nStartSelection:{4:D10}\r\nEndSelection:{5:D10}\r\n";
// Comment markers that delimit the fragment inside the html body
internal const string HtmlStartFragmentComment = "<!--StartFragment-->";
internal const string HtmlEndFragmentComment = "<!--EndFragment-->";
/// <summary>
/// Extracts Html string from clipboard data by parsing header information in htmlDataString
/// </summary>
/// <remarks>
/// The StartHTML and EndHTML header values are used as character indices into
/// <paramref name="htmlDataString"/>. They may be written with any number of digits.
/// A missing, unparsable or inconsistent header yields the error string instead of an exception.
/// NOTE(review): the clipboard format may define these offsets in bytes rather than
/// characters; that would only matter for non-ASCII content - confirm against the format spec.
/// </remarks>
/// <param name="htmlDataString">
/// String representing Html clipboard data. This includes Html header
/// </param>
/// <returns>
/// String containing only the Html data part of htmlDataString, without header,
/// or a string starting with "ERROR:" when the header cannot be understood
/// </returns>
internal static string ExtractHtmlFromClipboardData(string htmlDataString)
{
    const string unrecognizedHeader = "ERROR: Urecognized html header";

    if (htmlDataString == null)
    {
        return unrecognizedHeader;
    }

    int startHtmlIndex;
    if (!TryReadHeaderOffset(htmlDataString, "StartHTML:", out startHtmlIndex) ||
        startHtmlIndex < 0 || startHtmlIndex > htmlDataString.Length)
    {
        return unrecognizedHeader;
    }

    int endHtmlIndex;
    if (!TryReadHeaderOffset(htmlDataString, "EndHTML:", out endHtmlIndex))
    {
        return unrecognizedHeader;
    }

    // Tolerate an end offset that points past the data we actually received
    if (endHtmlIndex > htmlDataString.Length)
    {
        endHtmlIndex = htmlDataString.Length;
    }

    // An end offset before the start offset cannot describe a valid range
    if (endHtmlIndex < startHtmlIndex)
    {
        return unrecognizedHeader;
    }

    return htmlDataString.Substring(startHtmlIndex, endHtmlIndex - startHtmlIndex);
}

// Finds "label" in the header and parses the integer that follows it (optional leading
// whitespace, optional sign, then digits). Returns false when the label or number is missing.
private static bool TryReadHeaderOffset(string htmlDataString, string label, out int offset)
{
    offset = 0;

    var labelIndex = htmlDataString.IndexOf(label, StringComparison.Ordinal);
    if (labelIndex < 0)
    {
        return false;
    }

    var position = labelIndex + label.Length;
    while (position < htmlDataString.Length && char.IsWhiteSpace(htmlDataString[position]))
    {
        position++;
    }

    var numberStart = position;
    if (position < htmlDataString.Length &&
        (htmlDataString[position] == '-' || htmlDataString[position] == '+'))
    {
        position++;
    }
    while (position < htmlDataString.Length &&
           htmlDataString[position] >= '0' && htmlDataString[position] <= '9')
    {
        position++;
    }

    // Invariant culture: the header is machine-readable, not user-facing text.
    // TryParse also rejects an empty digit run and values that overflow int.
    return int.TryParse(
        htmlDataString.Substring(numberStart, position - numberStart),
        System.Globalization.NumberStyles.AllowLeadingSign,
        System.Globalization.CultureInfo.InvariantCulture,
        out offset);
}
/// <summary>
/// Prepends the Html clipboard header to an Html string so that it can be placed on clipboard
/// </summary>
/// <param name="htmlString">
/// Html string to be placed on clipboard with appropriate header
/// </param>
/// <returns>
/// The formatted header followed by htmlString. Fragment (and selection) offsets point
/// just after the StartFragment comment and at the EndFragment comment when those are
/// present, and otherwise span the whole Html string
/// </returns>
internal static string AddHtmlClipboardHeader(string htmlString)
{
    // Each of the 6 "{n:D10}" placeholders (7 characters) is rendered as 10 digits
    // ("0123456789"), so the rendered header is 6 * 3 characters longer than the format string.
    var startHtml = HtmlHeader.Length + 6*("0123456789".Length - "{0:D10}".Length);
    var endHtml = startHtml + htmlString.Length;

    // The fragment begins right after the start marker, or at the start of the html when absent
    var startMarkerPosition = htmlString.IndexOf(HtmlStartFragmentComment, 0, StringComparison.Ordinal);
    var startFragment = startMarkerPosition >= 0
        ? startHtml + startMarkerPosition + HtmlStartFragmentComment.Length
        : startHtml;

    // The fragment ends where the end marker begins, or at the end of the html when absent
    var endMarkerPosition = htmlString.IndexOf(HtmlEndFragmentComment, 0, StringComparison.Ordinal);
    var endFragment = endMarkerPosition >= 0
        ? startHtml + endMarkerPosition
        : endHtml;

    // Header first (the selection reuses the fragment offsets), then the html body
    var clipboardText = new StringBuilder();
    clipboardText.AppendFormat(HtmlHeader, startHtml, endHtml, startFragment, endFragment, startFragment,
        endFragment);
    clipboardText.Append(htmlString);
    return clipboardText.ToString();
}
#endregion Internal Methods
// ---------------------------------------------------------------------
//
// Private methods
//
// ---------------------------------------------------------------------
#region Private Methods
/// <summary>
/// Enforces an internal invariant of the parser in all build configurations
/// (unlike Debug.Assert, which is compiled out of release builds).
/// </summary>
/// <param name="condition">the invariant; must be true</param>
/// <param name="message">description appended to "Assertion error: " when the invariant is violated</param>
/// <exception cref="InvalidOperationException">condition is false</exception>
private static void InvariantAssert(bool condition, string message)
{
    if (!condition)
    {
        // A specific exception type instead of bare System.Exception; handlers that
        // catch Exception keep working because InvalidOperationException derives from it.
        throw new InvalidOperationException("Assertion error: " + message);
    }
}
/// <summary>
/// Parses the stream of html tokens starting
/// from the name of top-level element.
/// Returns XmlElement representing the top-level
/// html element
/// </summary>
/// <remarks>
/// Element names are lower-cased with the invariant culture, so that schema lookups
/// (which use lower-case ASCII names) do not depend on the current thread culture.
/// </remarks>
private XmlElement ParseHtmlContent()
{
    // Create artificial root element to be able to group multiple top-level elements.
    // We create "html" element which may be a duplicate of real HTML element, which is ok,
    // as HtmlConverter will swallow it painlessly.
    var htmlRootElement = _document.CreateElement("html", XhtmlNamespace);
    OpenStructuringElement(htmlRootElement);

    while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.Eof)
    {
        if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.OpeningTagStart)
        {
            _htmlLexicalAnalyzer.GetNextTagToken();
            if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
            {
                var htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();
                _htmlLexicalAnalyzer.GetNextTagToken();

                // Create an element
                var htmlElement = _document.CreateElement(htmlElementName, XhtmlNamespace);

                // Parse element attributes
                ParseAttributes(htmlElement);

                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.EmptyTagEnd ||
                    HtmlSchema.IsEmptyElement(htmlElementName))
                {
                    // It is an element without content (because of explicit slash or based on implicit knowledge about html)
                    AddEmptyElement(htmlElement);
                }
                else if (HtmlSchema.IsInlineElement(htmlElementName))
                {
                    // Elements known as formatting are pushed to some special
                    // pending stack, which allows them to be transferred
                    // over block tags - by doing this we convert
                    // overlapping tags into normal hierarchical element structure.
                    OpenInlineElement(htmlElement);
                }
                else if (HtmlSchema.IsBlockElement(htmlElementName) ||
                         HtmlSchema.IsKnownOpenableElement(htmlElementName))
                {
                    // This includes no-scope elements
                    OpenStructuringElement(htmlElement);
                }
                // Any other (unknown) element name: the start tag is dropped and the
                // element is not added to the tree.
            }
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.ClosingTagStart)
        {
            _htmlLexicalAnalyzer.GetNextTagToken();
            if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
            {
                var htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();

                // Skip the name token. Assume that the following token is end of tag,
                // but do not check this. If it is not true, we simply ignore one token
                // - this is our recovery from bad xml in this case.
                _htmlLexicalAnalyzer.GetNextTagToken();

                CloseElement(htmlElementName);
            }
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Text)
        {
            AddTextContent(_htmlLexicalAnalyzer.NextToken);
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Comment)
        {
            AddComment(_htmlLexicalAnalyzer.NextToken);
        }

        _htmlLexicalAnalyzer.GetNextContentToken();
    }

    // Get rid of the artificial root element when its only child is a real "html" element
    if (htmlRootElement.FirstChild is XmlElement &&
        htmlRootElement.FirstChild == htmlRootElement.LastChild &&
        string.Equals(htmlRootElement.FirstChild.LocalName, "html", StringComparison.OrdinalIgnoreCase))
    {
        htmlRootElement = (XmlElement) htmlRootElement.FirstChild;
    }

    return htmlRootElement;
}
// Creates a new element with the same local name and the same attributes as htmlElement.
// Child nodes are not copied.
private XmlElement CreateElementCopy(XmlElement htmlElement)
{
    var clone = _document.CreateElement(htmlElement.LocalName, XhtmlNamespace);
    foreach (XmlAttribute sourceAttribute in htmlElement.Attributes)
    {
        clone.SetAttribute(sourceAttribute.Name, sourceAttribute.Value);
    }
    return clone;
}
// Appends a content-less element to the element currently on top of the open stack.
// The element itself is never pushed, so it cannot receive children.
private void AddEmptyElement(XmlElement htmlEmptyElement)
{
    InvariantAssert(_openedElements.Count > 0,
        "AddEmptyElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");

    _openedElements.Peek().AppendChild(htmlEmptyElement);
}
// Defers an inline element: it only joins the tree once content arrives
// (see OpenPendingInlineElements) or when it is closed while still empty.
private void OpenInlineElement(XmlElement htmlInlineElement)
    => _pendingInlineElements.Push(htmlInlineElement);
// Opens a structuring element such as Div or Table: attaches it to the current parent
// and makes it the new top of the open-elements stack.
private void OpenStructuringElement(XmlElement htmlElement)
{
    var elementName = htmlElement.LocalName;

    // A block element delimits inline elements: every inline that is open right now is
    // closed here and queued (as an attribute-preserving copy) to be re-opened inside the
    // next element that receives content. This keeps inlines inside the innermost blocks.
    if (HtmlSchema.IsBlockElement(elementName))
    {
        while (_openedElements.Count > 0 && HtmlSchema.IsInlineElement(_openedElements.Peek().LocalName))
        {
            var interruptedInline = _openedElements.Pop();
            InvariantAssert(_openedElements.Count > 0,
                "OpenStructuringElement: stack of opened elements cannot become empty here");
            _pendingInlineElements.Push(CreateElementCopy(interruptedInline));
        }
    }

    if (_openedElements.Count > 0)
    {
        // Some elements (LI, P, table parts, ...) end implicitly when certain elements start
        if (HtmlSchema.ClosesOnNextElementStart(_openedElements.Peek().LocalName, elementName))
        {
            _openedElements.Pop();
        }

        // NOTE:
        // An empty stack is not expected here - it would mean two top-level P or LI
        // (without a parent). In that odd case the element is pushed without being attached,
        // so all such paragraphs except the first one are lost.
        if (_openedElements.Count > 0)
        {
            _openedElements.Peek().AppendChild(htmlElement);
        }
    }

    // The new element becomes the current parent
    _openedElements.Push(htmlElement);
}
// True when an element with the given local name is somewhere on the open-elements stack.
private bool IsElementOpened(string htmlElementName)
{
    foreach (var openedElement in _openedElements)
    {
        if (openedElement.LocalName == htmlElementName)
        {
            return true;
        }
    }
    return false;
}
// Handles a closing tag for htmlElementName. Three outcomes:
//  - the element is the most recently queued pending inline: it is attached, empty, to the current parent;
//  - the element is open: the stack is unwound until it has been popped;
//  - otherwise the unbalanced closing tag is ignored.
private void CloseElement(string htmlElementName)
{
    InvariantAssert(_openedElements.Count > 0,
        "CloseElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");

    // Opened but still waiting to be added to a parent?
    if (_pendingInlineElements.Count > 0 && _pendingInlineElements.Peek().LocalName == htmlElementName)
    {
        // Closing an empty inline element.
        // HtmlConverter will skip empty inlines, but for completeness they are kept at parser level.
        var emptyInline = _pendingInlineElements.Pop();
        InvariantAssert(_openedElements.Count > 0,
            "CloseElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");
        _openedElements.Peek().AppendChild(emptyInline);
        return;
    }

    if (!IsElementOpened(htmlElementName))
    {
        // Never opened: ignore the unbalanced closing tag
        return;
    }

    // Unwind, closing every unbalanced element on the way.
    // The last element - the artificial root - is never popped.
    while (_openedElements.Count > 1)
    {
        var closedElement = _openedElements.Pop();
        if (closedElement.LocalName == htmlElementName)
        {
            break;
        }

        if (HtmlSchema.IsInlineElement(closedElement.LocalName))
        {
            // Unbalanced inlines carry over into the content of the next element
            _pendingInlineElements.Push(CreateElementCopy(closedElement));
        }
    }
}
// Appends a text node to the current parent, after first opening any pending
// inline elements so that the text ends up inside them.
private void AddTextContent(string textContent)
{
    OpenPendingInlineElements();

    InvariantAssert(_openedElements.Count > 0,
        "AddTextContent: Stack of opened elements cannot be empty, as we have at least one artificial root element");

    _openedElements.Peek().AppendChild(_document.CreateTextNode(textContent));
}
// Appends a comment node to the current parent, after first opening any pending
// inline elements (the same treatment text content gets).
private void AddComment(string comment)
{
    OpenPendingInlineElements();

    InvariantAssert(_openedElements.Count > 0,
        "AddComment: Stack of opened elements cannot be empty, as we have at least one artificial root element");

    _openedElements.Peek().AppendChild(_document.CreateComment(comment));
}
// Moves all inline elements pending for opening into the actual document and onto the
// open-elements stack. The element that was queued first is opened first (outermost);
// the most recently queued one ends up innermost.
private void OpenPendingInlineElements()
{
    // Snapshot in pop order (most recent first), then empty the pending stack
    var pendingInPopOrder = _pendingInlineElements.ToArray();
    _pendingInlineElements.Clear();

    // Walk the snapshot backwards so the oldest pending element is opened first
    for (var index = pendingInPopOrder.Length - 1; index >= 0; index--)
    {
        var inlineElement = pendingInPopOrder[index];
        InvariantAssert(_openedElements.Count > 0,
            "OpenPendingInlineElements: Stack of opened elements cannot be empty, as we have at least one artificial root element");
        _openedElements.Peek().AppendChild(inlineElement);
        _openedElements.Push(inlineElement);
    }
}
// Reads tokens up to the end of the current tag (TagEnd, EmptyTagEnd or Eof) and stores
// every name=value pair found on xmlElement. Tokens that are not names are skipped.
private void ParseAttributes(XmlElement xmlElement)
{
    var lexer = _htmlLexicalAnalyzer;

    while (true)
    {
        var tokenType = lexer.NextTokenType;
        if (tokenType == HtmlTokenType.Eof ||
            tokenType == HtmlTokenType.TagEnd ||
            tokenType == HtmlTokenType.EmptyTagEnd)
        {
            return;
        }

        if (tokenType == HtmlTokenType.Name)
        {
            // read next attribute (name=value)
            var attributeName = lexer.NextToken;
            lexer.GetNextEqualSignToken();
            lexer.GetNextAtomToken();
            xmlElement.SetAttribute(attributeName, lexer.NextToken);
        }

        lexer.GetNextTagToken();
    }
}
#endregion Private Methods
// ---------------------------------------------------------------------
//
// Private Fields
//
// ---------------------------------------------------------------------
#region Private Fields
// namespace given to every element this parser creates
internal const string XhtmlNamespace = "http://www.w3.org/1999/xhtml";
// tokenizer that supplies the html tokens being parsed
private readonly HtmlLexicalAnalyzer _htmlLexicalAnalyzer;
// document from which all elements are created
private readonly XmlDocument _document;
// stack for open elements; the top is the parent that receives new content and the
// bottom is the artificial root element
private readonly Stack<XmlElement> _openedElements;
// inline elements that have been started but are not yet part of the tree; they are
// attached when content arrives or when they are closed while still empty
private readonly Stack<XmlElement> _pendingInlineElements;
#endregion Private Fields
}
}

View File

@@ -0,0 +1,733 @@
// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System.Collections;
using System.Diagnostics;
namespace HtmlToXamlDemo
{
/// <summary>
/// HtmlSchema class
/// maintains static information about HTML structure
/// can be used by HtmlParser to check conditions under which an element starts or ends, etc.
/// </summary>
internal class HtmlSchema
{
// ---------------------------------------------------------------------
//
// Constructors
//
// ---------------------------------------------------------------------
#region Constructors
/// <summary>
/// Static constructor. Runs every Initialize* method once so that the schema's
/// lookup lists (e.g. _htmlEmptyElements) are populated before any static member of the class is used.
/// </summary>
static HtmlSchema()
{
// initialize the lists that classify html elements: inline, block and other openable elements
InitializeInlineElements();
InitializeBlockElements();
InitializeOtherOpenableElements();
// initialize empty elements list
InitializeEmptyElements();
// initialize list of elements closing on the outer element end
InitializeElementsClosingOnParentElementEnd();
// initialize lists of elements that close when a new element starts
InitializeElementsClosingOnNewElementStart();
// Initialize character entities
InitializeHtmlCharacterEntities();
}
#endregion Constructors;
// ---------------------------------------------------------------------
//
// Internal Methods
//
// ---------------------------------------------------------------------
#region Internal Methods
/// <summary>
/// returns true when xmlElementName corresponds to empty element
/// (an element that takes no content and needs no closing tag)
/// </summary>
/// <param name="xmlElementName">
/// string representing name to test; matched case-insensitively
/// </param>
internal static bool IsEmptyElement(string xmlElementName)
{
    // Invariant lower-casing: the list holds lower-case ASCII names, and culture-sensitive
    // ToLower() would turn e.g. "IMG" into "ımg" under a Turkish culture and miss the entry.
    return _htmlEmptyElements.Contains(xmlElementName.ToLowerInvariant());
}
/// <summary>
/// Tells whether xmlElementName is a block formatting element.
/// HtmlParser relies on this when it transfers inline elements over block elements.
/// </summary>
/// <param name="xmlElementName">element name to test; compared as given, without case conversion</param>
/// <returns>true when the name is in the block element list</returns>
internal static bool IsBlockElement(string xmlElementName)
{
    return _htmlBlockElements.Contains(xmlElementName);
}
/// <summary>
/// Tells whether xmlElementName is an inline formatting element.
/// </summary>
/// <param name="xmlElementName">element name to test; compared as given, without case conversion</param>
/// <returns>true when the name is in the inline element list</returns>
internal static bool IsInlineElement(string xmlElementName)
{
    return _htmlInlineElements.Contains(xmlElementName);
}
/// <summary>
/// Tells whether xmlElementName is one of the known html elements that the HTML parser
/// is allowed to produce, but that should not act as inline, block or no-scope elements.
/// Presence in that list lets the parser open the element and add it to the tree it produces.
/// </summary>
/// <param name="xmlElementName">element name to test; compared as given, without case conversion</param>
internal static bool IsKnownOpenableElement(string xmlElementName)
{
    return _htmlOtherOpenableElements.Contains(xmlElementName);
}
/// <summary>
/// returns true when xmlElementName closes when the outer element closes
/// this is true of the elements listed in _htmlElementsClosingOnParentElementEnd
/// </summary>
/// <param name="xmlElementName">
/// string representing name to test; matched case-insensitively
/// </param>
internal static bool ClosesOnParentElementEnd(string xmlElementName)
{
    // Invariant lower-casing: the list holds lower-case ASCII names, and culture-sensitive
    // ToLower() would turn e.g. "LI" into "lı" under a Turkish culture and miss the entry.
    return _htmlElementsClosingOnParentElementEnd.Contains(xmlElementName.ToLowerInvariant());
}
/// <summary>
/// Decides whether the current element is implicitly closed by the start of a new
/// element whose name has just been read.
/// </summary>
/// <param name="currentElementName">
/// name of the current element; expected to be lower case already
/// </param>
/// <param name="nextElementName">
/// name of the next element that will start
/// </param>
/// <returns>true when the current element must be closed before the next one is opened</returns>
internal static bool ClosesOnNextElementStart(string currentElementName, string nextElementName)
{
    Debug.Assert(currentElementName == currentElementName.ToLower());

    // A paragraph ends at any block element
    if (currentElementName == "p")
    {
        return IsBlockElement(nextElementName);
    }

    // These end only at a listed element that is also a block element
    if (currentElementName == "colgroup")
    {
        return _htmlElementsClosingColgroup.Contains(nextElementName) && IsBlockElement(nextElementName);
    }
    if (currentElementName == "dd")
    {
        return _htmlElementsClosingDd.Contains(nextElementName) && IsBlockElement(nextElementName);
    }
    if (currentElementName == "dt")
    {
        // NOTE(review): the initializer appears to leave _htmlElementsClosingDt empty, which
        // would make this always false - verify against InitializeElementsClosingOnNewElementStart
        return _htmlElementsClosingDt.Contains(nextElementName) && IsBlockElement(nextElementName);
    }

    // The remaining elements end at any element in their own list
    if (currentElementName == "li")
    {
        return _htmlElementsClosingLi.Contains(nextElementName);
    }
    if (currentElementName == "tbody")
    {
        return _htmlElementsClosingTbody.Contains(nextElementName);
    }
    if (currentElementName == "tfoot")
    {
        return _htmlElementsClosingTfoot.Contains(nextElementName);
    }
    if (currentElementName == "thead")
    {
        return _htmlElementsClosingThead.Contains(nextElementName);
    }
    if (currentElementName == "tr")
    {
        return _htmlElementsClosingTr.Contains(nextElementName);
    }
    if (currentElementName == "td")
    {
        return _htmlElementsClosingTd.Contains(nextElementName);
    }
    if (currentElementName == "th")
    {
        return _htmlElementsClosingTh.Contains(nextElementName);
    }

    // Every other element never closes implicitly
    return false;
}
/// <summary>
/// Tells whether the given string is a known Html entity name.
/// </summary>
/// <param name="entityName">
/// string to be tested for Html entity name; it is looked up exactly as given because
/// entity names are case-sensitive
/// </param>
internal static bool IsEntity(string entityName)
{
    return _htmlCharacterEntities.Contains(entityName);
}
/// <summary>
/// Looks up the character that an entity name stands for, as specified in _htmlCharacterEntities.
/// </summary>
/// <param name="entityName">
/// string representing entity name whose character value is desired
/// </param>
/// <returns>
/// the character for a known entity name; the character with value 0 otherwise
/// </returns>
internal static char EntityCharacterValue(string entityName)
{
    return _htmlCharacterEntities.Contains(entityName)
        ? (char) _htmlCharacterEntities[entityName]
        : (char) 0;
}
#endregion Internal Methods
// ---------------------------------------------------------------------
//
// Internal Properties
//
// ---------------------------------------------------------------------
#region Internal Properties
#endregion Internal Indexers
// ---------------------------------------------------------------------
//
// Private Methods
//
// ---------------------------------------------------------------------
#region Private Methods
// Builds the list of element names that are treated as inline formatting elements.
// NOTE(review): the original carried two "???" markers among its per-entry comments;
// which entries they belonged to can no longer be told - confirm if it matters.
private static void InitializeInlineElements()
{
    var inlineElementNames = new[]
    {
        "a",
        "abbr",
        "acronym",
        "address",
        "b",
        "bdo",
        "big",
        "button",
        "code",
        "del", // deleted text
        "dfn",
        "em",
        "font",
        "i",
        "ins", // inserted text
        "kbd", // text to be entered by a user
        "label",
        "legend",
        "q", // short inline quotation
        "s", // strike-through text style
        "samp", // specifies a code sample
        "small",
        "span",
        "strike",
        "strong",
        "sub",
        "sup",
        "u",
        "var" // indicates an instance of a program variable
    };
    _htmlInlineElements = new ArrayList(inlineElementNames);
}
// Builds the list of element names that are treated as block elements.
// NOTE(review): the original also carried a "Not a block according to XHTML spec" remark
// for one entry between "dir" and "menu" (presumably "form") - confirm which one.
private static void InitializeBlockElements()
{
    var blockElementNames = new[]
    {
        "blockquote",
        "body",
        "caption",
        "center",
        "cite",
        "dd",
        "dir", // treat as UL element
        "div",
        "dl",
        "dt",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "html",
        "li",
        "menu", // treat as UL element
        "ol",
        "p",
        "pre",
        "table",
        "tbody",
        "td",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "tr",
        "tt", // renders text in a fixed-width font
        "ul"
    };
    _htmlBlockElements = new ArrayList(blockElementNames);
}
/// <summary>
/// Fills _htmlEmptyElements with the empty (no-scope) elements of the HTML 4 spec, see
/// http://www.w3.org/TR/REC-html40/index/elements.html
/// These elements need no closing tag and accept no content.
/// </summary>
private static void InitializeEmptyElements()
{
    var emptyElementNames = new[]
    {
        "area",
        "base",
        "basefont",
        "br",
        "col",
        "frame",
        "hr",
        "img",
        "input",
        "isindex",
        "link",
        "meta",
        "param"
    };
    _htmlEmptyElements = new ArrayList(emptyElementNames);
}
// Builds the list of known html elements that the HTML parser may produce but that
// should not act as inline, block or no-scope elements. Being in this list lets the
// parser open the element and add it to the tree it produces.
// "form" is deliberately absent: it is treated as a block element.
private static void InitializeOtherOpenableElements()
{
    var openableElementNames = new[]
    {
        "applet",
        "base",
        "basefont",
        "colgroup",
        "fieldset",
        "frameset",
        "head",
        "iframe",
        "map",
        "noframes",
        "noscript",
        "object",
        "optgroup",
        "option",
        "script",
        "select",
        "style",
        "title"
    };
    _htmlOtherOpenableElements = new ArrayList(openableElementNames);
}
/// <summary>
/// Fills _htmlElementsClosingOnParentElementEnd with the HTML 4 elements whose closing
/// tags are optional. The working assumption is that any such element closes when its
/// outer element (the one it is nested in) closes.
/// </summary>
private static void InitializeElementsClosingOnParentElementEnd()
{
    var optionalEndTagElementNames = new[]
    {
        "body",
        "colgroup",
        "dd",
        "dt",
        "head",
        "html",
        "li",
        "p",
        "tbody",
        "td",
        "tfoot",
        "thead",
        "th",
        "tr"
    };
    _htmlElementsClosingOnParentElementEnd = new ArrayList(optionalEndTagElementNames);
}
/// <summary>
/// initializes, for each element whose closing tag is optional, the list of element names
/// that close it when they start
/// </summary>
private static void InitializeElementsClosingOnNewElementStart()
{
    _htmlElementsClosingColgroup = new ArrayList {"colgroup", "tr", "thead", "tfoot", "tbody"};

    _htmlElementsClosingDd = new ArrayList {"dd", "dt"};
    // TODO: dd may end in other cases as well - if a new "p" starts, etc.
    // TODO: these are the basic "legal" cases but there may be more recovery

    // dt is closed by the same elements as dd. These names must go into the dt list itself:
    // adding them to _htmlElementsClosingDd would leave the dt list empty, so nothing would close an open dt.
    _htmlElementsClosingDt = new ArrayList {"dd", "dt"};
    // TODO: dt may end in other cases as well - if a new "p" starts, etc.
    // TODO: these are the basic "legal" cases but there may be more recovery

    _htmlElementsClosingLi = new ArrayList {"li"};
    // TODO: more complex recovery

    _htmlElementsClosingTbody = new ArrayList {"tbody", "thead", "tfoot"};
    // TODO: more complex recovery

    _htmlElementsClosingTr = new ArrayList {"thead", "tfoot", "tbody", "tr"};
    // NOTE: tr should not really close on a new thead
    // because if there are rows before a thead, it is assumed to be in tbody, whose start tag is optional
    // and thead can't come after tbody
    // however, if we do encounter this, it's probably best to end the row and ignore the thead or treat
    // it as part of the table
    // TODO: more complex recovery

    _htmlElementsClosingTd = new ArrayList {"td", "th", "tr", "tbody", "tfoot", "thead"};
    // TODO: more complex recovery

    _htmlElementsClosingTh = new ArrayList {"td", "th", "tr", "tbody", "tfoot", "thead"};
    // TODO: more complex recovery

    _htmlElementsClosingThead = new ArrayList {"tbody", "tfoot"};
    // TODO: more complex recovery

    _htmlElementsClosingTfoot = new ArrayList {"tbody", "thead"};
    // although thead comes before tfoot, we add it because if it is found the tfoot should close
    // and some recovery processing be done on the thead
    // TODO: more complex recovery
}
/// <summary>
/// initializes _htmlCharacterEntities hashtable with the character corresponding to entity names
/// </summary>
/// <remarks>
/// Keys are case-sensitive entity names (for example "Prime" and "prime" are different entities);
/// values are boxed chars holding the entity's code point. The table covers the HTML 4 entity sets,
/// including "cedil", "oacute", "Ocirc" and "Pi".
/// </remarks>
private static void InitializeHtmlCharacterEntities()
{
    _htmlCharacterEntities = new Hashtable
    {
        ["Aacute"] = (char) 193,
        ["aacute"] = (char) 225,
        ["Acirc"] = (char) 194,
        ["acirc"] = (char) 226,
        ["acute"] = (char) 180,
        ["AElig"] = (char) 198,
        ["aelig"] = (char) 230,
        ["Agrave"] = (char) 192,
        ["agrave"] = (char) 224,
        ["alefsym"] = (char) 8501,
        ["Alpha"] = (char) 913,
        ["alpha"] = (char) 945,
        ["amp"] = (char) 38,
        ["and"] = (char) 8743,
        ["ang"] = (char) 8736,
        ["Aring"] = (char) 197,
        ["aring"] = (char) 229,
        ["asymp"] = (char) 8776,
        ["Atilde"] = (char) 195,
        ["atilde"] = (char) 227,
        ["Auml"] = (char) 196,
        ["auml"] = (char) 228,
        ["bdquo"] = (char) 8222,
        ["Beta"] = (char) 914,
        ["beta"] = (char) 946,
        ["brvbar"] = (char) 166,
        ["bull"] = (char) 8226,
        ["cap"] = (char) 8745,
        ["Ccedil"] = (char) 199,
        ["ccedil"] = (char) 231,
        ["cedil"] = (char) 184,
        ["cent"] = (char) 162,
        ["Chi"] = (char) 935,
        ["chi"] = (char) 967,
        ["circ"] = (char) 710,
        ["clubs"] = (char) 9827,
        ["cong"] = (char) 8773,
        ["copy"] = (char) 169,
        ["crarr"] = (char) 8629,
        ["cup"] = (char) 8746,
        ["curren"] = (char) 164,
        ["dagger"] = (char) 8224,
        ["Dagger"] = (char) 8225,
        ["darr"] = (char) 8595,
        ["dArr"] = (char) 8659,
        ["deg"] = (char) 176,
        ["Delta"] = (char) 916,
        ["delta"] = (char) 948,
        ["diams"] = (char) 9830,
        ["divide"] = (char) 247,
        ["Eacute"] = (char) 201,
        ["eacute"] = (char) 233,
        ["Ecirc"] = (char) 202,
        ["ecirc"] = (char) 234,
        ["Egrave"] = (char) 200,
        ["egrave"] = (char) 232,
        ["empty"] = (char) 8709,
        ["emsp"] = (char) 8195,
        ["ensp"] = (char) 8194,
        ["Epsilon"] = (char) 917,
        ["epsilon"] = (char) 949,
        ["equiv"] = (char) 8801,
        ["Eta"] = (char) 919,
        ["eta"] = (char) 951,
        ["ETH"] = (char) 208,
        ["eth"] = (char) 240,
        ["Euml"] = (char) 203,
        ["euml"] = (char) 235,
        ["euro"] = (char) 8364,
        ["exist"] = (char) 8707,
        ["fnof"] = (char) 402,
        ["forall"] = (char) 8704,
        ["frac12"] = (char) 189,
        ["frac14"] = (char) 188,
        ["frac34"] = (char) 190,
        ["frasl"] = (char) 8260,
        ["Gamma"] = (char) 915,
        ["gamma"] = (char) 947,
        ["ge"] = (char) 8805,
        ["gt"] = (char) 62,
        ["harr"] = (char) 8596,
        ["hArr"] = (char) 8660,
        ["hearts"] = (char) 9829,
        ["hellip"] = (char) 8230,
        ["Iacute"] = (char) 205,
        ["iacute"] = (char) 237,
        ["Icirc"] = (char) 206,
        ["icirc"] = (char) 238,
        ["iexcl"] = (char) 161,
        ["Igrave"] = (char) 204,
        ["igrave"] = (char) 236,
        ["image"] = (char) 8465,
        ["infin"] = (char) 8734,
        ["int"] = (char) 8747,
        ["Iota"] = (char) 921,
        ["iota"] = (char) 953,
        ["iquest"] = (char) 191,
        ["isin"] = (char) 8712,
        ["Iuml"] = (char) 207,
        ["iuml"] = (char) 239,
        ["Kappa"] = (char) 922,
        ["kappa"] = (char) 954,
        ["Lambda"] = (char) 923,
        ["lambda"] = (char) 955,
        ["lang"] = (char) 9001,
        ["laquo"] = (char) 171,
        ["larr"] = (char) 8592,
        ["lArr"] = (char) 8656,
        ["lceil"] = (char) 8968,
        ["ldquo"] = (char) 8220,
        ["le"] = (char) 8804,
        ["lfloor"] = (char) 8970,
        ["lowast"] = (char) 8727,
        ["loz"] = (char) 9674,
        ["lrm"] = (char) 8206,
        ["lsaquo"] = (char) 8249,
        ["lsquo"] = (char) 8216,
        ["lt"] = (char) 60,
        ["macr"] = (char) 175,
        ["mdash"] = (char) 8212,
        ["micro"] = (char) 181,
        ["middot"] = (char) 183,
        ["minus"] = (char) 8722,
        ["Mu"] = (char) 924,
        ["mu"] = (char) 956,
        ["nabla"] = (char) 8711,
        ["nbsp"] = (char) 160,
        ["ndash"] = (char) 8211,
        ["ne"] = (char) 8800,
        ["ni"] = (char) 8715,
        ["not"] = (char) 172,
        ["notin"] = (char) 8713,
        ["nsub"] = (char) 8836,
        ["Ntilde"] = (char) 209,
        ["ntilde"] = (char) 241,
        ["Nu"] = (char) 925,
        ["nu"] = (char) 957,
        ["Oacute"] = (char) 211,
        ["oacute"] = (char) 243,
        ["Ocirc"] = (char) 212,
        ["ocirc"] = (char) 244,
        ["OElig"] = (char) 338,
        ["oelig"] = (char) 339,
        ["Ograve"] = (char) 210,
        ["ograve"] = (char) 242,
        ["oline"] = (char) 8254,
        ["Omega"] = (char) 937,
        ["omega"] = (char) 969,
        ["Omicron"] = (char) 927,
        ["omicron"] = (char) 959,
        ["oplus"] = (char) 8853,
        ["or"] = (char) 8744,
        ["ordf"] = (char) 170,
        ["ordm"] = (char) 186,
        ["Oslash"] = (char) 216,
        ["oslash"] = (char) 248,
        ["Otilde"] = (char) 213,
        ["otilde"] = (char) 245,
        ["otimes"] = (char) 8855,
        ["Ouml"] = (char) 214,
        ["ouml"] = (char) 246,
        ["para"] = (char) 182,
        ["part"] = (char) 8706,
        ["permil"] = (char) 8240,
        ["perp"] = (char) 8869,
        ["Phi"] = (char) 934,
        ["phi"] = (char) 966,
        ["Pi"] = (char) 928,
        ["pi"] = (char) 960,
        ["piv"] = (char) 982,
        ["plusmn"] = (char) 177,
        ["pound"] = (char) 163,
        ["prime"] = (char) 8242,
        ["Prime"] = (char) 8243,
        ["prod"] = (char) 8719,
        ["prop"] = (char) 8733,
        ["Psi"] = (char) 936,
        ["psi"] = (char) 968,
        ["quot"] = (char) 34,
        ["radic"] = (char) 8730,
        ["rang"] = (char) 9002,
        ["raquo"] = (char) 187,
        ["rarr"] = (char) 8594,
        ["rArr"] = (char) 8658,
        ["rceil"] = (char) 8969,
        ["rdquo"] = (char) 8221,
        ["real"] = (char) 8476,
        ["reg"] = (char) 174,
        ["rfloor"] = (char) 8971,
        ["Rho"] = (char) 929,
        ["rho"] = (char) 961,
        ["rlm"] = (char) 8207,
        ["rsaquo"] = (char) 8250,
        ["rsquo"] = (char) 8217,
        ["sbquo"] = (char) 8218,
        ["Scaron"] = (char) 352,
        ["scaron"] = (char) 353,
        ["sdot"] = (char) 8901,
        ["sect"] = (char) 167,
        ["shy"] = (char) 173,
        ["Sigma"] = (char) 931,
        ["sigma"] = (char) 963,
        ["sigmaf"] = (char) 962,
        ["sim"] = (char) 8764,
        ["spades"] = (char) 9824,
        ["sub"] = (char) 8834,
        ["sube"] = (char) 8838,
        ["sum"] = (char) 8721,
        ["sup"] = (char) 8835,
        ["sup1"] = (char) 185,
        ["sup2"] = (char) 178,
        ["sup3"] = (char) 179,
        ["supe"] = (char) 8839,
        ["szlig"] = (char) 223,
        ["Tau"] = (char) 932,
        ["tau"] = (char) 964,
        ["there4"] = (char) 8756,
        ["Theta"] = (char) 920,
        ["theta"] = (char) 952,
        ["thetasym"] = (char) 977,
        ["thinsp"] = (char) 8201,
        ["THORN"] = (char) 222,
        ["thorn"] = (char) 254,
        ["tilde"] = (char) 732,
        ["times"] = (char) 215,
        ["trade"] = (char) 8482,
        ["Uacute"] = (char) 218,
        ["uacute"] = (char) 250,
        ["uarr"] = (char) 8593,
        ["uArr"] = (char) 8657,
        ["Ucirc"] = (char) 219,
        ["ucirc"] = (char) 251,
        ["Ugrave"] = (char) 217,
        ["ugrave"] = (char) 249,
        ["uml"] = (char) 168,
        ["upsih"] = (char) 978,
        ["Upsilon"] = (char) 933,
        ["upsilon"] = (char) 965,
        ["Uuml"] = (char) 220,
        ["uuml"] = (char) 252,
        ["weierp"] = (char) 8472,
        ["Xi"] = (char) 926,
        ["xi"] = (char) 958,
        ["Yacute"] = (char) 221,
        ["yacute"] = (char) 253,
        ["yen"] = (char) 165,
        ["Yuml"] = (char) 376,
        ["yuml"] = (char) 255,
        ["Zeta"] = (char) 918,
        ["zeta"] = (char) 950,
        ["zwj"] = (char) 8205,
        ["zwnj"] = (char) 8204
    };
}
#endregion Private Methods
// ---------------------------------------------------------------------
//
// Private Fields
//
// ---------------------------------------------------------------------
#region Private Fields
// html element names
// this is an array list now, but we may want to make it a hashtable later for better performance
private static ArrayList _htmlInlineElements;
// names of block elements; filled by InitializeBlockElements
private static ArrayList _htmlBlockElements;
// names of elements that may be opened by the parser but are neither inline, block nor empty;
// filled by InitializeOtherOpenableElements
private static ArrayList _htmlOtherOpenableElements;
// list of html empty element names; filled by InitializeEmptyElements
private static ArrayList _htmlEmptyElements;
// names of html elements for which closing tags are optional, and close when the outer nested element closes
// filled by InitializeElementsClosingOnParentElementEnd
private static ArrayList _htmlElementsClosingOnParentElementEnd;
// names of elements that close certain optional closing tag elements when they start
// (all of the _htmlElementsClosing* lists below are assigned in InitializeElementsClosingOnNewElementStart)
// names of elements closing the colgroup element
private static ArrayList _htmlElementsClosingColgroup;
// names of elements closing the dd element
private static ArrayList _htmlElementsClosingDd;
// names of elements closing the dt element
private static ArrayList _htmlElementsClosingDt;
// names of elements closing the li element
private static ArrayList _htmlElementsClosingLi;
// names of elements closing the tbody element
private static ArrayList _htmlElementsClosingTbody;
// names of elements closing the td element
private static ArrayList _htmlElementsClosingTd;
// names of elements closing the tfoot element
private static ArrayList _htmlElementsClosingTfoot;
// names of elements closing the thead element
private static ArrayList _htmlElementsClosingThead;
// names of elements closing the th element
private static ArrayList _htmlElementsClosingTh;
// names of elements closing the tr element
private static ArrayList _htmlElementsClosingTr;
// html character entities hashtable: entity name (string) -> character (boxed char);
// filled by InitializeHtmlCharacterEntities
private static Hashtable _htmlCharacterEntities;
#endregion Private Fields
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,22 @@
// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
namespace HtmlToXamlDemo
{
/// <summary>
/// types of lexical tokens for html-to-xaml converter
/// </summary>
/// <remarks>
/// No explicit values are assigned, so members take consecutive values starting at 0 in declaration
/// order; reordering them changes their numeric values.
/// </remarks>
internal enum HtmlTokenType
{
// presumably the start of an opening tag ("<") - confirm against the lexer
OpeningTagStart,
// presumably the start of a closing tag ("</") - confirm against the lexer
ClosingTagStart,
// presumably the end of a tag (">") - confirm against the lexer
TagEnd,
// presumably the end of an empty tag ("/>") - confirm against the lexer
EmptyTagEnd,
// presumably the "=" between an attribute name and its value - confirm against the lexer
EqualSign,
// presumably an element or attribute name - confirm against the lexer
Name,
Atom, // any attribute value not in quotes
Text, //text content when accepting text
// presumably an html comment - confirm against the lexer
Comment,
// presumably signals the end of the input - confirm against the lexer
Eof
}
}