// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.Diagnostics;
using System.IO;
using System.Text;

namespace HtmlToXamlDemo
{
    /// <summary>
    /// Lexical analyzer for HTML input.
    /// Recognizes tokens as groups of characters separated by arbitrary amounts of whitespace
    /// and classifies each token by type. The current token is exposed through
    /// <see cref="NextToken"/> and <see cref="NextTokenType"/>.
    /// </summary>
    internal class HtmlLexicalAnalyzer
    {
        // ---------------------------------------------------------------------
        //
        // Constructors
        //
        // ---------------------------------------------------------------------

        #region Constructors

        /// <summary>
        /// Initializes the reader over the string to be tokenized and primes the
        /// current/look-ahead character pair so that NextCharacter has a value.
        /// </summary>
        /// <param name="inputTextString">
        /// text string to be parsed
        /// </param>
        internal HtmlLexicalAnalyzer(string inputTextString)
        {
            _inputStringReader = new StringReader(inputTextString);
            _nextCharacterCode = 0;
            NextCharacter = ' ';
            _lookAheadCharacterCode = _inputStringReader.Read();
            // Read() yields -1 at end of input; the cast must not trap in a checked build
            _lookAheadCharacter = unchecked((char) _lookAheadCharacterCode);
            _previousCharacter = ' ';
            _ignoreNextWhitespace = true;
            _nextToken = new StringBuilder(100);
            NextTokenType = HtmlTokenType.Text;

            // read the first character so we have some value for the NextCharacter property
            GetNextCharacter();
        }

        #endregion Constructors

        // ---------------------------------------------------------------------
        //
        // Internal methods
        //
        // ---------------------------------------------------------------------

        #region Internal Methods

        /// <summary>
        /// Reads the next content-level token from the input and identifies its type.
        /// The token text becomes available through <see cref="NextToken"/> and its type
        /// through <see cref="NextTokenType"/>. When the end of the stream has been reached
        /// the token type is set to Eof and the token text is empty.
        /// </summary>
        internal void GetNextContentToken()
        {
            Debug.Assert(NextTokenType != HtmlTokenType.Eof);
            _nextToken.Length = 0;
            if (IsAtEndOfStream)
            {
                NextTokenType = HtmlTokenType.Eof;
                return;
            }

            // NOTE(review): everything in this method after the "</" literal was lost in the
            // copy under review and has been restored from the entry-state assertions of
            // ReadDynamicContent, ReadComment, ReadUnknownDirective and SkipProcessingDirective
            // (which have no other callers in this file). HtmlTokenType.OpeningTagStart and
            // HtmlTokenType.ClosingTagStart are assumed enum members - confirm against version control.
            if (IsAtTagStart)
            {
                GetNextCharacter();

                if (NextCharacter == '/')
                {
                    _nextToken.Append("</");
                    NextTokenType = HtmlTokenType.ClosingTagStart;

                    // advance
                    GetNextCharacter();
                    _ignoreNextWhitespace = false; // Whitespace after closing tags is significant
                }
                else
                {
                    NextTokenType = HtmlTokenType.OpeningTagStart;
                    _nextToken.Append("<");
                    _ignoreNextWhitespace = true; // Whitespace after opening tags is insignificant
                }
            }
            else if (IsAtDirectiveStart)
            {
                // either a comment, CDATA or some other directive; step onto the '!'
                GetNextCharacter();
                if (_lookAheadCharacter == '[')
                {
                    // cdata
                    ReadDynamicContent();
                }
                else if (_lookAheadCharacter == '-')
                {
                    ReadComment();
                }
                else
                {
                    // neither a comment nor cdata, should be something like DOCTYPE
                    // skip till the next tag end
                    ReadUnknownDirective();
                }
            }
            else
            {
                // read text content until a tag or a directive is encountered
                NextTokenType = HtmlTokenType.Text;
                while (!IsAtTagStart && !IsAtEndOfStream && !IsAtDirectiveStart)
                {
                    if (NextCharacter == '<' && !IsNextCharacterEntity && _lookAheadCharacter == '?')
                    {
                        // ignore processing directive
                        SkipProcessingDirective();
                    }
                    else
                    {
                        if (NextCharacter <= ' ')
                        {
                            // Collapse any run of whitespace/control characters into one space,
                            // and drop it entirely when whitespace is currently insignificant
                            if (!_ignoreNextWhitespace)
                            {
                                _nextToken.Append(' ');
                            }

                            _ignoreNextWhitespace = true; // keep ignoring the following whitespace
                        }
                        else
                        {
                            _nextToken.Append(NextCharacter);
                            _ignoreNextWhitespace = false;
                        }

                        GetNextCharacter();
                    }
                }
            }
        }

        /// <summary>
        /// Unconditionally returns a token which is one of: TagEnd, EmptyTagEnd, Name, Atom or Eof.
        /// Does not guarantee token reader advancing.
        /// </summary>
        internal void GetNextTagToken()
        {
            _nextToken.Length = 0;
            if (IsAtEndOfStream)
            {
                NextTokenType = HtmlTokenType.Eof;
                return;
            }

            SkipWhiteSpace();

            // Skipping whitespace may have consumed the rest of the input. Without this check the
            // fall-through Atom branch would call GetNextCharacter at end of stream and throw.
            if (IsAtEndOfStream)
            {
                NextTokenType = HtmlTokenType.Eof;
                return;
            }

            if (NextCharacter == '>' && !IsNextCharacterEntity)
            {
                // an escaped '>' (&gt; entity) should not end a tag, so make sure it's not an entity
                NextTokenType = HtmlTokenType.TagEnd;
                _nextToken.Append('>');
                GetNextCharacter();
                // Note: _ignoreNextWhitespace must be set appropriately on tag start processing
            }
            else if (NextCharacter == '/' && _lookAheadCharacter == '>')
            {
                // closing of an empty tag
                NextTokenType = HtmlTokenType.EmptyTagEnd;
                _nextToken.Append("/>");
                GetNextCharacter();
                GetNextCharacter();
                _ignoreNextWhitespace = false; // Whitespace after no-scope tags is significant
            }
            else if (IsGoodForNameStart(NextCharacter))
            {
                NextTokenType = HtmlTokenType.Name;

                // starts a name
                // we allow character entities here
                // we do not throw exceptions here if end of stream is encountered,
                // just stop and return whatever is in the token
                while (IsGoodForName(NextCharacter) && !IsAtEndOfStream)
                {
                    _nextToken.Append(NextCharacter);
                    GetNextCharacter();
                }
            }
            else
            {
                // Unexpected type of token for a tag. Report one character as Atom,
                // expecting that the parser will ignore it.
                NextTokenType = HtmlTokenType.Atom;
                _nextToken.Append(NextCharacter);
                GetNextCharacter();
            }
        }

        /// <summary>
        /// Unconditionally returns equal sign token. Even if there is no
        /// real equal sign in the stream, it behaves as if it were there.
        /// Does not guarantee token reader advancing.
        /// </summary>
        internal void GetNextEqualSignToken()
        {
            Debug.Assert(NextTokenType != HtmlTokenType.Eof);
            _nextToken.Length = 0;

            _nextToken.Append('=');
            NextTokenType = HtmlTokenType.EqualSign;

            SkipWhiteSpace();

            if (NextCharacter == '=')
            {
                // '=' is not in the list of entities, so no need to check for entities here
                GetNextCharacter();
            }
        }

        /// <summary>
        /// Unconditionally returns an atomic value for an attribute.
        /// Even if there is no appropriate token it returns an (empty) Atom value.
        /// Does not guarantee token reader advancing.
        /// </summary>
        internal void GetNextAtomToken()
        {
            Debug.Assert(NextTokenType != HtmlTokenType.Eof);
            _nextToken.Length = 0;

            SkipWhiteSpace();

            NextTokenType = HtmlTokenType.Atom;

            if ((NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity)
            {
                var startingQuote = NextCharacter;
                GetNextCharacter();

                // Consume all characters between quotes; a quote produced by an entity does not close the value
                while (!(NextCharacter == startingQuote && !IsNextCharacterEntity) && !IsAtEndOfStream)
                {
                    _nextToken.Append(NextCharacter);
                    GetNextCharacter();
                }

                if (NextCharacter == startingQuote)
                {
                    // complete the quoted value
                    GetNextCharacter();
                }

                // NOTE: our recovery here is different from IE's
                // IE keeps reading until it finds a closing quote or end of file
                // if end of file, it treats current value as text
                // if it finds a closing quote at any point within the text, it eats everything between the quotes
                // TODO: Suggestion:
                // however, we could stop when we encounter end of file or an angle bracket of any kind
                // and assume there was a quote there
                // so the attribute value may be meaningless but it is never treated as text
            }
            else
            {
                // unquoted value: runs up to the next whitespace or '>'
                while (!IsAtEndOfStream && !char.IsWhiteSpace(NextCharacter) && NextCharacter != '>')
                {
                    _nextToken.Append(NextCharacter);
                    GetNextCharacter();
                }
            }
        }

        #endregion Internal Methods

        // ---------------------------------------------------------------------
        //
        // Internal Properties
        //
        // ---------------------------------------------------------------------

        #region Internal Properties

        /// <summary>
        /// Type of the token produced by the most recent GetNext*Token call.
        /// </summary>
        internal HtmlTokenType NextTokenType { get; private set; }

        /// <summary>
        /// Text of the token produced by the most recent GetNext*Token call.
        /// </summary>
        internal string NextToken => _nextToken.ToString();

        #endregion Internal Properties

        // ---------------------------------------------------------------------
        //
        // Private methods
        //
        // ---------------------------------------------------------------------

        #region Private Methods

        /// <summary>
        /// Advances the reading position by one character and reads the next available
        /// character from the stream. This character becomes available as the NextCharacter
        /// property. A well-formed numeric (&amp;#DDDDD;) or named entity is decoded into a
        /// single character and flagged through IsNextCharacterEntity.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown if called when the end of the stream has already been reached.
        /// </exception>
        private void GetNextCharacter()
        {
            if (_nextCharacterCode == -1)
            {
                throw new InvalidOperationException("GetNextCharacter method called at the end of a stream");
            }

            _previousCharacter = NextCharacter;

            NextCharacter = _lookAheadCharacter;
            _nextCharacterCode = _lookAheadCharacterCode;
            // next character not an entity as of now
            IsNextCharacterEntity = false;

            ReadLookAheadCharacter();

            if (NextCharacter != '&')
            {
                return;
            }

            if (_lookAheadCharacter == '#')
            {
                ReadNumericEntity();
            }
            else if (char.IsLetter(_lookAheadCharacter))
            {
                ReadNamedEntity();
            }
        }

        /// <summary>
        /// Decodes a numeric entity (&amp;#DDDDD;) whose '#' is the current look-ahead character.
        /// </summary>
        private void ReadNumericEntity()
        {
            var entityCode = 0;
            ReadLookAheadCharacter(); // step over '#'

            // Only ASCII digits are accepted: char.IsDigit would also admit other Unicode
            // decimal digits, for which subtracting '0' does not yield the digit value.
            for (var i = 0; i < MaxNumericEntityDigits && IsAsciiDigit(_lookAheadCharacter); i++)
            {
                entityCode = 10*entityCode + (_lookAheadCharacter - '0');
                ReadLookAheadCharacter();
            }

            if (_lookAheadCharacter != ';')
            {
                // not an entity; we have eaten the '#' and any digits read so far
                AcceptLookAheadAsNextCharacter();
                return;
            }

            // correct format - advance past ';'
            ReadLookAheadCharacter();

            // a code that does not fit into a single char is reported as '?'
            NextCharacter = entityCode <= char.MaxValue ? (char) entityCode : '?';
            _nextCharacterCode = NextCharacter;

            // as far as we are concerned, this is an entity
            IsNextCharacterEntity = true;
        }

        /// <summary>
        /// Decodes a named entity (such as &amp;name;) whose first letter is the current look-ahead character.
        /// </summary>
        private void ReadNamedEntity()
        {
            var entity = new StringBuilder(MaxNamedEntityLength);

            for (var i = 0;
                i < MaxNamedEntityLength &&
                (char.IsLetter(_lookAheadCharacter) || char.IsDigit(_lookAheadCharacter));
                i++)
            {
                entity.Append(_lookAheadCharacter);
                ReadLookAheadCharacter();
            }

            if (_lookAheadCharacter != ';')
            {
                // skip whatever we read after the ampersand and move on;
                // the character code is kept in sync so that end of stream is still detected
                AcceptLookAheadAsNextCharacter();
                return;
            }

            // advance past ';'
            ReadLookAheadCharacter();

            var entityName = entity.ToString();
            if (HtmlSchema.IsEntity(entityName))
            {
                NextCharacter = HtmlSchema.EntityCharacterValue(entityName);
                _nextCharacterCode = NextCharacter;
                IsNextCharacterEntity = true;
            }
            else
            {
                // just skip the whole thing - invalid entity - and move on to the next character
                AcceptLookAheadAsNextCharacter();
            }
        }

        /// <summary>
        /// Recovery step for a malformed entity: the look-ahead character becomes the
        /// current (non-entity) character and a new look-ahead character is read.
        /// </summary>
        private void AcceptLookAheadAsNextCharacter()
        {
            NextCharacter = _lookAheadCharacter;
            _nextCharacterCode = _lookAheadCharacterCode;
            ReadLookAheadCharacter();
            IsNextCharacterEntity = false;
        }

        /// <summary>
        /// Reads one more character into the look-ahead slot unless the reader
        /// has already reported the end of input (-1).
        /// </summary>
        private void ReadLookAheadCharacter()
        {
            if (_lookAheadCharacterCode != -1)
            {
                _lookAheadCharacterCode = _inputStringReader.Read();
                // -1 (end of input) must wrap rather than trap in a checked build
                _lookAheadCharacter = unchecked((char) _lookAheadCharacterCode);
            }
        }

        /// <summary>
        /// Skips whitespace in the input string, together with any construct starting with
        /// "&lt;?" or "&lt;!" that is encountered at the current position.
        /// Leaves the first non-whitespace character available in the NextCharacter property;
        /// this may be the end-of-stream marker, no checking is performed.
        /// </summary>
        private void SkipWhiteSpace()
        {
            // TODO: handle character entities while processing comments, cdata, and directives
            // TODO: SUGGESTION: we could check if lookahead and previous characters are entities also
            while (true)
            {
                if (NextCharacter == '<' && (_lookAheadCharacter == '?' || _lookAheadCharacter == '!'))
                {
                    GetNextCharacter();

                    if (_lookAheadCharacter == '[')
                    {
                        // Skip CDATA block and DTDs(?) up to the terminating "]]>"
                        while (!IsAtEndOfStream &&
                               !(_previousCharacter == ']' && NextCharacter == ']' && _lookAheadCharacter == '>'))
                        {
                            GetNextCharacter();
                        }

                        if (!IsAtEndOfStream)
                        {
                            // The loop stopped on the second ']' with '>' as look-ahead:
                            // consume both so that the terminator is not tokenized as tag content
                            GetNextCharacter();
                            GetNextCharacter();
                        }
                    }
                    else
                    {
                        // Skip processing instruction, comments
                        while (!IsAtEndOfStream && NextCharacter != '>')
                        {
                            GetNextCharacter();
                        }

                        if (NextCharacter == '>')
                        {
                            GetNextCharacter();
                        }
                    }
                }

                if (!char.IsWhiteSpace(NextCharacter))
                {
                    break;
                }

                GetNextCharacter();
            }
        }

        /// <summary>
        /// Checks if a character can be used to start a name.
        /// </summary>
        /// <param name="character">
        /// character value to be checked
        /// </param>
        /// <returns>
        /// true if the character is an underscore or a letter, false otherwise
        /// </returns>
        private bool IsGoodForNameStart(char character) => character == '_' || char.IsLetter(character);

        /// <summary>
        /// Checks if a character can be used as a non-starting character in a name.
        /// Also consults the IsExtender and IsCombiningCharacter predicates.
        /// </summary>
        /// <param name="character">
        /// character to be checked for validity in a name
        /// </param>
        /// <returns>
        /// true if the character can be a valid part of a name
        /// </returns>
        private bool IsGoodForName(char character) =>
            IsGoodForNameStart(character) ||
            character == '.' ||
            character == '-' ||
            character == ':' ||
            char.IsDigit(character) ||
            IsCombiningCharacter(character) ||
            IsExtender(character);

        /// <summary>
        /// Identifies a character as being a combining character, permitted in a name.
        /// TODO: only a placeholder for now but later to be replaced with comparisons against
        /// the list of combining characters in the XML documentation
        /// </summary>
        /// <param name="character">
        /// character to be checked
        /// </param>
        /// <returns>
        /// currently always false
        /// </returns>
        private bool IsCombiningCharacter(char character) => false;

        /// <summary>
        /// Identifies a character as being an extender, permitted in a name.
        /// TODO: only a placeholder for now but later to be replaced with comparisons against
        /// the list of extenders in the XML documentation
        /// </summary>
        /// <param name="character">
        /// character to be checked
        /// </param>
        /// <returns>
        /// currently always false
        /// </returns>
        private bool IsExtender(char character) => false;

        /// <summary>
        /// Returns true for the ASCII decimal digits '0' through '9' only.
        /// </summary>
        private static bool IsAsciiDigit(char character) => character >= '0' && character <= '9';

        /// <summary>
        /// Skips dynamic content starting with '&lt;![' and ending with ']&gt;'.
        /// Produces an empty Text token.
        /// </summary>
        private void ReadDynamicContent()
        {
            // verify that we are at dynamic content, which may include CDATA
            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '[');

            // Let's treat this as empty text
            NextTokenType = HtmlTokenType.Text;
            _nextToken.Length = 0;

            // advance twice, once to get the lookahead character and then to reach the start of the cdata
            GetNextCharacter();
            GetNextCharacter();

            // NOTE: 10/12/2004: this function stops at the sequence "]>" and not "]]>".
            // This means that CDATA and anything else expressed in its own set of [] within the
            // directive cannot contain a "]>" sequence.
            while (!(NextCharacter == ']' && _lookAheadCharacter == '>') && !IsAtEndOfStream)
            {
                // advance
                GetNextCharacter();
            }

            if (!IsAtEndOfStream)
            {
                // advance, first to the last '>'
                GetNextCharacter();

                // then advance past it to the next character after the directive
                GetNextCharacter();
            }
        }

        /// <summary>
        /// Reads a comment starting with '&lt;!-' into a Comment token.
        /// The comment ends at '--&gt;' or at the nonstandard '!&gt;', because in practice
        /// many html pages do not use the full comment specifying conventions.
        /// An unterminated comment ends at the end of the stream.
        /// </summary>
        private void ReadComment()
        {
            // verify that we are at a comment
            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '-');

            // Initialize a token
            NextTokenType = HtmlTokenType.Comment;
            _nextToken.Length = 0;

            // Advance three times: onto the first '-', onto the second '-', and onto the first
            // character of the comment content. Stop early if the input ends inside the opener,
            // because GetNextCharacter throws when called at end of stream.
            for (var i = 0; i < 3 && !IsAtEndOfStream; i++)
            {
                GetNextCharacter();
            }

            while (true)
            {
                // Read text until a possible end of comment
                // Note that in many actual html pages comments end with "!>" (while xml standard is "-->")
                while (!IsAtEndOfStream &&
                       !(NextCharacter == '-' && _lookAheadCharacter == '-' ||
                         NextCharacter == '!' && _lookAheadCharacter == '>'))
                {
                    _nextToken.Append(NextCharacter);
                    GetNextCharacter();
                }

                if (IsAtEndOfStream)
                {
                    // Unterminated comment: return what has been collected instead of
                    // advancing past the end of the stream
                    break;
                }

                // Finish comment reading
                GetNextCharacter();
                if (_previousCharacter == '-' && NextCharacter == '-' && _lookAheadCharacter == '>')
                {
                    // Standard comment end. Eat it and exit the loop
                    GetNextCharacter(); // get '>'
                    break;
                }

                if (_previousCharacter == '!' && NextCharacter == '>')
                {
                    // Nonstandard but possible comment end - '!>'. Exit the loop
                    break;
                }

                // Not an end. Save the character and continue reading
                _nextToken.Append(_previousCharacter);
            }

            // Read end of comment combination
            if (NextCharacter == '>')
            {
                GetNextCharacter();
            }
        }

        /// <summary>
        /// Skips past unknown directives that start with "&lt;!" but are neither comments nor
        /// dynamic content, ignoring everything up to and including the next "&gt;" character.
        /// Applies to directives such as DOCTYPE, etc. that we do not presently support.
        /// Produces an empty Text token.
        /// </summary>
        private void ReadUnknownDirective()
        {
            // verify that we are at an unknown directive
            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' &&
                         !(_lookAheadCharacter == '-' || _lookAheadCharacter == '['));

            // Let's treat this as empty text
            NextTokenType = HtmlTokenType.Text;
            _nextToken.Length = 0;

            // advance to the next character
            GetNextCharacter();

            // skip to the first tag end we find
            while (!(NextCharacter == '>' && !IsNextCharacterEntity) && !IsAtEndOfStream)
            {
                GetNextCharacter();
            }

            if (!IsAtEndOfStream)
            {
                // advance past the tag end
                GetNextCharacter();
            }
        }

        /// <summary>
        /// Skips processing directives starting with the characters '&lt;?' and ending with '?&gt;'.
        /// NOTE: 10/14/2004: IE also ends processing directives with '/&gt;', so this function
        /// recognizes that condition as well.
        /// </summary>
        private void SkipProcessingDirective()
        {
            // verify that we are at a processing directive
            Debug.Assert(NextCharacter == '<' && _lookAheadCharacter == '?');

            // advance twice, once to get the lookahead character and then to reach the start of the directive
            GetNextCharacter();
            GetNextCharacter();

            while (!((NextCharacter == '?' || NextCharacter == '/') && _lookAheadCharacter == '>') && !IsAtEndOfStream)
            {
                // advance
                // we don't need to check for entities here because '?' is not an entity
                // and even though '>' can be written as an entity, there is no entity processing
                // when reading the lookahead character
                GetNextCharacter();
            }

            if (!IsAtEndOfStream)
            {
                // advance, first to the last '>'
                GetNextCharacter();

                // then advance past it to the next character after the processing directive
                GetNextCharacter();
            }
        }

        #endregion Private Methods

        // ---------------------------------------------------------------------
        //
        // Private Properties
        //
        // ---------------------------------------------------------------------

        #region Private Properties

        // current character: read from input but not yet part of any token
        private char NextCharacter { get; set; }

        // true once the current character code is the reader's end-of-input value (-1)
        private bool IsAtEndOfStream => _nextCharacterCode == -1;

        // a literal (non-entity) '<' followed by '/' or by a character that can start a name
        private bool IsAtTagStart
            => NextCharacter == '<' && (_lookAheadCharacter == '/' || IsGoodForNameStart(_lookAheadCharacter)) &&
               !IsNextCharacterEntity;

        // a literal (non-entity) '>' or the two-character sequence "/>"
        private bool IsAtTagEnd => (NextCharacter == '>' || (NextCharacter == '/' && _lookAheadCharacter == '>')) &&
                                   !IsNextCharacterEntity;

        // a literal (non-entity) '<' followed by '!'
        private bool IsAtDirectiveStart
            => (NextCharacter == '<' && _lookAheadCharacter == '!' && !IsNextCharacterEntity);

        // true when the current character was produced by decoding an entity
        private bool IsNextCharacterEntity { get; set; }

        #endregion Private Properties

        // ---------------------------------------------------------------------
        //
        // Private Fields
        //
        // ---------------------------------------------------------------------

        #region Private Fields

        // largest numeric entity is 7 digits
        private const int MaxNumericEntityDigits = 7;

        // maximum length of string entities is 10 characters
        private const int MaxNamedEntityLength = 10;

        // string reader which will move over input text
        private readonly StringReader _inputStringReader;

        // code of the current character (-1 at end of stream); see NextCharacter for the character itself
        private int _nextCharacterCode;

        // one character of look-ahead, as the raw reader code and as a character
        private int _lookAheadCharacterCode;
        private char _lookAheadCharacter;

        // the character that was current before the last advance
        private char _previousCharacter;

        // when true, whitespace met while reading text content is dropped rather than emitted
        private bool _ignoreNextWhitespace;

        // text of the token being built; exposed through NextToken
        private readonly StringBuilder _nextToken;

        #endregion Private Fields
    }
}