Initial population

2026-03-07 19:22:22 -06:00
parent 647f55feb9
commit cae1a3ec46
108 changed files with 28484 additions and 0 deletions
--- a/JRCookbookBusiness/Converters/HtmlLexicalAnalyzer.cs
+++ b/JRCookbookBusiness/Converters/HtmlLexicalAnalyzer.cs
@@ -0,0 +1,745 @@
+// // Copyright (c) Microsoft. All rights reserved.
+// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
+
+using System;
+using System.Diagnostics;
+using System.IO;
+using System.Text;
+
+namespace HtmlToXamlDemo
+{
+    /// <summary>
+    ///     lexical analyzer class
+    ///     recognizes tokens as groups of characters separated by arbitrary amounts of whitespace
+    ///     also classifies tokens according to type
+    /// </summary>
+    internal class HtmlLexicalAnalyzer
+    {
+        // ---------------------------------------------------------------------
+        //
+        // Constructors
+        //
+        // ---------------------------------------------------------------------
+
+        #region Constructors
+
+        /// <summary>
+        ///     Creates a lexical analyzer over the supplied text and primes it:
+        ///     the first character of the input is loaded into the look-ahead and then
+        ///     shifted into NextCharacter, so the analyzer is ready for the first
+        ///     GetNext...Token call.
+        /// </summary>
+        /// <param name="inputTextString">
+        ///     text string to be tokenized
+        /// </param>
+        internal HtmlLexicalAnalyzer(string inputTextString)
+        {
+            // token state: empty token, leading whitespace is not significant
+            _nextToken = new StringBuilder(100);
+            NextTokenType = HtmlTokenType.Text;
+            _ignoreNextWhitespace = true;
+
+            // character state: nothing has been consumed yet
+            _previousCharacter = ' ';
+            NextCharacter = ' ';
+            _nextCharacterCode = 0;
+
+            // load the one-character look-ahead from the input
+            _inputStringReader = new StringReader(inputTextString);
+            _lookAheadCharacterCode = _inputStringReader.Read();
+            _lookAheadCharacter = (char) _lookAheadCharacterCode;
+
+            // shift the look-ahead into NextCharacter so the property holds a real value
+            GetNextCharacter();
+        }
+
+        #endregion Constructors
+
+        // ---------------------------------------------------------------------
+        //
+        // Internal methods
+        //
+        // ---------------------------------------------------------------------
+
+        #region Internal Methods
+
+        /// <summary>
+        ///     Reads the next token that can appear in element content and stores it in
+        ///     NextToken / NextTokenType. The result is one of:
+        ///     an opening tag start ("&lt;"), a closing tag start ("&lt;/"), a comment,
+        ///     an empty text token standing in for a skipped directive, or a run of text
+        ///     in which every whitespace/control sequence is collapsed to a single space.
+        ///     At end of stream the token type is set to Eof and the token text is empty.
+        /// </summary>
+        internal void GetNextContentToken()
+        {
+            Debug.Assert(NextTokenType != HtmlTokenType.Eof);
+            _nextToken.Length = 0;
+
+            if (IsAtEndOfStream)
+            {
+                NextTokenType = HtmlTokenType.Eof;
+                return;
+            }
+
+            if (IsAtTagStart)
+            {
+                // step over '<'
+                GetNextCharacter();
+
+                if (NextCharacter != '/')
+                {
+                    NextTokenType = HtmlTokenType.OpeningTagStart;
+                    _nextToken.Append("<");
+                    _ignoreNextWhitespace = true; // Whitespaces after opening tags are insignificant
+                    return;
+                }
+
+                _nextToken.Append("</");
+                NextTokenType = HtmlTokenType.ClosingTagStart;
+
+                // step over '/'
+                GetNextCharacter();
+                _ignoreNextWhitespace = false; // Whitespaces after closing tags are significant
+                return;
+            }
+
+            if (IsAtDirectiveStart)
+            {
+                // step over '<' so that NextCharacter is '!' and the look-ahead tells the directive kind
+                GetNextCharacter();
+
+                switch (_lookAheadCharacter)
+                {
+                    case '[':
+                        // cdata or other "<![" content
+                        ReadDynamicContent();
+                        break;
+                    case '-':
+                        ReadComment();
+                        break;
+                    default:
+                        // neither a comment nor cdata, should be something like DOCTYPE;
+                        // skipped up to the next tag ender
+                        ReadUnknownDirective();
+                        break;
+                }
+                return;
+            }
+
+            // plain text: read until a tag, a directive or the end of the input
+            NextTokenType = HtmlTokenType.Text;
+            while (!IsAtTagStart && !IsAtEndOfStream && !IsAtDirectiveStart)
+            {
+                if (NextCharacter == '<' && !IsNextCharacterEntity && _lookAheadCharacter == '?')
+                {
+                    // processing directives are dropped from the text
+                    SkipProcessingDirective();
+                    continue;
+                }
+
+                if (NextCharacter > ' ')
+                {
+                    _nextToken.Append(NextCharacter);
+                    _ignoreNextWhitespace = false;
+                }
+                else
+                {
+                    // whitespace or control character: emit one space for the whole run,
+                    // and nothing at all while whitespace is being ignored
+                    if (!_ignoreNextWhitespace)
+                    {
+                        _nextToken.Append(' ');
+                    }
+                    _ignoreNextWhitespace = true;
+                }
+
+                GetNextCharacter();
+            }
+        }
+
+        /// <summary>
+        ///     Unconditionally returns a token which is one of: TagEnd, EmptyTagEnd, Name, Atom or Eof.
+        ///     Leading whitespace (and any directives skipped by SkipWhiteSpace) is consumed first.
+        ///     Eof is reported both when the reader is already at end of stream and when
+        ///     skipping whitespace runs into the end of the input (e.g. a truncated tag such as "&lt;a ").
+        ///     Does not guarantee token reader advancing.
+        /// </summary>
+        internal void GetNextTagToken()
+        {
+            _nextToken.Length = 0;
+            if (IsAtEndOfStream)
+            {
+                NextTokenType = HtmlTokenType.Eof;
+                return;
+            }
+
+            SkipWhiteSpace();
+
+            // Skipping may have consumed the rest of the input. Without this check the
+            // fall-through Atom branch would call GetNextCharacter at end of stream and throw.
+            if (IsAtEndOfStream)
+            {
+                NextTokenType = HtmlTokenType.Eof;
+                return;
+            }
+
+            if (NextCharacter == '>' && !IsNextCharacterEntity)
+            {
+                // &gt; should not end a tag, so make sure it's not an entity
+                NextTokenType = HtmlTokenType.TagEnd;
+                _nextToken.Append('>');
+                GetNextCharacter();
+                // Note: _ignoreNextWhitespace must be set appropriately on tag start processing
+            }
+            else if (NextCharacter == '/' && _lookAheadCharacter == '>')
+            {
+                // could be start of closing of empty tag
+                NextTokenType = HtmlTokenType.EmptyTagEnd;
+                _nextToken.Append("/>");
+                GetNextCharacter();
+                GetNextCharacter();
+                _ignoreNextWhitespace = false; // Whitespace after no-scope tags is significant
+            }
+            else if (IsGoodForNameStart(NextCharacter))
+            {
+                NextTokenType = HtmlTokenType.Name;
+
+                // starts a name
+                // we allow character entities here
+                // we do not throw exceptions here if end of stream is encountered
+                // just stop and return whatever is in the token
+                while (!IsAtEndOfStream && IsGoodForName(NextCharacter))
+                {
+                    _nextToken.Append(NextCharacter);
+                    GetNextCharacter();
+                }
+            }
+            else
+            {
+                // Unexpected type of token for a tag. Report one character as Atom, expecting that HtmlParser will ignore it.
+                NextTokenType = HtmlTokenType.Atom;
+                _nextToken.Append(NextCharacter);
+                GetNextCharacter();
+            }
+        }
+
+        /// <summary>
+        ///     Always produces an EqualSign token with the text "=".
+        ///     Whitespace is skipped and, if the next character really is '=', it is consumed;
+        ///     otherwise the reader is left where it is and the token is reported anyway.
+        ///     Does not guarantee token reader advancing.
+        /// </summary>
+        internal void GetNextEqualSignToken()
+        {
+            Debug.Assert(NextTokenType != HtmlTokenType.Eof);
+
+            // the token is "=" whatever the input holds
+            _nextToken.Length = 0;
+            _nextToken.Append('=');
+            NextTokenType = HtmlTokenType.EqualSign;
+
+            SkipWhiteSpace();
+
+            if (NextCharacter != '=')
+            {
+                // no real equal sign in the stream - behave as if it were there
+                return;
+            }
+
+            // no entity check is made here (original note: '=' is not in the list of entities)
+            GetNextCharacter();
+        }
+
+        /// <summary>
+        ///     Always produces an Atom token holding an attribute value.
+        ///     A value that starts with a (non-entity) single or double quote runs to the matching
+        ///     (non-entity) quote or to the end of the input; the quotes are not part of the token.
+        ///     An unquoted value runs to the next whitespace, '>' or the end of the input.
+        ///     The token may be empty. Does not guarantee token reader advancing.
+        /// </summary>
+        internal void GetNextAtomToken()
+        {
+            Debug.Assert(NextTokenType != HtmlTokenType.Eof);
+            _nextToken.Length = 0;
+
+            SkipWhiteSpace();
+
+            NextTokenType = HtmlTokenType.Atom;
+
+            var isQuoted = (NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity;
+            if (!isQuoted)
+            {
+                // bare value
+                while (!IsAtEndOfStream && !char.IsWhiteSpace(NextCharacter) && NextCharacter != '>')
+                {
+                    _nextToken.Append(NextCharacter);
+                    GetNextCharacter();
+                }
+                return;
+            }
+
+            var quote = NextCharacter;
+
+            // step over the opening quote
+            GetNextCharacter();
+
+            // Consume all characters between quotes; a quote produced by an entity does not close the value
+            while (!IsAtEndOfStream && (NextCharacter != quote || IsNextCharacterEntity))
+            {
+                _nextToken.Append(NextCharacter);
+                GetNextCharacter();
+            }
+
+            // step over the closing quote when there is one
+            if (NextCharacter == quote)
+            {
+                GetNextCharacter();
+            }
+
+            // NOTE: our recovery here is different from IE's
+            // IE keeps reading until it finds a closing quote or end of file
+            // if end of file, it treats current value as text
+            // if it finds a closing quote at any point within the text, it eats everything between the quotes
+            // TODO: Suggestion:
+            // however, we could stop when we encounter end of file or an angle bracket of any kind
+            // and assume there was a quote there
+            // so the attribute value may be meaningless but it is never treated as text
+        }
+
+        #endregion Internal Methods
+
+        // ---------------------------------------------------------------------
+        //
+        // Internal Properties
+        //
+        // ---------------------------------------------------------------------
+
+        #region Internal Properties
+
+        /// <summary>
+        ///     type of the current token, as set by the most recent GetNext...Token call
+        /// </summary>
+        internal HtmlTokenType NextTokenType { get; private set; }
+
+        /// <summary>
+        ///     text of the current token, as accumulated by the most recent GetNext...Token call
+        /// </summary>
+        internal string NextToken
+        {
+            get { return _nextToken.ToString(); }
+        }
+
+        #endregion Internal Properties
+
+        // ---------------------------------------------------------------------
+        //
+        // Private methods
+        //
+        // ---------------------------------------------------------------------
+
+        #region Private Methods
+
+        /// <summary>
+        ///     Advances the reading position by one logical character
+        ///     and reads the next available character from the stream.
+        ///     This character becomes available as the NextCharacter property.
+        ///     A well-formed numeric (&amp;#DDD;) or known named (&amp;name;) entity is decoded into
+        ///     a single character and IsNextCharacterEntity is set to true.
+        ///     A malformed or unknown entity is dropped and the character that follows it is
+        ///     returned as an ordinary character instead.
+        /// </summary>
+        /// <remarks>
+        ///     Throws InvalidOperationException if attempted to be called on EndOfStream
+        ///     condition.
+        /// </remarks>
+        private void GetNextCharacter()
+        {
+            if (_nextCharacterCode == -1)
+            {
+                throw new InvalidOperationException("GetNextCharacter method called at the end of a stream");
+            }
+
+            _previousCharacter = NextCharacter;
+
+            NextCharacter = _lookAheadCharacter;
+            _nextCharacterCode = _lookAheadCharacterCode;
+            // next character not an entity as of now
+            IsNextCharacterEntity = false;
+
+            ReadLookAheadCharacter();
+
+            if (NextCharacter != '&')
+            {
+                return;
+            }
+
+            if (_lookAheadCharacter == '#')
+            {
+                // numeric entity - parse digits - &#DDDDD;
+                var entityCode = 0;
+                ReadLookAheadCharacter();
+
+                // largest numeric entity is 7 characters.
+                // Only ASCII digits are accepted: char.IsDigit is also true for other Unicode
+                // decimal digits, for which "character - '0'" is not the digit value.
+                for (var i = 0; i < 7 && _lookAheadCharacter >= '0' && _lookAheadCharacter <= '9'; i++)
+                {
+                    entityCode = 10*entityCode + (_lookAheadCharacter - '0');
+                    ReadLookAheadCharacter();
+                }
+                if (_lookAheadCharacter == ';')
+                {
+                    // correct format - advance
+                    ReadLookAheadCharacter();
+
+                    // a code that does not fit in a char is replaced with '?'
+                    // (a plain cast would silently wrap it to an unrelated character)
+                    NextCharacter = entityCode <= char.MaxValue ? (char) entityCode : '?';
+                    _nextCharacterCode = NextCharacter;
+
+                    // as far as we are concerned, this is an entity
+                    IsNextCharacterEntity = true;
+                }
+                else
+                {
+                    // not an entity, set next character to the current lookahead character
+                    // we would have eaten up some digits
+                    NextCharacter = _lookAheadCharacter;
+                    _nextCharacterCode = _lookAheadCharacterCode;
+                    ReadLookAheadCharacter();
+                    IsNextCharacterEntity = false;
+                }
+            }
+            else if (char.IsLetter(_lookAheadCharacter))
+            {
+                // entity is written as a string
+                var entity = new StringBuilder(10);
+
+                // maximum length of string entities is 10 characters
+                for (var i = 0;
+                    i < 10 && (char.IsLetter(_lookAheadCharacter) || char.IsDigit(_lookAheadCharacter));
+                    i++)
+                {
+                    entity.Append(_lookAheadCharacter);
+                    ReadLookAheadCharacter();
+                }
+
+                var entityName = entity.ToString();
+                if (_lookAheadCharacter == ';' )
+                {
+                    // advance
+                    ReadLookAheadCharacter();
+
+                    if (HtmlSchema.IsEntity(entityName))
+                    {
+                        NextCharacter = HtmlSchema.EntityCharacterValue(entityName);
+                        _nextCharacterCode = NextCharacter;
+                        IsNextCharacterEntity = true;
+                    }
+                    else
+                    {
+                        // just skip the whole thing - invalid entity
+                        // move on to the next character
+                        NextCharacter = _lookAheadCharacter;
+                        _nextCharacterCode = _lookAheadCharacterCode;
+                        ReadLookAheadCharacter();
+
+                        // not an entity
+                        IsNextCharacterEntity = false;
+                    }
+                }
+                else
+                {
+                    // skip whatever we read after the ampersand
+                    // set next character and move on.
+                    // The character code must move together with the character: otherwise, when the
+                    // input ends right after the unterminated entity, IsAtEndOfStream stays false
+                    // and a bogus U+FFFF character is handed to the caller.
+                    NextCharacter = _lookAheadCharacter;
+                    _nextCharacterCode = _lookAheadCharacterCode;
+                    ReadLookAheadCharacter();
+                    IsNextCharacterEntity = false;
+                }
+            }
+        }
+
+        /// <summary>
+        ///     Reads one more character from the input reader into the look-ahead fields.
+        ///     Once the look-ahead code is -1 (end of input) the reader is not touched again.
+        /// </summary>
+        private void ReadLookAheadCharacter()
+        {
+            if (_lookAheadCharacterCode == -1)
+            {
+                return;
+            }
+
+            var code = _inputStringReader.Read();
+            _lookAheadCharacterCode = code;
+            _lookAheadCharacter = (char) code;
+        }
+
+        /// <summary>
+        ///     skips whitespace in the input string
+        ///     leaves the first non-whitespace character available in the NextCharacter property
+        ///     this may be the end-of-file character, it performs no checking
+        ///     a directive starting with "&lt;?" or "&lt;!" found at the current position is skipped as well:
+        ///     "&lt;![" blocks up to and including "]]>", anything else up to and including the next '>'
+        /// </summary>
+        private void SkipWhiteSpace()
+        {
+            // TODO: handle character entities while processing comments, cdata, and directives
+            // TODO: SUGGESTION: we could check if lookahead and previous characters are entities also
+            while (true)
+            {
+                if (NextCharacter == '<' && (_lookAheadCharacter == '?' || _lookAheadCharacter == '!'))
+                {
+                    // step over '<'
+                    GetNextCharacter();
+
+                    if (_lookAheadCharacter == '[')
+                    {
+                        // Skip CDATA block and DTDs(?)
+                        while (!IsAtEndOfStream &&
+                               !(_previousCharacter == ']' && NextCharacter == ']' && _lookAheadCharacter == '>'))
+                        {
+                            GetNextCharacter();
+                        }
+
+                        // The loop stops on the second ']' with '>' still in the look-ahead, so both
+                        // have to be consumed here. (Testing NextCharacter == '>' at this point can
+                        // never succeed and would leave "]>" in the stream.)
+                        if (!IsAtEndOfStream)
+                        {
+                            GetNextCharacter(); // now on '>'
+                            GetNextCharacter(); // past the block
+                        }
+                    }
+                    else
+                    {
+                        // Skip processing instruction, comments
+                        while (!IsAtEndOfStream && NextCharacter != '>')
+                        {
+                            GetNextCharacter();
+                        }
+                        if (NextCharacter == '>')
+                        {
+                            GetNextCharacter();
+                        }
+                    }
+                }
+
+                if (!char.IsWhiteSpace(NextCharacter))
+                {
+                    break;
+                }
+
+                GetNextCharacter();
+            }
+        }
+
+        /// <summary>
+        ///     tells whether a character may begin a name:
+        ///     a letter (as defined by char.IsLetter) or an underscore
+        /// </summary>
+        /// <param name="character">
+        ///     character value to be checked
+        /// </param>
+        /// <returns>
+        ///     true if the character can be the first character in a name
+        ///     false otherwise
+        /// </returns>
+        private bool IsGoodForNameStart(char character)
+        {
+            return char.IsLetter(character) || character == '_';
+        }
+
+        /// <summary>
+        ///     tells whether a character may appear inside a name after its first character:
+        ///     anything accepted by IsGoodForNameStart, plus '.', '-', ':', digits, and whatever
+        ///     the IsCombiningCharacter and IsExtender predicates accept
+        /// </summary>
+        /// <param name="character">
+        ///     character to be checked for validity in a name
+        /// </param>
+        /// <returns>
+        ///     true if the character can be a valid part of a name
+        /// </returns>
+        private bool IsGoodForName(char character)
+        {
+            if (IsGoodForNameStart(character))
+            {
+                return true;
+            }
+
+            switch (character)
+            {
+                case '.':
+                case '-':
+                case ':':
+                    return true;
+            }
+
+            return char.IsDigit(character) || IsCombiningCharacter(character) || IsExtender(character);
+        }
+
+        /// <summary>
+        ///     identifies a character as being a combining character, permitted in a name
+        ///     TODO: placeholder - currently rejects every character; to be replaced with comparisons
+        ///     against the list of combining characters in the XML documentation
+        /// </summary>
+        /// <param name="character">
+        ///     character to be checked
+        /// </param>
+        /// <returns>
+        ///     always false in the current implementation
+        /// </returns>
+        private bool IsCombiningCharacter(char character)
+        {
+            return false;
+        }
+
+        /// <summary>
+        ///     identifies a character as being an extender, permitted in a name
+        ///     TODO: placeholder - currently rejects every character; to be replaced with comparisons
+        ///     against the list of extenders in the XML documentation
+        /// </summary>
+        /// <param name="character">
+        ///     character to be checked
+        /// </param>
+        /// <returns>
+        ///     always false in the current implementation
+        /// </returns>
+        private bool IsExtender(char character)
+        {
+            return false;
+        }
+
+        /// <summary>
+        ///     skips dynamic content starting with '&lt;![' and ending with ']>'
+        ///     and reports it as an empty Text token
+        /// </summary>
+        private void ReadDynamicContent()
+        {
+            // the caller must have positioned the reader on the '!' of "<!["
+            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '[');
+
+            // the whole block is reported as empty text
+            _nextToken.Length = 0;
+            NextTokenType = HtmlTokenType.Text;
+
+            GetNextCharacter(); // '!' -> '['
+            GetNextCharacter(); // '[' -> first character of the content
+
+            // NOTE: 10/12/2004: the terminator looked for is "]>", not "]]>", because some directives
+            // start with "<![" and simply end with "]>". As a consequence CDATA (or anything else in
+            // its own set of [] within the directive) cannot contain a "]>" sequence.
+            while (!IsAtEndOfStream && !(NextCharacter == ']' && _lookAheadCharacter == '>'))
+            {
+                GetNextCharacter();
+            }
+
+            if (IsAtEndOfStream)
+            {
+                // unterminated block: nothing left to consume
+                return;
+            }
+
+            GetNextCharacter(); // ']' -> '>'
+            GetNextCharacter(); // '>' -> first character after the block
+        }
+
+        /// <summary>
+        ///     reads a comment starting with '&lt;!-' into the current token (type Comment)
+        ///     the comment ends at "-->" or at the nonstandard "!>", because in practice many html
+        ///     pages do not use the full comment specifying conventions
+        ///     if the input ends before a terminator is found, the text read so far is returned
+        ///     as the comment and the reader is left at end of stream
+        /// </summary>
+        private void ReadComment()
+        {
+            // verify that we are at a comment
+            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '-');
+
+            // Initialize a token
+            NextTokenType = HtmlTokenType.Comment;
+            _nextToken.Length = 0;
+
+            // Step over the first '-', the second '-' and onto the first character of the comment
+            // content. Stop early if the input ends inside the comment opener ("<!-" or "<!--"):
+            // GetNextCharacter throws when called at end of stream.
+            for (var i = 0; i < 3 && !IsAtEndOfStream; i++)
+            {
+                GetNextCharacter();
+            }
+
+            while (!IsAtEndOfStream)
+            {
+                // Read text until a possible end of comment
+                // Note that in many actual html pages comments end with "!>" (while xml standard is "-->")
+                while (!IsAtEndOfStream &&
+                       !(NextCharacter == '-' && _lookAheadCharacter == '-' ||
+                         NextCharacter == '!' && _lookAheadCharacter == '>'))
+                {
+                    _nextToken.Append(NextCharacter);
+                    GetNextCharacter();
+                }
+
+                if (IsAtEndOfStream)
+                {
+                    // Unterminated comment: keep what was read instead of advancing past the end
+                    break;
+                }
+
+                // Finish comment reading
+                GetNextCharacter();
+                if (_previousCharacter == '-' && NextCharacter == '-' && _lookAheadCharacter == '>')
+                {
+                    // Standard comment end. Eat it and exit the loop
+                    GetNextCharacter(); // get '>'
+                    break;
+                }
+                if (_previousCharacter == '!' && NextCharacter == '>')
+                {
+                    // Nonstandard but possible comment end - '!>'. Exit the loop
+                    break;
+                }
+                // Not an end. Save character and continue reading
+                _nextToken.Append(_previousCharacter);
+            }
+
+            // Read end of comment combination
+            if (NextCharacter == '>')
+            {
+                GetNextCharacter();
+            }
+        }
+
+        /// <summary>
+        ///     skips past unknown directives that start with "&lt;!" but are not comments or Cdata
+        ///     the content of such a directive is ignored up to and including the next ">"
+        ///     that is not an entity, and the directive is reported as an empty Text token
+        ///     applies to directives such as DOCTYPE, etc that we do not presently support
+        /// </summary>
+        private void ReadUnknownDirective()
+        {
+            // the caller must have positioned the reader on the '!' of a "<!" that opens
+            // neither a comment nor a "<![" block
+            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' &&
+                         _lookAheadCharacter != '-' && _lookAheadCharacter != '[');
+
+            // the directive is reported as empty text
+            _nextToken.Length = 0;
+            NextTokenType = HtmlTokenType.Text;
+
+            // step over '!'
+            GetNextCharacter();
+
+            // run to the first real tag end; a '>' produced by an entity does not count
+            while (!IsAtEndOfStream && (NextCharacter != '>' || IsNextCharacterEntity))
+            {
+                GetNextCharacter();
+            }
+
+            if (IsAtEndOfStream)
+            {
+                return;
+            }
+
+            // step over the closing '>'
+            GetNextCharacter();
+        }
+
+        /// <summary>
+        ///     skips processing directives starting with the characters '&lt;?' and ending with '?>'
+        ///     NOTE: 10/14/2004: IE also ends processing directives with a />, so that
+        ///     terminator is recognized as well
+        /// </summary>
+        private void SkipProcessingDirective()
+        {
+            // the caller must have positioned the reader on the '<' of "<?"
+            Debug.Assert(NextCharacter == '<' && _lookAheadCharacter == '?');
+
+            GetNextCharacter(); // '<' -> '?'
+            GetNextCharacter(); // '?' -> first character of the directive
+
+            while (!IsAtEndOfStream)
+            {
+                // no entity check here: '?' is not an entity, and the look-ahead character
+                // is never entity-decoded
+                var atTerminator = (NextCharacter == '?' || NextCharacter == '/') && _lookAheadCharacter == '>';
+                if (atTerminator)
+                {
+                    break;
+                }
+
+                GetNextCharacter();
+            }
+
+            if (IsAtEndOfStream)
+            {
+                // unterminated directive: nothing left to consume
+                return;
+            }
+
+            GetNextCharacter(); // '?' or '/' -> '>'
+            GetNextCharacter(); // '>' -> first character after the directive
+        }
+
+        #endregion Private Methods
+
+        // ---------------------------------------------------------------------
+        //
+        // Private Properties
+        //
+        // ---------------------------------------------------------------------
+
+        #region Private Properties
+
+        // current character; set by GetNextCharacter (entity-decoded when IsNextCharacterEntity is true)
+        private char NextCharacter { get; set; }
+
+        // true once the end-of-input code (-1) has become the current character code
+        private bool IsAtEndOfStream
+        {
+            get { return _nextCharacterCode == -1; }
+        }
+
+        // true on a literal '<' that is followed by '/' or by a character that can start a name
+        private bool IsAtTagStart
+        {
+            get
+            {
+                if (IsNextCharacterEntity || NextCharacter != '<')
+                {
+                    return false;
+                }
+                return _lookAheadCharacter == '/' || IsGoodForNameStart(_lookAheadCharacter);
+            }
+        }
+
+        // true on a literal '>' or on the '/' of "/>"
+        private bool IsAtTagEnd
+        {
+            get
+            {
+                if (IsNextCharacterEntity)
+                {
+                    return false;
+                }
+                return NextCharacter == '>' || (NextCharacter == '/' && _lookAheadCharacter == '>');
+            }
+        }
+
+        // true on the literal '<' of "<!"
+        private bool IsAtDirectiveStart
+        {
+            get { return !IsNextCharacterEntity && NextCharacter == '<' && _lookAheadCharacter == '!'; }
+        }
+
+        // true when the current character was produced by decoding an entity
+        private bool IsNextCharacterEntity { get; set; }
+
+        #endregion Private Properties
+
+        // ---------------------------------------------------------------------
+        //
+        // Private Fields
+        //
+        // ---------------------------------------------------------------------
+
+        #region Private Fields
+
+        // string reader which will move over input text
+        private readonly StringReader _inputStringReader;
+        // code of the current character (the one exposed as NextCharacter);
+        // -1 once the end of the input has been reached (see IsAtEndOfStream)
+        private int _nextCharacterCode;
+        private int _lookAheadCharacterCode; // result of the latest read from _inputStringReader; once -1 it is not read again
+        private char _lookAheadCharacter; // _lookAheadCharacterCode cast to char
+        private char _previousCharacter; // value NextCharacter held before the latest GetNextCharacter call
+        private bool _ignoreNextWhitespace; // when true, GetNextContentToken drops whitespace instead of emitting ' '
+
+        // accumulates the text of the token being read; exposed through the NextToken property
+        private readonly StringBuilder _nextToken;
+
+        #endregion Private Fields
+    }
+}