745 lines
29 KiB
C#
745 lines
29 KiB
C#
// // Copyright (c) Microsoft. All rights reserved.
|
|
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
|
|
|
using System;
|
|
using System.Diagnostics;
|
|
using System.IO;
|
|
using System.Text;
|
|
|
|
namespace HtmlToXamlDemo
|
|
{
|
|
/// <summary>
|
|
/// lexical analyzer class
|
|
/// recognizes tokens as groups of characters separated by arbitrary amounts of whitespace
|
|
/// also classifies tokens according to type
|
|
/// </summary>
|
|
internal class HtmlLexicalAnalyzer
|
|
{
|
|
// ---------------------------------------------------------------------
|
|
//
|
|
// Constructors
|
|
//
|
|
// ---------------------------------------------------------------------
|
|
|
|
#region Constructors
|
|
|
|
/// <summary>
/// Creates a lexer over <paramref name="inputTextString"/> and primes it so that
/// the first character of the input is already available as the current character.
/// </summary>
/// <param name="inputTextString">
/// text string to be tokenized
/// </param>
internal HtmlLexicalAnalyzer(string inputTextString)
{
    _inputStringReader = new StringReader(inputTextString);
    _nextToken = new StringBuilder(100);

    // Neutral starting state: pretend a blank precedes the input and that
    // leading whitespace should be dropped.
    NextTokenType = HtmlTokenType.Text;
    _ignoreNextWhitespace = true;
    _previousCharacter = ' ';
    NextCharacter = ' ';
    _nextCharacterCode = 0;

    // Fill the one-character lookahead from the reader (the lookahead code is
    // still at its default of 0 here, so the read is always performed) ...
    ReadLookAheadCharacter();

    // ... then shift it into NextCharacter so callers see the first input character.
    GetNextCharacter();
}
|
|
|
|
#endregion Constructors
|
|
|
|
// ---------------------------------------------------------------------
|
|
//
|
|
// Internal methods
|
|
//
|
|
// ---------------------------------------------------------------------
|
|
|
|
#region Internal Methods
|
|
|
|
/// <summary>
/// Reads the next token found in element content and records its type in
/// <see cref="NextTokenType"/>: an opening or closing tag start, a comment,
/// a run of text (directives are reported as empty text), or Eof when the
/// input is exhausted.
/// </summary>
internal void GetNextContentToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);
    _nextToken.Length = 0;

    if (IsAtEndOfStream)
    {
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    if (IsAtTagStart)
    {
        // step over the '<'
        GetNextCharacter();

        var isClosingTag = NextCharacter == '/';
        if (isClosingTag)
        {
            _nextToken.Append("</");
            NextTokenType = HtmlTokenType.ClosingTagStart;

            // step over the '/'
            GetNextCharacter();
        }
        else
        {
            NextTokenType = HtmlTokenType.OpeningTagStart;
            _nextToken.Append("<");
        }

        // Whitespace is significant after a closing tag and insignificant after an opening tag
        _ignoreNextWhitespace = !isClosingTag;
        return;
    }

    if (IsAtDirectiveStart)
    {
        // step over the '<'; the character after '!' decides the directive kind
        GetNextCharacter();
        switch (_lookAheadCharacter)
        {
            case '[':
                // CDATA or other bracketed dynamic content
                ReadDynamicContent();
                break;
            case '-':
                ReadComment();
                break;
            default:
                // something like DOCTYPE - skipped up to the next tag end
                ReadUnknownDirective();
                break;
        }
        return;
    }

    // Plain text: accumulate until a tag, a directive or the end of input
    NextTokenType = HtmlTokenType.Text;
    while (!(IsAtTagStart || IsAtEndOfStream || IsAtDirectiveStart))
    {
        var atProcessingDirective =
            NextCharacter == '<' && !IsNextCharacterEntity && _lookAheadCharacter == '?';
        if (atProcessingDirective)
        {
            // processing directives contribute nothing to the text
            SkipProcessingDirective();
            continue;
        }

        // Any character at or below ' ' (controls included) counts as whitespace
        var isWhitespace = NextCharacter <= ' ';
        if (!isWhitespace)
        {
            _nextToken.Append(NextCharacter);
        }
        else if (!_ignoreNextWhitespace)
        {
            // A whole whitespace run collapses into a single space
            _nextToken.Append(' ');
        }

        // After whitespace keep ignoring further whitespace; after text stop ignoring it
        _ignoreNextWhitespace = isWhitespace;
        GetNextCharacter();
    }
}
|
|
|
|
/// <summary>
/// Unconditionally returns a token which is one of: TagEnd, EmptyTagEnd, Name, Atom or Eof.
/// Does not guarantee token reader advancing.
/// </summary>
/// <remarks>
/// End of stream is checked both before and after skipping whitespace, so a tag
/// that is cut off after trailing whitespace (for example <c>&lt;a </c>) yields
/// Eof instead of attempting to read past the end of the input.
/// </remarks>
internal void GetNextTagToken()
{
    _nextToken.Length = 0;
    if (IsAtEndOfStream)
    {
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    SkipWhiteSpace();

    // Skipping whitespace (and embedded directives) may have consumed the rest
    // of the input; without this check the Atom branch below would append the
    // end-of-stream sentinel and GetNextCharacter would throw.
    if (IsAtEndOfStream)
    {
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    if (NextCharacter == '>' && !IsNextCharacterEntity)
    {
        // an entity-encoded '>' must not end a tag, hence the entity check
        NextTokenType = HtmlTokenType.TagEnd;
        _nextToken.Append('>');
        GetNextCharacter();
        // Note: _ignoreNextWhitespace must be set appropriately on tag start processing
    }
    else if (NextCharacter == '/' && _lookAheadCharacter == '>')
    {
        // end of an empty (self-closing) tag
        NextTokenType = HtmlTokenType.EmptyTagEnd;
        _nextToken.Append("/>");
        GetNextCharacter();
        GetNextCharacter();
        _ignoreNextWhitespace = false; // Whitespace after no-scope tags is significant
    }
    else if (IsGoodForNameStart(NextCharacter))
    {
        NextTokenType = HtmlTokenType.Name;

        // Character entities are allowed inside a name.
        // Hitting end of stream is not an error here: the name read so far is
        // returned and the next token request reports Eof.
        while (IsGoodForName(NextCharacter) && !IsAtEndOfStream)
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }
    }
    else
    {
        // Unexpected type of token for a tag. Report one character as Atom, expecting that HtmlParser will ignore it.
        NextTokenType = HtmlTokenType.Atom;
        _nextToken.Append(NextCharacter);
        GetNextCharacter();
    }
}
|
|
|
|
/// <summary>
/// Always produces an EqualSign token whose text is "=". If, after skipping
/// whitespace, the input really has an '=', it is consumed; otherwise the
/// reader is left where it is and the token is reported anyway.
/// </summary>
internal void GetNextEqualSignToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);

    // The token is the same regardless of what the input holds
    NextTokenType = HtmlTokenType.EqualSign;
    _nextToken.Length = 0;
    _nextToken.Append('=');

    SkipWhiteSpace();

    // '=' is not in the list of entities, so no entity check is needed
    var hasRealEqualSign = NextCharacter == '=';
    if (hasRealEqualSign)
    {
        GetNextCharacter();
    }
}
|
|
|
|
/// <summary>
/// Always produces an Atom token holding an attribute value. A value that
/// starts with a single or double quote runs to the matching quote (or end
/// of input); an unquoted value runs to the next whitespace, '>' or end of
/// input. The resulting token may be empty, and the reader is not guaranteed
/// to advance.
/// </summary>
internal void GetNextAtomToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);
    _nextToken.Length = 0;

    SkipWhiteSpace();

    NextTokenType = HtmlTokenType.Atom;

    var opensWithQuote = (NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity;
    if (!opensWithQuote)
    {
        // Unquoted value
        while (!(IsAtEndOfStream || char.IsWhiteSpace(NextCharacter) || NextCharacter == '>'))
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }
        return;
    }

    var quote = NextCharacter;

    // step over the opening quote
    GetNextCharacter();

    // Collect everything up to a literal (non-entity) matching quote
    while (!IsAtEndOfStream && (NextCharacter != quote || IsNextCharacterEntity))
    {
        _nextToken.Append(NextCharacter);
        GetNextCharacter();
    }

    // step over the closing quote when there is one
    if (NextCharacter == quote)
    {
        GetNextCharacter();
    }

    // NOTE: this recovery differs from IE's.
    // IE keeps reading until it finds a closing quote or end of file;
    // at end of file it treats the current value as text, and if it finds a
    // closing quote anywhere it eats everything between the quotes.
    // TODO: Suggestion: stop at end of file or at an angle bracket of any kind
    // and assume a quote was there, so the attribute value may be meaningless
    // but is never treated as text.
}
|
|
|
|
#endregion Internal Methods
|
|
|
|
// ---------------------------------------------------------------------
|
|
//
|
|
// Internal Properties
|
|
//
|
|
// ---------------------------------------------------------------------
|
|
|
|
#region Internal Properties
|
|
|
|
/// <summary>
/// Type of the token most recently produced by one of the GetNext...Token methods.
/// Only this class can set it.
/// </summary>
internal HtmlTokenType NextTokenType { get; private set; }
|
|
|
|
/// <summary>
/// Text of the token most recently produced; a new string is built from the
/// internal token buffer on every access.
/// </summary>
internal string NextToken => _nextToken.ToString();
|
|
|
|
#endregion Internal Properties
|
|
|
|
// ---------------------------------------------------------------------
|
|
//
|
|
// Private methods
|
|
//
|
|
// ---------------------------------------------------------------------
|
|
|
|
#region Private Methods
|
|
|
|
/// <summary>
/// Advances the reading position by one logical character and makes that
/// character available through the NextCharacter property.
/// A character entity (<c>&amp;#DDD;</c> or <c>&amp;name;</c>) is decoded into a
/// single character and flagged through IsNextCharacterEntity.
/// </summary>
/// <remarks>
/// Throws InvalidOperationException if called when the end of the stream has
/// already been reached.
/// A malformed or unknown entity is dropped: the characters scanned after the
/// ampersand are discarded and the character following them becomes current.
/// A numeric entity whose value does not fit in a single char is reported as '?'.
/// </remarks>
private void GetNextCharacter()
{
    if (_nextCharacterCode == -1)
    {
        throw new InvalidOperationException("GetNextCharacter method called at the end of a stream");
    }

    _previousCharacter = NextCharacter;

    // Shift the lookahead into the current position
    NextCharacter = _lookAheadCharacter;
    _nextCharacterCode = _lookAheadCharacterCode;
    // next character not an entity as of now
    IsNextCharacterEntity = false;

    ReadLookAheadCharacter();

    if (NextCharacter != '&')
    {
        return;
    }

    if (_lookAheadCharacter == '#')
    {
        // numeric entity - parse digits - &#DDDDD;
        var entityCode = 0;
        ReadLookAheadCharacter();

        // largest numeric entity is 7 characters.
        // Only ASCII digits are accepted: char.IsDigit would also admit other
        // Unicode decimal digits, for which subtracting '0' gives a wrong value.
        for (var i = 0; i < 7 && _lookAheadCharacter >= '0' && _lookAheadCharacter <= '9'; i++)
        {
            entityCode = 10*entityCode + (_lookAheadCharacterCode - '0');
            ReadLookAheadCharacter();
        }

        if (_lookAheadCharacter == ';')
        {
            // correct format - advance past the ';'
            ReadLookAheadCharacter();

            // A value outside the range of a single char becomes '?'
            // (a plain cast would silently wrap to an unrelated character).
            NextCharacter = entityCode <= char.MaxValue ? (char) entityCode : '?';
            _nextCharacterCode = NextCharacter;

            // as far as we are concerned, this is an entity
            IsNextCharacterEntity = true;
        }
        else
        {
            // not an entity; the digits already scanned are lost
            SkipInvalidEntity();
        }
    }
    else if (char.IsLetter(_lookAheadCharacter))
    {
        // entity is written as a name
        var entity = new StringBuilder(10);

        // maximum length of string entities is 10 characters
        for (var i = 0;
            i < 10 && (char.IsLetter(_lookAheadCharacter) || char.IsDigit(_lookAheadCharacter));
            i++)
        {
            entity.Append(_lookAheadCharacter);
            ReadLookAheadCharacter();
        }

        if (_lookAheadCharacter != ';')
        {
            // no terminating ';' - skip whatever was read after the ampersand
            SkipInvalidEntity();
            return;
        }

        // advance past the ';'
        ReadLookAheadCharacter();

        var entityName = entity.ToString();
        if (HtmlSchema.IsEntity(entityName))
        {
            NextCharacter = HtmlSchema.EntityCharacterValue(entityName);
            _nextCharacterCode = NextCharacter;
            IsNextCharacterEntity = true;
        }
        else
        {
            // unknown entity name - skip the whole thing
            SkipInvalidEntity();
        }
    }
}

/// <summary>
/// Recovery step for a malformed entity: makes the current lookahead the
/// current character (keeping the character code in sync, so that end of
/// stream is detected when the lookahead was the end) and refills the lookahead.
/// </summary>
private void SkipInvalidEntity()
{
    NextCharacter = _lookAheadCharacter;
    _nextCharacterCode = _lookAheadCharacterCode;
    ReadLookAheadCharacter();
    IsNextCharacterEntity = false;
}
|
|
|
|
/// <summary>
/// Pulls one more character from the input reader into the lookahead slot.
/// Does nothing once the lookahead has already reported end of input (-1).
/// </summary>
private void ReadLookAheadCharacter()
{
    if (_lookAheadCharacterCode == -1)
    {
        // the reader is exhausted; keep the end marker in place
        return;
    }

    var code = _inputStringReader.Read();
    _lookAheadCharacterCode = code;
    _lookAheadCharacter = (char) code;
}
|
|
|
|
/// <summary>
/// Skips whitespace in the input, together with any directive
/// (<c>&lt;?...&gt;</c>, <c>&lt;!...&gt;</c>, <c>&lt;![...]]&gt;</c>) met along the way.
/// Leaves the first remaining non-whitespace character in the NextCharacter
/// property; this may be the end-of-stream position, which callers must check.
/// </summary>
private void SkipWhiteSpace()
{
    // TODO: handle character entities while processing comments, cdata, and directives
    // TODO: SUGGESTION: we could check if lookahead and previous characters are entities also
    while (true)
    {
        if (NextCharacter == '<' && (_lookAheadCharacter == '?' || _lookAheadCharacter == '!'))
        {
            // step over the '<'
            GetNextCharacter();

            if (_lookAheadCharacter == '[')
            {
                // Skip CDATA block and DTDs(?) up to the closing "]]>"
                while (!IsAtEndOfStream &&
                       !(_previousCharacter == ']' && NextCharacter == ']' && _lookAheadCharacter == '>'))
                {
                    GetNextCharacter();
                }

                // The loop stops on the second ']' of "]]>", so two advances
                // are needed to get past the terminator: one onto the '>' and
                // one beyond it. (Testing NextCharacter for '>' here could
                // never succeed and left "]>" in the stream.)
                if (!IsAtEndOfStream)
                {
                    GetNextCharacter();
                    GetNextCharacter();
                }
            }
            else
            {
                // Skip processing instruction, comments - up to the first '>'
                while (!IsAtEndOfStream && NextCharacter != '>')
                {
                    GetNextCharacter();
                }
                if (NextCharacter == '>')
                {
                    GetNextCharacter();
                }
            }
        }

        if (!char.IsWhiteSpace(NextCharacter))
        {
            break;
        }

        GetNextCharacter();
    }
}
|
|
|
|
/// <summary>
/// Tells whether a character may begin a name: an underscore or any letter.
/// </summary>
/// <param name="character">
/// character value to be checked
/// </param>
/// <returns>
/// true if the character can be the first character in a name, false otherwise
/// </returns>
private bool IsGoodForNameStart(char character)
{
    if (character == '_')
    {
        return true;
    }

    return char.IsLetter(character);
}
|
|
|
|
/// <summary>
/// Tells whether a character may appear after the first character of a name:
/// anything that can start a name, a digit, '.', '-' or ':', or whatever the
/// IsCombiningCharacter and IsExtender predicates accept.
/// </summary>
/// <param name="character">
/// character to be checked for validity in a name
/// </param>
/// <returns>
/// true if the character can be a valid part of a name
/// </returns>
private bool IsGoodForName(char character)
{
    if (IsGoodForNameStart(character) || char.IsDigit(character))
    {
        return true;
    }

    switch (character)
    {
        case '.':
        case '-':
        case ':':
            return true;
    }

    return IsCombiningCharacter(character) || IsExtender(character);
}
|
|
|
|
/// <summary>
/// Reports whether a character is a combining character permitted in a name.
/// TODO: placeholder - currently rejects every character; to be replaced with a
/// comparison against the list of combining characters in the XML documentation
/// </summary>
/// <param name="character">
/// character to be checked
/// </param>
/// <returns>
/// always false in the current implementation
/// </returns>
private bool IsCombiningCharacter(char character)
{
    return false;
}
|
|
|
|
/// <summary>
/// Reports whether a character is an extender permitted in a name.
/// TODO: placeholder - currently rejects every character; to be replaced with a
/// comparison against the list of extenders in the XML documentation
/// </summary>
/// <param name="character">
/// character to be checked
/// </param>
/// <returns>
/// always false in the current implementation
/// </returns>
private bool IsExtender(char character)
{
    return false;
}
|
|
|
|
/// <summary>
/// Skips dynamic content that starts with "&lt;![" and ends with "]&gt;"
/// (CDATA included) and reports it as an empty Text token.
/// </summary>
private void ReadDynamicContent()
{
    // The caller must have positioned us on the '!' of "<!["
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '[');

    // The content itself is discarded: the token is empty text
    _nextToken.Length = 0;
    NextTokenType = HtmlTokenType.Text;

    // Two advances: onto the '[' and then onto the first character after it
    GetNextCharacter();
    GetNextCharacter();

    // NOTE (10/12/2004): scanning stops at "]>" rather than "]]>", because some
    // directives open with "<![" and close with a plain "]>". As a consequence
    // nothing inside the directive, CDATA included, may contain "]>".
    while (!IsAtEndOfStream && (NextCharacter != ']' || _lookAheadCharacter != '>'))
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // onto the closing '>' ...
    GetNextCharacter();

    // ... and past it
    GetNextCharacter();
}
|
|
|
|
/// <summary>
/// Reads a comment starting with "&lt;!-" and stores its text in the token
/// buffer as a Comment token.
/// NOTE: 10/06/2004: besides the standard "--&gt;" terminator, "!&gt;" is also
/// accepted, because in practice many html pages do not use the full comment
/// specifying conventions.
/// </summary>
/// <remarks>
/// A comment that is never terminated ends at the end of the input: whatever
/// was read becomes the comment text and no exception is thrown.
/// </remarks>
private void ReadComment()
{
    // verify that we are at a comment
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '-');

    // Initialize a token
    NextTokenType = HtmlTokenType.Comment;
    _nextToken.Length = 0;

    // Advance three times - onto the first '-', the second '-', and the first
    // character of the comment content - stopping early if the input ends
    // inside the comment opener.
    for (var i = 0; i < 3 && !IsAtEndOfStream; i++)
    {
        GetNextCharacter();
    }

    while (!IsAtEndOfStream)
    {
        // Read text until a possible end of comment ("--" or "!>")
        // Note that in many actual html pages comments end with "!>" (while xml standard is "-->")
        while (!IsAtEndOfStream &&
               !(NextCharacter == '-' && _lookAheadCharacter == '-' ||
                 NextCharacter == '!' && _lookAheadCharacter == '>'))
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }

        if (IsAtEndOfStream)
        {
            // Unterminated comment: advancing further would throw
            break;
        }

        // Step onto the second character of the candidate terminator
        GetNextCharacter();
        if (_previousCharacter == '-' && NextCharacter == '-' && _lookAheadCharacter == '>')
        {
            // Standard comment end. Eat it and exit the loop
            GetNextCharacter(); // get '>'
            break;
        }
        if (_previousCharacter == '!' && NextCharacter == '>')
        {
            // Nonstandard but possible comment end - '!>'. Exit the loop
            break;
        }

        // Not an end. Save the character that was stepped over and continue reading
        _nextToken.Append(_previousCharacter);
    }

    // Step past the closing '>' of the terminator
    if (NextCharacter == '>')
    {
        GetNextCharacter();
    }
}
|
|
|
|
/// <summary>
/// Skips a directive that starts with "&lt;!" but is neither a comment nor
/// bracketed content (DOCTYPE and the like), discarding everything up to and
/// including the next literal "&gt;". The result is reported as an empty Text token.
/// </summary>
private void ReadUnknownDirective()
{
    // The caller must have positioned us on the '!' and ruled out "<!-" and "<!["
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' &&
                 _lookAheadCharacter != '-' && _lookAheadCharacter != '[');

    // The directive is reported as empty text
    _nextToken.Length = 0;
    NextTokenType = HtmlTokenType.Text;

    // step over the '!'
    GetNextCharacter();

    // Run forward to the first '>' that is not the product of an entity
    while (!IsAtEndOfStream && (NextCharacter != '>' || IsNextCharacterEntity))
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // step past the tag end
    GetNextCharacter();
}
|
|
|
|
/// <summary>
/// Skips a processing directive that starts with "&lt;?" and ends with "?&gt;".
/// NOTE: 10/14/2004: IE also ends processing directives with "/&gt;", so that
/// terminator is recognized as well.
/// </summary>
private void SkipProcessingDirective()
{
    // The caller must have positioned us on the '<' of "<?"
    Debug.Assert(NextCharacter == '<' && _lookAheadCharacter == '?');

    // Two advances: onto the '?' and then onto the first character of the directive
    GetNextCharacter();
    GetNextCharacter();

    while (!IsAtEndOfStream)
    {
        // No entity check is needed: '?' is not an entity, and although '>' can
        // be written as one, the lookahead character is never entity-decoded.
        var atTerminator = (NextCharacter == '?' || NextCharacter == '/') && _lookAheadCharacter == '>';
        if (atTerminator)
        {
            break;
        }

        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // onto the closing '>' ...
    GetNextCharacter();

    // ... and past it, to the first character after the directive
    GetNextCharacter();
}
|
|
|
|
#endregion Private Methods
|
|
|
|
// ---------------------------------------------------------------------
|
|
//
|
|
// Private Properties
|
|
//
|
|
// ---------------------------------------------------------------------
|
|
|
|
#region Private Properties
|
|
|
|
// Current character of the input: the one most recently made current by
// GetNextCharacter (already decoded when it came from a character entity).
private char NextCharacter { get; set; }
|
|
|
|
// True once the current character code is -1, the value StringReader.Read()
// returns when no more characters are available.
private bool IsAtEndOfStream => _nextCharacterCode == -1;
|
|
|
|
// True when the current character is a literal '<' (not entity-decoded)
// followed by '/' or by a character that can start a name.
private bool IsAtTagStart
{
    get
    {
        if (NextCharacter != '<')
        {
            return false;
        }

        var followedByTagContent = _lookAheadCharacter == '/' || IsGoodForNameStart(_lookAheadCharacter);
        return followedByTagContent && !IsNextCharacterEntity;
    }
}
|
|
|
|
// True when the current position holds a literal (not entity-decoded) ">" or "/>".
private bool IsAtTagEnd
{
    get
    {
        var atPlainEnd = NextCharacter == '>';
        var atEmptyTagEnd = NextCharacter == '/' && _lookAheadCharacter == '>';
        return (atPlainEnd || atEmptyTagEnd) && !IsNextCharacterEntity;
    }
}
|
|
|
|
// True when the current character is a literal '<' (not entity-decoded)
// immediately followed by '!'.
private bool IsAtDirectiveStart
{
    get
    {
        if (NextCharacter != '<' || _lookAheadCharacter != '!')
        {
            return false;
        }

        return !IsNextCharacterEntity;
    }
}
|
|
|
|
// True when the current character was produced by decoding a character
// entity rather than read literally from the input.
private bool IsNextCharacterEntity { get; set; }
|
|
|
|
#endregion Private Properties
|
|
|
|
// ---------------------------------------------------------------------
|
|
//
|
|
// Private Fields
|
|
//
|
|
// ---------------------------------------------------------------------
|
|
|
|
#region Private Fields
|
|
|
|
// Reader that moves over the input text; characters are pulled from it one at a time.
private readonly StringReader _inputStringReader;

// Buffer in which the text of the current token is accumulated before being
// exposed through the NextToken property.
private readonly StringBuilder _nextToken;

// Code of the current character (not yet part of any token); -1 marks end of stream.
private int _nextCharacterCode;

// One-character lookahead: the raw code returned by the reader and the same
// value cast to a char.
private int _lookAheadCharacterCode;
private char _lookAheadCharacter;

// Character that was current before the most recent advance.
private char _previousCharacter;

// When true, the next whitespace met in text content is dropped instead of
// being emitted as a single space.
private bool _ignoreNextWhitespace;
|
|
|
|
#endregion Private Fields
|
|
}
|
|
} |