Initial population
This commit is contained in:
745
JRCookbookBusiness/Converters/HtmlLexicalAnalyzer.cs
Normal file
745
JRCookbookBusiness/Converters/HtmlLexicalAnalyzer.cs
Normal file
@@ -0,0 +1,745 @@
|
||||
// // Copyright (c) Microsoft. All rights reserved.
|
||||
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
|
||||
namespace HtmlToXamlDemo
|
||||
{
|
||||
/// <summary>
|
||||
/// lexical analyzer class
|
||||
/// recognizes tokens as groups of characters separated by arbitrary amounts of whitespace
|
||||
/// also classifies tokens according to type
|
||||
/// </summary>
|
||||
internal class HtmlLexicalAnalyzer
|
||||
{
|
||||
// ---------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
#region Constructors
|
||||
|
||||
/// <summary>
|
||||
/// initializes the _inputStringReader member with the string to be read
|
||||
/// also sets initial values for _nextCharacterCode and _nextTokenType
|
||||
/// </summary>
|
||||
/// <param name="inputTextString">
|
||||
/// text string to be parsed for xml content
|
||||
/// </param>
|
||||
internal HtmlLexicalAnalyzer(string inputTextString)
|
||||
{
|
||||
_inputStringReader = new StringReader(inputTextString);
|
||||
_nextCharacterCode = 0;
|
||||
NextCharacter = ' ';
|
||||
_lookAheadCharacterCode = _inputStringReader.Read();
|
||||
_lookAheadCharacter = (char) _lookAheadCharacterCode;
|
||||
_previousCharacter = ' ';
|
||||
_ignoreNextWhitespace = true;
|
||||
_nextToken = new StringBuilder(100);
|
||||
NextTokenType = HtmlTokenType.Text;
|
||||
// read the first character so we have some value for the NextCharacter property
|
||||
GetNextCharacter();
|
||||
}
|
||||
|
||||
#endregion Constructors
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
//
|
||||
// Internal methods
|
||||
//
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
#region Internal Methods
|
||||
|
||||
/// <summary>
|
||||
/// retrieves next recognizable token from input string
|
||||
/// and identifies its type
|
||||
/// if no valid token is found, the output parameters are set to null
|
||||
/// if end of stream is reached without matching any token, token type
|
||||
/// paramter is set to EOF
|
||||
/// </summary>
|
||||
internal void GetNextContentToken()
|
||||
{
|
||||
Debug.Assert(NextTokenType != HtmlTokenType.Eof);
|
||||
_nextToken.Length = 0;
|
||||
if (IsAtEndOfStream)
|
||||
{
|
||||
NextTokenType = HtmlTokenType.Eof;
|
||||
return;
|
||||
}
|
||||
|
||||
if (IsAtTagStart)
|
||||
{
|
||||
GetNextCharacter();
|
||||
|
||||
if (NextCharacter == '/')
|
||||
{
|
||||
_nextToken.Append("</");
|
||||
NextTokenType = HtmlTokenType.ClosingTagStart;
|
||||
|
||||
// advance
|
||||
GetNextCharacter();
|
||||
_ignoreNextWhitespace = false; // Whitespaces after closing tags are significant
|
||||
}
|
||||
else
|
||||
{
|
||||
NextTokenType = HtmlTokenType.OpeningTagStart;
|
||||
_nextToken.Append("<");
|
||||
_ignoreNextWhitespace = true; // Whitespaces after opening tags are insignificant
|
||||
}
|
||||
}
|
||||
else if (IsAtDirectiveStart)
|
||||
{
|
||||
// either a comment or CDATA
|
||||
GetNextCharacter();
|
||||
if (_lookAheadCharacter == '[')
|
||||
{
|
||||
// cdata
|
||||
ReadDynamicContent();
|
||||
}
|
||||
else if (_lookAheadCharacter == '-')
|
||||
{
|
||||
ReadComment();
|
||||
}
|
||||
else
|
||||
{
|
||||
// neither a comment nor cdata, should be something like DOCTYPE
|
||||
// skip till the next tag ender
|
||||
ReadUnknownDirective();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// read text content, unless you encounter a tag
|
||||
NextTokenType = HtmlTokenType.Text;
|
||||
while (!IsAtTagStart && !IsAtEndOfStream && !IsAtDirectiveStart)
|
||||
{
|
||||
if (NextCharacter == '<' && !IsNextCharacterEntity && _lookAheadCharacter == '?')
|
||||
{
|
||||
// ignore processing directive
|
||||
SkipProcessingDirective();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (NextCharacter <= ' ')
|
||||
{
|
||||
// Respect xml:preserve or its equivalents for whitespace processing
|
||||
if (_ignoreNextWhitespace)
|
||||
{
|
||||
// Ignore repeated whitespaces
|
||||
}
|
||||
else
|
||||
{
|
||||
// Treat any control character sequence as one whitespace
|
||||
_nextToken.Append(' ');
|
||||
}
|
||||
_ignoreNextWhitespace = true; // and keep ignoring the following whitespaces
|
||||
}
|
||||
else
|
||||
{
|
||||
_nextToken.Append(NextCharacter);
|
||||
_ignoreNextWhitespace = false;
|
||||
}
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Unconditionally returns a token which is one of: TagEnd, EmptyTagEnd, Name, Atom or EndOfStream
|
||||
/// Does not guarantee token reader advancing.
|
||||
/// </summary>
|
||||
internal void GetNextTagToken()
|
||||
{
|
||||
_nextToken.Length = 0;
|
||||
if (IsAtEndOfStream)
|
||||
{
|
||||
NextTokenType = HtmlTokenType.Eof;
|
||||
return;
|
||||
}
|
||||
|
||||
SkipWhiteSpace();
|
||||
|
||||
if (NextCharacter == '>' && !IsNextCharacterEntity)
|
||||
{
|
||||
// > should not end a tag, so make sure it's not an entity
|
||||
NextTokenType = HtmlTokenType.TagEnd;
|
||||
_nextToken.Append('>');
|
||||
GetNextCharacter();
|
||||
// Note: _ignoreNextWhitespace must be set appropriately on tag start processing
|
||||
}
|
||||
else if (NextCharacter == '/' && _lookAheadCharacter == '>')
|
||||
{
|
||||
// could be start of closing of empty tag
|
||||
NextTokenType = HtmlTokenType.EmptyTagEnd;
|
||||
_nextToken.Append("/>");
|
||||
GetNextCharacter();
|
||||
GetNextCharacter();
|
||||
_ignoreNextWhitespace = false; // Whitespace after no-scope tags are sifnificant
|
||||
}
|
||||
else if (IsGoodForNameStart(NextCharacter))
|
||||
{
|
||||
NextTokenType = HtmlTokenType.Name;
|
||||
|
||||
// starts a name
|
||||
// we allow character entities here
|
||||
// we do not throw exceptions here if end of stream is encountered
|
||||
// just stop and return whatever is in the token
|
||||
// if the parser is not expecting end of file after this it will call
|
||||
// the get next token function and throw an exception
|
||||
while (IsGoodForName(NextCharacter) && !IsAtEndOfStream)
|
||||
{
|
||||
_nextToken.Append(NextCharacter);
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Unexpected type of token for a tag. Reprot one character as Atom, expecting that HtmlParser will ignore it.
|
||||
NextTokenType = HtmlTokenType.Atom;
|
||||
_nextToken.Append(NextCharacter);
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Unconditionally returns equal sign token. Even if there is no
|
||||
/// real equal sign in the stream, it behaves as if it were there.
|
||||
/// Does not guarantee token reader advancing.
|
||||
/// </summary>
|
||||
internal void GetNextEqualSignToken()
|
||||
{
|
||||
Debug.Assert(NextTokenType != HtmlTokenType.Eof);
|
||||
_nextToken.Length = 0;
|
||||
|
||||
_nextToken.Append('=');
|
||||
NextTokenType = HtmlTokenType.EqualSign;
|
||||
|
||||
SkipWhiteSpace();
|
||||
|
||||
if (NextCharacter == '=')
|
||||
{
|
||||
// '=' is not in the list of entities, so no need to check for entities here
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Unconditionally returns an atomic value for an attribute
|
||||
/// Even if there is no appropriate token it returns Atom value
|
||||
/// Does not guarantee token reader advancing.
|
||||
/// </summary>
|
||||
internal void GetNextAtomToken()
|
||||
{
|
||||
Debug.Assert(NextTokenType != HtmlTokenType.Eof);
|
||||
_nextToken.Length = 0;
|
||||
|
||||
SkipWhiteSpace();
|
||||
|
||||
NextTokenType = HtmlTokenType.Atom;
|
||||
|
||||
if ((NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity)
|
||||
{
|
||||
var startingQuote = NextCharacter;
|
||||
GetNextCharacter();
|
||||
|
||||
// Consume all characters between quotes
|
||||
while (!(NextCharacter == startingQuote && !IsNextCharacterEntity) && !IsAtEndOfStream)
|
||||
{
|
||||
_nextToken.Append(NextCharacter);
|
||||
GetNextCharacter();
|
||||
}
|
||||
if (NextCharacter == startingQuote)
|
||||
{
|
||||
GetNextCharacter();
|
||||
}
|
||||
|
||||
// complete the quoted value
|
||||
// NOTE: our recovery here is different from IE's
|
||||
// IE keeps reading until it finds a closing quote or end of file
|
||||
// if end of file, it treats current value as text
|
||||
// if it finds a closing quote at any point within the text, it eats everything between the quotes
|
||||
// TODO: Suggestion:
|
||||
// however, we could stop when we encounter end of file or an angle bracket of any kind
|
||||
// and assume there was a quote there
|
||||
// so the attribute value may be meaningless but it is never treated as text
|
||||
}
|
||||
else
|
||||
{
|
||||
while (!IsAtEndOfStream && !char.IsWhiteSpace(NextCharacter) && NextCharacter != '>')
|
||||
{
|
||||
_nextToken.Append(NextCharacter);
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endregion Internal Methods
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
//
|
||||
// Internal Properties
|
||||
//
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
#region Internal Properties
|
||||
|
||||
internal HtmlTokenType NextTokenType { get; private set; }
|
||||
|
||||
internal string NextToken => _nextToken.ToString();
|
||||
|
||||
#endregion Internal Properties
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
//
|
||||
// Private methods
|
||||
//
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
#region Private Methods
|
||||
|
||||
/// <summary>
|
||||
/// Advances a reading position by one character code
|
||||
/// and reads the next availbale character from a stream.
|
||||
/// This character becomes available as NextCharacter property.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Throws InvalidOperationException if attempted to be called on EndOfStream
|
||||
/// condition.
|
||||
/// </remarks>
|
||||
private void GetNextCharacter()
|
||||
{
|
||||
if (_nextCharacterCode == -1)
|
||||
{
|
||||
throw new InvalidOperationException("GetNextCharacter method called at the end of a stream");
|
||||
}
|
||||
|
||||
_previousCharacter = NextCharacter;
|
||||
|
||||
NextCharacter = _lookAheadCharacter;
|
||||
_nextCharacterCode = _lookAheadCharacterCode;
|
||||
// next character not an entity as of now
|
||||
IsNextCharacterEntity = false;
|
||||
|
||||
ReadLookAheadCharacter();
|
||||
|
||||
if (NextCharacter == '&')
|
||||
{
|
||||
if (_lookAheadCharacter == '#')
|
||||
{
|
||||
// numeric entity - parse digits - &#DDDDD;
|
||||
int entityCode;
|
||||
entityCode = 0;
|
||||
ReadLookAheadCharacter();
|
||||
|
||||
// largest numeric entity is 7 characters
|
||||
for (var i = 0; i < 7 && char.IsDigit(_lookAheadCharacter); i++)
|
||||
{
|
||||
entityCode = 10*entityCode + (_lookAheadCharacterCode - '0');
|
||||
ReadLookAheadCharacter();
|
||||
}
|
||||
if (_lookAheadCharacter == ';')
|
||||
{
|
||||
// correct format - advance
|
||||
ReadLookAheadCharacter();
|
||||
_nextCharacterCode = entityCode;
|
||||
|
||||
// if this is out of range it will set the character to '?'
|
||||
NextCharacter = (char) _nextCharacterCode;
|
||||
|
||||
// as far as we are concerned, this is an entity
|
||||
IsNextCharacterEntity = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// not an entity, set next character to the current lookahread character
|
||||
// we would have eaten up some digits
|
||||
NextCharacter = _lookAheadCharacter;
|
||||
_nextCharacterCode = _lookAheadCharacterCode;
|
||||
ReadLookAheadCharacter();
|
||||
IsNextCharacterEntity = false;
|
||||
}
|
||||
}
|
||||
else if (char.IsLetter(_lookAheadCharacter))
|
||||
{
|
||||
// entity is written as a string
|
||||
var entity = "";
|
||||
|
||||
// maximum length of string entities is 10 characters
|
||||
for (var i = 0;
|
||||
i < 10 && (char.IsLetter(_lookAheadCharacter) || char.IsDigit(_lookAheadCharacter));
|
||||
i++)
|
||||
{
|
||||
entity += _lookAheadCharacter;
|
||||
ReadLookAheadCharacter();
|
||||
}
|
||||
if (_lookAheadCharacter == ';')
|
||||
{
|
||||
// advance
|
||||
ReadLookAheadCharacter();
|
||||
|
||||
if (HtmlSchema.IsEntity(entity))
|
||||
{
|
||||
NextCharacter = HtmlSchema.EntityCharacterValue(entity);
|
||||
_nextCharacterCode = NextCharacter;
|
||||
IsNextCharacterEntity = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// just skip the whole thing - invalid entity
|
||||
// move on to the next character
|
||||
NextCharacter = _lookAheadCharacter;
|
||||
_nextCharacterCode = _lookAheadCharacterCode;
|
||||
ReadLookAheadCharacter();
|
||||
|
||||
// not an entity
|
||||
IsNextCharacterEntity = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// skip whatever we read after the ampersand
|
||||
// set next character and move on
|
||||
NextCharacter = _lookAheadCharacter;
|
||||
ReadLookAheadCharacter();
|
||||
IsNextCharacterEntity = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void ReadLookAheadCharacter()
|
||||
{
|
||||
if (_lookAheadCharacterCode != -1)
|
||||
{
|
||||
_lookAheadCharacterCode = _inputStringReader.Read();
|
||||
_lookAheadCharacter = (char) _lookAheadCharacterCode;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// skips whitespace in the input string
|
||||
/// leaves the first non-whitespace character available in the NextCharacter property
|
||||
/// this may be the end-of-file character, it performs no checking
|
||||
/// </summary>
|
||||
private void SkipWhiteSpace()
|
||||
{
|
||||
// TODO: handle character entities while processing comments, cdata, and directives
|
||||
// TODO: SUGGESTION: we could check if lookahead and previous characters are entities also
|
||||
while (true)
|
||||
{
|
||||
if (NextCharacter == '<' && (_lookAheadCharacter == '?' || _lookAheadCharacter == '!'))
|
||||
{
|
||||
GetNextCharacter();
|
||||
|
||||
if (_lookAheadCharacter == '[')
|
||||
{
|
||||
// Skip CDATA block and DTDs(?)
|
||||
while (!IsAtEndOfStream &&
|
||||
!(_previousCharacter == ']' && NextCharacter == ']' && _lookAheadCharacter == '>'))
|
||||
{
|
||||
GetNextCharacter();
|
||||
}
|
||||
if (NextCharacter == '>')
|
||||
{
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Skip processing instruction, comments
|
||||
while (!IsAtEndOfStream && NextCharacter != '>')
|
||||
{
|
||||
GetNextCharacter();
|
||||
}
|
||||
if (NextCharacter == '>')
|
||||
{
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!char.IsWhiteSpace(NextCharacter))
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// checks if a character can be used to start a name
|
||||
/// if this check is true then the rest of the name can be read
|
||||
/// </summary>
|
||||
/// <param name="character">
|
||||
/// character value to be checked
|
||||
/// </param>
|
||||
/// <returns>
|
||||
/// true if the character can be the first character in a name
|
||||
/// false otherwise
|
||||
/// </returns>
|
||||
private bool IsGoodForNameStart(char character) => character == '_' || char.IsLetter(character);
|
||||
|
||||
/// <summary>
|
||||
/// checks if a character can be used as a non-starting character in a name
|
||||
/// uses the IsExtender and IsCombiningCharacter predicates to see
|
||||
/// if a character is an extender or a combining character
|
||||
/// </summary>
|
||||
/// <param name="character">
|
||||
/// character to be checked for validity in a name
|
||||
/// </param>
|
||||
/// <returns>
|
||||
/// true if the character can be a valid part of a name
|
||||
/// </returns>
|
||||
private bool IsGoodForName(char character) => IsGoodForNameStart(character) ||
|
||||
character == '.' ||
|
||||
character == '-' ||
|
||||
character == ':' ||
|
||||
char.IsDigit(character) ||
|
||||
IsCombiningCharacter(character) ||
|
||||
IsExtender(character);
|
||||
|
||||
/// <summary>
|
||||
/// identifies a character as being a combining character, permitted in a name
|
||||
/// TODO: only a placeholder for now but later to be replaced with comparisons against
|
||||
/// the list of combining characters in the XML documentation
|
||||
/// </summary>
|
||||
/// <param name="character">
|
||||
/// character to be checked
|
||||
/// </param>
|
||||
/// <returns>
|
||||
/// true if the character is a combining character, false otherwise
|
||||
/// </returns>
|
||||
private bool IsCombiningCharacter(char character) => false;
|
||||
|
||||
/// <summary>
|
||||
/// identifies a character as being an extender, permitted in a name
|
||||
/// TODO: only a placeholder for now but later to be replaced with comparisons against
|
||||
/// the list of extenders in the XML documentation
|
||||
/// </summary>
|
||||
/// <param name="character">
|
||||
/// character to be checked
|
||||
/// </param>
|
||||
/// <returns>
|
||||
/// true if the character is an extender, false otherwise
|
||||
/// </returns>
|
||||
private bool IsExtender(char character) => false;
|
||||
|
||||
/// <summary>
|
||||
/// skips dynamic content starting with '<![' and ending with ']>'
|
||||
/// </summary>
|
||||
private void ReadDynamicContent()
|
||||
{
|
||||
// verify that we are at dynamic content, which may include CDATA
|
||||
Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '[');
|
||||
|
||||
// Let's treat this as empty text
|
||||
NextTokenType = HtmlTokenType.Text;
|
||||
_nextToken.Length = 0;
|
||||
|
||||
// advance twice, once to get the lookahead character and then to reach the start of the cdata
|
||||
GetNextCharacter();
|
||||
GetNextCharacter();
|
||||
|
||||
// NOTE: 10/12/2004: modified this function to check when called if's reading CDATA or something else
|
||||
// some directives may start with a <![ and then have some data and they will just end with a ]>
|
||||
// this function is modified to stop at the sequence ]> and not ]]>
|
||||
// this means that CDATA and anything else expressed in their own set of [] within the <! [...]>
|
||||
// directive cannot contain a ]> sequence. However it is doubtful that cdata could contain such
|
||||
// sequence anyway, it probably stops at the first ]
|
||||
while (!(NextCharacter == ']' && _lookAheadCharacter == '>') && !IsAtEndOfStream)
|
||||
{
|
||||
// advance
|
||||
GetNextCharacter();
|
||||
}
|
||||
|
||||
if (!IsAtEndOfStream)
|
||||
{
|
||||
// advance, first to the last >
|
||||
GetNextCharacter();
|
||||
|
||||
// then advance past it to the next character after processing directive
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// skips comments starting with '<!-' and ending with '-->'
|
||||
/// NOTE: 10/06/2004: processing changed, will now skip anything starting with
|
||||
/// the "<!-" sequence and ending in "!>" or "->", because in practice many html pages do not
|
||||
/// use the full comment specifying conventions
|
||||
/// </summary>
|
||||
private void ReadComment()
|
||||
{
|
||||
// verify that we are at a comment
|
||||
Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '-');
|
||||
|
||||
// Initialize a token
|
||||
NextTokenType = HtmlTokenType.Comment;
|
||||
_nextToken.Length = 0;
|
||||
|
||||
// advance to the next character, so that to be at the start of comment value
|
||||
GetNextCharacter(); // get first '-'
|
||||
GetNextCharacter(); // get second '-'
|
||||
GetNextCharacter(); // get first character of comment content
|
||||
|
||||
while (true)
|
||||
{
|
||||
// Read text until end of comment
|
||||
// Note that in many actual html pages comments end with "!>" (while xml standard is "-->")
|
||||
while (!IsAtEndOfStream &&
|
||||
!(NextCharacter == '-' && _lookAheadCharacter == '-' ||
|
||||
NextCharacter == '!' && _lookAheadCharacter == '>'))
|
||||
{
|
||||
_nextToken.Append(NextCharacter);
|
||||
GetNextCharacter();
|
||||
}
|
||||
|
||||
// Finish comment reading
|
||||
GetNextCharacter();
|
||||
if (_previousCharacter == '-' && NextCharacter == '-' && _lookAheadCharacter == '>')
|
||||
{
|
||||
// Standard comment end. Eat it and exit the loop
|
||||
GetNextCharacter(); // get '>'
|
||||
break;
|
||||
}
|
||||
if (_previousCharacter == '!' && NextCharacter == '>')
|
||||
{
|
||||
// Nonstandard but possible comment end - '!>'. Exit the loop
|
||||
break;
|
||||
}
|
||||
// Not an end. Save character and continue continue reading
|
||||
_nextToken.Append(_previousCharacter);
|
||||
}
|
||||
|
||||
// Read end of comment combination
|
||||
if (NextCharacter == '>')
|
||||
{
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// skips past unknown directives that start with "<!" but are not comments or Cdata
|
||||
/// ignores content of such directives until the next ">"
|
||||
/// character
|
||||
/// applies to directives such as DOCTYPE, etc that we do not presently support
|
||||
/// </summary>
|
||||
private void ReadUnknownDirective()
|
||||
{
|
||||
// verify that we are at an unknown directive
|
||||
Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' &&
|
||||
!(_lookAheadCharacter == '-' || _lookAheadCharacter == '['));
|
||||
|
||||
// Let's treat this as empty text
|
||||
NextTokenType = HtmlTokenType.Text;
|
||||
_nextToken.Length = 0;
|
||||
|
||||
// advance to the next character
|
||||
GetNextCharacter();
|
||||
|
||||
// skip to the first tag end we find
|
||||
while (!(NextCharacter == '>' && !IsNextCharacterEntity) && !IsAtEndOfStream)
|
||||
{
|
||||
GetNextCharacter();
|
||||
}
|
||||
|
||||
if (!IsAtEndOfStream)
|
||||
{
|
||||
// advance past the tag end
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// skips processing directives starting with the characters '<?' and ending with '?>'
|
||||
/// NOTE: 10/14/2004: IE also ends processing directives with a />, so this function is
|
||||
/// being modified to recognize that condition as well
|
||||
/// </summary>
|
||||
private void SkipProcessingDirective()
|
||||
{
|
||||
// verify that we are at a processing directive
|
||||
Debug.Assert(NextCharacter == '<' && _lookAheadCharacter == '?');
|
||||
|
||||
// advance twice, once to get the lookahead character and then to reach the start of the drective
|
||||
GetNextCharacter();
|
||||
GetNextCharacter();
|
||||
|
||||
while (!((NextCharacter == '?' || NextCharacter == '/') && _lookAheadCharacter == '>') && !IsAtEndOfStream)
|
||||
{
|
||||
// advance
|
||||
// we don't need to check for entities here because '?' is not an entity
|
||||
// and even though > is an entity there is no entity processing when reading lookahead character
|
||||
GetNextCharacter();
|
||||
}
|
||||
|
||||
if (!IsAtEndOfStream)
|
||||
{
|
||||
// advance, first to the last >
|
||||
GetNextCharacter();
|
||||
|
||||
// then advance past it to the next character after processing directive
|
||||
GetNextCharacter();
|
||||
}
|
||||
}
|
||||
|
||||
#endregion Private Methods
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
//
|
||||
// Private Properties
|
||||
//
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
#region Private Properties
|
||||
|
||||
private char NextCharacter { get; set; }
|
||||
|
||||
private bool IsAtEndOfStream => _nextCharacterCode == -1;
|
||||
|
||||
private bool IsAtTagStart
|
||||
=> NextCharacter == '<' && (_lookAheadCharacter == '/' || IsGoodForNameStart(_lookAheadCharacter)) &&
|
||||
!IsNextCharacterEntity;
|
||||
|
||||
private bool IsAtTagEnd => (NextCharacter == '>' || (NextCharacter == '/' && _lookAheadCharacter == '>')) &&
|
||||
!IsNextCharacterEntity;
|
||||
|
||||
private bool IsAtDirectiveStart
|
||||
=> (NextCharacter == '<' && _lookAheadCharacter == '!' && !IsNextCharacterEntity);
|
||||
|
||||
private bool IsNextCharacterEntity { // check if next character is an entity
|
||||
get; set; }
|
||||
|
||||
#endregion Private Properties
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
//
|
||||
// Private Fields
|
||||
//
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
#region Private Fields
|
||||
|
||||
// string reader which will move over input text
|
||||
private readonly StringReader _inputStringReader;
|
||||
// next character code read from input that is not yet part of any token
|
||||
// and the character it represents
|
||||
private int _nextCharacterCode;
|
||||
private int _lookAheadCharacterCode;
|
||||
private char _lookAheadCharacter;
|
||||
private char _previousCharacter;
|
||||
private bool _ignoreNextWhitespace;
|
||||
|
||||
// store token and type in local variables before copying them to output parameters
|
||||
private readonly StringBuilder _nextToken;
|
||||
|
||||
#endregion Private Fields
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user