While researching how to develop a language, I came across several different compilers on CodePlex. For my own compiler I am using the following design.
To keep things simple, I am following the grammar of the Visual Basic .NET language. I have started development and have managed to get the scanner written (source code below); I based it on Joel Pobar's Nua. I am still fine-tuning it and learning more about the proper way to develop a compiler.
Imports Microsoft.Scripting
Imports Microsoft.Scripting.Runtime
''' <summary>
''' A lexical analyzer for GFN. It produces a stream of lexical tokens.
''' </summary>
''' <remarks>
''' The scanner keeps a single token of lookahead in <c>_token</c>. <see cref="Peek"/>
''' exposes that token without consuming it and <see cref="Read"/> consumes it.
''' Every scanning routine follows the same protocol on the underlying buffer:
''' consume the characters, mark the token end, build the span, then discard the
''' token so the buffer can start the next one.
''' </remarks>
Public Class Scanner

    ' Buffer used to extract and process tokens.
    Private _buffer As TokenizerBuffer
    ' Used to track errors.
    Private _errors As ErrorSink
    ' Source code to be read.
    Private _source As SourceUnit
    ' Lookahead token; Nothing until the first Peek/Read.
    Private _token As Token

    ''' <summary>
    ''' Fetches the current token without advancing the stream position.
    ''' </summary>
    ''' <returns>The current token.</returns>
    Public ReadOnly Property Peek() As Token
        Get
            ' Lazily fill the lookahead slot the first time it is needed.
            If (Me._token Is Nothing OrElse Me._token.Type = TokenType.None) Then
                Me._token = Me.ReadToken()
            End If
            Return Me._token
        End Get
    End Property ' Peek

    ''' <summary>
    ''' Constructs a scanner for the specified source unit.
    ''' </summary>
    ''' <param name="errors">Sink that receives the errors reported while scanning.</param>
    ''' <param name="source">Represents the source code to be processed.</param>
    ''' <exception cref="ArgumentNullException"><paramref name="source"/> is Nothing.</exception>
    Public Sub New(ByVal errors As ErrorSink, ByVal source As SourceUnit)
        ' Validate before touching any state; report the parameter name, not the type name.
        If (source Is Nothing) Then Throw New ArgumentNullException("source")
        Me._errors = errors
        Me._source = source
        ' multiEolns - Whether to allow multiple forms of EOLN. If false only '\n' is treated
        ' as a line separator, otherwise '\n', '\r\n' and '\r' are treated as separators.
        Me._buffer = New TokenizerBuffer(Me._source.GetReader(), SourceLocation.MinValue, 1024, True)
    End Sub ' New

    ''' <summary>
    ''' Reads the next token available in the stream and advances past it.
    ''' </summary>
    ''' <returns>Next token available in the stream.</returns>
    Public Function Read() As Token
        ' Going through Peek guarantees the lookahead slot is populated, so a Read
        ' issued before any Peek no longer dereferences an unset token.
        Dim current As Token = Me.Peek
        Me._token = Me.ReadToken()
        Return current
    End Function ' Read

    ''' <summary>
    ''' Builds the span of the token currently delimited in the buffer.
    ''' The token end must have been marked before this is called.
    ''' </summary>
    ''' <returns>Span from the buffer's token start to its token end.</returns>
    Private Function CurrentSpan() As SourceSpan
        Return New SourceSpan(Me._buffer.TokenStart, Me._buffer.TokenEnd)
    End Function ' CurrentSpan

    ''' <summary>
    ''' Maps a single punctuation character to its token type.
    ''' </summary>
    ''' <param name="ch">Character to classify.</param>
    ''' <returns>The matching token type, or TokenType.None when the character is not a known punctuator.</returns>
    Private Shared Function PunctuatorType(ByVal ch As Char) As TokenType
        Select Case ch
            Case "="c
                Return TokenType.Equals
            Case "+"c
                Return TokenType.Plus
            Case "*"c
                Return TokenType.Star
            Case "/"c
                Return TokenType.ForwardSlash
            Case "^"c
                Return TokenType.Caret
            Case "("c
                Return TokenType.LeftParenthesis
            Case ")"c
                Return TokenType.RightParenthesis
            Case "{"c
                Return TokenType.LeftCurlyBrace
            Case "}"c
                Return TokenType.RightCurlyBrace
            Case Else
                Return TokenType.None
        End Select
    End Function ' PunctuatorType

    ''' <summary>
    ''' Reads the next available token in the stream.
    ''' </summary>
    ''' <returns>Next available token.</returns>
    Private Function ReadToken() As Token
        Dim result As Token = Nothing

        ' Discard any white space.
        While (Me._buffer.Peek() <> -1 AndAlso Char.IsWhiteSpace(ChrW(Me._buffer.Peek())))
            Me._buffer.Read()
            ' Buffer can drop current token.
            Me._buffer.DiscardToken()
        End While

        ' Has the end of the buffer been reached?
        If (Me._buffer.Peek() = -1) Then
            Me._buffer.MarkTokenEnd(True)
            result = New EndOfStreamToken(Me.CurrentSpan())
            ' Buffer can drop current token.
            Me._buffer.DiscardToken()
            Return result
        End If

        ' Dispatch on the first character available in the buffer.
        Dim first As Char = ChrW(Me._buffer.Peek())
        If (Char.IsLetter(first) OrElse first = "_"c) Then
            Return Me.ScanKeywordOrIdentifier()
        End If
        If (Char.IsDigit(first)) Then
            Return Me.ScanNumericLiteral()
        End If
        If (first = """"c) Then
            Return Me.ScanStringLiteral()
        End If

        ' Everything below is a one-character token, so the character is consumed
        ' unconditionally. Consuming it on the error path as well is what lets the
        ' scanner move past an invalid character instead of reporting it forever.
        Me._buffer.Read()
        Me._buffer.MarkTokenEnd(False)

        Dim punctuator As TokenType = PunctuatorType(first)
        If (punctuator <> TokenType.None) Then
            ' Single-character punctuation.
            result = New PunctuatorToken(Me.CurrentSpan(), punctuator)
        Else
            ' An invalid character has been discovered.
            result = New ErrorToken(SyntaxErrorType.InvalidCharacter, Me.CurrentSpan())
        End If

        ' Buffer can drop current token.
        Me._buffer.DiscardToken()
        Return result
    End Function ' ReadToken

    ''' <summary>
    ''' Scans in all the consecutive digits at the current buffer position.
    ''' </summary>
    ''' <param name="acc">Receives the digits read. When no digit is available an error
    ''' is reported and "0" is appended so the caller still gets a parsable value.</param>
    Private Sub ScanDigit(ByRef acc As Text.StringBuilder)
        Dim code As Integer = Me._buffer.Peek()

        If (code = -1 OrElse Not Char.IsDigit(ChrW(code))) Then
            ' Mark the end first so the reported span belongs to the current token.
            Me._buffer.MarkTokenEnd(False)
            Me._errors.Add(Me._source, "Expected digits", Me.CurrentSpan(), 0, Severity.FatalError)
            ' Invalid numeric constant.
            acc.Append("0")
            Return
        End If

        While (code <> -1 AndAlso Char.IsDigit(ChrW(code)))
            ' ChrW (not Chr): the buffer yields UTF-16 code units, and Char.IsDigit
            ' accepts digits outside the ANSI range that Chr cannot convert.
            acc.Append(ChrW(Me._buffer.Read()))
            code = Me._buffer.Peek()
        End While
    End Sub ' ScanDigit

    ''' <summary>
    ''' Consumes the run of letters, digits and underscores at the current buffer
    ''' position. Does not mark or discard the token; the caller does that.
    ''' </summary>
    ''' <returns>The characters consumed.</returns>
    Private Function ReadIdentifierText() As String
        Dim acc As New Text.StringBuilder()
        Dim code As Integer = Me._buffer.Peek()

        While (code <> -1 AndAlso (Char.IsLetterOrDigit(ChrW(code)) OrElse ChrW(code) = "_"c))
            acc.Append(ChrW(Me._buffer.Read()))
            code = Me._buffer.Peek()
        End While

        Return acc.ToString()
    End Function ' ReadIdentifierText

    ''' <summary>
    ''' Scans in the identifier.
    ''' </summary>
    ''' <returns>Token associated with the identifier, always typed TokenType.Identifier.</returns>
    Private Function ScanIdentifier() As Token
        Dim name As String = Me.ReadIdentifierText()
        Me._buffer.MarkTokenEnd(False)
        Dim result As Token = New IdentifierToken(name, Me.CurrentSpan(), TokenType.Identifier)
        ' Buffer can drop current token.
        Me._buffer.DiscardToken()
        Return result
    End Function ' ScanIdentifier

    ''' <summary>
    ''' Identifies the keyword or identifier.
    ''' </summary>
    ''' <returns>Token associated with the given keyword or identifier; its type is
    ''' whatever IdentifierToken.TokenTypeFromString returns for the scanned text.</returns>
    Private Function ScanKeywordOrIdentifier() As Token
        ' The text is consumed and the token end marked exactly once, so the span
        ' is taken while the identifier is still the buffer's current token.
        Dim name As String = Me.ReadIdentifierText()
        Me._buffer.MarkTokenEnd(False)
        Dim result As Token = New IdentifierToken(name, Me.CurrentSpan(), IdentifierToken.TokenTypeFromString(name))
        ' Buffer can drop current token.
        Me._buffer.DiscardToken()
        Return result
    End Function ' ScanKeywordOrIdentifier

    ''' <summary>
    ''' Scans in a numeric literal.
    ''' </summary>
    ''' <returns>Token associated with the numeric literal. When the digits cannot be
    ''' represented as an Integer an error is reported and the token carries 0.</returns>
    Private Function ScanNumericLiteral() As Token
        Dim digits As New Text.StringBuilder()
        Me.ScanDigit(digits)
        Me._buffer.MarkTokenEnd(False)

        Dim literalSpan As SourceSpan = Me.CurrentSpan()
        Dim value As Integer = 0
        ' TryParse instead of Parse: an over-long digit run must become a reported
        ' error, not an OverflowException escaping from the scanner.
        If (Not Integer.TryParse(digits.ToString(), value)) Then
            Me._errors.Add(Me._source, "Invalid integer literal", literalSpan, 0, Severity.FatalError)
            value = 0
        End If

        Dim result As Token = New IntegerLiteralToken(value, literalSpan)
        ' Buffer can drop current token.
        Me._buffer.DiscardToken()
        Return result
    End Function ' ScanNumericLiteral

    ''' <summary>
    ''' Scans in a string literal delimited by double quotes.
    ''' </summary>
    ''' <returns>A StringLiteralToken holding the text between the quotes, or an
    ''' ErrorToken (after reporting an error) when the stream ends before the closing quote.</returns>
    Private Function ScanStringLiteral() As Token
        Dim body As New Text.StringBuilder()
        Dim result As Token = Nothing

        ' Discard the initial quote.
        Me._buffer.Read()

        ' Read until the terminating quote or the end of the buffer.
        Dim code As Integer = Me._buffer.Peek()
        While (code <> -1 AndAlso ChrW(code) <> """"c)
            body.Append(ChrW(Me._buffer.Read()))
            code = Me._buffer.Peek()
        End While

        If (code = -1) Then
            ' The body is read verbatim and may contain line breaks, hence the
            ' multi-line mark. It is done before the span is built so the error
            ' covers the text that was actually consumed.
            ' NOTE(review): assumes MarkTokenEnd(True) selects multi-line end tracking - confirm against the DLR version in use.
            Me._buffer.MarkTokenEnd(True)
            Dim errorSpan As SourceSpan = Me.CurrentSpan()
            Me._errors.Add(Me._source, "Unterminated string literal", errorSpan, 0, Severity.FatalError)
            result = New ErrorToken(SyntaxErrorType.InvalidStringLiteral, errorSpan)
            ' Buffer can drop current token.
            Me._buffer.DiscardToken()
            Return result
        End If

        ' Discard the terminating quote.
        Me._buffer.Read()
        Me._buffer.MarkTokenEnd(True)
        result = New StringLiteralToken(body.ToString(), Me.CurrentSpan())
        ' Buffer can drop current token.
        Me._buffer.DiscardToken()
        Return result
    End Function ' ScanStringLiteral

End Class ' Scanner