This lexer is handwritten for efficiency. I used an elegant buffering scheme which I have not seen anywhere else: We guarantee that a whole line is in the buffer. Thus only when scanning the \n or \r character do we have to check whether we need to read in the next chunk. (\n or \r already need special handling for incrementing the line counter; choosing both \n and \r allows the lexer to properly read Unix, DOS or Macintosh text files, even when it is not the native format.)


Lexer = object of TBaseLexer
  fileIdx*: FileIndex
  indentAhead*: int          ## if > 0 an indentation has already been read
                             ## this is needed because scanning comments
                             ## needs so much look-ahead
  currLineIndent*: int
  strongSpaces*, allowTabs*: bool
  cache*: IdentCache
  when defined(nimsuggest):
      previousToken: TLineInfo

  config*: ConfigRef
  diags: seq[LexerDiag]
LexerDiag = object
  msg*: string
  location*: TLineInfo       ## diagnostic location
  instLoc*: InstantiationInfo ## instantiation in lexer's source
  case kind*: LexerDiagKind
  of lexDiagNameXShouldBeY:
      got*: string


Diagnostic data from the Lexer, mostly errors
LexerDiagKind = enum
  lexDiagMalformedNumUnderscores, lexDiagMalformedIdentUnderscores,
  lexDiagMalformedTrailingUnderscre, lexDiagInvalidToken,
  lexDiagInvalidTokenSpaceBetweenNumAndIdent, lexDiagNoTabs,
  lexDiagInvalidIntegerLiteralOctalPrefix, lexDiagInvalidIntegerSuffix,
  lexDiagNumberNotInRange, lexDiagExpectedHex, lexDiagInvalidIntegerLiteral,
  lexDiagInvalidNumericLiteral, lexDiagInvalidCharLiteral,
  lexDiagInvalidCharLiteralConstant, lexDiagInvalidCharLiteralPlatformNewline,
  lexDiagInvalidCharLiteralUnicodeCodepoint, lexDiagMissingClosingApostrophe,
  lexDiagUnclosedTripleString, lexDiagUnclosedSingleString,
  lexDiagUnclosedComment, lexDiagDeprecatedOctalPrefix = "OctalEscape",
  lexDiagLineTooLong = "LineTooLong", lexDiagNameXShouldBeY = "Name"
Token = object
  tokType*: TokType          ## the type of the token
  indent*: int               ## the indentation; != -1 if the token has been
                             ## preceded with indentation
  ident*: PIdent             ## the parsed identifier
  iNumber*: BiggestInt       ## the parsed integer literal
  fNumber*: BiggestFloat     ## the parsed floating point literal
  base*: NumericalBase       ## the numerical base; only valid for int
                             ## or float literals
  strongSpaceA*: int8        ## leading spaces of an operator
  strongSpaceB*: int8        ## trailing spaces of an operator
  literal*: string           ## the parsed (string) literal; and
                             ## documentation comments are here too
  line*, col*: int
  error*: LexerDiag          ## error diagnostic if `tokType` is `tkError`
a Nim token
TokType = enum
  tkInvalid = "tkInvalid", tkError = "tkError", tkEof = "[EOF]",
  tkSymbol = "tkSymbol", tkAddr = "addr", tkAnd = "and", tkAs = "as",
  tkAsm = "asm", tkBind = "bind", tkBlock = "block", tkBreak = "break",
  tkCase = "case", tkCast = "cast", tkConcept = "concept", tkConst = "const",
  tkContinue = "continue", tkConverter = "converter", tkDefer = "defer",
  tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do",
  tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum",
  tkExcept = "except", tkExport = "export", tkFinally = "finally",
  tkFor = "for", tkFrom = "from", tkFunc = "func", tkIf = "if",
  tkImport = "import", tkIn = "in", tkInclude = "include",
  tkInterface = "interface", tkIs = "is", tkIsnot = "isnot",
  tkIterator = "iterator", tkLet = "let", tkMacro = "macro",
  tkMethod = "method", tkMixin = "mixin", tkMod = "mod", tkNil = "nil",
  tkNot = "not", tkNotin = "notin", tkObject = "object", tkOf = "of",
  tkOr = "or", tkOut = "out", tkProc = "proc", tkPtr = "ptr", tkRaise = "raise",
  tkRef = "ref", tkReturn = "return", tkShl = "shl", tkShr = "shr",
  tkStatic = "static", tkTemplate = "template", tkTry = "try",
  tkTuple = "tuple", tkType = "type", tkUsing = "using", tkVar = "var",
  tkWhen = "when", tkWhile = "while", tkXor = "xor", tkYield = "yield",
  tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit",
  tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit", tkUIntLit = "tkUIntLit",
  tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit",
  tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit",
  tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
  tkFloat64Lit = "tkFloat64Lit", tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit",
  tkTripleStrLit = "tkTripleStrLit", tkGStrLit = "tkGStrLit",
  tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
  tkCustomLit = "tkCustomLit", tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
  tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}", tkBracketDotLe = "[.",
  tkBracketDotRi = ".]", tkCurlyDotLe = "{.", tkCurlyDotRi = ".}",
  tkParDotLe = "(.", tkParDotRi = ".)", tkComma = ",", tkSemiColon = ";",
  tkColon = ":", tkColonColon = "::", tkEquals = "=", tkDot = ".",
  tkDotDot = "..", tkBracketLeColon = "[:", tkOpr, tkComment, tkAccent = "`",
  tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr, tkHideableStart,
TokTypes = set[TokType]
LexDiagsError = {lexDiagMalformedNumUnderscores..lexDiagUnclosedComment}
LexDiagsHint = {lexDiagLineTooLong..lexDiagNameXShouldBeY}
LexDiagsWarning = {lexDiagDeprecatedOctalPrefix}
MaxLineLength = 80
numChars: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'}
OpChars: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
                      '|', '=', '%', '&', '$', '@', '~', ':'}
SymChars: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
SymStartChars: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
tkKeywords = {tkAddr..tkYield}
tokKeywordHigh = tkYield
tokKeywordLow = tkAddr
proc `$`(tok: Token): string {....raises: [], tags: [].}
proc closeLexer(lex: var Lexer) {....raises: [], tags: [].}
func diagOffset(L: Lexer): int {.inline, ...raises: [], tags: [].}
The return value represents a point in time at which all existing diagnostics are considered to be in the past; it is used in conjunction with errorsHintsAndWarnings.
func diagToHumanStr(d: LexerDiag): string {....raises: [ValueError], tags: [].}
Creates a human-readable string message for a diagnostic; it does not include any extra information such as line info, severity, and so on.
proc getLineInfo(L: Lexer): TLineInfo {....raises: [], tags: [].}
proc getLineInfo(L: Lexer; tok: Token): TLineInfo {.inline, ...raises: [], tags: [].}
proc getPrecedence(ident: PIdent): int {....raises: [], tags: [].}
Assumes ident is already a binary operator.
proc getPrecedence(tok: Token): int {....raises: [], tags: [].}
Calculates the precedence of the given token.
proc initToken(L: var Token) {....raises: [], tags: [].}
func isKeyword(i: PIdent): bool {....raises: [], tags: [].}
Is this identifier a keyword?
func isKeyword(kind: TokType): bool {....raises: [], tags: [].}
proc isNimIdentifier(s: string): bool {....raises: [], tags: [].}
proc openLexer(lex: var Lexer; fileIdx: FileIndex; inputstream: PLLStream;
               cache: IdentCache; config: ConfigRef) {.
    ...raises: [IOError, Exception], tags: [ReadIOEffect, RootEffect].}
proc openLexer(lex: var Lexer; filename: AbsoluteFile; inputstream: PLLStream;
               cache: IdentCache; config: ConfigRef) {.
    ...raises: [IOError, Exception, KeyError],
    tags: [ReadIOEffect, RootEffect, ReadDirEffect].}
proc prettyTok(tok: Token): string {....raises: [], tags: [].}
proc printTok(conf: ConfigRef; tok: Token) {....raises: [Exception],
    tags: [RootEffect].}
proc rawGetTok(L: var Lexer; tok: var Token) {....raises: [IOError, Exception],
    tags: [ReadIOEffect, RootEffect].}
iterator errorsHintsAndWarnings(L: Lexer; diagOffset = 0): LexerDiag {.
    ...raises: [], tags: [].}
Iterates over all diagnostics from the beginning, or from the point in time specified via diagOffset.