Files
newt/src/Lib/Tokenizer.idr
2024-09-14 19:46:15 -07:00

72 lines
2.0 KiB
Idris

module Lib.Tokenizer
import Text.Lexer
import Text.Lexer.Tokenizer
import Lib.Token
keywords : List String
keywords = ["let", "in", "where", "case", "of", "data", "U",
"ptype", "pfunc", "module", "infixl", "infixr", "infix"]
specialOps : List String
specialOps = ["->", ":", "=>", ":=", "="]
checkKW : String -> Token Kind
checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s
checkUKW : String -> Token Kind
checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s
checkOp : String -> Token Kind
checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s
isOpChar : Char -> Bool
isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")
opChar : Lexer
opChar = pred isOpChar
identMore : Lexer
identMore = alphaNum <|> exact "." <|> exact "'"
quo : Recognise True
quo = is '"'
esc : Recognise True -> Recognise True
esc l = is '\\' <+> l
-- REVIEW maybe we can do this faster with the view thinger
unquote : String -> String
unquote str = case unpack str of
('"' :: xs) => pack $ go xs
imp => pack $ go imp
where
go : List Char -> List Char
go [] = []
go ['"'] = []
go ('\\' :: (x :: xs)) = x :: go xs
go (x :: xs) = x :: go xs
rawTokens : Tokenizer (Token Kind)
rawTokens
= match (lower <+> many identMore) checkKW
<|> match (upper <+> many identMore) checkUKW
<|> match (some digit) (Tok Number)
<|> match (is '#' <+> many alpha) (Tok Pragma)
<|> match (exact "_" <+> (some opChar <|> exact ",") <+> exact "_") (Tok MixFix)
<|> match (quo <+> manyUntil quo ((esc any <+> any) <|> any) <+> opt quo) (Tok StringKind . unquote)
<|> match (lineComment (exact "--")) (Tok Space)
<|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
<|> match (exact ",") (Tok Oper)
<|> match (some opChar) checkOp
<|> match symbol (Tok Symbol)
<|> match spaces (Tok Space)
notSpace : WithBounds (Token Kind) -> Bool
notSpace (MkBounded (Tok Space _) _ _) = False
notSpace _ = True
export
tokenise : String -> List BTok
tokenise = filter notSpace . fst . lex rawTokens