newt/src/Lib/Tokenizer.idr

module Lib.Tokenizer

import Text.Lexer
import Text.Lexer.Tokenizer
import Lib.Token

keywords : List String
keywords = ["let", "in", "where", "case", "of", "data", "U",
            "ptype", "pfunc", "module", "infixl", "infixr", "infix"]

specialOps : List String
specialOps = ["->", ":", "=>", ":=", "="]

checkKW : String -> Token Kind
checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s

checkUKW : String -> Token Kind
checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s

checkOp : String -> Token Kind
checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s

isOpChar : Char -> Bool
isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")

opChar : Lexer
opChar = pred isOpChar

identMore : Lexer
identMore = alphaNum <|> exact "." <|> exact "'"

quo : Recognise True
quo = is '"'

esc : Recognise True -> Recognise True
esc l = is '\\' <+> l

-- REVIEW maybe we can do this faster with the view thinger
unquote : String -> String
unquote str = case unpack str of
  ('"' :: xs) => pack $ go xs
  imp => pack $ go imp
  where
    go : List Char -> List Char
    go [] = []
    go ['"'] = []
    go ('\\' :: (x :: xs)) = x :: go xs
    go (x :: xs) = x :: go xs

rawTokens : Tokenizer (Token Kind)
rawTokens
   =  match (lower <+> many identMore) checkKW
  <|> match (upper <+> many identMore) checkUKW
  <|> match (some digit) (Tok Number)
  <|> match (is '#' <+> many alpha) (Tok Pragma)
  <|> match (exact "_" <+> (some opChar <|> exact ",") <+> exact "_") (Tok MixFix)
  <|> match (quo <+> manyUntil quo ((esc any <+> any) <|> any) <+> opt quo) (Tok StringKind . unquote)
  <|> match (lineComment (exact "--")) (Tok Space)
  <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
  <|> match (exact ",") (Tok Oper)
  <|> match (some opChar) checkOp
  <|> match symbol (Tok Symbol)
  <|> match spaces (Tok Space)

notSpace : WithBounds (Token Kind) -> Bool
notSpace (MkBounded (Tok Space _) _ _) = False
notSpace _ = True

export
tokenise : String -> List BTok
tokenise = filter notSpace . fst . lex rawTokens