Parsing updates for unicode

- Allow unicode characters in indents and operators
- Show lexing errors
This commit is contained in:
2024-11-02 10:22:04 -07:00
parent d41558c219
commit 6164893da5
13 changed files with 152 additions and 101 deletions

View File

@@ -3,10 +3,12 @@ module Lib.Tokenizer
import Text.Lexer
import Text.Lexer.Tokenizer
import Lib.Token
import Lib.Common
keywords : List String
keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
"ptype", "pfunc", "module", "infixl", "infixr", "infix"]
"ptype", "pfunc", "module", "infixl", "infixr", "infix",
"->", "", ":", "=>", ":=", "=", "<-", "\\", "_"]
specialOps : List String
specialOps = ["->", ":", "=>", ":=", "=", "<-"]
@@ -17,18 +19,12 @@ checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s
checkUKW : String -> Token Kind
checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s
checkOp : String -> Token Kind
checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s
isOpChar : Char -> Bool
isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")
opChar : Lexer
opChar = pred isOpChar
identMore : Lexer
identMore = alphaNum <|> exact "." <|> exact "'" <|> exact "_"
singleton : Lexer
singleton = oneOf "()\\{}[],"
quo : Recognise True
quo = is '"'
@@ -52,25 +48,34 @@ opMiddle = pred (\c => not (isSpace c || c == '_'))
rawTokens : Tokenizer (Token Kind)
rawTokens
= match (lower <+> many identMore) checkKW
<|> match (upper <+> many identMore) checkUKW
<|> match (some digit) (Tok Number)
<|> match (is '#' <+> many alpha) (Tok Pragma)
<|> match charLit (Tok Character)
= match spaces (Tok Space)
-- { is singleton except for {{
<|> match (exact "{{" <|> exact "}}") (Tok Keyword)
-- need to make this an ident
<|> match (exact ",") (checkKW)
-- for now, our lambda slash is singleton
<|> match (singleton) (Tok Symbol)
-- TODO Drop MixFix token type when we support if_then_else_
<|> match (exact "_" <+> (some opMiddle) <+> exact "_") (Tok MixFix)
<|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
-- REVIEW - expect non-alpha after?
<|> match (some digit) (Tok Number)
-- for module names and maybe type constructors
<|> match (charLit) (Tok Character)
<|> match (is '#' <+> many alpha) (Tok Pragma)
<|> match (lineComment (exact "--")) (Tok Space)
<|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
<|> match (exact ",") (Tok Oper)
<|> match (some opChar) checkOp
<|> match (exact "{{" <|> exact "}}") (Tok Keyword)
<|> match symbol (Tok Symbol)
<|> match spaces (Tok Space)
<|> match (upper <+> many identMore) checkUKW
<|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
-- accept almost everything, but
<|> match (some (non (space <|> singleton))) checkKW
notSpace : WithBounds (Token Kind) -> Bool
notSpace (MkBounded (Tok Space _) _ _) = False
notSpace _ = True
export
tokenise : String -> List BTok
tokenise = filter notSpace . fst . lex rawTokens
tokenise : String -> Either Error (List BTok)
tokenise s = case lex rawTokens s of
(toks, EndInput, l, c, what) => Right (filter notSpace toks)
(toks, reason, l, c, what) => Left (E (l,c) "\{show reason}")