Parsing updates for unicode

- Allow unicode characters in indents and operators - Show lexing errors
2024-11-02 10:22:04 -07:00
parent d41558c219
commit 6164893da5
13 changed files with 152 additions and 101 deletions
--- a/src/Lib/Tokenizer.idr
+++ b/src/Lib/Tokenizer.idr
@@ -3,10 +3,12 @@ module Lib.Tokenizer
 import Text.Lexer
 import Text.Lexer.Tokenizer
 import Lib.Token
+import Lib.Common

 keywords : List String
 keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
-            "ptype", "pfunc", "module", "infixl", "infixr", "infix"]
+            "ptype", "pfunc", "module", "infixl", "infixr", "infix",
+            "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_"]

 specialOps : List String
 specialOps = ["->", ":", "=>", ":=", "=", "<-"]
@@ -17,18 +19,12 @@ checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s
 checkUKW : String -> Token Kind
 checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s

-checkOp : String -> Token Kind
-checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s
-
-isOpChar : Char -> Bool
-isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")
-
-opChar : Lexer
-opChar = pred isOpChar
-
 identMore : Lexer
 identMore = alphaNum <|> exact "." <|> exact "'" <|> exact "_"

+singleton : Lexer
+singleton = oneOf "()\\{}[],"
+
 quo : Recognise True
 quo = is '"'

@@ -52,25 +48,34 @@ opMiddle = pred (\c => not (isSpace c || c == '_'))

 rawTokens : Tokenizer (Token Kind)
 rawTokens
-   =  match (lower <+> many identMore) checkKW
-  <|> match (upper <+> many identMore) checkUKW
-  <|> match (some digit) (Tok Number)
-  <|> match (is '#' <+> many alpha) (Tok Pragma)
-  <|> match charLit (Tok Character)
+  = match spaces (Tok Space)
+  -- { is singleton except for {{
+  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
+  -- need to make this an ident
+  <|> match (exact ",") (checkKW)
+  -- for now, our lambda slash is singleton
+  <|> match (singleton) (Tok Symbol)
+  -- TODO Drop MixFix token type when we support if_then_else_
  <|> match (exact "_" <+> (some opMiddle) <+> exact "_") (Tok MixFix)
-  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
+  -- REVIEW - expect non-alpha after?
+  <|> match (some digit) (Tok Number)
+  -- for module names and maybe type constructors
+  <|> match (charLit) (Tok Character)
+  <|> match (is '#' <+> many alpha) (Tok Pragma)
  <|> match (lineComment (exact "--")) (Tok Space)
  <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
-  <|> match (exact ",") (Tok Oper)
-  <|> match (some opChar) checkOp
-  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
-  <|> match symbol (Tok Symbol)
-  <|> match spaces (Tok Space)
+  <|> match (upper <+> many identMore) checkUKW
+  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
+  -- accept almost everything, but
+  <|> match (some (non (space <|> singleton))) checkKW

 notSpace : WithBounds (Token Kind) -> Bool
 notSpace (MkBounded (Tok Space _) _ _) = False
 notSpace _ = True

 export
-tokenise : String -> List BTok
-tokenise = filter notSpace . fst . lex rawTokens
+tokenise : String -> Either Error (List BTok)
+tokenise s = case lex rawTokens s of
+  (toks, EndInput, l, c, what) => Right (filter notSpace toks)
+  (toks, reason, l, c, what) => Left (E (l,c) "\{show reason}")
+