Parsing updates for unicode

- Allow unicode characters in indents and operators - Show lexing errors
2024-11-02 10:22:04 -07:00
parent d41558c219
commit 6164893da5
13 changed files with 152 additions and 101 deletions
--- a/src/Lib/Common.idr
+++ b/src/Lib/Common.idr
@@ -0,0 +1,52 @@
+module Lib.Common
+
+import Data.String
+
+-- I was going to use a record, but we're peeling this off of bounds at the moment.
+public export
+FC : Type
+FC = (Int,Int)
+
+public export
+interface HasFC a where
+  getFC : a -> FC
+
+%name FC fc
+
+export
+emptyFC : FC
+emptyFC = (0,0)
+
+-- Error of a parse
+public export
+data Error = E FC String
+%name Error err
+
+public export
+showError : String -> Error -> String
+showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src)
+  where
+    go : Int -> List String -> String
+    go l [] = ""
+    go l (x :: xs) =
+      if l == line then
+        "  \{x}\n  \{replicate (cast col) ' '}^\n"
+      else if line - 3 < l then "  " ++ x ++ "\n" ++ go (l + 1) xs
+      else go (l + 1) xs
+
+public export
+data Fixity = InfixL | InfixR | Infix
+
+export
+Show Fixity where
+  show InfixL = "infixl"
+  show InfixR = "infixr"
+  show Infix = "infix"
+
+public export
+record OpDef where
+  constructor MkOp
+  name : String
+  prec : Int
+  fix : Fixity
+
--- a/src/Lib/Parser.idr
+++ b/src/Lib/Parser.idr
@@ -95,7 +95,6 @@ pArg = do
  (Explicit,fc,) <$> atom
    <|> (Implicit,fc,) <$> braces typeExpr
    <|> (Auto,fc,) <$> dbraces typeExpr
-    <|> (Explicit,fc,) . RVar fc <$> token Oper

 AppSpine = List (Icit,FC,Raw)

@@ -203,13 +202,6 @@ caseExpr = do
  alts <- startBlock $ someSame $ caseAlt
  pure $ RCase fc sc alts

-doArrow : Parser DoStmt
-doArrow = do
-  fc <- getPos
-  name <- try $ ident <* keyword "<-"
-  expr <- term
-  pure $ DoArrow fc name expr
-
 doStmt : Parser DoStmt
 doStmt
  = DoArrow <$> getPos <*> (try $ ident <* keyword "<-") <*> term
@@ -232,9 +224,8 @@ varname = (ident <|> uident <|> keyword "_" *> pure "_")

 ebind : Parser (List (FC, String, Icit, Raw))
 ebind = do
-  sym "("
-  names <- some $ withPos varname
-  sym ":"
+  -- don't commit until we see the ":"
+  names <- try (sym "(" *> some (withPos varname) <* sym ":")
  ty <- typeExpr
  sym ")"
  pure $ map (\(pos, name) => (pos, name, Explicit, ty)) names
@@ -262,7 +253,7 @@ arrow = sym "->" <|> sym "→"
 -- Collect a bunch of binders (A : U) {y : A} -> ...
 binders : Parser Raw
 binders = do
-  binds <- many (abind <|> ibind <|> try ebind)
+  binds <- many (abind <|> ibind <|> ebind)
  arrow
  scope <- typeExpr
  pure $ foldr (uncurry mkBind) scope (join binds)
@@ -286,7 +277,7 @@ typeExpr = binders

 export
 parseSig : Parser Decl
-parseSig = TypeSig <$> getPos <*> some (ident <|> uident) <* keyword ":" <*> typeExpr
+parseSig = TypeSig <$> getPos <*> try (some (ident <|> uident) <* keyword ":") <*> typeExpr

 parseImport : Parser Import
 parseImport = MkImport <$> getPos <* keyword "import" <*> uident
@@ -364,7 +355,7 @@ parseNorm = DCheck <$> getPos <* keyword "#check" <*> typeExpr <* keyword ":" <*

 export
 parseDecl : Parser Decl
-parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> (try $ parseSig) <|> parseDef
+parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> parseSig <|> parseDef


 export
--- a/src/Lib/Parser/Impl.idr
+++ b/src/Lib/Parser/Impl.idr
@@ -1,6 +1,7 @@
 module Lib.Parser.Impl

 import Lib.Token
+import Lib.Common
 import Data.String
 import Data.Nat

@@ -8,54 +9,6 @@ public export
 TokenList : Type
 TokenList = List BTok

-public export
-data Fixity = InfixL | InfixR | Infix
-
-export
-Show Fixity where
-  show InfixL = "infixl"
-  show InfixR = "infixr"
-  show Infix = "infix"
-
-- I was going to use a record, but we're peeling this off of bounds at the moment.
-public export
-FC : Type
-FC = (Int,Int)
-
-public export
-interface HasFC a where
-  getFC : a -> FC
-
-%name FC fc
-
-export
-emptyFC : FC
-emptyFC = (0,0)
-
-- Error of a parse
-public export
-data Error = E FC String
-%name Error err
-
-public export
-showError : String -> Error -> String
-showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src)
-  where
-    go : Int -> List String -> String
-    go l [] = ""
-    go l (x :: xs) =
-      if l == line then
-        "  \{x}\n  \{replicate (cast col) ' '}^\n"
-      else if line - 3 < l then "  " ++ x ++ "\n" ++ go (l + 1) xs
-      else go (l + 1) xs
-
-public export
-record OpDef where
-  constructor MkOp
-  name : String
-  prec : Int
-  fix : Fixity
-
 -- Result of a parse
 public export
 data Result : Type -> Type where
--- a/src/Lib/Token.idr
+++ b/src/Lib/Token.idr
@@ -9,7 +9,6 @@ data Kind
  = Ident
  | UIdent
  | Keyword
-  | Oper
  | MixFix
  | Number
  | Character
@@ -29,7 +28,6 @@ Show Kind where
  show Ident   = "Ident"
  show UIdent   = "UIdent"
  show Keyword = "Keyword"
-  show Oper    = "Oper"
  show MixFix = "MixFix"
  show Number  = "Number"
  show Character  = "Character"
@@ -47,7 +45,6 @@ Eq Kind where
  Ident   == Ident = True
  UIdent  == UIdent = True
  Keyword == Keyword = True
-  Oper    == Oper = True
  MixFix  == MixFix = True
  Number  == Number = True
  Character == Character = True
--- a/src/Lib/Tokenizer.idr
+++ b/src/Lib/Tokenizer.idr
@@ -3,10 +3,12 @@ module Lib.Tokenizer
 import Text.Lexer
 import Text.Lexer.Tokenizer
 import Lib.Token
+import Lib.Common

 keywords : List String
 keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
-            "ptype", "pfunc", "module", "infixl", "infixr", "infix"]
+            "ptype", "pfunc", "module", "infixl", "infixr", "infix",
+            "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_"]

 specialOps : List String
 specialOps = ["->", ":", "=>", ":=", "=", "<-"]
@@ -17,18 +19,12 @@ checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s
 checkUKW : String -> Token Kind
 checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s

-checkOp : String -> Token Kind
-checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s
-
-isOpChar : Char -> Bool
-isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")
-
-opChar : Lexer
-opChar = pred isOpChar
-
 identMore : Lexer
 identMore = alphaNum <|> exact "." <|> exact "'" <|> exact "_"

+singleton : Lexer
+singleton = oneOf "()\\{}[],"
+
 quo : Recognise True
 quo = is '"'

@@ -52,25 +48,34 @@ opMiddle = pred (\c => not (isSpace c || c == '_'))

 rawTokens : Tokenizer (Token Kind)
 rawTokens
-   =  match (lower <+> many identMore) checkKW
-  <|> match (upper <+> many identMore) checkUKW
-  <|> match (some digit) (Tok Number)
-  <|> match (is '#' <+> many alpha) (Tok Pragma)
-  <|> match charLit (Tok Character)
+  = match spaces (Tok Space)
+  -- { is singleton except for {{
+  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
+  -- need to make this an ident
+  <|> match (exact ",") (checkKW)
+  -- for now, our lambda slash is singleton
+  <|> match (singleton) (Tok Symbol)
+  -- TODO Drop MixFix token type when we support if_then_else_
  <|> match (exact "_" <+> (some opMiddle) <+> exact "_") (Tok MixFix)
-  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
+  -- REVIEW - expect non-alpha after?
+  <|> match (some digit) (Tok Number)
+  -- for module names and maybe type constructors
+  <|> match (charLit) (Tok Character)
+  <|> match (is '#' <+> many alpha) (Tok Pragma)
  <|> match (lineComment (exact "--")) (Tok Space)
  <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
-  <|> match (exact ",") (Tok Oper)
-  <|> match (some opChar) checkOp
-  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
-  <|> match symbol (Tok Symbol)
-  <|> match spaces (Tok Space)
+  <|> match (upper <+> many identMore) checkUKW
+  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
+  -- accept almost everything, but
+  <|> match (some (non (space <|> singleton))) checkKW

 notSpace : WithBounds (Token Kind) -> Bool
 notSpace (MkBounded (Tok Space _) _ _) = False
 notSpace _ = True

 export
-tokenise : String -> List BTok
-tokenise = filter notSpace . fst . lex rawTokens
+tokenise : String -> Either Error (List BTok)
+tokenise s = case lex rawTokens s of
+  (toks, EndInput, l, c, what) => Right (filter notSpace toks)
+  (toks, reason, l, c, what) => Left (E (l,c) "\{show reason}")
+
--- a/src/Lib/Types.idr
+++ b/src/Lib/Types.idr
@@ -1,7 +1,7 @@
 module Lib.Types

 -- For FC, Error
-import public Lib.Parser.Impl
+import public Lib.Common
 import Lib.Prettier

 import public Control.Monad.Error.Either
@@ -433,7 +433,7 @@ names ctx = toList $ map fst ctx.types

 public export
 M : Type -> Type
-M = (StateT TopContext (EitherT Impl.Error IO))
+M = (StateT TopContext (EitherT Error IO))

 ||| Force argument and print if verbose is true
 export