Parsing updates for unicode

- Allow unicode characters in indents and operators - Show lexing errors
2024-11-02 10:22:04 -07:00
parent d41558c219
commit 6164893da5
13 changed files with 152 additions and 101 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ build/
 *.agdai
 *.js
 input.txt
 node_modules
--- a/TODO.md
+++ b/TODO.md
@@ -3,15 +3,24 @@
 - [ ] Allow unicode operators/names
  - refactored parser to prep for this
- [ ] get rid of stray INFO from auto resolution
+- [ ] Web tool
- [ ] handle if_then_else_j
+  - edit, view output, view js, run js, monaco would be nice.
  - need to shim out Buffer
 - [x] get rid of stray INFO from auto resolution
 - [ ] handle if_then_else_ style mixfix
 - [ ] Search should look at context
 - [ ] records
 - [ ] copattern matching
 - [ ] Support @ on the LHS
 - [x] Remember operators from imports
 - [ ] Default cases for non-primitives (currently gets expanded to all constructors)
  - This may need a little care. But I think I could collect all constructors that only match wildcards into a single case. This would lose any information from the individual, unnamed cases though.
  - There are cases where we have  `_` and then `Foo` on the next line, but they should all get collected into the `Foo` case. I think I sorted all of this out for primitives.
 - [x] Case for primitives
 - [ ] aoc2023 translation
  - [x] day1
  - [x] day2
-  - some "real world" examples -v
+  - some "real world" examples
 - [x] Maybe Eq and stuff would work for typeclass without dealing with unification issues yet
 - [x] unsolved meta errors repeat (need to freeze or only report at end)
 - [x] Sanitize JS idents, e.g. `_+_`
@@ -22,7 +31,6 @@
    leave that implicit for efficiency. I think it would also make printing more readable.
  - When printing `Value`, I now print the spine size instead of spine.
 - [x] eval for case (see order.newt)
 - [ ] dynamic pattern unification (add test case first)
 - [x] switch from commit/mustWork to checking progress
 - [x] type constructors are no longer generated?  And seem to have 0 arity.
 - [x] raw let is not yet implemented (although define used by case tree building)
@@ -60,17 +68,19 @@
 - [ ] magic nat (codegen as number with appropriate pattern matching)
 - [ ] magic tuple? (codegen as array)
 - [ ] magic newtype? (drop them in codegen)
 - [ ] records / copatterns
 - [x] vscode: syntax highlighting for String
 - [ ] add `pop` or variant of `pfunc` that maps to an operator, giving the js operator and precedence on RHS
 ### Parsing
 - [ ] consider allowing σ etc in identifiers
-  - Probably need to merge oper / ident first and sort out mixfix in parsing.
+  - Probably need to merge oper / ident first and sort out mixfix in parsing
  - The mixfix parsing can handle this now, need to update lexing.
 - [ ] Parse error not ideal for `\x y z b=> b` (points to lambda)
 ### Background
 - [ ] Read Ulf Norell thesis
 - [ ] Finish reading dynamic pattern unification paper to see what is missing/wrong with the current implementation
--- a/newt/Combinatory.newt
+++ b/newt/Combinatory.newt
@@ -0,0 +1,40 @@
 module Combinatory
 data Unit : U where
  MkUnit : Unit
 infixr 7 _::_
 data List : U -> U where
  Nil : {A : U} -> List A
  _::_ : {A : U} -> A -> List A -> List A
 -- prj/menagerie/papers/combinatory
 infixr 6 _~>_
 data Type : U where
  ι : Type
  _~>_ : Type -> Type -> Type
 A : U
 A = Unit
 Val : Type -> U
 Val ι = A
 Val (x ~> y) = Val x -> Val y
 Ctx : U
 Ctx = List Type
 data Ref : Type -> Ctx -> U where
  Z : {σ : Type} {Γ : Ctx} -> Ref σ (σ :: Γ)
  S : {σ τ : Type} {Γ : Ctx} -> Ref σ Γ -> Ref σ (τ :: Γ)
 data Term : Ctx -> Type -> U where
  App : {Γ : Ctx} {σ τ : Type} -> Term Γ (σ ~> τ) -> Term Γ σ -> Term Γ τ
  Lam : {Γ : Ctx} {σ τ : Type} -> Term (σ :: Γ) τ -> Term Γ (σ ~> τ)
  Var : {Γ : Ctx} {σ : Type} -> Ref σ Γ → Term Γ σ
 -- FIXME, I'm not getting an error for Nil, but it's shadowing Nil
 data Env : Ctx -> U where
  ENil : Env Nil
  ECons : {Γ : Ctx} {σ : Type} → Val σ → Env Γ → Env (σ :: Γ)
--- a/newt/Prelude.newt
+++ b/newt/Prelude.newt
@@ -17,6 +17,7 @@ data Either : U -> U -> U where
 infixr 0 _$_
 -- Currently very noisy in generated code
 _$_ : {a b : U} -> (a -> b) -> a -> b
 f $ a = f a
--- a/src/Lib/Common.idr
+++ b/src/Lib/Common.idr
@@ -0,0 +1,52 @@
 module Lib.Common
 import Data.String
 -- I was going to use a record, but we're peeling this off of bounds at the moment.
 public export
 FC : Type
 FC = (Int,Int)
 public export
 interface HasFC a where
  getFC : a -> FC
 %name FC fc
 export
 emptyFC : FC
 emptyFC = (0,0)
 -- Error of a parse
 public export
 data Error = E FC String
 %name Error err
 public export
 showError : String -> Error -> String
 showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src)
  where
    go : Int -> List String -> String
    go l [] = ""
    go l (x :: xs) =
      if l == line then
        "  \{x}\n  \{replicate (cast col) ' '}^\n"
      else if line - 3 < l then "  " ++ x ++ "\n" ++ go (l + 1) xs
      else go (l + 1) xs
 public export
 data Fixity = InfixL | InfixR | Infix
 export
 Show Fixity where
  show InfixL = "infixl"
  show InfixR = "infixr"
  show Infix = "infix"
 public export
 record OpDef where
  constructor MkOp
  name : String
  prec : Int
  fix : Fixity
--- a/src/Lib/Parser.idr
+++ b/src/Lib/Parser.idr
@@ -95,7 +95,6 @@ pArg = do
  (Explicit,fc,) <$> atom
    <|> (Implicit,fc,) <$> braces typeExpr
    <|> (Auto,fc,) <$> dbraces typeExpr
    <|> (Explicit,fc,) . RVar fc <$> token Oper
 AppSpine = List (Icit,FC,Raw)
@@ -203,13 +202,6 @@ caseExpr = do
  alts <- startBlock $ someSame $ caseAlt
  pure $ RCase fc sc alts
 doArrow : Parser DoStmt
 doArrow = do
  fc <- getPos
  name <- try $ ident <* keyword "<-"
  expr <- term
  pure $ DoArrow fc name expr
 doStmt : Parser DoStmt
 doStmt
  = DoArrow <$> getPos <*> (try $ ident <* keyword "<-") <*> term
@@ -232,9 +224,8 @@ varname = (ident <|> uident <|> keyword "_" *> pure "_")
 ebind : Parser (List (FC, String, Icit, Raw))
 ebind = do
-  sym "("
+  -- don't commit until we see the ":"
-  names <- some $ withPos varname
+  names <- try (sym "(" *> some (withPos varname) <* sym ":")
  sym ":"
  ty <- typeExpr
  sym ")"
  pure $ map (\(pos, name) => (pos, name, Explicit, ty)) names
@@ -262,7 +253,7 @@ arrow = sym "->" <|> sym "→"
 -- Collect a bunch of binders (A : U) {y : A} -> ...
 binders : Parser Raw
 binders = do
-  binds <- many (abind <|> ibind <|> try ebind)
+  binds <- many (abind <|> ibind <|> ebind)
  arrow
  scope <- typeExpr
  pure $ foldr (uncurry mkBind) scope (join binds)
@@ -286,7 +277,7 @@ typeExpr = binders
 export
 parseSig : Parser Decl
-parseSig = TypeSig <$> getPos <*> some (ident <|> uident) <* keyword ":" <*> typeExpr
+parseSig = TypeSig <$> getPos <*> try (some (ident <|> uident) <* keyword ":") <*> typeExpr
 parseImport : Parser Import
 parseImport = MkImport <$> getPos <* keyword "import" <*> uident
@@ -364,7 +355,7 @@ parseNorm = DCheck <$> getPos <* keyword "#check" <*> typeExpr <* keyword ":" <*
 export
 parseDecl : Parser Decl
-parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> (try $ parseSig) <|> parseDef
+parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> parseSig <|> parseDef
 export
--- a/src/Lib/Parser/Impl.idr
+++ b/src/Lib/Parser/Impl.idr
@@ -1,6 +1,7 @@
 module Lib.Parser.Impl
 import Lib.Token
 import Lib.Common
 import Data.String
 import Data.Nat
@@ -8,54 +9,6 @@ public export
 TokenList : Type
 TokenList = List BTok
 public export
 data Fixity = InfixL | InfixR | Infix
 export
 Show Fixity where
  show InfixL = "infixl"
  show InfixR = "infixr"
  show Infix = "infix"
 -- I was going to use a record, but we're peeling this off of bounds at the moment.
 public export
 FC : Type
 FC = (Int,Int)
 public export
 interface HasFC a where
  getFC : a -> FC
 %name FC fc
 export
 emptyFC : FC
 emptyFC = (0,0)
 -- Error of a parse
 public export
 data Error = E FC String
 %name Error err
 public export
 showError : String -> Error -> String
 showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src)
  where
    go : Int -> List String -> String
    go l [] = ""
    go l (x :: xs) =
      if l == line then
        "  \{x}\n  \{replicate (cast col) ' '}^\n"
      else if line - 3 < l then "  " ++ x ++ "\n" ++ go (l + 1) xs
      else go (l + 1) xs
 public export
 record OpDef where
  constructor MkOp
  name : String
  prec : Int
  fix : Fixity
 -- Result of a parse
 public export
 data Result : Type -> Type where
--- a/src/Lib/Token.idr
+++ b/src/Lib/Token.idr
@@ -9,7 +9,6 @@ data Kind
  = Ident
  | UIdent
  | Keyword
  | Oper
  | MixFix
  | Number
  | Character
@@ -29,7 +28,6 @@ Show Kind where
  show Ident   = "Ident"
  show UIdent   = "UIdent"
  show Keyword = "Keyword"
  show Oper    = "Oper"
  show MixFix = "MixFix"
  show Number  = "Number"
  show Character  = "Character"
@@ -47,7 +45,6 @@ Eq Kind where
  Ident   == Ident = True
  UIdent  == UIdent = True
  Keyword == Keyword = True
  Oper    == Oper = True
  MixFix  == MixFix = True
  Number  == Number = True
  Character == Character = True
--- a/src/Lib/Tokenizer.idr
+++ b/src/Lib/Tokenizer.idr
@@ -3,10 +3,12 @@ module Lib.Tokenizer
 import Text.Lexer
 import Text.Lexer.Tokenizer
 import Lib.Token
 import Lib.Common
 keywords : List String
 keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
-            "ptype", "pfunc", "module", "infixl", "infixr", "infix"]
+            "ptype", "pfunc", "module", "infixl", "infixr", "infix",
            "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_"]
 specialOps : List String
 specialOps = ["->", ":", "=>", ":=", "=", "<-"]
@@ -17,18 +19,12 @@ checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s
 checkUKW : String -> Token Kind
 checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s
 checkOp : String -> Token Kind
 checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s
 isOpChar : Char -> Bool
 isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")
 opChar : Lexer
 opChar = pred isOpChar
 identMore : Lexer
 identMore = alphaNum <|> exact "." <|> exact "'" <|> exact "_"
 singleton : Lexer
 singleton = oneOf "()\\{}[],"
 quo : Recognise True
 quo = is '"'
@@ -52,25 +48,34 @@ opMiddle = pred (\c => not (isSpace c || c == '_'))
 rawTokens : Tokenizer (Token Kind)
 rawTokens
-   =  match (lower <+> many identMore) checkKW
+  = match spaces (Tok Space)
-  <|> match (upper <+> many identMore) checkUKW
+  -- { is singleton except for {{
-  <|> match (some digit) (Tok Number)
+  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
-  <|> match (is '#' <+> many alpha) (Tok Pragma)
+  -- need to make this an ident
-  <|> match charLit (Tok Character)
+  <|> match (exact ",") (checkKW)
  -- for now, our lambda slash is singleton
  <|> match (singleton) (Tok Symbol)
  -- TODO Drop MixFix token type when we support if_then_else_
  <|> match (exact "_" <+> (some opMiddle) <+> exact "_") (Tok MixFix)
-  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
+  -- REVIEW - expect non-alpha after?
  <|> match (some digit) (Tok Number)
  -- for module names and maybe type constructors
  <|> match (charLit) (Tok Character)
  <|> match (is '#' <+> many alpha) (Tok Pragma)
  <|> match (lineComment (exact "--")) (Tok Space)
  <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
-  <|> match (exact ",") (Tok Oper)
+  <|> match (upper <+> many identMore) checkUKW
-  <|> match (some opChar) checkOp
+  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
-  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
+  -- accept almost everything, but
-  <|> match symbol (Tok Symbol)
+  <|> match (some (non (space <|> singleton))) checkKW
  <|> match spaces (Tok Space)
 notSpace : WithBounds (Token Kind) -> Bool
 notSpace (MkBounded (Tok Space _) _ _) = False
 notSpace _ = True
 export
-tokenise : String -> List BTok
+tokenise : String -> Either Error (List BTok)
-tokenise = filter notSpace . fst . lex rawTokens
+tokenise s = case lex rawTokens s of
  (toks, EndInput, l, c, what) => Right (filter notSpace toks)
  (toks, reason, l, c, what) => Left (E (l,c) "\{show reason}")
--- a/src/Lib/Types.idr
+++ b/src/Lib/Types.idr
@@ -1,7 +1,7 @@
 module Lib.Types
 -- For FC, Error
-import public Lib.Parser.Impl
+import public Lib.Common
 import Lib.Prettier
 import public Control.Monad.Error.Either
@@ -433,7 +433,7 @@ names ctx = toList $ map fst ctx.types
 public export
 M : Type -> Type
-M = (StateT TopContext (EitherT Impl.Error IO))
+M = (StateT TopContext (EitherT Error IO))
 ||| Force argument and print if verbose is true
 export
--- a/src/Main.idr
+++ b/src/Main.idr
@@ -11,7 +11,7 @@ import Data.IORef
 -- import Lib.Elab
 import Lib.Compile
 import Lib.Parser
-- import Lib.Parser.Impl
+import Lib.Parser.Impl
 import Lib.Prettier
 import Lib.ProcessDecl
 import Lib.Token
@@ -60,7 +60,8 @@ processModule base stk name = do
  let fn = base ++ "/" ++ name ++ ".newt"
  Right src <- readFile $ fn
    | Left err => fail (show err)
-  let toks = tokenise src
+  let Right toks = tokenise src
    | Left err => fail (showError src err)
  let Right (modName, ops, toks) := partialParse parseModHeader top.ops toks
    | Left err => fail (showError src err)
--- a/tests/black/TestCase4.newt
+++ b/tests/black/TestCase4.newt
@@ -15,7 +15,7 @@ head (x :: xs) = x
 -- These came from a Conor McBride lecture where they use SHE
-vapp : {s t: U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t
+vapp : {s t : U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t
 vapp (f :: fs) (t :: ts) = f t :: vapp fs ts
 vapp Nil Nil = Nil
@@ -32,7 +32,7 @@ fmap f (x :: xs) = (f x :: fmap f xs)
 pure : {a : U} {n : Nat} -> a -> Vect n a
 pure {a} {n} = vec n
-_<*>_ : {s t: U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t
+_<*>_ : {s t : U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t
 _<*>_ = vapp
 -- and idiom brackets (maybe someday)
--- a/tests/black/Zoo4eg.newt
+++ b/tests/black/Zoo4eg.newt
@@ -12,7 +12,7 @@ group1 : {A B : U}(x y z : A) -> B -> B
 group1 = \x y z b => b
 group2 : {A B}(x y z : A) -> B -> B
-group2 = \x y z b=> b
+group2 = \x y z b => b
 -- explicit id function used for annotation as in Idris
 the : (A : _) -> A -> A