From 6164893da5d748c591c276ac70123666f32c5623 Mon Sep 17 00:00:00 2001
From: Steve Dunham <dunhamsteve@gmail.com>
Date: Sat, 2 Nov 2024 10:22:04 -0700
Subject: [PATCH] Parsing updates for unicode

- Allow unicode characters in indents and operators
- Show lexing errors
---
 .gitignore                 |  1 +
 TODO.md                    | 22 +++++++++++-----
 newt/Combinatory.newt      | 40 +++++++++++++++++++++++++++++
 newt/Prelude.newt          |  1 +
 src/Lib/Common.idr         | 52 ++++++++++++++++++++++++++++++++++++++
 src/Lib/Parser.idr         | 19 ++++----------
 src/Lib/Parser/Impl.idr    | 49 +----------------------------------
 src/Lib/Token.idr          |  3 ---
 src/Lib/Tokenizer.idr      | 51 ++++++++++++++++++++-----------------
 src/Lib/Types.idr          |  4 +--
 src/Main.idr               |  5 ++--
 tests/black/TestCase4.newt |  4 +--
 tests/black/Zoo4eg.newt    |  2 +-
 13 files changed, 152 insertions(+), 101 deletions(-)
 create mode 100644 newt/Combinatory.newt
 create mode 100644 src/Lib/Common.idr

diff --git a/.gitignore b/.gitignore
index 6ce73d3..bf086df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ build/
 *.agdai
 *.js
 input.txt
+node_modules
diff --git a/TODO.md b/TODO.md
index e106df1..cd0e9e3 100644
--- a/TODO.md
+++ b/TODO.md
@@ -3,15 +3,24 @@
 
 - [ ] Allow unicode operators/names
   - refactored parser to prep for this
-- [ ] get rid of stray INFO from auto resolution
-- [ ] handle if_then_else_j
+- [ ] Web tool
+  - edit, view output, view js, run js, monaco would be nice.
+  - need to shim out Buffer
+- [x] get rid of stray INFO from auto resolution
+- [ ] handle if_then_else_ style mixfix
+- [ ] Search should look at context
+- [ ] records
+- [ ] copattern matching
+- [ ] Support @ on the LHS
 - [x] Remember operators from imports
 - [ ] Default cases for non-primitives (currently gets expanded to all constructors)
+  - This may need a little care. But I think I could collect all constructors that only match wildcards into a single case. This would lose any information from the individual, unnamed cases though.
+  - There are cases where we have  `_` and then `Foo` on the next line, but they should all get collected into the `Foo` case. I think I sorted all of this out for primitives.
 - [x] Case for primitives
 - [ ] aoc2023 translation
   - [x] day1
   - [x] day2
-  - some "real world" examples -v
+  - some "real world" examples
 - [x] Maybe Eq and stuff would work for typeclass without dealing with unification issues yet
 - [x] unsolved meta errors repeat (need to freeze or only report at end)
 - [x] Sanitize JS idents, e.g. `_+_`
@@ -22,7 +31,6 @@
     leave that implicit for efficiency. I think it would also make printing more readable.
   - When printing `Value`, I now print the spine size instead of spine.
 - [x] eval for case (see order.newt)
-- [ ] dynamic pattern unification (add test case first)
 - [x] switch from commit/mustWork to checking progress
 - [x] type constructors are no longer generated?  And seem to have 0 arity.
 - [x] raw let is not yet implemented (although define used by case tree building)
@@ -60,17 +68,19 @@
 - [ ] magic nat (codegen as number with appropriate pattern matching)
 - [ ] magic tuple? (codegen as array)
 - [ ] magic newtype? (drop them in codegen)
-- [ ] records / copatterns
 - [x] vscode: syntax highlighting for String
 - [ ] add `pop` or variant of `pfunc` that maps to an operator, giving the js operator and precedence on RHS
 
 ### Parsing
 
 - [ ] consider allowing σ etc in identifiers
-  - Probably need to merge oper / ident first and sort out mixfix in parsing.
+  - Probably need to merge oper / ident first and sort out mixfix in parsing
+  - The mixfix parsing can handle this now, need to update lexing.
+- [ ] Parse error not ideal for `\x y z b=> b` (points to lambda)
 
 ### Background
 
 - [ ] Read Ulf Norell thesis
+- [ ] Finish reading dynamic pattern unification paper to see what is missing/wrong with the current implementation
 
 
diff --git a/newt/Combinatory.newt b/newt/Combinatory.newt
new file mode 100644
index 0000000..9bf5578
--- /dev/null
+++ b/newt/Combinatory.newt
@@ -0,0 +1,40 @@
+module Combinatory
+
+data Unit : U where
+  MkUnit : Unit
+
+infixr 7 _::_
+data List : U -> U where
+  Nil : {A : U} -> List A
+  _::_ : {A : U} -> A -> List A -> List A
+
+-- prj/menagerie/papers/combinatory
+
+infixr 6 _~>_
+data Type : U where
+  ι : Type
+  _~>_ : Type -> Type -> Type
+
+A : U
+A = Unit
+
+Val : Type -> U
+Val ι = A
+Val (x ~> y) = Val x -> Val y
+
+Ctx : U
+Ctx = List Type
+
+data Ref : Type -> Ctx -> U where
+  Z : {σ : Type} {Γ : Ctx} -> Ref σ (σ :: Γ)
+  S : {σ τ : Type} {Γ : Ctx} -> Ref σ Γ -> Ref σ (τ :: Γ)
+
+data Term : Ctx -> Type -> U where
+  App : {Γ : Ctx} {σ τ : Type} -> Term Γ (σ ~> τ) -> Term Γ σ -> Term Γ τ
+  Lam : {Γ : Ctx} {σ τ : Type} -> Term (σ :: Γ) τ -> Term Γ (σ ~> τ)
+  Var : {Γ : Ctx} {σ : Type} -> Ref σ Γ → Term Γ σ
+
+-- FIXME, I'm not getting an error for Nil, but it's shadowing Nil
+data Env : Ctx -> U where
+  ENil : Env Nil
+  ECons : {Γ : Ctx} {σ : Type} → Val σ → Env Γ → Env (σ :: Γ)
diff --git a/newt/Prelude.newt b/newt/Prelude.newt
index 9769974..e0a9dee 100644
--- a/newt/Prelude.newt
+++ b/newt/Prelude.newt
@@ -17,6 +17,7 @@ data Either : U -> U -> U where
 
 infixr 0 _$_
 
+-- Currently very noisy in generated code
 _$_ : {a b : U} -> (a -> b) -> a -> b
 f $ a = f a
 
diff --git a/src/Lib/Common.idr b/src/Lib/Common.idr
new file mode 100644
index 0000000..08c4852
--- /dev/null
+++ b/src/Lib/Common.idr
@@ -0,0 +1,52 @@
+module Lib.Common
+
+import Data.String
+
+-- I was going to use a record, but we're peeling this off of bounds at the moment.
+public export
+FC : Type
+FC = (Int,Int)
+
+public export
+interface HasFC a where
+  getFC : a -> FC
+
+%name FC fc
+
+export
+emptyFC : FC
+emptyFC = (0,0)
+
+-- Error of a parse
+public export
+data Error = E FC String
+%name Error err
+
+public export
+showError : String -> Error -> String
+showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src)
+  where
+    go : Int -> List String -> String
+    go l [] = ""
+    go l (x :: xs) =
+      if l == line then
+        "  \{x}\n  \{replicate (cast col) ' '}^\n"
+      else if line - 3 < l then "  " ++ x ++ "\n" ++ go (l + 1) xs
+      else go (l + 1) xs
+
+public export
+data Fixity = InfixL | InfixR | Infix
+
+export
+Show Fixity where
+  show InfixL = "infixl"
+  show InfixR = "infixr"
+  show Infix = "infix"
+
+public export
+record OpDef where
+  constructor MkOp
+  name : String
+  prec : Int
+  fix : Fixity
+
diff --git a/src/Lib/Parser.idr b/src/Lib/Parser.idr
index 322393d..b8f7612 100644
--- a/src/Lib/Parser.idr
+++ b/src/Lib/Parser.idr
@@ -95,7 +95,6 @@ pArg = do
   (Explicit,fc,) <$> atom
     <|> (Implicit,fc,) <$> braces typeExpr
     <|> (Auto,fc,) <$> dbraces typeExpr
-    <|> (Explicit,fc,) . RVar fc <$> token Oper
 
 AppSpine = List (Icit,FC,Raw)
 
@@ -203,13 +202,6 @@ caseExpr = do
   alts <- startBlock $ someSame $ caseAlt
   pure $ RCase fc sc alts
 
-doArrow : Parser DoStmt
-doArrow = do
-  fc <- getPos
-  name <- try $ ident <* keyword "<-"
-  expr <- term
-  pure $ DoArrow fc name expr
-
 doStmt : Parser DoStmt
 doStmt
   = DoArrow <$> getPos <*> (try $ ident <* keyword "<-") <*> term
@@ -232,9 +224,8 @@ varname = (ident <|> uident <|> keyword "_" *> pure "_")
 
 ebind : Parser (List (FC, String, Icit, Raw))
 ebind = do
-  sym "("
-  names <- some $ withPos varname
-  sym ":"
+  -- don't commit until we see the ":"
+  names <- try (sym "(" *> some (withPos varname) <* sym ":")
   ty <- typeExpr
   sym ")"
   pure $ map (\(pos, name) => (pos, name, Explicit, ty)) names
@@ -262,7 +253,7 @@ arrow = sym "->" <|> sym "→"
 -- Collect a bunch of binders (A : U) {y : A} -> ...
 binders : Parser Raw
 binders = do
-  binds <- many (abind <|> ibind <|> try ebind)
+  binds <- many (abind <|> ibind <|> ebind)
   arrow
   scope <- typeExpr
   pure $ foldr (uncurry mkBind) scope (join binds)
@@ -286,7 +277,7 @@ typeExpr = binders
 
 export
 parseSig : Parser Decl
-parseSig = TypeSig <$> getPos <*> some (ident <|> uident) <* keyword ":" <*> typeExpr
+parseSig = TypeSig <$> getPos <*> try (some (ident <|> uident) <* keyword ":") <*> typeExpr
 
 parseImport : Parser Import
 parseImport = MkImport <$> getPos <* keyword "import" <*> uident
@@ -364,7 +355,7 @@ parseNorm = DCheck <$> getPos <* keyword "#check" <*> typeExpr <* keyword ":" <*
 
 export
 parseDecl : Parser Decl
-parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> (try $ parseSig) <|> parseDef
+parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> parseSig <|> parseDef
 
 
 export
diff --git a/src/Lib/Parser/Impl.idr b/src/Lib/Parser/Impl.idr
index 7f119eb..cdaaa22 100644
--- a/src/Lib/Parser/Impl.idr
+++ b/src/Lib/Parser/Impl.idr
@@ -1,6 +1,7 @@
 module Lib.Parser.Impl
 
 import Lib.Token
+import Lib.Common
 import Data.String
 import Data.Nat
 
@@ -8,54 +9,6 @@ public export
 TokenList : Type
 TokenList = List BTok
 
-public export
-data Fixity = InfixL | InfixR | Infix
-
-export
-Show Fixity where
-  show InfixL = "infixl"
-  show InfixR = "infixr"
-  show Infix = "infix"
-
--- I was going to use a record, but we're peeling this off of bounds at the moment.
-public export
-FC : Type
-FC = (Int,Int)
-
-public export
-interface HasFC a where
-  getFC : a -> FC
-
-%name FC fc
-
-export
-emptyFC : FC
-emptyFC = (0,0)
-
--- Error of a parse
-public export
-data Error = E FC String
-%name Error err
-
-public export
-showError : String -> Error -> String
-showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src)
-  where
-    go : Int -> List String -> String
-    go l [] = ""
-    go l (x :: xs) =
-      if l == line then
-        "  \{x}\n  \{replicate (cast col) ' '}^\n"
-      else if line - 3 < l then "  " ++ x ++ "\n" ++ go (l + 1) xs
-      else go (l + 1) xs
-
-public export
-record OpDef where
-  constructor MkOp
-  name : String
-  prec : Int
-  fix : Fixity
-
 -- Result of a parse
 public export
 data Result : Type -> Type where
diff --git a/src/Lib/Token.idr b/src/Lib/Token.idr
index a7056e6..162b65b 100644
--- a/src/Lib/Token.idr
+++ b/src/Lib/Token.idr
@@ -9,7 +9,6 @@ data Kind
   = Ident
   | UIdent
   | Keyword
-  | Oper
   | MixFix
   | Number
   | Character
@@ -29,7 +28,6 @@ Show Kind where
   show Ident   = "Ident"
   show UIdent   = "UIdent"
   show Keyword = "Keyword"
-  show Oper    = "Oper"
   show MixFix = "MixFix"
   show Number  = "Number"
   show Character  = "Character"
@@ -47,7 +45,6 @@ Eq Kind where
   Ident   == Ident = True
   UIdent  == UIdent = True
   Keyword == Keyword = True
-  Oper    == Oper = True
   MixFix  == MixFix = True
   Number  == Number = True
   Character == Character = True
diff --git a/src/Lib/Tokenizer.idr b/src/Lib/Tokenizer.idr
index d4ecfee..dd01706 100644
--- a/src/Lib/Tokenizer.idr
+++ b/src/Lib/Tokenizer.idr
@@ -3,10 +3,12 @@ module Lib.Tokenizer
 import Text.Lexer
 import Text.Lexer.Tokenizer
 import Lib.Token
+import Lib.Common
 
 keywords : List String
 keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
-            "ptype", "pfunc", "module", "infixl", "infixr", "infix"]
+            "ptype", "pfunc", "module", "infixl", "infixr", "infix",
+            "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_"]
 
 specialOps : List String
 specialOps = ["->", ":", "=>", ":=", "=", "<-"]
@@ -17,18 +19,12 @@ checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s
 checkUKW : String -> Token Kind
 checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s
 
-checkOp : String -> Token Kind
-checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s
-
-isOpChar : Char -> Bool
-isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")
-
-opChar : Lexer
-opChar = pred isOpChar
-
 identMore : Lexer
 identMore = alphaNum <|> exact "." <|> exact "'" <|> exact "_"
 
+singleton : Lexer
+singleton = oneOf "()\\{}[],"
+
 quo : Recognise True
 quo = is '"'
 
@@ -52,25 +48,34 @@ opMiddle = pred (\c => not (isSpace c || c == '_'))
 
 rawTokens : Tokenizer (Token Kind)
 rawTokens
-   =  match (lower <+> many identMore) checkKW
-  <|> match (upper <+> many identMore) checkUKW
-  <|> match (some digit) (Tok Number)
-  <|> match (is '#' <+> many alpha) (Tok Pragma)
-  <|> match charLit (Tok Character)
+  = match spaces (Tok Space)
+  -- { is singleton except for {{
+  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
+  -- need to make this an ident
+  <|> match (exact ",") (checkKW)
+  -- for now, our lambda slash is singleton
+  <|> match (singleton) (Tok Symbol)
+  -- TODO Drop MixFix token type when we support if_then_else_
   <|> match (exact "_" <+> (some opMiddle) <+> exact "_") (Tok MixFix)
-  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
+  -- REVIEW - expect non-alpha after?
+  <|> match (some digit) (Tok Number)
+  -- for module names and maybe type constructors
+  <|> match (charLit) (Tok Character)
+  <|> match (is '#' <+> many alpha) (Tok Pragma)
   <|> match (lineComment (exact "--")) (Tok Space)
   <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
-  <|> match (exact ",") (Tok Oper)
-  <|> match (some opChar) checkOp
-  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
-  <|> match symbol (Tok Symbol)
-  <|> match spaces (Tok Space)
+  <|> match (upper <+> many identMore) checkUKW
+  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
+  -- accept almost everything, but
+  <|> match (some (non (space <|> singleton))) checkKW
 
 notSpace : WithBounds (Token Kind) -> Bool
 notSpace (MkBounded (Tok Space _) _ _) = False
 notSpace _ = True
 
 export
-tokenise : String -> List BTok
-tokenise = filter notSpace . fst . lex rawTokens
+tokenise : String -> Either Error (List BTok)
+tokenise s = case lex rawTokens s of
+  (toks, EndInput, l, c, what) => Right (filter notSpace toks)
+  (toks, reason, l, c, what) => Left (E (l,c) "\{show reason}")
+
diff --git a/src/Lib/Types.idr b/src/Lib/Types.idr
index 00a03ab..2435e87 100644
--- a/src/Lib/Types.idr
+++ b/src/Lib/Types.idr
@@ -1,7 +1,7 @@
 module Lib.Types
 
 -- For FC, Error
-import public Lib.Parser.Impl
+import public Lib.Common
 import Lib.Prettier
 
 import public Control.Monad.Error.Either
@@ -433,7 +433,7 @@ names ctx = toList $ map fst ctx.types
 
 public export
 M : Type -> Type
-M = (StateT TopContext (EitherT Impl.Error IO))
+M = (StateT TopContext (EitherT Error IO))
 
 ||| Force argument and print if verbose is true
 export
diff --git a/src/Main.idr b/src/Main.idr
index aa280e0..024a3c8 100644
--- a/src/Main.idr
+++ b/src/Main.idr
@@ -11,7 +11,7 @@ import Data.IORef
 -- import Lib.Elab
 import Lib.Compile
 import Lib.Parser
--- import Lib.Parser.Impl
+import Lib.Parser.Impl
 import Lib.Prettier
 import Lib.ProcessDecl
 import Lib.Token
@@ -60,7 +60,8 @@ processModule base stk name = do
   let fn = base ++ "/" ++ name ++ ".newt"
   Right src <- readFile $ fn
     | Left err => fail (show err)
-  let toks = tokenise src
+  let Right toks = tokenise src
+    | Left err => fail (showError src err)
 
   let Right (modName, ops, toks) := partialParse parseModHeader top.ops toks
     | Left err => fail (showError src err)
diff --git a/tests/black/TestCase4.newt b/tests/black/TestCase4.newt
index 2b1ebdd..38054d3 100644
--- a/tests/black/TestCase4.newt
+++ b/tests/black/TestCase4.newt
@@ -15,7 +15,7 @@ head (x :: xs) = x
 
 -- These came from a Conor McBride lecture where they use SHE
 
-vapp : {s t: U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t
+vapp : {s t : U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t
 vapp (f :: fs) (t :: ts) = f t :: vapp fs ts
 vapp Nil Nil = Nil
 
@@ -32,7 +32,7 @@ fmap f (x :: xs) = (f x :: fmap f xs)
 pure : {a : U} {n : Nat} -> a -> Vect n a
 pure {a} {n} = vec n
 
-_<*>_ : {s t: U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t
+_<*>_ : {s t : U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t
 _<*>_ = vapp
 
 -- and idiom brackets (maybe someday)
diff --git a/tests/black/Zoo4eg.newt b/tests/black/Zoo4eg.newt
index 17e103e..a778d25 100644
--- a/tests/black/Zoo4eg.newt
+++ b/tests/black/Zoo4eg.newt
@@ -12,7 +12,7 @@ group1 : {A B : U}(x y z : A) -> B -> B
 group1 = \x y z b => b
 
 group2 : {A B}(x y z : A) -> B -> B
-group2 = \x y z b=> b
+group2 = \x y z b => b
 
 -- explicit id function used for annotation as in Idris
 the : (A : _) -> A -> A