From 6164893da5d748c591c276ac70123666f32c5623 Mon Sep 17 00:00:00 2001 From: Steve Dunham Date: Sat, 2 Nov 2024 10:22:04 -0700 Subject: [PATCH] Parsing updates for unicode - Allow unicode characters in indents and operators - Show lexing errors --- .gitignore | 1 + TODO.md | 22 +++++++++++----- newt/Combinatory.newt | 40 +++++++++++++++++++++++++++++ newt/Prelude.newt | 1 + src/Lib/Common.idr | 52 ++++++++++++++++++++++++++++++++++++++ src/Lib/Parser.idr | 19 ++++---------- src/Lib/Parser/Impl.idr | 49 +---------------------------------- src/Lib/Token.idr | 3 --- src/Lib/Tokenizer.idr | 51 ++++++++++++++++++++----------------- src/Lib/Types.idr | 4 +-- src/Main.idr | 5 ++-- tests/black/TestCase4.newt | 4 +-- tests/black/Zoo4eg.newt | 2 +- 13 files changed, 152 insertions(+), 101 deletions(-) create mode 100644 newt/Combinatory.newt create mode 100644 src/Lib/Common.idr diff --git a/.gitignore b/.gitignore index 6ce73d3..bf086df 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ build/ *.agdai *.js input.txt +node_modules diff --git a/TODO.md b/TODO.md index e106df1..cd0e9e3 100644 --- a/TODO.md +++ b/TODO.md @@ -3,15 +3,24 @@ - [ ] Allow unicode operators/names - refactored parser to prep for this -- [ ] get rid of stray INFO from auto resolution -- [ ] handle if_then_else_j +- [ ] Web tool + - edit, view output, view js, run js, monaco would be nice. + - need to shim out Buffer +- [x] get rid of stray INFO from auto resolution +- [ ] handle if_then_else_ style mixfix +- [ ] Search should look at context +- [ ] records +- [ ] copattern matching +- [ ] Support @ on the LHS - [x] Remember operators from imports - [ ] Default cases for non-primitives (currently gets expanded to all constructors) + - This may need a little care. But I think I could collect all constructors that only match wildcards into a single case. This would lose any information from the individual, unnamed cases though. + - There are cases where we have `_` and then `Foo` on the next line, but they should all get collected into the `Foo` case. I think I sorted all of this out for primitives. - [x] Case for primitives - [ ] aoc2023 translation - [x] day1 - [x] day2 - - some "real world" examples -v + - some "real world" examples - [x] Maybe Eq and stuff would work for typeclass without dealing with unification issues yet - [x] unsolved meta errors repeat (need to freeze or only report at end) - [x] Sanitize JS idents, e.g. `_+_` @@ -22,7 +31,6 @@ leave that implicit for efficiency. I think it would also make printing more readable. - When printing `Value`, I now print the spine size instead of spine. - [x] eval for case (see order.newt) -- [ ] dynamic pattern unification (add test case first) - [x] switch from commit/mustWork to checking progress - [x] type constructors are no longer generated? And seem to have 0 arity. - [x] raw let is not yet implemented (although define used by case tree building) @@ -60,17 +68,19 @@ - [ ] magic nat (codegen as number with appropriate pattern matching) - [ ] magic tuple? (codegen as array) - [ ] magic newtype? (drop them in codegen) -- [ ] records / copatterns - [x] vscode: syntax highlighting for String - [ ] add `pop` or variant of `pfunc` that maps to an operator, giving the js operator and precedence on RHS ### Parsing - [ ] consider allowing σ etc in identifiers - - Probably need to merge oper / ident first and sort out mixfix in parsing. + - Probably need to merge oper / ident first and sort out mixfix in parsing + - The mixfix parsing can handle this now, need to update lexing. +- [ ] Parse error not ideal for `\x y z b=> b` (points to lambda) ### Background - [ ] Read Ulf Norell thesis +- [ ] Finish reading dynamic pattern unification paper to see what is missing/wrong with the current implementation diff --git a/newt/Combinatory.newt b/newt/Combinatory.newt new file mode 100644 index 0000000..9bf5578 --- /dev/null +++ b/newt/Combinatory.newt @@ -0,0 +1,40 @@ +module Combinatory + +data Unit : U where + MkUnit : Unit + +infixr 7 _::_ +data List : U -> U where + Nil : {A : U} -> List A + _::_ : {A : U} -> A -> List A -> List A + +-- prj/menagerie/papers/combinatory + +infixr 6 _~>_ +data Type : U where + ι : Type + _~>_ : Type -> Type -> Type + +A : U +A = Unit + +Val : Type -> U +Val ι = A +Val (x ~> y) = Val x -> Val y + +Ctx : U +Ctx = List Type + +data Ref : Type -> Ctx -> U where + Z : {σ : Type} {Γ : Ctx} -> Ref σ (σ :: Γ) + S : {σ τ : Type} {Γ : Ctx} -> Ref σ Γ -> Ref σ (τ :: Γ) + +data Term : Ctx -> Type -> U where + App : {Γ : Ctx} {σ τ : Type} -> Term Γ (σ ~> τ) -> Term Γ σ -> Term Γ τ + Lam : {Γ : Ctx} {σ τ : Type} -> Term (σ :: Γ) τ -> Term Γ (σ ~> τ) + Var : {Γ : Ctx} {σ : Type} -> Ref σ Γ → Term Γ σ + +-- FIXME, I'm not getting an error for Nil, but it's shadowing Nil +data Env : Ctx -> U where + ENil : Env Nil + ECons : {Γ : Ctx} {σ : Type} → Val σ → Env Γ → Env (σ :: Γ) diff --git a/newt/Prelude.newt b/newt/Prelude.newt index 9769974..e0a9dee 100644 --- a/newt/Prelude.newt +++ b/newt/Prelude.newt @@ -17,6 +17,7 @@ data Either : U -> U -> U where infixr 0 _$_ +-- Currently very noisy in generated code _$_ : {a b : U} -> (a -> b) -> a -> b f $ a = f a diff --git a/src/Lib/Common.idr b/src/Lib/Common.idr new file mode 100644 index 0000000..08c4852 --- /dev/null +++ b/src/Lib/Common.idr @@ -0,0 +1,52 @@ +module Lib.Common + +import Data.String + +-- I was going to use a record, but we're peeling this off of bounds at the moment. +public export +FC : Type +FC = (Int,Int) + +public export +interface HasFC a where + getFC : a -> FC + +%name FC fc + +export +emptyFC : FC +emptyFC = (0,0) + +-- Error of a parse +public export +data Error = E FC String +%name Error err + +public export +showError : String -> Error -> String +showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src) + where + go : Int -> List String -> String + go l [] = "" + go l (x :: xs) = + if l == line then + " \{x}\n \{replicate (cast col) ' '}^\n" + else if line - 3 < l then " " ++ x ++ "\n" ++ go (l + 1) xs + else go (l + 1) xs + +public export +data Fixity = InfixL | InfixR | Infix + +export +Show Fixity where + show InfixL = "infixl" + show InfixR = "infixr" + show Infix = "infix" + +public export +record OpDef where + constructor MkOp + name : String + prec : Int + fix : Fixity + diff --git a/src/Lib/Parser.idr b/src/Lib/Parser.idr index 322393d..b8f7612 100644 --- a/src/Lib/Parser.idr +++ b/src/Lib/Parser.idr @@ -95,7 +95,6 @@ pArg = do (Explicit,fc,) <$> atom <|> (Implicit,fc,) <$> braces typeExpr <|> (Auto,fc,) <$> dbraces typeExpr - <|> (Explicit,fc,) . RVar fc <$> token Oper AppSpine = List (Icit,FC,Raw) @@ -203,13 +202,6 @@ caseExpr = do alts <- startBlock $ someSame $ caseAlt pure $ RCase fc sc alts -doArrow : Parser DoStmt -doArrow = do - fc <- getPos - name <- try $ ident <* keyword "<-" - expr <- term - pure $ DoArrow fc name expr - doStmt : Parser DoStmt doStmt = DoArrow <$> getPos <*> (try $ ident <* keyword "<-") <*> term @@ -232,9 +224,8 @@ varname = (ident <|> uident <|> keyword "_" *> pure "_") ebind : Parser (List (FC, String, Icit, Raw)) ebind = do - sym "(" - names <- some $ withPos varname - sym ":" + -- don't commit until we see the ":" + names <- try (sym "(" *> some (withPos varname) <* sym ":") ty <- typeExpr sym ")" pure $ map (\(pos, name) => (pos, name, Explicit, ty)) names @@ -262,7 +253,7 @@ arrow = sym "->" <|> sym "→" -- Collect a bunch of binders (A : U) {y : A} -> ... binders : Parser Raw binders = do - binds <- many (abind <|> ibind <|> try ebind) + binds <- many (abind <|> ibind <|> ebind) arrow scope <- typeExpr pure $ foldr (uncurry mkBind) scope (join binds) @@ -286,7 +277,7 @@ typeExpr = binders export parseSig : Parser Decl -parseSig = TypeSig <$> getPos <*> some (ident <|> uident) <* keyword ":" <*> typeExpr +parseSig = TypeSig <$> getPos <*> try (some (ident <|> uident) <* keyword ":") <*> typeExpr parseImport : Parser Import parseImport = MkImport <$> getPos <* keyword "import" <*> uident @@ -364,7 +355,7 @@ parseNorm = DCheck <$> getPos <* keyword "#check" <*> typeExpr <* keyword ":" <* export parseDecl : Parser Decl -parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> (try $ parseSig) <|> parseDef +parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> parseSig <|> parseDef export diff --git a/src/Lib/Parser/Impl.idr b/src/Lib/Parser/Impl.idr index 7f119eb..cdaaa22 100644 --- a/src/Lib/Parser/Impl.idr +++ b/src/Lib/Parser/Impl.idr @@ -1,6 +1,7 @@ module Lib.Parser.Impl import Lib.Token +import Lib.Common import Data.String import Data.Nat @@ -8,54 +9,6 @@ public export TokenList : Type TokenList = List BTok -public export -data Fixity = InfixL | InfixR | Infix - -export -Show Fixity where - show InfixL = "infixl" - show InfixR = "infixr" - show Infix = "infix" - --- I was going to use a record, but we're peeling this off of bounds at the moment. -public export -FC : Type -FC = (Int,Int) - -public export -interface HasFC a where - getFC : a -> FC - -%name FC fc - -export -emptyFC : FC -emptyFC = (0,0) - --- Error of a parse -public export -data Error = E FC String -%name Error err - -public export -showError : String -> Error -> String -showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src) - where - go : Int -> List String -> String - go l [] = "" - go l (x :: xs) = - if l == line then - " \{x}\n \{replicate (cast col) ' '}^\n" - else if line - 3 < l then " " ++ x ++ "\n" ++ go (l + 1) xs - else go (l + 1) xs - -public export -record OpDef where - constructor MkOp - name : String - prec : Int - fix : Fixity - -- Result of a parse public export data Result : Type -> Type where diff --git a/src/Lib/Token.idr b/src/Lib/Token.idr index a7056e6..162b65b 100644 --- a/src/Lib/Token.idr +++ b/src/Lib/Token.idr @@ -9,7 +9,6 @@ data Kind = Ident | UIdent | Keyword - | Oper | MixFix | Number | Character @@ -29,7 +28,6 @@ Show Kind where show Ident = "Ident" show UIdent = "UIdent" show Keyword = "Keyword" - show Oper = "Oper" show MixFix = "MixFix" show Number = "Number" show Character = "Character" @@ -47,7 +45,6 @@ Eq Kind where Ident == Ident = True UIdent == UIdent = True Keyword == Keyword = True - Oper == Oper = True MixFix == MixFix = True Number == Number = True Character == Character = True diff --git a/src/Lib/Tokenizer.idr b/src/Lib/Tokenizer.idr index d4ecfee..dd01706 100644 --- a/src/Lib/Tokenizer.idr +++ b/src/Lib/Tokenizer.idr @@ -3,10 +3,12 @@ module Lib.Tokenizer import Text.Lexer import Text.Lexer.Tokenizer import Lib.Token +import Lib.Common keywords : List String keywords = ["let", "in", "where", "case", "of", "data", "U", "do", - "ptype", "pfunc", "module", "infixl", "infixr", "infix"] + "ptype", "pfunc", "module", "infixl", "infixr", "infix", + "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_"] specialOps : List String specialOps = ["->", ":", "=>", ":=", "=", "<-"] @@ -17,18 +19,12 @@ checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s checkUKW : String -> Token Kind checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s -checkOp : String -> Token Kind -checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s - -isOpChar : Char -> Bool -isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~") - -opChar : Lexer -opChar = pred isOpChar - identMore : Lexer identMore = alphaNum <|> exact "." <|> exact "'" <|> exact "_" +singleton : Lexer +singleton = oneOf "()\\{}[]," + quo : Recognise True quo = is '"' @@ -52,25 +48,34 @@ opMiddle = pred (\c => not (isSpace c || c == '_')) rawTokens : Tokenizer (Token Kind) rawTokens - = match (lower <+> many identMore) checkKW - <|> match (upper <+> many identMore) checkUKW - <|> match (some digit) (Tok Number) - <|> match (is '#' <+> many alpha) (Tok Pragma) - <|> match charLit (Tok Character) + = match spaces (Tok Space) + -- { is singleton except for {{ + <|> match (exact "{{" <|> exact "}}") (Tok Keyword) + -- need to make this an ident + <|> match (exact ",") (checkKW) + -- for now, our lambda slash is singleton + <|> match (singleton) (Tok Symbol) + -- TODO Drop MixFix token type when we support if_then_else_ <|> match (exact "_" <+> (some opMiddle) <+> exact "_") (Tok MixFix) - <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote) + -- REVIEW - expect non-alpha after? + <|> match (some digit) (Tok Number) + -- for module names and maybe type constructors + <|> match (charLit) (Tok Character) + <|> match (is '#' <+> many alpha) (Tok Pragma) <|> match (lineComment (exact "--")) (Tok Space) <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space) - <|> match (exact ",") (Tok Oper) - <|> match (some opChar) checkOp - <|> match (exact "{{" <|> exact "}}") (Tok Keyword) - <|> match symbol (Tok Symbol) - <|> match spaces (Tok Space) + <|> match (upper <+> many identMore) checkUKW + <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote) + -- accept almost everything, but + <|> match (some (non (space <|> singleton))) checkKW notSpace : WithBounds (Token Kind) -> Bool notSpace (MkBounded (Tok Space _) _ _) = False notSpace _ = True export -tokenise : String -> List BTok -tokenise = filter notSpace . fst . lex rawTokens +tokenise : String -> Either Error (List BTok) +tokenise s = case lex rawTokens s of + (toks, EndInput, l, c, what) => Right (filter notSpace toks) + (toks, reason, l, c, what) => Left (E (l,c) "\{show reason}") + diff --git a/src/Lib/Types.idr b/src/Lib/Types.idr index 00a03ab..2435e87 100644 --- a/src/Lib/Types.idr +++ b/src/Lib/Types.idr @@ -1,7 +1,7 @@ module Lib.Types -- For FC, Error -import public Lib.Parser.Impl +import public Lib.Common import Lib.Prettier import public Control.Monad.Error.Either @@ -433,7 +433,7 @@ names ctx = toList $ map fst ctx.types public export M : Type -> Type -M = (StateT TopContext (EitherT Impl.Error IO)) +M = (StateT TopContext (EitherT Error IO)) ||| Force argument and print if verbose is true export diff --git a/src/Main.idr b/src/Main.idr index aa280e0..024a3c8 100644 --- a/src/Main.idr +++ b/src/Main.idr @@ -11,7 +11,7 @@ import Data.IORef -- import Lib.Elab import Lib.Compile import Lib.Parser --- import Lib.Parser.Impl +import Lib.Parser.Impl import Lib.Prettier import Lib.ProcessDecl import Lib.Token @@ -60,7 +60,8 @@ processModule base stk name = do let fn = base ++ "/" ++ name ++ ".newt" Right src <- readFile $ fn | Left err => fail (show err) - let toks = tokenise src + let Right toks = tokenise src + | Left err => fail (showError src err) let Right (modName, ops, toks) := partialParse parseModHeader top.ops toks | Left err => fail (showError src err) diff --git a/tests/black/TestCase4.newt b/tests/black/TestCase4.newt index 2b1ebdd..38054d3 100644 --- a/tests/black/TestCase4.newt +++ b/tests/black/TestCase4.newt @@ -15,7 +15,7 @@ head (x :: xs) = x -- These came from a Conor McBride lecture where they use SHE -vapp : {s t: U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t +vapp : {s t : U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t vapp (f :: fs) (t :: ts) = f t :: vapp fs ts vapp Nil Nil = Nil @@ -32,7 +32,7 @@ fmap f (x :: xs) = (f x :: fmap f xs) pure : {a : U} {n : Nat} -> a -> Vect n a pure {a} {n} = vec n -_<*>_ : {s t: U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t +_<*>_ : {s t : U} {k : Nat} -> Vect k (s -> t) -> Vect k s -> Vect k t _<*>_ = vapp -- and idiom brackets (maybe someday) diff --git a/tests/black/Zoo4eg.newt b/tests/black/Zoo4eg.newt index 17e103e..a778d25 100644 --- a/tests/black/Zoo4eg.newt +++ b/tests/black/Zoo4eg.newt @@ -12,7 +12,7 @@ group1 : {A B : U}(x y z : A) -> B -> B group1 = \x y z b => b group2 : {A B}(x y z : A) -> B -> B -group2 = \x y z b=> b +group2 = \x y z b => b -- explicit id function used for annotation as in Idris the : (A : _) -> A -> A