Parsing updates for unicode

- Allow unicode characters in idents and operators
- Show lexing errors
This commit is contained in:
2024-11-02 10:22:04 -07:00
parent d41558c219
commit 6164893da5
13 changed files with 152 additions and 101 deletions

1
.gitignore vendored
View File

@@ -7,3 +7,4 @@ build/
*.agdai
*.js
input.txt
node_modules

22
TODO.md
View File

@@ -3,15 +3,24 @@
- [ ] Allow unicode operators/names
- refactored parser to prep for this
- [ ] get rid of stray INFO from auto resolution
- [ ] handle if_then_else_j
- [ ] Web tool
- edit, view output, view js, run js, monaco would be nice.
- need to shim out Buffer
- [x] get rid of stray INFO from auto resolution
- [ ] handle if_then_else_ style mixfix
- [ ] Search should look at context
- [ ] records
- [ ] copattern matching
- [ ] Support @ on the LHS
- [x] Remember operators from imports
- [ ] Default cases for non-primitives (currently gets expanded to all constructors)
- This may need a little care. But I think I could collect all constructors that only match wildcards into a single case. This would lose any information from the individual, unnamed cases though.
- There are cases where we have `_` and then `Foo` on the next line, but they should all get collected into the `Foo` case. I think I sorted all of this out for primitives.
- [x] Case for primitives
- [ ] aoc2023 translation
- [x] day1
- [x] day2
- some "real world" examples -v
- some "real world" examples
- [x] Maybe Eq and stuff would work for typeclass without dealing with unification issues yet
- [x] unsolved meta errors repeat (need to freeze or only report at end)
- [x] Sanitize JS idents, e.g. `_+_`
@@ -22,7 +31,6 @@
leave that implicit for efficiency. I think it would also make printing more readable.
- When printing `Value`, I now print the spine size instead of spine.
- [x] eval for case (see order.newt)
- [ ] dynamic pattern unification (add test case first)
- [x] switch from commit/mustWork to checking progress
- [x] type constructors are no longer generated? And seem to have 0 arity.
- [x] raw let is not yet implemented (although define used by case tree building)
@@ -60,17 +68,19 @@
- [ ] magic nat (codegen as number with appropriate pattern matching)
- [ ] magic tuple? (codegen as array)
- [ ] magic newtype? (drop them in codegen)
- [ ] records / copatterns
- [x] vscode: syntax highlighting for String
- [ ] add `pop` or variant of `pfunc` that maps to an operator, giving the js operator and precedence on RHS
### Parsing
- [ ] consider allowing σ etc in identifiers
- Probably need to merge oper / ident first and sort out mixfix in parsing.
- Probably need to merge oper / ident first and sort out mixfix in parsing
- The mixfix parsing can handle this now, need to update lexing.
- [ ] Parse error not ideal for `\x y z b=> b` (points to lambda)
### Background
- [ ] Read Ulf Norell thesis
- [ ] Finish reading dynamic pattern unification paper to see what is missing/wrong with the current implementation

40
newt/Combinatory.newt Normal file
View File

@@ -0,0 +1,40 @@
module Combinatory
-- Unit type: MkUnit is its only constructor.
data Unit : U where
MkUnit : Unit
-- The cons operator is declared right-associative at level 7.
infixr 7 _::_
-- Lists over an implicit element type A: Nil is the empty list and
-- _::_ puts one element in front of an existing list.
data List : U -> U where
Nil : {A : U} -> List A
_::_ : {A : U} -> A -> List A -> List A
-- prj/menagerie/papers/combinatory
-- The object-level arrow is declared right-associative at level 6.
infixr 6 _~>_
-- Object-level types: the constant ι, and _~>_ which combines two types.
-- Val (below) interprets ι as A and _~>_ as a function type.
data Type : U where
ι : Type
_~>_ : Type -> Type -> Type
-- A is fixed to Unit; Val uses it as the interpretation of ι.
A : U
A = Unit
-- Interprets an object-level Type as a type in U:
-- ι becomes A, and x ~> y becomes the function type from Val x to Val y.
Val : Type -> U
Val ι = A
Val (x ~> y) = Val x -> Val y
-- A context is a list of object-level types.
Ctx : U
Ctx = List Type
-- Ref σ Γ is a reference to an entry of type σ inside the context Γ.
-- Z refers to the head of the context; S takes a reference into Γ and
-- gives one into Γ with one more entry τ added at the front.
data Ref : Type -> Ctx -> U where
Z : {σ : Type} {Γ : Ctx} -> Ref σ (σ :: Γ)
S : {σ τ : Type} {Γ : Ctx} -> Ref σ Γ -> Ref σ (τ :: Γ)
-- Terms indexed by the context they live in and by their object-level type.
--   App applies a term of type σ ~> τ to a term of type σ, giving a τ.
--   Lam turns a body of type τ, typed in the context extended with σ,
--       into a term of type σ ~> τ.
--   Var turns a reference into the context into a term of the referenced type.
data Term : Ctx -> Type -> U where
  App : {Γ : Ctx} {σ τ : Type} -> Term Γ (σ ~> τ) -> Term Γ σ -> Term Γ τ
  Lam : {Γ : Ctx} {σ τ : Type} -> Term (σ :: Γ) τ -> Term Γ (σ ~> τ)
  -- The arrow between the argument and the result was missing here, which
  -- made the signature read as Ref applied to five arguments.
  Var : {Γ : Ctx} {σ : Type} -> Ref σ Γ -> Term Γ σ
-- FIXME, I'm not getting an error for Nil, but it's shadowing Nil
-- Environments indexed by a context.
--   ENil is the environment for the empty context.
--   ECons adds a value of type Val σ in front of an environment for Γ,
--         giving an environment for σ :: Γ.
data Env : Ctx -> U where
  ENil : Env Nil
  -- The arrows separating the implicit binders, the two arguments and the
  -- result were missing; without them the signature does not parse as a
  -- function type.
  ECons : {Γ : Ctx} {σ : Type} -> Val σ -> Env Γ -> Env (σ :: Γ)

View File

@@ -17,6 +17,7 @@ data Either : U -> U -> U where
infixr 0 _$_
-- Currently very noisy in generated code
_$_ : {a b : U} -> (a -> b) -> a -> b
f $ a = f a

52
src/Lib/Common.idr Normal file
View File

@@ -0,0 +1,52 @@
module Lib.Common
import Data.String
-- I was going to use a record, but we're peeling this off of bounds at the moment.
||| Source position as a pair of Ints. showError destructures it as
||| (line, col), so the first component is the line and the second the column.
public export
FC : Type
FC = (Int,Int)
||| Interface for values from which a source position (FC) can be read.
public export
interface HasFC a where
getFC : a -> FC
%name FC fc
||| The position (0,0).
||| NOTE(review): presumably used where no real position is available - confirm against callers.
export
emptyFC : FC
emptyFC = (0,0)
-- Error of a parse
||| An error value: the source position it refers to and a message.
public export
data Error = E FC String
%name Error err
||| Render an error against the source text it came from.
|||
||| The result starts with an "ERROR at (line, col): msg" header. It is followed
||| by the source lines whose index is greater than `line - 3`, up to and
||| including the error line, and then a line with a `^` placed `col`
||| characters in. Lines are counted from 0. If the source has no line with
||| index `line`, no caret line is produced.
|||
||| @ src the complete source text
public export
showError : String -> Error -> String
showError src (E (line, col) msg) =
  "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src)
  where
    -- Walk the source lines, carrying the index of the current line.
    go : Int -> List String -> String
    go _ [] = ""
    go n (txt :: rest) =
      if n == line
        -- The error line itself: print it with the caret underneath and stop.
        then " \{txt}\n \{replicate (cast col) ' '}^\n"
        else let more = go (n + 1) rest
             in if line - 3 < n then " " ++ txt ++ "\n" ++ more else more
||| The three kinds of operator fixity declaration. The Show instance renders
||| them as "infixl", "infixr" and "infix" respectively.
public export
data Fixity = InfixL | InfixR | Infix
||| Shows a fixity as the keyword that declares it.
export
Show Fixity where
  show fixity = case fixity of
    InfixL => "infixl"
    InfixR => "infixr"
    Infix => "infix"
||| Definition of an operator: its name, an Int `prec` and its fixity.
||| NOTE(review): `prec` is presumably the precedence level given in the
||| infixl/infixr/infix declaration - confirm against the parser.
public export
record OpDef where
constructor MkOp
name : String
prec : Int
fix : Fixity

View File

@@ -95,7 +95,6 @@ pArg = do
(Explicit,fc,) <$> atom
<|> (Implicit,fc,) <$> braces typeExpr
<|> (Auto,fc,) <$> dbraces typeExpr
<|> (Explicit,fc,) . RVar fc <$> token Oper
AppSpine = List (Icit,FC,Raw)
@@ -203,13 +202,6 @@ caseExpr = do
alts <- startBlock $ someSame $ caseAlt
pure $ RCase fc sc alts
doArrow : Parser DoStmt
doArrow = do
fc <- getPos
name <- try $ ident <* keyword "<-"
expr <- term
pure $ DoArrow fc name expr
doStmt : Parser DoStmt
doStmt
= DoArrow <$> getPos <*> (try $ ident <* keyword "<-") <*> term
@@ -232,9 +224,8 @@ varname = (ident <|> uident <|> keyword "_" *> pure "_")
ebind : Parser (List (FC, String, Icit, Raw))
ebind = do
sym "("
names <- some $ withPos varname
sym ":"
-- don't commit until we see the ":"
names <- try (sym "(" *> some (withPos varname) <* sym ":")
ty <- typeExpr
sym ")"
pure $ map (\(pos, name) => (pos, name, Explicit, ty)) names
@@ -262,7 +253,7 @@ arrow = sym "->" <|> sym "→"
-- Collect a bunch of binders (A : U) {y : A} -> ...
binders : Parser Raw
binders = do
binds <- many (abind <|> ibind <|> try ebind)
binds <- many (abind <|> ibind <|> ebind)
arrow
scope <- typeExpr
pure $ foldr (uncurry mkBind) scope (join binds)
@@ -286,7 +277,7 @@ typeExpr = binders
export
parseSig : Parser Decl
parseSig = TypeSig <$> getPos <*> some (ident <|> uident) <* keyword ":" <*> typeExpr
parseSig = TypeSig <$> getPos <*> try (some (ident <|> uident) <* keyword ":") <*> typeExpr
parseImport : Parser Import
parseImport = MkImport <$> getPos <* keyword "import" <*> uident
@@ -364,7 +355,7 @@ parseNorm = DCheck <$> getPos <* keyword "#check" <*> typeExpr <* keyword ":" <*
export
parseDecl : Parser Decl
parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> (try $ parseSig) <|> parseDef
parseDecl = parseMixfix <|> parsePType <|> parsePFunc <|> parseNorm <|> parseData <|> parseSig <|> parseDef
export

View File

@@ -1,6 +1,7 @@
module Lib.Parser.Impl
import Lib.Token
import Lib.Common
import Data.String
import Data.Nat
@@ -8,54 +9,6 @@ public export
TokenList : Type
TokenList = List BTok
public export
data Fixity = InfixL | InfixR | Infix
export
Show Fixity where
show InfixL = "infixl"
show InfixR = "infixr"
show Infix = "infix"
-- I was going to use a record, but we're peeling this off of bounds at the moment.
public export
FC : Type
FC = (Int,Int)
public export
interface HasFC a where
getFC : a -> FC
%name FC fc
export
emptyFC : FC
emptyFC = (0,0)
-- Error of a parse
public export
data Error = E FC String
%name Error err
public export
showError : String -> Error -> String
showError src (E (line, col) msg) = "ERROR at \{show (line,col)}: \{msg}\n" ++ go 0 (lines src)
where
go : Int -> List String -> String
go l [] = ""
go l (x :: xs) =
if l == line then
" \{x}\n \{replicate (cast col) ' '}^\n"
else if line - 3 < l then " " ++ x ++ "\n" ++ go (l + 1) xs
else go (l + 1) xs
public export
record OpDef where
constructor MkOp
name : String
prec : Int
fix : Fixity
-- Result of a parse
public export
data Result : Type -> Type where

View File

@@ -9,7 +9,6 @@ data Kind
= Ident
| UIdent
| Keyword
| Oper
| MixFix
| Number
| Character
@@ -29,7 +28,6 @@ Show Kind where
show Ident = "Ident"
show UIdent = "UIdent"
show Keyword = "Keyword"
show Oper = "Oper"
show MixFix = "MixFix"
show Number = "Number"
show Character = "Character"
@@ -47,7 +45,6 @@ Eq Kind where
Ident == Ident = True
UIdent == UIdent = True
Keyword == Keyword = True
Oper == Oper = True
MixFix == MixFix = True
Number == Number = True
Character == Character = True

View File

@@ -3,10 +3,12 @@ module Lib.Tokenizer
import Text.Lexer
import Text.Lexer.Tokenizer
import Lib.Token
import Lib.Common
keywords : List String
keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
"ptype", "pfunc", "module", "infixl", "infixr", "infix"]
"ptype", "pfunc", "module", "infixl", "infixr", "infix",
"->", "→", ":", "=>", ":=", "=", "<-", "\\", "_"]
specialOps : List String
specialOps = ["->", ":", "=>", ":=", "=", "<-"]
@@ -17,18 +19,12 @@ checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s
-- Tag a lexeme: strings listed in `keywords` become Keyword tokens,
-- anything else becomes a UIdent token.
checkUKW : String -> Token Kind
checkUKW s =
  let kind = if elem s keywords then Keyword else UIdent
  in Tok kind s
checkOp : String -> Token Kind
checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s
isOpChar : Char -> Bool
isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")
opChar : Lexer
opChar = pred isOpChar
-- One character that may follow the first character of an identifier:
-- an alphanumeric, a dot, a single quote or an underscore.
identMore : Lexer
identMore = alphaNum <|> oneOf ".'_"
-- Recognises exactly one of the characters ( ) \ { } [ ] and ,
singleton : Lexer
singleton = oneOf "()\\{}[],"
-- Recognises a single double-quote character.
quo : Recognise True
quo = is '"'
@@ -52,25 +48,34 @@ opMiddle = pred (\c => not (isSpace c || c == '_'))
rawTokens : Tokenizer (Token Kind)
rawTokens
= match (lower <+> many identMore) checkKW
<|> match (upper <+> many identMore) checkUKW
<|> match (some digit) (Tok Number)
<|> match (is '#' <+> many alpha) (Tok Pragma)
<|> match charLit (Tok Character)
= match spaces (Tok Space)
-- { is singleton except for {{
<|> match (exact "{{" <|> exact "}}") (Tok Keyword)
-- need to make this an ident
<|> match (exact ",") (checkKW)
-- for now, our lambda slash is singleton
<|> match (singleton) (Tok Symbol)
-- TODO Drop MixFix token type when we support if_then_else_
<|> match (exact "_" <+> (some opMiddle) <+> exact "_") (Tok MixFix)
<|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
-- REVIEW - expect non-alpha after?
<|> match (some digit) (Tok Number)
-- for module names and maybe type constructors
<|> match (charLit) (Tok Character)
<|> match (is '#' <+> many alpha) (Tok Pragma)
<|> match (lineComment (exact "--")) (Tok Space)
<|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
<|> match (exact ",") (Tok Oper)
<|> match (some opChar) checkOp
<|> match (exact "{{" <|> exact "}}") (Tok Keyword)
<|> match symbol (Tok Symbol)
<|> match spaces (Tok Space)
<|> match (upper <+> many identMore) checkUKW
<|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
-- accept almost everything, but
<|> match (some (non (space <|> singleton))) checkKW
-- True for every token except those of kind Space.
notSpace : WithBounds (Token Kind) -> Bool
notSpace btok = case btok of
  MkBounded (Tok Space _) _ _ => False
  _ => True
export
tokenise : String -> List BTok
tokenise = filter notSpace . fst . lex rawTokens
-- Lex a source string with rawTokens.
-- When the lexer stops with EndInput, the tokens are returned with Space
-- tokens filtered out. For any other stop reason the result is an Error
-- carrying the (line, column) reported by the lexer and the shown reason.
tokenise : String -> Either Error (List BTok)
tokenise s = case lex rawTokens s of
  (toks, EndInput, _, _, _) => Right (filter notSpace toks)
  (_, reason, l, c, _) => Left (E (l, c) (show reason))

View File

@@ -1,7 +1,7 @@
module Lib.Types
-- For FC, Error
import public Lib.Parser.Impl
import public Lib.Common
import Lib.Prettier
import public Control.Monad.Error.Either
@@ -433,7 +433,7 @@ names ctx = toList $ map fst ctx.types
public export
M : Type -> Type
M = (StateT TopContext (EitherT Impl.Error IO))
M = (StateT TopContext (EitherT Error IO))
||| Force argument and print if verbose is true
export

View File

@@ -11,7 +11,7 @@ import Data.IORef
-- import Lib.Elab
import Lib.Compile
import Lib.Parser
-- import Lib.Parser.Impl
import Lib.Parser.Impl
import Lib.Prettier
import Lib.ProcessDecl
import Lib.Token
@@ -60,7 +60,8 @@ processModule base stk name = do
let fn = base ++ "/" ++ name ++ ".newt"
Right src <- readFile $ fn
| Left err => fail (show err)
let toks = tokenise src
let Right toks = tokenise src
| Left err => fail (showError src err)
let Right (modName, ops, toks) := partialParse parseModHeader top.ops toks
| Left err => fail (showError src err)