From 39deff14659932084a7a88489feb7559b7dc4163 Mon Sep 17 00:00:00 2001 From: Steve Dunham Date: Thu, 8 Sep 2022 22:45:07 -0700 Subject: [PATCH] chkpt --- newt.ipkg | 47 ++++++++++++++ src/Lib/Grammar.idr | 45 +++++++++++++ src/Lib/Layout.idr | 48 ++++++++++++++ src/Lib/Lexer2.idr | 56 ++++++++++++++++ src/Lib/Parser.idr | 81 +++++++++++++++++++++++ src/Lib/Parser/Impl.idr | 139 ++++++++++++++++++++++++++++++++++++++++ src/Lib/Test.idr | 23 +++++++ src/Lib/Tok2.idr | 55 ++++++++++++++++ src/Lib/Token.idr | 55 ++++++++++++++++ src/Lib/Tokenizer.idr | 54 ++++++++++++++++ src/Lib/Tokenizer2.idr | 78 ++++++++++++++++++++++ src/Main.idr | 28 ++++++++ src/Syntax.idr | 58 +++++++++++++++++ 13 files changed, 767 insertions(+) create mode 100644 newt.ipkg create mode 100644 src/Lib/Grammar.idr create mode 100644 src/Lib/Layout.idr create mode 100644 src/Lib/Lexer2.idr create mode 100644 src/Lib/Parser.idr create mode 100644 src/Lib/Parser/Impl.idr create mode 100644 src/Lib/Test.idr create mode 100644 src/Lib/Tok2.idr create mode 100644 src/Lib/Token.idr create mode 100644 src/Lib/Tokenizer.idr create mode 100644 src/Lib/Tokenizer2.idr create mode 100644 src/Main.idr create mode 100644 src/Syntax.idr diff --git a/newt.ipkg b/newt.ipkg new file mode 100644 index 0000000..9837226 --- /dev/null +++ b/newt.ipkg @@ -0,0 +1,47 @@ +package newt +version = 0.1.0 +authors = "Steve Dunham" +-- maintainers = +-- license = +-- brief = +-- readme = +-- homepage = +-- sourceloc = +-- bugtracker = + +-- the Idris2 version required (e.g. langversion >= 0.5.1) +-- langversion + +-- packages to add to search path +depends = contrib + +-- modules to install +-- modules = + +-- main file (i.e. file to load at REPL) +main = Main + +-- name of executable +executable = "newt" +-- opts = +sourcedir = "src" +-- builddir = +-- outputdir = + +-- script to run before building +-- prebuild = + +-- script to run after building +-- postbuild = + +-- script to run after building, before installing +-- preinstall = + +-- script to run after installing +-- postinstall = + +-- script to run before cleaning +-- preclean = + +-- script to run after cleaning +-- postclean = diff --git a/src/Lib/Grammar.idr b/src/Lib/Grammar.idr new file mode 100644 index 0000000..d1a5b8d --- /dev/null +++ b/src/Lib/Grammar.idr @@ -0,0 +1,45 @@ +module Lib.Grammar + +-- Derived from Parser/Core.idr - revisit later when we want to be stack safe +-- For simplicity (for now), it drops consumed flag from the type, which I think was +-- there for totality checking. May want it back someday. + +data Grammar : (state : Type) -> (tok : Type) -> Type -> Type where + EOF : Grammar state tok () + Empty : a -> Grammar state tok a + Fail : String -> Grammar state tok a + Commit : Grammar state tok () + Alt : Grammar state tok ty -> Lazy (Grammar state tok ty) -> Grammar state tok ty + Seq : Grammar state tok a -> (a -> Grammar state tok b) -> Grammar state tok b + +Functor (Grammar state tok) where + map f (Fail str) = Fail str + map f (Alt x y) = Alt (map f x) (map f y) + map f (Seq x g) = ?todo_4 + map f (Empty v) = Empty (f v) + -- Empty / EOF + map f p = Seq p (Empty . f) + +Applicative (Grammar state tok) where + pure a = Empty a + x <*> y = ?hole3 + +Alternative (Grammar state tok) where + empty = ?hole2 + (<|>) = Alt + +Monad (Grammar state tok) where + (>>=) = Seq + +ParseError : Type + +data ParseResult : Type -> Type -> Type -> Type where + Failure : (commit : Bool) -> List ParseError -> ParseResult state tok ty + +doParse : Semigroup state => + state -> + (commit : Bool) -> + Grammar state tok ty -> + List tok -> + ParseResult state tok ty +doParse = ?todo diff --git a/src/Lib/Layout.idr b/src/Lib/Layout.idr new file mode 100644 index 0000000..ad88f56 --- /dev/null +++ b/src/Lib/Layout.idr @@ -0,0 +1,48 @@ +module Lib.Layout + +import Lib.Token +import Text.Lexer + +-- Dunno if I'll do layout or pass col, but here it is: + +intro : List String +intro = [ "where", "let", "of" ] + +Position : Type +Position = (Int,Int) + +isIntro : BTok -> Bool +isIntro t = elem (value t) intro + +export +layout : List BTok -> List BTok +layout = go [] [] + where + lbrace = irrelevantBounds (Tok LBrace "{") + semi = irrelevantBounds (Tok Semi ";") + rbrace = irrelevantBounds (Tok RBrace "}") + + -- go' does start of block + go' : List BTok -> List Position -> List BTok -> List BTok + + -- go does semi and end of block, we require "in" to be on separate, outdented line if not the same line as let + go : List BTok -> List Position -> List BTok -> List BTok + go acc lvls [] = reverse acc + go acc [] ts = go' acc [] ts + go acc lvls@(lvl::rest) toks@(t :: ts) = + let (sr,sc) = start t in + if snd lvl > sc then go (rbrace :: acc) rest toks + else if snd lvl == sc then go' (semi :: acc) lvls toks + else if value t == "in" && sr == fst lvl then go' (rbrace :: acc) rest toks + else go' acc lvls toks + + -- handle start of block + go' acc lvls [] = reverse acc + go' acc lvls (t::ts) = case isIntro t of + False => go (t::acc) lvls ts + True => case ts of + t'::ts' => go (t' :: lbrace :: t :: acc) (start t' :: lvls) ts' + [] => go (rbrace :: lbrace :: t :: acc) lvls ts + + + \ No newline at end of file diff --git a/src/Lib/Lexer2.idr b/src/Lib/Lexer2.idr new file mode 100644 index 0000000..0d8f96b --- /dev/null +++ b/src/Lib/Lexer2.idr @@ -0,0 +1,56 @@ +module Lib.Lexer2 + +data UToken + = Ident String + | Let + | In + | Data + | Where + | Forall + | Case + | Of + | Underscore + | Operator String + | Equals + | Dot + | Colon + | Pipe + | RightArrow + | FatArrow + | Number Int + | Lambda + | LeftParen + | RightParen + | LeftBrace + | RightBrace + | Error + +-- we can skip this if we keep the text... + +Show UToken where + show = \case + Ident name => name + Let => "let" + In => "in" + Data => "data" + Where => "where" + Forall => "forall" + Case => "case" + Of => "of" + Underscore => "_" + Operator name => name + Equals => ?rhs_10 + Dot => ?rhs_11 + Colon => ?rhs_12 + Pipe => ?rhs_13 + RightArrow => ?rhs_14 + FatArrow => ?rhs_15 + Number i => show i + Lambda => ?rhs_17 + LeftParen => ?rhs_18 + RightParen => ?rhs_19 + LeftBrace => ?rhs_20 + RightBrace => ?rhs_21 + Error => ?rhs_22 + + diff --git a/src/Lib/Parser.idr b/src/Lib/Parser.idr new file mode 100644 index 0000000..53055ea --- /dev/null +++ b/src/Lib/Parser.idr @@ -0,0 +1,81 @@ +module Lib.Parser + + +import Lib.Token +import Lib.Parser.Impl +import Syntax + +-- There is the whole core vs surface thing here. +-- might be best to do core first/ Technically don't +-- _need_ a parser, but would be useful for testing. + +-- look to pi-forall / ezoo, but I think we start with a +-- TTImpl level grammar, then add a more sugared layer above +-- so holes and all that + +-- After the parser runs, see below, take a break and finish pi-forall +-- exercises. There is some fill in the parser stuff that may show +-- the future. + +ident = token Ident + +term : Parser Term + +letExpr : Parser Term +letExpr = Let <$ keyword "let" <*> ident <* keyword "=" <*> term <* keyword "in" <*> term + +pattern : Parser Pattern + +lamExpr : Parser Term +lamExpr = do + keyword "\\" + commit + name <- pattern + keyword "." + scope <- term + pure $ Lam name scope + +caseAlt : Parser CaseAlt + +caseExpr : Parser Term +caseExpr = do + _ <- keyword "case" + commit + sc <- term + keyword "of" + alts <- startBlock $ someSame $ caseAlt + pure $ Case sc alts + + + +-- TODO - get this up and running with a case expr to test it + + + +{- +so lets say we wanted to do the indent, the hard way. + +what does this look like + +We want to say: + kw 'let' $ indentedSome assignment + +now assignment is going to be whatever, but we need to make sure the initial token can/must be at +current indentation level. + +Ok idris indent is not Col, it's AnyIndent | AtPos Int | AfterPos Int + +The col though is somehow sixty. + +sixten is sameCol, dropAnchor, with an "environment" + + +sixty is Position is line and col and either line matches or col < tokenCol. + +that's maybe doable.. + + + + + -} + diff --git a/src/Lib/Parser/Impl.idr b/src/Lib/Parser/Impl.idr new file mode 100644 index 0000000..13f0189 --- /dev/null +++ b/src/Lib/Parser/Impl.idr @@ -0,0 +1,139 @@ +module Lib.Parser.Impl + +import Lib.Token + +public export +TokenList : Type +TokenList = List BTok + +-- Error of a parse +public export +data Error = E String +%name Error err + +-- Result of a parse +public export +data Result : Type -> Type where + OK : a -> (toks : TokenList) -> (com : Bool) -> Result a + Fail : Error -> (toks : TokenList) -> (com : Bool) -> Result a + +export +Functor Result where + map f (OK a toks com ) = OK (f a) toks com + map _ (Fail err toks com) = Fail err toks com + +-- So sixty just has a newtype function here now (probably for perf). +-- A record might be more ergonomic, but would require a record impl before +-- self hosting. + +-- We keep the line and column for indents. The idea being that we check +-- either the col < tokCol or line == tokLine, enabling sameLevel. + +-- dunno why I'm making that a pair.. +export +data Parser a = P (TokenList -> Bool -> (lc : (Int,Int)) -> Result a) + +runP : Parser a -> TokenList -> Bool -> (Int,Int) -> Result a +runP (P f) = f + +fail : String -> Parser a +fail msg = P $ \toks,com,col => Fail (E msg) toks com + +export +Functor Parser where + map f (P pa) = P $ \ toks, com, col => map f (pa toks com col) + +export +Applicative Parser where + pure pa = P (\ toks, com, col => OK pa toks com) + P pab <*> P pa = P $ \toks,com,col => + case pab toks com col of + Fail err toks com => Fail err toks com + OK f toks com => + case pa toks com col of + (OK x toks com) => OK (f x) toks com + (Fail err toks com) => Fail err toks com + +export +Alternative Parser where + empty = fail "empty" + (P pa) <|> (P pb) = P $ \toks,com,col => + case pa toks com col of + f@(Fail _ _ com') => if com' then f else pb toks com col + t => t + +export +Monad Parser where + P pa >>= pab = P $ \toks,com,col => + case pa toks com col of + (OK a toks com) => runP (pab a) toks com col + (Fail err xs x) => ?rhs_1 + + +-- do we need this? +pred : (BTok -> Bool) -> String -> Parser String +pred f msg = P $ \toks,com,col => + case toks of + (t :: ts) => if f t then OK (value t) ts com else Fail (E msg) toks com + [] => Fail (E "eof") toks com + +export +commit : Parser () +commit = P $ \toks,com,col => OK () toks True + +export +token : Kind -> Parser String +token k = pred (\t => t.val.kind == k) "Expected a \{show k}" + + +export +keyword : String -> Parser () +keyword kw = ignore $ pred (\t => t.val.text == kw) "Expected \{kw}" + + +many : Parser a -> Parser (List a) +some : Parser a -> Parser (List a) +some p = (::) <$> p <*> many p +many p = some p <|> pure [] + +-- sixty let has some weird CPS stuff, but essentially: + +-- case token_ of +-- Lexer.LLet -> PLet <$> blockOfMany let_ <* token Lexer.In <*> term + +-- withIndentationBlock - sets the col + +||| Start an indented block and run parser in it +export +startBlock : Parser a -> Parser a +startBlock (P p) = P $ \toks,com,(l,c) => case toks of + [] => p toks com (l,c) + (t :: ts) => + let (tl,tc) = start t + in p toks com (tl,tc) + +||| Assert that parser starts at our current column by +||| checking column and then updating line to match the current +export +sameLevel : Parser a -> Parser a +sameLevel (P p) = P $ \toks,com,(l,c) => case toks of + [] => p toks com (l,c) + (t :: ts) => + let (tl,tc) = start t + in if tc == c then p toks com (tl, c) + else if c < tc then Fail (E "unexpected indent") toks com + else Fail (E "unexpected indent") toks com + +export +someSame : Parser a -> Parser (List a) +someSame = some . sameLevel + +||| requires a token to be indented? +indented : Parser a -> Parser a +indented (P p) = P $ \toks,com,(l,c) => case toks of + [] => p toks com (l,c) + (t::ts) => + let (tl,tc) = start t + in if tc > c || tl == l then p toks com (l,c) + else Fail (E "unexpected outdent") toks com + diff --git a/src/Lib/Test.idr b/src/Lib/Test.idr new file mode 100644 index 0000000..a2bbdf0 --- /dev/null +++ b/src/Lib/Test.idr @@ -0,0 +1,23 @@ +import Text.Lexer + +data Kind = Ignore | Ident | Oper | Number + +ignore : WithBounds (Token Kind) -> Bool +ignore (MkBounded (Tok Ignore _) _ _) = True +ignore _ = False + +tmap : TokenMap (Token Kind) +tmap = [ + (alpha <+> many alphaNum, Tok Ident), + (some digit, Tok Number), + (some symbol, Tok Oper), + (spaces, Tok Ignore) +] + +show : WithBounds (Token Kind) -> String +show t@(MkBounded (Tok _ v) _ _) = "\{show $ start t} \{show v}" + +main : IO () +main = do + let toks = filter (not . ignore) $ fst $ lex tmap "let x = 1\n y = 2\n in x + y" + traverse_ (putStrLn . show) toks diff --git a/src/Lib/Tok2.idr b/src/Lib/Tok2.idr new file mode 100644 index 0000000..fc673fe --- /dev/null +++ b/src/Lib/Tok2.idr @@ -0,0 +1,55 @@ +module Lib.Tok2 + +import Text.Bounded + +data Kind + = Ident + | Keyword + | Oper + | Number + | Symbol + | Arrow + | Ignore + | EOF + +data Token = Tok Kind String + +Show Kind where + show Ident = "Ident" + show Keyword = "Keyword" + show Oper = "Oper" + show Number = "Number" + show Symbol = "Symbol" + show Arrow = "Arrow" + show Ignore = "Ignore" + show EOF = "EOF" + +Show Token where show (Tok k v) = "<\{show k}:\{show v}>" + +keywords : List String +keywords = ["let", "in", "where", "case"] + +specialOps : List String +specialOps = ["->", ":"] + +checkKW : String -> Token +checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s + +opkind : String -> Kind +opkind "->" = Arrow +opkind _ = Oper + +isOpChar : Char -> Bool +isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~") + +nextToken : Int -> Int -> List Char -> Maybe (Token, (Int, Int, List Char)) +nextToken row col xs = case xs of + [] => Nothing + (' ' :: cs) => nextToken row (col + 1) cs + (c :: cs) => + if isDigit c then getTok (Tok Number) isDigit [c] cs + else if isOpChar c then getTok else ?Foo + where + getTok : (String -> Token) -> (Char -> Bool) -> List Char -> List Char -> Maybe (Token, (Int, Int, List Char)) + + diff --git a/src/Lib/Token.idr b/src/Lib/Token.idr new file mode 100644 index 0000000..33a3e4b --- /dev/null +++ b/src/Lib/Token.idr @@ -0,0 +1,55 @@ +module Lib.Token + +import public Text.Lexer + +public export +data Kind + = Ident + | Keyword + | Oper + | Number + | Symbol + | Arrow + | Space + | LBrace + | Semi + | RBrace + +export +Show Kind where + show Ident = "Ident" + show Keyword = "Keyword" + show Oper = "Oper" + show Number = "Number" + show Symbol = "Symbol" + show Arrow = "Arrow" + show Space = "Space" + show LBrace = "LBrace" + show Semi = "Semi" + show RBrace = "RBrace" + +export +Eq Kind where + Ident == Ident = True + Keyword == Keyword = True + Oper == Oper = True + Number == Number = True + Symbol == Symbol = True + Arrow == Arrow = True + Space == Space = True + LBrace == LBrace = True + Semi == Semi = True + RBrace == RBrace = True + _ == _ = False + +export +Show k => Show (Token k) where + show (Tok k v) = "<\{show k}:\{show v}>" + +public export +BTok : Type +BTok = WithBounds (Token Kind) + +export +value : BTok -> String +value (MkBounded (Tok _ s) _ _) = s diff --git a/src/Lib/Tokenizer.idr b/src/Lib/Tokenizer.idr new file mode 100644 index 0000000..fed96ea --- /dev/null +++ b/src/Lib/Tokenizer.idr @@ -0,0 +1,54 @@ +module Lib.Tokenizer + +import Text.Lexer +import Text.Lexer.Tokenizer +import Lib.Token + +keywords : List String +keywords = ["let", "in", "where", "case", "of", "data"] + +specialOps : List String +specialOps = ["->", ":"] + +checkKW : String -> Token Kind +checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s + +opkind : String -> Kind +opkind "->" = Arrow +opkind _ = Oper + +isOpChar : Char -> Bool +isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~") + +opChar : Lexer +opChar = pred isOpChar + +-- so Text.Lexer.Core.lex is broken +-- tmap : TokenMap (Token Kind) +-- tmap = [ +-- (alpha <+> many alphaNum, checkKW), +-- (some digit, Tok Number), +-- (some opChar, \s => Tok (opkind s) s), +-- (lineComment (exact "--"), Tok Space), +-- (symbol, Tok Symbol), +-- (spaces, Tok Space) +-- ] + +rawTokens : Tokenizer (Token Kind) +rawTokens + = match (alpha <+> many alphaNum) checkKW + <|> match (some digit) (Tok Number) + <|> match (some opChar) (\s => Tok (opkind s) s) + <|> match (lineComment (exact "--")) (Tok Space) + <|> match symbol (Tok Symbol) + <|> match spaces (Tok Space) + +notSpace : WithBounds (Token Kind) -> Bool +notSpace (MkBounded (Tok Space _) _ _) = False +notSpace _ = True + +export +tokenise : String -> List BTok +tokenise = filter notSpace . fst . lex rawTokens + + diff --git a/src/Lib/Tokenizer2.idr b/src/Lib/Tokenizer2.idr new file mode 100644 index 0000000..f456358 --- /dev/null +++ b/src/Lib/Tokenizer2.idr @@ -0,0 +1,78 @@ +module Lib.Tokenizer2 + +-- Starting to write from-scratch tokenizer to flatten out the data a little. + +data Position = Pos Int Int + +inc : Position -> Position +inc (Pos l c) = Pos l (c + 1) + + +public export +data Kind + = Ident + | Keyword + | Oper + | Comment + | Number + | Symbol + | Arrow + -- virtual stuff deprecated? + | LBrace + | Semi + | RBrace + | EOF + +data Token = Tok Int Int Kind String + +data Lexer + = Match (Char -> Bool) + | Seq Lexer + +keywords : List String +keywords = ["let", "in", "where", "case", "of", "data"] + +specialOps : List String +specialOps = ["->", ":"] + +-- checkKW : String -> Token Kind +-- checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s + +opkind : String -> Kind +opkind "->" = Arrow +opkind _ = Oper + +isOpChar : Char -> Bool +isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~") + +-- opChar : Lexer +-- opChar = pred isOpChar + +token : Int -> Int -> List Char -> (Token, List Char) +token l c cs = case cs of + [] => (Tok l c EOF "",[]) + ('-' :: '-' :: cs) => scan (/= '\n') ['-','-'] l (c+2) cs + _ => ?fixme + + where + scan : (Char -> Bool) -> (List Char) -> Int -> Int -> List Char -> (Token,List Char) + + +-- match could be a Maybe thing and use the maybe Alt impl. +-- Not sure what shape I want here. Maybe a straight up Character Parser and drop tokenization. + +-- rawTokens : Tokenizer (Token Kind) +-- rawTokens +-- = match (alpha <+> many alphaNum) checkKW +-- <|> match (some digit) (Tok Number) +-- <|> match (some opChar) (\s => Tok (opkind s) s) +-- <|> match (lineComment (exact "--")) (Tok Space) +-- <|> match symbol (Tok Symbol) +-- <|> match spaces (Tok Space) + + +-- export +-- tokenise : String -> List BTok +-- tokenise = filter notSpace . fst . lex rawTokens + + diff --git a/src/Main.idr b/src/Main.idr new file mode 100644 index 0000000..05db178 --- /dev/null +++ b/src/Main.idr @@ -0,0 +1,28 @@ +module Main + +import Lib.Tokenizer +import Lib.Layout +import Lib.Token + +src = "let x = 1\n y = 2\n in x + y" + +check : String -> IO () +check src = do + putStrLn "--" + printLn $ src + let toks = tokenise src + + printLn $ toks + let toks2 = layout toks + + printLn $ map value toks2 + +main : IO () +main = do + -- this stuff is working with layout, should I bother with col. + -- downside is that it will be somewhat picky about the indentation of `in` + -- The sixty stuff looks promising. I might want my own tokenizer though. + check "let x = 1\n y = 2\n in x + y" + check "let x = 1 in x + 2" + check "case x of\n True => something\n False => let\n x = 1\n y = 2\n in x + y" + diff --git a/src/Syntax.idr b/src/Syntax.idr new file mode 100644 index 0000000..0e72a40 --- /dev/null +++ b/src/Syntax.idr @@ -0,0 +1,58 @@ +module Syntax + +-- Good enough start, lets parse +-- This is informed by pi-forall and others and is somewhat low level + +Name = String + +data Term : Type where + +TyTerm = Term + +data Literal = LString String | LInt Int | LBool Bool +data RigCount = Rig0 | RigW +data Plicity = Implicit | Explicit | Eq + +public export +data Pattern + = PatVar Name + | PatCon Name (List (Pattern, RigCount)) + | PatWild + | PatLit Literal + +-- could be a pair, but I suspect stuff will be added? +public export +data CaseAlt = MkAlt Pattern Term + +public export +data Term + = Var Name + | Ann Term TyTerm + | Lit Literal + | Let Name Term Term + | Pi Name Plicity Term Term + | App Term Term + | Lam Pattern Term + | Case Term (List CaseAlt) + | Wildcard + | ParseError String + +-- derive some stuff - I'd like json, eq, show, ... + +data Decl : Type where + +Telescope: Type +Telescope = List Decl -- pi-forall, always typeSig? + +data ConstrDef = MkCDef Name Telescope + +data Decl + = TypeSig Name RigCount TyTerm + | Def Name Term + | Data Name Telescope (List ConstrDef) + +record Module where + constructor MkModule + name : Name + imports : List Name + decls : List Decl