remove a Text.Parser dependency (about 10%), and alternate tokenizer

This commit is contained in:
2024-12-28 21:49:49 -08:00
parent 23a95f70f5
commit 9a2eac0bd7
6 changed files with 115 additions and 16 deletions

View File

@@ -3,14 +3,13 @@
More comments in code! This is getting big enough that I need to re-find my bearings when fixing stuff. More comments in code! This is getting big enough that I need to re-find my bearings when fixing stuff.
- [ ] tokenizer - [x] tokenizer that can be ported to newt
- [ ] string interpolation - [ ] string interpolation
- [ ] editor - indent newline on let with no in - [ ] editor - indent newline on let with no in
- I've seen this done in vi for Idris, but it seems non-trivial in vscode. - I've seen this done in vi for Idris, but it seems non-trivial in vscode.
- [x] Move on to next decl in case of error - [x] Move on to next decl in case of error
- [x] for parse error, seek to col 0 token and process next decl - [x] for parse error, seek to col 0 token and process next decl
- [ ] record initialization sugar, e.g. `{ x := 1, y := 2 }`
- [ ] record update sugar, syntax TBD - [ ] record update sugar, syntax TBD
- I think I'm going to hold off on this for now as it requires the type to elaborate. This ends up at the head of an app, which typically is inferred. We'd need a special case somewhere that infers its argument instead. - I think I'm going to hold off on this for now as it requires the type to elaborate. This ends up at the head of an app, which typically is inferred. We'd need a special case somewhere that infers its argument instead.
- [ ] Change `Ord` to be more like Idris - LT / EQ / GT (and entail equality) - [ ] Change `Ord` to be more like Idris - LT / EQ / GT (and entail equality)

View File

@@ -984,7 +984,7 @@ showDef : Context -> List String -> Nat -> Val -> M String
showDef ctx names n v@(VVar _ n' [<]) = if n == n' then pure "" else pure "= \{pprint names !(quote ctx.lvl v)}" showDef ctx names n v@(VVar _ n' [<]) = if n == n' then pure "" else pure "= \{pprint names !(quote ctx.lvl v)}"
showDef ctx names n v = pure "= \{pprint names !(quote ctx.lvl v)}" showDef ctx names n v = pure "= \{pprint names !(quote ctx.lvl v)}"
-- desugar do
undo : FC -> List DoStmt -> M Raw undo : FC -> List DoStmt -> M Raw
undo prev [] = error prev "do block must end in expression" undo prev [] = error prev "do block must end in expression"
undo prev ((DoExpr fc tm) :: Nil) = pure tm undo prev ((DoExpr fc tm) :: Nil) = pure tm

View File

@@ -84,22 +84,18 @@ rawTokens
-- { is singleton except for {{ -- { is singleton except for {{
<|> match (exact "{{" <|> exact "}}") (Tok Keyword) <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
-- need to make this an ident -- need to make this an ident
<|> match (exact ",") (checkKW) <|> match (exact ",") (Tok Ident)
-- for now, our lambda slash is singleton -- for now, our lambda slash is singleton
<|> match (singleton) (Tok Symbol)
-- TODO Drop MixFix token type when we support if_then_else_
<|> match (exact "_,_" <|> exact "_._") (Tok MixFix) <|> match (exact "_,_" <|> exact "_._") (Tok MixFix)
-- REVIEW - expect non-alpha after?
<|> match (opt (exact "-") <+> some digit) (Tok Number) <|> match (opt (exact "-") <+> some digit) (Tok Number)
-- for module names and maybe type constructors
<|> match (charLit) (Tok Character . unquoteChar) <|> match (charLit) (Tok Character . unquoteChar)
<|> match (is '#' <+> many alpha) (Tok Pragma) <|> match (is '#' <+> many alpha) (Tok Pragma)
<|> match (lineComment (exact "--")) (Tok Space) <|> match (lineComment (exact "--")) (Tok Space)
<|> match (blockComment (exact "/-") (exact "-/")) (Tok Space) <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
<|> match (upper <+> many identMore) checkUKW
<|> match (btick <+> manyUntil btick any <+> btick) (Tok JSLit . trimJS) <|> match (btick <+> manyUntil btick any <+> btick) (Tok JSLit . trimJS)
<|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote) <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
-- accept almost everything, but <|> match (singleton) (Tok Symbol)
<|> match (upper <+> many identMore) checkUKW
<|> match (some (non (space <|> singleton))) checkKW <|> match (some (non (space <|> singleton))) checkKW
notSpace : WithBounds (Token Kind) -> Bool notSpace : WithBounds (Token Kind) -> Bool

103
src/Lib/Tokenizer2.idr Normal file
View File

@@ -0,0 +1,103 @@
module Lib.Tokenizer2
-- Working alternate tokenizer, saves about 2k, can be translated to newt
import Lib.Token
import Lib.Common
import Data.String
standalone : List Char
standalone = unpack "()\\{}[],.@"
keywords : List String
keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
"ptype", "pfunc", "module", "infixl", "infixr", "infix",
"", "forall", "import", "uses",
"class", "instance", "record", "constructor",
"if", "then", "else",
"$", "λ", "?", "@",
"->", "", ":", "=>", ":=", "=", "<-", "\\", "_", "|"]
-- This makes a big case tree...
tokenise' : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok)
tokenise' sl sc toks chars = case chars of
Nil => Right $ toks <>> []
' ' :: cs => tokenise' sl (sc + 1) toks cs
'\n' :: cs => tokenise' (sl + 1) 0 toks cs
'{' :: '{' :: cs => tokenise' sl (sc + 2) (toks :< mktok sl (sc + 2) Keyword "{{" ) cs
'}' :: '}' :: cs => tokenise' sl (sc + 2) (toks :< mktok sl (sc + 2) Keyword "}}" ) cs
',' :: cs => tokenise' sl (sc + 1) (toks :< mktok sl (sc + 1) Ident ",") cs
'_' :: ',' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) MixFix "_,_") cs
'_' :: '.' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) MixFix "_._") cs
'\'' :: c :: '\'' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) Character (singleton c)) cs
'\'' :: '\\' :: c :: '\'' :: cs =>
let ch = ifThenElse (c == 'n') '\n' c
in tokenise' sl (sc + 4) (toks :< mktok sl (sc + 4) Character (singleton ch)) cs
'#' :: cs => ?do_pragma -- we probably want to require at least one alpha?
'-' :: '-' :: cs => lineComment sl (sc + 2) toks cs
'/' :: '-' :: cs => blockComment sl (sc + 2) toks cs
'`' :: cs => doBacktick sl (sc + 1) toks cs [<]
'"' :: cs => doQuote sl (sc + 1) toks cs [<]
'-' :: c :: cs => if isDigit c
then doRest sl (sc + 2) toks cs Number isDigit (Lin :< '-' :< c)
else doRest sl (sc + 1) toks (c :: cs) Ident isIdent (Lin :< '-')
c :: cs => doChar c cs
where
isIdent : Char -> Bool
isIdent c = not (isSpace c || elem c standalone)
doBacktick : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok)
doBacktick l c toks Nil acc = Left $ E (MkFC "" (l,c)) "EOF in backtick string"
doBacktick el ec toks ('`' :: cs) acc =
let tok = MkBounded (Tok JSLit (pack $ acc <>> [])) False (MkBounds sl sc el ec) in
tokenise' el (ec + 1) (toks :< tok) cs
doBacktick l c toks ('\n' :: cs) acc = doBacktick (l + 1) 0 toks cs (acc :< '\n')
doBacktick l c toks (ch :: cs) acc = doBacktick l (c + 1) toks cs (acc :< ch)
-- temporary use same token as before
mktok : Int -> Int -> Kind -> String -> BTok
mktok el ec kind text = let kind = if elem text keywords then Keyword else kind in
MkBounded (Tok kind text) False (MkBounds sl sc el ec)
lineComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok)
lineComment line col toks Nil = tokenise' line col toks Nil
lineComment line col toks ('\n' :: cs) = tokenise' (line + 1) 0 toks cs
lineComment line col toks (c :: cs) = lineComment line (col + 1) toks cs
blockComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok)
blockComment line col toks Nil = Left $ E (MkFC "" (line, col)) "EOF in block comment"
blockComment line col toks ('-' :: '/' :: cs) = tokenise' line (col + 2) toks cs
blockComment line col toks ('\n' :: cs) = blockComment (line + 1) 0 toks cs
blockComment line col toks (c :: cs) = blockComment line (col + 1) toks cs
doRest : Int -> Int -> SnocList BTok -> List Char -> Kind -> (Char -> Bool) -> SnocList Char -> Either Error (List BTok)
doRest l c toks Nil kind pred acc = tokenise' l c (toks :< mktok l c kind (pack $ acc <>> [])) Nil
doRest l c toks (ch :: cs) kind pred acc = if pred ch
then doRest l (c + 1) toks cs kind pred (acc :< ch)
else
let kind = if elem '_' acc then MixFix else kind in
tokenise' l c (toks :< mktok l (c - 1) kind (pack $ acc <>> [])) (ch :: cs)
doQuote : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok)
-- should be an error..
doQuote line col toks Nil acc = ?missing_end_quote
doQuote line col toks ('\\' :: 'n' :: cs) acc = doQuote line (col + 2) toks cs (acc :< '\n')
doQuote line col toks ('\\' :: c :: cs) acc = doQuote line (col + 2) toks cs (acc :< c)
doQuote line col toks ('\n' :: cs) acc = ?error_newline_in_quote
doQuote line col toks ('"' :: cs) acc = tokenise' line (col + 1) (toks :< mktok line (col + 1) StringKind (pack $ acc <>> [])) cs
doQuote line col toks (c :: cs) acc = doQuote line (col + 1) toks cs (acc :< c)
doChar : Char -> List Char -> Either Error (List BTok)
doChar c cs = if elem c standalone
then tokenise' sl (sc + 1) (toks :< mktok sl (sc + 1) Symbol (singleton c)) cs
else let kind = if isDigit c then Number else if isUpper c then UIdent else Ident in
doRest sl sc toks (c :: cs) kind isIdent [<]
export
tokenise : String -> String -> Either Error (List BTok)
tokenise fn text = case tokenise' 0 0 Lin (unpack text) of
Left (E (MkFC file start) str) => Left (E (MkFC fn start) str)
res => res

View File

@@ -18,7 +18,7 @@ import Lib.Parser.Impl
import Lib.Prettier import Lib.Prettier
import Lib.ProcessDecl import Lib.ProcessDecl
import Lib.Token import Lib.Token
import Lib.Tokenizer import Lib.Tokenizer2
import Lib.TopContext import Lib.TopContext
import Lib.Types import Lib.Types
import Lib.Syntax import Lib.Syntax
@@ -152,10 +152,11 @@ processModule importFC base stk name = do
processFile : String -> M () processFile : String -> M ()
processFile fn = do processFile fn = do
putStrLn "*** Process \{fn}" putStrLn "*** Process \{fn}"
let parts = splitPath fn let parts = split (== '/') fn
let file = fromMaybe "" $ last' parts let file = last parts
let dir = fromMaybe "./" $ parent fn let dirs = init parts
let (name,ext) = splitFileName (fromMaybe "" $ last' parts) let dir = if dirs == Nil then "." else joinBy "/" dirs
let (name,ext) = splitFileName file
putStrLn "\{show dir} \{show name} \{show ext}" putStrLn "\{show dir} \{show name} \{show ext}"
-- declare internal primitives -- declare internal primitives