remove a Text.Parser dependency (about 10%), and alternate tokenizer

2024-12-28 21:49:49 -08:00
parent 23a95f70f5
commit 9a2eac0bd7
6 changed files with 115 additions and 16 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -3,14 +3,13 @@
 More comments in code! This is getting big enough that I need to re-find my bearings when fixing stuff.
- [ ] tokenizer
+- [x] tokenizer that can be ported to newt
 - [ ] string interpolation
 - [ ] editor - indent newline on let with no in
  - I've seen this done in vi for Idris, but it seems non-trivial in vscode.
 - [x] Move on to next decl in case of error
 - [x] for parse error, seek to col 0 token and process next decl
 - [ ] record initialization sugar, e.g. `{ x := 1, y := 2 }`
 - [ ] record update sugar, syntax TBD
  - I think I'm going to hold off on this for now as it requires the type to elaborate. This ends up at the head of an app, which typically is inferred. We'd need a special case somewhere that infers its argument instead.
 - [ ] Change `Ord` to be more like Idris - LT / EQ / GT (and entail equality)
--- a/src/Lib/Elab.idr
+++ b/src/Lib/Elab.idr
@@ -984,7 +984,7 @@ showDef : Context -> List String -> Nat -> Val -> M String
 showDef ctx names n v@(VVar _ n' [<]) =  if n == n' then pure "" else pure "= \{pprint names !(quote ctx.lvl v)}"
 showDef ctx names n v = pure "= \{pprint names !(quote ctx.lvl v)}"
-
+-- desugar do
 undo : FC -> List DoStmt -> M Raw
 undo prev [] = error prev "do block must end in expression"
 undo prev ((DoExpr fc tm) :: Nil) = pure tm
--- a/src/Lib/Tokenizer.idr
+++ b/src/Lib/Tokenizer.idr
@@ -84,22 +84,18 @@ rawTokens
  -- { is singleton except for {{
  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
  -- need to make this an ident
-  <|> match (exact ",") (checkKW)
+  <|> match (exact ",") (Tok Ident)
  -- for now, our lambda slash is singleton
  <|> match (singleton) (Tok Symbol)
  -- TODO Drop MixFix token type when we support if_then_else_
  <|> match (exact "_,_" <|> exact "_._") (Tok MixFix)
  -- REVIEW - expect non-alpha after?
  <|> match (opt (exact "-") <+> some digit) (Tok Number)
  -- for module names and maybe type constructors
  <|> match (charLit) (Tok Character . unquoteChar)
  <|> match (is '#' <+> many alpha) (Tok Pragma)
  <|> match (lineComment (exact "--")) (Tok Space)
  <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
  <|> match (upper <+> many identMore) checkUKW
  <|> match (btick <+> manyUntil btick any <+> btick) (Tok JSLit . trimJS)
  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
-  -- accept almost everything, but
+  <|> match (singleton) (Tok Symbol)
  <|> match (upper <+> many identMore) checkUKW
  <|> match (some (non (space <|> singleton))) checkKW
 notSpace : WithBounds (Token Kind) -> Bool
--- a/src/Lib/Tokenizer2.idr
+++ b/src/Lib/Tokenizer2.idr
@@ -0,0 +1,103 @@
 module Lib.Tokenizer2
 -- Working alternate tokenizer, saves about 2k, can be translated to newt
 import Lib.Token
 import Lib.Common
 import Data.String
 standalone : List Char
 standalone = unpack "()\\{}[],.@"
 keywords : List String
 keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
            "ptype", "pfunc", "module", "infixl", "infixr", "infix",
            "∀", "forall", "import", "uses",
            "class", "instance", "record", "constructor",
            "if", "then", "else",
            "$", "λ", "?", "@",
             "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_", "|"]
 -- This makes a big case tree...
 tokenise' : Int -> Int ->  SnocList BTok -> List Char -> Either Error (List BTok)
 tokenise' sl sc toks chars = case chars of
  Nil => Right $ toks <>> []
  ' ' :: cs => tokenise' sl (sc + 1) toks cs
  '\n' :: cs => tokenise' (sl + 1) 0 toks cs
  '{' :: '{' :: cs => tokenise' sl (sc + 2) (toks :< mktok sl (sc + 2) Keyword "{{" ) cs
  '}' :: '}' :: cs => tokenise' sl (sc + 2) (toks :< mktok sl (sc + 2) Keyword "}}" ) cs
  ',' :: cs => tokenise' sl (sc + 1) (toks :< mktok sl (sc + 1) Ident ",") cs
  '_' :: ',' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) MixFix "_,_") cs
  '_' :: '.' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) MixFix "_._") cs
  '\'' :: c :: '\'' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) Character (singleton c)) cs
  '\'' :: '\\' :: c :: '\'' :: cs =>
    let ch = ifThenElse (c == 'n') '\n' c
    in tokenise' sl (sc + 4) (toks :< mktok sl (sc + 4) Character (singleton ch)) cs
  '#' :: cs => ?do_pragma -- we probably want to require at least one alpha?
  '-' :: '-' :: cs => lineComment sl (sc + 2) toks cs
  '/' :: '-' :: cs => blockComment sl (sc + 2) toks cs
  '`' :: cs => doBacktick sl (sc + 1) toks cs [<]
  '"' :: cs => doQuote sl (sc + 1) toks cs [<]
  '-' :: c :: cs => if isDigit c
    then doRest sl (sc + 2) toks cs Number isDigit (Lin :< '-' :< c)
    else doRest sl (sc + 1) toks (c :: cs) Ident isIdent (Lin :< '-')
  c :: cs => doChar c cs
  where
    isIdent : Char -> Bool
    isIdent c = not (isSpace c || elem c standalone)
    doBacktick : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok)
    doBacktick l c toks Nil acc = Left $ E (MkFC "" (l,c)) "EOF in backtick string"
    doBacktick el ec toks ('`' :: cs) acc =
      let tok = MkBounded (Tok JSLit (pack $ acc <>> [])) False (MkBounds sl sc el ec) in
      tokenise' el (ec + 1) (toks :< tok) cs
    doBacktick l c toks ('\n' :: cs) acc = doBacktick (l + 1) 0 toks cs (acc :< '\n')
    doBacktick l c toks (ch :: cs) acc = doBacktick l (c + 1) toks cs (acc :< ch)
    -- temporary use same token as before
    mktok : Int -> Int -> Kind -> String -> BTok
    mktok el ec kind text = let kind = if elem text keywords then Keyword else kind in
      MkBounded (Tok kind text) False (MkBounds sl sc el ec)
    lineComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok)
    lineComment line col toks Nil = tokenise' line col toks Nil
    lineComment line col toks ('\n' :: cs) = tokenise' (line + 1) 0 toks cs
    lineComment line col toks (c :: cs) = lineComment line (col + 1) toks cs
    blockComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok)
    blockComment line col toks Nil = Left $ E (MkFC "" (line, col)) "EOF in block comment"
    blockComment line col toks ('-' :: '/' :: cs) = tokenise' line (col + 2) toks cs
    blockComment line col toks ('\n' :: cs) = blockComment (line + 1) 0 toks cs
    blockComment line col toks (c :: cs) = blockComment line (col + 1) toks cs
    doRest : Int -> Int -> SnocList BTok -> List Char -> Kind -> (Char -> Bool) -> SnocList Char -> Either Error (List BTok)
    doRest l c toks Nil kind pred acc = tokenise' l c (toks :< mktok l c kind (pack $ acc <>> [])) Nil
    doRest l c toks (ch :: cs) kind pred acc = if pred ch
      then doRest l (c + 1) toks cs kind pred (acc :< ch)
      else
        let kind = if elem '_' acc then MixFix else kind in
        tokenise' l c (toks :< mktok l (c - 1) kind (pack $ acc <>> [])) (ch :: cs)
    doQuote : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok)
    -- should be an error..
    doQuote line col toks Nil acc = ?missing_end_quote
    doQuote line col toks ('\\' :: 'n' :: cs) acc = doQuote line (col + 2) toks cs (acc :< '\n')
    doQuote line col toks ('\\' :: c :: cs) acc = doQuote line (col + 2) toks cs (acc :< c)
    doQuote line col toks ('\n' :: cs) acc = ?error_newline_in_quote
    doQuote line col toks ('"' :: cs) acc = tokenise' line (col + 1) (toks :< mktok line (col + 1) StringKind (pack $ acc <>> [])) cs
    doQuote line col toks (c :: cs) acc = doQuote line (col + 1) toks cs (acc :< c)
    doChar : Char -> List Char ->  Either Error (List BTok)
    doChar c cs = if elem c standalone
      then tokenise' sl (sc + 1) (toks :< mktok sl (sc + 1) Symbol (singleton c)) cs
      else let kind = if isDigit c then Number else if isUpper c then UIdent else Ident in
        doRest sl sc toks (c :: cs) kind isIdent [<]
 export
 tokenise : String -> String -> Either Error (List BTok)
 tokenise fn text = case tokenise' 0 0 Lin (unpack text) of
  Left (E (MkFC file start) str) => Left (E (MkFC fn start) str)
  res => res
--- a/src/Main.idr
+++ b/src/Main.idr
@@ -18,7 +18,7 @@ import Lib.Parser.Impl
 import Lib.Prettier
 import Lib.ProcessDecl
 import Lib.Token
-import Lib.Tokenizer
+import Lib.Tokenizer2
 import Lib.TopContext
 import Lib.Types
 import Lib.Syntax
@@ -152,10 +152,11 @@ processModule importFC base stk name = do
 processFile : String -> M ()
 processFile fn = do
  putStrLn "*** Process \{fn}"
-  let parts = splitPath fn
+  let parts = split (== '/') fn
-  let file = fromMaybe "" $ last' parts
+  let file = last parts
-  let dir = fromMaybe "./" $ parent fn
+  let dirs = init parts
-  let (name,ext) = splitFileName (fromMaybe "" $ last' parts)
+  let dir = if dirs == Nil then "." else joinBy "/" dirs
  let (name,ext) = splitFileName file
  putStrLn "\{show dir} \{show name} \{show ext}"
  -- declare internal primitives
--- a/tests/Zoo4eg.newt
+++ b/tests/Zoo4eg.newt
@@ -80,7 +80,7 @@ hundred : _
 hundred = mul ten ten
 -- Leibniz equality
-Eq : {A: U} -> A -> A -> U
+Eq : {A : U} -> A -> A -> U
 Eq = \{A} x y => (P : A -> U) -> P x -> P y
 refl : {A : U} {x : A} -> Eq x x