remove a Text.Parser dependency (about 10%), and alternate tokenizer

2024-12-28 21:49:49 -08:00
parent 23a95f70f5
commit 9a2eac0bd7
6 changed files with 115 additions and 16 deletions
--- a/src/Lib/Elab.idr
+++ b/src/Lib/Elab.idr
@@ -984,7 +984,7 @@ showDef : Context -> List String -> Nat -> Val -> M String
 showDef ctx names n v@(VVar _ n' [<]) =  if n == n' then pure "" else pure "= \{pprint names !(quote ctx.lvl v)}"
 showDef ctx names n v = pure "= \{pprint names !(quote ctx.lvl v)}"

-
+-- desugar do
 undo : FC -> List DoStmt -> M Raw
 undo prev [] = error prev "do block must end in expression"
 undo prev ((DoExpr fc tm) :: Nil) = pure tm
--- a/src/Lib/Tokenizer.idr
+++ b/src/Lib/Tokenizer.idr
@@ -84,22 +84,18 @@ rawTokens
  -- { is singleton except for {{
  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
  -- need to make this an ident
-  <|> match (exact ",") (checkKW)
+  <|> match (exact ",") (Tok Ident)
  -- for now, our lambda slash is singleton
-  <|> match (singleton) (Tok Symbol)
-  -- TODO Drop MixFix token type when we support if_then_else_
  <|> match (exact "_,_" <|> exact "_._") (Tok MixFix)
-  -- REVIEW - expect non-alpha after?
  <|> match (opt (exact "-") <+> some digit) (Tok Number)
-  -- for module names and maybe type constructors
  <|> match (charLit) (Tok Character . unquoteChar)
  <|> match (is '#' <+> many alpha) (Tok Pragma)
  <|> match (lineComment (exact "--")) (Tok Space)
  <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
-  <|> match (upper <+> many identMore) checkUKW
  <|> match (btick <+> manyUntil btick any <+> btick) (Tok JSLit . trimJS)
  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
-  -- accept almost everything, but
+  <|> match (singleton) (Tok Symbol)
+  <|> match (upper <+> many identMore) checkUKW
  <|> match (some (non (space <|> singleton))) checkKW

 notSpace : WithBounds (Token Kind) -> Bool
--- a/src/Lib/Tokenizer2.idr
+++ b/src/Lib/Tokenizer2.idr
@@ -0,0 +1,103 @@
+module Lib.Tokenizer2
+
+-- Working alternate tokenizer, saves about 2k, can be translated to newt
+
+import Lib.Token
+import Lib.Common
+import Data.String
+
+standalone : List Char
+standalone = unpack "()\\{}[],.@"
+
+keywords : List String
+keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
+            "ptype", "pfunc", "module", "infixl", "infixr", "infix",
+            "∀", "forall", "import", "uses",
+            "class", "instance", "record", "constructor",
+            "if", "then", "else",
+            "$", "λ", "?", "@",
+             "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_", "|"]
+
+-- This makes a big case tree...
+tokenise' : Int -> Int ->  SnocList BTok -> List Char -> Either Error (List BTok)
+tokenise' sl sc toks chars = case chars of
+  Nil => Right $ toks <>> []
+  ' ' :: cs => tokenise' sl (sc + 1) toks cs
+  '\n' :: cs => tokenise' (sl + 1) 0 toks cs
+  '{' :: '{' :: cs => tokenise' sl (sc + 2) (toks :< mktok sl (sc + 2) Keyword "{{" ) cs
+  '}' :: '}' :: cs => tokenise' sl (sc + 2) (toks :< mktok sl (sc + 2) Keyword "}}" ) cs
+  ',' :: cs => tokenise' sl (sc + 1) (toks :< mktok sl (sc + 1) Ident ",") cs
+  '_' :: ',' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) MixFix "_,_") cs
+  '_' :: '.' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) MixFix "_._") cs
+  '\'' :: c :: '\'' :: cs => tokenise' sl (sc + 3) (toks :< mktok sl (sc + 3) Character (singleton c)) cs
+  '\'' :: '\\' :: c :: '\'' :: cs =>
+    let ch = ifThenElse (c == 'n') '\n' c
+    in tokenise' sl (sc + 4) (toks :< mktok sl (sc + 4) Character (singleton ch)) cs
+  '#' :: cs => ?do_pragma -- we probably want to require at least one alpha?
+  '-' :: '-' :: cs => lineComment sl (sc + 2) toks cs
+  '/' :: '-' :: cs => blockComment sl (sc + 2) toks cs
+  '`' :: cs => doBacktick sl (sc + 1) toks cs [<]
+  '"' :: cs => doQuote sl (sc + 1) toks cs [<]
+  '-' :: c :: cs => if isDigit c
+    then doRest sl (sc + 2) toks cs Number isDigit (Lin :< '-' :< c)
+    else doRest sl (sc + 1) toks (c :: cs) Ident isIdent (Lin :< '-')
+
+  c :: cs => doChar c cs
+
+  where
+    isIdent : Char -> Bool
+    isIdent c = not (isSpace c || elem c standalone)
+
+    doBacktick : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok)
+    doBacktick l c toks Nil acc = Left $ E (MkFC "" (l,c)) "EOF in backtick string"
+    doBacktick el ec toks ('`' :: cs) acc =
+      let tok = MkBounded (Tok JSLit (pack $ acc <>> [])) False (MkBounds sl sc el ec) in
+      tokenise' el (ec + 1) (toks :< tok) cs
+    doBacktick l c toks ('\n' :: cs) acc = doBacktick (l + 1) 0 toks cs (acc :< '\n')
+    doBacktick l c toks (ch :: cs) acc = doBacktick l (c + 1) toks cs (acc :< ch)
+
+
+    -- temporary use same token as before
+    mktok : Int -> Int -> Kind -> String -> BTok
+    mktok el ec kind text = let kind = if elem text keywords then Keyword else kind in
+      MkBounded (Tok kind text) False (MkBounds sl sc el ec)
+
+    lineComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok)
+    lineComment line col toks Nil = tokenise' line col toks Nil
+    lineComment line col toks ('\n' :: cs) = tokenise' (line + 1) 0 toks cs
+    lineComment line col toks (c :: cs) = lineComment line (col + 1) toks cs
+
+    blockComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok)
+    blockComment line col toks Nil = Left $ E (MkFC "" (line, col)) "EOF in block comment"
+    blockComment line col toks ('-' :: '/' :: cs) = tokenise' line (col + 2) toks cs
+    blockComment line col toks ('\n' :: cs) = blockComment (line + 1) 0 toks cs
+    blockComment line col toks (c :: cs) = blockComment line (col + 1) toks cs
+
+
+    doRest : Int -> Int -> SnocList BTok -> List Char -> Kind -> (Char -> Bool) -> SnocList Char -> Either Error (List BTok)
+    doRest l c toks Nil kind pred acc = tokenise' l c (toks :< mktok l c kind (pack $ acc <>> [])) Nil
+    doRest l c toks (ch :: cs) kind pred acc = if pred ch
+      then doRest l (c + 1) toks cs kind pred (acc :< ch)
+      else
+        let kind = if elem '_' acc then MixFix else kind in
+        tokenise' l c (toks :< mktok l (c - 1) kind (pack $ acc <>> [])) (ch :: cs)
+
+    doQuote : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok)
+    -- should be an error..
+    doQuote line col toks Nil acc = ?missing_end_quote
+    doQuote line col toks ('\\' :: 'n' :: cs) acc = doQuote line (col + 2) toks cs (acc :< '\n')
+    doQuote line col toks ('\\' :: c :: cs) acc = doQuote line (col + 2) toks cs (acc :< c)
+    doQuote line col toks ('\n' :: cs) acc = ?error_newline_in_quote
+    doQuote line col toks ('"' :: cs) acc = tokenise' line (col + 1) (toks :< mktok line (col + 1) StringKind (pack $ acc <>> [])) cs
+    doQuote line col toks (c :: cs) acc = doQuote line (col + 1) toks cs (acc :< c)
+
+    doChar : Char -> List Char ->  Either Error (List BTok)
+    doChar c cs = if elem c standalone
+      then tokenise' sl (sc + 1) (toks :< mktok sl (sc + 1) Symbol (singleton c)) cs
+      else let kind = if isDigit c then Number else if isUpper c then UIdent else Ident in
+        doRest sl sc toks (c :: cs) kind isIdent [<]
+export
+tokenise : String -> String -> Either Error (List BTok)
+tokenise fn text = case tokenise' 0 0 Lin (unpack text) of
+  Left (E (MkFC file start) str) => Left (E (MkFC fn start) str)
+  res => res