From 747ab08dd69e9f681e19e41e33bb1c9ae2dbfe8c Mon Sep 17 00:00:00 2001 From: Steve Dunham Date: Sun, 29 Dec 2024 14:04:22 -0800 Subject: [PATCH] remove external dependency for Token/WithBounds --- src/Lib/Parser/Impl.idr | 2 +- src/Lib/Token.idr | 57 +++++++++----- src/Lib/Tokenizer.idr | 171 ++++++++++++++++++++-------------------- src/Lib/Tokenizer2.idr | 110 -------------------------- src/Main.idr | 3 +- 5 files changed, 123 insertions(+), 220 deletions(-) delete mode 100644 src/Lib/Tokenizer2.idr diff --git a/src/Lib/Parser/Impl.idr b/src/Lib/Parser/Impl.idr index 99a0beb..53842af 100644 --- a/src/Lib/Parser/Impl.idr +++ b/src/Lib/Parser/Impl.idr @@ -44,7 +44,7 @@ runP (P f) = f -- FIXME no filename, also half the time it is pointing at the token after the error error : String -> TokenList -> String -> Error error fn [] msg = E (MkFC fn (0,0)) msg -error fn ((MkBounded val isIrrelevant (MkBounds line col _ _)) :: _) msg = E (MkFC fn (line,col)) msg +error fn ((MkBounded val (MkBounds line col _ _)) :: _) msg = E (MkFC fn (line,col)) msg export parse : String -> Parser a -> TokenList -> Either Error a diff --git a/src/Lib/Token.idr b/src/Lib/Token.idr index 0f763fa..2f7ff85 100644 --- a/src/Lib/Token.idr +++ b/src/Lib/Token.idr @@ -1,8 +1,26 @@ module Lib.Token --- TODO replace this with own lexer +public export +record Bounds where + constructor MkBounds + startLine : Int + startCol : Int + endLine : Int + endCol : Int -import public Text.Lexer +export +Eq Bounds where + (MkBounds sl sc el ec) == (MkBounds sl' sc' el' ec') = + sl == sl' + && sc == sc' + && el == el' + && ec == ec' + +public export +record WithBounds ty where + constructor MkBounded + val : ty + bounds : Bounds public export data Kind @@ -47,34 +65,31 @@ Show Kind where export Eq Kind where - Ident == Ident = True - UIdent == UIdent = True - Keyword == Keyword = True - MixFix == MixFix = True - Number == Number = True - Character == Character = True - Symbol == Symbol = True - Space == Space = True - LBrace == LBrace = True - Semi == Semi = True - RBrace == RBrace = True - StringKind == StringKind = True - JSLit == JSLit = True - Projection == Projection = True - _ == _ = False + a == b = show a == show b + +public export +record Token where + constructor Tok + kind : Kind + text : String + export -Show k => Show (Token k) where +Show Token where show (Tok k v) = "<\{show k}:\{show v}>" public export BTok : Type -BTok = WithBounds (Token Kind) +BTok = WithBounds Token export value : BTok -> String -value (MkBounded (Tok _ s) _ _) = s +value (MkBounded (Tok _ s) _) = s export kind : BTok -> Kind -kind (MkBounded (Tok k s) _ _) = k +kind (MkBounded (Tok k s) _) = k + +export +start : BTok -> (Int, Int) +start (MkBounded _ (MkBounds l c _ _)) = (l,c) diff --git a/src/Lib/Tokenizer.idr b/src/Lib/Tokenizer.idr index 207c096..a33d5b0 100644 --- a/src/Lib/Tokenizer.idr +++ b/src/Lib/Tokenizer.idr @@ -1,9 +1,13 @@ module Lib.Tokenizer -import Text.Lexer -import Text.Lexer.Tokenizer +-- Working alternate tokenizer, saves about 2k, can be translated to newt + import Lib.Token import Lib.Common +import Data.String + +standalone : List Char +standalone = unpack "()\\{}[],.@" keywords : List String keywords = ["let", "in", "where", "case", "of", "data", "U", "do", @@ -11,100 +15,95 @@ keywords = ["let", "in", "where", "case", "of", "data", "U", "do", "∀", "forall", "import", "uses", "class", "instance", "record", "constructor", "if", "then", "else", - "$", "λ", "?", "@", + -- it would be nice to find a way to unkeyword "." so it could be + -- used as an operator too + "$", "λ", "?", "@", ".", "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_", "|"] -checkKW : String -> Token Kind -checkKW s = - if s /= "_" && elem '_' (unpack s) then Tok MixFix s else - if elem s keywords then Tok Keyword s - else Tok Ident s +-- This makes a big case tree... +tokenise' : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok) +tokenise' sl sc toks chars = case chars of + Nil => Right $ toks <>> [] + ' ' :: cs => tokenise' sl (sc + 1) toks cs + '\n' :: cs => tokenise' (sl + 1) 0 toks cs + '{' :: '{' :: cs => tokenise' sl (sc + 2) (toks :< mktok False sl (sc + 2) Keyword "{{" ) cs + '}' :: '}' :: cs => tokenise' sl (sc + 2) (toks :< mktok False sl (sc + 2) Keyword "}}" ) cs + ',' :: cs => tokenise' sl (sc + 1) (toks :< mktok False sl (sc + 1) Ident ",") cs + '_' :: ',' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok False sl (sc + 3) MixFix "_,_") cs + '_' :: '.' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok False sl (sc + 3) MixFix "_._") cs + '\'' :: '\\' :: c :: '\'' :: cs => + let ch = ifThenElse (c == 'n') '\n' c + in tokenise' sl (sc + 4) (toks :< mktok False sl (sc + 4) Character (singleton ch)) cs + '\'' :: c :: '\'' :: cs => tokenise' sl (sc + 3) (toks :< mktok False sl (sc + 3) Character (singleton c)) cs + '#' :: cs => ?do_pragma -- we probably want to require at least one alpha? + '-' :: '-' :: cs => lineComment sl (sc + 2) toks cs + '/' :: '-' :: cs => blockComment sl (sc + 2) toks cs + '`' :: cs => doBacktick sl (sc + 1) toks cs [<] + '"' :: cs => doQuote sl (sc + 1) toks cs [<] + '.' :: cs => doRest sl (sc + 1) toks cs Projection isIdent (Lin :< '.') + '-' :: c :: cs => if isDigit c + then doRest sl (sc + 2) toks cs Number isDigit (Lin :< '-' :< c) + else doRest sl (sc + 1) toks (c :: cs) Ident isIdent (Lin :< '-') -checkUKW : String -> Token Kind -checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s + c :: cs => doChar c cs -identMore : Lexer -identMore = alphaNum <|> exact "'" <|> exact "_" - -singleton : Lexer -singleton = oneOf "()\\{}[],.@" - -quo : Recognise True -quo = is '"' - -esc : Recognise True -> Recognise True -esc l = is '\\' <+> l - --- REVIEW maybe we can do this faster with the view thinger -unquote : String -> String -unquote str = case unpack str of - ('"' :: xs) => pack $ go xs - imp => pack $ go imp where - go : List Char -> List Char - go [] = [] - go ['"'] = [] - go ('\\' :: ('n' :: xs)) = '\n' :: go xs - go ('\\' :: (x :: xs)) = x :: go xs - go (x :: xs) = x :: go xs + isIdent : Char -> Bool + isIdent c = not (isSpace c || elem c standalone) -unquoteChar : String -> String -unquoteChar str = pack $ case unpack str of - ('\'' :: xs) => go xs - imp => go imp -- shouldn't happen - where - go : List Char -> List Char - go [] = ['\''] -- shouldn't happen - go ('\\' :: ('n' :: xs)) = ['\n'] - go ('\\' :: (x :: xs)) = [x] - go (x :: xs) = [x] + isUIdent : Char -> Bool + isUIdent c = isIdent c || c == '.' -opMiddle = pred (\c => not (isSpace c || c == '_')) - -btick = is '`' - -trimJS : String -> String -trimJS str = case unpack str of - ('`' :: xs) => pack $ go xs - bug => pack $ go bug - where - go : List Char -> List Char - go [] = [] - go ['`'] = [] - go (x :: xs) = x :: go xs - -%hide charLit -charLit : Lexer -charLit = is '\'' <+> (is '\\' <+> any <|> any) <+> is '\'' + doBacktick : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok) + doBacktick l c toks Nil acc = Left $ E (MkFC "" (l,c)) "EOF in backtick string" + doBacktick el ec toks ('`' :: cs) acc = + let tok = MkBounded (Tok JSLit (pack $ acc <>> [])) (MkBounds sl sc el ec) in + tokenise' el (ec + 1) (toks :< tok) cs + doBacktick l c toks ('\n' :: cs) acc = doBacktick (l + 1) 0 toks cs (acc :< '\n') + doBacktick l c toks (ch :: cs) acc = doBacktick l (c + 1) toks cs (acc :< ch) -rawTokens : Tokenizer (Token Kind) -rawTokens - = match spaces (Tok Space) - -- { is singleton except for {{ - <|> match (exact "{{" <|> exact "}}") (Tok Keyword) - -- need to make this an ident - <|> match (exact ",") (Tok Ident) - -- for now, our lambda slash is singleton - <|> match (exact "_,_" <|> exact "_._") (Tok MixFix) - <|> match (opt (exact "-") <+> some digit) (Tok Number) - <|> match (charLit) (Tok Character . unquoteChar) - <|> match (is '#' <+> many alpha) (Tok Pragma) - <|> match (lineComment (exact "--")) (Tok Space) - <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space) - <|> match (btick <+> manyUntil btick any <+> btick) (Tok JSLit . trimJS) - <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote) - <|> match (singleton) (Tok Symbol) - <|> match (upper <+> many identMore) checkUKW - <|> match (some (non (space <|> singleton))) checkKW + -- temporary use same token as before + mktok : Bool -> Int -> Int -> Kind -> String -> BTok + mktok checkkw el ec kind text = let kind = if checkkw && elem text keywords then Keyword else kind in + MkBounded (Tok kind text) (MkBounds sl sc el ec) -notSpace : WithBounds (Token Kind) -> Bool -notSpace (MkBounded (Tok Space _) _ _) = False -notSpace _ = True + lineComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok) + lineComment line col toks Nil = tokenise' line col toks Nil + lineComment line col toks ('\n' :: cs) = tokenise' (line + 1) 0 toks cs + lineComment line col toks (c :: cs) = lineComment line (col + 1) toks cs + + blockComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok) + blockComment line col toks Nil = Left $ E (MkFC "" (line, col)) "EOF in block comment" + blockComment line col toks ('-' :: '/' :: cs) = tokenise' line (col + 2) toks cs + blockComment line col toks ('\n' :: cs) = blockComment (line + 1) 0 toks cs + blockComment line col toks (c :: cs) = blockComment line (col + 1) toks cs + + doRest : Int -> Int -> SnocList BTok -> List Char -> Kind -> (Char -> Bool) -> SnocList Char -> Either Error (List BTok) + doRest l c toks Nil kind pred acc = tokenise' l c (toks :< mktok True l c kind (pack $ acc <>> [])) Nil + doRest l c toks (ch :: cs) kind pred acc = if pred ch + then doRest l (c + 1) toks cs kind pred (acc :< ch) + else + let kind = if elem '_' acc then MixFix else kind in + tokenise' l c (toks :< mktok True l (c - 1) kind (pack $ acc <>> [])) (ch :: cs) + + doQuote : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok) + -- should be an error.. + doQuote line col toks Nil acc = ?missing_end_quote + doQuote line col toks ('\\' :: 'n' :: cs) acc = doQuote line (col + 2) toks cs (acc :< '\n') + doQuote line col toks ('\\' :: c :: cs) acc = doQuote line (col + 2) toks cs (acc :< c) + doQuote line col toks ('\n' :: cs) acc = ?error_newline_in_quote + doQuote line col toks ('"' :: cs) acc = tokenise' line (col + 1) (toks :< mktok False line (col + 1) StringKind (pack $ acc <>> [])) cs + doQuote line col toks (c :: cs) acc = doQuote line (col + 1) toks cs (acc :< c) + + doChar : Char -> List Char -> Either Error (List BTok) + doChar c cs = if elem c standalone + then tokenise' sl (sc + 1) (toks :< mktok True sl (sc + 1) Symbol (singleton c)) cs + else let kind = if isDigit c then Number else if isUpper c then UIdent else Ident in + doRest sl sc toks (c :: cs) kind isIdent [<] export tokenise : String -> String -> Either Error (List BTok) -tokenise fn s = case lex rawTokens s of - (toks, EndInput, l, c, what) => Right (filter notSpace toks) - (toks, reason, l, c, what) => Left (E (MkFC fn (l,c)) "\{show reason}") - +tokenise fn text = case tokenise' 0 0 Lin (unpack text) of + Left (E (MkFC file start) str) => Left (E (MkFC fn start) str) + res => res diff --git a/src/Lib/Tokenizer2.idr b/src/Lib/Tokenizer2.idr deleted file mode 100644 index b2e378e..0000000 --- a/src/Lib/Tokenizer2.idr +++ /dev/null @@ -1,110 +0,0 @@ -module Lib.Tokenizer2 - --- Working alternate tokenizer, saves about 2k, can be translated to newt - -import Lib.Token -import Lib.Common -import Data.String - -standalone : List Char -standalone = unpack "()\\{}[],.@" - -keywords : List String -keywords = ["let", "in", "where", "case", "of", "data", "U", "do", - "ptype", "pfunc", "module", "infixl", "infixr", "infix", - "∀", "forall", "import", "uses", - "class", "instance", "record", "constructor", - "if", "then", "else", - -- it would be nice to find a way to unkeyword "." so it could be - -- used as an operator too - "$", "λ", "?", "@", ".", - "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_", "|"] - --- This makes a big case tree... -tokenise' : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok) -tokenise' sl sc toks chars = case chars of - Nil => Right $ toks <>> [] - ' ' :: cs => tokenise' sl (sc + 1) toks cs - '\n' :: cs => tokenise' (sl + 1) 0 toks cs - '{' :: '{' :: cs => tokenise' sl (sc + 2) (toks :< mktok False sl (sc + 2) Keyword "{{" ) cs - '}' :: '}' :: cs => tokenise' sl (sc + 2) (toks :< mktok False sl (sc + 2) Keyword "}}" ) cs - ',' :: cs => tokenise' sl (sc + 1) (toks :< mktok False sl (sc + 1) Ident ",") cs - '_' :: ',' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok False sl (sc + 3) MixFix "_,_") cs - '_' :: '.' :: '_' :: cs => tokenise' sl (sc + 3) (toks :< mktok False sl (sc + 3) MixFix "_._") cs - '\'' :: '\\' :: c :: '\'' :: cs => - let ch = ifThenElse (c == 'n') '\n' c - in tokenise' sl (sc + 4) (toks :< mktok False sl (sc + 4) Character (singleton ch)) cs - '\'' :: c :: '\'' :: cs => tokenise' sl (sc + 3) (toks :< mktok False sl (sc + 3) Character (singleton c)) cs - '#' :: cs => ?do_pragma -- we probably want to require at least one alpha? - '-' :: '-' :: cs => lineComment sl (sc + 2) toks cs - '/' :: '-' :: cs => blockComment sl (sc + 2) toks cs - '`' :: cs => doBacktick sl (sc + 1) toks cs [<] - '"' :: cs => doQuote sl (sc + 1) toks cs [<] - '.' :: cs => doRest sl (sc + 1) toks cs Projection isIdent (Lin :< '.') - '-' :: c :: cs => if isDigit c - then doRest sl (sc + 2) toks cs Number isDigit (Lin :< '-' :< c) - else doRest sl (sc + 1) toks (c :: cs) Ident isIdent (Lin :< '-') - - c :: cs => doChar c cs - - where - isIdent : Char -> Bool - isIdent c = not (isSpace c || elem c standalone) - - isUIdent : Char -> Bool - isUIdent c = isIdent c || c == '.' - - doBacktick : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok) - doBacktick l c toks Nil acc = Left $ E (MkFC "" (l,c)) "EOF in backtick string" - doBacktick el ec toks ('`' :: cs) acc = - let tok = MkBounded (Tok JSLit (pack $ acc <>> [])) False (MkBounds sl sc el ec) in - tokenise' el (ec + 1) (toks :< tok) cs - doBacktick l c toks ('\n' :: cs) acc = doBacktick (l + 1) 0 toks cs (acc :< '\n') - doBacktick l c toks (ch :: cs) acc = doBacktick l (c + 1) toks cs (acc :< ch) - - - -- temporary use same token as before - mktok : Bool -> Int -> Int -> Kind -> String -> BTok - mktok checkkw el ec kind text = let kind = if checkkw && elem text keywords then Keyword else kind in - MkBounded (Tok kind text) False (MkBounds sl sc el ec) - - lineComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok) - lineComment line col toks Nil = tokenise' line col toks Nil - lineComment line col toks ('\n' :: cs) = tokenise' (line + 1) 0 toks cs - lineComment line col toks (c :: cs) = lineComment line (col + 1) toks cs - - blockComment : Int -> Int -> SnocList BTok -> List Char -> Either Error (List BTok) - blockComment line col toks Nil = Left $ E (MkFC "" (line, col)) "EOF in block comment" - blockComment line col toks ('-' :: '/' :: cs) = tokenise' line (col + 2) toks cs - blockComment line col toks ('\n' :: cs) = blockComment (line + 1) 0 toks cs - blockComment line col toks (c :: cs) = blockComment line (col + 1) toks cs - - - doRest : Int -> Int -> SnocList BTok -> List Char -> Kind -> (Char -> Bool) -> SnocList Char -> Either Error (List BTok) - doRest l c toks Nil kind pred acc = tokenise' l c (toks :< mktok True l c kind (pack $ acc <>> [])) Nil - doRest l c toks (ch :: cs) kind pred acc = if pred ch - then doRest l (c + 1) toks cs kind pred (acc :< ch) - else - let kind = if elem '_' acc then MixFix else kind in - tokenise' l c (toks :< mktok True l (c - 1) kind (pack $ acc <>> [])) (ch :: cs) - - doQuote : Int -> Int -> SnocList BTok -> List Char -> SnocList Char -> Either Error (List BTok) - -- should be an error.. - doQuote line col toks Nil acc = ?missing_end_quote - doQuote line col toks ('\\' :: 'n' :: cs) acc = doQuote line (col + 2) toks cs (acc :< '\n') - doQuote line col toks ('\\' :: c :: cs) acc = doQuote line (col + 2) toks cs (acc :< c) - doQuote line col toks ('\n' :: cs) acc = ?error_newline_in_quote - doQuote line col toks ('"' :: cs) acc = tokenise' line (col + 1) (toks :< mktok False line (col + 1) StringKind (pack $ acc <>> [])) cs - doQuote line col toks (c :: cs) acc = doQuote line (col + 1) toks cs (acc :< c) - - doChar : Char -> List Char -> Either Error (List BTok) - doChar c cs = if elem c standalone - then tokenise' sl (sc + 1) (toks :< mktok True sl (sc + 1) Symbol (singleton c)) cs - else let kind = if isDigit c then Number else if isUpper c then UIdent else Ident in - doRest sl sc toks (c :: cs) kind isIdent [<] - -export -tokenise : String -> String -> Either Error (List BTok) -tokenise fn text = case tokenise' 0 0 Lin (unpack text) of - Left (E (MkFC file start) str) => Left (E (MkFC fn start) str) - res => res diff --git a/src/Main.idr b/src/Main.idr index 1a7e651..578e561 100644 --- a/src/Main.idr +++ b/src/Main.idr @@ -9,7 +9,6 @@ import Data.List1 import Data.String import Data.Vect import Data.IORef --- import Lib.Elab import Lib.Common import Lib.Compile import Lib.Parser @@ -18,7 +17,7 @@ import Lib.Parser.Impl import Lib.Prettier import Lib.ProcessDecl import Lib.Token -import Lib.Tokenizer2 +import Lib.Tokenizer import Lib.TopContext import Lib.Types import Lib.Syntax