put the port in the port directory
This commit is contained in:
172
port/Lib/Tokenizer.newt
Normal file
172
port/Lib/Tokenizer.newt
Normal file
@@ -0,0 +1,172 @@
|
||||
module Lib.Tokenizer
|
||||
|
||||
-- Working alternate tokenizer, saves about 2k, can be translated to newt
|
||||
|
||||
-- Currently this processes a stream of characters, I may switch it to
|
||||
-- combinators for readability in the future.
|
||||
|
||||
-- Newt is having a rough time dealing with do blocks for Either in here
|
||||
--
|
||||
|
||||
import Lib.Token
|
||||
import Lib.Common
|
||||
import Data.String
|
||||
import Data.SnocList
|
||||
|
||||
standalone : List Char
|
||||
standalone = unpack "()\\{}[],.@"
|
||||
|
||||
keywords : List String
|
||||
keywords = ("let" :: "in" :: "where" :: "case" :: "of" :: "data" :: "U" :: "do" ::
|
||||
"ptype" :: "pfunc" :: "module" :: "infixl" :: "infixr" :: "infix" ::
|
||||
"∀" :: "forall" :: "import" :: "uses" ::
|
||||
"class" :: "instance" :: "record" :: "constructor" ::
|
||||
"if" :: "then" :: "else" ::
|
||||
-- it would be nice to find a way to unkeyword "." so it could be
|
||||
-- used as an operator too
|
||||
"$" :: "λ" :: "?" :: "@" :: "." ::
|
||||
"->" :: "→" :: ":" :: "=>" :: ":=" :: "=" :: "<-" :: "\\" :: "_" :: "|" :: Nil)
|
||||
|
||||
record TState where
|
||||
constructor TS
|
||||
trow : Int
|
||||
tcol : Int
|
||||
acc : SnocList BTok
|
||||
chars : List Char
|
||||
|
||||
singleton : Char → String
|
||||
singleton c = pack $ c :: Nil
|
||||
|
||||
-- This makes a big case tree...
|
||||
rawTokenise : TState -> Either Error TState
|
||||
|
||||
quoteTokenise : TState -> Int -> Int -> SnocList Char -> Either Error TState
|
||||
quoteTokenise ts@(TS el ec toks chars) startl startc acc = case chars of
|
||||
'"' :: cs => Right (TS el ec (toks :< stok) chars)
|
||||
'\\' :: '{' :: cs =>
|
||||
let tok = MkBounded (Tok StartInterp "\\{") (MkBounds el ec el (ec + 2)) in
|
||||
case (rawTokenise $ TS el (ec + 2) (toks :< stok :< tok) cs) of
|
||||
Left err => Left err
|
||||
Right (TS el ec toks chars) => case chars of
|
||||
'}' :: cs =>
|
||||
let tok = MkBounded (Tok EndInterp "}") (MkBounds el ec el (ec + 1))
|
||||
in quoteTokenise (TS el (ec + 1) (toks :< tok) cs) el (ec + 1) Lin
|
||||
cs => Left $ E (MkFC "" (el, ec)) "Expected '{'"
|
||||
-- TODO newline in string should be an error
|
||||
'\n' :: cs => Left $ E (MkFC "" (el, ec)) "Newline in string"
|
||||
'\\' :: 'n' :: cs => quoteTokenise (TS el (ec + 2) toks cs) startl startc (acc :< '\n')
|
||||
'\\' :: c :: cs => quoteTokenise (TS el (ec + 2) toks cs) startl startc (acc :< c)
|
||||
c :: cs => quoteTokenise (TS el (ec + 1) toks cs) startl startc (acc :< c)
|
||||
Nil => Left $ E (MkFC "" (el, ec)) "Expected '\"' at EOF"
|
||||
|
||||
where
|
||||
stok : BTok
|
||||
stok = MkBounded (Tok StringKind (pack $ acc <>> Nil)) (MkBounds startl startc el ec)
|
||||
|
||||
|
||||
|
||||
rawTokenise ts@(TS sl sc toks chars) = case chars of
|
||||
Nil => Right $ ts
|
||||
' ' :: cs => rawTokenise (TS sl (sc + 1) toks cs)
|
||||
'\n' :: cs => rawTokenise (TS (sl + 1) 0 toks cs)
|
||||
|
||||
'"' :: cs =>
|
||||
let tok = mktok False sl (sc + 1) StartQuote "\"" in
|
||||
case quoteTokenise (TS sl (sc + 1) (toks :< tok) cs) sl (sc + 1) Lin of
|
||||
Left err => Left err
|
||||
Right (TS sl sc toks chars) => case chars of
|
||||
'"' :: cs => let tok = mktok False sl (sc + 1) EndQuote "\"" in
|
||||
rawTokenise (TS sl (sc + 1) (toks :< tok) cs)
|
||||
cs => Left $ E (MkFC "" (sl, sc)) "Expected '\"'"
|
||||
|
||||
'{' :: '{' :: cs =>
|
||||
let tok = mktok False sl (sc + 2) Keyword "{{" in
|
||||
case rawTokenise (TS sl (sc + 2) (toks :< tok) cs) of
|
||||
Left err => Left err
|
||||
Right (TS sl sc toks chars) => case chars of
|
||||
'}' :: '}' :: cs => let tok = mktok False sl (sc + 2) Keyword "}}" in
|
||||
rawTokenise (TS sl (sc + 2) (toks :< tok) cs)
|
||||
cs => Left $ E (MkFC "" (sl, sc)) "Expected '}}'"
|
||||
|
||||
'}' :: cs => Right ts
|
||||
'{' :: cs =>
|
||||
let tok = mktok False sl (sc + 1) Symbol "{" in
|
||||
case rawTokenise (TS sl (sc + 1) (toks :< tok) cs) of
|
||||
Left err => Left err
|
||||
Right (TS sl sc toks chars) => case chars of
|
||||
'}' :: cs => let tok = mktok False sl (sc + 1) Symbol "}" in
|
||||
rawTokenise (TS sl (sc + 1) (toks :< tok) cs)
|
||||
cs => Left $ E (MkFC "" (sl, sc)) "Expected '}'"
|
||||
|
||||
',' :: cs => rawTokenise (TS sl (sc + 1) (toks :< mktok False sl (sc + 1) Ident ",") cs)
|
||||
'_' :: ',' :: '_' :: cs => rawTokenise (TS sl (sc + 3) (toks :< mktok False sl (sc + 3) MixFix "_,_") cs)
|
||||
'_' :: '.' :: '_' :: cs => rawTokenise (TS sl (sc + 3) (toks :< mktok False sl (sc + 3) MixFix "_._") cs)
|
||||
'\'' :: '\\' :: c :: '\'' :: cs =>
|
||||
let ch = ite (c == 'n') '\n' c
|
||||
in rawTokenise (TS sl (sc + 4) (toks :< mktok False sl (sc + 4) Character (singleton ch)) cs)
|
||||
'\'' :: c :: '\'' :: cs => rawTokenise (TS sl (sc + 3) (toks :< mktok False sl (sc + 3) Character (singleton c)) cs)
|
||||
'#' :: cs => doRest (TS sl (sc + 1) toks cs) Pragma isIdent (Lin :< '#')
|
||||
'-' :: '-' :: cs => lineComment (TS sl (sc + 2) toks cs)
|
||||
'/' :: '-' :: cs => blockComment (TS sl (sc + 2) toks cs)
|
||||
'`' :: cs => doBacktick (TS sl (sc + 1) toks cs) Lin
|
||||
'.' :: cs => doRest (TS sl (sc + 1) toks cs) Projection isIdent (Lin :< '.')
|
||||
'-' :: c :: cs => if isDigit c
|
||||
then doRest (TS sl (sc + 2) toks cs) Number isDigit (Lin :< '-' :< c)
|
||||
else doRest (TS sl (sc + 1) toks (c :: cs)) Ident isIdent (Lin :< '-')
|
||||
|
||||
c :: cs => doChar c cs
|
||||
|
||||
where
|
||||
isIdent : Char -> Bool
|
||||
isIdent c = not (isSpace c || elem c standalone)
|
||||
|
||||
isUIdent : Char -> Bool
|
||||
isUIdent c = isIdent c || c == '.'
|
||||
|
||||
doBacktick : TState -> SnocList Char -> Either Error TState
|
||||
doBacktick (TS l c toks Nil) acc = Left $ E (MkFC "" (l,c)) "EOF in backtick string"
|
||||
doBacktick (TS el ec toks ('`' :: cs)) acc =
|
||||
let tok = MkBounded (Tok JSLit (pack $ acc <>> Nil)) (MkBounds sl sc el ec) in
|
||||
rawTokenise (TS el (ec + 1) (toks :< tok) cs)
|
||||
doBacktick (TS l c toks ('\n' :: cs)) acc = doBacktick (TS (l + 1) 0 toks cs) (acc :< '\n')
|
||||
doBacktick (TS l c toks (ch :: cs)) acc = doBacktick (TS l (c + 1) toks cs) (acc :< ch)
|
||||
|
||||
|
||||
-- temporary use same token as before
|
||||
mktok : Bool -> Int -> Int -> Kind -> String -> BTok
|
||||
mktok checkkw el ec kind text = let kind = if checkkw && elem text keywords then Keyword else kind in
|
||||
MkBounded (Tok kind text) (MkBounds sl sc el ec)
|
||||
|
||||
lineComment : TState -> Either Error TState
|
||||
lineComment (TS line col toks Nil) = rawTokenise (TS line col toks Nil)
|
||||
lineComment (TS line col toks ('\n' :: cs)) = rawTokenise (TS (line + 1) 0 toks cs)
|
||||
lineComment (TS line col toks (c :: cs)) = lineComment (TS line (col + 1) toks cs)
|
||||
|
||||
blockComment : TState -> Either Error TState
|
||||
blockComment (TS line col toks Nil) = Left $ E (MkFC "" (line, col)) "EOF in block comment"
|
||||
blockComment (TS line col toks ('-' :: '/' :: cs)) = rawTokenise (TS line (col + 2) toks cs)
|
||||
blockComment (TS line col toks ('\n' :: cs)) = blockComment (TS (line + 1) 0 toks cs)
|
||||
blockComment (TS line col toks (c :: cs)) = blockComment (TS line (col + 1) toks cs)
|
||||
|
||||
doRest : TState -> Kind -> (Char -> Bool) -> SnocList Char -> Either Error TState
|
||||
doRest (TS l c toks Nil) kind pred acc = rawTokenise (TS l c (toks :< mktok True l c kind (pack $ acc <>> Nil)) Nil)
|
||||
doRest (TS l c toks (ch :: cs)) kind pred acc = if pred ch
|
||||
then doRest (TS l (c + 1) toks cs) kind pred (acc :< ch)
|
||||
else
|
||||
let kind = if snocelem '_' acc then MixFix else kind in
|
||||
rawTokenise (TS l c (toks :< mktok True l (c - 1) kind (pack $ acc <>> Nil)) (ch :: cs))
|
||||
|
||||
doChar : Char -> List Char -> Either Error TState
|
||||
doChar c cs = if elem c standalone
|
||||
then rawTokenise (TS sl (sc + 1) (toks :< mktok True sl (sc + 1) Symbol (pack $ c :: Nil)) cs)
|
||||
else let kind = if isDigit c then Number else if isUpper c then UIdent else Ident in
|
||||
doRest (TS sl sc toks (c :: cs)) kind isIdent Lin
|
||||
|
||||
|
||||
tokenise : String -> String -> Either Error (List BTok)
|
||||
tokenise fn text = case rawTokenise (TS 0 0 Lin (unpack text)) of
|
||||
Right (TS trow tcol acc Nil) => Right $ acc <>> Nil
|
||||
Right (TS trow tcol acc chars) => Left $ E (MkFC fn (trow, tcol)) "Extra toks"
|
||||
Left (E (MkFC file start) str) => Left $ E (MkFC fn start) str
|
||||
Left err => Left err
|
||||
|
||||
Reference in New Issue
Block a user