Parsing updates for unicode
- Allow unicode characters in indents and operators - Show lexing errors
This commit is contained in:
@@ -3,10 +3,12 @@ module Lib.Tokenizer
|
||||
import Text.Lexer
|
||||
import Text.Lexer.Tokenizer
|
||||
import Lib.Token
|
||||
import Lib.Common
|
||||
|
||||
keywords : List String
|
||||
keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
|
||||
"ptype", "pfunc", "module", "infixl", "infixr", "infix"]
|
||||
"ptype", "pfunc", "module", "infixl", "infixr", "infix",
|
||||
"->", "→", ":", "=>", ":=", "=", "<-", "\\", "_"]
|
||||
|
||||
specialOps : List String
|
||||
specialOps = ["->", ":", "=>", ":=", "=", "<-"]
|
||||
@@ -17,18 +19,12 @@ checkKW s = if elem s keywords then Tok Keyword s else Tok Ident s
|
||||
checkUKW : String -> Token Kind
|
||||
checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s
|
||||
|
||||
checkOp : String -> Token Kind
|
||||
checkOp s = if elem s specialOps then Tok Keyword s else Tok Oper s
|
||||
|
||||
isOpChar : Char -> Bool
|
||||
isOpChar c = c `elem` (unpack ":!#$%&*+./<=>?@\\^|-~")
|
||||
|
||||
opChar : Lexer
|
||||
opChar = pred isOpChar
|
||||
|
||||
identMore : Lexer
|
||||
identMore = alphaNum <|> exact "." <|> exact "'" <|> exact "_"
|
||||
|
||||
singleton : Lexer
|
||||
singleton = oneOf "()\\{}[],"
|
||||
|
||||
quo : Recognise True
|
||||
quo = is '"'
|
||||
|
||||
@@ -52,25 +48,34 @@ opMiddle = pred (\c => not (isSpace c || c == '_'))
|
||||
|
||||
rawTokens : Tokenizer (Token Kind)
|
||||
rawTokens
|
||||
= match (lower <+> many identMore) checkKW
|
||||
<|> match (upper <+> many identMore) checkUKW
|
||||
<|> match (some digit) (Tok Number)
|
||||
<|> match (is '#' <+> many alpha) (Tok Pragma)
|
||||
<|> match charLit (Tok Character)
|
||||
= match spaces (Tok Space)
|
||||
-- { is singleton except for {{
|
||||
<|> match (exact "{{" <|> exact "}}") (Tok Keyword)
|
||||
-- need to make this an ident
|
||||
<|> match (exact ",") (checkKW)
|
||||
-- for now, our lambda slash is singleton
|
||||
<|> match (singleton) (Tok Symbol)
|
||||
-- TODO Drop MixFix token type when we support if_then_else_
|
||||
<|> match (exact "_" <+> (some opMiddle) <+> exact "_") (Tok MixFix)
|
||||
<|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
|
||||
-- REVIEW - expect non-alpha after?
|
||||
<|> match (some digit) (Tok Number)
|
||||
-- for module names and maybe type constructors
|
||||
<|> match (charLit) (Tok Character)
|
||||
<|> match (is '#' <+> many alpha) (Tok Pragma)
|
||||
<|> match (lineComment (exact "--")) (Tok Space)
|
||||
<|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
|
||||
<|> match (exact ",") (Tok Oper)
|
||||
<|> match (some opChar) checkOp
|
||||
<|> match (exact "{{" <|> exact "}}") (Tok Keyword)
|
||||
<|> match symbol (Tok Symbol)
|
||||
<|> match spaces (Tok Space)
|
||||
<|> match (upper <+> many identMore) checkUKW
|
||||
<|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
|
||||
-- accept almost everything, but
|
||||
<|> match (some (non (space <|> singleton))) checkKW
|
||||
|
||||
notSpace : WithBounds (Token Kind) -> Bool
|
||||
notSpace (MkBounded (Tok Space _) _ _) = False
|
||||
notSpace _ = True
|
||||
|
||||
export
|
||||
tokenise : String -> List BTok
|
||||
tokenise = filter notSpace . fst . lex rawTokens
|
||||
tokenise : String -> Either Error (List BTok)
|
||||
tokenise s = case lex rawTokens s of
|
||||
(toks, EndInput, l, c, what) => Right (filter notSpace toks)
|
||||
(toks, reason, l, c, what) => Left (E (l,c) "\{show reason}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user