module Lib.Tokenizer

import Text.Lexer
import Text.Lexer.Tokenizer
import Lib.Token
import Lib.Common

keywords : List String
keywords = ["let", "in", "where", "case", "of", "data", "U", "do",
            "ptype", "pfunc", "module", "infixl", "infixr", "infix",
            "∀", "forall",
            "class", "instance",
            "if", "then", "else",
             "->", "→", ":", "=>", ":=", "=", "<-", "\\", "_"]

checkKW : String -> Token Kind
checkKW s =
  if s /= "_" && elem '_' (unpack s) then Tok MixFix s else
  if elem s keywords then Tok Keyword s
  else Tok Ident s

checkUKW : String -> Token Kind
checkUKW s = if elem s keywords then Tok Keyword s else Tok UIdent s

identMore : Lexer
identMore = alphaNum <|> exact "'" <|> exact "_"

singleton : Lexer
singleton = oneOf "()\\{}[],?."

quo : Recognise True
quo = is '"'

esc : Recognise True -> Recognise True
esc l = is '\\' <+> l

-- REVIEW maybe we can do this faster with the view thinger
unquote : String -> String
unquote str = case unpack str of
  ('"' :: xs) => pack $ go xs
  imp => pack $ go imp
  where
    go : List Char -> List Char
    go [] = []
    go ['"'] = []
    go ('\\' :: ('n' :: xs)) = '\n' :: go xs
    go ('\\' :: (x :: xs)) = x :: go xs
    go (x :: xs) = x :: go xs

unquoteChar : String -> String
unquoteChar str = pack $ case unpack str of
  ('\'' :: xs) => go xs
  imp => go imp -- shouldn't happen
  where
    go : List Char -> List Char
    go [] = ['\''] -- shouldn't happen
    go ('\\' :: ('n' :: xs)) = ['\n']
    go ('\\' :: (x :: xs)) = [x]
    go (x :: xs) = [x]

opMiddle = pred (\c => not (isSpace c || c == '_'))

btick = is '`'

trimJS : String -> String
trimJS str = case unpack str of
  ('`' :: xs) => pack $ go xs
  bug => pack $ go bug
  where
    go : List Char -> List Char
    go [] = []
    go ['`'] = []
    go (x :: xs) = x :: go xs

%hide charLit
charLit : Lexer
charLit = is '\'' <+> (is '\\' <+> any <|> any) <+> is '\''


rawTokens : Tokenizer (Token Kind)
rawTokens
  = match spaces (Tok Space)
  -- { is singleton except for {{
  <|> match (exact "{{" <|> exact "}}") (Tok Keyword)
  -- need to make this an ident
  <|> match (exact ",") (checkKW)
  -- for now, our lambda slash is singleton
  <|> match (singleton) (Tok Symbol)
  -- TODO Drop MixFix token type when we support if_then_else_
  <|> match (exact "_,_" <|> exact "_._") (Tok MixFix)
  -- REVIEW - expect non-alpha after?
  <|> match (some digit) (Tok Number)
  -- for module names and maybe type constructors
  <|> match (charLit) (Tok Character . unquoteChar)
  <|> match (is '#' <+> many alpha) (Tok Pragma)
  <|> match (lineComment (exact "--")) (Tok Space)
  <|> match (blockComment (exact "/-") (exact "-/")) (Tok Space)
  <|> match (upper <+> many identMore) checkUKW
  <|> match (btick <+> manyUntil btick any <+> btick) (Tok JSLit . trimJS)
  <|> match (quo <+> manyUntil quo (esc any <|> any) <+> quo) (Tok StringKind . unquote)
  -- accept almost everything, but
  <|> match (some (non (space <|> singleton))) checkKW

notSpace : WithBounds (Token Kind) -> Bool
notSpace (MkBounded (Tok Space _) _ _) = False
notSpace _ = True

export
tokenise : String -> String -> Either Error (List BTok)
tokenise fn s = case lex rawTokens s of
  (toks, EndInput, l, c, what) => Right (filter notSpace toks)
  (toks, reason, l, c, what) => Left (E (MkFC fn (l,c)) "\{show reason}")