-- | This is another hand-written lexer, this time for the Xtract
--   command-language.  The entry point is lexXtract.  You don't
--   normally need to use this module directly - the lexer is called
--   automatically by the parser.  (We only expose this interface
--   for debugging purposes.)
--
--   The Xtract command language is very like the XPath specification.

module Text.XML.HaXml.Xtract.Lex
  ( lexXtract
  , Posn(..)
  , TokenT(..)
  , Token
  ) where

import Data.Char


type Token = Either String (Posn, TokenT)

data Posn = Pn Int		-- char index only
        deriving Posn -> Posn -> Bool
(Posn -> Posn -> Bool) -> (Posn -> Posn -> Bool) -> Eq Posn
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Posn -> Posn -> Bool
$c/= :: Posn -> Posn -> Bool
== :: Posn -> Posn -> Bool
$c== :: Posn -> Posn -> Bool
Eq

instance Show Posn where
      showsPrec :: Int -> Posn -> ShowS
showsPrec _p :: Int
_p (Pn c :: Int
c) = String -> ShowS
showString "char pos " ShowS -> ShowS -> ShowS
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ShowS
forall a. Show a => a -> ShowS
shows Int
c

data TokenT =
      Symbol String
    | TokString String		--     begins with letter
    | TokNum Integer		--     begins with digit
    deriving TokenT -> TokenT -> Bool
(TokenT -> TokenT -> Bool)
-> (TokenT -> TokenT -> Bool) -> Eq TokenT
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TokenT -> TokenT -> Bool
$c/= :: TokenT -> TokenT -> Bool
== :: TokenT -> TokenT -> Bool
$c== :: TokenT -> TokenT -> Bool
Eq

instance Show TokenT where
    showsPrec :: Int -> TokenT -> ShowS
showsPrec _p :: Int
_p (Symbol s :: String
s) = String -> ShowS
showString String
s
    showsPrec _p :: Int
_p (TokString s :: String
s) = String -> ShowS
showString String
s
    showsPrec _p :: Int
_p (TokNum n :: Integer
n) = Integer -> ShowS
forall a. Show a => a -> ShowS
shows Integer
n

emit :: TokenT -> Posn -> Token
emit :: TokenT -> Posn -> Token
emit tok :: TokenT
tok p :: Posn
p = Posn -> Int
forcep Posn
p Int -> Token -> Token
forall a b. a -> b -> b
`seq` (Posn, TokenT) -> Token
forall a b. b -> Either a b
Right (Posn
p,TokenT
tok)
  where forcep :: Posn -> Int
forcep (Pn n :: Int
n) = Int
n

lexerror :: String -> Posn -> [Token]
lexerror :: String -> Posn -> [Token]
lexerror s :: String
s p :: Posn
p = [String -> Token
forall a b. a -> Either a b
Left ("Lexical error in selection pattern at "String -> ShowS
forall a. [a] -> [a] -> [a]
++Posn -> String
forall a. Show a => a -> String
show Posn
pString -> ShowS
forall a. [a] -> [a] -> [a]
++": "
                       String -> ShowS
forall a. [a] -> [a] -> [a]
++String
sString -> ShowS
forall a. [a] -> [a] -> [a]
++"\n")]

addcol :: Int -> Posn -> Posn
addcol :: Int -> Posn -> Posn
addcol n :: Int
n (Pn c :: Int
c) = Int -> Posn
Pn (Int
cInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
n)

newline, tab :: Posn -> Posn
newline :: Posn -> Posn
newline (Pn c :: Int
c) = Int -> Posn
Pn (Int
cInt -> Int -> Int
forall a. Num a => a -> a -> a
+1)
tab :: Posn -> Posn
tab     (Pn c :: Int
c) = Int -> Posn
Pn (((Int
cInt -> Int -> Int
forall a. Integral a => a -> a -> a
`div`8)Int -> Int -> Int
forall a. Num a => a -> a -> a
+1)Int -> Int -> Int
forall a. Num a => a -> a -> a
*8)

white :: Char -> Posn -> Posn
white :: Char -> Posn -> Posn
white '\t' = Posn -> Posn
tab
white ' '  = Int -> Posn -> Posn
addcol 1
white '\n' = Int -> Posn -> Posn
addcol 1
white '\r' = Int -> Posn -> Posn
addcol 1
white '\xa0' = Int -> Posn -> Posn
addcol 1

blank :: (Posn->String->[Token]) -> Posn-> String-> [Token]
blank :: (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank _ _ []       = []
blank k :: Posn -> String -> [Token]
k p :: Posn
p (' ': s :: String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k (Int -> Posn -> Posn
addcol 1 Posn
p) String
s
blank k :: Posn -> String -> [Token]
k p :: Posn
p ('\t':s :: String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k (Posn -> Posn
tab Posn
p) String
s
blank k :: Posn -> String -> [Token]
k p :: Posn
p ('\n':s :: String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k (Posn -> Posn
newline Posn
p) String
s
blank k :: Posn -> String -> [Token]
k p :: Posn
p ('\r':s :: String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k Posn
p String
s
blank k :: Posn -> String -> [Token]
k p :: Posn
p ('\xa0': s :: String
s) = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank Posn -> String -> [Token]
k (Int -> Posn -> Posn
addcol 1 Posn
p) String
s
blank k :: Posn -> String -> [Token]
k p :: Posn
p    s :: String
s     = Posn -> String -> [Token]
k Posn
p String
s

----
-- | First argument is a transformer for pattern strings, e.g. map toLower,
--   but only applying to parts of the pattern not in quotation marks.
--   (Needed to canonicalise HTML where tags are case-insensitive, but
--   attribute values are case sensitive.)
lexXtract :: (String->String) -> String -> [Token]
lexXtract :: ShowS -> String -> [Token]
lexXtract f :: ShowS
f = ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn
Pn 1)

syms :: [Char]
syms :: String
syms = "/[]()@,=*&|~$+-<>"

selAny :: (String->String) -> Posn -> String -> [Token]
selAny :: ShowS -> Posn -> String -> [Token]
selAny _ _ [] = []
selAny f :: ShowS
f p :: Posn
p ('/':'/':ss :: String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol "//") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 2 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('!':'=':ss :: String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol "!=") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 2 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('<':'=':ss :: String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol "<=") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 2 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('>':'=':ss :: String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol ">=") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 2 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('\'':ss :: String
ss)    = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol "'") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:
                          Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil '\'' (String -> TokenT
Symbol "'") [] Posn
p (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss
                                          (ShowS -> Posn -> String -> [Token]
selAny ShowS
f)
selAny f :: ShowS
f p :: Posn
p ('"':ss :: String
ss)     = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol "\"") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:
                          Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil '"' (String -> TokenT
Symbol "\"") [] Posn
p (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss
                                          (ShowS -> Posn -> String -> [Token]
selAny ShowS
f)
selAny f :: ShowS
f p :: Posn
p ('_':ss :: String
ss)     = ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f "_" Posn
p (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss ((Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f))
selAny f :: ShowS
f p :: Posn
p (':':ss :: String
ss)     = ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f ":" Posn
p (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss ((Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f))
selAny f :: ShowS
f p :: Posn
p ('.':'=':'.':ss :: String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol ".=.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 3 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('.':'!':'=':'.':ss :: String
ss)
                            = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol ".!=.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 4 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('.':'<':'.':ss :: String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol ".<.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 3 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('.':'<':'=':'.':ss :: String
ss)
                            = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol ".<=.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 4 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('.':'>':'.':ss :: String
ss) = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol ".>.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 3 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('.':'>':'=':'.':ss :: String
ss)
                            = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol ".>=.") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 4 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p ('.':'/':ss :: String
ss)     = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol "./") Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:  ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 2 Posn
p) String
ss
selAny f :: ShowS
f p :: Posn
p (s :: Char
s:ss :: String
ss)
    | Char
s Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
syms   = TokenT -> Posn -> Token
emit (String -> TokenT
Symbol [Char
s]) Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:     ShowS -> Posn -> String -> [Token]
selAny ShowS
f (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss
    | Char -> Bool
isSpace Char
s       = (Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f) Posn
p (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
ss)
    | Char -> Bool
isAlpha Char
s       = ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f [Char
s] Posn
p (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss ((Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f))
    | Char -> Bool
isDigit Char
s       = String
-> Posn -> Posn -> String -> (Posn -> String -> [Token]) -> [Token]
gatherNum    [Char
s] Posn
p (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss ((Posn -> String -> [Token]) -> Posn -> String -> [Token]
blank (ShowS -> Posn -> String -> [Token]
selAny ShowS
f))
    | Bool
otherwise       = String -> Posn -> [Token]
lexerror "unrecognised pattern" Posn
p

gatherName :: (String->String) -> String -> Posn -> Posn -> String
              -> (Posn->String->[Token]) -> [Token]
gatherName :: ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName f :: ShowS
f acc :: String
acc pos :: Posn
pos p :: Posn
p (s :: Char
s:ss :: String
ss) k :: Posn -> String -> [Token]
k
  | Char -> Bool
isAlphaNum Char
s Bool -> Bool -> Bool
|| Char
s Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` "-_:" = ShowS
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
gatherName ShowS
f (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
acc) Posn
pos (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss Posn -> String -> [Token]
k
gatherName f :: ShowS
f acc :: String
acc pos :: Posn
pos p :: Posn
p ss :: String
ss k :: Posn -> String -> [Token]
k =
  TokenT -> Posn -> Token
emit (String -> TokenT
TokString (ShowS
f (ShowS
forall a. [a] -> [a]
reverse String
acc))) Posn
posToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: Posn -> String -> [Token]
k Posn
p String
ss

gatherNum :: String -> Posn -> Posn -> String
             -> (Posn->String->[Token]) -> [Token]
gatherNum :: String
-> Posn -> Posn -> String -> (Posn -> String -> [Token]) -> [Token]
gatherNum acc :: String
acc pos :: Posn
pos p :: Posn
p (s :: Char
s:ss :: String
ss) k :: Posn -> String -> [Token]
k
  | Char -> Bool
isHexDigit Char
s = String
-> Posn -> Posn -> String -> (Posn -> String -> [Token]) -> [Token]
gatherNum (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
acc) Posn
pos (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss Posn -> String -> [Token]
k
gatherNum acc :: String
acc pos :: Posn
pos p :: Posn
p ss :: String
ss k :: Posn -> String -> [Token]
k =
  TokenT -> Posn -> Token
emit (Integer -> TokenT
TokNum (String -> Integer
forall a. Read a => String -> a
read (ShowS
forall a. [a] -> [a]
reverse String
acc))) Posn
posToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: Posn -> String -> [Token]
k Posn
p String
ss

accumulateUntil :: Char -> TokenT -> String -> Posn -> Posn -> String
                   -> (Posn->String->[Token]) -> [Token]
accumulateUntil :: Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil c :: Char
c _tok :: TokenT
_tok _acc :: String
_acc pos :: Posn
pos  p :: Posn
p  [] _k :: Posn -> String -> [Token]
_k =
    String -> Posn -> [Token]
lexerror ("found end of pattern while looking for "String -> ShowS
forall a. [a] -> [a] -> [a]
++Char
c
              Char -> ShowS
forall a. a -> [a] -> [a]
:" to match opening quote at "String -> ShowS
forall a. [a] -> [a] -> [a]
++Posn -> String
forall a. Show a => a -> String
show Posn
pos) Posn
p
accumulateUntil c :: Char
c  tok :: TokenT
tok  acc :: String
acc pos :: Posn
pos  p :: Posn
p (s :: Char
s:ss :: String
ss) k :: Posn -> String -> [Token]
k
    | Char
cChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
s       = TokenT -> Posn -> Token
emit (String -> TokenT
TokString (ShowS
forall a. [a] -> [a]
reverse String
acc)) Posn
posToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
:
                                  TokenT -> Posn -> Token
emit TokenT
tok Posn
pToken -> [Token] -> [Token]
forall a. a -> [a] -> [a]
: Posn -> String -> [Token]
k (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss
    | Char -> Bool
isSpace Char
s  = Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil Char
c TokenT
tok (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
acc) Posn
pos (Char -> Posn -> Posn
white Char
s Posn
p) String
ss Posn -> String -> [Token]
k
    | Bool
otherwise  = Char
-> TokenT
-> String
-> Posn
-> Posn
-> String
-> (Posn -> String -> [Token])
-> [Token]
accumulateUntil Char
c TokenT
tok (Char
sChar -> ShowS
forall a. a -> [a] -> [a]
:String
acc) Posn
pos (Int -> Posn -> Posn
addcol 1 Posn
p) String
ss Posn -> String -> [Token]
k