{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.HTML.TagStream.Entities
(Dec(..)
,isNameChar
,isNameStart
,decodeEntities)
where
import Data.Char
import Data.Monoid
import Data.String
import Data.Conduit
import Text.HTML.TagStream.Types
import qualified Data.Conduit.List as CL
import Data.Maybe (fromMaybe, isJust)
import Control.Arrow (first,second)
decodeEntities :: (Monad m
,Monoid builder
,Monoid string
,IsString string
,Eq string)
=> Dec builder string
-> Conduit (Token' string) m (Token' string)
decodeEntities :: Dec builder string -> Conduit (Token' string) m (Token' string)
decodeEntities dec :: Dec builder string
dec =
Conduit (Token' string) m (Token' string)
start
where
start :: Conduit (Token' string) m (Token' string)
start = ConduitT (Token' string) (Token' string) m (Maybe (Token' string))
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT (Token' string) (Token' string) m (Maybe (Token' string))
-> (Maybe (Token' string)
-> Conduit (Token' string) m (Token' string))
-> Conduit (Token' string) m (Token' string)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Conduit (Token' string) m (Token' string)
-> (Token' string -> Conduit (Token' string) m (Token' string))
-> Maybe (Token' string)
-> Conduit (Token' string) m (Token' string)
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (() -> Conduit (Token' string) m (Token' string)
forall (m :: * -> *) a. Monad m => a -> m a
return ()) (\token :: Token' string
token -> Token' string -> Conduit (Token' string) m (Token' string)
forall (m :: * -> *).
Monad m =>
Token' string -> ConduitT (Token' string) (Token' string) m ()
start' Token' string
token Conduit (Token' string) m (Token' string)
-> Conduit (Token' string) m (Token' string)
-> Conduit (Token' string) m (Token' string)
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Conduit (Token' string) m (Token' string)
start)
start' :: Token' string -> ConduitT (Token' string) (Token' string) m ()
start' (Text t :: string
t) = (string -> ConduitT (Token' string) string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
t ConduitT (Token' string) string m ()
-> ConduitT (Token' string) string m ()
-> ConduitT (Token' string) string m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT (Token' string) string m ()
forall (m :: * -> *) string.
Monad m =>
Conduit (Token' string) m string
yieldWhileText) ConduitT (Token' string) string m ()
-> ConduitT string (Token' string) m ()
-> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) a b c r.
Monad m =>
Conduit a m b -> ConduitT b c m r -> ConduitT a c m r
=$= Dec builder string -> Conduit string m string
forall (m :: * -> *) string builder.
(Monad m, Monoid string, IsString string, Monoid builder,
Eq string) =>
Dec builder string -> Conduit string m string
decodeEntities' Dec builder string
dec Conduit string m string
-> ConduitT string (Token' string) m ()
-> ConduitT string (Token' string) m ()
forall (m :: * -> *) a b c r.
Monad m =>
Conduit a m b -> ConduitT b c m r -> ConduitT a c m r
=$= (string -> Maybe (Token' string))
-> ConduitT string (Token' string) m ()
forall (m :: * -> *) a b.
Monad m =>
(a -> Maybe b) -> ConduitT a b m ()
CL.mapMaybe string -> Maybe (Token' string)
forall s. (Eq s, IsString s) => s -> Maybe (Token' s)
go
start' (TagOpen name :: string
name attrs :: [Attr' string]
attrs bool :: Bool
bool) = Token' string -> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (string -> [Attr' string] -> Bool -> Token' string
forall s. s -> [Attr' s] -> Bool -> Token' s
TagOpen string
name ((Attr' string -> Attr' string) -> [Attr' string] -> [Attr' string]
forall a b. (a -> b) -> [a] -> [b]
map ((string -> string) -> Attr' string -> Attr' string
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (d, b) (d, c)
second (Dec builder string -> string -> string
forall a builder.
(Eq a, IsString a, Monoid builder, Monoid a) =>
Dec builder a -> a -> a
decodeString Dec builder string
dec)) [Attr' string]
attrs) Bool
bool)
start' token :: Token' string
token = Token' string -> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield Token' string
token
go :: s -> Maybe (Token' s)
go t :: s
t
| s
t s -> s -> Bool
forall a. Eq a => a -> a -> Bool
== "" = Maybe (Token' s)
forall a. Maybe a
Nothing
| Bool
otherwise = Token' s -> Maybe (Token' s)
forall a. a -> Maybe a
Just (s -> Token' s
forall s. s -> Token' s
Text s
t)
decodeString
:: (Eq a, IsString a, Monoid builder, Monoid a)
=> Dec builder a -> a -> a
decodeString :: Dec builder a -> a -> a
decodeString dec :: Dec builder a
dec input :: a
input =
case Dec builder a -> a -> (a, a)
forall string builder.
(IsString string, Monoid builder, Eq string, Monoid string) =>
Dec builder string -> string -> (string, string)
makeEntityDecoder Dec builder a
dec a
input of
(value' :: a
value', remainder :: a
remainder)
| a
value' a -> a -> Bool
forall a. Eq a => a -> a -> Bool
/= a
forall a. Monoid a => a
mempty -> a
value' a -> a -> a
forall a. Semigroup a => a -> a -> a
<> Dec builder a -> a -> a
forall a builder.
(Eq a, IsString a, Monoid builder, Monoid a) =>
Dec builder a -> a -> a
decodeString Dec builder a
dec a
remainder
| Bool
otherwise -> a
input
decodeEntities' :: (Monad m
,Monoid string
,IsString string
,Monoid builder
,Eq string)
=> Dec builder string
-> Conduit string m string
decodeEntities' :: Dec builder string -> Conduit string m string
decodeEntities' dec :: Dec builder string
dec =
(string -> string) -> Conduit string m string
forall (m :: * -> *).
Monad m =>
(string -> string) -> ConduitT string string m ()
loop string -> string
forall a. a -> a
id
where
loop :: (string -> string) -> ConduitT string string m ()
loop accum :: string -> string
accum = do
Maybe string
mchunk <- ConduitT string string m (Maybe string)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await
let chunk :: string
chunk = string -> string
accum (string -> string) -> string -> string
forall a b. (a -> b) -> a -> b
$ string -> Maybe string -> string
forall a. a -> Maybe a -> a
fromMaybe string
forall a. Monoid a => a
mempty Maybe string
mchunk
(newStr :: string
newStr, remainder :: string
remainder) = Dec builder string -> string -> (string, string)
forall string builder.
(IsString string, Monoid builder, Eq string, Monoid string) =>
Dec builder string -> string -> (string, string)
makeEntityDecoder Dec builder string
dec string
chunk
string -> ConduitT string string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
newStr
if Maybe string -> Bool
forall a. Maybe a -> Bool
isJust Maybe string
mchunk
then (string -> string) -> ConduitT string string m ()
loop (string -> string -> string
forall a. Monoid a => a -> a -> a
mappend string
remainder)
else string -> ConduitT string string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
remainder
yieldWhileText :: Monad m => Conduit (Token' string) m string
yieldWhileText :: Conduit (Token' string) m string
yieldWhileText =
Conduit (Token' string) m string
forall o. ConduitT (Token' o) o m ()
loop
where
loop :: ConduitT (Token' o) o m ()
loop = ConduitT (Token' o) o m (Maybe (Token' o))
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT (Token' o) o m (Maybe (Token' o))
-> (Maybe (Token' o) -> ConduitT (Token' o) o m ())
-> ConduitT (Token' o) o m ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ConduitT (Token' o) o m ()
-> (Token' o -> ConduitT (Token' o) o m ())
-> Maybe (Token' o)
-> ConduitT (Token' o) o m ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (() -> ConduitT (Token' o) o m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()) Token' o -> ConduitT (Token' o) o m ()
go
go :: Token' o -> ConduitT (Token' o) o m ()
go (Text t :: o
t) = o -> ConduitT (Token' o) o m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield o
t ConduitT (Token' o) o m ()
-> ConduitT (Token' o) o m () -> ConduitT (Token' o) o m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT (Token' o) o m ()
loop
go token :: Token' o
token = Token' o -> ConduitT (Token' o) o m ()
forall i o (m :: * -> *). i -> ConduitT i o m ()
leftover Token' o
token
data Dec builder string = Dec
{ Dec builder string -> builder -> string
decToS :: builder -> string
, Dec builder string -> (Char -> Bool) -> string -> (string, string)
decBreak :: (Char -> Bool) -> string -> (string,string)
, Dec builder string -> string -> builder
decBuilder :: string -> builder
, Dec builder string -> Int -> string -> string
decDrop :: Int -> string -> string
, Dec builder string -> string -> Maybe string
decEntity :: string -> Maybe string
, Dec builder string -> string -> Maybe (Char, string)
decUncons :: string -> Maybe (Char,string)
}
makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string)
=> Dec builder string -> string -> (string, string)
makeEntityDecoder :: Dec builder string -> string -> (string, string)
makeEntityDecoder Dec{..} = (builder -> string) -> (builder, string) -> (string, string)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first builder -> string
decToS ((builder, string) -> (string, string))
-> (string -> (builder, string)) -> string -> (string, string)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. string -> (builder, string)
go
where
go :: string -> (builder, string)
go s :: string
s =
case (Char -> Bool) -> string -> (string, string)
decBreak (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
=='&') string
s of
(_,"") -> (string -> builder
decBuilder string
s, "")
(before :: string
before,restPlusAmp :: string
restPlusAmp@(Int -> string -> string
decDrop 1 -> string
rest)) ->
case (Char -> Bool) -> string -> (string, string)
decBreak (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\c :: Char
c -> Char -> Bool
isNameChar Char
c Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '#')) string
rest of
(_,"") -> (string -> builder
decBuilder string
before, string
restPlusAmp)
(entity :: string
entity,after :: string
after) -> (builder
before1 builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<> builder
before2, string
after')
where
before1 :: builder
before1 = string -> builder
decBuilder string
before
(before2 :: builder
before2, after' :: string
after') =
case Maybe string
mdecoded of
Nothing -> (builder -> builder) -> (builder, string) -> (builder, string)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first ((string -> builder
decBuilder "&" builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<> string -> builder
decBuilder string
entity) builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
after)
Just (string -> builder
decBuilder -> builder
decoded) ->
case string -> Maybe (Char, string)
decUncons string
after of
Just (';',validAfter :: string
validAfter) -> (builder -> builder) -> (builder, string) -> (builder, string)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (builder
decoded builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
validAfter)
Just (_invalid :: Char
_invalid,_rest :: string
_rest) -> (builder -> builder) -> (builder, string) -> (builder, string)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (builder
decoded builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
after)
Nothing -> (builder
forall a. Monoid a => a
mempty, string
s)
mdecoded :: Maybe string
mdecoded =
if string
entity string -> string -> Bool
forall a. Eq a => a -> a -> Bool
== string
forall a. Monoid a => a
mempty
then Maybe string
forall a. Maybe a
Nothing
else string -> Maybe string
decEntity string
entity
isNameStart :: Char -> Bool
isNameStart :: Char -> Bool
isNameStart c :: Char
c =
Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== ':' Bool -> Bool -> Bool
||
Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '_' Bool -> Bool -> Bool
||
Char -> Bool
isAsciiUpper Char
c Bool -> Bool -> Bool
||
Char -> Bool
isAsciiLower Char
c Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xC0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xD6') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xD8' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xF6') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xF8' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x2FF') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x370' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x37D') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x37F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x1FFF') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x200C' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x200D') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x2070' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x218F') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x2C00' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x2FEF') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x3001' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xD7FF') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xF900' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xFDCF') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xFDF0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xFFFD') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x10000' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xEFFFF')
isNameChar :: Char -> Bool
isNameChar :: Char -> Bool
isNameChar c :: Char
c =
Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '-' Bool -> Bool -> Bool
||
Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '.' Bool -> Bool -> Bool
||
Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '\xB7' Bool -> Bool -> Bool
||
Char -> Bool
isDigit Char
c Bool -> Bool -> Bool
||
Char -> Bool
isNameStart Char
c Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x0300' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x036F') Bool -> Bool -> Bool
||
(Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x203F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x2040')