{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 701
{-# LANGUAGE Trustworthy #-}
#endif
module Codec.Binary.UTF8.String (
encode
, decode
, encodeString
, decodeString
, encodeChar
, isUTF8Encoded
, utf8Encode
) where
import Data.Word (Word8,Word32)
import Data.Bits ((.|.),(.&.),shiftL,shiftR)
import Data.Char (chr,ord)
default(Int)
encodeString :: String -> String
encodeString :: String -> String
encodeString String
xs = (Word8 -> Char) -> [Word8] -> String
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> Char) -> (Word8 -> Int) -> Word8 -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Int
forall a. Enum a => a -> Int
fromEnum) (String -> [Word8]
encode String
xs)
decodeString :: String -> String
decodeString :: String -> String
decodeString String
xs = [Word8] -> String
decode ((Char -> Word8) -> String -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Word8
forall a. Enum a => Int -> a
toEnum (Int -> Word8) -> (Char -> Int) -> Char -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
forall a. Enum a => a -> Int
fromEnum) String
xs)
replacement_character :: Char
replacement_character :: Char
replacement_character = Char
'\xfffd'
encodeChar :: Char -> [Word8]
encodeChar :: Char -> [Word8]
encodeChar = (Int -> Word8) -> [Int] -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Int] -> [Word8]) -> (Char -> [Int]) -> Char -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Int]
forall {a}. (Ord a, Num a, Bits a) => a -> [a]
go (Int -> [Int]) -> (Char -> Int) -> Char -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord
where
go :: a -> [a]
go a
oc
| a
oc a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= a
0x7f = [a
oc]
| a
oc a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= a
0x7ff = [ a
0xc0 a -> a -> a
forall a. Num a => a -> a -> a
+ (a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6)
, a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ a
oc a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f
]
| a
oc a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= a
0xffff = [ a
0xe0 a -> a -> a
forall a. Num a => a -> a -> a
+ (a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
12)
, a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ ((a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f)
, a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ a
oc a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f
]
| Bool
otherwise = [ a
0xf0 a -> a -> a
forall a. Num a => a -> a -> a
+ (a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
18)
, a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ ((a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
12) a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f)
, a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ ((a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f)
, a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ a
oc a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f
]
encode :: String -> [Word8]
encode :: String -> [Word8]
encode = (Char -> [Word8]) -> String -> [Word8]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Char -> [Word8]
encodeChar
decode :: [Word8] -> String
decode :: [Word8] -> String
decode [ ] = String
""
decode (Word8
c:[Word8]
cs)
| Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 = Int -> Char
chr (Word8 -> Int
forall a. Enum a => a -> Int
fromEnum Word8
c) Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
cs
| Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xc0 = Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
cs
| Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xe0 = String
multi1
| Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xf0 = Int -> Word8 -> Int -> String
multi_byte Int
2 Word8
0xf Int
0x800
| Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xf8 = Int -> Word8 -> Int -> String
multi_byte Int
3 Word8
0x7 Int
0x10000
| Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xfc = Int -> Word8 -> Int -> String
multi_byte Int
4 Word8
0x3 Int
0x200000
| Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xfe = Int -> Word8 -> Int -> String
multi_byte Int
5 Word8
0x1 Int
0x4000000
| Bool
otherwise = Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
cs
where
multi1 :: String
multi1 = case [Word8]
cs of
Word8
c1 : [Word8]
ds | Word8
c1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0xc0 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x80 ->
let d :: Int
d = ((Word8 -> Int
forall a. Enum a => a -> Int
fromEnum Word8
c Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x1f) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Word8 -> Int
forall a. Enum a => a -> Int
fromEnum (Word8
c1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3f)
in if Int
d Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x000080 then Int -> Char
forall a. Enum a => Int -> a
toEnum Int
d Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
ds
else Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
ds
[Word8]
_ -> Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
cs
multi_byte :: Int -> Word8 -> Int -> [Char]
multi_byte :: Int -> Word8 -> Int -> String
multi_byte Int
i Word8
mask Int
overlong = Int -> [Word8] -> Int -> String
forall {t}. (Eq t, Num t) => t -> [Word8] -> Int -> String
aux Int
i [Word8]
cs (Word8 -> Int
forall a. Enum a => a -> Int
fromEnum (Word8
c Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
mask))
where
aux :: t -> [Word8] -> Int -> String
aux t
0 [Word8]
rs Int
acc
| Int
overlong Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
acc Bool -> Bool -> Bool
&& Int
acc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x10ffff Bool -> Bool -> Bool
&&
(Int
acc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xd800 Bool -> Bool -> Bool
|| Int
0xdfff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
acc) Bool -> Bool -> Bool
&&
(Int
acc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xfffe Bool -> Bool -> Bool
|| Int
0xffff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
acc) = Int -> Char
chr Int
acc Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
rs
| Bool
otherwise = Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
rs
aux t
n (Word8
r:[Word8]
rs) Int
acc
| Word8
r Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0xc0 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x80 = t -> [Word8] -> Int -> String
aux (t
nt -> t -> t
forall a. Num a => a -> a -> a
-t
1) [Word8]
rs
(Int -> String) -> Int -> String
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Bits a => a -> Int -> a
shiftL Int
acc Int
6 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Word8 -> Int
forall a. Enum a => a -> Int
fromEnum (Word8
r Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3f)
aux t
_ [Word8]
rs Int
_ = Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
rs
utf8Encode :: String -> String
utf8Encode :: String -> String
utf8Encode String
str
| String -> Bool
isUTF8Encoded String
str = String
str
| Bool
otherwise = String -> String
encodeString String
str
isUTF8Encoded :: String -> Bool
isUTF8Encoded :: String -> Bool
isUTF8Encoded [] = Bool
True
isUTF8Encoded (Char
x:String
xs) =
case Word32
ox of
Word32
_ | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x80 -> String -> Bool
isUTF8Encoded String
xs
| Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
> Word32
0xff -> Bool
False
| Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xc0 -> Bool
False
| Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xe0 -> Bool
check1
| Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xf0 -> Int -> Word32 -> Word32 -> Bool
check_byte Int
2 Word32
0xf Word32
0
| Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xf8 -> Int -> Word32 -> Word32 -> Bool
check_byte Int
3 Word32
0x7 Word32
0x10000
| Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xfc -> Int -> Word32 -> Word32 -> Bool
check_byte Int
4 Word32
0x3 Word32
0x200000
| Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xfe -> Int -> Word32 -> Word32 -> Bool
check_byte Int
5 Word32
0x1 Word32
0x4000000
| Bool
otherwise -> Bool
False
where
ox :: Word32
ox = Char -> Word32
toW32 Char
x
toW32 :: Char -> Word32
toW32 :: Char -> Word32
toW32 Char
ch = Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
ch)
check1 :: Bool
check1 =
case String
xs of
[] -> Bool
False
Char
c1 : String
ds
| Word32
oc Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0xc0 Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word32
0x80 Bool -> Bool -> Bool
|| Word32
d Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x000080 -> Bool
False
| Bool
otherwise -> String -> Bool
isUTF8Encoded String
ds
where
oc :: Word32
oc = Char -> Word32
toW32 Char
c1
d :: Word32
d = ((Word32
ox Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x1f) Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftL` Int
6) Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. (Word32
oc Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3f)
check_byte :: Int -> Word32 -> Word32 -> Bool
check_byte :: Int -> Word32 -> Word32 -> Bool
check_byte Int
i Word32
mask Word32
overlong = Int -> String -> Word32 -> Bool
forall {t}. (Eq t, Num t) => t -> String -> Word32 -> Bool
aux Int
i String
xs (Word32
ox Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
mask)
where
aux :: t -> String -> Word32 -> Bool
aux t
0 String
rs Word32
acc
| Word32
overlong Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
acc Bool -> Bool -> Bool
&&
Word32
acc Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
0x10ffff Bool -> Bool -> Bool
&&
(Word32
acc Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xd800 Bool -> Bool -> Bool
|| Word32
0xdfff Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
acc) Bool -> Bool -> Bool
&&
(Word32
acc Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xfffe Bool -> Bool -> Bool
|| Word32
0xffff Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
acc) = String -> Bool
isUTF8Encoded String
rs
| Bool
otherwise = Bool
False
aux t
n (Char
r:String
rs) Word32
acc
| Char -> Word32
toW32 Char
r Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0xc0 Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
0x80 =
t -> String -> Word32 -> Bool
aux (t
nt -> t -> t
forall a. Num a => a -> a -> a
-t
1) String
rs (Word32
acc Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftL` Int
6 Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. (Char -> Word32
toW32 Char
r Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3f))
aux t
_ String
_ Word32
_ = Bool
False