{-

Copyright (c) 2002, members of the Haskell Internationalisation Working
Group All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
* Neither the name of the Haskell Internationalisation Working Group nor
   the names of its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

This module provides lazy stream encoding/decoding facilities for UTF-8,
the Unicode Transformation Format with 8-bit words.

2002-09-02  Sven Moritz Hallberg <pesco@gmx.de>

-}

{-

2007-04-30 Henning Thielemann:
Slight changes to make decode lazy.
The calls of 'reverse' in the original version have broken laziness
and thus had memory leaks.

-}

module Data.String.UTF8
  ( encode
  , decode
  , decodeEmbedErrors
  , encodeOne
  , decodeOne
  , Error
         -- Haddock does not want to document signatures with private types
         -- these functions should be moved to a utility module
  ) where

import Data.Char (ord, chr)
import Data.Word (Word8, Word16, Word32)
import Data.Bits (shiftL, shiftR, (.&.), (.|.))

import Data.List (unfoldr)

-- - UTF-8 in General -

-- Adapted from the Unicode standard, version 3.2,
-- Table 3.1 "UTF-8 Bit Distribution" (excluded are UTF-16 encodings):

--   Scalar                    1st Byte  2nd Byte  3rd Byte  4th Byte
--           000000000xxxxxxx  0xxxxxxx
--           00000yyyyyxxxxxx  110yyyyy  10xxxxxx
--           zzzzyyyyyyxxxxxx  1110zzzz  10yyyyyy  10xxxxxx
--   000uuuzzzzzzyyyyyyxxxxxx  11110uuu  10zzzzzz  10yyyyyy  10xxxxxx

-- Also from the Unicode standard, version 3.2,
-- Table 3.1B "Legal UTF-8 Byte Sequences":

--   Code Points         1st Byte  2nd Byte  3rd Byte  4th Byte
--     U+0000..U+007F    00..7F
--     U+0080..U+07FF    C2..DF    80..BF
--     U+0800..U+0FFF    E0        A0..BF    80..BF
--     U+1000..U+CFFF    E1..EC    80..BF    80..BF
--     U+D000..U+D7FF    ED        80..9F    80..BF
--     U+D800..U+DFFF    ill-formed
--     U+E000..U+FFFF    EE..EF    80..BF    80..BF
--    U+10000..U+3FFFF   F0        90..BF    80..BF    80..BF
--    U+40000..U+FFFFF   F1..F3    80..BF    80..BF    80..BF
--   U+100000..U+10FFFF  F4        80..8F    80..BF    80..BF



-- - Encoding Functions -

-- Must the encoder ensure that no illegal byte sequences are output or
-- can we trust the Haskell system to supply only legal values?
-- For now I include error case for the surrogate values U+D800..U+DFFF and
-- out-of-range scalars.

-- The function is pretty much a transscript of table 3.1B with error checks.
-- It dispatches the actual encoding to functions specific to the number of
-- required bytes.

encodeOne :: Char -> [Word8]
encodeOne :: Char -> [Word8]
encodeOne Char
c
    -- The report guarantees in (6.1.2) that this won't happen:
    --   | n < 0       = error "encodeUTF8: ord returned a negative value"
    | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x0080  = Word8 -> [Word8]
encodeOne_onebyte Word8
n8
    | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x0800  = Word16 -> [Word8]
encodeOne_twobyte Word16
n16
    | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xD800  = Word16 -> [Word8]
encodeOne_threebyte Word16
n16
    | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xE000  = [Char] -> [Word8]
forall a. HasCallStack => [Char] -> a
error [Char]
"encodeUTF8: ord returned a surrogate value"
    | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x10000       = Word16 -> [Word8]
encodeOne_threebyte Word16
n16
    -- Haskell 98 only talks about 16 bit characters, but ghc handles 20.1.
    | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x10FFFF      = Word32 -> [Word8]
encodeOne_fourbyte Word32
n32
    | Bool
otherwise  = [Char] -> [Word8]
forall a. HasCallStack => [Char] -> a
error [Char]
"encodeUTF8: ord returned a value above 0x10FFFF"
    where
    n :: Int
n = Char -> Int
ord Char
c            :: Int
    n8 :: Word8
n8 = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n  :: Word8
    n16 :: Word16
n16 = Int -> Word16
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n :: Word16
    n32 :: Word32
n32 = Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n :: Word32


-- With the above, a stream decoder is trivial:

encode :: [Char] -> [Word8]
encode :: [Char] -> [Word8]
encode = (Char -> [Word8]) -> [Char] -> [Word8]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Char -> [Word8]
encodeOne


-- Now follow the individual encoders for certain numbers of bytes...
--           _
--          / |  __  ___  __ __
--         / ^| //  /__/ // //
--        /.==| \\ //_  // //
-- It's  //  || // \_/_//_//_  and it's here to stay!

encodeOne_onebyte :: Word8 -> [Word8]
encodeOne_onebyte :: Word8 -> [Word8]
encodeOne_onebyte Word8
cp = [Word8
cp]


-- 00000yyyyyxxxxxx -> 110yyyyy 10xxxxxx

encodeOne_twobyte :: Word16 -> [Word8]
encodeOne_twobyte :: Word16 -> [Word8]
encodeOne_twobyte Word16
cp = [(Word8
0xC0Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
ys), (Word8
0x80Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
xs)]
    where
    xs, ys :: Word8
    ys :: Word8
ys = Word16 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
shiftR Word16
cp Int
6)
    xs :: Word8
xs = (Word16 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word16
cp) Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F


-- zzzzyyyyyyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx

encodeOne_threebyte :: Word16 -> [Word8]
encodeOne_threebyte :: Word16 -> [Word8]
encodeOne_threebyte Word16
cp = [(Word8
0xE0Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
zs), (Word8
0x80Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
ys), (Word8
0x80Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
xs)]
    where
    xs, ys, zs :: Word8
    xs :: Word8
xs = (Word16 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word16
cp) Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
    ys :: Word8
ys = (Word16 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
shiftR Word16
cp Int
6)) Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
    zs :: Word8
zs = Word16 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
shiftR Word16
cp Int
12)


-- 000uuuzzzzzzyyyyyyxxxxxx -> 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx

encodeOne_fourbyte :: Word32 -> [Word8]
encodeOne_fourbyte :: Word32 -> [Word8]
encodeOne_fourbyte Word32
cp = [Word8
0xF0Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
us, Word8
0x80Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
zs, Word8
0x80Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
ys, Word8
0x80Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.Word8
xs]
    where
    xs, ys, zs, us :: Word8
    xs :: Word8
xs = (Word32 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word32
cp) Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
    ys :: Word8
ys = (Word32 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftR Word32
cp Int
6)) Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
    zs :: Word8
zs = (Word32 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftR Word32
cp Int
12)) Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
    us :: Word8
us = Word32 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftR Word32
cp Int
18)



-- - Decoding -

-- The decoding is a bit more involved. The byte sequence could contain all
-- sorts of corruptions. The user must be able to either notice or ignore these
-- errors.

-- I will first look at the decoding of a single character. The process
-- consumes a certain number of bytes from the input. It returns the
-- remaining input and either an error and the index of its occurance in the
-- byte sequence or the decoded character.

data Error

-- The first byte in a sequence starts with either zero, two, three, or four
-- ones and one zero to indicate the length of the sequence. If it doesn't,
-- it is invalid. It is dropped and the next byte interpreted as the start
-- of a new sequence.

    = InvalidFirstByte

-- All bytes in the sequence except the first match the bit pattern 10xxxxxx.
-- If one doesn't, it is invalid. The sequence up to that point is dropped
-- and the "invalid" byte interpreted as the start of a new sequence. The error
-- includes the length of the partial sequence and the number of expected bytes.

    | InvalidLaterByte Int      -- the byte at relative index n was invalid

-- If a sequence ends prematurely, it has been truncated. It dropped and
-- decoding stops. The error reports the actual and expected lengths of the
-- sequence.

    | Truncated Int Int         -- only n of m expected bytes were present

-- Some sequences would represent code points which would be encoded as a
-- shorter sequence by a conformant encoder. Such non-shortest sequences are
-- considered erroneous and dropped. The error reports the actual and
-- expected number of bytes used.

    | NonShortest Int Int       -- n instead of m bytes were used

-- Unicode code points are in the range of [0..0x10FFFF]. Any values outside
-- of those bounds are simply invalid.

    | ValueOutOfBounds

-- There is no such thing as "surrogate pairs" any more in UTF-8. The
-- corresponding code points now form illegal byte sequences.

    | Surrogate
      deriving (Int -> Error -> ShowS
[Error] -> ShowS
Error -> [Char]
(Int -> Error -> ShowS)
-> (Error -> [Char]) -> ([Error] -> ShowS) -> Show Error
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Error -> ShowS
showsPrec :: Int -> Error -> ShowS
$cshow :: Error -> [Char]
show :: Error -> [Char]
$cshowList :: [Error] -> ShowS
showList :: [Error] -> ShowS
Show, Error -> Error -> Bool
(Error -> Error -> Bool) -> (Error -> Error -> Bool) -> Eq Error
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Error -> Error -> Bool
== :: Error -> Error -> Bool
$c/= :: Error -> Error -> Bool
/= :: Error -> Error -> Bool
Eq)


-- Second, third, and fourth bytes share the common requirement to start
-- with the bit sequence 10. So, here's the function to check that property.

first_bits_not_10 :: Word8 -> Bool
first_bits_not_10 :: Word8 -> Bool
first_bits_not_10 Word8
b
    | (Word8
bWord8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&.Word8
0xC0) Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
0x80  = Bool
True
    | Bool
otherwise           = Bool
False


-- Erm, OK, the single-character decoding function's return type is a bit
-- longish. It is a tripel:

--  - The first component contains the decoded character or an error
--    if the byte sequence was erroneous.
--  - The second component contains the number of bytes that were consumed
--    from the input.
--  - The third component contains the remaining bytes of input.

decodeOne :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne bs :: [Word8]
bs@(Word8
b1:[Word8]
rest)
    | Word8
b1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80   = [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_onebyte [Word8]
bs
    | Word8
b1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xC0   = (Error -> Either Error Char
forall a b. a -> Either a b
Left Error
InvalidFirstByte, Int
1, [Word8]
rest)
    | Word8
b1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xE0   = [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_twobyte [Word8]
bs
    | Word8
b1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xF0   = [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_threebyte [Word8]
bs
    | Word8
b1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xF5   = [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_fourbyte [Word8]
bs
    | Bool
otherwise   = (Error -> Either Error Char
forall a b. a -> Either a b
Left Error
ValueOutOfBounds, Int
1, [Word8]
rest)
decodeOne [] = [Char] -> (Either Error Char, Int, [Word8])
forall a. HasCallStack => [Char] -> a
error [Char]
"UTF8.decodeOne: No input"


-- 0xxxxxxx -> 000000000xxxxxxx

decodeOne_onebyte :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_onebyte :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_onebyte (Word8
b:[Word8]
bs) = (Char -> Either Error Char
forall a b. b -> Either a b
Right (Word8 -> Char
forall a. Integral a => a -> Char
cpToChar Word8
b), Int
1, [Word8]
bs)
decodeOne_onebyte[] = [Char] -> (Either Error Char, Int, [Word8])
forall a. HasCallStack => [Char] -> a
error [Char]
"UTF8.decodeOne_onebyte: No input (can't happen)"

cpToChar :: Integral a => a -> Char
cpToChar :: forall a. Integral a => a -> Char
cpToChar = Int -> Char
chr (Int -> Char) -> (a -> Int) -> a -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral


-- 110yyyyy 10xxxxxx -> 00000yyyyyxxxxxx

decodeOne_twobyte :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_twobyte :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_twobyte (Word8
_:[])
    = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
Truncated Int
1 Int
2), Int
1, [])
decodeOne_twobyte (Word8
b1:Word8
b2:[Word8]
bs)
    | Word8
b1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xC2            = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
NonShortest Int
2 Int
1), Int
2, [Word8]
bs)
    | Word8 -> Bool
first_bits_not_10 Word8
b2 = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Error
InvalidLaterByte Int
1), Int
1, (Word8
b2Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
:[Word8]
bs))
    | Bool
otherwise            = (Char -> Either Error Char
forall a b. b -> Either a b
Right (Word32 -> Char
forall a. Integral a => a -> Char
cpToChar Word32
result), Int
2, [Word8]
bs)
    where
    xs, ys, result :: Word32
    xs :: Word32
xs = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b2Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&.Word8
0x3F)
    ys :: Word32
ys = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b1Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&.Word8
0x1F)
    result :: Word32
result = Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftL Word32
ys Int
6 Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. Word32
xs
decodeOne_twobyte[] = [Char] -> (Either Error Char, Int, [Word8])
forall a. HasCallStack => [Char] -> a
error [Char]
"UTF8.decodeOne_twobyte: No input (can't happen)"


-- 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyyyyxxxxxx

decodeOne_threebyte :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_threebyte :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_threebyte (Word8
_:[])   = Int -> (Either Error Char, Int, [Word8])
threebyte_truncated Int
1
decodeOne_threebyte (Word8
_:Word8
_:[]) = Int -> (Either Error Char, Int, [Word8])
threebyte_truncated Int
2
decodeOne_threebyte bs :: [Word8]
bs@(Word8
b1:Word8
b2:Word8
b3:[Word8]
rest)
    | Word8 -> Bool
first_bits_not_10 Word8
b2
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Error
InvalidLaterByte Int
1), Int
1, Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
drop Int
1 [Word8]
bs)
    | Word8 -> Bool
first_bits_not_10 Word8
b3
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Error
InvalidLaterByte Int
2), Int
2, Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
drop Int
2 [Word8]
bs)
    | Word32
result Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x0080
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
NonShortest Int
3 Int
1), Int
3, [Word8]
rest)
    | Word32
result Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x0800
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
NonShortest Int
3 Int
2), Int
3, [Word8]
rest)
    | Word32
result Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word32
0xD800 Bool -> Bool -> Bool
&& Word32
result Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xE000
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left Error
Surrogate, Int
3, [Word8]
rest)
    | Bool
otherwise
        = (Char -> Either Error Char
forall a b. b -> Either a b
Right (Word32 -> Char
forall a. Integral a => a -> Char
cpToChar Word32
result), Int
3, [Word8]
rest)
    where
    xs, ys, zs, result :: Word32
    xs :: Word32
xs = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b3Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&.Word8
0x3F)
    ys :: Word32
ys = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b2Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&.Word8
0x3F)
    zs :: Word32
zs = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b1Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&.Word8
0x0F)
    result :: Word32
result = Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftL Word32
zs Int
12 Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftL Word32
ys Int
6 Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. Word32
xs
decodeOne_threebyte[]
 = [Char] -> (Either Error Char, Int, [Word8])
forall a. HasCallStack => [Char] -> a
error [Char]
"UTF8.decodeOne_threebyte: No input (can't happen)"

threebyte_truncated :: Int -> (Either Error Char, Int, [Word8])
threebyte_truncated :: Int -> (Either Error Char, Int, [Word8])
threebyte_truncated Int
n = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
Truncated Int
n Int
3), Int
n, [])


-- 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx -> 000uuuzzzzzzyyyyyyxxxxxx

decodeOne_fourbyte :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_fourbyte :: [Word8] -> (Either Error Char, Int, [Word8])
decodeOne_fourbyte (Word8
_:[])     = Int -> (Either Error Char, Int, [Word8])
fourbyte_truncated Int
1
decodeOne_fourbyte (Word8
_:Word8
_:[])   = Int -> (Either Error Char, Int, [Word8])
fourbyte_truncated Int
2
decodeOne_fourbyte (Word8
_:Word8
_:Word8
_:[]) = Int -> (Either Error Char, Int, [Word8])
fourbyte_truncated Int
3
decodeOne_fourbyte bs :: [Word8]
bs@(Word8
b1:Word8
b2:Word8
b3:Word8
b4:[Word8]
rest)
    | Word8 -> Bool
first_bits_not_10 Word8
b2
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Error
InvalidLaterByte Int
1), Int
1, Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
drop Int
1 [Word8]
bs)
    | Word8 -> Bool
first_bits_not_10 Word8
b3
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Error
InvalidLaterByte Int
2), Int
2, Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
drop Int
2 [Word8]
bs)
    | Word8 -> Bool
first_bits_not_10 Word8
b4
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Error
InvalidLaterByte Int
3), Int
3, Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
drop Int
3 [Word8]
bs)
    | Word32
result Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x0080
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
NonShortest Int
4 Int
1), Int
4, [Word8]
rest)
    | Word32
result Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x0800
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
NonShortest Int
4 Int
2), Int
4, [Word8]
rest)
    | Word32
result Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x10000
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
NonShortest Int
4 Int
3), Int
4, [Word8]
rest)
    | Word32
result Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
> Word32
0x10FFFF
        = (Error -> Either Error Char
forall a b. a -> Either a b
Left Error
ValueOutOfBounds, Int
4, [Word8]
rest)
    | Bool
otherwise
        = (Char -> Either Error Char
forall a b. b -> Either a b
Right (Word32 -> Char
forall a. Integral a => a -> Char
cpToChar Word32
result), Int
4, [Word8]
rest)
    where
    xs, ys, zs, us, result :: Word32
    xs :: Word32
xs = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b4 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F)
    ys :: Word32
ys = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F)
    zs :: Word32
zs = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F)
    us :: Word32
us = Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
b1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x07)
    result :: Word32
result = Word32
xs Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftL Word32
ys Int
6 Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftL Word32
zs Int
12 Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
shiftL Word32
us Int
18
decodeOne_fourbyte[]
 = [Char] -> (Either Error Char, Int, [Word8])
forall a. HasCallStack => [Char] -> a
error [Char]
"UTF8.decodeOne_fourbyte: No input (can't happen)"

fourbyte_truncated :: Int -> (Either Error Char, Int, [Word8])
fourbyte_truncated :: Int -> (Either Error Char, Int, [Word8])
fourbyte_truncated Int
n = (Error -> Either Error Char
forall a b. a -> Either a b
Left (Int -> Int -> Error
Truncated Int
n Int
4), Int
n, [])


-- The decoder examines all input, recording decoded characters as well as
-- error-index pairs along the way.

decode :: [Word8] -> ([Char], [(Error,Int)])
decode :: [Word8] -> ([Char], [(Error, Int)])
decode = ([(Error, Int)], [Char]) -> ([Char], [(Error, Int)])
forall a b. (a, b) -> (b, a)
swap (([(Error, Int)], [Char]) -> ([Char], [(Error, Int)]))
-> ([Word8] -> ([(Error, Int)], [Char]))
-> [Word8]
-> ([Char], [(Error, Int)])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Either (Error, Int) Char] -> ([(Error, Int)], [Char])
forall a b. [Either a b] -> ([a], [b])
partitionEither ([Either (Error, Int) Char] -> ([(Error, Int)], [Char]))
-> ([Word8] -> [Either (Error, Int) Char])
-> [Word8]
-> ([(Error, Int)], [Char])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Word8] -> [Either (Error, Int) Char]
decodeEmbedErrors

decodeEmbedErrors :: [Word8] -> [Either (Error,Int) Char]
decodeEmbedErrors :: [Word8] -> [Either (Error, Int) Char]
decodeEmbedErrors =
   ((Int, [Word8])
 -> Maybe (Either (Error, Int) Char, (Int, [Word8])))
-> (Int, [Word8]) -> [Either (Error, Int) Char]
forall b a. (b -> Maybe (a, b)) -> b -> [a]
unfoldr (\(Int
pos,[Word8]
xs) ->
       Bool
-> (Either (Error, Int) Char, (Int, [Word8]))
-> Maybe (Either (Error, Int) Char, (Int, [Word8]))
forall a. Bool -> a -> Maybe a
toMaybe
          (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ [Word8] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [Word8]
xs)
          (let (Either Error Char
c,Int
n,[Word8]
rest) = [Word8] -> (Either Error Char, Int, [Word8])
decodeOne [Word8]
xs
           in  ((Error -> Either (Error, Int) Char)
-> (Char -> Either (Error, Int) Char)
-> Either Error Char
-> Either (Error, Int) Char
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (\Error
err -> (Error, Int) -> Either (Error, Int) Char
forall a b. a -> Either a b
Left (Error
err,Int
pos)) Char -> Either (Error, Int) Char
forall a b. b -> Either a b
Right Either Error Char
c,
                (Int
posInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
n,[Word8]
rest)))) ((Int, [Word8]) -> [Either (Error, Int) Char])
-> ([Word8] -> (Int, [Word8]))
-> [Word8]
-> [Either (Error, Int) Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   (,) Int
0

swap :: (a,b) -> (b,a)
swap :: forall a b. (a, b) -> (b, a)
swap (a
x,b
y) = (b
y,a
x)
{-# INLINE swap #-}

partitionEither :: [Either a b] -> ([a], [b])
partitionEither :: forall a b. [Either a b] -> ([a], [b])
partitionEither =
   (Either a b -> ([a], [b]) -> ([a], [b]))
-> ([a], [b]) -> [Either a b] -> ([a], [b])
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\Either a b
x ~([a]
ls,[b]
rs) -> (a -> ([a], [b])) -> (b -> ([a], [b])) -> Either a b -> ([a], [b])
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (\a
l -> (a
la -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
ls,[b]
rs)) (\b
r -> ([a]
ls,b
rb -> [b] -> [b]
forall a. a -> [a] -> [a]
:[b]
rs)) Either a b
x) ([],[])
{-# INLINE partitionEither #-}

toMaybe :: Bool -> a -> Maybe a
toMaybe :: forall a. Bool -> a -> Maybe a
toMaybe Bool
False a
_ = Maybe a
forall a. Maybe a
Nothing
toMaybe Bool
True  a
x = a -> Maybe a
forall a. a -> Maybe a
Just a
x
{-# INLINE toMaybe #-}

-- ------------------------------------------------------------