{-# LANGUAGE DeriveDataTypeable #-}

-- ------------------------------------------------------------

{- |
   Module     : Data.Atom
   Copyright  : Copyright (C) 2008 Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe\@fh-wedel.de)
   Stability  : experimental
   Portability: non-portable

   Unique Atoms generated from Strings and
   managed as flyweights

   Data.Atom can be used for caching and storage optimization
   of frequently used strings. An @Atom@ is constructed from a @String@.
   For two equal strings the identical atom is returned.

   This module can be used for optimizing memory usage when working with
   strings or names. Many applications use data types like
   @Map String SomeAttribute@ where a rather fixed set of keys is used.
   Especially XML applications often work with a limited set of element and attribute names.
   For these applications it becomes more memory efficient when working with types like
   @Map Atom SomeAttribute@ and convert the keys into atoms before operating
   on such a map.

   Internally this module manages a map of atoms. The atoms are internally represented
   by @ByteString@s. When creating a new atom from a string, the string is first converted
   into an UTF8 @Word8@ sequence, which is packed into a @ByteString@. This @ByteString@ is looked
   up in the table of atoms. If it is already there, the value in the map is used as atom, else
   the new @ByteString@ is inserted into the map.

   Of course the implementation of this name cache uses @unsavePerformIO@.
   The global cache is managed by ue of an @IORef@ and atomicModifyIORef.

   The following laws hold for atoms

   >
   > s  ==       t => newAtom s  ==       newAtom t
   > s `compare` t => newAtom s `compare` newAtom t
   > show . newAtom == id

   Equality test for @Atom@s runs in /O(1)/, it is just a pointer comparison.
   The @Ord@ comparisons have the same runtime like the @ByteString@ comparisons.
   Internally there is an UTF8 comparison, but UTF8 encoding preserves the total order.

   Warning: The internal cache never shrinks during execution. So using it in a
   undisciplined way can lead to memory leaks.
-}

-----------------------------------------------------------------------------

module Data.Atom (
   -- * Atom objects
   Atom,                -- instance (Eq, Ord, Read, Show)
   newAtom,             -- :: String -> Atom
   share                -- :: String -> String
 ) where

import           Control.DeepSeq

import           Data.ByteString          (ByteString, pack, unpack)
import           Data.ByteString.Internal (c2w, toForeignPtr, w2c)
import           Data.IORef
import qualified Data.Map                 as M
import           Data.String.Unicode      (unicodeToUtf8)
import           Data.String.UTF8Decoding (decodeUtf8)
import           Data.Typeable

import           System.IO.Unsafe         (unsafePerformIO)

-- ------------------------------------------------------------

type Atoms      = M.Map ByteString ByteString

newtype Atom    = A { Atom -> ByteString
bs :: ByteString }
                  deriving (Typeable)

-- ------------------------------------------------------------

-- | the internal cache for the strings

theAtoms        :: IORef Atoms
theAtoms :: IORef Atoms
theAtoms        = IO (IORef Atoms) -> IORef Atoms
forall a. IO a -> a
unsafePerformIO (Atoms -> IO (IORef Atoms)
forall a. a -> IO (IORef a)
newIORef Atoms
forall k a. Map k a
M.empty)
{-# NOINLINE theAtoms #-}

-- | insert a bytestring into the atom cache

insertAtom      :: ByteString -> Atoms -> (Atoms, Atom)
insertAtom :: ByteString -> Atoms -> (Atoms, Atom)
insertAtom ByteString
s Atoms
m  = (Atoms, Atom)
-> (ByteString -> (Atoms, Atom))
-> Maybe ByteString
-> (Atoms, Atom)
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (ByteString -> ByteString -> Atoms -> Atoms
forall k a. Ord k => k -> a -> Map k a -> Map k a
M.insert ByteString
s ByteString
s Atoms
m, ByteString -> Atom
A ByteString
s)
                        (\ ByteString
s' -> (Atoms
m, ByteString -> Atom
A ByteString
s'))
                  (Maybe ByteString -> (Atoms, Atom))
-> (Atoms -> Maybe ByteString) -> Atoms -> (Atoms, Atom)
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
                  ByteString -> Atoms -> Maybe ByteString
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup ByteString
s (Atoms -> (Atoms, Atom)) -> Atoms -> (Atoms, Atom)
forall a b. (a -> b) -> a -> b
$ Atoms
m

-- | creation of an @Atom@ from a @String@

newAtom         :: String -> Atom
newAtom :: String -> Atom
newAtom         = IO Atom -> Atom
forall a. IO a -> a
unsafePerformIO (IO Atom -> Atom) -> (String -> IO Atom) -> String -> Atom
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> IO Atom
newAtom'
{-# NOINLINE newAtom #-}

-- | The internal operation running in the IO monad
newAtom'        :: String -> IO Atom
newAtom' :: String -> IO Atom
newAtom' String
s      = do
                  -- putStrLn "insert atom into cache"
                  Atom
res <- IORef Atoms -> (Atoms -> (Atoms, Atom)) -> IO Atom
forall a b. IORef a -> (a -> (a, b)) -> IO b
atomicModifyIORef IORef Atoms
theAtoms Atoms -> (Atoms, Atom)
insert
                  -- putStrLn "atom cache updated"
                  Atom -> IO Atom
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return Atom
res
  where
    insert :: Atoms -> (Atoms, Atom)
insert Atoms
m    = let r :: (Atoms, Atom)
r = ByteString -> Atoms -> (Atoms, Atom)
insertAtom ([Word8] -> ByteString
pack([Word8] -> ByteString)
-> (String -> [Word8]) -> String -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Word8) -> String -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Word8
c2w (String -> [Word8]) -> (String -> String) -> String -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
unicodeToUtf8 (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String
s) Atoms
m
                  in
                   (Atoms, Atom) -> Atoms
forall a b. (a, b) -> a
fst (Atoms, Atom)
r Atoms -> (Atoms, Atom) -> (Atoms, Atom)
forall a b. a -> b -> b
`seq` (Atoms, Atom)
r

-- | Insert a @String@ into the atom cache and convert the atom back into a @String@.
--
-- locically @share == id@ holds, but internally equal strings share the same memory.

share           :: String -> String
share :: String -> String
share           = Atom -> String
forall a. Show a => a -> String
show (Atom -> String) -> (String -> Atom) -> String -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Atom
newAtom

instance Eq Atom where
    Atom
a1 == :: Atom -> Atom -> Bool
== Atom
a2    = ForeignPtr Word8
fp1 ForeignPtr Word8 -> ForeignPtr Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== ForeignPtr Word8
fp2
                  where
                  (ForeignPtr Word8
fp1, Int
_, Int
_) = ByteString -> (ForeignPtr Word8, Int, Int)
toForeignPtr (ByteString -> (ForeignPtr Word8, Int, Int))
-> (Atom -> ByteString) -> Atom -> (ForeignPtr Word8, Int, Int)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Atom -> ByteString
bs (Atom -> (ForeignPtr Word8, Int, Int))
-> Atom -> (ForeignPtr Word8, Int, Int)
forall a b. (a -> b) -> a -> b
$ Atom
a1
                  (ForeignPtr Word8
fp2, Int
_, Int
_) = ByteString -> (ForeignPtr Word8, Int, Int)
toForeignPtr (ByteString -> (ForeignPtr Word8, Int, Int))
-> (Atom -> ByteString) -> Atom -> (ForeignPtr Word8, Int, Int)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Atom -> ByteString
bs (Atom -> (ForeignPtr Word8, Int, Int))
-> Atom -> (ForeignPtr Word8, Int, Int)
forall a b. (a -> b) -> a -> b
$ Atom
a2

instance Ord Atom where
    compare :: Atom -> Atom -> Ordering
compare Atom
a1 Atom
a2
                | Atom
a1 Atom -> Atom -> Bool
forall a. Eq a => a -> a -> Bool
== Atom
a2      = Ordering
EQ
                | Bool
otherwise     = ByteString -> ByteString -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Atom -> ByteString
bs Atom
a1) (Atom -> ByteString
bs Atom
a2)

instance Read Atom where
    readsPrec :: Int -> ReadS Atom
readsPrec Int
p String
str = [ (String -> Atom
newAtom String
x, String
y) | (String
x, String
y) <- Int -> ReadS String
forall a. Read a => Int -> ReadS a
readsPrec Int
p String
str ]

instance Show Atom where
    show :: Atom -> String
show        = (String, [String]) -> String
forall a b. (a, b) -> a
fst ((String, [String]) -> String)
-> (Atom -> (String, [String])) -> Atom -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> (String, [String])
decodeUtf8 (String -> (String, [String]))
-> (Atom -> String) -> Atom -> (String, [String])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Char) -> [Word8] -> String
forall a b. (a -> b) -> [a] -> [b]
map Word8 -> Char
w2c ([Word8] -> String) -> (Atom -> [Word8]) -> Atom -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Word8]
unpack (ByteString -> [Word8]) -> (Atom -> ByteString) -> Atom -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Atom -> ByteString
bs
    -- show     = show . toForeignPtr . bs                      -- for debug only

instance NFData Atom where
    rnf :: Atom -> ()
rnf Atom
x = Atom -> () -> ()
forall a b. a -> b -> b
seq Atom
x ()

-----------------------------------------------------------------------------