Copyright | Copyright (C) 2010- Uwe Schmidt |
---|---|
License | MIT |
Maintainer | Uwe Schmidt (uwe@fh-wedel.de) |
Stability | stable |
Portability | portable |
Safe Haskell | Safe-Inferred |
Language | Haskell98 |
Unicode and UTF-8 Conversion Functions
Synopsis
- type Unicode = Char
- type UString = [Unicode]
- type UTF8Char = Char
- type UTF8String = String
- type UStringWithErrors = [Either String Char]
- type DecodingFct = String -> (UString, [String])
- type DecodingFctEmbedErrors = String -> UStringWithErrors
- utf8ToUnicode :: DecodingFct
- utf8ToUnicodeEmbedErrors :: DecodingFctEmbedErrors
- latin1ToUnicode :: String -> UString
- ucs2ToUnicode :: String -> UString
- ucs2BigEndianToUnicode :: String -> UString
- ucs2LittleEndianToUnicode :: String -> UString
- utf16beToUnicode :: String -> UString
- utf16leToUnicode :: String -> UString
- unicodeCharToUtf8 :: Unicode -> UTF8String
- unicodeToUtf8 :: UString -> UTF8String
- unicodeToXmlEntity :: UString -> String
- unicodeToLatin1 :: UString -> String
- unicodeRemoveNoneAscii :: UString -> String
- unicodeRemoveNoneLatin1 :: UString -> String
- intToCharRef :: Int -> String
- intToCharRefHex :: Int -> String
- intToHexString :: Int -> String
- getDecodingFct :: String -> Maybe DecodingFct
- getDecodingFctEmbedErrors :: String -> Maybe DecodingFctEmbedErrors
- getOutputEncodingFct :: String -> Maybe (String -> UString)
- normalizeNL :: String -> String
- guessEncoding :: String -> String
- getOutputEncodingFct' :: String -> Maybe (Char -> StringFct)
- unicodeCharToUtf8' :: Char -> StringFct
- unicodeCharToXmlEntity' :: Char -> StringFct
- unicodeCharToLatin1' :: Char -> StringFct
Unicode Type declarations
Unicode is represented as the Char type Precondition for this is the support of Unicode character range in the compiler (e.g. ghc but not hugs)
type UTF8String = String Source #
UTF-8 strings are implemented as Haskell strings
type DecodingFct = String -> (UString, [String]) Source #
Decoding function with a pair containing the result string and a list of decoding errors as result
type DecodingFctEmbedErrors = String -> UStringWithErrors Source #
Decoding function where decoding errors are interleaved with decoded characters
utf8ToUnicode :: DecodingFct Source #
UTF-8 to Unicode conversion with deletion of leading byte order mark, as described in XML standard F.1
latin1ToUnicode :: String -> UString Source #
code conversion from latin1 to Unicode
ucs2ToUnicode :: String -> UString Source #
UCS-2 to UTF-8 conversion with byte order mark analysis
ucs2BigEndianToUnicode :: String -> UString Source #
UCS-2 big endian to Unicode conversion
ucs2LittleEndianToUnicode :: String -> UString Source #
UCS-2 little endian to Unicode conversion
utf16beToUnicode :: String -> UString Source #
UTF-16 big endian to UTF-8 conversion with removal of byte order mark
utf16leToUnicode :: String -> UString Source #
UTF-16 little endian to UTF-8 conversion with removal of byte order mark
unicodeCharToUtf8 :: Unicode -> UTF8String Source #
conversion from Unicode (Char) to a UTF8 encoded string.
unicodeToUtf8 :: UString -> UTF8String Source #
conversion from Unicode strings (UString) to UTF8 encoded strings.
unicodeToXmlEntity :: UString -> String Source #
substitute all Unicode characters, that are not legal 1-byte UTF-8 XML characters by a character reference.
This function can be used to translate all text nodes and attribute values into pure ascii.
see also : unicodeToLatin1
unicodeToLatin1 :: UString -> String Source #
substitute all Unicode characters, that are not legal latin1 UTF-8 XML characters by a character reference.
This function can be used to translate all text nodes and attribute values into ISO latin1.
see also : unicodeToXmlEntity
unicodeRemoveNoneAscii :: UString -> String Source #
removes all non ascii chars, may be used to transform a document into a pure ascii representation by removing all non ascii chars from tag and attibute names
see also : unicodeRemoveNoneLatin1
, unicodeToXmlEntity
unicodeRemoveNoneLatin1 :: UString -> String Source #
removes all non latin1 chars, may be used to transform a document into a pure ascii representation by removing all non ascii chars from tag and attibute names
see also : unicodeRemoveNoneAscii
, unicodeToLatin1
intToCharRef :: Int -> String Source #
convert an Unicode into a XML character reference.
see also : intToCharRefHex
intToCharRefHex :: Int -> String Source #
convert an Unicode into a XML hexadecimal character reference.
see also: intToCharRef
intToHexString :: Int -> String Source #
getDecodingFct :: String -> Maybe DecodingFct Source #
the lookup function for selecting the decoding function
getDecodingFctEmbedErrors :: String -> Maybe DecodingFctEmbedErrors Source #
the lookup function for selecting the decoding function
getOutputEncodingFct :: String -> Maybe (String -> UString) Source #
the lookup function for selecting the encoding function
normalizeNL :: String -> String Source #
White Space (XML Standard 2.3) and end of line handling (2.11)
#x0D and #x0D#x0A are mapped to #x0A
guessEncoding :: String -> String Source #
getOutputEncodingFct' :: String -> Maybe (Char -> StringFct) Source #
the lookup function for selecting the encoding function
unicodeCharToUtf8' :: Char -> StringFct Source #
conversion from Unicode (Char) to a UTF8 encoded string.
unicodeCharToXmlEntity' :: Char -> StringFct Source #
substitute all Unicode characters, that are not legal 1-byte UTF-8 XML characters by a character reference.
unicodeCharToLatin1' :: Char -> StringFct Source #
substitute all Unicode characters, that are not legal latin1 UTF-8 XML characters by a character reference.