hxt-unicode-9.0.2.4: Unicode en-/decoding functions for utf8, iso-latin-* and other encodings

Copyright	Copyright (C) 2010- Uwe Schmidt
License	MIT
Maintainer	Uwe Schmidt (uwe@fh-wedel.de)
Stability	stable
Portability	portable
Safe Haskell	Safe-Inferred
Language	Haskell98

Data.String.Unicode

Contents

Unicode Type declarations

Description

Unicode and UTF-8 Conversion Functions

Synopsis

type Unicode = Char
type UString = [Unicode]
type UTF8Char = Char
type UTF8String = String
type UStringWithErrors = [Either String Char]
type DecodingFct = String -> (UString, [String])
type DecodingFctEmbedErrors = String -> UStringWithErrors
utf8ToUnicode :: DecodingFct
utf8ToUnicodeEmbedErrors :: DecodingFctEmbedErrors
latin1ToUnicode :: String -> UString
ucs2ToUnicode :: String -> UString
ucs2BigEndianToUnicode :: String -> UString
ucs2LittleEndianToUnicode :: String -> UString
utf16beToUnicode :: String -> UString
utf16leToUnicode :: String -> UString
unicodeCharToUtf8 :: Unicode -> UTF8String
unicodeToUtf8 :: UString -> UTF8String
unicodeToXmlEntity :: UString -> String
unicodeToLatin1 :: UString -> String
unicodeRemoveNoneAscii :: UString -> String
unicodeRemoveNoneLatin1 :: UString -> String
intToCharRef :: Int -> String
intToCharRefHex :: Int -> String
intToHexString :: Int -> String
getDecodingFct :: String -> Maybe DecodingFct
getDecodingFctEmbedErrors :: String -> Maybe DecodingFctEmbedErrors
getOutputEncodingFct :: String -> Maybe (String -> UString)
normalizeNL :: String -> String
guessEncoding :: String -> String
getOutputEncodingFct' :: String -> Maybe (Char -> StringFct)
unicodeCharToUtf8' :: Char -> StringFct
unicodeCharToXmlEntity' :: Char -> StringFct
unicodeCharToLatin1' :: Char -> StringFct

Unicode Type declarations

type Unicode = Char Source #

Unicode is represented as the Char type Precondition for this is the support of Unicode character range in the compiler (e.g. ghc but not hugs)

type UString = [Unicode] Source #

the type for Unicode strings

type UTF8Char = Char Source #

UTF-8 charachters are represented by the Char type

type UTF8String = String Source #

UTF-8 strings are implemented as Haskell strings

type UStringWithErrors = [Either String Char] Source #

type DecodingFct = String -> (UString, [String]) Source #

Decoding function with a pair containing the result string and a list of decoding errors as result

type DecodingFctEmbedErrors = String -> UStringWithErrors Source #

Decoding function where decoding errors are interleaved with decoded characters

utf8ToUnicode :: DecodingFct Source #

UTF-8 to Unicode conversion with deletion of leading byte order mark, as described in XML standard F.1

utf8ToUnicodeEmbedErrors :: DecodingFctEmbedErrors Source #

latin1ToUnicode :: String -> UString Source #

code conversion from latin1 to Unicode

ucs2ToUnicode :: String -> UString Source #

UCS-2 to UTF-8 conversion with byte order mark analysis

ucs2BigEndianToUnicode :: String -> UString Source #

UCS-2 big endian to Unicode conversion

ucs2LittleEndianToUnicode :: String -> UString Source #

UCS-2 little endian to Unicode conversion

utf16beToUnicode :: String -> UString Source #

UTF-16 big endian to UTF-8 conversion with removal of byte order mark

utf16leToUnicode :: String -> UString Source #

UTF-16 little endian to UTF-8 conversion with removal of byte order mark

unicodeCharToUtf8 :: Unicode -> UTF8String Source #

conversion from Unicode (Char) to a UTF8 encoded string.

unicodeToUtf8 :: UString -> UTF8String Source #

conversion from Unicode strings (UString) to UTF8 encoded strings.

unicodeToXmlEntity :: UString -> String Source #

substitute all Unicode characters, that are not legal 1-byte UTF-8 XML characters by a character reference.

This function can be used to translate all text nodes and attribute values into pure ascii.

see also : unicodeToLatin1

unicodeToLatin1 :: UString -> String Source #

substitute all Unicode characters, that are not legal latin1 UTF-8 XML characters by a character reference.

This function can be used to translate all text nodes and attribute values into ISO latin1.

see also : unicodeToXmlEntity

unicodeRemoveNoneAscii :: UString -> String Source #

removes all non ascii chars, may be used to transform a document into a pure ascii representation by removing all non ascii chars from tag and attibute names

see also : unicodeRemoveNoneLatin1, unicodeToXmlEntity

unicodeRemoveNoneLatin1 :: UString -> String Source #

removes all non latin1 chars, may be used to transform a document into a pure ascii representation by removing all non ascii chars from tag and attibute names

see also : unicodeRemoveNoneAscii, unicodeToLatin1

intToCharRef :: Int -> String Source #

convert an Unicode into a XML character reference.

see also : intToCharRefHex

intToCharRefHex :: Int -> String Source #

convert an Unicode into a XML hexadecimal character reference.

see also: intToCharRef

intToHexString :: Int -> String Source #

getDecodingFct :: String -> Maybe DecodingFct Source #

the lookup function for selecting the decoding function

getDecodingFctEmbedErrors :: String -> Maybe DecodingFctEmbedErrors Source #

the lookup function for selecting the decoding function

getOutputEncodingFct :: String -> Maybe (String -> UString) Source #

the lookup function for selecting the encoding function

normalizeNL :: String -> String Source #

White Space (XML Standard 2.3) and end of line handling (2.11)

#x0D and #x0D#x0A are mapped to #x0A

guessEncoding :: String -> String Source #

getOutputEncodingFct' :: String -> Maybe (Char -> StringFct) Source #

the lookup function for selecting the encoding function

unicodeCharToUtf8' :: Char -> StringFct Source #

conversion from Unicode (Char) to a UTF8 encoded string.

unicodeCharToXmlEntity' :: Char -> StringFct Source #

substitute all Unicode characters, that are not legal 1-byte UTF-8 XML characters by a character reference.

unicodeCharToLatin1' :: Char -> StringFct Source #

substitute all Unicode characters, that are not legal latin1 UTF-8 XML characters by a character reference.