I’ve spent a fair amount of time this evening trying to implement what @bodigrim wants, and while the additions to base didn’t end up being too egregious, moving instance Binary Text into binary is turning out quite ghastly. Here is an incomplete diff of just Data.Binary.Class.hs; it doesn’t compile yet because there are still more things to copy over.
It's long.
diff --git a/src/Data/Binary/Class.hs b/src/Data/Binary/Class.hs
index 0b79743..e91e533 100644
--- a/src/Data/Binary/Class.hs
+++ b/src/Data/Binary/Class.hs
@@ -4,6 +4,11 @@
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE Trustworthy #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE UnboxedTuples #-}
#if __GLASGOW_HASKELL__ >= 706
{-# LANGUAGE PolyKinds #-}
@@ -127,6 +132,18 @@ import GHC.Fingerprint
import Data.Version (Version(..))
+import Control.Monad.ST.Unsafe (unsafeSTToIO)
+import qualified Data.ByteString.Internal as B
+import qualified Data.ByteString.Short.Internal as SBS
+import Data.Array.Byte (ByteArray(..), MutableByteArray(..))
+import Data.Char (chr)
+import Data.Text.Type (Text(..))
+import GHC.Exts (Addr#, Int(..), byteArrayContents#, copyByteArray#, indexWord8OffAddr#, newByteArray#, newPinnedByteArray#, unsafeCoerce#, unsafeFreezeByteArray#)
+import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
+import GHC.IO (unsafeDupablePerformIO)
+import GHC.ST (ST(..), runST)
+import GHC.Word (Word8(..))
+
------------------------------------------------------------------------
-- Factored into two classes because this makes GHC optimize the
@@ -855,6 +872,288 @@ instance Binary a => Binary (NE.NonEmpty a) where
put = put . NE.toList
#endif
+------------------------------------------------------------------------
+-- Text
+
+instance Binary Text where
+ put t = put (encodeUtf8 t)
+ get = do
+ bs <- get
+ case decodeUtf8' bs of
+ Left exn -> fail (show exn)
+ Right a -> return a
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+--
+-- If the input contains any invalid UTF-8 data, the relevant
+-- exception will be returned, otherwise the decoded text.
+decodeUtf8' :: B.ByteString -> Either UnicodeException Text
+decodeUtf8' = unsafeDupablePerformIO . try . evaluate . decodeUtf8With strictDecode
+{-# INLINE decodeUtf8' #-}
+
+-- | State of decoding a 'ByteString' in UTF-8.
+-- Enables incremental decoding ('validateUtf8Chunk', 'validateUtf8More',
+-- 'decodeUtf8Chunk', 'decodeUtf8More').
+
+-- Internal invariant:
+-- the first component is the initial state if and only if
+-- the second component is empty.
+--
+-- @
+-- 'utf9CodePointState' s = 'utf8StartState'
+-- <=>
+-- 'partialUtf8CodePoint' s = 'PartialUtf8CodePoint' 0
+-- @
+data Utf8State = Utf8State
+ { -- | State of the UTF-8 state machine.
+ utf8CodePointState :: {-# UNPACK #-} !DecoderState
+ -- | Bytes of the currently incomplete code point (if any).
+ , partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint
+ }
+ deriving (Eq, Show)
+
+-- | An exception type for representing Unicode encoding errors.
+data UnicodeException =
+ DecodeError String (Maybe Word8)
+ -- ^ Could not decode a byte sequence because it was invalid under
+ -- the given encoding, or ran out of input in mid-decode.
+ | EncodeError String (Maybe Char)
+ -- ^ Tried to encode a character that could not be represented
+ -- under the given encoding, or ran out of input in mid-encode.
+
+type OnDecodeError = String -> Maybe Word8 -> Maybe Char
+
+-- | A delayed representation of strict 'Text'.
+data StrictBuilder = StrictBuilder
+ { sbLength :: {-# UNPACK #-} !Int
+ , sbWrite :: forall s. MutableByteArray s -> Int -> ST s ()
+ }
+
+-- | Use 'StrictBuilder' to build 'Text'.
+toText :: StrictBuilder -> Text
+toText (StrictBuilder 0 _) = mempty
+toText (StrictBuilder n write) = runST (do
+ dst <- new n
+ write dst 0
+ arr <- unsafeFreeze dst
+ pure (Text arr 0 n))
+
+-- | Copy a 'ByteString'.
+--
+-- Unsafe: This may not be valid UTF-8 text.
+unsafeFromByteString :: ByteString -> StrictBuilder
+unsafeFromByteString bs =
+ StrictBuilder (B.length bs) (\dst ofs -> copyFromByteString dst ofs bs)
+
+decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
+decodeUtf8With onErr = decodeUtf8With1 onErr invalidUtf8Msg
+
+-- | Helper for 'Data.Text.Encoding.decodeUtf8With'.
+--
+
+-- This could be shorter by calling 'decodeUtf8With2' directly, but we make the
+-- first call validateUtf8Chunk directly to return even faster in successful
+-- cases.
+decodeUtf8With1 :: OnDecodeError -> String -> B.ByteString -> Text
+decodeUtf8With1 onErr msg bs = validateUtf8ChunkFrom 0 bs $ \len ms -> case ms of
+ Just s
+ | len == B.length bs ->
+ let !(SBS.SBS arr) = SBS.toShort bs in
+ Text (ByteArray arr) 0 len
+ | otherwise -> toText $
+ unsafeFromByteString (B.take len bs) <> skipIncomplete onErr msg s
+ Nothing ->
+ let (builder, _, s) = decodeUtf8With2 onErr msg startUtf8State (B.drop (len + 1) bs) in
+ toText $
+ unsafeFromByteString (B.take len bs) <>
+ handleUtf8Error onErr msg (B.index bs len) <>
+ builder <>
+ skipIncomplete onErr msg s
+
+-- | Helper for 'Data.Text.Encoding.decodeUtf8With',
+-- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy
+-- 'Data.Text.Lazy.Encoding.decodeUtf8With',
+-- which use an 'OnDecodeError' to process bad bytes.
+--
+-- See 'decodeUtf8Chunk' for a more flexible alternative.
+--
+-- @since 2.0.2
+decodeUtf8With2 :: OnDecodeError -> String -> Utf8State -> B.ByteString -> (StrictBuilder, B.ByteString, Utf8State)
+decodeUtf8With2 onErr msg s0 bs = loop s0 0 mempty
+ where
+ loop s i !builder =
+ let nonEmptyPrefix len = builder
+ <> utf8StateToStrictBuilder s
+ <> unsafeFromByteString (B.take len (B.drop i bs))
+ in validateUtf8MoreCont s (B.drop i bs) $ \len ms -> case ms of
+ Nothing ->
+ if len < 0
+ then
+ -- If the first byte cannot complete the partial code point in s,
+ -- retry from startUtf8State.
+ let builder' = builder <> skipIncomplete onErr msg s
+ -- Note: loop is strict on builder, so if onErr raises an error it will
+ -- be forced here, short-circuiting the loop as desired.
+ in loop startUtf8State i builder'
+ else
+ let builder' = nonEmptyPrefix len
+ <> handleUtf8Error onErr msg (B.index bs (i + len))
+ in loop startUtf8State (i + len + 1) builder'
+ Just s' ->
+ let builder' = if len <= 0 then builder else nonEmptyPrefix len
+ undecoded = if B.length bs >= partUtf8Len (partialUtf8CodePoint s')
+ then B.drop (i + len) bs -- Reuse bs if possible
+ else partUtf8ToByteString (partialUtf8CodePoint s')
+ in (builder', undecoded, s')
+
+invalidUtf8Msg :: String
+invalidUtf8Msg = "Data.Binary.Class: Invalid UTF-8 stream"
+
+-- | Encode text using UTF-8 encoding.
+encodeUtf8 :: Text -> B.ByteString
+encodeUtf8 (Text arr off len)
+ | len == 0 = B.empty
+ -- It would be easier to use Data.ByteString.Short.fromShort and slice later,
+ -- but this is undesirable when len is significantly smaller than length arr.
+ | otherwise = unsafeDupablePerformIO $ do
+ marr@(MutableByteArray mba) <- unsafeSTToIO $ newPinned len
+ unsafeSTToIO $ copyI len marr 0 arr off
+ let fp = ForeignPtr (byteArrayContents# (unsafeCoerce# mba))
+ (PlainPtr mba)
+ pure $ B.fromForeignPtr fp 0 len
+
+-- | Copy some elements of an immutable array.
+-- This code is duplicated from Data.Text.Type for (mumble mumble) reasons.
+copyI :: Int -- ^ Count
+ -> MutableByteArray s -- ^ Destination
+ -> Int -- ^ Destination offset
+ -> ByteArray -- ^ Source
+ -> Int -- ^ Source offset
+ -> ST s ()
+copyI (I# count#) (MutableByteArray dst#) (I# dstOff#) (ByteArray src#) (I# srcOff#) =
+ ST $ \s1# ->
+ case copyByteArray# src# srcOff# dst# dstOff# count# s1# of
+ s2# -> (# s2#, () #)
+{-# INLINE copyI #-}
+
+-- | Create an uninitialized mutable array.
+-- This code is duplicated from Data.Text.Type for (mumble mumble) reasons.
+new :: Int -> ST s (MutableByteArray s)
+new (I# len#) = ST $ \s1# ->
+ case newByteArray# len# s1# of
+ (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #)
+{-# INLINE new #-}
+
+-- | Create an uninitialized mutable pinned array.
+newPinned :: forall s. Int -> ST s (MutableByteArray s)
+newPinned (I# len#) =
+ ST $ \s1# ->
+ case newPinnedByteArray# len# s1# of
+ (# s2#, marr# #) -> (# s2#, MutableByteArray marr# #)
+{-# INLINE newPinned #-}
+
+-- | Freeze a mutable array. Do not mutate the 'MutableByteArray' afterwards!
+-- This code is duplicated from Data.Text.Type for (mumble mumble) reasons.
+unsafeFreeze :: MutableByteArray s -> ST s ByteArray
+unsafeFreeze (MutableByteArray marr) = ST $ \s1# ->
+ case unsafeFreezeByteArray# marr s1# of
+ (# s2#, ba# #) -> (# s2#, ByteArray ba# #)
+{-# INLINE unsafeFreeze #-}
+
+
+-- | Prefix of a UTF-8 code point encoded in 4 bytes,
+-- possibly empty.
+--
+-- - The most significant byte contains the number of bytes,
+-- between 0 and 3.
+-- - The remaining bytes hold the incomplete code point.
+-- - Unused bytes must be 0.
+--
+-- All of operations available on it are the functions below.
+-- The constructor should never be used outside of those.
+--
+-- @since 2.0.2
+newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32
+ deriving (Eq, Show)
+
+-- | Empty prefix.
+partUtf8Empty :: PartialUtf8CodePoint
+partUtf8Empty = PartialUtf8CodePoint 0
+
+-- | Length of the partial code point, stored in the most significant byte.
+partUtf8Len :: PartialUtf8CodePoint -> Int
+partUtf8Len (PartialUtf8CodePoint w) = fromIntegral $ w `shiftR` 24
+
+word8ToInt :: Word8 -> Int
+word8ToInt = fromIntegral
+
+-------------------------------------------------------------------------------
+-- Naive UTF8 decoder.
+-- See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for the explanation of the state machine.
+
+newtype ByteClass = ByteClass Word8
+
+byteToClass :: Word8 -> ByteClass
+byteToClass n = ByteClass (W8# el#)
+ where
+ !(I# n#) = word8ToInt n
+ el# = indexWord8OffAddr# table# n#
+
+ table# :: Addr#
+ table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"#
+
+newtype DecoderState = DecoderState Word8
+ deriving (Eq, Show)
+
+utf8AcceptState :: DecoderState
+utf8AcceptState = DecoderState 0
+
+utf8RejectState :: DecoderState
+utf8RejectState = DecoderState 12
+
+updateState :: ByteClass -> DecoderState -> DecoderState
+updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#)
+ where
+ !(I# n#) = word8ToInt (c + s)
+ el# = indexWord8OffAddr# table# n#
+
+ table# :: Addr#
+ table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"#
+
+updateDecoderState :: Word8 -> DecoderState -> DecoderState
+updateDecoderState b s = updateState (byteToClass b) s
+
+newtype CodePoint = CodePoint Int
+
+-- | @since 2.0
+data DecoderResult
+ = Accept !Char
+ | Incomplete !DecoderState !CodePoint
+ | Reject
+
+-- | @since 2.0
+utf8DecodeStart :: Word8 -> DecoderResult
+utf8DecodeStart !w
+ | st == utf8AcceptState = Accept (chr (word8ToInt w))
+ | st == utf8RejectState = Reject
+ | otherwise = Incomplete st (CodePoint cp)
+ where
+ cl@(ByteClass cl') = byteToClass w
+ st = updateState cl utf8AcceptState
+ cp = word8ToInt $ (0xff `unsafeShiftR` word8ToInt cl') .&. w
+
+-- | @since 2.0
+utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult
+utf8DecodeContinue !w !st (CodePoint !cp)
+ | st' == utf8AcceptState = Accept (chr cp')
+ | st' == utf8RejectState = Reject
+ | otherwise = Incomplete st' (CodePoint cp')
+ where
+ cl = byteToClass w
+ st' = updateState cl st
+ cp' = (cp `shiftL` 6) .|. word8ToInt (w .&. 0x3f)
+
+
------------------------------------------------------------------------
-- Typeable/Reflection
Is this really a good idea?