Just want to add one datapoint to this discussion. Using the fiwiki dump, awk finishes in 8 seconds, the ordered map (Data.Map.Strict from containers) takes 20 seconds, and the hash map (Data.HashMap.Strict from unordered-containers) takes 19 seconds. This is with absolutely no effort spent on optimizing.
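The program reads lines from stdin, builds a strict histogram of them with a left fold, and prints the ten most frequent lines together with their counts: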
{-# LANGUAGE OverloadedStrings #-}
module Main where

import qualified Data.Map.Strict as M
import qualified Data.Text.Lazy as Lazy
import qualified Data.Text.Lazy.IO as LazyIO
import qualified Data.Text as Strict
import Data.Map (Map)
import Data.Foldable (foldl', for_)
import qualified Data.Text.IO as StrictIO
import Data.List (sortOn)
import Data.Ord (Down(Down))
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HM

-- Line histogram using the ordered map from containers.
orderedHisto :: Lazy.Text -> [(Strict.Text, Int)]
orderedHisto = M.toList . histo
  where
    histo :: Lazy.Text -> Map Strict.Text Int
    histo = foldl' (\acc x -> M.insertWith (+) (Lazy.toStrict x) 1 acc) M.empty . Lazy.lines

-- Line histogram using the hash map from unordered-containers.
unorderedHisto :: Lazy.Text -> [(Strict.Text, Int)]
unorderedHisto = HM.toList . histo
  where
    histo :: Lazy.Text -> HashMap Strict.Text Int
    histo = foldl' (\acc x -> HM.insertWith (+) (Lazy.toStrict x) 1 acc) HM.empty . Lazy.lines

main :: IO ()
main = do
  contents <- LazyIO.getContents
  -- Swap in orderedHisto here to benchmark the containers version instead.
  for_ (take 10 $ sortOn (Down . snd) $ unorderedHisto contents) $ \(key, val) ->
    StrictIO.putStrLn $ key <> " " <> Strict.pack (show val)
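main uses the unordered-containers version; the 20-second containers figure presumably comes from swapping in orderedHisto and rebuilding. The compile step isn't shown here; the timings below are for the compiled binary ./test (optimization flags not stated, but -O2 would be the usual choice).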
$ bzcat ~/temp/realytemp/fiwiki-20220520-pages-articles-multistream.xml.bz2 | head -1000000 | tr ' ' '\n' | time ./test > /dev/null
./test > /dev/null 17.39s user 1.18s system 93% cpu 19.908 total
$ bzcat ~/temp/realytemp/fiwiki-20220520-pages-articles-multistream.xml.bz2 | head -1000000 | tr ' ' '\n' | time awk '{s[$0]++} END{ PROCINFO["sorted_in"]="@val_num_desc"; i=1;for(x in s){if (i++>10)break;print x,s[x]}}' > /dev/null
awk > /dev/null 7.34s user 0.25s system 87% cpu 8.629 total