Optimizing Karatsuba algorithm

I am implementing Karatsuba algorithm as part of an algorithms course.
I tried implementing in both haskell and c++, and it seems c++ is a clear winner by factor of 100~1000x.

My somewhat optimized c++ code:

#include <algorithm>
#include <chrono>
#include <complex>
#include <iostream>
#include <vector>
using namespace std;

typedef complex<double> Complex;
typedef double Real;

/**
 * Operator overloading for printing vectors.
 * @tparam T
 * @param os
 * @param v
 * @return
 */
template <typename T> ostream &operator<<(ostream &os, const vector<T> &v) {
  for (size_t i = 0; i < v.size(); i++) {
    os << v[i] << " ";
  }
  return os;
}

template <typename T> ostream &operator<<(ostream &os, const span<T> &v) {
  for (auto x : v) {
    os << x << " ";
  }
  return os;
}

vector<int> random_int_vector(size_t size) {
  auto result = vector<int>();
  for (size_t i = 0; i < size; i++) {
    result.push_back(rand() % 100);
  }
  return result;
}

vector<Real> random_real_vector(size_t size) {
  auto result = vector<Real>();
  for (size_t i = 0; i < size; i++) {
    result.push_back((double)rand() / (double)RAND_MAX);
  }
  return result;
}

vector<Complex> vector_as_complex(vector<Real> &v) {
  auto result = vector<Complex>(v.size());
  for (size_t i = 0; i < v.size(); i++) {
    result[i] = Complex(v[i], 0);
  }
  return result;
}

template <typename R> vector<R> poly_add(vector<R> &a, vector<R> &b) {
  auto res = vector<R>(max(a.size(), b.size()));
  for (size_t i = 0; i < a.size(); i++) {
    res[i] = a[i];
  }
  for (size_t i = 0; i < b.size(); i++) {
    res[i] += b[i];
  }
  return res;
}

template <typename R> vector<R> poly_sub(vector<R> &a, vector<R> &b) {
  auto res = vector<R>(max(a.size(), b.size()));
  for (size_t i = 0; i < a.size(); i++) {
    res[i] = a[i];
  }
  for (size_t i = 0; i < b.size(); i++) {
    res[i] -= b[i];
  }
  return res;
}

// Add polynomials in-place assuming the size is allocated
template <typename R>
void add_inplace(span<R> &a, span<R> &b, span<R> &result) {
  for (size_t i = 0; i < a.size(); i++)
    result[i] = a[i] + b[i];
}

// Subtract polynomials in-place assuming the size is allocated
template <typename R>
void sub_inplace(span<R> &a, span<R> &b, span<R> &result) {
  for (size_t i = 0; i < a.size(); i++)
    result[i] = a[i] - b[i];
}

template <typename R> vector<R> poly_normalize(vector<R> &a) {
  int i;
  for (i = a.size() - 1; i >= 0; i--) {
    if (a[i] != 0)
      break;
  }
  return vector(a.begin(), a.begin() + i + 1);
}

// Computes basic multiplication, assuming result is initialized as 0 and has enough space
template <typename R> void poly_mult_basic_span(span<R> &a, span<R> &b, span<R> &result) {
  for (size_t i = 0; i < a.size(); i++) {
    // Add at i-th position
    auto at_ith = result.subspan(i, b.size());
    for (size_t j = 0; j < b.size(); j++) {
      at_ith[j] = at_ith[j] + a[i] * b[j];
    }
  }
}

// Basic polynomial multiplication.
template <typename R> vector<R> poly_mult_basic(vector<R> &a, vector<R> &b) {
  if (a.empty() && b.empty())
    return vector<R>(0);

  vector<R> result = vector<R> (a.size() + b.size() - 1);
  auto span_a = span(a);
  auto span_b = span(b);
  auto span_result = span(result);
  poly_mult_basic_span(span_a, span_b, span_result);

  return result;
}

#define THRESHOLD 16

/**
 * A step of the Karatsuba function.
 *
 * NOTE: interestingly, the basic case is quite a performance bottleneck.
 * Hence, the basic case needs to be implemented well.
 *
 * @param deg_bnd power-of-2 degree bound
 * @param buffer the buffer which is used only throughout the invocation
 */
template <typename R>
void poly_mult_Karatsuba_step(const size_t deg_bnd, span<R> &a, span<R> &b,
                              span<R> &result, span<R> &buffer) {
  // Result may be reused, so this needs clearing
  for (auto &entry : result)
    entry = 0;

  if (deg_bnd <= THRESHOLD) {
    poly_mult_basic_span(a, b, result);
    return;
  }

  const auto next_bnd = deg_bnd >> 1;
  auto a0 = a.subspan(0, next_bnd);
  auto a1 = a.subspan(next_bnd, next_bnd);
  auto b0 = b.subspan(0, next_bnd);
  auto b1 = b.subspan(next_bnd, next_bnd);

  auto a01 = buffer.subspan(0, next_bnd);
  auto b01 = buffer.subspan(next_bnd, next_bnd);

  auto prod0 = result.subspan(0, deg_bnd);
  auto prod1 = result.subspan(deg_bnd, deg_bnd);
  auto prod_add = buffer.subspan(next_bnd * 2, next_bnd * 2);

  auto next_buffer = buffer.subspan(next_bnd * 4, next_bnd * 4);

  // correctly put into prod0 and prod1 position
  poly_mult_Karatsuba_step(next_bnd, a0, b0, prod0, next_buffer);
  poly_mult_Karatsuba_step(next_bnd, a1, b1, prod1, next_buffer);

  add_inplace(a0, a1, a01);
  add_inplace(b0, b1, b01);
  poly_mult_Karatsuba_step(next_bnd, a01, b01, prod_add, next_buffer);

  // adjust prod_add
  sub_inplace(prod_add, prod0, prod_add);
  sub_inplace(prod_add, prod1, prod_add);

  // Add middle term at X^next_bnd position
  auto result_mid = result.subspan(next_bnd, deg_bnd);
  add_inplace(prod_add, result_mid, result_mid);
}

template <typename R>
vector<R> poly_mult_Karatsuba(vector<R> &a, vector<R> &b) {
  size_t deg_bound = 1;
  while (deg_bound < max(a.size(), b.size()))
    deg_bound = deg_bound << 1;

  a.resize(deg_bound);
  b.resize(deg_bound);
  auto result = vector<R>(deg_bound << 1);
  auto buffer = vector<R>(deg_bound * 4);

  auto span_a = span(a);
  auto span_b = span(b);
  auto span_result = span(result);
  auto span_buffer = span(buffer);
  poly_mult_Karatsuba_step(deg_bound, span_a, span_b, span_result, span_buffer);
  return result;
}

My current haskell algorithm:
Poly.hs:

module Poly where

import Data.List
import Data.Maybe
import Data.Vector.Unboxed qualified as V

-- Zip two vectors while padding 0s on the shorter vector.
vecZipPad0With :: (V.Unbox a, Num a) => (a -> a -> a) -> V.Vector a -> V.Vector a -> V.Vector a
vecZipPad0With f xs ys = V.generate (max (V.length xs) (V.length ys)) $
  \i -> fromMaybe 0 (xs V.!? i) `f` fromMaybe 0 (ys V.!? i)

-- | Polynomial type.
--
-- >>> Poly (V.fromList [1 :: Int .. 5])
-- 1 X^0 + 2 X^1 + 3 X^2 + 4 X^3 + 5 X^4

-- >>> Poly (V.fromList [1 :: Int, 2]) * Poly (V.fromList [3 :: Int, 4, 5])
-- 3 X^0 + 10 X^1 + 13 X^2 + 10 X^3

-- >>> Poly (V.fromList [1 :: Int, 2]) * Poly (V.fromList [])
-- 0 X^0
newtype Poly a = Poly (V.Vector a)
  deriving (Eq)

makePoly :: (V.Unbox a) => [a] -> Poly a
makePoly = Poly . V.fromList

-- | Degree, assuming top term is nonzero
degree :: (V.Unbox a) => Poly a -> Int
degree (Poly f) = V.length f - 1

-- | Shift up polynomial by X^n
shiftUp :: (V.Unbox a, Num a) => Int -> Poly a -> Poly a
shiftUp n (Poly f) = Poly $ V.replicate n 0 <> f

-- | Shift down polynomial by X^n
shiftDown :: (V.Unbox a) => Int -> Poly a -> Poly a
shiftDown n (Poly f) = Poly $ V.drop n f

-- | Remainder under X^n
remXn :: (V.Unbox a) => Int -> Poly a -> Poly a
remXn n (Poly f) = Poly $ V.take n f

-- | Normalize polynomial, removing leading 0s
--
-- >>> normalize $ Poly (V.fromList [1 :: Int, 0, 0])
-- 1 X^0
--
-- >>> normalize $ Poly (V.fromList [1 :: Int, 2, 3, 0])
-- 1 X^0 + 2 X^1 + 3 X^2
normalize :: (Eq a, Num a, V.Unbox a) => Poly a -> Poly a
normalize (Poly f) = Poly remain
  where
    (_, remain) = V.spanR (== 0) f

-- | This Num instance implements the classical multiplication.
instance (Num a, V.Unbox a) => Num (Poly a) where
  (+) :: Poly a -> Poly a -> Poly a
  Poly f + Poly g = Poly $ vecZipPad0With (+) f g
  (-) :: Poly a -> Poly a -> Poly a
  Poly f - Poly g = Poly $ vecZipPad0With (-) f g
  (*) :: Poly a -> Poly a -> Poly a
  Poly f * Poly g = sum (map Poly mults)
    where
      mults = zipWith (\i fi -> V.map (fi *) (V.replicate i 0 <> g)) [0 ..] (V.toList f)
  negate :: Poly a -> Poly a
  negate (Poly f) = Poly $ V.map negate f
  abs :: Poly a -> Poly a
  abs = error "abs: invalid on poly"
  signum :: Poly a -> Poly a
  signum = error "signum: invalid on poly"
  fromInteger :: Integer -> Poly a
  fromInteger = Poly . V.singleton . fromInteger

instance (V.Unbox a, Show a) => Show (Poly a) where
  show (Poly p) = intercalate " + " $ zipWith (\i coeff -> show coeff <> " X^" <> show i) [0 :: Int ..] (V.toList p)

karatsubaMult :: (Num a, V.Unbox a) => Poly a -> Poly a -> Poly a
karatsubaMult a b = atLog degBound a b
  where
    degBound = fromJust $ find (> max (degree a) (degree b)) [2 ^ i | i <- [0 :: Int ..]]

    -- degBnd: power-of-two degree bound
    atLog degBnd f g = case degBnd of
      1 -> f * g
      _ -> shiftUp degBnd prod1 + shiftUp nextBound (prodAdd - prod0 - prod1) + prod0
      where
        nextBound = degBnd `div` 2
        f1 = shiftDown nextBound f
        f0 = remXn nextBound f
        g1 = shiftDown nextBound g
        g0 = remXn nextBound g
        prod0 = atLog nextBound f0 g0
        prod1 = atLog nextBound f1 g1
        prodAdd = atLog nextBound (f0 + f1) (g0 + g1)

Main.hs:

module Main (main) where

import Control.Exception
import Poly
import System.Random
import System.TimeIt
import Control.Monad

main :: IO ()
main = do
  do
    let f :: Poly Int = makePoly [1, 2, 3]
    let g :: Poly Int = makePoly [4, 5]
    putStrLn $ "f: " <> show f <> ", g: " <> show g
    putStrLn $ "f + g: " <> show (f + g)
    putStrLn $ "Naive f * g: " <> show (f * g)
    putStrLn $ "Karatsuba f * g: " <> show (normalize $ karatsubaMult f g)


  putStrLn ""
  experimentFor 250
  experimentFor 500
  experimentFor 1000
  karatsubaFor 2000
  karatsubaFor 4000
  where
    experimentFor n = do
      setStdGen $ mkStdGen 10
      let randomPoly size = makePoly <$> replicateM size (randomRIO (-100, 100))
      putStrLn $ "Size " <> show n
      f :: Poly Int <- randomPoly n
      g :: Poly Int <- randomPoly n
      putStrLn "naive:"
      _ <- timeIt $ evaluate (f * g)
      putStrLn "Karatsuba:"
      _ <- timeIt $ evaluate (karatsubaMult f g)
      putStrLn "Finished"

    karatsubaFor n = do
      setStdGen $ mkStdGen 10
      let randomPoly size = makePoly <$> replicateM size (randomRIO (-100, 100))
      putStrLn $ "Size " <> show n
      f :: Poly Int <- randomPoly n
      g :: Poly Int <- randomPoly n
      putStrLn "Karatsuba:"
      _ <- timeIt $ evaluate (karatsubaMult f g)
      putStrLn "Finished"

How can I optimize the haskell algorithm to be comparable with c++ one?

1 Like

It usually starts by looking at the Core of your program with -ddump-simpl and perhaps a sprinkling of “suppress” options too: 5.13. Debugging the compiler — Glasgow Haskell Compiler 9.12.1 User's Guide

Can you include the actual code you use to test your implementation (or share a link to your repo)?

If that is in a different module then your code might benefit from adding {-# INLINEABLE ... #-} pragmas to all your functions to make sure that GHC can specialize everything.

And it is probably also a good idea to manually add specializations for at least Double or Complex Double, whatever you plan to use most often.

1 Like

Am I reading this right? Seems like C exits recursion at degree 16, but Haskell carries on until degree 1.

This is probably not very fast. At the very least you must avoid fromMaybe and V.!?.


Here is my implementation of Karatsuba polynomial multiplication in Haskell: https://hackage.haskell.org/package/poly-0.5.1.0/docs/src/Data.Poly.Internal.Dense.html#karatsuba. I’m curious if you can benchmark it against C.

1 Like

I included the code testing the implementation.
I will try adding INLINEABLE; how do I specialize for specific datatype?

Ah, my bad, there is the discrepancy. That said, I doubt this would make it significantly faster.

Thanks, I will test the code out, and compare it with my C++ code.

https://downloads.haskell.org/~ghc/latest/docs/users_guide/exts/pragmas.html#specialize-pragma

So:

{-# SPECIALIZE karatsubaMult :: Poly Double -> Poly Double -> Poly Double #-}
1 Like

These are the results on my machine (with specialization to Int):

f: 1 X^0 + 2 X^1 + 3 X^2, g: 4 X^0 + 5 X^1
f + g: 5 X^0 + 7 X^1 + 3 X^2
Naive f * g: 4 X^0 + 13 X^1 + 22 X^2 + 15 X^3
Karatsuba f * g: 4 X^0 + 13 X^1 + 22 X^2 + 15 X^3
Size 250
naive:
CPU time:   0.01s
Karatsuba:
CPU time:   0.00s
Finished
Size 500
naive:
CPU time:   0.04s
Karatsuba:
CPU time:   0.00s
Finished
Size 1000
naive:
CPU time:   0.15s
Karatsuba:
CPU time:   0.02s
Finished
Size 2000
Karatsuba:
CPU time:   0.05s
Finished
Size 4000
Karatsuba:
CPU time:   0.14s
Finished

How does that compare to C++?

On a laptop, I got

...
Degree 511
Basic took 0.000285791s
Karatsuba took 0.000113562s
+ Match

Degree 1023
Basic took 0.00115727s
Karatsuba took 0.00032155s
+ Match

Degree 2047
Basic took 0.00462531s
Karatsuba took 0.000954033s
+ Match

Degree 4095
Basic took 0.00990323s
Karatsuba took 0.000920718s
+ Match

Degree 8191
Basic took 0.0250494s
Karatsuba took 0.00281552s
+ Match

Degree 16383
Basic took 0.0990669s
Karatsuba took 0.00813868s
+ Match

Degree 32767
Karatsuba took 0.0242313s
Degree 65535
Karatsuba took 0.0725052s
Degree 131071
Karatsuba took 0.216955s
Degree 1048575
Karatsuba took 5.84726s

as part of output; The running code was

void basic_vs_Karatsuba(size_t size) {
  auto p = random_int_vector(size);
  auto q = random_int_vector(size);

  cout << "Degree " << size - 1 << endl;

  auto begin = chrono::high_resolution_clock::now();
  auto basic = poly_mult_basic(p, q);
  auto end = chrono::high_resolution_clock::now();
  auto spent = chrono::duration<double>(end - begin);
  cout << "Basic took " << spent.count() << "s" << endl;

  begin = chrono::high_resolution_clock::now();
  auto karat = poly_mult_Karatsuba(p, q);
  end = chrono::high_resolution_clock::now();
  spent = chrono::duration<double>(end - begin);
  cout << "Karatsuba took " << spent.count() << "s" << endl;

  if (poly_normalize(basic) == poly_normalize(karat))
    cout << "+ Match" << endl;
  else
    cout << "- Mismatch" << endl;
  cout << endl;
}

void only_Karatsuba(size_t size) {
  auto p = random_int_vector(size);
  auto q = random_int_vector(size);

  cout << "Degree " << size - 1 << endl;

  auto begin = chrono::high_resolution_clock::now();
  poly_mult_Karatsuba(p, q);
  auto end = chrono::high_resolution_clock::now();
  auto spent = chrono::duration<double>(end - begin);
  cout << "Karatsuba took " << spent.count() << "s" << endl;
}

int main() {
  {
    auto p = vector<int>{1, 2};
    auto q = vector<int>{3, 4, 5};
    cout << "P: " << p << endl;
    cout << "Q: " << q << endl;
    cout << "P + Q: " << poly_add(p, q) << endl;
    cout << "basic P * Q: " << poly_mult_basic(p, q) << endl;
    auto karat = poly_mult_Karatsuba(p, q);
    cout << "Karatsuba P * Q: " << karat << endl;
    cout << "normalized: " << poly_normalize(karat) << endl;
    cout << endl;
  }

  {
    auto p = random_int_vector(6);
    auto q = random_int_vector(8);
    cout << "P: " << p << endl;
    cout << "Q: " << q << endl;
    auto basic = poly_mult_basic(p, q);
    cout << "basic P * Q: " << basic << endl;
    auto karat = poly_mult_Karatsuba(p, q);
    cout << "Karatsuba P * Q: " << karat << endl;
    cout << endl;
  }

  basic_vs_Karatsuba(128);
  basic_vs_Karatsuba(256);
  basic_vs_Karatsuba(512);
  basic_vs_Karatsuba(1024);
  basic_vs_Karatsuba(2048);
  basic_vs_Karatsuba(4096);
  basic_vs_Karatsuba(8192);
  basic_vs_Karatsuba(16384);

  only_Karatsuba(32768);
  only_Karatsuba(65536);
  only_Karatsuba(131072);

  only_Karatsuba(1 << 20);

  return 0;
}

I tried it with this version, the result was as follows:

f: 1 X^0 + 2 X^1 + 3 X^2, g: 4 X^0 + 5 X^1
f + g: 5 X^0 + 7 X^1 + 3 X^2
Naive f * g: 4 X^0 + 13 X^1 + 22 X^2 + 15 X^3
Karatsuba f * g: 4 X^0 + 13 X^1 + 22 X^2 + 15 X^3

Size 512
Fast Karatsuba:
CPU time:   0.00s
Finished
Size 1024
Fast Karatsuba:
CPU time:   0.01s
Finished
Size 2048
Fast Karatsuba:
CPU time:   0.02s
Finished
Size 4096
Fast Karatsuba:
CPU time:   0.08s
Finished
Size 8192
Fast Karatsuba:
CPU time:   0.23s
Finished
Size 16384
Fast Karatsuba:
CPU time:   0.70s
Finished
Size 32768
Fast Karatsuba:
CPU time:   2.01s
Finished
Size 65536
Fast Karatsuba:
CPU time:   6.07s
Finished

where the code was,
PolyFast.hs:

module PolyFast where

import Control.Monad
import Control.Monad.ST
import Data.Bits
import Data.List
import Data.Vector.Generic qualified as G
import Data.Vector.Generic.Mutable qualified as MG
import GHC.Base

plusPoly ::
  (G.Vector v a) =>
  (a -> a -> a) ->
  v a ->
  v a ->
  v a
plusPoly add xs ys = runST $ do
  let lenXs = G.length xs
      lenYs = G.length ys
      lenMn = lenXs `min` lenYs
      lenMx = lenXs `max` lenYs

  zs <- MG.unsafeNew lenMx
  forM_ [0 .. lenMn - 1] $ \i ->
    MG.unsafeWrite zs i (add (G.unsafeIndex xs i) (G.unsafeIndex ys i))
  G.unsafeCopy
    (MG.unsafeSlice lenMn (lenMx - lenMn) zs)
    (G.unsafeSlice lenMn (lenMx - lenMn) (if lenXs <= lenYs then ys else xs))

  G.unsafeFreeze zs
{-# INLINEABLE plusPoly #-}

karatsubaThreshold :: Int
karatsubaThreshold = 32

karatsuba ::
  (G.Vector v a) =>
  a ->
  (a -> a -> a) ->
  (a -> a -> a) ->
  (a -> a -> a) ->
  v a ->
  v a ->
  v a
karatsuba zer add sub mul = go
  where
    conv = inline convolution zer add mul
    go xs ys
      | lenXs <= karatsubaThreshold || lenYs <= karatsubaThreshold =
          conv xs ys
      | otherwise = runST $ do
          zs <- MG.unsafeNew lenZs
          forM_ [0 .. lenZs - 1] $ \k -> do
            let z0 =
                  if k < G.length zs0
                    then G.unsafeIndex zs0 k
                    else zer
                z11 =
                  if k - m >= 0 && k - m < G.length zs11
                    then G.unsafeIndex zs11 (k - m)
                    else zer
                z10 =
                  if k - m >= 0 && k - m < G.length zs0
                    then G.unsafeIndex zs0 (k - m)
                    else zer
                z12 =
                  if k - m >= 0 && k - m < G.length zs2
                    then G.unsafeIndex zs2 (k - m)
                    else zer
                z2 =
                  if k - 2 * m >= 0 && k - 2 * m < G.length zs2
                    then G.unsafeIndex zs2 (k - 2 * m)
                    else zer
            MG.unsafeWrite zs k (z0 `add` (z11 `sub` (z10 `add` z12)) `add` z2)
          G.unsafeFreeze zs
      where
        lenXs = G.length xs
        lenYs = G.length ys
        lenZs = lenXs + lenYs - 1

        m = ((lenXs `min` lenYs) + 1) `shiftR` 1

        xs0 = G.slice 0 m xs
        xs1 = G.slice m (lenXs - m) xs
        ys0 = G.slice 0 m ys
        ys1 = G.slice m (lenYs - m) ys

        xs01 = plusPoly add xs0 xs1
        ys01 = plusPoly add ys0 ys1
        zs0 = go xs0 ys0
        zs2 = go xs1 ys1
        zs11 = go xs01 ys01
{-# INLINEABLE karatsuba #-}

convolution ::
  (G.Vector v a) =>
  a ->
  (a -> a -> a) ->
  (a -> a -> a) ->
  v a ->
  v a ->
  v a
convolution zer add mul = \xs ys ->
  let lenXs = G.length xs
      lenYs = G.length ys
      lenZs = lenXs + lenYs - 1
   in if lenXs == 0 || lenYs == 0
        then G.empty
        else G.generate lenZs $ \k ->
          foldl'
            (\acc i -> acc `add` mul (G.unsafeIndex xs i) (G.unsafeIndex ys (k - i)))
            zer
            [max (k - lenYs + 1) 0 .. min k (lenXs - 1)]
{-# INLINEABLE convolution #-}

Main.hs:

module Main (main) where

import Control.Exception
import Poly
import System.Random
import System.TimeIt
import Control.Monad
import PolyFast

main :: IO ()
main = do
  do
    let f :: Poly Int = makePoly [1, 2, 3]
    let g :: Poly Int = makePoly [4, 5]
    putStrLn $ "f: " <> show f <> ", g: " <> show g
    putStrLn $ "f + g: " <> show (f + g)
    putStrLn $ "Naive f * g: " <> show (f * g)
    putStrLn $ "Karatsuba f * g: " <> show (normalize $ karatsubaMult f g)

  putStrLn ""
  -- experimentFor 250
  -- experimentFor 500
  -- experimentFor 1000
  -- karatsubaFor 2000
  -- karatsubaFor 4000

  fastKaratsubaFor 512
  fastKaratsubaFor 1024
  fastKaratsubaFor 2048
  fastKaratsubaFor 4096
  fastKaratsubaFor 8192
  fastKaratsubaFor 16384
  fastKaratsubaFor 32768
  fastKaratsubaFor 65536
  where
    experimentFor n = do
      setStdGen $ mkStdGen 10
      let randomPoly size = makePoly <$> replicateM size (randomRIO (-100, 100))
      putStrLn $ "Size " <> show n
      f :: Poly Int <- randomPoly n
      g :: Poly Int <- randomPoly n
      putStrLn "naive:"
      _ <- timeIt $ evaluate (f * g)
      putStrLn "Karatsuba:"
      _ <- timeIt $ evaluate (karatsubaMult f g)
      putStrLn "Finished"

    karatsubaFor n = do
      setStdGen $ mkStdGen 10
      let randomPoly size = makePoly <$> replicateM size (randomRIO (-100, 100))
      putStrLn $ "Size " <> show n
      f :: Poly Int <- randomPoly n
      g :: Poly Int <- randomPoly n
      putStrLn "Karatsuba:"
      _ <- timeIt $ evaluate (karatsubaMult f g)
      putStrLn "Finished"

    fastKaratsubaFor n = do
      setStdGen $ mkStdGen 10
      let randomPoly size = makePoly <$> replicateM size (randomRIO (-100, 100))
      putStrLn $ "Size " <> show n
      f :: Poly Int <- randomPoly n
      g :: Poly Int <- randomPoly n
      putStrLn "Fast Karatsuba:"
      _ <- timeIt $ evaluate (karatsuba 0 (+) (-) (*) (unwrapPoly f) (unwrapPoly g))
      putStrLn "Finished"

Honestly this is still quite slow, maybe I did some experiment wrong.

EDIT: Profiling was on, remedied it.
EDIT2: Note that vector size = degree + 1.

Anyway, it is still slow by factor of around 100x …

@Abab9579 how do you compile this code? Like ghc Main.hs?

I ran cabal run with default settings, so likely just with -O1.

cabal.project.local:

ignore-project: False
profiling: False

mathematical-algorithms.cabal:

cabal-version:      3.0
name:               mathematical-algorithms
version:            0.1.0.0
-- synopsis:
-- description:
license:            BSD-3-Clause
license-file:       LICENSE
author:             Abastro
maintainer:         abab9579@gmail.com
-- copyright:
category:           Math
build-type:         Simple
-- extra-doc-files:    CHANGELOG.md
-- extra-source-files:

common warnings
    ghc-options: -Wall

common deps
    build-depends:
        base ^>=4.17.2.1,
        vector,
        random,
        timeit,

library
    import:           warnings, deps
    exposed-modules:
        Poly
        PolyFast
    hs-source-dirs:   src
    default-language: GHC2021

executable mathematical-algorithms
    import:           warnings, deps
    main-is:          Main.hs
    -- other-modules:
    build-depends:
        mathematical-algorithms
    hs-source-dirs:   app
    default-language: GHC2021

And related question: With what compiler (g++, clang++, something else?) and what flags did you compile the C++ code?

Side quest: implement Karatsuba algorithm in ghc-bignum.

I’ve only implemented the basic multiplication algorithm in ghc-bignum (see libraries/ghc-internal/src/GHC/Internal/Bignum/Backend/Native.hs · 2fdd0be9747e2014102c1dd002425ce2f0a5db4e · Glasgow Haskell Compiler / GHC · GitLab). We should probably switch to more efficient algorithms depending on input sizes. Contributions welcome!

2 Likes

Ah. All modern C compilers recognise such loops and vectorise them using AVX / NEON instructions automatically. This is easily 10x speed up and very hard to replicate in native Haskell.

2 Likes

Is it possible to confirm that that is indeed making the difference by turning off such optimizations in the C compile, and checking whether the run times then become roughly equal?

Putting INLINE pragmas on shiftUp and vecZipPad0With and compiling with -O2 gets me a ~3x speedup. I’m now at Haskell being ~30x slower than C++.

2 Likes