I am implementing Karatsuba algorithm as part of an algorithms course.
I tried implementing in both haskell and c++, and it seems c++ is a clear winner by factor of 100~1000x.
My somewhat optimized c++ code:
#include <algorithm>
#include <chrono>
#include <complex>
#include <iostream>
#include <vector>
using namespace std;
typedef complex<double> Complex;
typedef double Real;
/**
* Operator overloading for printing vectors.
* @tparam T
* @param os
* @param v
* @return
*/
template <typename T> ostream &operator<<(ostream &os, const vector<T> &v) {
for (size_t i = 0; i < v.size(); i++) {
os << v[i] << " ";
}
return os;
}
template <typename T> ostream &operator<<(ostream &os, const span<T> &v) {
for (auto x : v) {
os << x << " ";
}
return os;
}
vector<int> random_int_vector(size_t size) {
auto result = vector<int>();
for (size_t i = 0; i < size; i++) {
result.push_back(rand() % 100);
}
return result;
}
vector<Real> random_real_vector(size_t size) {
auto result = vector<Real>();
for (size_t i = 0; i < size; i++) {
result.push_back((double)rand() / (double)RAND_MAX);
}
return result;
}
vector<Complex> vector_as_complex(vector<Real> &v) {
auto result = vector<Complex>(v.size());
for (size_t i = 0; i < v.size(); i++) {
result[i] = Complex(v[i], 0);
}
return result;
}
template <typename R> vector<R> poly_add(vector<R> &a, vector<R> &b) {
auto res = vector<R>(max(a.size(), b.size()));
for (size_t i = 0; i < a.size(); i++) {
res[i] = a[i];
}
for (size_t i = 0; i < b.size(); i++) {
res[i] += b[i];
}
return res;
}
template <typename R> vector<R> poly_sub(vector<R> &a, vector<R> &b) {
auto res = vector<R>(max(a.size(), b.size()));
for (size_t i = 0; i < a.size(); i++) {
res[i] = a[i];
}
for (size_t i = 0; i < b.size(); i++) {
res[i] -= b[i];
}
return res;
}
// Add polynomials in-place assuming the size is allocated
template <typename R>
void add_inplace(span<R> &a, span<R> &b, span<R> &result) {
for (size_t i = 0; i < a.size(); i++)
result[i] = a[i] + b[i];
}
// Subtract polynomials in-place assuming the size is allocated
template <typename R>
void sub_inplace(span<R> &a, span<R> &b, span<R> &result) {
for (size_t i = 0; i < a.size(); i++)
result[i] = a[i] - b[i];
}
template <typename R> vector<R> poly_normalize(vector<R> &a) {
int i;
for (i = a.size() - 1; i >= 0; i--) {
if (a[i] != 0)
break;
}
return vector(a.begin(), a.begin() + i + 1);
}
// Computes basic multiplication, assuming result is initialized as 0 and has enough space
template <typename R> void poly_mult_basic_span(span<R> &a, span<R> &b, span<R> &result) {
for (size_t i = 0; i < a.size(); i++) {
// Add at i-th position
auto at_ith = result.subspan(i, b.size());
for (size_t j = 0; j < b.size(); j++) {
at_ith[j] = at_ith[j] + a[i] * b[j];
}
}
}
// Basic polynomial multiplication.
template <typename R> vector<R> poly_mult_basic(vector<R> &a, vector<R> &b) {
if (a.empty() && b.empty())
return vector<R>(0);
vector<R> result = vector<R> (a.size() + b.size() - 1);
auto span_a = span(a);
auto span_b = span(b);
auto span_result = span(result);
poly_mult_basic_span(span_a, span_b, span_result);
return result;
}
#define THRESHOLD 16
/**
* A step of the Karatsuba function.
*
* NOTE: interestingly, the basic case is quite a performance bottleneck.
* Hence, the basic case needs to be implemented well.
*
* @param deg_bnd power-of-2 degree bound
* @param buffer the buffer which is used only throughout the invocation
*/
template <typename R>
void poly_mult_Karatsuba_step(const size_t deg_bnd, span<R> &a, span<R> &b,
span<R> &result, span<R> &buffer) {
// Result may be reused, so this needs clearing
for (auto &entry : result)
entry = 0;
if (deg_bnd <= THRESHOLD) {
poly_mult_basic_span(a, b, result);
return;
}
const auto next_bnd = deg_bnd >> 1;
auto a0 = a.subspan(0, next_bnd);
auto a1 = a.subspan(next_bnd, next_bnd);
auto b0 = b.subspan(0, next_bnd);
auto b1 = b.subspan(next_bnd, next_bnd);
auto a01 = buffer.subspan(0, next_bnd);
auto b01 = buffer.subspan(next_bnd, next_bnd);
auto prod0 = result.subspan(0, deg_bnd);
auto prod1 = result.subspan(deg_bnd, deg_bnd);
auto prod_add = buffer.subspan(next_bnd * 2, next_bnd * 2);
auto next_buffer = buffer.subspan(next_bnd * 4, next_bnd * 4);
// correctly put into prod0 and prod1 position
poly_mult_Karatsuba_step(next_bnd, a0, b0, prod0, next_buffer);
poly_mult_Karatsuba_step(next_bnd, a1, b1, prod1, next_buffer);
add_inplace(a0, a1, a01);
add_inplace(b0, b1, b01);
poly_mult_Karatsuba_step(next_bnd, a01, b01, prod_add, next_buffer);
// adjust prod_add
sub_inplace(prod_add, prod0, prod_add);
sub_inplace(prod_add, prod1, prod_add);
// Add middle term at X^next_bnd position
auto result_mid = result.subspan(next_bnd, deg_bnd);
add_inplace(prod_add, result_mid, result_mid);
}
template <typename R>
vector<R> poly_mult_Karatsuba(vector<R> &a, vector<R> &b) {
size_t deg_bound = 1;
while (deg_bound < max(a.size(), b.size()))
deg_bound = deg_bound << 1;
a.resize(deg_bound);
b.resize(deg_bound);
auto result = vector<R>(deg_bound << 1);
auto buffer = vector<R>(deg_bound * 4);
auto span_a = span(a);
auto span_b = span(b);
auto span_result = span(result);
auto span_buffer = span(buffer);
poly_mult_Karatsuba_step(deg_bound, span_a, span_b, span_result, span_buffer);
return result;
}
My current haskell algorithm:
Poly.hs:
module Poly where
import Data.List
import Data.Maybe
import Data.Vector.Unboxed qualified as V
-- Zip two vectors while padding 0s on the shorter vector.
vecZipPad0With :: (V.Unbox a, Num a) => (a -> a -> a) -> V.Vector a -> V.Vector a -> V.Vector a
vecZipPad0With f xs ys = V.generate (max (V.length xs) (V.length ys)) $
\i -> fromMaybe 0 (xs V.!? i) `f` fromMaybe 0 (ys V.!? i)
-- | Polynomial type.
--
-- >>> Poly (V.fromList [1 :: Int .. 5])
-- 1 X^0 + 2 X^1 + 3 X^2 + 4 X^3 + 5 X^4
-- >>> Poly (V.fromList [1 :: Int, 2]) * Poly (V.fromList [3 :: Int, 4, 5])
-- 3 X^0 + 10 X^1 + 13 X^2 + 10 X^3
-- >>> Poly (V.fromList [1 :: Int, 2]) * Poly (V.fromList [])
-- 0 X^0
newtype Poly a = Poly (V.Vector a)
deriving (Eq)
makePoly :: (V.Unbox a) => [a] -> Poly a
makePoly = Poly . V.fromList
-- | Degree, assuming top term is nonzero
degree :: (V.Unbox a) => Poly a -> Int
degree (Poly f) = V.length f - 1
-- | Shift up polynomial by X^n
shiftUp :: (V.Unbox a, Num a) => Int -> Poly a -> Poly a
shiftUp n (Poly f) = Poly $ V.replicate n 0 <> f
-- | Shift down polynomial by X^n
shiftDown :: (V.Unbox a) => Int -> Poly a -> Poly a
shiftDown n (Poly f) = Poly $ V.drop n f
-- | Remainder under X^n
remXn :: (V.Unbox a) => Int -> Poly a -> Poly a
remXn n (Poly f) = Poly $ V.take n f
-- | Normalize polynomial, removing leading 0s
--
-- >>> normalize $ Poly (V.fromList [1 :: Int, 0, 0])
-- 1 X^0
--
-- >>> normalize $ Poly (V.fromList [1 :: Int, 2, 3, 0])
-- 1 X^0 + 2 X^1 + 3 X^2
normalize :: (Eq a, Num a, V.Unbox a) => Poly a -> Poly a
normalize (Poly f) = Poly remain
where
(_, remain) = V.spanR (== 0) f
-- | This Num instance implements the classical multiplication.
instance (Num a, V.Unbox a) => Num (Poly a) where
(+) :: Poly a -> Poly a -> Poly a
Poly f + Poly g = Poly $ vecZipPad0With (+) f g
(-) :: Poly a -> Poly a -> Poly a
Poly f - Poly g = Poly $ vecZipPad0With (-) f g
(*) :: Poly a -> Poly a -> Poly a
Poly f * Poly g = sum (map Poly mults)
where
mults = zipWith (\i fi -> V.map (fi *) (V.replicate i 0 <> g)) [0 ..] (V.toList f)
negate :: Poly a -> Poly a
negate (Poly f) = Poly $ V.map negate f
abs :: Poly a -> Poly a
abs = error "abs: invalid on poly"
signum :: Poly a -> Poly a
signum = error "signum: invalid on poly"
fromInteger :: Integer -> Poly a
fromInteger = Poly . V.singleton . fromInteger
instance (V.Unbox a, Show a) => Show (Poly a) where
show (Poly p) = intercalate " + " $ zipWith (\i coeff -> show coeff <> " X^" <> show i) [0 :: Int ..] (V.toList p)
karatsubaMult :: (Num a, V.Unbox a) => Poly a -> Poly a -> Poly a
karatsubaMult a b = atLog degBound a b
where
degBound = fromJust $ find (> max (degree a) (degree b)) [2 ^ i | i <- [0 :: Int ..]]
-- degBnd: power-of-two degree bound
atLog degBnd f g = case degBnd of
1 -> f * g
_ -> shiftUp degBnd prod1 + shiftUp nextBound (prodAdd - prod0 - prod1) + prod0
where
nextBound = degBnd `div` 2
f1 = shiftDown nextBound f
f0 = remXn nextBound f
g1 = shiftDown nextBound g
g0 = remXn nextBound g
prod0 = atLog nextBound f0 g0
prod1 = atLog nextBound f1 g1
prodAdd = atLog nextBound (f0 + f1) (g0 + g1)
Main.hs:
module Main (main) where
import Control.Exception
import Poly
import System.Random
import System.TimeIt
import Control.Monad
main :: IO ()
main = do
do
let f :: Poly Int = makePoly [1, 2, 3]
let g :: Poly Int = makePoly [4, 5]
putStrLn $ "f: " <> show f <> ", g: " <> show g
putStrLn $ "f + g: " <> show (f + g)
putStrLn $ "Naive f * g: " <> show (f * g)
putStrLn $ "Karatsuba f * g: " <> show (normalize $ karatsubaMult f g)
putStrLn ""
experimentFor 250
experimentFor 500
experimentFor 1000
karatsubaFor 2000
karatsubaFor 4000
where
experimentFor n = do
setStdGen $ mkStdGen 10
let randomPoly size = makePoly <$> replicateM size (randomRIO (-100, 100))
putStrLn $ "Size " <> show n
f :: Poly Int <- randomPoly n
g :: Poly Int <- randomPoly n
putStrLn "naive:"
_ <- timeIt $ evaluate (f * g)
putStrLn "Karatsuba:"
_ <- timeIt $ evaluate (karatsubaMult f g)
putStrLn "Finished"
karatsubaFor n = do
setStdGen $ mkStdGen 10
let randomPoly size = makePoly <$> replicateM size (randomRIO (-100, 100))
putStrLn $ "Size " <> show n
f :: Poly Int <- randomPoly n
g :: Poly Int <- randomPoly n
putStrLn "Karatsuba:"
_ <- timeIt $ evaluate (karatsubaMult f g)
putStrLn "Finished"
How can I optimize the haskell algorithm to be comparable with c++ one?