/* classikar.c
 * Daniel S. Roche, January 2009
 * http://www.cs.uwaterloo.ca/~droche/
 *
 * Routines for classical and Karatsuba-like polynomial multiplication.
 * All function calls are thread-safe and use no extra space.
 *
 * See LICENSE.txt for copyright and permissions.
 */

#include "lsmul.h"

/* Okay, so we go a little crazy with the preprocessor.
 * The basic idea is to use a so-called "Supermacro" (Stephen Beal),
 * which is a separate header file (classikar_body.c) which I will
 * include three times. It will generate code for the three "types" of
 * multiplication (see that file for details). */

// The crossover point from classical to Karatsuba multiplication.
// In this file, classikar_diff hands the product to the classical routine
// whenever the shorter operand has at most KARX coefficients.
// The macro is #undef'd again at the end of this file.
#define KARX 32

/* Branch-free modular correction; the arithmetic right shift trick is
 * shamelessly stolen from NTL.
 *
 * "LVAL" must be an lvalue (i.e. can be assigned to).
 * If -AMT <= LVAL < AMT before the call, we will have
 * 0 <= LVAL < AMT after the call, with equivalence modulo AMT.
 *
 * How it works: (LVAL) >> 31 is expected to be all one bits when LVAL is
 * negative and zero otherwise, so AMT is added exactly when LVAL < 0.
 * NOTE(review): this relies on >> being an arithmetic shift for negative
 * signed values (implementation-defined in C) and on the magnitudes
 * involved fitting in 32 bits -- confirm against the lsmul_ele typedef.
 *
 * LVAL is evaluated twice, so it must not have side effects.
 * The whole expansion is parenthesized so the macro can also be used as
 * a sub-expression; its value is the reduced LVAL. */
#define RED_TRICK(LVAL,AMT) ((LVAL) += ((AMT) & ((LVAL) >> 31)))

/* Accumulates a scalar multiple of a polynomial: c[k] += a[k] * b (mod p)
 * for every k in [0, sa).
 *
 *   c  - destination coefficients, updated in place; each entry is
 *        expected to be reduced modulo p on entry and is left reduced
 *        modulo p on exit
 *   a  - source coefficients (sa of them)
 *   sa - number of coefficients to process
 *   b  - the scalar multiplier
 *   p  - the modulus
 *
 * The quotient of a[k]*b by p is estimated in floating point (an idea
 * taken from NTL) and deliberately bumped by one, so that only the two
 * RED_TRICK corrections below are needed instead of a division.
 *
 * NOTE(review): the function is declared plain `inline` (no `static` or
 * `extern`); confirm the build mode provides an external definition. */
inline void addSMul 
   (lsmul_ele *c, const lsmul_ele *a, long sa, lsmul_ele b, lsmul_ele p) 
{
   // b/p is the same for every coefficient, so compute it once.
   const double ratio = ((double) b) / ((double) p);

   for (long k = 0; k < sa; ++k) {
      // Floating-point estimate of a[k]*b / p, plus one.
      const lsmul_ele quot = ((lsmul_ele) (((double) a[k]) * ratio)) + 1;
      lsmul_ele acc = c[k] + (a[k]*b - quot*p);

      // At this point -2p <= acc < 2p.
      RED_TRICK (acc, p<<1);   // now 0 <= acc < 2p
      acc -= p;                // now -p <= acc < p
      RED_TRICK (acc, p);      // now 0 <= acc < p

      c[k] = acc;
   }
}

/* Token pasting needs two macro layers: the operands of ## are not
 * macro-expanded before they are concatenated (this is standard C
 * preprocessor behaviour, not a compiler quirk). REALLY_CAT receives
 * TYPE as an ordinary argument, so it is expanded to its current value
 * first, and only then does CAT paste the two pieces together. */
#define CAT(BASE, NUM) BASE ## NUM
#define REALLY_CAT(BASE, NUM) CAT (BASE, NUM)

/* Short names used inside classikar_body.c. Each expands to a function
 * name carrying the current TYPE as a suffix, for example CK becomes
 * classikar_0 while TYPE is 0 and OD becomes classikar_od_1 while TYPE
 * is 1. */
#define CK REALLY_CAT (classikar_, TYPE)
#define OD REALLY_CAT (classikar_od_, TYPE)
#define CL REALLY_CAT (classical_, TYPE)
#define KR REALLY_CAT (kar_, TYPE)
#define KD REALLY_CAT (kar_od_, TYPE)

/* Instantiate the body once per multiplication "type" (the meaning of
 * each type is described in classikar_body.c).
 * NOTE(review): the 2, 1, 0 order is presumably significant (later
 * instantiations may call earlier ones) -- confirm before reordering. */
#define TYPE 2
#include "classikar_body.c"
#undef TYPE

#define TYPE 1
#include "classikar_body.c"
#undef TYPE

#define TYPE 0
#include "classikar_body.c"
#undef TYPE

/* The naming helpers are only needed while the body is being included. */
#undef CK
#undef OD
#undef CL
#undef KR
#undef KD
#undef REALLY_CAT
#undef CAT

// Forward declaration (block_mul and classikar_diff call each other).
// c = a * b for operands of different sizes, with |a| < |b|.
void classikar_diff (lsmul_ele *c, const lsmul_ele *a, long sa,
   const lsmul_ele *b, long sb, lsmul_ele p);

// c = a * b, |a| < |b|
// Uses a blocking strategy, splitting b into blocks of size |a|-1.
//
//   c      - output coefficients
//   a, sa  - the shorter operand and its length; sa must be at least 2,
//            because the block size sa-1 is used as a divisor
//   b, sb  - the longer operand and its length
//   p      - the modulus
//
// If sb is not a multiple of the block size, the leftover sb % (sa-1)
// leading coefficients of b are multiplied by a first through
// classikar_diff (the leftover piece is shorter than a, so it is passed
// as the "short" operand). Every following full block of b is then
// handled by classikar_od_1, writing at the matching offset in c.
inline void block_mul (lsmul_ele *c, const lsmul_ele *a, long sa,
   const lsmul_ele *b, long sb, lsmul_ele p)
{
   const long sliceSize = sa - 1;
   long offset = sb % sliceSize;   // start of the first full block in b

   if (offset) classikar_diff (c, b, offset, a, sa, p);
   while (offset < sb) {
      classikar_od_1 (c+offset, b+offset, a, sliceSize, p);
      offset += sliceSize;
   }
}

// c = a * b, |a| < |b|
// Chooses the algorithm from the length of the shorter operand:
// more than KARX coefficients goes to the blocked multiplication,
// anything else to the classical routine.
void classikar_diff (lsmul_ele *c, const lsmul_ele *a, long sa,
   const lsmul_ele *b, long sb, lsmul_ele p)
{
   if (sa > KARX) {
      block_mul (c, a, sa, b, sb, p);
      return;
   }
   classical_0 (c, a, sa, b, sb, p);
}

// The general-purpose, top-level call: multiplies a (sa coefficients) by
// b (sb coefficients) with modulus p and puts the result into c.
//
// Nothing is done (c is left untouched) when either operand is empty.
// Otherwise the work is dispatched on the relative sizes:
//   equal sizes           -> classikar_0
//   sizes differ by one   -> classikar_od_0, shorter operand first
//   any other difference  -> classikar_diff, shorter operand first
void mul_ck (lsmul_ele *c, const lsmul_ele *a, long sa,
   const lsmul_ele *b, long sb, lsmul_ele p)
{
   const lsmul_ele *shorter = a;
   const lsmul_ele *longer = b;
   long nShort = sa;
   long nLong = sb;

   if ((sa == 0) || (sb == 0)) return;

   if (sa == sb) {
      classikar_0 (c, a, b, sa, p);
      return;
   }

   // Order the operands so the shorter one always comes first.
   if (sa > sb) {
      shorter = b;
      nShort = sb;
      longer = a;
      nLong = sa;
   }

   if (nShort+1 == nLong) {
      classikar_od_0 (c, shorter, longer, nShort, p);
   }
   else {
      classikar_diff (c, shorter, nShort, longer, nLong, p);
   }
}

// Drop the helper macros defined near the top of this file.
#undef RED_TRICK
#undef KARX
