/* classikar_body.c
 * Daniel S. Roche, January 2009
 * http://www.cs.uwaterloo.ca/~droche/
 * 
 * The body of routines for classical/Karatsuba style multiplication.
 * This will be repeatedly included from classikar.c.
 * Before each include, the macro TYPE should be defined as 0, 1, or 2.
 * Type 0 is just ordinary multiplication, c = a1 * b.
 * Type 1 means that the first |a| elements of c are pre-initialized
 *   and must be added to, c += a1 * b
 * Type 2 means the condition of Type 1, and now the first operand
 *   of the multiplication is given as the sum of two polynomials,
 *   c += (a1 + a2) * b
 *
 * See LICENSE.txt for copyright and permissions.
 */

// The "2" type of multiplication takes a second argument, so we define
// two macros to handle that transparently.
// SARG_DEC is spliced into parameter lists directly after the `a1`
// parameter: for TYPE 2 it declares the extra operand `a2` (including the
// leading comma), for every other TYPE it expands to nothing.
#if TYPE == 2
#define SARG_DEC , const lsmul_ele *a2
#else
#define SARG_DEC
#endif

/* Prototypes. These functions check the size and then call
 * either classical or Karatsuba. CK assumes both arguments have
 * the same size, and OD (for "one different") assumes the second
 * argument has degree one greater than the first. */
void CK (lsmul_ele*, const lsmul_ele* SARG_DEC,
   const lsmul_ele*, long, lsmul_ele);
void OD (lsmul_ele*, const lsmul_ele* SARG_DEC,
   const lsmul_ele*, long, lsmul_ele);

// Type "2" uses a second argument.
// SARG_USE(IND) is written right after a read of a1[IND] inside CL: for
// TYPE 2 it adds the matching element a2[IND], for every other TYPE it
// expands to nothing. This one-argument form is #undef'd after CL, and the
// name SARG_USE is defined again further down with a different expansion.
#if TYPE == 2
#define SARG_USE(IND) + a2[IND]
#else
#define SARG_USE(IND)
#endif

// Classical (schoolbook) multiplication modulo p; requires |a| <= |b|.
//
//   c    output, sa+sb-1 coefficients. For TYPE > 0 the first sa entries
//        are inputs as well: the product terms are added onto c[i]+p.
//   a1   first operand, sa coefficients (for TYPE 2 every a1[j] is used
//        as a1[j] + a2[j])
//   sa   size of the first operand, at most 32 (asserted)
//   b    second operand, sb coefficients
//   sb   size of the second operand, at least sa (asserted)
//   p    modulus
//
// Every output coefficient is accumulated unreduced in an unsigned long
// long and reduced by a single % at the end.
// NOTE(review): plain `inline` (no `static`/`extern`) has different linkage
// rules under gnu89 and C99 -- confirm the build's -std before touching it.
inline void CL (lsmul_ele *c, const lsmul_ele *a1 SARG_DEC, long sa,
   const lsmul_ele *b, long sb, lsmul_ele p)
{
   const unsigned long long modulus = (unsigned long long) p;
   unsigned long long sum;
   long out, t;

   assert ((sa <= sb) && (sa <= 32));

   // Low part (out < sa): terms 0..out of the first operand contribute.
   for (out = 0; out < sa; ++out) {
#if TYPE == 0
      sum = 0;
#else
      // Start from the pre-initialized coefficient, offset by p.
      sum = (unsigned long long) (c[out]+p);
#endif
      for (t = 0; t <= out; ++t)
         sum += ((unsigned long long) a1[t] SARG_USE(t) ) * b[out-t];
      c[out] = (lsmul_ele) (sum % modulus);
   }

   // Middle part (sa <= out < sb): every term of the first operand
   // contributes.
   for (; out < sb; ++out) {
      sum = ((unsigned long long) a1[0] SARG_USE(0) ) * b[out];
      for (t = 1; t < sa; ++t)
         sum += ((unsigned long long) a1[t] SARG_USE(t) ) * b[out-t];
      c[out] = (lsmul_ele) (sum % modulus);
   }

   // High part (sb <= out < sa+sb-1): terms out+1-sb..sa-1 contribute; the
   // first of them pairs with b[sb-1].
   for (; out < sa+sb-1; ++out) {
      sum = 0;
      for (t = out+1-sb; t < sa; ++t)
         sum += ((unsigned long long) a1[t] SARG_USE(t) ) * b[out-t];
      c[out] = (lsmul_ele) (sum % modulus);
   }
}

// The one-argument SARG_USE(IND) form is only used by CL; drop it here so
// the name can be defined again below without a parameter.
#undef SARG_USE

// Karatsuba-like algorithm (with no extra space), requires |a| == |b|
//
// Arguments:
//   c    output array, also used for every temporary; this function
//        indexes c[0] through c[2*sa-2]. For TYPE > 0 leading entries of c
//        are read as input before being overwritten (see the file header).
//   a1   first operand, sa elements. For TYPE 2 the operand a2 supplied
//        through SARG_DEC is added to a1 elementwise.
//   b    second operand, sa elements
//   sa   common operand size, split as k = sa/2 (rounded down)
//   p    modulus, handed to RED_TRICK, addSMul and the sub-multiplications
//
// The statement order matters: regions of c are reused as scratch and as
// operands of the recursive calls.
// NOTE(review): RED_TRICK, addSMul and the classikar_* routines are defined
// outside this section. RED_TRICK(x, m) presumably adds m to x when x is
// negative, addSMul presumably adds a scalar multiple of a vector, and
// classikar_N / classikar_od_N are presumably the TYPE N instantiations of
// CK / OD -- confirm in classikar.c.
inline void KR (lsmul_ele *c, const lsmul_ele *a1 SARG_DEC,
   const lsmul_ele *b, long sa, lsmul_ele p)
{
   long i, k = sa >> 1, tk;
   
   if ((sa&1) == 0) { // even case, sa == 2*k
      // Scratch area: the last k slots of c, c[tk] .. c[2*sa-2].
      tk = 3*k-1;

      // Scratch <- (low half of a) + (high half of a), including a2 for
      // TYPE 2, followed by the -p / RED_TRICK correction steps.
      for (i=0; i<k; ++i) {
         c[tk+i] = a1[i] + a1[k+i];
         #if TYPE == 2
            c[tk+i] += a2[i] + a2[k+i] - (p<<1);
            RED_TRICK (c[tk+i], p<<1);
         #endif
         c[tk+i] -= p;
         RED_TRICK (c[tk+i], p);
      }

      #if TYPE == 0
         // c[0..k) <- (low half of b) + (high half of b); the product of
         // the two half-sums is then written at c+k.
         for (i=0; i<k; ++i) {
            c[i] = b[i] + b[k+i] - p;
            RED_TRICK (c[i], p);
         }
         CK (c+k, c, c+tk, k, p);
      #else
         // c is pre-initialized: add c[0..k) onto c[k..2k), then call the
         // two-operand routine with b, b+k as the summed operand and the
         // scratch half-sum as the other, targeting c+k.
         for (i=0; i<k; ++i) {
            c[k+i] += c[i];
            RED_TRICK (c[k+i], p<<1);
            c[k+i] -= p;
         }
         classikar_2 (c+k, b, b+k, c+tk, k, p);
      #endif

      // Save c[k+i] + c[sa+i] (i < k-1) in the scratch area and c[sa-1] in
      // the last slot of c before the next call writes to the low part.
      for (i=0; i<k-1; ++i)
         c[tk+i] = c[k+i] + c[sa+i];
      c[(sa-1)<<1] = c[sa-1];

      // Low product: low halves of the operands, targeting c.
      #if TYPE < 2
         CK (c, a1, b, k, p);
      #else
         CK (c, a1, a2, b, k, p);
      #endif

      // Subtract the upper coefficients of the low product from c[sa..),
      // and rebuild c[k..2k) as (saved sums) - (lower coefficients of the
      // low product).
      for (i=0; i<k-1; ++i)
         c[sa+i] -= c[k+i];
      for (i=0; i<k; ++i)
         c[k+i] = c[tk+i] - c[i];
      // c[tk] == c[sa+k-1] is the last of the k entries at c+sa; it is
      // zeroed before the call below that targets c+sa.
      c[tk] = (lsmul_ele) 0;

      // High product: high halves of the operands, targeting c+sa.
      #if TYPE < 2
         classikar_1 (c+sa, a1+k, b+k, k, p);
      #else
         CK (c+sa, a1+k, a2+k, b+k, k, p);
      #endif

      // Final combination: subtract the entry k positions higher from each
      // of c[k..sa) and c[sa..tk), with the correction steps.
      for (i=k; i<sa; ++i) {
         c[i] -= c[k+i];
         RED_TRICK (c[i], p<<1);
         c[i] -= p;
         RED_TRICK (c[i], p);
      }

      for (i=sa; i<tk; ++i) {
         c[i] -= c[k+i];
         RED_TRICK (c[i], p);
      }
   
   } else { // odd case, sa == 2*k + 1
      #if TYPE > 0
         // Remember the pre-initialized c[2k]; it is added back at the end.
         lsmul_ele temp = c[k<<1];
      #endif
      // Scratch area: c[tk] .. c[4k], i.e. the last k+1 slots of c.
      tk = 3*k;

      // Scratch <- a[i] + a[k+i] for i < k (including a2 for TYPE 2),
      // followed by the -p / RED_TRICK correction steps.
      for (i=0; i<k; ++i) {
         c[tk+i] = a1[i] + a1[k+i];
         #if TYPE == 2
            c[tk+i] += a2[i] + a2[k+i] - (p<<1);
            RED_TRICK (c[tk+i], p<<1);
         #endif
         c[tk+i] -= p;
         RED_TRICK (c[tk+i], p);
      }

      // The top entry a[2k] has no partner; it goes into the last scratch
      // slot unchanged (plus a2[2k] for TYPE 2).
      c[k<<2] = a1[k<<1];
      #if TYPE == 2
         c[k<<2] += a2[k<<1] - p;
         RED_TRICK (c[k<<2], p);
      #endif

      #if TYPE == 0
         // c[0..k) <- b[i] + b[k+i]; multiply this k-entry sum by the
         // (k+1)-entry scratch sum, writing the result at c+k.
         for (i=0; i<k; ++i) {
            c[i] = b[i] + b[k+i] - p;
            RED_TRICK (c[i], p);
         }
         OD (c+k, c, c+tk, k, p);
      #else
         // c is pre-initialized: add c[0..k) onto c[k..2k), then call the
         // two-operand "one different" routine targeting c+k.
         for (i=0; i<k; ++i) {
            c[k+i] += c[i];
            RED_TRICK (c[k+i], p<<1);
            c[k+i] -= p;
         }
         classikar_od_2 (c+k, b, b+k, c+tk, k, p);
      #endif

      // Account for the top entry b[2k], which was not part of the sums
      // above: one addSMul call on c[2k] using c[tk], then (after clearing
      // c[tk]) one on the k entries at c+sa using c[tk+1 ..].
      addSMul (c+(k<<1), c+tk, 1, b[k<<1], p);
      c[tk] = (lsmul_ele) 0;
      addSMul (c+sa, c+(tk+1), k, b[k<<1], p);

      // Save c[i] + c[k+i] for i in [k, 2k) at c[sa+i], i.e. in
      // c[tk+1 .. 4k], before the next call writes to the low part of c.
      for (i=k; i<(k<<1); ++i)
         c[sa+i] = c[i] + c[k+i];

      // Low product: first k entries of each operand, targeting c.
      #if TYPE < 2
         CK (c, a1, b, k, p);
      #else
         CK (c, a1, a2, b, k, p);
      #endif
      // c[sa-2] == c[2k-1] is one past the 2k-1 coefficients of the low
      // product.
      c[sa-2] = (lsmul_ele) 0;

      // Subtract the upper coefficients of the low product from c[2k..),
      // rebuild c[k..2k) from the saved sums, and add c[tk] to both c[2k]
      // and c[k].
      for (i=k; i<(sa-2); ++i)
         c[k+i] -= c[i];
      for (i=0; i<k; ++i)
         c[k+i] = c[tk+i+1] - c[i];
      c[k<<1] += c[tk];
      c[k] += c[tk];

      // High product: the upper k+1 entries of each operand, targeting
      // c+2k.
      #if TYPE < 2
         classikar_1 (c+(k<<1), a1+k, b+k, k+1, p);
      #else
         CK (c+(k<<1), a1+k, a2+k, b+k, k+1, p);
      #endif

      // Final combination: subtract the entry k positions higher from each
      // of c[k..2k) and c[sa..tk], with the correction steps.
      for (i=k; i<(k<<1); ++i) {
         c[i] -= c[k+i];
         RED_TRICK (c[i], p<<1);
         c[i] -= p;
         RED_TRICK (c[i], p);
      }

      c[k<<1] -= c[tk];
      for (i=sa; i<tk+1; ++i) {
         c[i] -= c[k+i];
         RED_TRICK (c[i], p);
      }
      c[k] -= p;
      RED_TRICK (c[k], p);

      #if TYPE > 0
         // Add back the pre-initialized value saved on entry.
         c[k<<1] += temp;
         RED_TRICK (c[k<<1], p<<1);
         c[k<<1] -= p;
      #endif
      RED_TRICK (c[k<<1], p);
      
   }
}

// Karatsuba-like algorithm (with no extra space), requires |a|+1 == |b|
// (i.e. "One-different" version)
//
// Arguments:
//   c    output array, also used for every temporary; this function
//        indexes c[0] through c[2*sa-1]. For TYPE > 0 leading entries of c
//        are read as input before being overwritten (see the file header).
//   a1   first operand, sa elements. For TYPE 2 the operand a2 supplied
//        through SARG_DEC is added to a1 elementwise.
//   b    second operand, sa+1 elements
//   sa   size of the first operand, split as k = sa/2 (rounded down)
//   p    modulus, handed to RED_TRICK, addSMul and the sub-multiplications
//
// The statement order matters: regions of c are reused as scratch and as
// operands of the recursive calls.
// NOTE(review): RED_TRICK, addSMul and the classikar_* routines are defined
// outside this section. RED_TRICK(x, m) presumably adds m to x when x is
// negative, addSMul presumably adds a scalar multiple of a vector, and
// classikar_N / classikar_od_N are presumably the TYPE N instantiations of
// CK / OD -- confirm in classikar.c.
inline void KD (lsmul_ele *c, const lsmul_ele *a1 SARG_DEC,
   const lsmul_ele *b, long sa, lsmul_ele p)
{
   // tk = sa + k is where the scratch area at the top of c starts.
   long i, k = sa >> 1, tk = sa + k;

   if ((sa&1) == 0) { // even case. |a|=2k, |b|=2k+1

      // Scratch c[tk .. tk+k) <- (low half of a) + (high half of a),
      // including a2 for TYPE 2, followed by the -p / RED_TRICK correction
      // steps.
      for (i=0; i<k; ++i) {
         c[tk+i] = a1[i] + a1[k+i];
         #if TYPE == 2
            c[tk+i] += a2[i] + a2[k+i] - (p<<1);
            RED_TRICK (c[tk+i], p<<1);
         #endif
         c[tk+i] -= p;
         RED_TRICK (c[tk+i], p);
      }

      #if TYPE == 0
         // c[0..k) <- b[i] + b[k+i]; the product of the two k-entry sums
         // is then written at c+k.
         for (i=0; i<k; ++i) {
            c[i] = b[i] + b[k+i] - p;
            RED_TRICK (c[i], p);
         }
         CK (c+k, c, c+tk, k, p);
      #else
         // c is pre-initialized: add c[0..k) onto c[k..2k), then call the
         // two-operand routine targeting c+k.
         for (i=0; i<k; ++i) {
            c[k+i] += c[i];
            RED_TRICK (c[k+i], p<<1);
            c[k+i] -= p;
         }
         classikar_2 (c+k, b, b+k, c+tk, k, p);
      #endif
      // c[tk-1] is one past the 2k-1 coefficients written at c+k; clear it,
      // then account for the top entry b[sa], which was not part of the
      // sums above, with an addSMul call on the k entries at c+sa.
      c[tk-1] = (lsmul_ele) 0;
      addSMul (c+sa, c+tk, k, b[sa], p);

      // Save c[i] + c[k+i] for i in [k, sa) at c[sa+i], i.e. in
      // c[tk .. 2*sa-1], before the next call writes to the low part of c.
      for (i=k; i<sa; ++i)
         c[sa+i] = c[i] + c[k+i];

      // Low product: first k entries of each operand, targeting c.
      #if TYPE < 2
         CK (c, a1, b, k, p);
      #else
         CK (c, a1, a2, b, k, p);
      #endif

      // Subtract the upper coefficients of the low product from c[sa..),
      // and rebuild c[k..2k) as (saved sums) - (lower coefficients of the
      // low product).
      for (i=0; i<k-1; ++i)
         c[sa+i] -= c[k+i];
      for (i=0; i<k; ++i)
         c[k+i] = c[tk+i] - c[i];

      // High product: a[k..2k) (k entries) times b[k..2k] (k+1 entries),
      // targeting c+sa.
      #if TYPE < 2
         classikar_od_1 (c+sa, a1+k, b+k, k, p);
      #else
         OD (c+sa, a1+k, a2+k, b+k, k, p);
      #endif

      // Final combination: subtract the entry k positions higher from each
      // of c[k..sa) and c[sa..tk), with the correction steps.
      for (i=k; i<sa; ++i) {
         c[i] -= c[k+i];
         RED_TRICK (c[i], p<<1);
         c[i] -= p;
         RED_TRICK (c[i], p);
      }

      for (i=sa; i<tk; ++i) {
         c[i] -= c[k+i];
         RED_TRICK (c[i], p);
      }

   } else { // odd case

      // Here sa == 2k+1 and tk == 3k+1. The scratch area is c[tk .. tk+k]:
      // slot tk gets the unpaired entry a[k], slots tk+1+i get
      // a[i] + a[k+1+i] (including a2 for TYPE 2).
      c[tk] = a1[k];
      #if TYPE == 2
         c[tk] += a2[k] - p;
         RED_TRICK (c[tk], p);
      #endif
      for (i=0; i<k; ++i) {
         c[tk+1+i] = a1[i] + a1[k+1+i];
         #if TYPE == 2
            c[tk+1+i] += a2[i] + a2[k+1+i] - (p<<1);
            RED_TRICK (c[tk+1+i], p<<1);
         #endif
         c[tk+1+i] -= p;
         RED_TRICK (c[tk+1+i], p);
      }

      #if TYPE == 0
         // c[1..k] <- b[i] + b[k+1+i] for i = 1..k; this k-entry sum is
         // multiplied by the (k+1)-entry scratch sum, result at c+(k+1).
         // The i = 0 pair b[0]+b[k+1] is not stored: it is passed as the
         // scalar of the addSMul call on the k+1 entries at c+k.
         for (i=1; i<k+1; ++i) {
            c[i] = b[i] + b[k+1+i] - p;
            RED_TRICK (c[i], p);
         }
         OD (c+(k+1), c+1, c+tk, k, p);
         c[k] = (lsmul_ele) 0;
         addSMul (c+k, c+tk, k+1, b[0]+b[k+1], p);
      #else
         // c is pre-initialized: add c[0..k) onto c[k+1..2k+1), then call
         // the two-operand routine with b, b+(k+1) as the summed operand,
         // size k+1, targeting c+k.
         for (i=0; i<k; ++i) {
            c[k+1+i] += c[i];
            RED_TRICK (c[k+1+i], p<<1);
            c[k+1+i] -= p;
         }
         classikar_2 (c+k, b, b+(k+1), c+tk, k+1, p);
      #endif

      // Save c[k+i] + c[sa+i] (i < k) in the scratch area and c[sa-1] in
      // the last slot c[tk+k] before the next call writes to the low part.
      for (i=0; i<k; ++i)
         c[tk+i] = c[k+i] + c[sa+i];
      c[tk+k] = c[sa-1];

      // Low product: a[0..k) (k entries) times b[0..k] (k+1 entries),
      // targeting c.
      #if TYPE < 2
         OD (c, a1, b, k, p);
      #else
         OD (c, a1, a2, b, k, p);
      #endif

      // Subtract the upper coefficients of the low product from c[sa..),
      // restore c[k] from the scratch area, and rebuild c[k+1..2k+1) as
      // (saved sums) - (lower coefficients of the low product).
      for (i=0; i<k; ++i)
         c[sa+i] -= c[k+i];
      c[k] = c[tk];
      for (i=0; i<k; ++i)
         c[k+1+i] = c[tk+1+i] - c[i];
      // c[tk] == c[sa+k] is the last of the k+1 entries at c+sa; it is
      // zeroed before the call below that targets c+sa.
      c[tk] = (lsmul_ele) 0;

      // High product: a[k..2k] times b[k+1..2k+1] (k+1 entries each),
      // targeting c+sa.
      #if TYPE < 2
         classikar_1 (c+sa, a1+k, b+(k+1), k+1, p);
      #else
         CK (c+sa, a1+k, a2+k, b+(k+1), k+1, p);
      #endif

      // Final combination: subtract the entry k+1 positions higher from
      // each of c[k..sa) and c[sa..tk), with the correction steps.
      for (i=k; i<sa; ++i) {
         c[i] -= c[k+1+i];
         RED_TRICK (c[i], p<<1);
         c[i] -= p;
         RED_TRICK (c[i], p);
      }

      for (i=sa; i<tk; ++i) {
         c[i] -= c[k+1+i];
         RED_TRICK (c[i], p);
      }
   
   }
}

// These functions choose whether to call the classical or the Karatsuba
// routine, based on the argument size.

// Type "2" requires a second argument.
// In this form SARG_USE takes no parameter and is spliced into call
// argument lists right after a1: for TYPE 2 it forwards a2 (with its
// leading comma), for every other TYPE it expands to nothing.
#if TYPE == 2
#define SARG_USE , a2
#else
#define SARG_USE
#endif

// Size dispatcher for equal-length operands (requires |a| = |b| = sa):
// operands longer than KARX elements are handled by the Karatsuba routine
// KR, everything else by the classical routine CL.
void CK (lsmul_ele *c, const lsmul_ele *a1 SARG_DEC,
   const lsmul_ele *b, long sa, lsmul_ele p)
{
   if (sa > KARX) {
      KR (c, a1 SARG_USE, b, sa, p);
      return;
   }
   CL (c, a1 SARG_USE, sa, b, sa, p);
}

// Size dispatcher for the "one different" case (requires |a| + 1 = |b|):
// first operands longer than KARX elements are handled by the Karatsuba
// variant KD, everything else by the classical routine CL with sb = sa+1.
void OD (lsmul_ele *c, const lsmul_ele *a1 SARG_DEC,
   const lsmul_ele *b, long sa, lsmul_ele p)
{
   if (sa > KARX) {
      KD (c, a1 SARG_USE, b, sa, p);
      return;
   }
   CL (c, a1 SARG_USE, sa, b, sa+1, p);
}

// Remove the helper macros so that this file can be included again with a
// different TYPE.
#undef SARG_USE
#undef SARG_DEC
