/* 
 *  M_APM  -  mapmfmul.c
 *
 *  Copyright (C) 1999 - 2007   Michael C. Ring
 *
 *  Permission to use, copy, and distribute this software and its
 *  documentation for any purpose with or without fee is hereby granted,
 *  provided that the above copyright notice appear in all copies and
 *  that both that copyright notice and this permission notice appear
 *  in supporting documentation.
 *
 *  Permission to modify the software is granted. Permission to distribute
 *  the modified code is granted. Modifications are to be distributed by
 *  using the file 'license.txt' as a template to modify the file header.
 *  'license.txt' is available in the official MAPM distribution.
 *
 *  This software is provided "as is" without express or implied warranty.
 */

/*
 *      $Id: mapmfmul.c,v 1.33 2007/12/03 01:52:22 mike Exp $
 *
 *      This file contains the divide-and-conquer FAST MULTIPLICATION 
 *	function as well as its support functions.
 *
 *      $Log: mapmfmul.c,v $
 *      Revision 1.33  2007/12/03 01:52:22  mike
 *      Update license
 *
 *      Revision 1.32  2004/02/18 03:16:15  mike
 *      optimize 4 byte multiply (when FFT is disabled)
 *
 *      Revision 1.31  2003/12/04 01:14:06  mike
 *      redo math on 'borrow'
 *
 *      Revision 1.30  2003/07/21 20:34:18  mike
 *      Modify error messages to be in a consistent format.
 *
 *      Revision 1.29  2003/03/31 21:55:07  mike
 *      call generic error handling function
 *
 *      Revision 1.28  2002/11/03 22:38:15  mike
 *      Updated function parameters to use the modern style
 *
 *      Revision 1.27  2002/02/14 19:53:32  mike
 *      add conditional compiler option to disable use
 *      of FFT multiply if the user so chooses.
 *
 *      Revision 1.26  2001/07/26 20:56:38  mike
 *      fix comment, no code change
 *
 *      Revision 1.25  2001/07/16 19:43:45  mike
 *      add function M_free_all_fmul
 *
 *      Revision 1.24  2001/02/11 22:34:47  mike
 *      modify parameters to REALLOC
 *
 *      Revision 1.23  2000/10/20 19:23:26  mike
 *      adjust power_of_2 function so it should work with
 *      64 bit processors and beyond.
 *
 *      Revision 1.22  2000/08/23 22:27:34  mike
 *      no real code change, re-named 2 local functions
 *      so they make more sense.
 *
 *      Revision 1.21  2000/08/01 22:24:38  mike
 *      use sizeof(int) function call to stop some
 *      compilers from complaining.
 *
 *      Revision 1.20  2000/07/19 17:12:00  mike
 *      lower the number of bytes that the FFT can handle. worst case
 *      testing indicated math overflow when >= 1048576
 *
 *      Revision 1.19  2000/07/08 18:29:03  mike
 *      increase define so FFT can handle bigger numbers
 *
 *      Revision 1.18  2000/07/06 23:20:12  mike
 *      changed my mind. use static local MAPM numbers
 *      for temp data storage
 *
 *      Revision 1.17  2000/07/06 20:52:34  mike
 *      use init function to get local writable copies
 *      instead of using the stack
 *
 *      Revision 1.16  2000/07/04 17:25:09  mike
 *      guarantee 16 bit compilers still work OK
 *
 *      Revision 1.15  2000/07/04 15:40:02  mike
 *      add call to use FFT algorithm
 *
 *      Revision 1.14  2000/05/05 21:10:46  mike
 *      add comment indicating availability of assembly language
 *      version of M_4_byte_multiply for Linux on x86 platforms.
 *
 *      Revision 1.13  2000/04/20 19:30:45  mike
 *      minor optimization to 4 byte multiply
 *
 *      Revision 1.12  2000/04/14 15:39:30  mike
 *      optimize the fast multiply function. don't re-curse down
 *      to a size of 1. recurse down to a size of '4' and then
 *      call a special 4 byte multiply function.
 *
 *      Revision 1.11  2000/02/03 23:02:13  mike
 *      put in RCS for real...
 *
 *      Revision 1.10  2000/02/03 22:59:08  mike
 *      remove the extra recursive function. not needed any
 *      longer since all current compilers should not have
 *      any problem with true recursive calls.
 *
 *      Revision 1.9  2000/02/03 22:47:39  mike
 *      use MAPM_* generic memory function
 *
 *      Revision 1.8  1999/09/19 21:13:44  mike
 *      eliminate unneeded local int in _split
 *
 *      Revision 1.7  1999/08/12 22:36:23  mike
 *      move the 3 'simple' function to the top of file
 *      so GCC can in-line the code.
 *
 *      Revision 1.6  1999/08/12 22:01:14  mike
 *      more minor optimizations
 *
 *      Revision 1.5  1999/08/12 02:02:06  mike
 *      minor optimization
 *
 *      Revision 1.4  1999/08/10 22:51:59  mike
 *      minor tweak
 *
 *      Revision 1.3  1999/08/10 00:45:47  mike
 *      added more comments and a few minor tweaks
 *
 *      Revision 1.2  1999/08/09 02:50:02  mike
 *      add some comments
 *
 *      Revision 1.1  1999/08/08 18:27:57  mike
 *      Initial revision
 */

#include "m_apm_lc.h"

static int M_firsttimef = TRUE;

/*
 *      specify the max size the FFT routine can handle 
 *      (in MAPM, #digits = 2 * #bytes)
 *
 *      this number *must* be an exact power of 2.
 *
 *      **WORST** case input numbers (all 9's) has shown that
 *	the FFT math will overflow if the #define here is 
 *      >= 1048576. On my system, 524,288 worked OK. I will
 *      factor down another factor of 2 to safeguard against 
 *	other computers have less precise floating point math. 
 *	If you are confident in your system, 524288 will 
 *	theoretically work fine.
 *
 *      the define here allows the FFT algorithm to multiply two
 *      524,288 digit numbers yielding a 1,048,576 digit result.
 */

#define MAX_FFT_BYTES 262144

/*
 *      the Divide-and-Conquer multiplication kicks in when the size of
 *	the numbers exceed the capability of the FFT (#define just above).
 *
 *	#bytes    D&C call depth
 *	------    --------------
 *       512K           1
 *        1M            2
 *        2M            3
 *        4M            4
 *       ...           ...
 *    2.1990E+12       23 
 *
 *	the following stack sizes are sized to meet the 
 *      above 2.199E+12 example, though I wouldn't want to
 *	wait for it to finish...
 *
 *      Each call requires 7 stack variables to be saved so 
 *	we need a stack depth of 23 * 7 + PAD.  (we use 164)
 *
 *      For 'exp_stack', 3 integers also are required to be saved 
 *      for each recursive call so we need a stack depth of 
 *      23 * 3 + PAD. (we use 72)
 *
 *
 *      If the FFT multiply is disabled, resize the arrays
 *	as follows:
 *
 *      the following stack sizes are sized to meet the 
 *      worst case expected assuming we are multiplying 
 *      numbers with 2.14E+9 (2 ^ 31) digits. 
 *
 *      For sizeof(int) == 4 (32 bits) there may be up to 32 recursive
 *      calls. Each call requires 7 stack variables so we need a
 *      stack depth of 32 * 7 + PAD.  (we use 240)
 *
 *      For 'exp_stack', 3 integers also are required to be saved 
 *      for each recursive call so we need a stack depth of 
 *      32 * 3 + PAD. (we use 100)
 */

#ifdef NO_FFT_MULTIPLY
#define M_STACK_SIZE 240
#define M_ISTACK_SIZE 100
#else
#define M_STACK_SIZE 164
#define M_ISTACK_SIZE 72
#endif

static int    exp_stack[M_ISTACK_SIZE];
static int    exp_stack_ptr;

static UCHAR  *mul_stack_data[M_STACK_SIZE];
static int    mul_stack_data_size[M_STACK_SIZE];
static int    M_mul_stack_ptr;

static UCHAR  *fmul_a1, *fmul_a0, *fmul_a9, *fmul_b1, *fmul_b0, 
	      *fmul_b9, *fmul_t0;

static int    size_flag, bit_limit, stmp, itmp, mii;

static M_APM  M_ain;
static M_APM  M_bin;

static char   *M_stack_ptr_error_msg = "\'M_get_stack_ptr\', Out of memory";

extern void   M_fast_multiply(M_APM, M_APM, M_APM);
extern void   M_fmul_div_conq(UCHAR *, UCHAR *, UCHAR *, int);
extern void   M_fmul_add(UCHAR *, UCHAR *, int, int);
extern int    M_fmul_subtract(UCHAR *, UCHAR *, UCHAR *, int);
extern void   M_fmul_split(UCHAR *, UCHAR *, UCHAR *, int);
extern int    M_next_power_of_2(int);
extern int    M_get_stack_ptr(int);
extern void   M_push_mul_int(int);
extern int    M_pop_mul_int(void);

#ifdef NO_FFT_MULTIPLY
extern void   M_4_byte_multiply(UCHAR *, UCHAR *, UCHAR *);
#else
extern void   M_fast_mul_fft(UCHAR *, UCHAR *, UCHAR *, int);
#endif

/*
 *      the following algorithm is used in this fast multiply routine
 *	(sometimes called the divide-and-conquer technique.)
 *
 *	assume we have 2 numbers (a & b) with 2N digits. 
 *
 *      let : a = (2^N) * A1 + A0 , b = (2^N) * B1 + B0      
 *
 *	where 'A1' is the 'most significant half' of 'a' and 
 *      'A0' is the 'least significant half' of 'a'. Same for 
 *	B1 and B0.
 *
 *	Now use the identity :
 *
 *               2N   N            N                    N
 *	ab  =  (2  + 2 ) A1B1  +  2 (A1-A0)(B0-B1)  + (2 + 1)A0B0
 *
 *
 *      The original problem of multiplying 2 (2N) digit numbers has
 *	been reduced to 3 multiplications of N digit numbers plus some
 *	additions, subtractions, and shifts.
 *
 *	The fast multiplication algorithm used here uses the above 
 *	identity in a recursive process. This algorithm results in
 *	O(n ^ 1.585) growth.
 */


/****************************************************************************/
void	M_free_all_fmul()
{
int	k;

if (M_firsttimef == FALSE)
  {
   m_apm_free(M_ain);
   m_apm_free(M_bin);

   for (k=0; k < M_STACK_SIZE; k++)
     {
      if (mul_stack_data_size[k] != 0)
        {
         MAPM_FREE(mul_stack_data[k]);
	}
     }

   M_firsttimef = TRUE;
  }
}
/****************************************************************************/
void	M_push_mul_int(int val)
{
exp_stack[++exp_stack_ptr] = val;
}
/****************************************************************************/
int	M_pop_mul_int()
{
return(exp_stack[exp_stack_ptr--]);
}
/****************************************************************************/
void   	M_fmul_split(UCHAR *x1, UCHAR *x0, UCHAR *xin, int nbytes)
{
memcpy(x1, xin, nbytes);
memcpy(x0, (xin + nbytes), nbytes);
}
/****************************************************************************/
void	M_fast_multiply(M_APM rr, M_APM aa, M_APM bb)
{
void	*vp;
int	ii, k, nexp, sign;

if (M_firsttimef)
  {
   M_firsttimef = FALSE;

   for (k=0; k < M_STACK_SIZE; k++)
     mul_stack_data_size[k] = 0;

   size_flag = M_get_sizeof_int();
   bit_limit = 8 * size_flag + 1;

   M_ain = m_apm_init();
   M_bin = m_apm_init();
  }

exp_stack_ptr   = -1;
M_mul_stack_ptr = -1;

m_apm_copy(M_ain, aa);
m_apm_copy(M_bin, bb);

sign = M_ain->m_apm_sign * M_bin->m_apm_sign;
nexp = M_ain->m_apm_exponent + M_bin->m_apm_exponent;

if (M_ain->m_apm_datalength >= M_bin->m_apm_datalength)
  ii = M_ain->m_apm_datalength;
else
  ii = M_bin->m_apm_datalength;

ii = (ii + 1) >> 1;
ii = M_next_power_of_2(ii);

/* Note: 'ii' must be >= 4 here. this is guaranteed 
   by the caller: m_apm_multiply
*/

k = 2 * ii;                   /* required size of result, in bytes  */

M_apm_pad(M_ain, k);          /* fill out the data so the number of */
M_apm_pad(M_bin, k);          /* bytes is an exact power of 2       */

if (k > rr->m_apm_malloclength)
  {
   if ((vp = MAPM_REALLOC(rr->m_apm_data, (k + 32))) == NULL)
     {
      /* fatal, this does not return */

      M_apm_log_error_msg(M_APM_FATAL, "\'M_fast_multiply\', Out of memory");
     }
  
   rr->m_apm_malloclength = k + 28;
   rr->m_apm_data = (UCHAR *)vp;
  }

#ifdef NO_FFT_MULTIPLY

M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data, 
                                M_bin->m_apm_data, ii);
#else

/*
 *     if the numbers are *really* big, use the divide-and-conquer
 *     routine first until the numbers are small enough to be handled 
 *     by the FFT algorithm. If the numbers are already small enough,
 *     call the FFT multiplication now.
 *
 *     Note that 'ii' here is (and must be) an exact power of 2.
 */

if (size_flag == 2)   /* if still using 16 bit compilers .... */
  {
   M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data, 
                                  M_bin->m_apm_data, ii);
  }
else                  /* >= 32 bit compilers */
  {
   if (ii > (MAX_FFT_BYTES + 2))
     {
      M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data, 
                                      M_bin->m_apm_data, ii);
     }
   else
     {
      M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data, 
                                     M_bin->m_apm_data, ii);
     }
  }

#endif

rr->m_apm_sign       = sign;
rr->m_apm_exponent   = nexp;
rr->m_apm_datalength = 4 * ii;

M_apm_normalize(rr);
}
/****************************************************************************/
/*
 *      This is the recursive function to perform the multiply. The 
 *      design intent here is to have no local variables. Any local
 *      data that needs to be saved is saved on one of the two stacks.
 */
void	M_fmul_div_conq(UCHAR *rr, UCHAR *aa, UCHAR *bb, int sz)
{

#ifdef NO_FFT_MULTIPLY

if (sz == 4)                /* multiply 4x4 yielding an 8 byte result */
  {
   M_4_byte_multiply(rr, aa, bb);
   return;
  }

#else

/*
 *  if the numbers are now small enough, let the FFT algorithm
 *  finish up.
 */

if (sz == MAX_FFT_BYTES)     
  {
   M_fast_mul_fft(rr, aa, bb, sz);
   return;
  }

#endif

memset(rr, 0, (2 * sz));    /* zero out the result */
mii = sz >> 1;

itmp = M_get_stack_ptr(mii);
M_push_mul_int(itmp);

fmul_a1 = mul_stack_data[itmp];

itmp    = M_get_stack_ptr(mii);
fmul_a0 = mul_stack_data[itmp];

itmp    = M_get_stack_ptr(2 * sz);
fmul_a9 = mul_stack_data[itmp];

itmp    = M_get_stack_ptr(mii);
fmul_b1 = mul_stack_data[itmp];

itmp    = M_get_stack_ptr(mii);
fmul_b0 = mul_stack_data[itmp];

itmp    = M_get_stack_ptr(2 * sz);
fmul_b9 = mul_stack_data[itmp];

itmp    = M_get_stack_ptr(2 * sz);
fmul_t0 = mul_stack_data[itmp];

M_fmul_split(fmul_a1, fmul_a0, aa, mii);
M_fmul_split(fmul_b1, fmul_b0, bb, mii);

stmp  = M_fmul_subtract(fmul_a9, fmul_a1, fmul_a0, mii);
stmp *= M_fmul_subtract(fmul_b9, fmul_b0, fmul_b1, mii);

M_push_mul_int(stmp);
M_push_mul_int(mii);

M_fmul_div_conq(fmul_t0, fmul_a0, fmul_b0, mii);

mii  = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();

M_push_mul_int(itmp);
M_push_mul_int(stmp);
M_push_mul_int(mii);

/*   to restore all stack variables ...
fmul_a1 = mul_stack_data[itmp];
fmul_a0 = mul_stack_data[itmp+1];
fmul_a9 = mul_stack_data[itmp+2];
fmul_b1 = mul_stack_data[itmp+3];
fmul_b0 = mul_stack_data[itmp+4];
fmul_b9 = mul_stack_data[itmp+5];
fmul_t0 = mul_stack_data[itmp+6];
*/

fmul_a1 = mul_stack_data[itmp];
fmul_b1 = mul_stack_data[itmp+3];
fmul_t0 = mul_stack_data[itmp+6];

memcpy((rr + sz), fmul_t0, sz);    /* first 'add', result is now zero */
				   /* so we just copy in the bytes    */
M_fmul_add(rr, fmul_t0, mii, sz);

M_fmul_div_conq(fmul_t0, fmul_a1, fmul_b1, mii);

mii  = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();

M_push_mul_int(itmp);
M_push_mul_int(stmp);
M_push_mul_int(mii);

fmul_a9 = mul_stack_data[itmp+2];
fmul_b9 = mul_stack_data[itmp+5];
fmul_t0 = mul_stack_data[itmp+6];

M_fmul_add(rr, fmul_t0, 0, sz);
M_fmul_add(rr, fmul_t0, mii, sz);

if (stmp != 0)
  M_fmul_div_conq(fmul_t0, fmul_a9, fmul_b9, mii);

mii  = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();

fmul_t0 = mul_stack_data[itmp+6];

/*
 *  if the sign of (A1 - A0)(B0 - B1) is positive, ADD to
 *  the result. if it is negative, SUBTRACT from the result.
 */

if (stmp < 0)
  {
   fmul_a9 = mul_stack_data[itmp+2];
   fmul_b9 = mul_stack_data[itmp+5];

   memset(fmul_b9, 0, (2 * sz)); 
   memcpy((fmul_b9 + mii), fmul_t0, sz); 
   M_fmul_subtract(fmul_a9, rr, fmul_b9, (2 * sz));
   memcpy(rr, fmul_a9, (2 * sz));
  }

if (stmp > 0)
  M_fmul_add(rr, fmul_t0, mii, sz);

M_mul_stack_ptr -= 7;
}
/****************************************************************************/
/*
 *	special addition function for use with the fast multiply operation
 */
void    M_fmul_add(UCHAR *r, UCHAR *a, int offset, int sz)
{
int	i, j;
UCHAR   carry;

carry = 0;
j = offset + sz;
i = sz;

while (TRUE)
  {
   r[--j] += carry + a[--i];

   if (r[j] >= 100)
     {
      r[j] -= 100;
      carry = 1;
     }
   else
      carry = 0;

   if (i == 0)
     break;
  }

if (carry)
  {
   while (TRUE)
     {
      r[--j] += 1;
   
      if (r[j] < 100)
        break;

      r[j] -= 100;
     }
  }
}
/****************************************************************************/
/*
 *	special subtraction function for use with the fast multiply operation
 */
int     M_fmul_subtract(UCHAR *r, UCHAR *a, UCHAR *b, int sz)
{
int	k, jtmp, sflag, nb, borrow;

nb    = sz;
sflag = 0;      /* sign flag: assume the numbers are equal */

/*
 *   find if a > b (so we perform a-b)
 *   or      a < b (so we perform b-a)
 */

for (k=0; k < nb; k++)
  {
   if (a[k] < b[k])
     {
      sflag = -1;
      break;
     }

   if (a[k] > b[k])
     {
      sflag = 1;
      break;
     }
  }

if (sflag == 0)
  {
   memset(r, 0, nb);            /* zero out the result */
  }
else
  {
   k = nb;
   borrow = 0;

   while (TRUE)
     {
      k--;

      if (sflag == 1)
        jtmp = (int)a[k] - ((int)b[k] + borrow);
      else
        jtmp = (int)b[k] - ((int)a[k] + borrow);

      if (jtmp >= 0)
        {
         r[k] = (UCHAR)jtmp;
         borrow = 0;
        }
      else
        {
         r[k] = (UCHAR)(100 + jtmp);
         borrow = 1;
        }

      if (k == 0)
        break;
     }
  }

return(sflag);
}
/****************************************************************************/
int     M_next_power_of_2(int n)
{
int     ct, k;

if (n <= 2)
  return(n);

k  = 2;
ct = 0;

while (TRUE)
  {
   if (k >= n)
     break;

   k = k << 1;

   if (++ct == bit_limit)
     {
      /* fatal, this does not return */

      M_apm_log_error_msg(M_APM_FATAL, 
               "\'M_next_power_of_2\', ERROR :sizeof(int) too small ??");
     }
  }

return(k);
}
/****************************************************************************/
int	M_get_stack_ptr(int sz)
{
int	i, k;
UCHAR   *cp;

k = ++M_mul_stack_ptr;

/* if size is 0, just need to malloc and return */
if (mul_stack_data_size[k] == 0)
  {
   if ((i = sz) < 16)
     i = 16;

   if ((cp = (UCHAR *)MAPM_MALLOC(i + 4)) == NULL)
     {
      /* fatal, this does not return */

      M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg);
     }

   mul_stack_data[k]      = cp;
   mul_stack_data_size[k] = i;
  }
else        /* it has been malloc'ed, see if it's big enough */
  {
   if (sz > mul_stack_data_size[k])
     {
      cp = mul_stack_data[k];

      if ((cp = (UCHAR *)MAPM_REALLOC(cp, (sz + 4))) == NULL)
        {
         /* fatal, this does not return */
   
         M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg);
        }
   
      mul_stack_data[k]      = cp;
      mul_stack_data_size[k] = sz;
     }
  }

return(k);
}
/****************************************************************************/

#ifdef NO_FFT_MULTIPLY

/*
 *      multiply a 4 byte number by a 4 byte number
 *      yielding an 8 byte result. each byte contains
 *      a base 100 'digit', i.e.: range from 0-99.
 *
 *             MSB         LSB
 *
 *      a,b    [0] [1] [2] [3]
 *   result    [0]  .....  [7]
 */

void	M_4_byte_multiply(UCHAR *r, UCHAR *a, UCHAR *b)
{
int	      jj;
unsigned int  *ip, t1, rr[8];

memset(rr, 0, (8 * sizeof(int)));        /* zero out result */
jj = 3;
ip = rr + 5;

/*
 *   loop for one number [b], un-roll the inner 'loop' [a]
 *
 *   accumulate partial sums in UINT array, release carries 
 *   and convert back to base 100 at the end
 */

while (1)
  {
   t1  = (unsigned int)b[jj];
   ip += 2;

   *ip-- += t1 * a[3];
   *ip-- += t1 * a[2];
   *ip-- += t1 * a[1];
   *ip   += t1 * a[0];
   
   if (jj-- == 0)
     break;
  }

jj = 7;

while (1)
  {
   t1 = rr[jj] / 100;
   r[jj] = (UCHAR)(rr[jj] - 100 * t1);

   if (jj == 0)
     break;

   rr[--jj] += t1;
  }
}

#endif

/****************************************************************************/