1
2 /*
3 * M_APM - mapmfmul.c
4 *
5 * Copyright (C) 1999 - 2007 Michael C. Ring
6 *
7 * Permission to use, copy, and distribute this software and its
8 * documentation for any purpose with or without fee is hereby granted,
9 * provided that the above copyright notice appear in all copies and
10 * that both that copyright notice and this permission notice appear
11 * in supporting documentation.
12 *
13 * Permission to modify the software is granted. Permission to distribute
14 * the modified code is granted. Modifications are to be distributed by
15 * using the file 'license.txt' as a template to modify the file header.
16 * 'license.txt' is available in the official MAPM distribution.
17 *
18 * This software is provided "as is" without express or implied warranty.
19 */
20
21 /*
22 * $Id: mapmfmul.c,v 1.33 2007/12/03 01:52:22 mike Exp $
23 *
24 * This file contains the divide-and-conquer FAST MULTIPLICATION
25 * function as well as its support functions.
26 *
27 * $Log: mapmfmul.c,v $
28 * Revision 1.33 2007/12/03 01:52:22 mike
29 * Update license
30 *
31 * Revision 1.32 2004/02/18 03:16:15 mike
32 * optimize 4 byte multiply (when FFT is disabled)
33 *
34 * Revision 1.31 2003/12/04 01:14:06 mike
35 * redo math on 'borrow'
36 *
37 * Revision 1.30 2003/07/21 20:34:18 mike
38 * Modify error messages to be in a consistent format.
39 *
40 * Revision 1.29 2003/03/31 21:55:07 mike
41 * call generic error handling function
42 *
43 * Revision 1.28 2002/11/03 22:38:15 mike
44 * Updated function parameters to use the modern style
45 *
46 * Revision 1.27 2002/02/14 19:53:32 mike
47 * add conditional compiler option to disable use
48 * of FFT multiply if the user so chooses.
49 *
50 * Revision 1.26 2001/07/26 20:56:38 mike
51 * fix comment, no code change
52 *
53 * Revision 1.25 2001/07/16 19:43:45 mike
54 * add function M_free_all_fmul
55 *
56 * Revision 1.24 2001/02/11 22:34:47 mike
57 * modify parameters to REALLOC
58 *
59 * Revision 1.23 2000/10/20 19:23:26 mike
60 * adjust power_of_2 function so it should work with
61 * 64 bit processors and beyond.
62 *
63 * Revision 1.22 2000/08/23 22:27:34 mike
64 * no real code change, re-named 2 local functions
65 * so they make more sense.
66 *
67 * Revision 1.21 2000/08/01 22:24:38 mike
68 * use sizeof(int) function call to stop some
69 * compilers from complaining.
70 *
71 * Revision 1.20 2000/07/19 17:12:00 mike
72 * lower the number of bytes that the FFT can handle. worst case
73 * testing indicated math overflow when >= 1048576
74 *
75 * Revision 1.19 2000/07/08 18:29:03 mike
76 * increase define so FFT can handle bigger numbers
77 *
78 * Revision 1.18 2000/07/06 23:20:12 mike
79 * changed my mind. use static local MAPM numbers
80 * for temp data storage
81 *
82 * Revision 1.17 2000/07/06 20:52:34 mike
83 * use init function to get local writable copies
84 * instead of using the stack
85 *
86 * Revision 1.16 2000/07/04 17:25:09 mike
87 * guarantee 16 bit compilers still work OK
88 *
89 * Revision 1.15 2000/07/04 15:40:02 mike
90 * add call to use FFT algorithm
91 *
92 * Revision 1.14 2000/05/05 21:10:46 mike
93 * add comment indicating availability of assembly language
94 * version of M_4_byte_multiply for Linux on x86 platforms.
95 *
96 * Revision 1.13 2000/04/20 19:30:45 mike
97 * minor optimization to 4 byte multiply
98 *
99 * Revision 1.12 2000/04/14 15:39:30 mike
100 * optimize the fast multiply function. don't re-curse down
101 * to a size of 1. recurse down to a size of '4' and then
102 * call a special 4 byte multiply function.
103 *
104 * Revision 1.11 2000/02/03 23:02:13 mike
105 * put in RCS for real...
106 *
107 * Revision 1.10 2000/02/03 22:59:08 mike
108 * remove the extra recursive function. not needed any
109 * longer since all current compilers should not have
110 * any problem with true recursive calls.
111 *
112 * Revision 1.9 2000/02/03 22:47:39 mike
113 * use MAPM_* generic memory function
114 *
115 * Revision 1.8 1999/09/19 21:13:44 mike
116 * eliminate unneeded local int in _split
117 *
118 * Revision 1.7 1999/08/12 22:36:23 mike
119 * move the 3 'simple' function to the top of file
120 * so GCC can in-line the code.
121 *
122 * Revision 1.6 1999/08/12 22:01:14 mike
123 * more minor optimizations
124 *
125 * Revision 1.5 1999/08/12 02:02:06 mike
126 * minor optimization
127 *
128 * Revision 1.4 1999/08/10 22:51:59 mike
129 * minor tweak
130 *
131 * Revision 1.3 1999/08/10 00:45:47 mike
132 * added more comments and a few minor tweaks
133 *
134 * Revision 1.2 1999/08/09 02:50:02 mike
135 * add some comments
136 *
137 * Revision 1.1 1999/08/08 18:27:57 mike
138 * Initial revision
139 */
140
141 #include "m_apm_lc.h"
142
143 static int M_firsttimef = TRUE;
144
145 /*
146 * specify the max size the FFT routine can handle
147 * (in MAPM, #digits = 2 * #bytes)
148 *
149 * this number *must* be an exact power of 2.
150 *
151 * **WORST** case input numbers (all 9's) has shown that
152 * the FFT math will overflow if the #define here is
153 * >= 1048576. On my system, 524,288 worked OK. I will
154 * factor down another factor of 2 to safeguard against
155 * other computers have less precise floating point math.
156 * If you are confident in your system, 524288 will
157 * theoretically work fine.
158 *
159 * the define here allows the FFT algorithm to multiply two
160 * 524,288 digit numbers yielding a 1,048,576 digit result.
161 */
162
163 #define MAX_FFT_BYTES 262144
164
165 /*
166 * the Divide-and-Conquer multiplication kicks in when the size of
167 * the numbers exceed the capability of the FFT (#define just above).
168 *
169 * #bytes D&C call depth
170 * ------ --------------
171 * 512K 1
172 * 1M 2
173 * 2M 3
174 * 4M 4
175 * ... ...
176 * 2.1990E+12 23
177 *
178 * the following stack sizes are sized to meet the
179 * above 2.199E+12 example, though I wouldn't want to
180 * wait for it to finish...
181 *
182 * Each call requires 7 stack variables to be saved so
183 * we need a stack depth of 23 * 7 + PAD. (we use 164)
184 *
185 * For 'exp_stack', 3 integers also are required to be saved
186 * for each recursive call so we need a stack depth of
187 * 23 * 3 + PAD. (we use 72)
188 *
189 *
190 * If the FFT multiply is disabled, resize the arrays
191 * as follows:
192 *
193 * the following stack sizes are sized to meet the
194 * worst case expected assuming we are multiplying
195 * numbers with 2.14E+9 (2 ^ 31) digits.
196 *
197 * For sizeof(int) == 4 (32 bits) there may be up to 32 recursive
198 * calls. Each call requires 7 stack variables so we need a
199 * stack depth of 32 * 7 + PAD. (we use 240)
200 *
201 * For 'exp_stack', 3 integers also are required to be saved
202 * for each recursive call so we need a stack depth of
203 * 32 * 3 + PAD. (we use 100)
204 */
205
206 #ifdef NO_FFT_MULTIPLY
207 #define M_STACK_SIZE 240
208 #define M_ISTACK_SIZE 100
209 #else
210 #define M_STACK_SIZE 164
211 #define M_ISTACK_SIZE 72
212 #endif
213
214 static int exp_stack[M_ISTACK_SIZE];
215 static int exp_stack_ptr;
216
217 static UCHAR *mul_stack_data[M_STACK_SIZE];
218 static int mul_stack_data_size[M_STACK_SIZE];
219 static int M_mul_stack_ptr;
220
221 static UCHAR *fmul_a1, *fmul_a0, *fmul_a9, *fmul_b1, *fmul_b0,
222 *fmul_b9, *fmul_t0;
223
224 static int size_flag, bit_limit, stmp, itmp, mii;
225
226 static M_APM M_ain;
227 static M_APM M_bin;
228
229 static char *M_stack_ptr_error_msg = "\'M_get_stack_ptr\', Out of memory";
230
231 extern void M_fast_multiply(M_APM, M_APM, M_APM);
232 extern void M_fmul_div_conq(UCHAR *, UCHAR *, UCHAR *, int);
233 extern void M_fmul_add(UCHAR *, UCHAR *, int, int);
234 extern int M_fmul_subtract(UCHAR *, UCHAR *, UCHAR *, int);
235 extern void M_fmul_split(UCHAR *, UCHAR *, UCHAR *, int);
236 extern int M_next_power_of_2(int);
237 extern int M_get_stack_ptr(int);
238 extern void M_push_mul_int(int);
239 extern int M_pop_mul_int(void);
240
241 #ifdef NO_FFT_MULTIPLY
242 extern void M_4_byte_multiply(UCHAR *, UCHAR *, UCHAR *);
243 #else
244 extern void M_fast_mul_fft(UCHAR *, UCHAR *, UCHAR *, int);
245 #endif
246
247 /*
248 * the following algorithm is used in this fast multiply routine
249 * (sometimes called the divide-and-conquer technique.)
250 *
251 * assume we have 2 numbers (a & b) with 2N digits.
252 *
253 * let : a = (2^N) * A1 + A0 , b = (2^N) * B1 + B0
254 *
255 * where 'A1' is the 'most significant half' of 'a' and
256 * 'A0' is the 'least significant half' of 'a'. Same for
257 * B1 and B0.
258 *
259 * Now use the identity :
260 *
261 * 2N N N N
262 * ab = (2 + 2 ) A1B1 + 2 (A1-A0)(B0-B1) + (2 + 1)A0B0
263 *
264 *
265 * The original problem of multiplying 2 (2N) digit numbers has
266 * been reduced to 3 multiplications of N digit numbers plus some
267 * additions, subtractions, and shifts.
268 *
269 * The fast multiplication algorithm used here uses the above
270 * identity in a recursive process. This algorithm results in
271 * O(n ^ 1.585) growth.
272 */
273
274
275 /****************************************************************************/
M_free_all_fmul()276 void M_free_all_fmul()
277 {
278 int k;
279
280 if (M_firsttimef == FALSE)
281 {
282 m_apm_free(M_ain);
283 m_apm_free(M_bin);
284
285 for (k=0; k < M_STACK_SIZE; k++)
286 {
287 if (mul_stack_data_size[k] != 0)
288 {
289 MAPM_FREE(mul_stack_data[k]);
290 }
291 }
292
293 M_firsttimef = TRUE;
294 }
295 }
296 /****************************************************************************/
M_push_mul_int(int val)297 void M_push_mul_int(int val)
298 {
299 exp_stack[++exp_stack_ptr] = val;
300 }
301 /****************************************************************************/
M_pop_mul_int()302 int M_pop_mul_int()
303 {
304 return(exp_stack[exp_stack_ptr--]);
305 }
306 /****************************************************************************/
M_fmul_split(UCHAR * x1,UCHAR * x0,UCHAR * xin,int nbytes)307 void M_fmul_split(UCHAR *x1, UCHAR *x0, UCHAR *xin, int nbytes)
308 {
309 memcpy(x1, xin, nbytes);
310 memcpy(x0, (xin + nbytes), nbytes);
311 }
312 /****************************************************************************/
M_fast_multiply(M_APM rr,M_APM aa,M_APM bb)313 void M_fast_multiply(M_APM rr, M_APM aa, M_APM bb)
314 {
315 void *vp;
316 int ii, k, nexp, sign;
317
318 if (M_firsttimef)
319 {
320 M_firsttimef = FALSE;
321
322 for (k=0; k < M_STACK_SIZE; k++)
323 mul_stack_data_size[k] = 0;
324
325 size_flag = M_get_sizeof_int();
326 bit_limit = 8 * size_flag + 1;
327
328 M_ain = m_apm_init();
329 M_bin = m_apm_init();
330 }
331
332 exp_stack_ptr = -1;
333 M_mul_stack_ptr = -1;
334
335 m_apm_copy(M_ain, aa);
336 m_apm_copy(M_bin, bb);
337
338 sign = M_ain->m_apm_sign * M_bin->m_apm_sign;
339 nexp = M_ain->m_apm_exponent + M_bin->m_apm_exponent;
340
341 if (M_ain->m_apm_datalength >= M_bin->m_apm_datalength)
342 ii = M_ain->m_apm_datalength;
343 else
344 ii = M_bin->m_apm_datalength;
345
346 ii = (ii + 1) >> 1;
347 ii = M_next_power_of_2(ii);
348
349 /* Note: 'ii' must be >= 4 here. this is guaranteed
350 by the caller: m_apm_multiply
351 */
352
353 k = 2 * ii; /* required size of result, in bytes */
354
355 M_apm_pad(M_ain, k); /* fill out the data so the number of */
356 M_apm_pad(M_bin, k); /* bytes is an exact power of 2 */
357
358 if (k > rr->m_apm_malloclength)
359 {
360 if ((vp = MAPM_REALLOC(rr->m_apm_data, (k + 32))) == NULL)
361 {
362 /* fatal, this does not return */
363
364 M_apm_log_error_msg(M_APM_FATAL, "\'M_fast_multiply\', Out of memory");
365 }
366
367 rr->m_apm_malloclength = k + 28;
368 rr->m_apm_data = (UCHAR *)vp;
369 }
370
371 #ifdef NO_FFT_MULTIPLY
372
373 M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data,
374 M_bin->m_apm_data, ii);
375 #else
376
377 /*
378 * if the numbers are *really* big, use the divide-and-conquer
379 * routine first until the numbers are small enough to be handled
380 * by the FFT algorithm. If the numbers are already small enough,
381 * call the FFT multiplication now.
382 *
383 * Note that 'ii' here is (and must be) an exact power of 2.
384 */
385
386 if (size_flag == 2) /* if still using 16 bit compilers .... */
387 {
388 M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data,
389 M_bin->m_apm_data, ii);
390 }
391 else /* >= 32 bit compilers */
392 {
393 if (ii > (MAX_FFT_BYTES + 2))
394 {
395 M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data,
396 M_bin->m_apm_data, ii);
397 }
398 else
399 {
400 M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data,
401 M_bin->m_apm_data, ii);
402 }
403 }
404
405 #endif
406
407 rr->m_apm_sign = sign;
408 rr->m_apm_exponent = nexp;
409 rr->m_apm_datalength = 4 * ii;
410
411 M_apm_normalize(rr);
412 }
413 /****************************************************************************/
414 /*
415 * This is the recursive function to perform the multiply. The
416 * design intent here is to have no local variables. Any local
417 * data that needs to be saved is saved on one of the two stacks.
418 */
M_fmul_div_conq(UCHAR * rr,UCHAR * aa,UCHAR * bb,int sz)419 void M_fmul_div_conq(UCHAR *rr, UCHAR *aa, UCHAR *bb, int sz)
420 {
421
422 #ifdef NO_FFT_MULTIPLY
423
424 if (sz == 4) /* multiply 4x4 yielding an 8 byte result */
425 {
426 M_4_byte_multiply(rr, aa, bb);
427 return;
428 }
429
430 #else
431
432 /*
433 * if the numbers are now small enough, let the FFT algorithm
434 * finish up.
435 */
436
437 if (sz == MAX_FFT_BYTES)
438 {
439 M_fast_mul_fft(rr, aa, bb, sz);
440 return;
441 }
442
443 #endif
444
445 memset(rr, 0, (2 * sz)); /* zero out the result */
446 mii = sz >> 1;
447
448 itmp = M_get_stack_ptr(mii);
449 M_push_mul_int(itmp);
450
451 fmul_a1 = mul_stack_data[itmp];
452
453 itmp = M_get_stack_ptr(mii);
454 fmul_a0 = mul_stack_data[itmp];
455
456 itmp = M_get_stack_ptr(2 * sz);
457 fmul_a9 = mul_stack_data[itmp];
458
459 itmp = M_get_stack_ptr(mii);
460 fmul_b1 = mul_stack_data[itmp];
461
462 itmp = M_get_stack_ptr(mii);
463 fmul_b0 = mul_stack_data[itmp];
464
465 itmp = M_get_stack_ptr(2 * sz);
466 fmul_b9 = mul_stack_data[itmp];
467
468 itmp = M_get_stack_ptr(2 * sz);
469 fmul_t0 = mul_stack_data[itmp];
470
471 M_fmul_split(fmul_a1, fmul_a0, aa, mii);
472 M_fmul_split(fmul_b1, fmul_b0, bb, mii);
473
474 stmp = M_fmul_subtract(fmul_a9, fmul_a1, fmul_a0, mii);
475 stmp *= M_fmul_subtract(fmul_b9, fmul_b0, fmul_b1, mii);
476
477 M_push_mul_int(stmp);
478 M_push_mul_int(mii);
479
480 M_fmul_div_conq(fmul_t0, fmul_a0, fmul_b0, mii);
481
482 mii = M_pop_mul_int();
483 stmp = M_pop_mul_int();
484 itmp = M_pop_mul_int();
485
486 M_push_mul_int(itmp);
487 M_push_mul_int(stmp);
488 M_push_mul_int(mii);
489
490 /* to restore all stack variables ...
491 fmul_a1 = mul_stack_data[itmp];
492 fmul_a0 = mul_stack_data[itmp+1];
493 fmul_a9 = mul_stack_data[itmp+2];
494 fmul_b1 = mul_stack_data[itmp+3];
495 fmul_b0 = mul_stack_data[itmp+4];
496 fmul_b9 = mul_stack_data[itmp+5];
497 fmul_t0 = mul_stack_data[itmp+6];
498 */
499
500 fmul_a1 = mul_stack_data[itmp];
501 fmul_b1 = mul_stack_data[itmp+3];
502 fmul_t0 = mul_stack_data[itmp+6];
503
504 memcpy((rr + sz), fmul_t0, sz); /* first 'add', result is now zero */
505 /* so we just copy in the bytes */
506 M_fmul_add(rr, fmul_t0, mii, sz);
507
508 M_fmul_div_conq(fmul_t0, fmul_a1, fmul_b1, mii);
509
510 mii = M_pop_mul_int();
511 stmp = M_pop_mul_int();
512 itmp = M_pop_mul_int();
513
514 M_push_mul_int(itmp);
515 M_push_mul_int(stmp);
516 M_push_mul_int(mii);
517
518 fmul_a9 = mul_stack_data[itmp+2];
519 fmul_b9 = mul_stack_data[itmp+5];
520 fmul_t0 = mul_stack_data[itmp+6];
521
522 M_fmul_add(rr, fmul_t0, 0, sz);
523 M_fmul_add(rr, fmul_t0, mii, sz);
524
525 if (stmp != 0)
526 M_fmul_div_conq(fmul_t0, fmul_a9, fmul_b9, mii);
527
528 mii = M_pop_mul_int();
529 stmp = M_pop_mul_int();
530 itmp = M_pop_mul_int();
531
532 fmul_t0 = mul_stack_data[itmp+6];
533
534 /*
535 * if the sign of (A1 - A0)(B0 - B1) is positive, ADD to
536 * the result. if it is negative, SUBTRACT from the result.
537 */
538
539 if (stmp < 0)
540 {
541 fmul_a9 = mul_stack_data[itmp+2];
542 fmul_b9 = mul_stack_data[itmp+5];
543
544 memset(fmul_b9, 0, (2 * sz));
545 memcpy((fmul_b9 + mii), fmul_t0, sz);
546 M_fmul_subtract(fmul_a9, rr, fmul_b9, (2 * sz));
547 memcpy(rr, fmul_a9, (2 * sz));
548 }
549
550 if (stmp > 0)
551 M_fmul_add(rr, fmul_t0, mii, sz);
552
553 M_mul_stack_ptr -= 7;
554 }
555 /****************************************************************************/
556 /*
557 * special addition function for use with the fast multiply operation
558 */
M_fmul_add(UCHAR * r,UCHAR * a,int offset,int sz)559 void M_fmul_add(UCHAR *r, UCHAR *a, int offset, int sz)
560 {
561 int i, j;
562 UCHAR carry;
563
564 carry = 0;
565 j = offset + sz;
566 i = sz;
567
568 while (TRUE)
569 {
570 r[--j] += carry + a[--i];
571
572 if (r[j] >= 100)
573 {
574 r[j] -= 100;
575 carry = 1;
576 }
577 else
578 carry = 0;
579
580 if (i == 0)
581 break;
582 }
583
584 if (carry)
585 {
586 while (TRUE)
587 {
588 r[--j] += 1;
589
590 if (r[j] < 100)
591 break;
592
593 r[j] -= 100;
594 }
595 }
596 }
597 /****************************************************************************/
598 /*
599 * special subtraction function for use with the fast multiply operation
600 */
M_fmul_subtract(UCHAR * r,UCHAR * a,UCHAR * b,int sz)601 int M_fmul_subtract(UCHAR *r, UCHAR *a, UCHAR *b, int sz)
602 {
603 int k, jtmp, sflag, nb, borrow;
604
605 nb = sz;
606 sflag = 0; /* sign flag: assume the numbers are equal */
607
608 /*
609 * find if a > b (so we perform a-b)
610 * or a < b (so we perform b-a)
611 */
612
613 for (k=0; k < nb; k++)
614 {
615 if (a[k] < b[k])
616 {
617 sflag = -1;
618 break;
619 }
620
621 if (a[k] > b[k])
622 {
623 sflag = 1;
624 break;
625 }
626 }
627
628 if (sflag == 0)
629 {
630 memset(r, 0, nb); /* zero out the result */
631 }
632 else
633 {
634 k = nb;
635 borrow = 0;
636
637 while (TRUE)
638 {
639 k--;
640
641 if (sflag == 1)
642 jtmp = (int)a[k] - ((int)b[k] + borrow);
643 else
644 jtmp = (int)b[k] - ((int)a[k] + borrow);
645
646 if (jtmp >= 0)
647 {
648 r[k] = (UCHAR)jtmp;
649 borrow = 0;
650 }
651 else
652 {
653 r[k] = (UCHAR)(100 + jtmp);
654 borrow = 1;
655 }
656
657 if (k == 0)
658 break;
659 }
660 }
661
662 return(sflag);
663 }
664 /****************************************************************************/
M_next_power_of_2(int n)665 int M_next_power_of_2(int n)
666 {
667 int ct, k;
668
669 if (n <= 2)
670 return(n);
671
672 k = 2;
673 ct = 0;
674
675 while (TRUE)
676 {
677 if (k >= n)
678 break;
679
680 k = k << 1;
681
682 if (++ct == bit_limit)
683 {
684 /* fatal, this does not return */
685
686 M_apm_log_error_msg(M_APM_FATAL,
687 "\'M_next_power_of_2\', ERROR :sizeof(int) too small ??");
688 }
689 }
690
691 return(k);
692 }
693 /****************************************************************************/
M_get_stack_ptr(int sz)694 int M_get_stack_ptr(int sz)
695 {
696 int i, k;
697 UCHAR *cp;
698
699 k = ++M_mul_stack_ptr;
700
701 /* if size is 0, just need to malloc and return */
702 if (mul_stack_data_size[k] == 0)
703 {
704 if ((i = sz) < 16)
705 i = 16;
706
707 if ((cp = (UCHAR *)MAPM_MALLOC(i + 4)) == NULL)
708 {
709 /* fatal, this does not return */
710
711 M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg);
712 }
713
714 mul_stack_data[k] = cp;
715 mul_stack_data_size[k] = i;
716 }
717 else /* it has been malloc'ed, see if it's big enough */
718 {
719 if (sz > mul_stack_data_size[k])
720 {
721 cp = mul_stack_data[k];
722
723 if ((cp = (UCHAR *)MAPM_REALLOC(cp, (sz + 4))) == NULL)
724 {
725 /* fatal, this does not return */
726
727 M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg);
728 }
729
730 mul_stack_data[k] = cp;
731 mul_stack_data_size[k] = sz;
732 }
733 }
734
735 return(k);
736 }
737 /****************************************************************************/
738
739 #ifdef NO_FFT_MULTIPLY
740
741 /*
742 * multiply a 4 byte number by a 4 byte number
743 * yielding an 8 byte result. each byte contains
744 * a base 100 'digit', i.e.: range from 0-99.
745 *
746 * MSB LSB
747 *
748 * a,b [0] [1] [2] [3]
749 * result [0] ..... [7]
750 */
751
M_4_byte_multiply(UCHAR * r,UCHAR * a,UCHAR * b)752 void M_4_byte_multiply(UCHAR *r, UCHAR *a, UCHAR *b)
753 {
754 int jj;
755 unsigned int *ip, t1, rr[8];
756
757 memset(rr, 0, (8 * sizeof(int))); /* zero out result */
758 jj = 3;
759 ip = rr + 5;
760
761 /*
762 * loop for one number [b], un-roll the inner 'loop' [a]
763 *
764 * accumulate partial sums in UINT array, release carries
765 * and convert back to base 100 at the end
766 */
767
768 while (1)
769 {
770 t1 = (unsigned int)b[jj];
771 ip += 2;
772
773 *ip-- += t1 * a[3];
774 *ip-- += t1 * a[2];
775 *ip-- += t1 * a[1];
776 *ip += t1 * a[0];
777
778 if (jj-- == 0)
779 break;
780 }
781
782 jj = 7;
783
784 while (1)
785 {
786 t1 = rr[jj] / 100;
787 r[jj] = (UCHAR)(rr[jj] - 100 * t1);
788
789 if (jj == 0)
790 break;
791
792 rr[--jj] += t1;
793 }
794 }
795
796 #endif
797
798 /****************************************************************************/
799