1 2 /* 3 * M_APM - mapmfmul.c 4 * 5 * Copyright (C) 1999 - 2007 Michael C. Ring 6 * 7 * Permission to use, copy, and distribute this software and its 8 * documentation for any purpose with or without fee is hereby granted, 9 * provided that the above copyright notice appear in all copies and 10 * that both that copyright notice and this permission notice appear 11 * in supporting documentation. 12 * 13 * Permission to modify the software is granted. Permission to distribute 14 * the modified code is granted. Modifications are to be distributed by 15 * using the file 'license.txt' as a template to modify the file header. 16 * 'license.txt' is available in the official MAPM distribution. 17 * 18 * This software is provided "as is" without express or implied warranty. 19 */ 20 21 /* 22 * $Id: mapmfmul.c,v 1.33 2007/12/03 01:52:22 mike Exp $ 23 * 24 * This file contains the divide-and-conquer FAST MULTIPLICATION 25 * function as well as its support functions. 26 * 27 * $Log: mapmfmul.c,v $ 28 * Revision 1.33 2007/12/03 01:52:22 mike 29 * Update license 30 * 31 * Revision 1.32 2004/02/18 03:16:15 mike 32 * optimize 4 byte multiply (when FFT is disabled) 33 * 34 * Revision 1.31 2003/12/04 01:14:06 mike 35 * redo math on 'borrow' 36 * 37 * Revision 1.30 2003/07/21 20:34:18 mike 38 * Modify error messages to be in a consistent format. 39 * 40 * Revision 1.29 2003/03/31 21:55:07 mike 41 * call generic error handling function 42 * 43 * Revision 1.28 2002/11/03 22:38:15 mike 44 * Updated function parameters to use the modern style 45 * 46 * Revision 1.27 2002/02/14 19:53:32 mike 47 * add conditional compiler option to disable use 48 * of FFT multiply if the user so chooses. 49 * 50 * Revision 1.26 2001/07/26 20:56:38 mike 51 * fix comment, no code change 52 * 53 * Revision 1.25 2001/07/16 19:43:45 mike 54 * add function M_free_all_fmul 55 * 56 * Revision 1.24 2001/02/11 22:34:47 mike 57 * modify parameters to REALLOC 58 * 59 * Revision 1.23 2000/10/20 19:23:26 mike 60 * adjust power_of_2 function so it should work with 61 * 64 bit processors and beyond. 62 * 63 * Revision 1.22 2000/08/23 22:27:34 mike 64 * no real code change, re-named 2 local functions 65 * so they make more sense. 66 * 67 * Revision 1.21 2000/08/01 22:24:38 mike 68 * use sizeof(int) function call to stop some 69 * compilers from complaining. 70 * 71 * Revision 1.20 2000/07/19 17:12:00 mike 72 * lower the number of bytes that the FFT can handle. worst case 73 * testing indicated math overflow when >= 1048576 74 * 75 * Revision 1.19 2000/07/08 18:29:03 mike 76 * increase define so FFT can handle bigger numbers 77 * 78 * Revision 1.18 2000/07/06 23:20:12 mike 79 * changed my mind. use static local MAPM numbers 80 * for temp data storage 81 * 82 * Revision 1.17 2000/07/06 20:52:34 mike 83 * use init function to get local writable copies 84 * instead of using the stack 85 * 86 * Revision 1.16 2000/07/04 17:25:09 mike 87 * guarantee 16 bit compilers still work OK 88 * 89 * Revision 1.15 2000/07/04 15:40:02 mike 90 * add call to use FFT algorithm 91 * 92 * Revision 1.14 2000/05/05 21:10:46 mike 93 * add comment indicating availability of assembly language 94 * version of M_4_byte_multiply for Linux on x86 platforms. 95 * 96 * Revision 1.13 2000/04/20 19:30:45 mike 97 * minor optimization to 4 byte multiply 98 * 99 * Revision 1.12 2000/04/14 15:39:30 mike 100 * optimize the fast multiply function. don't re-curse down 101 * to a size of 1. recurse down to a size of '4' and then 102 * call a special 4 byte multiply function. 103 * 104 * Revision 1.11 2000/02/03 23:02:13 mike 105 * put in RCS for real... 106 * 107 * Revision 1.10 2000/02/03 22:59:08 mike 108 * remove the extra recursive function. not needed any 109 * longer since all current compilers should not have 110 * any problem with true recursive calls. 111 * 112 * Revision 1.9 2000/02/03 22:47:39 mike 113 * use MAPM_* generic memory function 114 * 115 * Revision 1.8 1999/09/19 21:13:44 mike 116 * eliminate unneeded local int in _split 117 * 118 * Revision 1.7 1999/08/12 22:36:23 mike 119 * move the 3 'simple' function to the top of file 120 * so GCC can in-line the code. 121 * 122 * Revision 1.6 1999/08/12 22:01:14 mike 123 * more minor optimizations 124 * 125 * Revision 1.5 1999/08/12 02:02:06 mike 126 * minor optimization 127 * 128 * Revision 1.4 1999/08/10 22:51:59 mike 129 * minor tweak 130 * 131 * Revision 1.3 1999/08/10 00:45:47 mike 132 * added more comments and a few minor tweaks 133 * 134 * Revision 1.2 1999/08/09 02:50:02 mike 135 * add some comments 136 * 137 * Revision 1.1 1999/08/08 18:27:57 mike 138 * Initial revision 139 */ 140 141 #include "m_apm_lc.h" 142 143 static int M_firsttimef = TRUE; 144 145 /* 146 * specify the max size the FFT routine can handle 147 * (in MAPM, #digits = 2 * #bytes) 148 * 149 * this number *must* be an exact power of 2. 150 * 151 * **WORST** case input numbers (all 9's) has shown that 152 * the FFT math will overflow if the #define here is 153 * >= 1048576. On my system, 524,288 worked OK. I will 154 * factor down another factor of 2 to safeguard against 155 * other computers have less precise floating point math. 156 * If you are confident in your system, 524288 will 157 * theoretically work fine. 158 * 159 * the define here allows the FFT algorithm to multiply two 160 * 524,288 digit numbers yielding a 1,048,576 digit result. 161 */ 162 163 #define MAX_FFT_BYTES 262144 164 165 /* 166 * the Divide-and-Conquer multiplication kicks in when the size of 167 * the numbers exceed the capability of the FFT (#define just above). 168 * 169 * #bytes D&C call depth 170 * ------ -------------- 171 * 512K 1 172 * 1M 2 173 * 2M 3 174 * 4M 4 175 * ... ... 176 * 2.1990E+12 23 177 * 178 * the following stack sizes are sized to meet the 179 * above 2.199E+12 example, though I wouldn't want to 180 * wait for it to finish... 181 * 182 * Each call requires 7 stack variables to be saved so 183 * we need a stack depth of 23 * 7 + PAD. (we use 164) 184 * 185 * For 'exp_stack', 3 integers also are required to be saved 186 * for each recursive call so we need a stack depth of 187 * 23 * 3 + PAD. (we use 72) 188 * 189 * 190 * If the FFT multiply is disabled, resize the arrays 191 * as follows: 192 * 193 * the following stack sizes are sized to meet the 194 * worst case expected assuming we are multiplying 195 * numbers with 2.14E+9 (2 ^ 31) digits. 196 * 197 * For sizeof(int) == 4 (32 bits) there may be up to 32 recursive 198 * calls. Each call requires 7 stack variables so we need a 199 * stack depth of 32 * 7 + PAD. (we use 240) 200 * 201 * For 'exp_stack', 3 integers also are required to be saved 202 * for each recursive call so we need a stack depth of 203 * 32 * 3 + PAD. (we use 100) 204 */ 205 206 #ifdef NO_FFT_MULTIPLY 207 #define M_STACK_SIZE 240 208 #define M_ISTACK_SIZE 100 209 #else 210 #define M_STACK_SIZE 164 211 #define M_ISTACK_SIZE 72 212 #endif 213 214 static int exp_stack[M_ISTACK_SIZE]; 215 static int exp_stack_ptr; 216 217 static UCHAR *mul_stack_data[M_STACK_SIZE]; 218 static int mul_stack_data_size[M_STACK_SIZE]; 219 static int M_mul_stack_ptr; 220 221 static UCHAR *fmul_a1, *fmul_a0, *fmul_a9, *fmul_b1, *fmul_b0, 222 *fmul_b9, *fmul_t0; 223 224 static int size_flag, bit_limit, stmp, itmp, mii; 225 226 static M_APM M_ain; 227 static M_APM M_bin; 228 229 static char *M_stack_ptr_error_msg = "\'M_get_stack_ptr\', Out of memory"; 230 231 extern void M_fast_multiply(M_APM, M_APM, M_APM); 232 extern void M_fmul_div_conq(UCHAR *, UCHAR *, UCHAR *, int); 233 extern void M_fmul_add(UCHAR *, UCHAR *, int, int); 234 extern int M_fmul_subtract(UCHAR *, UCHAR *, UCHAR *, int); 235 extern void M_fmul_split(UCHAR *, UCHAR *, UCHAR *, int); 236 extern int M_next_power_of_2(int); 237 extern int M_get_stack_ptr(int); 238 extern void M_push_mul_int(int); 239 extern int M_pop_mul_int(void); 240 241 #ifdef NO_FFT_MULTIPLY 242 extern void M_4_byte_multiply(UCHAR *, UCHAR *, UCHAR *); 243 #else 244 extern void M_fast_mul_fft(UCHAR *, UCHAR *, UCHAR *, int); 245 #endif 246 247 /* 248 * the following algorithm is used in this fast multiply routine 249 * (sometimes called the divide-and-conquer technique.) 250 * 251 * assume we have 2 numbers (a & b) with 2N digits. 252 * 253 * let : a = (2^N) * A1 + A0 , b = (2^N) * B1 + B0 254 * 255 * where 'A1' is the 'most significant half' of 'a' and 256 * 'A0' is the 'least significant half' of 'a'. Same for 257 * B1 and B0. 258 * 259 * Now use the identity : 260 * 261 * 2N N N N 262 * ab = (2 + 2 ) A1B1 + 2 (A1-A0)(B0-B1) + (2 + 1)A0B0 263 * 264 * 265 * The original problem of multiplying 2 (2N) digit numbers has 266 * been reduced to 3 multiplications of N digit numbers plus some 267 * additions, subtractions, and shifts. 268 * 269 * The fast multiplication algorithm used here uses the above 270 * identity in a recursive process. This algorithm results in 271 * O(n ^ 1.585) growth. 272 */ 273 274 275 /****************************************************************************/ 276 void M_free_all_fmul() 277 { 278 int k; 279 280 if (M_firsttimef == FALSE) 281 { 282 m_apm_free(M_ain); 283 m_apm_free(M_bin); 284 285 for (k=0; k < M_STACK_SIZE; k++) 286 { 287 if (mul_stack_data_size[k] != 0) 288 { 289 MAPM_FREE(mul_stack_data[k]); 290 } 291 } 292 293 M_firsttimef = TRUE; 294 } 295 } 296 /****************************************************************************/ 297 void M_push_mul_int(int val) 298 { 299 exp_stack[++exp_stack_ptr] = val; 300 } 301 /****************************************************************************/ 302 int M_pop_mul_int() 303 { 304 return(exp_stack[exp_stack_ptr--]); 305 } 306 /****************************************************************************/ 307 void M_fmul_split(UCHAR *x1, UCHAR *x0, UCHAR *xin, int nbytes) 308 { 309 memcpy(x1, xin, nbytes); 310 memcpy(x0, (xin + nbytes), nbytes); 311 } 312 /****************************************************************************/ 313 void M_fast_multiply(M_APM rr, M_APM aa, M_APM bb) 314 { 315 void *vp; 316 int ii, k, nexp, sign; 317 318 if (M_firsttimef) 319 { 320 M_firsttimef = FALSE; 321 322 for (k=0; k < M_STACK_SIZE; k++) 323 mul_stack_data_size[k] = 0; 324 325 size_flag = M_get_sizeof_int(); 326 bit_limit = 8 * size_flag + 1; 327 328 M_ain = m_apm_init(); 329 M_bin = m_apm_init(); 330 } 331 332 exp_stack_ptr = -1; 333 M_mul_stack_ptr = -1; 334 335 m_apm_copy(M_ain, aa); 336 m_apm_copy(M_bin, bb); 337 338 sign = M_ain->m_apm_sign * M_bin->m_apm_sign; 339 nexp = M_ain->m_apm_exponent + M_bin->m_apm_exponent; 340 341 if (M_ain->m_apm_datalength >= M_bin->m_apm_datalength) 342 ii = M_ain->m_apm_datalength; 343 else 344 ii = M_bin->m_apm_datalength; 345 346 ii = (ii + 1) >> 1; 347 ii = M_next_power_of_2(ii); 348 349 /* Note: 'ii' must be >= 4 here. this is guaranteed 350 by the caller: m_apm_multiply 351 */ 352 353 k = 2 * ii; /* required size of result, in bytes */ 354 355 M_apm_pad(M_ain, k); /* fill out the data so the number of */ 356 M_apm_pad(M_bin, k); /* bytes is an exact power of 2 */ 357 358 if (k > rr->m_apm_malloclength) 359 { 360 if ((vp = MAPM_REALLOC(rr->m_apm_data, (k + 32))) == NULL) 361 { 362 /* fatal, this does not return */ 363 364 M_apm_log_error_msg(M_APM_FATAL, "\'M_fast_multiply\', Out of memory"); 365 } 366 367 rr->m_apm_malloclength = k + 28; 368 rr->m_apm_data = (UCHAR *)vp; 369 } 370 371 #ifdef NO_FFT_MULTIPLY 372 373 M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data, 374 M_bin->m_apm_data, ii); 375 #else 376 377 /* 378 * if the numbers are *really* big, use the divide-and-conquer 379 * routine first until the numbers are small enough to be handled 380 * by the FFT algorithm. If the numbers are already small enough, 381 * call the FFT multiplication now. 382 * 383 * Note that 'ii' here is (and must be) an exact power of 2. 384 */ 385 386 if (size_flag == 2) /* if still using 16 bit compilers .... */ 387 { 388 M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data, 389 M_bin->m_apm_data, ii); 390 } 391 else /* >= 32 bit compilers */ 392 { 393 if (ii > (MAX_FFT_BYTES + 2)) 394 { 395 M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data, 396 M_bin->m_apm_data, ii); 397 } 398 else 399 { 400 M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data, 401 M_bin->m_apm_data, ii); 402 } 403 } 404 405 #endif 406 407 rr->m_apm_sign = sign; 408 rr->m_apm_exponent = nexp; 409 rr->m_apm_datalength = 4 * ii; 410 411 M_apm_normalize(rr); 412 } 413 /****************************************************************************/ 414 /* 415 * This is the recursive function to perform the multiply. The 416 * design intent here is to have no local variables. Any local 417 * data that needs to be saved is saved on one of the two stacks. 418 */ 419 void M_fmul_div_conq(UCHAR *rr, UCHAR *aa, UCHAR *bb, int sz) 420 { 421 422 #ifdef NO_FFT_MULTIPLY 423 424 if (sz == 4) /* multiply 4x4 yielding an 8 byte result */ 425 { 426 M_4_byte_multiply(rr, aa, bb); 427 return; 428 } 429 430 #else 431 432 /* 433 * if the numbers are now small enough, let the FFT algorithm 434 * finish up. 435 */ 436 437 if (sz == MAX_FFT_BYTES) 438 { 439 M_fast_mul_fft(rr, aa, bb, sz); 440 return; 441 } 442 443 #endif 444 445 memset(rr, 0, (2 * sz)); /* zero out the result */ 446 mii = sz >> 1; 447 448 itmp = M_get_stack_ptr(mii); 449 M_push_mul_int(itmp); 450 451 fmul_a1 = mul_stack_data[itmp]; 452 453 itmp = M_get_stack_ptr(mii); 454 fmul_a0 = mul_stack_data[itmp]; 455 456 itmp = M_get_stack_ptr(2 * sz); 457 fmul_a9 = mul_stack_data[itmp]; 458 459 itmp = M_get_stack_ptr(mii); 460 fmul_b1 = mul_stack_data[itmp]; 461 462 itmp = M_get_stack_ptr(mii); 463 fmul_b0 = mul_stack_data[itmp]; 464 465 itmp = M_get_stack_ptr(2 * sz); 466 fmul_b9 = mul_stack_data[itmp]; 467 468 itmp = M_get_stack_ptr(2 * sz); 469 fmul_t0 = mul_stack_data[itmp]; 470 471 M_fmul_split(fmul_a1, fmul_a0, aa, mii); 472 M_fmul_split(fmul_b1, fmul_b0, bb, mii); 473 474 stmp = M_fmul_subtract(fmul_a9, fmul_a1, fmul_a0, mii); 475 stmp *= M_fmul_subtract(fmul_b9, fmul_b0, fmul_b1, mii); 476 477 M_push_mul_int(stmp); 478 M_push_mul_int(mii); 479 480 M_fmul_div_conq(fmul_t0, fmul_a0, fmul_b0, mii); 481 482 mii = M_pop_mul_int(); 483 stmp = M_pop_mul_int(); 484 itmp = M_pop_mul_int(); 485 486 M_push_mul_int(itmp); 487 M_push_mul_int(stmp); 488 M_push_mul_int(mii); 489 490 /* to restore all stack variables ... 491 fmul_a1 = mul_stack_data[itmp]; 492 fmul_a0 = mul_stack_data[itmp+1]; 493 fmul_a9 = mul_stack_data[itmp+2]; 494 fmul_b1 = mul_stack_data[itmp+3]; 495 fmul_b0 = mul_stack_data[itmp+4]; 496 fmul_b9 = mul_stack_data[itmp+5]; 497 fmul_t0 = mul_stack_data[itmp+6]; 498 */ 499 500 fmul_a1 = mul_stack_data[itmp]; 501 fmul_b1 = mul_stack_data[itmp+3]; 502 fmul_t0 = mul_stack_data[itmp+6]; 503 504 memcpy((rr + sz), fmul_t0, sz); /* first 'add', result is now zero */ 505 /* so we just copy in the bytes */ 506 M_fmul_add(rr, fmul_t0, mii, sz); 507 508 M_fmul_div_conq(fmul_t0, fmul_a1, fmul_b1, mii); 509 510 mii = M_pop_mul_int(); 511 stmp = M_pop_mul_int(); 512 itmp = M_pop_mul_int(); 513 514 M_push_mul_int(itmp); 515 M_push_mul_int(stmp); 516 M_push_mul_int(mii); 517 518 fmul_a9 = mul_stack_data[itmp+2]; 519 fmul_b9 = mul_stack_data[itmp+5]; 520 fmul_t0 = mul_stack_data[itmp+6]; 521 522 M_fmul_add(rr, fmul_t0, 0, sz); 523 M_fmul_add(rr, fmul_t0, mii, sz); 524 525 if (stmp != 0) 526 M_fmul_div_conq(fmul_t0, fmul_a9, fmul_b9, mii); 527 528 mii = M_pop_mul_int(); 529 stmp = M_pop_mul_int(); 530 itmp = M_pop_mul_int(); 531 532 fmul_t0 = mul_stack_data[itmp+6]; 533 534 /* 535 * if the sign of (A1 - A0)(B0 - B1) is positive, ADD to 536 * the result. if it is negative, SUBTRACT from the result. 537 */ 538 539 if (stmp < 0) 540 { 541 fmul_a9 = mul_stack_data[itmp+2]; 542 fmul_b9 = mul_stack_data[itmp+5]; 543 544 memset(fmul_b9, 0, (2 * sz)); 545 memcpy((fmul_b9 + mii), fmul_t0, sz); 546 M_fmul_subtract(fmul_a9, rr, fmul_b9, (2 * sz)); 547 memcpy(rr, fmul_a9, (2 * sz)); 548 } 549 550 if (stmp > 0) 551 M_fmul_add(rr, fmul_t0, mii, sz); 552 553 M_mul_stack_ptr -= 7; 554 } 555 /****************************************************************************/ 556 /* 557 * special addition function for use with the fast multiply operation 558 */ 559 void M_fmul_add(UCHAR *r, UCHAR *a, int offset, int sz) 560 { 561 int i, j; 562 UCHAR carry; 563 564 carry = 0; 565 j = offset + sz; 566 i = sz; 567 568 while (TRUE) 569 { 570 r[--j] += carry + a[--i]; 571 572 if (r[j] >= 100) 573 { 574 r[j] -= 100; 575 carry = 1; 576 } 577 else 578 carry = 0; 579 580 if (i == 0) 581 break; 582 } 583 584 if (carry) 585 { 586 while (TRUE) 587 { 588 r[--j] += 1; 589 590 if (r[j] < 100) 591 break; 592 593 r[j] -= 100; 594 } 595 } 596 } 597 /****************************************************************************/ 598 /* 599 * special subtraction function for use with the fast multiply operation 600 */ 601 int M_fmul_subtract(UCHAR *r, UCHAR *a, UCHAR *b, int sz) 602 { 603 int k, jtmp, sflag, nb, borrow; 604 605 nb = sz; 606 sflag = 0; /* sign flag: assume the numbers are equal */ 607 608 /* 609 * find if a > b (so we perform a-b) 610 * or a < b (so we perform b-a) 611 */ 612 613 for (k=0; k < nb; k++) 614 { 615 if (a[k] < b[k]) 616 { 617 sflag = -1; 618 break; 619 } 620 621 if (a[k] > b[k]) 622 { 623 sflag = 1; 624 break; 625 } 626 } 627 628 if (sflag == 0) 629 { 630 memset(r, 0, nb); /* zero out the result */ 631 } 632 else 633 { 634 k = nb; 635 borrow = 0; 636 637 while (TRUE) 638 { 639 k--; 640 641 if (sflag == 1) 642 jtmp = (int)a[k] - ((int)b[k] + borrow); 643 else 644 jtmp = (int)b[k] - ((int)a[k] + borrow); 645 646 if (jtmp >= 0) 647 { 648 r[k] = (UCHAR)jtmp; 649 borrow = 0; 650 } 651 else 652 { 653 r[k] = (UCHAR)(100 + jtmp); 654 borrow = 1; 655 } 656 657 if (k == 0) 658 break; 659 } 660 } 661 662 return(sflag); 663 } 664 /****************************************************************************/ 665 int M_next_power_of_2(int n) 666 { 667 int ct, k; 668 669 if (n <= 2) 670 return(n); 671 672 k = 2; 673 ct = 0; 674 675 while (TRUE) 676 { 677 if (k >= n) 678 break; 679 680 k = k << 1; 681 682 if (++ct == bit_limit) 683 { 684 /* fatal, this does not return */ 685 686 M_apm_log_error_msg(M_APM_FATAL, 687 "\'M_next_power_of_2\', ERROR :sizeof(int) too small ??"); 688 } 689 } 690 691 return(k); 692 } 693 /****************************************************************************/ 694 int M_get_stack_ptr(int sz) 695 { 696 int i, k; 697 UCHAR *cp; 698 699 k = ++M_mul_stack_ptr; 700 701 /* if size is 0, just need to malloc and return */ 702 if (mul_stack_data_size[k] == 0) 703 { 704 if ((i = sz) < 16) 705 i = 16; 706 707 if ((cp = (UCHAR *)MAPM_MALLOC(i + 4)) == NULL) 708 { 709 /* fatal, this does not return */ 710 711 M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg); 712 } 713 714 mul_stack_data[k] = cp; 715 mul_stack_data_size[k] = i; 716 } 717 else /* it has been malloc'ed, see if it's big enough */ 718 { 719 if (sz > mul_stack_data_size[k]) 720 { 721 cp = mul_stack_data[k]; 722 723 if ((cp = (UCHAR *)MAPM_REALLOC(cp, (sz + 4))) == NULL) 724 { 725 /* fatal, this does not return */ 726 727 M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg); 728 } 729 730 mul_stack_data[k] = cp; 731 mul_stack_data_size[k] = sz; 732 } 733 } 734 735 return(k); 736 } 737 /****************************************************************************/ 738 739 #ifdef NO_FFT_MULTIPLY 740 741 /* 742 * multiply a 4 byte number by a 4 byte number 743 * yielding an 8 byte result. each byte contains 744 * a base 100 'digit', i.e.: range from 0-99. 745 * 746 * MSB LSB 747 * 748 * a,b [0] [1] [2] [3] 749 * result [0] ..... [7] 750 */ 751 752 void M_4_byte_multiply(UCHAR *r, UCHAR *a, UCHAR *b) 753 { 754 int jj; 755 unsigned int *ip, t1, rr[8]; 756 757 memset(rr, 0, (8 * sizeof(int))); /* zero out result */ 758 jj = 3; 759 ip = rr + 5; 760 761 /* 762 * loop for one number [b], un-roll the inner 'loop' [a] 763 * 764 * accumulate partial sums in UINT array, release carries 765 * and convert back to base 100 at the end 766 */ 767 768 while (1) 769 { 770 t1 = (unsigned int)b[jj]; 771 ip += 2; 772 773 *ip-- += t1 * a[3]; 774 *ip-- += t1 * a[2]; 775 *ip-- += t1 * a[1]; 776 *ip += t1 * a[0]; 777 778 if (jj-- == 0) 779 break; 780 } 781 782 jj = 7; 783 784 while (1) 785 { 786 t1 = rr[jj] / 100; 787 r[jj] = (UCHAR)(rr[jj] - 100 * t1); 788 789 if (jj == 0) 790 break; 791 792 rr[--jj] += t1; 793 } 794 } 795 796 #endif 797 798 /****************************************************************************/ 799