ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

mpi.c
Go to the documentation of this file.
00001 /*
00002  * dlls/rsaenh/mpi.c
00003  * Multi Precision Integer functions
00004  *
00005  * Copyright 2004 Michael Jung
00006  * Based on public domain code by Tom St Denis (tomstdenis@iahu.ca)
00007  *
00008  * This library is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * This library is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with this library; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
00021  */
00022 
00023 /*
00024  * This file contains code from the LibTomCrypt cryptographic 
00025  * library written by Tom St Denis (tomstdenis@iahu.ca). LibTomCrypt
00026  * is in the public domain. The code in this file is tailored to
00027  * special requirements. Take a look at http://libtomcrypt.org for the
00028  * original version. 
00029  */
00030 
00031 #include <stdarg.h>
00032 
00033 #include "windef.h"
00034 #include "winbase.h"
00035 #include "tomcrypt.h"
00036 
00037 /* Known optimal configurations
00038  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
00039 -------------------------------------------------------------
00040  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
00041 */
00042 static const int KARATSUBA_MUL_CUTOFF = 88,  /* Min. number of digits before Karatsuba multiplication is used. */
00043                  KARATSUBA_SQR_CUTOFF = 128; /* Min. number of digits before Karatsuba squaring is used. */
00044 
00045 
00046 /* trim unused digits */
00047 static void mp_clamp(mp_int *a);
00048 
00049 /* compare |a| to |b| */
00050 static int mp_cmp_mag(const mp_int *a, const mp_int *b);
00051 
00052 /* Counts the number of lsbs which are zero before the first zero bit */
00053 static int mp_cnt_lsb(const mp_int *a);
00054 
00055 /* computes a = B**n mod b without division or multiplication useful for
00056  * normalizing numbers in a Montgomery system.
00057  */
00058 static int mp_montgomery_calc_normalization(mp_int *a, const mp_int *b);
00059 
00060 /* computes x/R == x (mod N) via Montgomery Reduction */
00061 static int mp_montgomery_reduce(mp_int *a, const mp_int *m, mp_digit mp);
00062 
00063 /* setups the montgomery reduction */
00064 static int mp_montgomery_setup(const mp_int *a, mp_digit *mp);
00065 
00066 /* Barrett Reduction, computes a (mod b) with a precomputed value c
00067  *
00068  * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely
00069  * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
00070  */
00071 static int mp_reduce(mp_int *a, const mp_int *b, const mp_int *c);
00072 
00073 /* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
00074 static int mp_reduce_2k(mp_int *a, const mp_int *n, mp_digit d);
00075 
00076 /* determines k value for 2k reduction */
00077 static int mp_reduce_2k_setup(const mp_int *a, mp_digit *d);
00078 
00079 /* used to setup the Barrett reduction for a given modulus b */
00080 static int mp_reduce_setup(mp_int *a, const mp_int *b);
00081 
00082 /* set to a digit */
00083 static void mp_set(mp_int *a, mp_digit b);
00084 
00085 /* b = a*a  */
00086 static int mp_sqr(const mp_int *a, mp_int *b);
00087 
00088 /* c = a * a (mod b) */
00089 static int mp_sqrmod(const mp_int *a, mp_int *b, mp_int *c);
00090 
00091 
00092 static void bn_reverse(unsigned char *s, int len);
00093 static int s_mp_add(mp_int *a, mp_int *b, mp_int *c);
00094 static int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y);
00095 #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
00096 static int s_mp_mul_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs);
00097 static int s_mp_mul_high_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs);
00098 static int s_mp_sqr(const mp_int *a, mp_int *b);
00099 static int s_mp_sub(const mp_int *a, const mp_int *b, mp_int *c);
00100 static int mp_exptmod_fast(const mp_int *G, const mp_int *X, mp_int *P, mp_int *Y, int mode);
00101 static int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c);
00102 static int mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c);
00103 static int mp_karatsuba_sqr(const mp_int *a, mp_int *b);
00104 
00105 /* grow as required */
00106 static int mp_grow (mp_int * a, int size)
00107 {
00108   int     i;
00109   mp_digit *tmp;
00110 
00111   /* if the alloc size is smaller alloc more ram */
00112   if (a->alloc < size) {
00113     /* ensure there are always at least MP_PREC digits extra on top */
00114     size += (MP_PREC * 2) - (size % MP_PREC);
00115 
00116     /* reallocate the array a->dp
00117      *
00118      * We store the return in a temporary variable
00119      * in case the operation failed we don't want
00120      * to overwrite the dp member of a.
00121      */
00122     tmp = HeapReAlloc(GetProcessHeap(), 0, a->dp, sizeof (mp_digit) * size);
00123     if (tmp == NULL) {
00124       /* reallocation failed but "a" is still valid [can be freed] */
00125       return MP_MEM;
00126     }
00127 
00128     /* reallocation succeeded so set a->dp */
00129     a->dp = tmp;
00130 
00131     /* zero excess digits */
00132     i        = a->alloc;
00133     a->alloc = size;
00134     for (; i < a->alloc; i++) {
00135       a->dp[i] = 0;
00136     }
00137   }
00138   return MP_OKAY;
00139 }
00140 
00141 /* b = a/2 */
00142 static int mp_div_2(const mp_int * a, mp_int * b)
00143 {
00144   int     x, res, oldused;
00145 
00146   /* copy */
00147   if (b->alloc < a->used) {
00148     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
00149       return res;
00150     }
00151   }
00152 
00153   oldused = b->used;
00154   b->used = a->used;
00155   {
00156     register mp_digit r, rr, *tmpa, *tmpb;
00157 
00158     /* source alias */
00159     tmpa = a->dp + b->used - 1;
00160 
00161     /* dest alias */
00162     tmpb = b->dp + b->used - 1;
00163 
00164     /* carry */
00165     r = 0;
00166     for (x = b->used - 1; x >= 0; x--) {
00167       /* get the carry for the next iteration */
00168       rr = *tmpa & 1;
00169 
00170       /* shift the current digit, add in carry and store */
00171       *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
00172 
00173       /* forward carry to next iteration */
00174       r = rr;
00175     }
00176 
00177     /* zero excess digits */
00178     tmpb = b->dp + b->used;
00179     for (x = b->used; x < oldused; x++) {
00180       *tmpb++ = 0;
00181     }
00182   }
00183   b->sign = a->sign;
00184   mp_clamp (b);
00185   return MP_OKAY;
00186 }
00187 
00188 /* swap the elements of two integers, for cases where you can't simply swap the
00189  * mp_int pointers around
00190  */
00191 static void
00192 mp_exch (mp_int * a, mp_int * b)
00193 {
00194   mp_int  t;
00195 
00196   t  = *a;
00197   *a = *b;
00198   *b = t;
00199 }
00200 
00201 /* init a new mp_int */
00202 static int mp_init (mp_int * a)
00203 {
00204   int i;
00205 
00206   /* allocate memory required and clear it */
00207   a->dp = HeapAlloc(GetProcessHeap(), 0, sizeof (mp_digit) * MP_PREC);
00208   if (a->dp == NULL) {
00209     return MP_MEM;
00210   }
00211 
00212   /* set the digits to zero */
00213   for (i = 0; i < MP_PREC; i++) {
00214       a->dp[i] = 0;
00215   }
00216 
00217   /* set the used to zero, allocated digits to the default precision
00218    * and sign to positive */
00219   a->used  = 0;
00220   a->alloc = MP_PREC;
00221   a->sign  = MP_ZPOS;
00222 
00223   return MP_OKAY;
00224 }
00225 
00226 /* init an mp_init for a given size */
00227 static int mp_init_size (mp_int * a, int size)
00228 {
00229   int x;
00230 
00231   /* pad size so there are always extra digits */
00232   size += (MP_PREC * 2) - (size % MP_PREC);
00233 
00234   /* alloc mem */
00235   a->dp = HeapAlloc(GetProcessHeap(), 0, sizeof (mp_digit) * size);
00236   if (a->dp == NULL) {
00237     return MP_MEM;
00238   }
00239 
00240   /* set the members */
00241   a->used  = 0;
00242   a->alloc = size;
00243   a->sign  = MP_ZPOS;
00244 
00245   /* zero the digits */
00246   for (x = 0; x < size; x++) {
00247       a->dp[x] = 0;
00248   }
00249 
00250   return MP_OKAY;
00251 }
00252 
00253 /* clear one (frees)  */
00254 static void
00255 mp_clear (mp_int * a)
00256 {
00257   int i;
00258 
00259   /* only do anything if a hasn't been freed previously */
00260   if (a->dp != NULL) {
00261     /* first zero the digits */
00262     for (i = 0; i < a->used; i++) {
00263         a->dp[i] = 0;
00264     }
00265 
00266     /* free ram */
00267     HeapFree(GetProcessHeap(), 0, a->dp);
00268 
00269     /* reset members to make debugging easier */
00270     a->dp    = NULL;
00271     a->alloc = a->used = 0;
00272     a->sign  = MP_ZPOS;
00273   }
00274 }
00275 
00276 /* set to zero */
00277 static void
00278 mp_zero (mp_int * a)
00279 {
00280   a->sign = MP_ZPOS;
00281   a->used = 0;
00282   memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
00283 }
00284 
00285 /* b = |a|
00286  *
00287  * Simple function copies the input and fixes the sign to positive
00288  */
00289 static int
00290 mp_abs (const mp_int * a, mp_int * b)
00291 {
00292   int     res;
00293 
00294   /* copy a to b */
00295   if (a != b) {
00296      if ((res = mp_copy (a, b)) != MP_OKAY) {
00297        return res;
00298      }
00299   }
00300 
00301   /* force the sign of b to positive */
00302   b->sign = MP_ZPOS;
00303 
00304   return MP_OKAY;
00305 }
00306 
00307 /* computes the modular inverse via binary extended euclidean algorithm, 
00308  * that is c = 1/a mod b 
00309  *
00310  * Based on slow invmod except this is optimized for the case where b is 
00311  * odd as per HAC Note 14.64 on pp. 610
00312  */
00313 static int
00314 fast_mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
00315 {
00316   mp_int  x, y, u, v, B, D;
00317   int     res, neg;
00318 
00319   /* 2. [modified] b must be odd   */
00320   if (mp_iseven (b) == 1) {
00321     return MP_VAL;
00322   }
00323 
00324   /* init all our temps */
00325   if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
00326      return res;
00327   }
00328 
00329   /* x == modulus, y == value to invert */
00330   if ((res = mp_copy (b, &x)) != MP_OKAY) {
00331     goto __ERR;
00332   }
00333 
00334   /* we need y = |a| */
00335   if ((res = mp_abs (a, &y)) != MP_OKAY) {
00336     goto __ERR;
00337   }
00338 
00339   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
00340   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
00341     goto __ERR;
00342   }
00343   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
00344     goto __ERR;
00345   }
00346   mp_set (&D, 1);
00347 
00348 top:
00349   /* 4.  while u is even do */
00350   while (mp_iseven (&u) == 1) {
00351     /* 4.1 u = u/2 */
00352     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
00353       goto __ERR;
00354     }
00355     /* 4.2 if B is odd then */
00356     if (mp_isodd (&B) == 1) {
00357       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
00358         goto __ERR;
00359       }
00360     }
00361     /* B = B/2 */
00362     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
00363       goto __ERR;
00364     }
00365   }
00366 
00367   /* 5.  while v is even do */
00368   while (mp_iseven (&v) == 1) {
00369     /* 5.1 v = v/2 */
00370     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
00371       goto __ERR;
00372     }
00373     /* 5.2 if D is odd then */
00374     if (mp_isodd (&D) == 1) {
00375       /* D = (D-x)/2 */
00376       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
00377         goto __ERR;
00378       }
00379     }
00380     /* D = D/2 */
00381     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
00382       goto __ERR;
00383     }
00384   }
00385 
00386   /* 6.  if u >= v then */
00387   if (mp_cmp (&u, &v) != MP_LT) {
00388     /* u = u - v, B = B - D */
00389     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
00390       goto __ERR;
00391     }
00392 
00393     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
00394       goto __ERR;
00395     }
00396   } else {
00397     /* v - v - u, D = D - B */
00398     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
00399       goto __ERR;
00400     }
00401 
00402     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
00403       goto __ERR;
00404     }
00405   }
00406 
00407   /* if not zero goto step 4 */
00408   if (mp_iszero (&u) == 0) {
00409     goto top;
00410   }
00411 
00412   /* now a = C, b = D, gcd == g*v */
00413 
00414   /* if v != 1 then there is no inverse */
00415   if (mp_cmp_d (&v, 1) != MP_EQ) {
00416     res = MP_VAL;
00417     goto __ERR;
00418   }
00419 
00420   /* b is now the inverse */
00421   neg = a->sign;
00422   while (D.sign == MP_NEG) {
00423     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
00424       goto __ERR;
00425     }
00426   }
00427   mp_exch (&D, c);
00428   c->sign = neg;
00429   res = MP_OKAY;
00430 
00431 __ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
00432   return res;
00433 }
00434 
00435 /* computes xR**-1 == x (mod N) via Montgomery Reduction
00436  *
00437  * This is an optimized implementation of montgomery_reduce
00438  * which uses the comba method to quickly calculate the columns of the
00439  * reduction.
00440  *
00441  * Based on Algorithm 14.32 on pp.601 of HAC.
00442 */
00443 static int
00444 fast_mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
00445 {
00446   int     ix, res, olduse;
00447   mp_word W[MP_WARRAY];
00448 
00449   /* get old used count */
00450   olduse = x->used;
00451 
00452   /* grow a as required */
00453   if (x->alloc < n->used + 1) {
00454     if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
00455       return res;
00456     }
00457   }
00458 
00459   /* first we have to get the digits of the input into
00460    * an array of double precision words W[...]
00461    */
00462   {
00463     register mp_word *_W;
00464     register mp_digit *tmpx;
00465 
00466     /* alias for the W[] array */
00467     _W   = W;
00468 
00469     /* alias for the digits of  x*/
00470     tmpx = x->dp;
00471 
00472     /* copy the digits of a into W[0..a->used-1] */
00473     for (ix = 0; ix < x->used; ix++) {
00474       *_W++ = *tmpx++;
00475     }
00476 
00477     /* zero the high words of W[a->used..m->used*2] */
00478     for (; ix < n->used * 2 + 1; ix++) {
00479       *_W++ = 0;
00480     }
00481   }
00482 
00483   /* now we proceed to zero successive digits
00484    * from the least significant upwards
00485    */
00486   for (ix = 0; ix < n->used; ix++) {
00487     /* mu = ai * m' mod b
00488      *
00489      * We avoid a double precision multiplication (which isn't required)
00490      * by casting the value down to a mp_digit.  Note this requires
00491      * that W[ix-1] have  the carry cleared (see after the inner loop)
00492      */
00493     register mp_digit mu;
00494     mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
00495 
00496     /* a = a + mu * m * b**i
00497      *
00498      * This is computed in place and on the fly.  The multiplication
00499      * by b**i is handled by offsetting which columns the results
00500      * are added to.
00501      *
00502      * Note the comba method normally doesn't handle carries in the
00503      * inner loop In this case we fix the carry from the previous
00504      * column since the Montgomery reduction requires digits of the
00505      * result (so far) [see above] to work.  This is
00506      * handled by fixing up one carry after the inner loop.  The
00507      * carry fixups are done in order so after these loops the
00508      * first m->used words of W[] have the carries fixed
00509      */
00510     {
00511       register int iy;
00512       register mp_digit *tmpn;
00513       register mp_word *_W;
00514 
00515       /* alias for the digits of the modulus */
00516       tmpn = n->dp;
00517 
00518       /* Alias for the columns set by an offset of ix */
00519       _W = W + ix;
00520 
00521       /* inner loop */
00522       for (iy = 0; iy < n->used; iy++) {
00523           *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
00524       }
00525     }
00526 
00527     /* now fix carry for next digit, W[ix+1] */
00528     W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
00529   }
00530 
00531   /* now we have to propagate the carries and
00532    * shift the words downward [all those least
00533    * significant digits we zeroed].
00534    */
00535   {
00536     register mp_digit *tmpx;
00537     register mp_word *_W, *_W1;
00538 
00539     /* nox fix rest of carries */
00540 
00541     /* alias for current word */
00542     _W1 = W + ix;
00543 
00544     /* alias for next word, where the carry goes */
00545     _W = W + ++ix;
00546 
00547     for (; ix <= n->used * 2 + 1; ix++) {
00548       *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
00549     }
00550 
00551     /* copy out, A = A/b**n
00552      *
00553      * The result is A/b**n but instead of converting from an
00554      * array of mp_word to mp_digit than calling mp_rshd
00555      * we just copy them in the right order
00556      */
00557 
00558     /* alias for destination word */
00559     tmpx = x->dp;
00560 
00561     /* alias for shifted double precision result */
00562     _W = W + n->used;
00563 
00564     for (ix = 0; ix < n->used + 1; ix++) {
00565       *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
00566     }
00567 
00568     /* zero oldused digits, if the input a was larger than
00569      * m->used+1 we'll have to clear the digits
00570      */
00571     for (; ix < olduse; ix++) {
00572       *tmpx++ = 0;
00573     }
00574   }
00575 
00576   /* set the max used and clamp */
00577   x->used = n->used + 1;
00578   mp_clamp (x);
00579 
00580   /* if A >= m then A = A - m */
00581   if (mp_cmp_mag (x, n) != MP_LT) {
00582     return s_mp_sub (x, n, x);
00583   }
00584   return MP_OKAY;
00585 }
00586 
00587 /* Fast (comba) multiplier
00588  *
00589  * This is the fast column-array [comba] multiplier.  It is 
00590  * designed to compute the columns of the product first 
00591  * then handle the carries afterwards.  This has the effect 
00592  * of making the nested loops that compute the columns very
00593  * simple and schedulable on super-scalar processors.
00594  *
00595  * This has been modified to produce a variable number of 
00596  * digits of output so if say only a half-product is required 
00597  * you don't have to compute the upper half (a feature 
00598  * required for fast Barrett reduction).
00599  *
00600  * Based on Algorithm 14.12 on pp.595 of HAC.
00601  *
00602  */
00603 static int
00604 fast_s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
00605 {
00606   int     olduse, res, pa, ix, iz;
00607   mp_digit W[MP_WARRAY];
00608   register mp_word  _W;
00609 
00610   /* grow the destination as required */
00611   if (c->alloc < digs) {
00612     if ((res = mp_grow (c, digs)) != MP_OKAY) {
00613       return res;
00614     }
00615   }
00616 
00617   /* number of output digits to produce */
00618   pa = MIN(digs, a->used + b->used);
00619 
00620   /* clear the carry */
00621   _W = 0;
00622   for (ix = 0; ix <= pa; ix++) { 
00623       int      tx, ty;
00624       int      iy;
00625       mp_digit *tmpx, *tmpy;
00626 
00627       /* get offsets into the two bignums */
00628       ty = MIN(b->used-1, ix);
00629       tx = ix - ty;
00630 
00631       /* setup temp aliases */
00632       tmpx = a->dp + tx;
00633       tmpy = b->dp + ty;
00634 
00635       /* This is the number of times the loop will iterate, essentially it's
00636          while (tx++ < a->used && ty-- >= 0) { ... }
00637        */
00638       iy = MIN(a->used-tx, ty+1);
00639 
00640       /* execute loop */
00641       for (iz = 0; iz < iy; ++iz) {
00642          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
00643       }
00644 
00645       /* store term */
00646       W[ix] = ((mp_digit)_W) & MP_MASK;
00647 
00648       /* make next carry */
00649       _W = _W >> ((mp_word)DIGIT_BIT);
00650   }
00651 
00652   /* setup dest */
00653   olduse  = c->used;
00654   c->used = digs;
00655 
00656   {
00657     register mp_digit *tmpc;
00658     tmpc = c->dp;
00659     for (ix = 0; ix < digs; ix++) {
00660       /* now extract the previous digit [below the carry] */
00661       *tmpc++ = W[ix];
00662     }
00663 
00664     /* clear unused digits [that existed in the old copy of c] */
00665     for (; ix < olduse; ix++) {
00666       *tmpc++ = 0;
00667     }
00668   }
00669   mp_clamp (c);
00670   return MP_OKAY;
00671 }
00672 
00673 /* this is a modified version of fast_s_mul_digs that only produces
00674  * output digits *above* digs.  See the comments for fast_s_mul_digs
00675  * to see how it works.
00676  *
00677  * This is used in the Barrett reduction since for one of the multiplications
00678  * only the higher digits were needed.  This essentially halves the work.
00679  *
00680  * Based on Algorithm 14.12 on pp.595 of HAC.
00681  */
00682 static int
00683 fast_s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
00684 {
00685   int     olduse, res, pa, ix, iz;
00686   mp_digit W[MP_WARRAY];
00687   mp_word  _W;
00688 
00689   /* grow the destination as required */
00690   pa = a->used + b->used;
00691   if (c->alloc < pa) {
00692     if ((res = mp_grow (c, pa)) != MP_OKAY) {
00693       return res;
00694     }
00695   }
00696 
00697   /* number of output digits to produce */
00698   pa = a->used + b->used;
00699   _W = 0;
00700   for (ix = digs; ix <= pa; ix++) { 
00701       int      tx, ty, iy;
00702       mp_digit *tmpx, *tmpy;
00703 
00704       /* get offsets into the two bignums */
00705       ty = MIN(b->used-1, ix);
00706       tx = ix - ty;
00707 
00708       /* setup temp aliases */
00709       tmpx = a->dp + tx;
00710       tmpy = b->dp + ty;
00711 
00712       /* This is the number of times the loop will iterate, essentially it's
00713          while (tx++ < a->used && ty-- >= 0) { ... }
00714        */
00715       iy = MIN(a->used-tx, ty+1);
00716 
00717       /* execute loop */
00718       for (iz = 0; iz < iy; iz++) {
00719          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
00720       }
00721 
00722       /* store term */
00723       W[ix] = ((mp_digit)_W) & MP_MASK;
00724 
00725       /* make next carry */
00726       _W = _W >> ((mp_word)DIGIT_BIT);
00727   }
00728 
00729   /* setup dest */
00730   olduse  = c->used;
00731   c->used = pa;
00732 
00733   {
00734     register mp_digit *tmpc;
00735 
00736     tmpc = c->dp + digs;
00737     for (ix = digs; ix <= pa; ix++) {
00738       /* now extract the previous digit [below the carry] */
00739       *tmpc++ = W[ix];
00740     }
00741 
00742     /* clear unused digits [that existed in the old copy of c] */
00743     for (; ix < olduse; ix++) {
00744       *tmpc++ = 0;
00745     }
00746   }
00747   mp_clamp (c);
00748   return MP_OKAY;
00749 }
00750 
00751 /* fast squaring
00752  *
00753  * This is the comba method where the columns of the product
00754  * are computed first then the carries are computed.  This
00755  * has the effect of making a very simple inner loop that
00756  * is executed the most
00757  *
00758  * W2 represents the outer products and W the inner.
00759  *
00760  * A further optimizations is made because the inner
00761  * products are of the form "A * B * 2".  The *2 part does
00762  * not need to be computed until the end which is good
00763  * because 64-bit shifts are slow!
00764  *
00765  * Based on Algorithm 14.16 on pp.597 of HAC.
00766  *
00767  */
00768 /* the jist of squaring...
00769 
00770 you do like mult except the offset of the tmpx [one that starts closer to zero]
00771 can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
00772 (ty-tx) so that it never happens.  You double all those you add in the inner loop
00773 
00774 After that loop you do the squares and add them in.
00775 
00776 Remove W2 and don't memset W
00777 
00778 */
00779 
00780 static int fast_s_mp_sqr (const mp_int * a, mp_int * b)
00781 {
00782   int       olduse, res, pa, ix, iz;
00783   mp_digit   W[MP_WARRAY], *tmpx;
00784   mp_word   W1;
00785 
00786   /* grow the destination as required */
00787   pa = a->used + a->used;
00788   if (b->alloc < pa) {
00789     if ((res = mp_grow (b, pa)) != MP_OKAY) {
00790       return res;
00791     }
00792   }
00793 
00794   /* number of output digits to produce */
00795   W1 = 0;
00796   for (ix = 0; ix <= pa; ix++) { 
00797       int      tx, ty, iy;
00798       mp_word  _W;
00799       mp_digit *tmpy;
00800 
00801       /* clear counter */
00802       _W = 0;
00803 
00804       /* get offsets into the two bignums */
00805       ty = MIN(a->used-1, ix);
00806       tx = ix - ty;
00807 
00808       /* setup temp aliases */
00809       tmpx = a->dp + tx;
00810       tmpy = a->dp + ty;
00811 
00812       /* This is the number of times the loop will iterate, essentially it's
00813          while (tx++ < a->used && ty-- >= 0) { ... }
00814        */
00815       iy = MIN(a->used-tx, ty+1);
00816 
00817       /* now for squaring tx can never equal ty 
00818        * we halve the distance since they approach at a rate of 2x
00819        * and we have to round because odd cases need to be executed
00820        */
00821       iy = MIN(iy, (ty-tx+1)>>1);
00822 
00823       /* execute loop */
00824       for (iz = 0; iz < iy; iz++) {
00825          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
00826       }
00827 
00828       /* double the inner product and add carry */
00829       _W = _W + _W + W1;
00830 
00831       /* even columns have the square term in them */
00832       if ((ix&1) == 0) {
00833          _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
00834       }
00835 
00836       /* store it */
00837       W[ix] = _W;
00838 
00839       /* make next carry */
00840       W1 = _W >> ((mp_word)DIGIT_BIT);
00841   }
00842 
00843   /* setup dest */
00844   olduse  = b->used;
00845   b->used = a->used+a->used;
00846 
00847   {
00848     mp_digit *tmpb;
00849     tmpb = b->dp;
00850     for (ix = 0; ix < pa; ix++) {
00851       *tmpb++ = W[ix] & MP_MASK;
00852     }
00853 
00854     /* clear unused digits [that existed in the old copy of c] */
00855     for (; ix < olduse; ix++) {
00856       *tmpb++ = 0;
00857     }
00858   }
00859   mp_clamp (b);
00860   return MP_OKAY;
00861 }
00862 
00863 /* computes a = 2**b 
00864  *
00865  * Simple algorithm which zeroes the int, grows it then just sets one bit
00866  * as required.
00867  */
00868 static int
00869 mp_2expt (mp_int * a, int b)
00870 {
00871   int     res;
00872 
00873   /* zero a as per default */
00874   mp_zero (a);
00875 
00876   /* grow a to accommodate the single bit */
00877   if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
00878     return res;
00879   }
00880 
00881   /* set the used count of where the bit will go */
00882   a->used = b / DIGIT_BIT + 1;
00883 
00884   /* put the single bit in its place */
00885   a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
00886 
00887   return MP_OKAY;
00888 }
00889 
00890 /* high level addition (handles signs) */
00891 int mp_add (mp_int * a, mp_int * b, mp_int * c)
00892 {
00893   int     sa, sb, res;
00894 
00895   /* get sign of both inputs */
00896   sa = a->sign;
00897   sb = b->sign;
00898 
00899   /* handle two cases, not four */
00900   if (sa == sb) {
00901     /* both positive or both negative */
00902     /* add their magnitudes, copy the sign */
00903     c->sign = sa;
00904     res = s_mp_add (a, b, c);
00905   } else {
00906     /* one positive, the other negative */
00907     /* subtract the one with the greater magnitude from */
00908     /* the one of the lesser magnitude.  The result gets */
00909     /* the sign of the one with the greater magnitude. */
00910     if (mp_cmp_mag (a, b) == MP_LT) {
00911       c->sign = sb;
00912       res = s_mp_sub (b, a, c);
00913     } else {
00914       c->sign = sa;
00915       res = s_mp_sub (a, b, c);
00916     }
00917   }
00918   return res;
00919 }
00920 
00921 
00922 /* single digit addition */
00923 static int
00924 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
00925 {
00926   int     res, ix, oldused;
00927   mp_digit *tmpa, *tmpc, mu;
00928 
00929   /* grow c as required */
00930   if (c->alloc < a->used + 1) {
00931      if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
00932         return res;
00933      }
00934   }
00935 
00936   /* if a is negative and |a| >= b, call c = |a| - b */
00937   if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
00938      /* temporarily fix sign of a */
00939      a->sign = MP_ZPOS;
00940 
00941      /* c = |a| - b */
00942      res = mp_sub_d(a, b, c);
00943 
00944      /* fix sign  */
00945      a->sign = c->sign = MP_NEG;
00946 
00947      return res;
00948   }
00949 
00950   /* old number of used digits in c */
00951   oldused = c->used;
00952 
00953   /* sign always positive */
00954   c->sign = MP_ZPOS;
00955 
00956   /* source alias */
00957   tmpa    = a->dp;
00958 
00959   /* destination alias */
00960   tmpc    = c->dp;
00961 
00962   /* if a is positive */
00963   if (a->sign == MP_ZPOS) {
00964      /* add digit, after this we're propagating
00965       * the carry.
00966       */
00967      *tmpc   = *tmpa++ + b;
00968      mu      = *tmpc >> DIGIT_BIT;
00969      *tmpc++ &= MP_MASK;
00970 
00971      /* now handle rest of the digits */
00972      for (ix = 1; ix < a->used; ix++) {
00973         *tmpc   = *tmpa++ + mu;
00974         mu      = *tmpc >> DIGIT_BIT;
00975         *tmpc++ &= MP_MASK;
00976      }
00977      /* set final carry */
00978      ix++;
00979      *tmpc++  = mu;
00980 
00981      /* setup size */
00982      c->used = a->used + 1;
00983   } else {
00984      /* a was negative and |a| < b */
00985      c->used  = 1;
00986 
00987      /* the result is a single digit */
00988      if (a->used == 1) {
00989         *tmpc++  =  b - a->dp[0];
00990      } else {
00991         *tmpc++  =  b;
00992      }
00993 
00994      /* setup count so the clearing of oldused
00995       * can fall through correctly
00996       */
00997      ix       = 1;
00998   }
00999 
01000   /* now zero to oldused */
01001   while (ix++ < oldused) {
01002      *tmpc++ = 0;
01003   }
01004   mp_clamp(c);
01005 
01006   return MP_OKAY;
01007 }
01008 
01009 /* trim unused digits 
01010  *
01011  * This is used to ensure that leading zero digits are
01012  * trimed and the leading "used" digit will be non-zero
01013  * Typically very fast.  Also fixes the sign if there
01014  * are no more leading digits
01015  */
01016 void
01017 mp_clamp (mp_int * a)
01018 {
01019   /* decrease used while the most significant digit is
01020    * zero.
01021    */
01022   while (a->used > 0 && a->dp[a->used - 1] == 0) {
01023     --(a->used);
01024   }
01025 
01026   /* reset the sign flag if used == 0 */
01027   if (a->used == 0) {
01028     a->sign = MP_ZPOS;
01029   }
01030 }
01031 
01032 void mp_clear_multi(mp_int *mp, ...) 
01033 {
01034     mp_int* next_mp = mp;
01035     va_list args;
01036     va_start(args, mp);
01037     while (next_mp != NULL) {
01038         mp_clear(next_mp);
01039         next_mp = va_arg(args, mp_int*);
01040     }
01041     va_end(args);
01042 }
01043 
01044 /* compare two ints (signed)*/
01045 int
01046 mp_cmp (const mp_int * a, const mp_int * b)
01047 {
01048   /* compare based on sign */
01049   if (a->sign != b->sign) {
01050      if (a->sign == MP_NEG) {
01051         return MP_LT;
01052      } else {
01053         return MP_GT;
01054      }
01055   }
01056   
01057   /* compare digits */
01058   if (a->sign == MP_NEG) {
01059      /* if negative compare opposite direction */
01060      return mp_cmp_mag(b, a);
01061   } else {
01062      return mp_cmp_mag(a, b);
01063   }
01064 }
01065 
01066 /* compare a digit */
01067 int mp_cmp_d(const mp_int * a, mp_digit b)
01068 {
01069   /* compare based on sign */
01070   if (a->sign == MP_NEG) {
01071     return MP_LT;
01072   }
01073 
01074   /* compare based on magnitude */
01075   if (a->used > 1) {
01076     return MP_GT;
01077   }
01078 
01079   /* compare the only digit of a to b */
01080   if (a->dp[0] > b) {
01081     return MP_GT;
01082   } else if (a->dp[0] < b) {
01083     return MP_LT;
01084   } else {
01085     return MP_EQ;
01086   }
01087 }
01088 
01089 /* compare maginitude of two ints (unsigned) */
01090 int mp_cmp_mag (const mp_int * a, const mp_int * b)
01091 {
01092   int     n;
01093   mp_digit *tmpa, *tmpb;
01094 
01095   /* compare based on # of non-zero digits */
01096   if (a->used > b->used) {
01097     return MP_GT;
01098   }
01099   
01100   if (a->used < b->used) {
01101     return MP_LT;
01102   }
01103 
01104   /* alias for a */
01105   tmpa = a->dp + (a->used - 1);
01106 
01107   /* alias for b */
01108   tmpb = b->dp + (a->used - 1);
01109 
01110   /* compare based on digits  */
01111   for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
01112     if (*tmpa > *tmpb) {
01113       return MP_GT;
01114     }
01115 
01116     if (*tmpa < *tmpb) {
01117       return MP_LT;
01118     }
01119   }
01120   return MP_EQ;
01121 }
01122 
01123 static const int lnz[16] = { 
01124    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
01125 };
01126 
01127 /* Counts the number of lsbs which are zero before the first zero bit */
01128 int mp_cnt_lsb(const mp_int *a)
01129 {
01130    int x;
01131    mp_digit q, qq;
01132 
01133    /* easy out */
01134    if (mp_iszero(a) == 1) {
01135       return 0;
01136    }
01137 
01138    /* scan lower digits until non-zero */
01139    for (x = 0; x < a->used && a->dp[x] == 0; x++);
01140    q = a->dp[x];
01141    x *= DIGIT_BIT;
01142 
01143    /* now scan this digit until a 1 is found */
01144    if ((q & 1) == 0) {
01145       do {
01146          qq  = q & 15;
01147          x  += lnz[qq];
01148          q >>= 4;
01149       } while (qq == 0);
01150    }
01151    return x;
01152 }
01153 
01154 /* copy, b = a */
01155 int
01156 mp_copy (const mp_int * a, mp_int * b)
01157 {
01158   int     res, n;
01159 
01160   /* if dst == src do nothing */
01161   if (a == b) {
01162     return MP_OKAY;
01163   }
01164 
01165   /* grow dest */
01166   if (b->alloc < a->used) {
01167      if ((res = mp_grow (b, a->used)) != MP_OKAY) {
01168         return res;
01169      }
01170   }
01171 
01172   /* zero b and copy the parameters over */
01173   {
01174     register mp_digit *tmpa, *tmpb;
01175 
01176     /* pointer aliases */
01177 
01178     /* source */
01179     tmpa = a->dp;
01180 
01181     /* destination */
01182     tmpb = b->dp;
01183 
01184     /* copy all the digits */
01185     for (n = 0; n < a->used; n++) {
01186       *tmpb++ = *tmpa++;
01187     }
01188 
01189     /* clear high digits */
01190     for (; n < b->used; n++) {
01191       *tmpb++ = 0;
01192     }
01193   }
01194 
01195   /* copy used count and sign */
01196   b->used = a->used;
01197   b->sign = a->sign;
01198   return MP_OKAY;
01199 }
01200 
01201 /* returns the number of bits in an int */
01202 int
01203 mp_count_bits (const mp_int * a)
01204 {
01205   int     r;
01206   mp_digit q;
01207 
01208   /* shortcut */
01209   if (a->used == 0) {
01210     return 0;
01211   }
01212 
01213   /* get number of digits and add that */
01214   r = (a->used - 1) * DIGIT_BIT;
01215   
01216   /* take the last digit and count the bits in it */
01217   q = a->dp[a->used - 1];
01218   while (q > 0) {
01219     ++r;
01220     q >>= ((mp_digit) 1);
01221   }
01222   return r;
01223 }
01224 
01225 /* calc a value mod 2**b */
01226 static int
01227 mp_mod_2d (const mp_int * a, int b, mp_int * c)
01228 {
01229   int     x, res;
01230 
01231   /* if b is <= 0 then zero the int */
01232   if (b <= 0) {
01233     mp_zero (c);
01234     return MP_OKAY;
01235   }
01236 
01237   /* if the modulus is larger than the value than return */
01238   if (b > a->used * DIGIT_BIT) {
01239     res = mp_copy (a, c);
01240     return res;
01241   }
01242 
01243   /* copy */
01244   if ((res = mp_copy (a, c)) != MP_OKAY) {
01245     return res;
01246   }
01247 
01248   /* zero digits above the last digit of the modulus */
01249   for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
01250     c->dp[x] = 0;
01251   }
01252   /* clear the digit that is not completely outside/inside the modulus */
01253   c->dp[b / DIGIT_BIT] &= (1 << ((mp_digit)b % DIGIT_BIT)) - 1;
01254   mp_clamp (c);
01255   return MP_OKAY;
01256 }
01257 
01258 /* shift right a certain amount of digits */
01259 static void mp_rshd (mp_int * a, int b)
01260 {
01261   int     x;
01262 
01263   /* if b <= 0 then ignore it */
01264   if (b <= 0) {
01265     return;
01266   }
01267 
01268   /* if b > used then simply zero it and return */
01269   if (a->used <= b) {
01270     mp_zero (a);
01271     return;
01272   }
01273 
01274   {
01275     register mp_digit *bottom, *top;
01276 
01277     /* shift the digits down */
01278 
01279     /* bottom */
01280     bottom = a->dp;
01281 
01282     /* top [offset into digits] */
01283     top = a->dp + b;
01284 
01285     /* this is implemented as a sliding window where
01286      * the window is b-digits long and digits from
01287      * the top of the window are copied to the bottom
01288      *
01289      * e.g.
01290 
01291      b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
01292                  /\                   |      ---->
01293                   \-------------------/      ---->
01294      */
01295     for (x = 0; x < (a->used - b); x++) {
01296       *bottom++ = *top++;
01297     }
01298 
01299     /* zero the top digits */
01300     for (; x < a->used; x++) {
01301       *bottom++ = 0;
01302     }
01303   }
01304 
01305   /* remove excess digits */
01306   a->used -= b;
01307 }
01308 
01309 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
01310 static int mp_div_2d (const mp_int * a, int b, mp_int * c, mp_int * d)
01311 {
01312   mp_digit D, r, rr;
01313   int     x, res;
01314   mp_int  t;
01315 
01316 
01317   /* if the shift count is <= 0 then we do no work */
01318   if (b <= 0) {
01319     res = mp_copy (a, c);
01320     if (d != NULL) {
01321       mp_zero (d);
01322     }
01323     return res;
01324   }
01325 
01326   if ((res = mp_init (&t)) != MP_OKAY) {
01327     return res;
01328   }
01329 
01330   /* get the remainder */
01331   if (d != NULL) {
01332     if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
01333       mp_clear (&t);
01334       return res;
01335     }
01336   }
01337 
01338   /* copy */
01339   if ((res = mp_copy (a, c)) != MP_OKAY) {
01340     mp_clear (&t);
01341     return res;
01342   }
01343 
01344   /* shift by as many digits in the bit count */
01345   if (b >= DIGIT_BIT) {
01346     mp_rshd (c, b / DIGIT_BIT);
01347   }
01348 
01349   /* shift any bit count < DIGIT_BIT */
01350   D = (mp_digit) (b % DIGIT_BIT);
01351   if (D != 0) {
01352     register mp_digit *tmpc, mask, shift;
01353 
01354     /* mask */
01355     mask = (((mp_digit)1) << D) - 1;
01356 
01357     /* shift for lsb */
01358     shift = DIGIT_BIT - D;
01359 
01360     /* alias */
01361     tmpc = c->dp + (c->used - 1);
01362 
01363     /* carry */
01364     r = 0;
01365     for (x = c->used - 1; x >= 0; x--) {
01366       /* get the lower  bits of this word in a temp */
01367       rr = *tmpc & mask;
01368 
01369       /* shift the current word and mix in the carry bits from the previous word */
01370       *tmpc = (*tmpc >> D) | (r << shift);
01371       --tmpc;
01372 
01373       /* set the carry to the carry bits of the current word found above */
01374       r = rr;
01375     }
01376   }
01377   mp_clamp (c);
01378   if (d != NULL) {
01379     mp_exch (&t, d);
01380   }
01381   mp_clear (&t);
01382   return MP_OKAY;
01383 }
01384 
01385 /* shift left a certain amount of digits */
01386 static int mp_lshd (mp_int * a, int b)
01387 {
01388   int     x, res;
01389 
01390   /* if its less than zero return */
01391   if (b <= 0) {
01392     return MP_OKAY;
01393   }
01394 
01395   /* grow to fit the new digits */
01396   if (a->alloc < a->used + b) {
01397      if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
01398        return res;
01399      }
01400   }
01401 
01402   {
01403     register mp_digit *top, *bottom;
01404 
01405     /* increment the used by the shift amount then copy upwards */
01406     a->used += b;
01407 
01408     /* top */
01409     top = a->dp + a->used - 1;
01410 
01411     /* base */
01412     bottom = a->dp + a->used - 1 - b;
01413 
01414     /* much like mp_rshd this is implemented using a sliding window
01415      * except the window goes the other way around.  Copying from
01416      * the bottom to the top.  see bn_mp_rshd.c for more info.
01417      */
01418     for (x = a->used - 1; x >= b; x--) {
01419       *top-- = *bottom--;
01420     }
01421 
01422     /* zero the lower digits */
01423     top = a->dp;
01424     for (x = 0; x < b; x++) {
01425       *top++ = 0;
01426     }
01427   }
01428   return MP_OKAY;
01429 }
01430 
01431 /* shift left by a certain bit count */
01432 static int mp_mul_2d (const mp_int * a, int b, mp_int * c)
01433 {
01434   mp_digit d;
01435   int      res;
01436 
01437   /* copy */
01438   if (a != c) {
01439      if ((res = mp_copy (a, c)) != MP_OKAY) {
01440        return res;
01441      }
01442   }
01443 
01444   if (c->alloc < c->used + b/DIGIT_BIT + 1) {
01445      if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
01446        return res;
01447      }
01448   }
01449 
01450   /* shift by as many digits in the bit count */
01451   if (b >= DIGIT_BIT) {
01452     if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
01453       return res;
01454     }
01455   }
01456 
01457   /* shift any bit count < DIGIT_BIT */
01458   d = (mp_digit) (b % DIGIT_BIT);
01459   if (d != 0) {
01460     register mp_digit *tmpc, shift, mask, r, rr;
01461     register int x;
01462 
01463     /* bitmask for carries */
01464     mask = (((mp_digit)1) << d) - 1;
01465 
01466     /* shift for msbs */
01467     shift = DIGIT_BIT - d;
01468 
01469     /* alias */
01470     tmpc = c->dp;
01471 
01472     /* carry */
01473     r    = 0;
01474     for (x = 0; x < c->used; x++) {
01475       /* get the higher bits of the current word */
01476       rr = (*tmpc >> shift) & mask;
01477 
01478       /* shift the current word and OR in the carry */
01479       *tmpc = ((*tmpc << d) | r) & MP_MASK;
01480       ++tmpc;
01481 
01482       /* set the carry to the carry bits of the current word */
01483       r = rr;
01484     }
01485 
01486     /* set final carry */
01487     if (r != 0) {
01488        c->dp[(c->used)++] = r;
01489     }
01490   }
01491   mp_clamp (c);
01492   return MP_OKAY;
01493 }
01494 
01495 /* multiply by a digit */
01496 static int
01497 mp_mul_d (const mp_int * a, mp_digit b, mp_int * c)
01498 {
01499   mp_digit u, *tmpa, *tmpc;
01500   mp_word  r;
01501   int      ix, res, olduse;
01502 
01503   /* make sure c is big enough to hold a*b */
01504   if (c->alloc < a->used + 1) {
01505     if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
01506       return res;
01507     }
01508   }
01509 
01510   /* get the original destinations used count */
01511   olduse = c->used;
01512 
01513   /* set the sign */
01514   c->sign = a->sign;
01515 
01516   /* alias for a->dp [source] */
01517   tmpa = a->dp;
01518 
01519   /* alias for c->dp [dest] */
01520   tmpc = c->dp;
01521 
01522   /* zero carry */
01523   u = 0;
01524 
01525   /* compute columns */
01526   for (ix = 0; ix < a->used; ix++) {
01527     /* compute product and carry sum for this term */
01528     r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
01529 
01530     /* mask off higher bits to get a single digit */
01531     *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
01532 
01533     /* send carry into next iteration */
01534     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
01535   }
01536 
01537   /* store final carry [if any] */
01538   *tmpc++ = u;
01539 
01540   /* now zero digits above the top */
01541   while (ix++ < olduse) {
01542      *tmpc++ = 0;
01543   }
01544 
01545   /* set used count */
01546   c->used = a->used + 1;
01547   mp_clamp(c);
01548 
01549   return MP_OKAY;
01550 }
01551 
01552 /* integer signed division. 
01553  * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
01554  * HAC pp.598 Algorithm 14.20
01555  *
01556  * Note that the description in HAC is horribly 
01557  * incomplete.  For example, it doesn't consider 
01558  * the case where digits are removed from 'x' in 
01559  * the inner loop.  It also doesn't consider the 
01560  * case that y has fewer than three digits, etc..
01561  *
01562  * The overall algorithm is as described as 
01563  * 14.20 from HAC but fixed to treat these cases.
01564 */
01565 static int mp_div (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
01566 {
01567   mp_int  q, x, y, t1, t2;
01568   int     res, n, t, i, norm, neg;
01569 
01570   /* is divisor zero ? */
01571   if (mp_iszero (b) == 1) {
01572     return MP_VAL;
01573   }
01574 
01575   /* if a < b then q=0, r = a */
01576   if (mp_cmp_mag (a, b) == MP_LT) {
01577     if (d != NULL) {
01578       res = mp_copy (a, d);
01579     } else {
01580       res = MP_OKAY;
01581     }
01582     if (c != NULL) {
01583       mp_zero (c);
01584     }
01585     return res;
01586   }
01587 
01588   if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
01589     return res;
01590   }
01591   q.used = a->used + 2;
01592 
01593   if ((res = mp_init (&t1)) != MP_OKAY) {
01594     goto __Q;
01595   }
01596 
01597   if ((res = mp_init (&t2)) != MP_OKAY) {
01598     goto __T1;
01599   }
01600 
01601   if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
01602     goto __T2;
01603   }
01604 
01605   if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
01606     goto __X;
01607   }
01608 
01609   /* fix the sign */
01610   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
01611   x.sign = y.sign = MP_ZPOS;
01612 
01613   /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
01614   norm = mp_count_bits(&y) % DIGIT_BIT;
01615   if (norm < DIGIT_BIT-1) {
01616      norm = (DIGIT_BIT-1) - norm;
01617      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
01618        goto __Y;
01619      }
01620      if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
01621        goto __Y;
01622      }
01623   } else {
01624      norm = 0;
01625   }
01626 
01627   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
01628   n = x.used - 1;
01629   t = y.used - 1;
01630 
01631   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
01632   if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
01633     goto __Y;
01634   }
01635 
01636   while (mp_cmp (&x, &y) != MP_LT) {
01637     ++(q.dp[n - t]);
01638     if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
01639       goto __Y;
01640     }
01641   }
01642 
01643   /* reset y by shifting it back down */
01644   mp_rshd (&y, n - t);
01645 
01646   /* step 3. for i from n down to (t + 1) */
01647   for (i = n; i >= (t + 1); i--) {
01648     if (i > x.used) {
01649       continue;
01650     }
01651 
01652     /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
01653      * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
01654     if (x.dp[i] == y.dp[t]) {
01655       q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
01656     } else {
01657       mp_word tmp;
01658       tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
01659       tmp |= ((mp_word) x.dp[i - 1]);
01660       tmp /= ((mp_word) y.dp[t]);
01661       if (tmp > (mp_word) MP_MASK)
01662         tmp = MP_MASK;
01663       q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
01664     }
01665 
01666     /* while (q{i-t-1} * (yt * b + y{t-1})) > 
01667              xi * b**2 + xi-1 * b + xi-2 
01668      
01669        do q{i-t-1} -= 1; 
01670     */
01671     q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
01672     do {
01673       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
01674 
01675       /* find left hand */
01676       mp_zero (&t1);
01677       t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
01678       t1.dp[1] = y.dp[t];
01679       t1.used = 2;
01680       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
01681         goto __Y;
01682       }
01683 
01684       /* find right hand */
01685       t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
01686       t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
01687       t2.dp[2] = x.dp[i];
01688       t2.used = 3;
01689     } while (mp_cmp_mag(&t1, &t2) == MP_GT);
01690 
01691     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
01692     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
01693       goto __Y;
01694     }
01695 
01696     if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
01697       goto __Y;
01698     }
01699 
01700     if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
01701       goto __Y;
01702     }
01703 
01704     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
01705     if (x.sign == MP_NEG) {
01706       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
01707         goto __Y;
01708       }
01709       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
01710         goto __Y;
01711       }
01712       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
01713         goto __Y;
01714       }
01715 
01716       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
01717     }
01718   }
01719 
01720   /* now q is the quotient and x is the remainder 
01721    * [which we have to normalize] 
01722    */
01723   
01724   /* get sign before writing to c */
01725   x.sign = x.used == 0 ? MP_ZPOS : a->sign;
01726 
01727   if (c != NULL) {
01728     mp_clamp (&q);
01729     mp_exch (&q, c);
01730     c->sign = neg;
01731   }
01732 
01733   if (d != NULL) {
01734     mp_div_2d (&x, norm, &x, NULL);
01735     mp_exch (&x, d);
01736   }
01737 
01738   res = MP_OKAY;
01739 
01740 __Y:mp_clear (&y);
01741 __X:mp_clear (&x);
01742 __T2:mp_clear (&t2);
01743 __T1:mp_clear (&t1);
01744 __Q:mp_clear (&q);
01745   return res;
01746 }
01747 
01748 static int s_is_power_of_two(mp_digit b, int *p)
01749 {
01750    int x;
01751 
01752    for (x = 1; x < DIGIT_BIT; x++) {
01753       if (b == (((mp_digit)1)<<x)) {
01754          *p = x;
01755          return 1;
01756       }
01757    }
01758    return 0;
01759 }
01760 
01761 /* single digit division (based on routine from MPI) */
01762 static int mp_div_d (const mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
01763 {
01764   mp_int  q;
01765   mp_word w;
01766   mp_digit t;
01767   int     res, ix;
01768 
01769   /* cannot divide by zero */
01770   if (b == 0) {
01771      return MP_VAL;
01772   }
01773 
01774   /* quick outs */
01775   if (b == 1 || mp_iszero(a) == 1) {
01776      if (d != NULL) {
01777         *d = 0;
01778      }
01779      if (c != NULL) {
01780         return mp_copy(a, c);
01781      }
01782      return MP_OKAY;
01783   }
01784 
01785   /* power of two ? */
01786   if (s_is_power_of_two(b, &ix) == 1) {
01787      if (d != NULL) {
01788         *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
01789      }
01790      if (c != NULL) {
01791         return mp_div_2d(a, ix, c, NULL);
01792      }
01793      return MP_OKAY;
01794   }
01795 
01796   /* no easy answer [c'est la vie].  Just division */
01797   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
01798      return res;
01799   }
01800   
01801   q.used = a->used;
01802   q.sign = a->sign;
01803   w = 0;
01804   for (ix = a->used - 1; ix >= 0; ix--) {
01805      w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
01806      
01807      if (w >= b) {
01808         t = (mp_digit)(w / b);
01809         w -= ((mp_word)t) * ((mp_word)b);
01810       } else {
01811         t = 0;
01812       }
01813       q.dp[ix] = t;
01814   }
01815 
01816   if (d != NULL) {
01817      *d = (mp_digit)w;
01818   }
01819   
01820   if (c != NULL) {
01821      mp_clamp(&q);
01822      mp_exch(&q, c);
01823   }
01824   mp_clear(&q);
01825   
01826   return res;
01827 }
01828 
01829 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
01830  *
01831  * Based on algorithm from the paper
01832  *
01833  * "Generating Efficient Primes for Discrete Log Cryptosystems"
01834  *                 Chae Hoon Lim, Pil Loong Lee,
01835  *          POSTECH Information Research Laboratories
01836  *
01837  * The modulus must be of a special format [see manual]
01838  *
01839  * Has been modified to use algorithm 7.10 from the LTM book instead
01840  *
01841  * Input x must be in the range 0 <= x <= (n-1)**2
01842  */
01843 static int
01844 mp_dr_reduce (mp_int * x, const mp_int * n, mp_digit k)
01845 {
01846   int      err, i, m;
01847   mp_word  r;
01848   mp_digit mu, *tmpx1, *tmpx2;
01849 
01850   /* m = digits in modulus */
01851   m = n->used;
01852 
01853   /* ensure that "x" has at least 2m digits */
01854   if (x->alloc < m + m) {
01855     if ((err = mp_grow (x, m + m)) != MP_OKAY) {
01856       return err;
01857     }
01858   }
01859 
01860 /* top of loop, this is where the code resumes if
01861  * another reduction pass is required.
01862  */
01863 top:
01864   /* aliases for digits */
01865   /* alias for lower half of x */
01866   tmpx1 = x->dp;
01867 
01868   /* alias for upper half of x, or x/B**m */
01869   tmpx2 = x->dp + m;
01870 
01871   /* set carry to zero */
01872   mu = 0;
01873 
01874   /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
01875   for (i = 0; i < m; i++) {
01876       r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
01877       *tmpx1++  = (mp_digit)(r & MP_MASK);
01878       mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
01879   }
01880 
01881   /* set final carry */
01882   *tmpx1++ = mu;
01883 
01884   /* zero words above m */
01885   for (i = m + 1; i < x->used; i++) {
01886       *tmpx1++ = 0;
01887   }
01888 
01889   /* clamp, sub and return */
01890   mp_clamp (x);
01891 
01892   /* if x >= n then subtract and reduce again
01893    * Each successive "recursion" makes the input smaller and smaller.
01894    */
01895   if (mp_cmp_mag (x, n) != MP_LT) {
01896     s_mp_sub(x, n, x);
01897     goto top;
01898   }
01899   return MP_OKAY;
01900 }
01901 
01902 /* sets the value of "d" required for mp_dr_reduce */
01903 static void mp_dr_setup(const mp_int *a, mp_digit *d)
01904 {
01905    /* the casts are required if DIGIT_BIT is one less than
01906     * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
01907     */
01908    *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
01909         ((mp_word)a->dp[0]));
01910 }
01911 
01912 /* this is a shell function that calls either the normal or Montgomery
01913  * exptmod functions.  Originally the call to the montgomery code was
01914  * embedded in the normal function but that wasted a lot of stack space
01915  * for nothing (since 99% of the time the Montgomery code would be called)
01916  */
01917 int mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
01918 {
01919   int dr;
01920 
01921   /* modulus P must be positive */
01922   if (P->sign == MP_NEG) {
01923      return MP_VAL;
01924   }
01925 
01926   /* if exponent X is negative we have to recurse */
01927   if (X->sign == MP_NEG) {
01928      mp_int tmpG, tmpX;
01929      int err;
01930 
01931      /* first compute 1/G mod P */
01932      if ((err = mp_init(&tmpG)) != MP_OKAY) {
01933         return err;
01934      }
01935      if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
01936         mp_clear(&tmpG);
01937         return err;
01938      }
01939 
01940      /* now get |X| */
01941      if ((err = mp_init(&tmpX)) != MP_OKAY) {
01942         mp_clear(&tmpG);
01943         return err;
01944      }
01945      if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
01946         mp_clear_multi(&tmpG, &tmpX, NULL);
01947         return err;
01948      }
01949 
01950      /* and now compute (1/G)**|X| instead of G**X [X < 0] */
01951      err = mp_exptmod(&tmpG, &tmpX, P, Y);
01952      mp_clear_multi(&tmpG, &tmpX, NULL);
01953      return err;
01954   }
01955 
01956   dr = 0;
01957 
01958   /* if the modulus is odd or dr != 0 use the fast method */
01959   if (mp_isodd (P) == 1 || dr !=  0) {
01960     return mp_exptmod_fast (G, X, P, Y, dr);
01961   } else {
01962     /* otherwise use the generic Barrett reduction technique */
01963     return s_mp_exptmod (G, X, P, Y);
01964   }
01965 }
01966 
01967 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
01968  *
01969  * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
01970  * The value of k changes based on the size of the exponent.
01971  *
01972  * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
01973  */
01974 
01975 int
01976 mp_exptmod_fast (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y, int redmode)
01977 {
01978   mp_int  M[256], res;
01979   mp_digit buf, mp;
01980   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
01981 
01982   /* use a pointer to the reduction algorithm.  This allows us to use
01983    * one of many reduction algorithms without modding the guts of
01984    * the code with if statements everywhere.
01985    */
01986   int     (*redux)(mp_int*,const mp_int*,mp_digit);
01987 
01988   /* find window size */
01989   x = mp_count_bits (X);
01990   if (x <= 7) {
01991     winsize = 2;
01992   } else if (x <= 36) {
01993     winsize = 3;
01994   } else if (x <= 140) {
01995     winsize = 4;
01996   } else if (x <= 450) {
01997     winsize = 5;
01998   } else if (x <= 1303) {
01999     winsize = 6;
02000   } else if (x <= 3529) {
02001     winsize = 7;
02002   } else {
02003     winsize = 8;
02004   }
02005 
02006   /* init M array */
02007   /* init first cell */
02008   if ((err = mp_init(&M[1])) != MP_OKAY) {
02009      return err;
02010   }
02011 
02012   /* now init the second half of the array */
02013   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
02014     if ((err = mp_init(&M[x])) != MP_OKAY) {
02015       for (y = 1<<(winsize-1); y < x; y++) {
02016         mp_clear (&M[y]);
02017       }
02018       mp_clear(&M[1]);
02019       return err;
02020     }
02021   }
02022 
02023   /* determine and setup reduction code */
02024   if (redmode == 0) {
02025      /* now setup montgomery  */
02026      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
02027         goto __M;
02028      }
02029 
02030      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
02031      if (((P->used * 2 + 1) < MP_WARRAY) &&
02032           P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
02033         redux = fast_mp_montgomery_reduce;
02034      } else {
02035         /* use slower baseline Montgomery method */
02036         redux = mp_montgomery_reduce;
02037      }
02038   } else if (redmode == 1) {
02039      /* setup DR reduction for moduli of the form B**k - b */
02040      mp_dr_setup(P, &mp);
02041      redux = mp_dr_reduce;
02042   } else {
02043      /* setup DR reduction for moduli of the form 2**k - b */
02044      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
02045         goto __M;
02046      }
02047      redux = mp_reduce_2k;
02048   }
02049 
02050   /* setup result */
02051   if ((err = mp_init (&res)) != MP_OKAY) {
02052     goto __M;
02053   }
02054 
02055   /* create M table
02056    *
02057 
02058    *
02059    * The first half of the table is not computed though accept for M[0] and M[1]
02060    */
02061 
02062   if (redmode == 0) {
02063      /* now we need R mod m */
02064      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
02065        goto __RES;
02066      }
02067 
02068      /* now set M[1] to G * R mod m */
02069      if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
02070        goto __RES;
02071      }
02072   } else {
02073      mp_set(&res, 1);
02074      if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
02075         goto __RES;
02076      }
02077   }
02078 
02079   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
02080   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
02081     goto __RES;
02082   }
02083 
02084   for (x = 0; x < (winsize - 1); x++) {
02085     if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
02086       goto __RES;
02087     }
02088     if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
02089       goto __RES;
02090     }
02091   }
02092 
02093   /* create upper table */
02094   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
02095     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
02096       goto __RES;
02097     }
02098     if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
02099       goto __RES;
02100     }
02101   }
02102 
02103   /* set initial mode and bit cnt */
02104   mode   = 0;
02105   bitcnt = 1;
02106   buf    = 0;
02107   digidx = X->used - 1;
02108   bitcpy = 0;
02109   bitbuf = 0;
02110 
02111   for (;;) {
02112     /* grab next digit as required */
02113     if (--bitcnt == 0) {
02114       /* if digidx == -1 we are out of digits so break */
02115       if (digidx == -1) {
02116         break;
02117       }
02118       /* read next digit and reset bitcnt */
02119       buf    = X->dp[digidx--];
02120       bitcnt = DIGIT_BIT;
02121     }
02122 
02123     /* grab the next msb from the exponent */
02124     y     = (buf >> (DIGIT_BIT - 1)) & 1;
02125     buf <<= (mp_digit)1;
02126 
02127     /* if the bit is zero and mode == 0 then we ignore it
02128      * These represent the leading zero bits before the first 1 bit
02129      * in the exponent.  Technically this opt is not required but it
02130      * does lower the # of trivial squaring/reductions used
02131      */
02132     if (mode == 0 && y == 0) {
02133       continue;
02134     }
02135 
02136     /* if the bit is zero and mode == 1 then we square */
02137     if (mode == 1 && y == 0) {
02138       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
02139         goto __RES;
02140       }
02141       if ((err = redux (&res, P, mp)) != MP_OKAY) {
02142         goto __RES;
02143       }
02144       continue;
02145     }
02146 
02147     /* else we add it to the window */
02148     bitbuf |= (y << (winsize - ++bitcpy));
02149     mode    = 2;
02150 
02151     if (bitcpy == winsize) {
02152       /* ok window is filled so square as required and multiply  */
02153       /* square first */
02154       for (x = 0; x < winsize; x++) {
02155         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
02156           goto __RES;
02157         }
02158         if ((err = redux (&res, P, mp)) != MP_OKAY) {
02159           goto __RES;
02160         }
02161       }
02162 
02163       /* then multiply */
02164       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
02165         goto __RES;
02166       }
02167       if ((err = redux (&res, P, mp)) != MP_OKAY) {
02168         goto __RES;
02169       }
02170 
02171       /* empty window and reset */
02172       bitcpy = 0;
02173       bitbuf = 0;
02174       mode   = 1;
02175     }
02176   }
02177 
02178   /* if bits remain then square/multiply */
02179   if (mode == 2 && bitcpy > 0) {
02180     /* square then multiply if the bit is set */
02181     for (x = 0; x < bitcpy; x++) {
02182       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
02183         goto __RES;
02184       }
02185       if ((err = redux (&res, P, mp)) != MP_OKAY) {
02186         goto __RES;
02187       }
02188 
02189       /* get next bit of the window */
02190       bitbuf <<= 1;
02191       if ((bitbuf & (1 << winsize)) != 0) {
02192         /* then multiply */
02193         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
02194           goto __RES;
02195         }
02196         if ((err = redux (&res, P, mp)) != MP_OKAY) {
02197           goto __RES;
02198         }
02199       }
02200     }
02201   }
02202 
02203   if (redmode == 0) {
02204      /* fixup result if Montgomery reduction is used
02205       * recall that any value in a Montgomery system is
02206       * actually multiplied by R mod n.  So we have
02207       * to reduce one more time to cancel out the factor
02208       * of R.
02209       */
02210      if ((err = redux(&res, P, mp)) != MP_OKAY) {
02211        goto __RES;
02212      }
02213   }
02214 
02215   /* swap res with Y */
02216   mp_exch (&res, Y);
02217   err = MP_OKAY;
02218 __RES:mp_clear (&res);
02219 __M:
02220   mp_clear(&M[1]);
02221   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
02222     mp_clear (&M[x]);
02223   }
02224   return err;
02225 }
02226 
02227 /* Greatest Common Divisor using the binary method */
02228 int mp_gcd (const mp_int * a, const mp_int * b, mp_int * c)
02229 {
02230   mp_int  u, v;
02231   int     k, u_lsb, v_lsb, res;
02232 
02233   /* either zero than gcd is the largest */
02234   if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
02235     return mp_abs (b, c);
02236   }
02237   if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
02238     return mp_abs (a, c);
02239   }
02240 
02241   /* optimized.  At this point if a == 0 then
02242    * b must equal zero too
02243    */
02244   if (mp_iszero (a) == 1) {
02245     mp_zero(c);
02246     return MP_OKAY;
02247   }
02248 
02249   /* get copies of a and b we can modify */
02250   if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
02251     return res;
02252   }
02253 
02254   if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
02255     goto __U;
02256   }
02257 
02258   /* must be positive for the remainder of the algorithm */
02259   u.sign = v.sign = MP_ZPOS;
02260 
02261   /* B1.  Find the common power of two for u and v */
02262   u_lsb = mp_cnt_lsb(&u);
02263   v_lsb = mp_cnt_lsb(&v);
02264   k     = MIN(u_lsb, v_lsb);
02265 
02266   if (k > 0) {
02267      /* divide the power of two out */
02268      if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
02269         goto __V;
02270      }
02271 
02272      if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
02273         goto __V;
02274      }
02275   }
02276 
02277   /* divide any remaining factors of two out */
02278   if (u_lsb != k) {
02279      if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
02280         goto __V;
02281      }
02282   }
02283 
02284   if (v_lsb != k) {
02285      if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
02286         goto __V;
02287      }
02288   }
02289 
02290   while (mp_iszero(&v) == 0) {
02291      /* make sure v is the largest */
02292      if (mp_cmp_mag(&u, &v) == MP_GT) {
02293         /* swap u and v to make sure v is >= u */
02294         mp_exch(&u, &v);
02295      }
02296      
02297      /* subtract smallest from largest */
02298      if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
02299         goto __V;
02300      }
02301      
02302      /* Divide out all factors of two */
02303      if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
02304         goto __V;
02305      } 
02306   } 
02307 
02308   /* multiply by 2**k which we divided out at the beginning */
02309   if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
02310      goto __V;
02311   }
02312   c->sign = MP_ZPOS;
02313   res = MP_OKAY;
02314 __V:mp_clear (&u);
02315 __U:mp_clear (&v);
02316   return res;
02317 }
02318 
02319 /* get the lower 32-bits of an mp_int */
02320 unsigned long mp_get_int(const mp_int * a)
02321 {
02322   int i;
02323   unsigned long res;
02324 
02325   if (a->used == 0) {
02326      return 0;
02327   }
02328 
02329   /* get number of digits of the lsb we have to read */
02330   i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
02331 
02332   /* get most significant digit of result */
02333   res = DIGIT(a,i);
02334    
02335   while (--i >= 0) {
02336     res = (res << DIGIT_BIT) | DIGIT(a,i);
02337   }
02338 
02339   /* force result to 32-bits always so it is consistent on non 32-bit platforms */
02340   return res & 0xFFFFFFFFUL;
02341 }
02342 
02343 /* creates "a" then copies b into it */
02344 int mp_init_copy (mp_int * a, const mp_int * b)
02345 {
02346   int     res;
02347 
02348   if ((res = mp_init (a)) != MP_OKAY) {
02349     return res;
02350   }
02351   return mp_copy (b, a);
02352 }
02353 
02354 int mp_init_multi(mp_int *mp, ...) 
02355 {
02356     mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
02357     int n = 0;                 /* Number of ok inits */
02358     mp_int* cur_arg = mp;
02359     va_list args;
02360 
02361     va_start(args, mp);        /* init args to next argument from caller */
02362     while (cur_arg != NULL) {
02363         if (mp_init(cur_arg) != MP_OKAY) {
02364             /* Oops - error! Back-track and mp_clear what we already
02365                succeeded in init-ing, then return error.
02366             */
02367             va_list clean_args;
02368             
02369             /* end the current list */
02370             va_end(args);
02371             
02372             /* now start cleaning up */            
02373             cur_arg = mp;
02374             va_start(clean_args, mp);
02375             while (n--) {
02376                 mp_clear(cur_arg);
02377                 cur_arg = va_arg(clean_args, mp_int*);
02378             }
02379             va_end(clean_args);
02380             res = MP_MEM;
02381             break;
02382         }
02383         n++;
02384         cur_arg = va_arg(args, mp_int*);
02385     }
02386     va_end(args);
02387     return res;                /* Assumed ok, if error flagged above. */
02388 }
02389 
02390 /* hac 14.61, pp608 */
02391 int mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
02392 {
02393   /* b cannot be negative */
02394   if (b->sign == MP_NEG || mp_iszero(b) == 1) {
02395     return MP_VAL;
02396   }
02397 
02398   /* if the modulus is odd we can use a faster routine instead */
02399   if (mp_isodd (b) == 1) {
02400     return fast_mp_invmod (a, b, c);
02401   }
02402   
02403   return mp_invmod_slow(a, b, c);
02404 }
02405 
02406 /* hac 14.61, pp608 */
02407 int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c)
02408 {
02409   mp_int  x, y, u, v, A, B, C, D;
02410   int     res;
02411 
02412   /* b cannot be negative */
02413   if (b->sign == MP_NEG || mp_iszero(b) == 1) {
02414     return MP_VAL;
02415   }
02416 
02417   /* init temps */
02418   if ((res = mp_init_multi(&x, &y, &u, &v, 
02419                            &A, &B, &C, &D, NULL)) != MP_OKAY) {
02420      return res;
02421   }
02422 
02423   /* x = a, y = b */
02424   if ((res = mp_copy (a, &x)) != MP_OKAY) {
02425     goto __ERR;
02426   }
02427   if ((res = mp_copy (b, &y)) != MP_OKAY) {
02428     goto __ERR;
02429   }
02430 
02431   /* 2. [modified] if x,y are both even then return an error! */
02432   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
02433     res = MP_VAL;
02434     goto __ERR;
02435   }
02436 
02437   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
02438   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
02439     goto __ERR;
02440   }
02441   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
02442     goto __ERR;
02443   }
02444   mp_set (&A, 1);
02445   mp_set (&D, 1);
02446 
02447 top:
02448   /* 4.  while u is even do */
02449   while (mp_iseven (&u) == 1) {
02450     /* 4.1 u = u/2 */
02451     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
02452       goto __ERR;
02453     }
02454     /* 4.2 if A or B is odd then */
02455     if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
02456       /* A = (A+y)/2, B = (B-x)/2 */
02457       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
02458          goto __ERR;
02459       }
02460       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
02461          goto __ERR;
02462       }
02463     }
02464     /* A = A/2, B = B/2 */
02465     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
02466       goto __ERR;
02467     }
02468     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
02469       goto __ERR;
02470     }
02471   }
02472 
02473   /* 5.  while v is even do */
02474   while (mp_iseven (&v) == 1) {
02475     /* 5.1 v = v/2 */
02476     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
02477       goto __ERR;
02478     }
02479     /* 5.2 if C or D is odd then */
02480     if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
02481       /* C = (C+y)/2, D = (D-x)/2 */
02482       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
02483          goto __ERR;
02484       }
02485       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
02486          goto __ERR;
02487       }
02488     }
02489     /* C = C/2, D = D/2 */
02490     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
02491       goto __ERR;
02492     }
02493     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
02494       goto __ERR;
02495     }
02496   }
02497 
02498   /* 6.  if u >= v then */
02499   if (mp_cmp (&u, &v) != MP_LT) {
02500     /* u = u - v, A = A - C, B = B - D */
02501     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
02502       goto __ERR;
02503     }
02504 
02505     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
02506       goto __ERR;
02507     }
02508 
02509     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
02510       goto __ERR;
02511     }
02512   } else {
02513     /* v - v - u, C = C - A, D = D - B */
02514     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
02515       goto __ERR;
02516     }
02517 
02518     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
02519       goto __ERR;
02520     }
02521 
02522     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
02523       goto __ERR;
02524     }
02525   }
02526 
02527   /* if not zero goto step 4 */
02528   if (mp_iszero (&u) == 0)
02529     goto top;
02530 
02531   /* now a = C, b = D, gcd == g*v */
02532 
02533   /* if v != 1 then there is no inverse */
02534   if (mp_cmp_d (&v, 1) != MP_EQ) {
02535     res = MP_VAL;
02536     goto __ERR;
02537   }
02538 
02539   /* if its too low */
02540   while (mp_cmp_d(&C, 0) == MP_LT) {
02541       if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
02542          goto __ERR;
02543       }
02544   }
02545   
02546   /* too big */
02547   while (mp_cmp_mag(&C, b) != MP_LT) {
02548       if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
02549          goto __ERR;
02550       }
02551   }
02552   
02553   /* C is now the inverse */
02554   mp_exch (&C, c);
02555   res = MP_OKAY;
02556 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
02557   return res;
02558 }
02559 
02560 /* c = |a| * |b| using Karatsuba Multiplication using 
02561  * three half size multiplications
02562  *
02563  * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
02564  * let n represent half of the number of digits in 
02565  * the min(a,b)
02566  *
02567  * a = a1 * B**n + a0
02568  * b = b1 * B**n + b0
02569  *
02570  * Then, a * b => 
02571    a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
02572  *
02573  * Note that a1b1 and a0b0 are used twice and only need to be 
02574  * computed once.  So in total three half size (half # of 
02575  * digit) multiplications are performed, a0b0, a1b1 and 
02576  * (a1-b1)(a0-b0)
02577  *
02578  * Note that a multiplication of half the digits requires
02579  * 1/4th the number of single precision multiplications so in 
02580  * total after one call 25% of the single precision multiplications 
02581  * are saved.  Note also that the call to mp_mul can end up back 
02582  * in this function if the a0, a1, b0, or b1 are above the threshold.  
02583  * This is known as divide-and-conquer and leads to the famous 
02584  * O(N**lg(3)) or O(N**1.584) work which is asymptotically lower than
02585  * the standard O(N**2) that the baseline/comba methods use.  
02586  * Generally though the overhead of this method doesn't pay off 
02587  * until a certain size (N ~ 80) is reached.
02588  */
02589 int mp_karatsuba_mul (const mp_int * a, const mp_int * b, mp_int * c)
02590 {
02591   mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
02592   int     B, err;
02593 
02594   /* default the return code to an error */
02595   err = MP_MEM;
02596 
02597   /* min # of digits */
02598   B = MIN (a->used, b->used);
02599 
02600   /* now divide in two */
02601   B = B >> 1;
02602 
02603   /* init copy all the temps */
02604   if (mp_init_size (&x0, B) != MP_OKAY)
02605     goto ERR;
02606   if (mp_init_size (&x1, a->used - B) != MP_OKAY)
02607     goto X0;
02608   if (mp_init_size (&y0, B) != MP_OKAY)
02609     goto X1;
02610   if (mp_init_size (&y1, b->used - B) != MP_OKAY)
02611     goto Y0;
02612 
02613   /* init temps */
02614   if (mp_init_size (&t1, B * 2) != MP_OKAY)
02615     goto Y1;
02616   if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
02617     goto T1;
02618   if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
02619     goto X0Y0;
02620 
02621   /* now shift the digits */
02622   x0.used = y0.used = B;
02623   x1.used = a->used - B;
02624   y1.used = b->used - B;
02625 
02626   {
02627     register int x;
02628     register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
02629 
02630     /* we copy the digits directly instead of using higher level functions
02631      * since we also need to shift the digits
02632      */
02633     tmpa = a->dp;
02634     tmpb = b->dp;
02635 
02636     tmpx = x0.dp;
02637     tmpy = y0.dp;
02638     for (x = 0; x < B; x++) {
02639       *tmpx++ = *tmpa++;
02640       *tmpy++ = *tmpb++;
02641     }
02642 
02643     tmpx = x1.dp;
02644     for (x = B; x < a->used; x++) {
02645       *tmpx++ = *tmpa++;
02646     }
02647 
02648     tmpy = y1.dp;
02649     for (x = B; x < b->used; x++) {
02650       *tmpy++ = *tmpb++;
02651     }
02652   }
02653 
02654   /* only need to clamp the lower words since by definition the 
02655    * upper words x1/y1 must have a known number of digits
02656    */
02657   mp_clamp (&x0);
02658   mp_clamp (&y0);
02659 
02660   /* now calc the products x0y0 and x1y1 */
02661   /* after this x0 is no longer required, free temp [x0==t2]! */
02662   if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
02663     goto X1Y1;          /* x0y0 = x0*y0 */
02664   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
02665     goto X1Y1;          /* x1y1 = x1*y1 */
02666 
02667   /* now calc x1-x0 and y1-y0 */
02668   if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
02669     goto X1Y1;          /* t1 = x1 - x0 */
02670   if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
02671     goto X1Y1;          /* t2 = y1 - y0 */
02672   if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
02673     goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
02674 
02675   /* add x0y0 */
02676   if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
02677     goto X1Y1;          /* t2 = x0y0 + x1y1 */
02678   if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
02679     goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
02680 
02681   /* shift by B */
02682   if (mp_lshd (&t1, B) != MP_OKAY)
02683     goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
02684   if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
02685     goto X1Y1;          /* x1y1 = x1y1 << 2*B */
02686 
02687   if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
02688     goto X1Y1;          /* t1 = x0y0 + t1 */
02689   if (mp_add (&t1, &x1y1, c) != MP_OKAY)
02690     goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
02691 
02692   /* Algorithm succeeded set the return code to MP_OKAY */
02693   err = MP_OKAY;
02694 
02695 X1Y1:mp_clear (&x1y1);
02696 X0Y0:mp_clear (&x0y0);
02697 T1:mp_clear (&t1);
02698 Y1:mp_clear (&y1);
02699 Y0:mp_clear (&y0);
02700 X1:mp_clear (&x1);
02701 X0:mp_clear (&x0);
02702 ERR:
02703   return err;
02704 }
02705 
02706 /* Karatsuba squaring, computes b = a*a using three 
02707  * half size squarings
02708  *
02709  * See comments of karatsuba_mul for details.  It 
02710  * is essentially the same algorithm but merely 
02711  * tuned to perform recursive squarings.
02712  */
02713 int mp_karatsuba_sqr (const mp_int * a, mp_int * b)
02714 {
02715   mp_int  x0, x1, t1, t2, x0x0, x1x1;
02716   int     B, err;
02717 
02718   err = MP_MEM;
02719 
02720   /* min # of digits */
02721   B = a->used;
02722 
02723   /* now divide in two */
02724   B = B >> 1;
02725 
02726   /* init copy all the temps */
02727   if (mp_init_size (&x0, B) != MP_OKAY)
02728     goto ERR;
02729   if (mp_init_size (&x1, a->used - B) != MP_OKAY)
02730     goto X0;
02731 
02732   /* init temps */
02733   if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
02734     goto X1;
02735   if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
02736     goto T1;
02737   if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
02738     goto T2;
02739   if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
02740     goto X0X0;
02741 
02742   {
02743     register int x;
02744     register mp_digit *dst, *src;
02745 
02746     src = a->dp;
02747 
02748     /* now shift the digits */
02749     dst = x0.dp;
02750     for (x = 0; x < B; x++) {
02751       *dst++ = *src++;
02752     }
02753 
02754     dst = x1.dp;
02755     for (x = B; x < a->used; x++) {
02756       *dst++ = *src++;
02757     }
02758   }
02759 
02760   x0.used = B;
02761   x1.used = a->used - B;
02762 
02763   mp_clamp (&x0);
02764 
02765   /* now calc the products x0*x0 and x1*x1 */
02766   if (mp_sqr (&x0, &x0x0) != MP_OKAY)
02767     goto X1X1;           /* x0x0 = x0*x0 */
02768   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
02769     goto X1X1;           /* x1x1 = x1*x1 */
02770 
02771   /* now calc (x1-x0)**2 */
02772   if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
02773     goto X1X1;           /* t1 = x1 - x0 */
02774   if (mp_sqr (&t1, &t1) != MP_OKAY)
02775     goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
02776 
02777   /* add x0y0 */
02778   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
02779     goto X1X1;           /* t2 = x0x0 + x1x1 */
02780   if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
02781     goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
02782 
02783   /* shift by B */
02784   if (mp_lshd (&t1, B) != MP_OKAY)
02785     goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
02786   if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
02787     goto X1X1;           /* x1x1 = x1x1 << 2*B */
02788 
02789   if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
02790     goto X1X1;           /* t1 = x0x0 + t1 */
02791   if (mp_add (&t1, &x1x1, b) != MP_OKAY)
02792     goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
02793 
02794   err = MP_OKAY;
02795 
02796 X1X1:mp_clear (&x1x1);
02797 X0X0:mp_clear (&x0x0);
02798 T2:mp_clear (&t2);
02799 T1:mp_clear (&t1);
02800 X1:mp_clear (&x1);
02801 X0:mp_clear (&x0);
02802 ERR:
02803   return err;
02804 }
02805 
02806 /* computes least common multiple as |a*b|/(a, b) */
02807 int mp_lcm (const mp_int * a, const mp_int * b, mp_int * c)
02808 {
02809   int     res;
02810   mp_int  t1, t2;
02811 
02812 
02813   if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
02814     return res;
02815   }
02816 
02817   /* t1 = get the GCD of the two inputs */
02818   if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
02819     goto __T;
02820   }
02821 
02822   /* divide the smallest by the GCD */
02823   if (mp_cmp_mag(a, b) == MP_LT) {
02824      /* store quotient in t2 such that t2 * b is the LCM */
02825      if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
02826         goto __T;
02827      }
02828      res = mp_mul(b, &t2, c);
02829   } else {
02830      /* store quotient in t2 such that t2 * a is the LCM */
02831      if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
02832         goto __T;
02833      }
02834      res = mp_mul(a, &t2, c);
02835   }
02836 
02837   /* fix the sign to positive */
02838   c->sign = MP_ZPOS;
02839 
02840 __T:
02841   mp_clear_multi (&t1, &t2, NULL);
02842   return res;
02843 }
02844 
02845 /* c = a mod b, 0 <= c < b */
02846 int
02847 mp_mod (const mp_int * a, mp_int * b, mp_int * c)
02848 {
02849   mp_int  t;
02850   int     res;
02851 
02852   if ((res = mp_init (&t)) != MP_OKAY) {
02853     return res;
02854   }
02855 
02856   if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
02857     mp_clear (&t);
02858     return res;
02859   }
02860 
02861   if (t.sign != b->sign) {
02862     res = mp_add (b, &t, c);
02863   } else {
02864     res = MP_OKAY;
02865     mp_exch (&t, c);
02866   }
02867 
02868   mp_clear (&t);
02869   return res;
02870 }
02871 
02872 static int
02873 mp_mod_d (const mp_int * a, mp_digit b, mp_digit * c)
02874 {
02875   return mp_div_d(a, b, NULL, c);
02876 }
02877 
02878 /* b = a*2 */
02879 static int mp_mul_2(const mp_int * a, mp_int * b)
02880 {
02881   int     x, res, oldused;
02882 
02883   /* grow to accommodate result */
02884   if (b->alloc < a->used + 1) {
02885     if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
02886       return res;
02887     }
02888   }
02889 
02890   oldused = b->used;
02891   b->used = a->used;
02892 
02893   {
02894     register mp_digit r, rr, *tmpa, *tmpb;
02895 
02896     /* alias for source */
02897     tmpa = a->dp;
02898 
02899     /* alias for dest */
02900     tmpb = b->dp;
02901 
02902     /* carry */
02903     r = 0;
02904     for (x = 0; x < a->used; x++) {
02905 
02906       /* get what will be the *next* carry bit from the
02907        * MSB of the current digit
02908        */
02909       rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
02910 
02911       /* now shift up this digit, add in the carry [from the previous] */
02912       *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
02913 
02914       /* copy the carry that would be from the source
02915        * digit into the next iteration
02916        */
02917       r = rr;
02918     }
02919 
02920     /* new leading digit? */
02921     if (r != 0) {
02922       /* add a MSB which is always 1 at this point */
02923       *tmpb = 1;
02924       ++(b->used);
02925     }
02926 
02927     /* now zero any excess digits on the destination
02928      * that we didn't write to
02929      */
02930     tmpb = b->dp + b->used;
02931     for (x = b->used; x < oldused; x++) {
02932       *tmpb++ = 0;
02933     }
02934   }
02935   b->sign = a->sign;
02936   return MP_OKAY;
02937 }
02938 
02939 /*
02940  * shifts with subtractions when the result is greater than b.
02941  *
02942  * The method is slightly modified to shift B unconditionally up to just under
02943  * the leading bit of b.  This saves a lot of multiple precision shifting.
02944  */
02945 int mp_montgomery_calc_normalization (mp_int * a, const mp_int * b)
02946 {
02947   int     x, bits, res;
02948 
02949   /* how many bits of last digit does b use */
02950   bits = mp_count_bits (b) % DIGIT_BIT;
02951 
02952 
02953   if (b->used > 1) {
02954      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
02955         return res;
02956      }
02957   } else {
02958      mp_set(a, 1);
02959      bits = 1;
02960   }
02961 
02962 
02963   /* now compute C = A * B mod b */
02964   for (x = bits - 1; x < DIGIT_BIT; x++) {
02965     if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
02966       return res;
02967     }
02968     if (mp_cmp_mag (a, b) != MP_LT) {
02969       if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
02970         return res;
02971       }
02972     }
02973   }
02974 
02975   return MP_OKAY;
02976 }
02977 
02978 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
02979 int
02980 mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
02981 {
02982   int     ix, res, digs;
02983   mp_digit mu;
02984 
02985   /* can the fast reduction [comba] method be used?
02986    *
02987    * Note that unlike in mul you're safely allowed *less*
02988    * than the available columns [255 per default] since carries
02989    * are fixed up in the inner loop.
02990    */
02991   digs = n->used * 2 + 1;
02992   if ((digs < MP_WARRAY) &&
02993       n->used <
02994       (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
02995     return fast_mp_montgomery_reduce (x, n, rho);
02996   }
02997 
02998   /* grow the input as required */
02999   if (x->alloc < digs) {
03000     if ((res = mp_grow (x, digs)) != MP_OKAY) {
03001       return res;
03002     }
03003   }
03004   x->used = digs;
03005 
03006   for (ix = 0; ix < n->used; ix++) {
03007     /* mu = ai * rho mod b
03008      *
03009      * The value of rho must be precalculated via
03010      * montgomery_setup() such that
03011      * it equals -1/n0 mod b this allows the
03012      * following inner loop to reduce the
03013      * input one digit at a time
03014      */
03015     mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
03016 
03017     /* a = a + mu * m * b**i */
03018     {
03019       register int iy;
03020       register mp_digit *tmpn, *tmpx, u;
03021       register mp_word r;
03022 
03023       /* alias for digits of the modulus */
03024       tmpn = n->dp;
03025 
03026       /* alias for the digits of x [the input] */
03027       tmpx = x->dp + ix;
03028 
03029       /* set the carry to zero */
03030       u = 0;
03031 
03032       /* Multiply and add in place */
03033       for (iy = 0; iy < n->used; iy++) {
03034         /* compute product and sum */
03035         r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
03036                   ((mp_word) u) + ((mp_word) * tmpx);
03037 
03038         /* get carry */
03039         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
03040 
03041         /* fix digit */
03042         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
03043       }
03044       /* At this point the ix'th digit of x should be zero */
03045 
03046 
03047       /* propagate carries upwards as required*/
03048       while (u) {
03049         *tmpx   += u;
03050         u        = *tmpx >> DIGIT_BIT;
03051         *tmpx++ &= MP_MASK;
03052       }
03053     }
03054   }
03055 
03056   /* at this point the n.used'th least
03057    * significant digits of x are all zero
03058    * which means we can shift x to the
03059    * right by n.used digits and the
03060    * residue is unchanged.
03061    */
03062 
03063   /* x = x/b**n.used */
03064   mp_clamp(x);
03065   mp_rshd (x, n->used);
03066 
03067   /* if x >= n then x = x - n */
03068   if (mp_cmp_mag (x, n) != MP_LT) {
03069     return s_mp_sub (x, n, x);
03070   }
03071 
03072   return MP_OKAY;
03073 }
03074 
03075 /* setups the montgomery reduction stuff */
03076 int
03077 mp_montgomery_setup (const mp_int * n, mp_digit * rho)
03078 {
03079   mp_digit x, b;
03080 
03081 /* fast inversion mod 2**k
03082  *
03083  * Based on the fact that
03084  *
03085  * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
03086  *                    =>  2*X*A - X*X*A*A = 1
03087  *                    =>  2*(1) - (1)     = 1
03088  */
03089   b = n->dp[0];
03090 
03091   if ((b & 1) == 0) {
03092     return MP_VAL;
03093   }
03094 
03095   x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
03096   x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
03097   x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
03098   x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
03099 
03100   /* rho = -1/m mod b */
03101   *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
03102 
03103   return MP_OKAY;
03104 }
03105 
03106 /* high level multiplication (handles sign) */
03107 int mp_mul (const mp_int * a, const mp_int * b, mp_int * c)
03108 {
03109   int     res, neg;
03110   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
03111 
03112   /* use Karatsuba? */
03113   if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
03114     res = mp_karatsuba_mul (a, b, c);
03115   } else 
03116   {
03117     /* can we use the fast multiplier?
03118      *
03119      * The fast multiplier can be used if the output will 
03120      * have less than MP_WARRAY digits and the number of 
03121      * digits won't affect carry propagation
03122      */
03123     int     digs = a->used + b->used + 1;
03124 
03125     if ((digs < MP_WARRAY) &&
03126         MIN(a->used, b->used) <= 
03127         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
03128       res = fast_s_mp_mul_digs (a, b, c, digs);
03129     } else 
03130       res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
03131   }
03132   c->sign = (c->used > 0) ? neg : MP_ZPOS;
03133   return res;
03134 }
03135 
03136 /* d = a * b (mod c) */
03137 int
03138 mp_mulmod (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
03139 {
03140   int     res;
03141   mp_int  t;
03142 
03143   if ((res = mp_init (&t)) != MP_OKAY) {
03144     return res;
03145   }
03146 
03147   if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
03148     mp_clear (&t);
03149     return res;
03150   }
03151   res = mp_mod (&t, c, d);
03152   mp_clear (&t);
03153   return res;
03154 }
03155 
03156 /* table of first PRIME_SIZE primes */
03157 static const mp_digit __prime_tab[] = {
03158   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
03159   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
03160   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
03161   0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
03162   0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
03163   0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
03164   0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
03165   0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
03166 
03167   0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
03168   0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
03169   0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
03170   0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
03171   0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
03172   0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
03173   0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
03174   0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
03175 
03176   0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
03177   0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
03178   0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
03179   0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
03180   0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
03181   0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
03182   0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
03183   0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
03184 
03185   0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
03186   0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
03187   0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
03188   0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
03189   0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
03190   0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
03191   0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
03192   0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
03193 };
03194 
03195 /* determines if an integers is divisible by one 
03196  * of the first PRIME_SIZE primes or not
03197  *
03198  * sets result to 0 if not, 1 if yes
03199  */
03200 static int mp_prime_is_divisible (const mp_int * a, int *result)
03201 {
03202   int     err, ix;
03203   mp_digit res;
03204 
03205   /* default to not */
03206   *result = MP_NO;
03207 
03208   for (ix = 0; ix < PRIME_SIZE; ix++) {
03209     /* what is a mod __prime_tab[ix] */
03210     if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
03211       return err;
03212     }
03213 
03214     /* is the residue zero? */
03215     if (res == 0) {
03216       *result = MP_YES;
03217       return MP_OKAY;
03218     }
03219   }
03220 
03221   return MP_OKAY;
03222 }
03223 
03224 /* Miller-Rabin test of "a" to the base of "b" as described in 
03225  * HAC pp. 139 Algorithm 4.24
03226  *
03227  * Sets result to 0 if definitely composite or 1 if probably prime.
03228  * Randomly the chance of error is no more than 1/4 and often 
03229  * very much lower.
03230  */
03231 static int mp_prime_miller_rabin (mp_int * a, const mp_int * b, int *result)
03232 {
03233   mp_int  n1, y, r;
03234   int     s, j, err;
03235 
03236   /* default */
03237   *result = MP_NO;
03238 
03239   /* ensure b > 1 */
03240   if (mp_cmp_d(b, 1) != MP_GT) {
03241      return MP_VAL;
03242   }     
03243 
03244   /* get n1 = a - 1 */
03245   if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
03246     return err;
03247   }
03248   if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
03249     goto __N1;
03250   }
03251 
03252   /* set 2**s * r = n1 */
03253   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
03254     goto __N1;
03255   }
03256 
03257   /* count the number of least significant bits
03258    * which are zero
03259    */
03260   s = mp_cnt_lsb(&r);
03261 
03262   /* now divide n - 1 by 2**s */
03263   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
03264     goto __R;
03265   }
03266 
03267   /* compute y = b**r mod a */
03268   if ((err = mp_init (&y)) != MP_OKAY) {
03269     goto __R;
03270   }
03271   if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
03272     goto __Y;
03273   }
03274 
03275   /* if y != 1 and y != n1 do */
03276   if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
03277     j = 1;
03278     /* while j <= s-1 and y != n1 */
03279     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
03280       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
03281          goto __Y;
03282       }
03283 
03284       /* if y == 1 then composite */
03285       if (mp_cmp_d (&y, 1) == MP_EQ) {
03286          goto __Y;
03287       }
03288 
03289       ++j;
03290     }
03291 
03292     /* if y != n1 then composite */
03293     if (mp_cmp (&y, &n1) != MP_EQ) {
03294       goto __Y;
03295     }
03296   }
03297 
03298   /* probably prime now */
03299   *result = MP_YES;
03300 __Y:mp_clear (&y);
03301 __R:mp_clear (&r);
03302 __N1:mp_clear (&n1);
03303   return err;
03304 }
03305 
03306 /* performs a variable number of rounds of Miller-Rabin
03307  *
03308  * Probability of error after t rounds is no more than
03309 
03310  *
03311  * Sets result to 1 if probably prime, 0 otherwise
03312  */
03313 static int mp_prime_is_prime (mp_int * a, int t, int *result)
03314 {
03315   mp_int  b;
03316   int     ix, err, res;
03317 
03318   /* default to no */
03319   *result = MP_NO;
03320 
03321   /* valid value of t? */
03322   if (t <= 0 || t > PRIME_SIZE) {
03323     return MP_VAL;
03324   }
03325 
03326   /* is the input equal to one of the primes in the table? */
03327   for (ix = 0; ix < PRIME_SIZE; ix++) {
03328       if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
03329          *result = 1;
03330          return MP_OKAY;
03331       }
03332   }
03333 
03334   /* first perform trial division */
03335   if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
03336     return err;
03337   }
03338 
03339   /* return if it was trivially divisible */
03340   if (res == MP_YES) {
03341     return MP_OKAY;
03342   }
03343 
03344   /* now perform the miller-rabin rounds */
03345   if ((err = mp_init (&b)) != MP_OKAY) {
03346     return err;
03347   }
03348 
03349   for (ix = 0; ix < t; ix++) {
03350     /* set the prime */
03351     mp_set (&b, __prime_tab[ix]);
03352 
03353     if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
03354       goto __B;
03355     }
03356 
03357     if (res == MP_NO) {
03358       goto __B;
03359     }
03360   }
03361 
03362   /* passed the test */
03363   *result = MP_YES;
03364 __B:mp_clear (&b);
03365   return err;
03366 }
03367 
03368 static const struct {
03369    int k, t;
03370 } sizes[] = {
03371 {   128,    28 },
03372 {   256,    16 },
03373 {   384,    10 },
03374 {   512,     7 },
03375 {   640,     6 },
03376 {   768,     5 },
03377 {   896,     4 },
03378 {  1024,     4 }
03379 };
03380 
03381 /* returns # of RM trials required for a given bit size */
03382 int mp_prime_rabin_miller_trials(int size)
03383 {
03384    int x;
03385 
03386    for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
03387        if (sizes[x].k == size) {
03388           return sizes[x].t;
03389        } else if (sizes[x].k > size) {
03390           return (x == 0) ? sizes[0].t : sizes[x - 1].t;
03391        }
03392    }
03393    return sizes[x-1].t + 1;
03394 }
03395 
03396 /* makes a truly random prime of a given size (bits),
03397  *
03398  * Flags are as follows:
03399  * 
03400  *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
03401  *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
03402  *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
03403  *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
03404  *
03405  * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
03406  * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
03407  * so it can be NULL
03408  *
03409  */
03410 
03411 /* This is possibly the mother of all prime generation functions, muahahahahaha! */
03412 int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
03413 {
03414    unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
03415    int res, err, bsize, maskOR_msb_offset;
03416 
03417    /* sanity check the input */
03418    if (size <= 1 || t <= 0) {
03419       return MP_VAL;
03420    }
03421 
03422    /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
03423    if (flags & LTM_PRIME_SAFE) {
03424       flags |= LTM_PRIME_BBS;
03425    }
03426 
03427    /* calc the byte size */
03428    bsize = (size>>3)+((size&7)?1:0);
03429 
03430    /* we need a buffer of bsize bytes */
03431    tmp = HeapAlloc(GetProcessHeap(), 0, bsize);
03432    if (tmp == NULL) {
03433       return MP_MEM;
03434    }
03435 
03436    /* calc the maskAND value for the MSbyte*/
03437    maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7))); 
03438 
03439    /* calc the maskOR_msb */
03440    maskOR_msb        = 0;
03441    maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
03442    if (flags & LTM_PRIME_2MSB_ON) {
03443       maskOR_msb     |= 1 << ((size - 2) & 7);
03444    } else if (flags & LTM_PRIME_2MSB_OFF) {
03445       maskAND        &= ~(1 << ((size - 2) & 7));
03446    }
03447 
03448    /* get the maskOR_lsb */
03449    maskOR_lsb         = 0;
03450    if (flags & LTM_PRIME_BBS) {
03451       maskOR_lsb     |= 3;
03452    }
03453 
03454    do {
03455       /* read the bytes */
03456       if (cb(tmp, bsize, dat) != bsize) {
03457          err = MP_VAL;
03458          goto error;
03459       }
03460  
03461       /* work over the MSbyte */
03462       tmp[0]    &= maskAND;
03463       tmp[0]    |= 1 << ((size - 1) & 7);
03464 
03465       /* mix in the maskORs */
03466       tmp[maskOR_msb_offset]   |= maskOR_msb;
03467       tmp[bsize-1]             |= maskOR_lsb;
03468 
03469       /* read it in */
03470       if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY)     { goto error; }
03471 
03472       /* is it prime? */
03473       if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
03474       if (res == MP_NO) {  
03475          continue;
03476       }
03477 
03478       if (flags & LTM_PRIME_SAFE) {
03479          /* see if (a-1)/2 is prime */
03480          if ((err = mp_sub_d(a, 1, a)) != MP_OKAY)                    { goto error; }
03481          if ((err = mp_div_2(a, a)) != MP_OKAY)                       { goto error; }
03482  
03483          /* is it prime? */
03484          if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)        { goto error; }
03485       }
03486    } while (res == MP_NO);
03487 
03488    if (flags & LTM_PRIME_SAFE) {
03489       /* restore a to the original value */
03490       if ((err = mp_mul_2(a, a)) != MP_OKAY)                          { goto error; }
03491       if ((err = mp_add_d(a, 1, a)) != MP_OKAY)                       { goto error; }
03492    }
03493 
03494    err = MP_OKAY;
03495 error:
03496    HeapFree(GetProcessHeap(), 0, tmp);
03497    return err;
03498 }
03499 
03500 /* reads an unsigned char array, assumes the msb is stored first [big endian] */
03501 int
03502 mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
03503 {
03504   int     res;
03505 
03506   /* make sure there are at least two digits */
03507   if (a->alloc < 2) {
03508      if ((res = mp_grow(a, 2)) != MP_OKAY) {
03509         return res;
03510      }
03511   }
03512 
03513   /* zero the int */
03514   mp_zero (a);
03515 
03516   /* read the bytes in */
03517   while (c-- > 0) {
03518     if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
03519       return res;
03520     }
03521 
03522       a->dp[0] |= *b++;
03523       a->used += 1;
03524   }
03525   mp_clamp (a);
03526   return MP_OKAY;
03527 }
03528 
03529 /* reduces x mod m, assumes 0 < x < m**2, mu is 
03530  * precomputed via mp_reduce_setup.
03531  * From HAC pp.604 Algorithm 14.42
03532  */
03533 int
03534 mp_reduce (mp_int * x, const mp_int * m, const mp_int * mu)
03535 {
03536   mp_int  q;
03537   int     res, um = m->used;
03538 
03539   /* q = x */
03540   if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
03541     return res;
03542   }
03543 
03544   /* q1 = x / b**(k-1)  */
03545   mp_rshd (&q, um - 1);         
03546 
03547   /* according to HAC this optimization is ok */
03548   if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
03549     if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
03550       goto CLEANUP;
03551     }
03552   } else {
03553     if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
03554       goto CLEANUP;
03555     }
03556   }
03557 
03558   /* q3 = q2 / b**(k+1) */
03559   mp_rshd (&q, um + 1);         
03560 
03561   /* x = x mod b**(k+1), quick (no division) */
03562   if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
03563     goto CLEANUP;
03564   }
03565 
03566   /* q = q * m mod b**(k+1), quick (no division) */
03567   if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
03568     goto CLEANUP;
03569   }
03570 
03571   /* x = x - q */
03572   if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
03573     goto CLEANUP;
03574   }
03575 
03576   /* If x < 0, add b**(k+1) to it */
03577   if (mp_cmp_d (x, 0) == MP_LT) {
03578     mp_set (&q, 1);
03579     if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
03580       goto CLEANUP;
03581     if ((res = mp_add (x, &q, x)) != MP_OKAY)
03582       goto CLEANUP;
03583   }
03584 
03585   /* Back off if it's too big */
03586   while (mp_cmp (x, m) != MP_LT) {
03587     if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
03588       goto CLEANUP;
03589     }
03590   }
03591   
03592 CLEANUP:
03593   mp_clear (&q);
03594 
03595   return res;
03596 }
03597 
03598 /* reduces a modulo n where n is of the form 2**p - d */
03599 int
03600 mp_reduce_2k(mp_int *a, const mp_int *n, mp_digit d)
03601 {
03602    mp_int q;
03603    int    p, res;
03604    
03605    if ((res = mp_init(&q)) != MP_OKAY) {
03606       return res;
03607    }
03608    
03609    p = mp_count_bits(n);    
03610 top:
03611    /* q = a/2**p, a = a mod 2**p */
03612    if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
03613       goto ERR;
03614    }
03615    
03616    if (d != 1) {
03617       /* q = q * d */
03618       if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) { 
03619          goto ERR;
03620       }
03621    }
03622    
03623    /* a = a + q */
03624    if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
03625       goto ERR;
03626    }
03627    
03628    if (mp_cmp_mag(a, n) != MP_LT) {
03629       s_mp_sub(a, n, a);
03630       goto top;
03631    }
03632    
03633 ERR:
03634    mp_clear(&q);
03635    return res;
03636 }
03637 
03638 /* determines the setup value */
03639 static int
03640 mp_reduce_2k_setup(const mp_int *a, mp_digit *d)
03641 {
03642    int res, p;
03643    mp_int tmp;
03644    
03645    if ((res = mp_init(&tmp)) != MP_OKAY) {
03646       return res;
03647    }
03648    
03649    p = mp_count_bits(a);
03650    if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
03651       mp_clear(&tmp);
03652       return res;
03653    }
03654    
03655    if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
03656       mp_clear(&tmp);
03657       return res;
03658    }
03659    
03660    *d = tmp.dp[0];
03661    mp_clear(&tmp);
03662    return MP_OKAY;
03663 }
03664 
03665 /* pre-calculate the value required for Barrett reduction
03666  * For a given modulus "b" it calculates the value required in "a"
03667  */
03668 int mp_reduce_setup (mp_int * a, const mp_int * b)
03669 {
03670   int     res;
03671 
03672   if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
03673     return res;
03674   }
03675   return mp_div (a, b, a, NULL);
03676 }
03677 
03678 /* set to a digit */
03679 void mp_set (mp_int * a, mp_digit b)
03680 {
03681   mp_zero (a);
03682   a->dp[0] = b & MP_MASK;
03683   a->used  = (a->dp[0] != 0) ? 1 : 0;
03684 }
03685 
03686 /* set a 32-bit const */
03687 int mp_set_int (mp_int * a, unsigned long b)
03688 {
03689   int     x, res;
03690 
03691   mp_zero (a);
03692   
03693   /* set four bits at a time */
03694   for (x = 0; x < 8; x++) {
03695     /* shift the number up four bits */
03696     if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
03697       return res;
03698     }
03699 
03700     /* OR in the top four bits of the source */
03701     a->dp[0] |= (b >> 28) & 15;
03702 
03703     /* shift the source up to the next four bits */
03704     b <<= 4;
03705 
03706     /* ensure that digits are not clamped off */
03707     a->used += 1;
03708   }
03709   mp_clamp (a);
03710   return MP_OKAY;
03711 }
03712 
03713 /* shrink a bignum */
03714 int mp_shrink (mp_int * a)
03715 {
03716   mp_digit *tmp;
03717   if (a->alloc != a->used && a->used > 0) {
03718     if ((tmp = HeapReAlloc(GetProcessHeap(), 0, a->dp, sizeof (mp_digit) * a->used)) == NULL) {
03719       return MP_MEM;
03720     }
03721     a->dp    = tmp;
03722     a->alloc = a->used;
03723   }
03724   return MP_OKAY;
03725 }
03726 
03727 /* computes b = a*a */
03728 int
03729 mp_sqr (const mp_int * a, mp_int * b)
03730 {
03731   int     res;
03732 
03733 if (a->used >= KARATSUBA_SQR_CUTOFF) {
03734     res = mp_karatsuba_sqr (a, b);
03735   } else 
03736   {
03737     /* can we use the fast comba multiplier? */
03738     if ((a->used * 2 + 1) < MP_WARRAY && 
03739          a->used < 
03740          (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
03741       res = fast_s_mp_sqr (a, b);
03742     } else
03743       res = s_mp_sqr (a, b);
03744   }
03745   b->sign = MP_ZPOS;
03746   return res;
03747 }
03748 
03749 /* c = a * a (mod b) */
03750 int
03751 mp_sqrmod (const mp_int * a, mp_int * b, mp_int * c)
03752 {
03753   int     res;
03754   mp_int  t;
03755 
03756   if ((res = mp_init (&t)) != MP_OKAY) {
03757     return res;
03758   }
03759 
03760   if ((res = mp_sqr (a, &t)) != MP_OKAY) {
03761     mp_clear (&t);
03762     return res;
03763   }
03764   res = mp_mod (&t, b, c);
03765   mp_clear (&t);
03766   return res;
03767 }
03768 
03769 /* high level subtraction (handles signs) */
03770 int
03771 mp_sub (mp_int * a, mp_int * b, mp_int * c)
03772 {
03773   int     sa, sb, res;
03774 
03775   sa = a->sign;
03776   sb = b->sign;
03777 
03778   if (sa != sb) {
03779     /* subtract a negative from a positive, OR */
03780     /* subtract a positive from a negative. */
03781     /* In either case, ADD their magnitudes, */
03782     /* and use the sign of the first number. */
03783     c->sign = sa;
03784     res = s_mp_add (a, b, c);
03785   } else {
03786     /* subtract a positive from a positive, OR */
03787     /* subtract a negative from a negative. */
03788     /* First, take the difference between their */
03789     /* magnitudes, then... */
03790     if (mp_cmp_mag (a, b) != MP_LT) {
03791       /* Copy the sign from the first */
03792       c->sign = sa;
03793       /* The first has a larger or equal magnitude */
03794       res = s_mp_sub (a, b, c);
03795     } else {
03796       /* The result has the *opposite* sign from */
03797       /* the first number. */
03798       c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
03799       /* The second has a larger magnitude */
03800       res = s_mp_sub (b, a, c);
03801     }
03802   }
03803   return res;
03804 }
03805 
03806 /* single digit subtraction */
03807 int
03808 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
03809 {
03810   mp_digit *tmpa, *tmpc, mu;
03811   int       res, ix, oldused;
03812 
03813   /* grow c as required */
03814   if (c->alloc < a->used + 1) {
03815      if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
03816         return res;
03817      }
03818   }
03819 
03820   /* if a is negative just do an unsigned
03821    * addition [with fudged signs]
03822    */
03823   if (a->sign == MP_NEG) {
03824      a->sign = MP_ZPOS;
03825      res     = mp_add_d(a, b, c);
03826      a->sign = c->sign = MP_NEG;
03827      return res;
03828   }
03829 
03830   /* setup regs */
03831   oldused = c->used;
03832   tmpa    = a->dp;
03833   tmpc    = c->dp;
03834 
03835   /* if a <= b simply fix the single digit */
03836   if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
03837      if (a->used == 1) {
03838         *tmpc++ = b - *tmpa;
03839      } else {
03840         *tmpc++ = b;
03841      }
03842      ix      = 1;
03843 
03844      /* negative/1digit */
03845      c->sign = MP_NEG;
03846      c->used = 1;
03847   } else {
03848      /* positive/size */
03849      c->sign = MP_ZPOS;
03850      c->used = a->used;
03851 
03852      /* subtract first digit */
03853      *tmpc    = *tmpa++ - b;
03854      mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
03855      *tmpc++ &= MP_MASK;
03856 
03857      /* handle rest of the digits */
03858      for (ix = 1; ix < a->used; ix++) {
03859         *tmpc    = *tmpa++ - mu;
03860         mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
03861         *tmpc++ &= MP_MASK;
03862      }
03863   }
03864 
03865   /* zero excess digits */
03866   while (ix++ < oldused) {
03867      *tmpc++ = 0;
03868   }
03869   mp_clamp(c);
03870   return MP_OKAY;
03871 }
03872 
03873 /* store in unsigned [big endian] format */
03874 int
03875 mp_to_unsigned_bin (const mp_int * a, unsigned char *b)
03876 {
03877   int     x, res;
03878   mp_int  t;
03879 
03880   if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
03881     return res;
03882   }
03883 
03884   x = 0;
03885   while (mp_iszero (&t) == 0) {
03886     b[x++] = (unsigned char) (t.dp[0] & 255);
03887     if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
03888       mp_clear (&t);
03889       return res;
03890     }
03891   }
03892   bn_reverse (b, x);
03893   mp_clear (&t);
03894   return MP_OKAY;
03895 }
03896 
03897 /* get the size for an unsigned equivalent */
03898 int
03899 mp_unsigned_bin_size (const mp_int * a)
03900 {
03901   int     size = mp_count_bits (a);
03902   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
03903 }
03904 
03905 /* reverse an array, used for radix code */
03906 static void
03907 bn_reverse (unsigned char *s, int len)
03908 {
03909   int     ix, iy;
03910   unsigned char t;
03911 
03912   ix = 0;
03913   iy = len - 1;
03914   while (ix < iy) {
03915     t     = s[ix];
03916     s[ix] = s[iy];
03917     s[iy] = t;
03918     ++ix;
03919     --iy;
03920   }
03921 }
03922 
03923 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
03924 static int
03925 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
03926 {
03927   mp_int *x;
03928   int     olduse, res, min, max;
03929 
03930   /* find sizes, we let |a| <= |b| which means we have to sort
03931    * them.  "x" will point to the input with the most digits
03932    */
03933   if (a->used > b->used) {
03934     min = b->used;
03935     max = a->used;
03936     x = a;
03937   } else {
03938     min = a->used;
03939     max = b->used;
03940     x = b;
03941   }
03942 
03943   /* init result */
03944   if (c->alloc < max + 1) {
03945     if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
03946       return res;
03947     }
03948   }
03949 
03950   /* get old used digit count and set new one */
03951   olduse = c->used;
03952   c->used = max + 1;
03953 
03954   {
03955     register mp_digit u, *tmpa, *tmpb, *tmpc;
03956     register int i;
03957 
03958     /* alias for digit pointers */
03959 
03960     /* first input */
03961     tmpa = a->dp;
03962 
03963     /* second input */
03964     tmpb = b->dp;
03965 
03966     /* destination */
03967     tmpc = c->dp;
03968 
03969     /* zero the carry */
03970     u = 0;
03971     for (i = 0; i < min; i++) {
03972       /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
03973       *tmpc = *tmpa++ + *tmpb++ + u;
03974 
03975       /* U = carry bit of T[i] */
03976       u = *tmpc >> ((mp_digit)DIGIT_BIT);
03977 
03978       /* take away carry bit from T[i] */
03979       *tmpc++ &= MP_MASK;
03980     }
03981 
03982     /* now copy higher words if any, that is in A+B 
03983      * if A or B has more digits add those in 
03984      */
03985     if (min != max) {
03986       for (; i < max; i++) {
03987         /* T[i] = X[i] + U */
03988         *tmpc = x->dp[i] + u;
03989 
03990         /* U = carry bit of T[i] */
03991         u = *tmpc >> ((mp_digit)DIGIT_BIT);
03992 
03993         /* take away carry bit from T[i] */
03994         *tmpc++ &= MP_MASK;
03995       }
03996     }
03997 
03998     /* add carry */
03999     *tmpc++ = u;
04000 
04001     /* clear digits above oldused */
04002     for (i = c->used; i < olduse; i++) {
04003       *tmpc++ = 0;
04004     }
04005   }
04006 
04007   mp_clamp (c);
04008   return MP_OKAY;
04009 }
04010 
04011 static int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
04012 {
04013   mp_int  M[256], res, mu;
04014   mp_digit buf;
04015   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
04016 
04017   /* find window size */
04018   x = mp_count_bits (X);
04019   if (x <= 7) {
04020     winsize = 2;
04021   } else if (x <= 36) {
04022     winsize = 3;
04023   } else if (x <= 140) {
04024     winsize = 4;
04025   } else if (x <= 450) {
04026     winsize = 5;
04027   } else if (x <= 1303) {
04028     winsize = 6;
04029   } else if (x <= 3529) {
04030     winsize = 7;
04031   } else {
04032     winsize = 8;
04033   }
04034 
04035   /* init M array */
04036   /* init first cell */
04037   if ((err = mp_init(&M[1])) != MP_OKAY) {
04038      return err; 
04039   }
04040 
04041   /* now init the second half of the array */
04042   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
04043     if ((err = mp_init(&M[x])) != MP_OKAY) {
04044       for (y = 1<<(winsize-1); y < x; y++) {
04045         mp_clear (&M[y]);
04046       }
04047       mp_clear(&M[1]);
04048       return err;
04049     }
04050   }
04051 
04052   /* create mu, used for Barrett reduction */
04053   if ((err = mp_init (&mu)) != MP_OKAY) {
04054     goto __M;
04055   }
04056   if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
04057     goto __MU;
04058   }
04059 
04060   /* create M table
04061    *
04062    * The M table contains powers of the base, 
04063    * e.g. M[x] = G**x mod P
04064    *
04065    * The first half of the table is not 
04066    * computed though accept for M[0] and M[1]
04067    */
04068   if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
04069     goto __MU;
04070   }
04071 
04072   /* compute the value at M[1<<(winsize-1)] by squaring 
04073    * M[1] (winsize-1) times 
04074    */
04075   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
04076     goto __MU;
04077   }
04078 
04079   for (x = 0; x < (winsize - 1); x++) {
04080     if ((err = mp_sqr (&M[1 << (winsize - 1)], 
04081                        &M[1 << (winsize - 1)])) != MP_OKAY) {
04082       goto __MU;
04083     }
04084     if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
04085       goto __MU;
04086     }
04087   }
04088 
04089   /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
04090    * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
04091    */
04092   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
04093     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
04094       goto __MU;
04095     }
04096     if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
04097       goto __MU;
04098     }
04099   }
04100 
04101   /* setup result */
04102   if ((err = mp_init (&res)) != MP_OKAY) {
04103     goto __MU;
04104   }
04105   mp_set (&res, 1);
04106 
04107   /* set initial mode and bit cnt */
04108   mode   = 0;
04109   bitcnt = 1;
04110   buf    = 0;
04111   digidx = X->used - 1;
04112   bitcpy = 0;
04113   bitbuf = 0;
04114 
04115   for (;;) {
04116     /* grab next digit as required */
04117     if (--bitcnt == 0) {
04118       /* if digidx == -1 we are out of digits */
04119       if (digidx == -1) {
04120         break;
04121       }
04122       /* read next digit and reset the bitcnt */
04123       buf    = X->dp[digidx--];
04124       bitcnt = DIGIT_BIT;
04125     }
04126 
04127     /* grab the next msb from the exponent */
04128     y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
04129     buf <<= (mp_digit)1;
04130 
04131     /* if the bit is zero and mode == 0 then we ignore it
04132      * These represent the leading zero bits before the first 1 bit
04133      * in the exponent.  Technically this opt is not required but it
04134      * does lower the # of trivial squaring/reductions used
04135      */
04136     if (mode == 0 && y == 0) {
04137       continue;
04138     }
04139 
04140     /* if the bit is zero and mode == 1 then we square */
04141     if (mode == 1 && y == 0) {
04142       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
04143         goto __RES;
04144       }
04145       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
04146         goto __RES;
04147       }
04148       continue;
04149     }
04150 
04151     /* else we add it to the window */
04152     bitbuf |= (y << (winsize - ++bitcpy));
04153     mode    = 2;
04154 
04155     if (bitcpy == winsize) {
04156       /* ok window is filled so square as required and multiply  */
04157       /* square first */
04158       for (x = 0; x < winsize; x++) {
04159         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
04160           goto __RES;
04161         }
04162         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
04163           goto __RES;
04164         }
04165       }
04166 
04167       /* then multiply */
04168       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
04169         goto __RES;
04170       }
04171       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
04172         goto __RES;
04173       }
04174 
04175       /* empty window and reset */
04176       bitcpy = 0;
04177       bitbuf = 0;
04178       mode   = 1;
04179     }
04180   }
04181 
04182   /* if bits remain then square/multiply */
04183   if (mode == 2 && bitcpy > 0) {
04184     /* square then multiply if the bit is set */
04185     for (x = 0; x < bitcpy; x++) {
04186       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
04187         goto __RES;
04188       }
04189       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
04190         goto __RES;
04191       }
04192 
04193       bitbuf <<= 1;
04194       if ((bitbuf & (1 << winsize)) != 0) {
04195         /* then multiply */
04196         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
04197           goto __RES;
04198         }
04199         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
04200           goto __RES;
04201         }
04202       }
04203     }
04204   }
04205 
04206   mp_exch (&res, Y);
04207   err = MP_OKAY;
04208 __RES:mp_clear (&res);
04209 __MU:mp_clear (&mu);
04210 __M:
04211   mp_clear(&M[1]);
04212   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
04213     mp_clear (&M[x]);
04214   }
04215   return err;
04216 }
04217 
04218 /* multiplies |a| * |b| and only computes up to digs digits of result
04219  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
04220  * many digits of output are created.
04221  */
04222 static int
04223 s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
04224 {
04225   mp_int  t;
04226   int     res, pa, pb, ix, iy;
04227   mp_digit u;
04228   mp_word r;
04229   mp_digit tmpx, *tmpt, *tmpy;
04230 
04231   /* can we use the fast multiplier? */
04232   if (((digs) < MP_WARRAY) &&
04233       MIN (a->used, b->used) < 
04234           (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
04235     return fast_s_mp_mul_digs (a, b, c, digs);
04236   }
04237 
04238   if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
04239     return res;
04240   }
04241   t.used = digs;
04242 
04243   /* compute the digits of the product directly */
04244   pa = a->used;
04245   for (ix = 0; ix < pa; ix++) {
04246     /* set the carry to zero */
04247     u = 0;
04248 
04249     /* limit ourselves to making digs digits of output */
04250     pb = MIN (b->used, digs - ix);
04251 
04252     /* setup some aliases */
04253     /* copy of the digit from a used within the nested loop */
04254     tmpx = a->dp[ix];
04255     
04256     /* an alias for the destination shifted ix places */
04257     tmpt = t.dp + ix;
04258     
04259     /* an alias for the digits of b */
04260     tmpy = b->dp;
04261 
04262     /* compute the columns of the output and propagate the carry */
04263     for (iy = 0; iy < pb; iy++) {
04264       /* compute the column as a mp_word */
04265       r       = ((mp_word)*tmpt) +
04266                 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
04267                 ((mp_word) u);
04268 
04269       /* the new column is the lower part of the result */
04270       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
04271 
04272       /* get the carry word from the result */
04273       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
04274     }
04275     /* set carry if it is placed below digs */
04276     if (ix + iy < digs) {
04277       *tmpt = u;
04278     }
04279   }
04280 
04281   mp_clamp (&t);
04282   mp_exch (&t, c);
04283 
04284   mp_clear (&t);
04285   return MP_OKAY;
04286 }
04287 
04288 /* multiplies |a| * |b| and does not compute the lower digs digits
04289  * [meant to get the higher part of the product]
04290  */
04291 static int
04292 s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
04293 {
04294   mp_int  t;
04295   int     res, pa, pb, ix, iy;
04296   mp_digit u;
04297   mp_word r;
04298   mp_digit tmpx, *tmpt, *tmpy;
04299 
04300   /* can we use the fast multiplier? */
04301   if (((a->used + b->used + 1) < MP_WARRAY)
04302       && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
04303     return fast_s_mp_mul_high_digs (a, b, c, digs);
04304   }
04305 
04306   if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
04307     return res;
04308   }
04309   t.used = a->used + b->used + 1;
04310 
04311   pa = a->used;
04312   pb = b->used;
04313   for (ix = 0; ix < pa; ix++) {
04314     /* clear the carry */
04315     u = 0;
04316 
04317     /* left hand side of A[ix] * B[iy] */
04318     tmpx = a->dp[ix];
04319 
04320     /* alias to the address of where the digits will be stored */
04321     tmpt = &(t.dp[digs]);
04322 
04323     /* alias for where to read the right hand side from */
04324     tmpy = b->dp + (digs - ix);
04325 
04326     for (iy = digs - ix; iy < pb; iy++) {
04327       /* calculate the double precision result */
04328       r       = ((mp_word)*tmpt) +
04329                 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
04330                 ((mp_word) u);
04331 
04332       /* get the lower part */
04333       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
04334 
04335       /* carry the carry */
04336       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
04337     }
04338     *tmpt = u;
04339   }
04340   mp_clamp (&t);
04341   mp_exch (&t, c);
04342   mp_clear (&t);
04343   return MP_OKAY;
04344 }
04345 
04346 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
04347 static int
04348 s_mp_sqr (const mp_int * a, mp_int * b)
04349 {
04350   mp_int  t;
04351   int     res, ix, iy, pa;
04352   mp_word r;
04353   mp_digit u, tmpx, *tmpt;
04354 
04355   pa = a->used;
04356   if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
04357     return res;
04358   }
04359 
04360   /* default used is maximum possible size */
04361   t.used = 2*pa + 1;
04362 
04363   for (ix = 0; ix < pa; ix++) {
04364     /* first calculate the digit at 2*ix */
04365     /* calculate double precision result */
04366     r = ((mp_word) t.dp[2*ix]) +
04367         ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
04368 
04369     /* store lower part in result */
04370     t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
04371 
04372     /* get the carry */
04373     u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
04374 
04375     /* left hand side of A[ix] * A[iy] */
04376     tmpx        = a->dp[ix];
04377 
04378     /* alias for where to store the results */
04379     tmpt        = t.dp + (2*ix + 1);
04380     
04381     for (iy = ix + 1; iy < pa; iy++) {
04382       /* first calculate the product */
04383       r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
04384 
04385       /* now calculate the double precision result, note we use
04386        * addition instead of *2 since it's easier to optimize
04387        */
04388       r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
04389 
04390       /* store lower part */
04391       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
04392 
04393       /* get carry */
04394       u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
04395     }
04396     /* propagate upwards */
04397     while (u != 0) {
04398       r       = ((mp_word) *tmpt) + ((mp_word) u);
04399       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
04400       u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
04401     }
04402   }
04403 
04404   mp_clamp (&t);
04405   mp_exch (&t, b);
04406   mp_clear (&t);
04407   return MP_OKAY;
04408 }
04409 
04410 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
04411 int
04412 s_mp_sub (const mp_int * a, const mp_int * b, mp_int * c)
04413 {
04414   int     olduse, res, min, max;
04415 
04416   /* find sizes */
04417   min = b->used;
04418   max = a->used;
04419 
04420   /* init result */
04421   if (c->alloc < max) {
04422     if ((res = mp_grow (c, max)) != MP_OKAY) {
04423       return res;
04424     }
04425   }
04426   olduse = c->used;
04427   c->used = max;
04428 
04429   {
04430     register mp_digit u, *tmpa, *tmpb, *tmpc;
04431     register int i;
04432 
04433     /* alias for digit pointers */
04434     tmpa = a->dp;
04435     tmpb = b->dp;
04436     tmpc = c->dp;
04437 
04438     /* set carry to zero */
04439     u = 0;
04440     for (i = 0; i < min; i++) {
04441       /* T[i] = A[i] - B[i] - U */
04442       *tmpc = *tmpa++ - *tmpb++ - u;
04443 
04444       /* U = carry bit of T[i]
04445        * Note this saves performing an AND operation since
04446        * if a carry does occur it will propagate all the way to the
04447        * MSB.  As a result a single shift is enough to get the carry
04448        */
04449       u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
04450 
04451       /* Clear carry from T[i] */
04452       *tmpc++ &= MP_MASK;
04453     }
04454 
04455     /* now copy higher words if any, e.g. if A has more digits than B  */
04456     for (; i < max; i++) {
04457       /* T[i] = A[i] - U */
04458       *tmpc = *tmpa++ - u;
04459 
04460       /* U = carry bit of T[i] */
04461       u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
04462 
04463       /* Clear carry from T[i] */
04464       *tmpc++ &= MP_MASK;
04465     }
04466 
04467     /* clear digits above used (since we may not have grown result above) */
04468     for (i = c->used; i < olduse; i++) {
04469       *tmpc++ = 0;
04470     }
04471   }
04472 
04473   mp_clamp (c);
04474   return MP_OKAY;
04475 }

Generated on Sun May 27 2012 04:26:08 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.