xenocara/app/xedit/lisp/mp/mp.c

/*
 * Copyright (c) 2002 by The XFree86 Project, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *  
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Except as contained in this notice, the name of the XFree86 Project shall
 * not be used in advertising or otherwise to promote the sale, use or other
 * dealings in this Software without prior written authorization from the
 * XFree86 Project.
 *
 * Author: Paulo César Pereira de Andrade
 */

/* $XFree86: xc/programs/xedit/lisp/mp/mp.c,v 1.2 2002/11/08 08:01:00 paulo Exp $ */

#include "mp.h"

/*
 * TODO:
 *	o Optimize squaring
 *	o Write better division code and move from mpi.c to here
 *	o Make multiplication code don't required memory to be zeroed
 *		+ The first step is easy, just multiply the low word,
 *		then the high word, that may overlap with the result
 *		of the first multiply (in case of carry), and then
 *		just make sure carry is properly propagated in the
 *		subsequent multiplications.
 *		+ Some code needs also to be rewritten because some
 *		intermediate addition code in mp_mul, mp_karatsuba_mul,
 *		and mp_toom_mul is assuming the memory is zeroed.
 */

/*
 * Prototypes
 */
	/* out of memory handler */
static void mp_outmem(void);

	/* memory allocation fallback functions */
static void *_mp_malloc(size_t);
static void *_mp_calloc(size_t, size_t);
static void *_mp_realloc(void*, size_t);
static void _mp_free(void*);

/*
 * Initialization
 */
static mp_malloc_fun __mp_malloc = _mp_malloc;
static mp_calloc_fun __mp_calloc = _mp_calloc;
static mp_realloc_fun __mp_realloc = _mp_realloc;
static mp_free_fun __mp_free = _mp_free;

/*
 * Implementation
 */
static void
mp_outmem(void)
{
    fprintf(stderr, "out of memory in MP library.\n");
    exit(1);
}

static void *
_mp_malloc(size_t size)
{
    return (malloc(size));
}

void *
mp_malloc(size_t size)
{
    void *pointer = (*__mp_malloc)(size);

    if (pointer == NULL)
	mp_outmem();

    return (pointer);
}

mp_malloc_fun
mp_set_malloc(mp_malloc_fun fun)
{
    mp_malloc_fun old = __mp_malloc;

    __mp_malloc = fun;

    return (old);
}

static void *
_mp_calloc(size_t nmemb, size_t size)
{
    return (calloc(nmemb, size));
}

void *
mp_calloc(size_t nmemb, size_t size)
{
    void *pointer = (*__mp_calloc)(nmemb, size);

    if (pointer == NULL)
	mp_outmem();

    return (pointer);
}

mp_calloc_fun
mp_set_calloc(mp_calloc_fun fun)
{
    mp_calloc_fun old = __mp_calloc;

    __mp_calloc = fun;

    return (old);
}

static void *
_mp_realloc(void *old, size_t size)
{
    return (realloc(old, size));
}

void *
mp_realloc(void *old, size_t size)
{
    void *pointer = (*__mp_realloc)(old, size);

    if (pointer == NULL)
	mp_outmem();

    return (pointer);
}

mp_realloc_fun
mp_set_realloc(mp_realloc_fun fun)
{
    mp_realloc_fun old = __mp_realloc;

    __mp_realloc = fun;

    return (old);
}

static void
_mp_free(void *pointer)
{
    free(pointer);
}

void
mp_free(void *pointer)
{
    (*__mp_free)(pointer);
}

mp_free_fun
mp_set_free(mp_free_fun fun)
{
    mp_free_fun old = __mp_free;

    __mp_free = fun;

    return (old);
}

long
mp_add(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
{
    BNI value;			/* intermediate result */
    BNS carry;			/* carry flag */
    long size;			/* result size */

    if (len1 < len2)
	MP_SWAP(op1, op2, len1, len2);

    /* unroll start of loop */
    value = op1[0] + op2[0];
    rop[0] = value;
    carry = value >> BNSBITS;

    /* add op1 and op2 */
    for (size = 1; size < len2; size++) {
	value = op1[size] + op2[size] + carry;
	rop[size] = value;
	carry = value >> BNSBITS;
    }
    if (rop != op1) {
	for (; size < len1; size++) {
	    value = op1[size] + carry;
	    rop[size] = value;
	    carry = value >> BNSBITS;
	}
    }
    else {
	/* if rop == op1, than just adjust carry */
	for (; carry && size < len1; size++) {
	    value = op1[size] + carry;
	    rop[size] = value;
	    carry = value >> BNSBITS;
	}
	size = len1;
    }
    if (carry)
	rop[size++] = carry;

    return (size);
}

long
mp_sub(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
{
    long svalue;		/* intermediate result */
    BNS carry;			/* carry flag */
    long size;			/* result size */

    /* special case */
    if (op1 == op2) {
	rop[0] = 0;

	return (1);
    }

    /* unroll start of loop */
    svalue = op1[0] - op2[0];
    rop[0] = svalue;
    carry = svalue < 0;

    /* subtracts op2 from op1 */
    for (size = 1; size < len2; size++) {
	svalue = (long)(op1[size]) - op2[size] - carry;
	rop[size] = svalue;
	carry = svalue < 0;
    }
    if (rop != op1) {
	for (; size < len1; size++) {
	    svalue = op1[size] - carry;
	    rop[size] = svalue;
	    carry = svalue < 0;
	}
    }
    else {
	/* if rop == op1, than just adjust carry */
	for (; carry && size < len1; size++) {
	    svalue = op1[size] - carry;
	    rop[size] = svalue;
	    carry = svalue < 0;
	}
	size = len1;
    }

    /* calculate result size */
    while (size > 1 && rop[size - 1] == 0)
	--size;

    return (size);
}

long
mp_lshift(BNS *rop, BNS *op, BNI len, long shift)
{
    long i, size;
    BNI words, bits;		/* how many word and bit shifts */

    words = shift / BNSBITS;
    bits = shift % BNSBITS;
    size = len + words;

    if (bits) {
	BNS hi, lo;
	BNI carry;
	int adj;

	for (i = 1, carry = CARRY >> 1; carry; i++, carry >>= 1)
	    if (op[len - 1] & carry)
		break;
	adj = (bits + (BNSBITS - i)) / BNSBITS;
	size += adj;

	lo = hi = op[0];
	rop[words] = lo << bits;
	for (i = 1; i < len; i++) {
	    hi = op[i];
	    rop[words + i] = hi << bits | (lo >> (BNSBITS - bits));
	    lo = hi;
	}
	if (adj)
	    rop[size - 1] = hi >> (BNSBITS - bits);
    }
    else
	memmove(rop + size - len, op, sizeof(BNS) * len);

    if (words)
	memset(rop, '\0', sizeof(BNS) * words);

    return (size);
}

long
mp_rshift(BNS *rop, BNS *op, BNI len, long shift)
{
    int adj = 0;
    long i, size;
    BNI words, bits;		/* how many word and bit shifts */

    words = shift / BNSBITS;
    bits = shift % BNSBITS;
    size = len - words;

    if (bits) {
	BNS hi, lo;
	BNI carry;

	for (i = 0, carry = CARRY >> 1; carry; i++, carry >>= 1)
	    if (op[len - 1] & carry)
		break;
	adj = (bits + i) / BNSBITS;
	if (size - adj == 0) {
	    rop[0] = 0;

	    return (1);
	}

	hi = lo = op[words + size - 1];
	rop[size - 1] = hi >> bits;
	for (i = size - 2; i >= 0; i--) {
	    lo = op[words + i];
	    rop[i] = (lo >> bits) | (hi << (BNSBITS - bits));
	    hi = lo;
	}
	if (adj)
	    rop[0] |= lo << (BNSBITS - bits);
    }
    else
	memmove(rop, op + len - size, size * sizeof(BNS));

    return (size - adj);
}

	/* rop must be a pointer to len1 + len2 elements
	 * rop cannot be either op1 or op2
	 * rop must be all zeros */
long
mp_base_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
{
    long i, j;			/* counters */
    BNI value;			/* intermediate result */
    BNS carry;			/* carry value */
    long size = len1 + len2;

    /* simple optimization: first pass does not need to deference rop[i+j] */
    if (op1[0]) {
	value = (BNI)(op1[0]) * op2[0];
	rop[0] = value;
	carry = (BNS)(value >> BNSBITS);
	for (j = 1; j < len2; j++) {
	    value = (BNI)(op1[0]) * op2[j] + carry;
	    rop[j] = value;
	    carry = (BNS)(value >> BNSBITS);
	}
	rop[j] = carry;
    }

    /* do the multiplication */
    for (i = 1; i < len1; i++) {
	if (op1[i]) {
	    /* unrool loop initialization */
	    value = (BNI)(op1[i]) * op2[0] + rop[i];
	    rop[i] = value;
	    carry = (BNS)(value >> BNSBITS);
	    /* multiply */
	    for (j = 1; j < len2; j++) {
		value = (BNI)(op1[i]) * op2[j] + rop[i + j] + carry;
		rop[i + j] = value;
		carry = (BNS)(value >> BNSBITS);
	    }
	    rop[i + j] = carry;
	}
    }

    if (size > 1 && rop[size - 1] == 0)
	--size;

    return (size);
}

	/* Karatsuba method
	 * t + ((a0 + a1) (b0 + b1) - t - u) x + ux²
	 * where t = a0b0 and u = a1b1
	 *
	 * Karatsuba method reduces the number of multiplications. Example:
	 *	Square a 40 length number
	 *	instead of a plain 40*40 = 1600 multiplies/adds, it does:
	 *	20*20+20*20+20*20 = 1200
	 *	but since it is recursive, every 20*20=400 is reduced to
	 *	10*10+10*10+10*10=300
	 *	and so on.
	 * The multiplication by x and x² is a just a shift, as it is a
	 * power of two, and is implemented below by just writting at the
	 * correct offset */
long
mp_karatsuba_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
{
    BNI x;				/* shift count */
    BNI la0, la1, lb0, lb1;		/* length of a0, a1, b0, and b1 */
    BNS *t;				/* temporary memory for t product */
    BNS *u;				/* temporary memory for u product */
    BNS *r;				/* pointer to rop */
    long xlen, tlen, ulen;

    /* calculate value of x, that is 2^(BNSBITS*x) */
    if (len1 >= len2)
	x = (len1 + 1) >> 1;
    else
	x = (len2 + 1) >> 1;

    /* calculate length of operands */
    la0 = x;
    la1 = len1 - x;
    lb0 = x;
    lb1 = len2 - x;

    /* allocate buffer for t and (a0 + a1) */
    tlen = la0 + lb0;
    t = mp_malloc(sizeof(BNS) * tlen);

    /* allocate buffer for u and (b0 + b1) */
    if (la1 + lb1 < lb0 + lb1 + 1)
	ulen = lb0 + lb1 + 1;
    else
	ulen = la1 + lb1;
    u = mp_malloc(sizeof(BNS) * ulen);

    /* calculate a0 + a1, store result in t */
    tlen = mp_add(t, op1, op1 + x, la0, la1);

    /* calculate b0 + b1, store result in u */
    ulen = mp_add(u, op2, op2 + x, lb0, lb1);

    /* store (a0 + a1) * (b0 + b1) in rop */

    r = rop + x;		/* multiplied by 2^(BNSBITS*x) */
    xlen = mp_mul(r, t, u, tlen, ulen);

    /* must zero t and u memory, this is required for mp_mul */

    /* calculate t = a0 * b0 */
    tlen = la0 + lb0;
    memset(t, '\0', sizeof(BNS) * tlen);
    tlen = mp_mul(t, op1, op2, la0, lb0);

    /* calculate u = a1 * b1 */
    ulen = la1 + lb1;
    memset(u, '\0', sizeof(BNS) * ulen);
    ulen = mp_mul(u, op1 + x, op2 + x, la1, lb1);

    /* subtract t from partial result */
    xlen = mp_sub(r, r, t, xlen, tlen);

    /* subtract u form partial result */
    xlen = mp_sub(r, r, u, xlen, ulen);

    /* add ux^2 to partial result */

    r = rop + (x << 1);		/* multiplied by x^2 = 2^(BNSBITS*x*2) */
    xlen = len1 + len2;
    xlen = mp_add(r, r, u, xlen, ulen);

    /* now add t to final result */
    xlen = mp_add(rop, rop, t, xlen, tlen);

    mp_free(t);
    mp_free(u);

    if (xlen > 1 && rop[xlen - 1] == 0)
	--xlen;

    return (xlen);
}

	/* Toom method	(partially based on GMP documentation)
	 * Evaluation at k = [ 0 1/2 1 2 oo ]
	 * U(x) = (U2k + U1)k + U0
	 * V(x) = (V2k + V1)k + V0
	 * W(x) = U(x)V(x)
	 *
	 * Sample:
	 *	123 * 456
	 *
	 *	EVALUATION:
	 * U(0) = (1*0+2)*0+3	=> 3
	 * U(1) = 1+(2+3*2)*2	=> 17
	 * U(2) = 1+2+3		=> 6
	 * U(3) = (1*2+2)*2+3	=> 11
	 * U(4)	= 1+(2+3*0)*0	=> 1
	 *
	 * V(0) = (4*0+5)*0+6	=> 6
	 * V(1) = 4+(5+6*2)*2	=> 38
	 * V(2) = 4+5+6		=> 15
	 * V(3) = (4*2+5)*2+6	=> 32
	 * V(4)	= 4+(5+6*0)*0	=> 4
	 *
	 *	U = [ 3   17  6  11 1 ]
	 *	V = [ 6   38 15  32 4 ]
	 *	W = [ 18 646 90 352 4 ]
	 *
	 * After that, we have:
	 *	a = 18					(w0 already known)
	 *	b = 16w0 + 8w1 + 4w2 + 2w3 +   w4
	 *	c =   w0 +  w1 +  w2 +  w3 +   w4
	 *	d =   w0 + 2w1 + 4w2 + 8w3 + 16w4
	 *	e = 4					(w4 already known)
	 *
	 *	INTERPOLATION:
	 *	b = b -16a - e		(354)
	 *	c = c - a - e		(68)
	 *	d = d - a - 16e		(270)
	 *
	 *	w = (b + d) - 8c = (10w1+8w2+10w3) - (8w1+8w2+8w3) = 2w1+2w3
	 *	w = 2c - w		(56)
	 *	b = b/2 = 4w1+w+w3
	 *	b = b-c = 4w1+w+w3 - w1+w2+w3 = 3w1+w2
	 *	c = w/2			(w2 = 28)
	 *	b = b-c = 3w1+c - c = 3w1
	 *	b = b/3			(w1 = 27)
	 *	d = d/2
	 *	d = d-b-w = b+w+4w3 - b-w = 4w3
	 *	d = d/4			(w3 = 13)
	 *
	 *	RESULT:
	 *	w4*10^4 + w3*10³ + w2*10² + w1*10 + w0
	 *	40000   + 13000   + 2800   + 270    + 18
	 *	10 is the base where the calculation was done
	 *
	 *	This sample uses small numbers, so it does not show the
	 * advantage of the method. But for example (in base 10), when squaring
	 *	123456789012345678901234567890
	 *	The normal method would do 30*30=900 multiplications
	 *	Karatsuba method would do 15*15*3=675 multiplications
	 *	Toom method would do 10*10*5=500 multiplications
	 * Toom method has a larger overhead if compared with Karatsuba method,
	 * due to evaluation and interpolation, so it should be used for larger
	 * numbers, so that the computation time of evaluation/interpolation
	 * would be smaller than the time spent using other methods.
	 *
	 *	Note that Karatsuba method can be seen as a special case of
	 * Toom method, i.e:
	 *	U1U0 * V1V0
	 *	with k = [ 0 1 oo ]
	 *	U = [ U0 U1+U0 U1 ]
	 *	V = [ V0 V1+V0 V1 ]
	 *	W = [ U0*V0 (U1+U0)*(V1+V0) (U1+V1) ]
	 *
	 *	w0 = U0*V0
	 *	w = (U1+U0)*(V1+V0)
	 *	w2 = (U1*V1)
	 *
	 *	w1 = w - w0 - w2
	 * w2x² + w1x + w0
	 *
	 *	See Knuth's Seminumerical Algorithms for a sample implemention
	 * using 4 stacks and k = [ 0 1 2 3 ... ], based on the size of the
	 * input.
	 */
long
mp_toom_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
{
    long size, xsize, i;
    BNI value;				/* used in division */
    BNS carry;
    BNI x;				/* shift count */
    BNI l1, l2;
    BNI al, bl, cl, dl, el, Ul[3], Vl[3];
    BNS *a, *b, *c, *d, *e, *U[3], *V[3];

    /* x is the base i.e. 2^(BNSBITS*x) */
    x = (len1 + len2 + 4) / 6;
    l1 = len1 - (x << 1);	/* length of remaining piece of op1 */
    l2 = len2 - (x << 1);	/* length of remaining piece of op2 */

    /* allocate memory for storing U and V */
    U[0] = mp_malloc(sizeof(BNS) * (x + 2));
    V[0] = mp_malloc(sizeof(BNS) * (x + 2));
    U[1] = mp_malloc(sizeof(BNS) * (x + 1));
    V[1] = mp_malloc(sizeof(BNS) * (x + 1));
    U[2] = mp_malloc(sizeof(BNS) * (x + 2));
    V[2] = mp_malloc(sizeof(BNS) * (x + 2));

	/* EVALUATE U AND V */

    /* Numbers are in the format U2x²+U1x+U0 and V2x²+V1x+V0 */

	/* U[0] = U2+U1*2+U0*4 */

    /* store U1*2 in U[1], this value is used twice */
    Ul[1] = mp_lshift(U[1], op1 + x, x, 1);

    /* store U0*4 in U[0] */
    Ul[0] = mp_lshift(U[0], op1, x, 2);
    /* add U1*2 to U[0] */
    Ul[0] = mp_add(U[0], U[0], U[1], Ul[0], Ul[1]);
    /* add U2 to U[0] */
    Ul[0] = mp_add(U[0], U[0], op1 + x + x, Ul[0], l1);

	/* U[2] = U2*4+U1*2+U0 */

    /* store U2*4 in U[2] */
    Ul[2] = mp_lshift(U[2], op1 + x + x, l1, 2);
    /* add U1*2 to U[2] */
    Ul[2] = mp_add(U[2], U[2], U[1], Ul[2], Ul[1]);
    /* add U0 to U[2] */
    Ul[2] = mp_add(U[2], U[2], op1, Ul[2], x);

	/* U[1] = U2+U1+U0 */

    Ul[1] = mp_add(U[1], op1, op1 + x, x, x);
    Ul[1] = mp_add(U[1], U[1], op1 + x + x, Ul[1], l1);


	/* Evaluate V[x], same code as U[x] */
    Vl[1] = mp_lshift(V[1], op2 + x, x, 1);
    Vl[0] = mp_lshift(V[0], op2, x, 2);
    Vl[0] = mp_add(V[0], V[0], V[1], Vl[0], Vl[1]);
    Vl[0] = mp_add(V[0], V[0], op2 + x + x, Vl[0], l2);
    Vl[2] = mp_lshift(V[2], op2 + x + x, l2, 2);
    Vl[2] = mp_add(V[2], V[2], V[1], Vl[2], Vl[1]);
    Vl[2] = mp_add(V[2], V[2], op2, Vl[2], x);
    Vl[1] = mp_add(V[1], op2, op2 + x, x, x);
    Vl[1] = mp_add(V[1], V[1], op2 + x + x, Vl[1], l2);


	/* MULTIPLY U[] AND V[] */

	/* calculate (U2+U1*2+U0*4) * (V2+V1*2+V0*4) */
    b = mp_calloc(1, sizeof(BNS) * (Ul[0] * Vl[0]));
    bl = mp_mul(b, U[0], V[0], Ul[0], Vl[0]);
    mp_free(U[0]);
    mp_free(V[0]);

	/* calculate (U2+U1+U0) * (V2+V1+V0) */
    c = mp_calloc(1, sizeof(BNS) * (Ul[1] * Vl[1]));
    cl = mp_mul(c, U[1], V[1], Ul[1], Vl[1]);
    mp_free(U[1]);
    mp_free(V[1]);

	/* calculate (U2*4+U1*2+U0) * (V2*4+V1*2+V0) */
    d = mp_calloc(1, sizeof(BNS) * (Ul[2] * Vl[2]));
    dl = mp_mul(d, U[2], V[2], Ul[2], Vl[2]);
    mp_free(U[2]);
    mp_free(V[2]);

	/* calculate U0 * V0 */
    a = mp_calloc(1, sizeof(BNS) * (x + x));
    al = mp_mul(a, op1, op2, x, x);

	/* calculate U2 * V2 */
    e = mp_calloc(1, sizeof(BNS) * (l1 + l2));
    el = mp_mul(e, op1 + x + x, op2 + x + x, l1, l2);


	/* INTERPOLATE COEFFICIENTS */

    /* b = b - 16a - e */
    size = mp_lshift(rop, a, al, 4);
    bl = mp_sub(b, b, rop, bl, size);
    bl = mp_sub(b, b, e, bl, el);

    /* c = c - a - e*/
    cl = mp_sub(c, c, a, cl, al);
    cl = mp_sub(c, c, e, cl, el);

    /* d = d - a - 16e */
    dl = mp_sub(d, d, a, dl, al);
    size = mp_lshift(rop, e, el, 4);
    dl = mp_sub(d, d, rop, dl, size);

    /* w = (b + d) - 8c */
    size = mp_add(rop, b, d, bl, dl);
    xsize = mp_lshift(rop + size, c, cl, 3);	/* rop has enough storage */
    size = mp_sub(rop, rop, rop + size, size, xsize);

    /* w = 2c - w*/
    xsize = mp_lshift(rop + size, c, cl, 1);
    size = mp_sub(rop, rop + size, rop, xsize, size);

    /* b = b/2 */
    bl = mp_rshift(b, b, bl, 1);

    /* b = b - c */
    bl = mp_sub(b, b, c, bl, cl);

    /* c = w / 2 */
    cl = mp_rshift(c, rop, size, 1);

    /* b = b - c */
    bl = mp_sub(b, b, c, bl, cl);

    /* b = b/3 */
	/* maybe the most expensive calculation */
    i = bl - 1;
    value = b[i];
    b[i] = value / 3;
    for (--i; i >= 0; i--) {
	carry = value % 3;
	value = ((BNI)carry << BNSBITS) + b[i];
	b[i] = (BNS)(value / 3);
    }

    /* d = d/2 */
    dl = mp_rshift(d, d, dl, 1);

    /* d = d - b - w */
    dl = mp_sub(d, d, b, dl, bl);
    dl = mp_sub(d, d, rop, dl, size);

    /* d = d/4 */
    dl = mp_rshift(d, d, dl, 2);


	/* STORE RESULT IN ROP */
    /* first clear memory used as temporary variable w and 8c */
    memset(rop, '\0', sizeof(BNS) * (len1 + len2));

    i = x * 4;
    xsize = (len1 + len2) - i;
    size = mp_add(rop + i, rop + i, e, xsize, el) + i;
    i = x * 3;
    xsize = size - i;
    size = mp_add(rop + i, rop + i, d, xsize, dl) + i;
    i = x * 2;
    xsize = size - i;
    size = mp_add(rop + i, rop + i, c, xsize, cl) + i;
    i = x;
    xsize = size - i;
    size = mp_add(rop + i, rop + i, b, xsize, bl) + i;
    size = mp_add(rop, rop, a, size, al);

    mp_free(e);
    mp_free(d);
    mp_free(c);
    mp_free(b);
    mp_free(a);

    if (size > 1 && rop[size - 1] == 0)
	--size;

    return (size);
}

long
mp_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
{
    if (len1 < len2)
	MP_SWAP(op1, op2, len1, len2);

    if (len1 < KARATSUBA || len2 < KARATSUBA)
	return (mp_base_mul(rop, op1, op2, len1, len2));
    else if (len1 < TOOM && len2 < TOOM && len2 > ((len1 + 1) >> 1))
	return (mp_karatsuba_mul(rop, op1, op2, len1, len2));
    else if (len1 >= TOOM && len2 >= TOOM && (len2 + 2) / 3 == (len1 + 2) / 3)
	return (mp_toom_mul(rop, op1, op2, len1, len2));
    else {
	long xsize, psize, isize;
	BNS *ptr;

	/* adjust index pointer and estimated size of result */
	isize = 0;
	xsize = len1 + len2;
	mp_mul(rop, op1, op2, len2, len2);
	/* adjust pointers */
	len1 -= len2;
	op1 += len2;

	/* allocate buffer for intermediate multiplications */
	if (len1 > len2)
	    ptr = mp_calloc(1, sizeof(BNS) * (len2 + len2));
	else
	    ptr = mp_calloc(1, sizeof(BNS) * (len1 + len2));

	/* loop multiplying len2 size operands at a time */
	while (len1 >= len2) {
	    isize += len2;
	    psize = mp_mul(ptr, op1, op2, len2, len2);
	    mp_add(rop + isize, rop + isize, ptr, xsize - isize, psize);
	    len1 -= len2;
	    op1 += len2;

	    /* multiplication routines require zeroed memory */
	    memset(ptr, '\0', sizeof(BNS) * (MIN(len1, len2) + len2));
	}

	/* len1 was not a multiple of len2 */
	if (len1) {
	    isize += len2;
	    psize = mp_mul(ptr, op2, op1, len2, len1);
	    mp_add(rop + isize, rop + isize, ptr, xsize, psize);
	}

	/* adjust result size */
	if (rop[xsize - 1] == 0)
	    --xsize;

	mp_free(ptr);

	return (xsize);
    }
}
Importing from X.Org 7.2RC2 2006-11-25 13:07:29 -07:00			`/*`
			`* Copyright (c) 2002 by The XFree86 Project, Inc.`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a`
			`* copy of this software and associated documentation files (the "Software"),`
			`* to deal in the Software without restriction, including without limitation`
			`* the rights to use, copy, modify, merge, publish, distribute, sublicense,`
			`* and/or sell copies of the Software, and to permit persons to whom the`
			`* Software is furnished to do so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be included in`
			`* all copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL`
			`* THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,`
			`* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF`
			`* OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE`
			`* SOFTWARE.`
			`*`
			`* Except as contained in this notice, the name of the XFree86 Project shall`
			`* not be used in advertising or otherwise to promote the sale, use or other`
			`* dealings in this Software without prior written authorization from the`
			`* XFree86 Project.`
			`*`
			`* Author: Paulo César Pereira de Andrade`
			`*/`

			`/* $XFree86: xc/programs/xedit/lisp/mp/mp.c,v 1.2 2002/11/08 08:01:00 paulo Exp $ */`

			`#include "mp.h"`

			`/*`
			`* TODO:`
			`* o Optimize squaring`
			`* o Write better division code and move from mpi.c to here`
			`* o Make multiplication code don't required memory to be zeroed`
			`* + The first step is easy, just multiply the low word,`
			`* then the high word, that may overlap with the result`
			`* of the first multiply (in case of carry), and then`
			`* just make sure carry is properly propagated in the`
			`* subsequent multiplications.`
			`* + Some code needs also to be rewritten because some`
			`* intermediate addition code in mp_mul, mp_karatsuba_mul,`
			`* and mp_toom_mul is assuming the memory is zeroed.`
			`*/`

			`/*`
			`* Prototypes`
			`*/`
			`/* out of memory handler */`
			`static void mp_outmem(void);`

			`/* memory allocation fallback functions */`
			`static void *_mp_malloc(size_t);`
			`static void *_mp_calloc(size_t, size_t);`
			`static void _mp_realloc(void, size_t);`
			`static void _mp_free(void*);`

			`/*`
			`* Initialization`
			`*/`
			`static mp_malloc_fun __mp_malloc = _mp_malloc;`
			`static mp_calloc_fun __mp_calloc = _mp_calloc;`
			`static mp_realloc_fun __mp_realloc = _mp_realloc;`
			`static mp_free_fun __mp_free = _mp_free;`

			`/*`
			`* Implementation`
			`*/`
			`static void`
			`mp_outmem(void)`
			`{`
			`fprintf(stderr, "out of memory in MP library.\n");`
			`exit(1);`
			`}`

			`static void *`
			`_mp_malloc(size_t size)`
			`{`
			`return (malloc(size));`
			`}`

			`void *`
			`mp_malloc(size_t size)`
			`{`
			`void pointer = (__mp_malloc)(size);`

			`if (pointer == NULL)`
			`mp_outmem();`

			`return (pointer);`
			`}`

			`mp_malloc_fun`
			`mp_set_malloc(mp_malloc_fun fun)`
			`{`
			`mp_malloc_fun old = __mp_malloc;`

			`__mp_malloc = fun;`

			`return (old);`
			`}`

			`static void *`
			`_mp_calloc(size_t nmemb, size_t size)`
			`{`
			`return (calloc(nmemb, size));`
			`}`

			`void *`
			`mp_calloc(size_t nmemb, size_t size)`
			`{`
			`void pointer = (__mp_calloc)(nmemb, size);`

			`if (pointer == NULL)`
			`mp_outmem();`

			`return (pointer);`
			`}`

			`mp_calloc_fun`
			`mp_set_calloc(mp_calloc_fun fun)`
			`{`
			`mp_calloc_fun old = __mp_calloc;`

			`__mp_calloc = fun;`

			`return (old);`
			`}`

			`static void *`
			`_mp_realloc(void *old, size_t size)`
			`{`
			`return (realloc(old, size));`
			`}`

			`void *`
			`mp_realloc(void *old, size_t size)`
			`{`
			`void pointer = (__mp_realloc)(old, size);`

			`if (pointer == NULL)`
			`mp_outmem();`

			`return (pointer);`
			`}`

			`mp_realloc_fun`
			`mp_set_realloc(mp_realloc_fun fun)`
			`{`
			`mp_realloc_fun old = __mp_realloc;`

			`__mp_realloc = fun;`

			`return (old);`
			`}`

			`static void`
			`_mp_free(void *pointer)`
			`{`
			`free(pointer);`
			`}`

			`void`
			`mp_free(void *pointer)`
			`{`
			`(*__mp_free)(pointer);`
			`}`

			`mp_free_fun`
			`mp_set_free(mp_free_fun fun)`
			`{`
			`mp_free_fun old = __mp_free;`

			`__mp_free = fun;`

			`return (old);`
			`}`

			`long`
			`mp_add(BNS rop, BNS op1, BNS *op2, BNI len1, BNI len2)`
			`{`
			`BNI value; /* intermediate result */`
			`BNS carry; /* carry flag */`
			`long size; /* result size */`

			`if (len1 < len2)`
			`MP_SWAP(op1, op2, len1, len2);`

			`/* unroll start of loop */`
			`value = op1[0] + op2[0];`
			`rop[0] = value;`
			`carry = value >> BNSBITS;`

			`/* add op1 and op2 */`
			`for (size = 1; size < len2; size++) {`
			`value = op1[size] + op2[size] + carry;`
			`rop[size] = value;`
			`carry = value >> BNSBITS;`
			`}`
			`if (rop != op1) {`
			`for (; size < len1; size++) {`
			`value = op1[size] + carry;`
			`rop[size] = value;`
			`carry = value >> BNSBITS;`
			`}`
			`}`
			`else {`
			`/* if rop == op1, than just adjust carry */`
			`for (; carry && size < len1; size++) {`
			`value = op1[size] + carry;`
			`rop[size] = value;`
			`carry = value >> BNSBITS;`
			`}`
			`size = len1;`
			`}`
			`if (carry)`
			`rop[size++] = carry;`

			`return (size);`
			`}`

			`long`
			`mp_sub(BNS rop, BNS op1, BNS *op2, BNI len1, BNI len2)`
			`{`
			`long svalue; /* intermediate result */`
			`BNS carry; /* carry flag */`
			`long size; /* result size */`

			`/* special case */`
			`if (op1 == op2) {`
			`rop[0] = 0;`

			`return (1);`
			`}`

			`/* unroll start of loop */`
			`svalue = op1[0] - op2[0];`
			`rop[0] = svalue;`
			`carry = svalue < 0;`

			`/* subtracts op2 from op1 */`
			`for (size = 1; size < len2; size++) {`
			`svalue = (long)(op1[size]) - op2[size] - carry;`
			`rop[size] = svalue;`
			`carry = svalue < 0;`
			`}`
			`if (rop != op1) {`
			`for (; size < len1; size++) {`
			`svalue = op1[size] - carry;`
			`rop[size] = svalue;`
			`carry = svalue < 0;`
			`}`
			`}`
			`else {`
			`/* if rop == op1, than just adjust carry */`
			`for (; carry && size < len1; size++) {`
			`svalue = op1[size] - carry;`
			`rop[size] = svalue;`
			`carry = svalue < 0;`
			`}`
			`size = len1;`
			`}`

			`/* calculate result size */`
			`while (size > 1 && rop[size - 1] == 0)`
			`--size;`

			`return (size);`
			`}`

			`long`
			`mp_lshift(BNS rop, BNS op, BNI len, long shift)`
			`{`
			`long i, size;`
			`BNI words, bits; /* how many word and bit shifts */`

			`words = shift / BNSBITS;`
			`bits = shift % BNSBITS;`
			`size = len + words;`

			`if (bits) {`
			`BNS hi, lo;`
			`BNI carry;`
			`int adj;`

			`for (i = 1, carry = CARRY >> 1; carry; i++, carry >>= 1)`
			`if (op[len - 1] & carry)`
			`break;`
			`adj = (bits + (BNSBITS - i)) / BNSBITS;`
			`size += adj;`

			`lo = hi = op[0];`
			`rop[words] = lo << bits;`
			`for (i = 1; i < len; i++) {`
			`hi = op[i];`
			`rop[words + i] = hi << bits \| (lo >> (BNSBITS - bits));`
			`lo = hi;`
			`}`
			`if (adj)`
			`rop[size - 1] = hi >> (BNSBITS - bits);`
			`}`
			`else`
			`memmove(rop + size - len, op, sizeof(BNS) * len);`

			`if (words)`
			`memset(rop, '\0', sizeof(BNS) * words);`

			`return (size);`
			`}`

			`long`
			`mp_rshift(BNS rop, BNS op, BNI len, long shift)`
			`{`
			`int adj = 0;`
			`long i, size;`
			`BNI words, bits; /* how many word and bit shifts */`

			`words = shift / BNSBITS;`
			`bits = shift % BNSBITS;`
			`size = len - words;`

			`if (bits) {`
			`BNS hi, lo;`
			`BNI carry;`

			`for (i = 0, carry = CARRY >> 1; carry; i++, carry >>= 1)`
			`if (op[len - 1] & carry)`
			`break;`
			`adj = (bits + i) / BNSBITS;`
			`if (size - adj == 0) {`
			`rop[0] = 0;`

			`return (1);`
			`}`

			`hi = lo = op[words + size - 1];`
			`rop[size - 1] = hi >> bits;`
			`for (i = size - 2; i >= 0; i--) {`
			`lo = op[words + i];`
			`rop[i] = (lo >> bits) \| (hi << (BNSBITS - bits));`
			`hi = lo;`
			`}`
			`if (adj)`
			`rop[0] \|= lo << (BNSBITS - bits);`
			`}`
			`else`
			`memmove(rop, op + len - size, size * sizeof(BNS));`

			`return (size - adj);`
			`}`

			`/* rop must be a pointer to len1 + len2 elements`
			`* rop cannot be either op1 or op2`
			`* rop must be all zeros */`
			`long`
			`mp_base_mul(BNS rop, BNS op1, BNS *op2, BNI len1, BNI len2)`
			`{`
			`long i, j; /* counters */`
			`BNI value; /* intermediate result */`
			`BNS carry; /* carry value */`
			`long size = len1 + len2;`

			`/* simple optimization: first pass does not need to deference rop[i+j] */`
			`if (op1[0]) {`
			`value = (BNI)(op1[0]) * op2[0];`
			`rop[0] = value;`
			`carry = (BNS)(value >> BNSBITS);`
			`for (j = 1; j < len2; j++) {`
			`value = (BNI)(op1[0]) * op2[j] + carry;`
			`rop[j] = value;`
			`carry = (BNS)(value >> BNSBITS);`
			`}`
			`rop[j] = carry;`
			`}`

			`/* do the multiplication */`
			`for (i = 1; i < len1; i++) {`
			`if (op1[i]) {`
			`/* unrool loop initialization */`
			`value = (BNI)(op1[i]) * op2[0] + rop[i];`
			`rop[i] = value;`
			`carry = (BNS)(value >> BNSBITS);`
			`/* multiply */`
			`for (j = 1; j < len2; j++) {`
			`value = (BNI)(op1[i]) * op2[j] + rop[i + j] + carry;`
			`rop[i + j] = value;`
			`carry = (BNS)(value >> BNSBITS);`
			`}`
			`rop[i + j] = carry;`
			`}`
			`}`

			`if (size > 1 && rop[size - 1] == 0)`
			`--size;`

			`return (size);`
			`}`

			`/* Karatsuba method`
			`* t + ((a0 + a1) (b0 + b1) - t - u) x + ux²`
			`* where t = a0b0 and u = a1b1`
			`*`
			`* Karatsuba method reduces the number of multiplications. Example:`
			`* Square a 40 length number`
			`* instead of a plain 40*40 = 1600 multiplies/adds, it does:`
			`* 2020+2020+20*20 = 1200`
			`* but since it is recursive, every 20*20=400 is reduced to`
			`* 1010+1010+10*10=300`
			`* and so on.`
			`* The multiplication by x and x² is a just a shift, as it is a`
			`* power of two, and is implemented below by just writting at the`
			`* correct offset */`
			`long`
			`mp_karatsuba_mul(BNS rop, BNS op1, BNS *op2, BNI len1, BNI len2)`
			`{`
			`BNI x; /* shift count */`
			`BNI la0, la1, lb0, lb1; /* length of a0, a1, b0, and b1 */`
			`BNS t; / temporary memory for t product */`
			`BNS u; / temporary memory for u product */`
			`BNS r; / pointer to rop */`
			`long xlen, tlen, ulen;`

			`/* calculate value of x, that is 2^(BNSBITSx) /`
			`if (len1 >= len2)`
			`x = (len1 + 1) >> 1;`
			`else`
			`x = (len2 + 1) >> 1;`

			`/* calculate length of operands */`
			`la0 = x;`
			`la1 = len1 - x;`
			`lb0 = x;`
			`lb1 = len2 - x;`

			`/* allocate buffer for t and (a0 + a1) */`
			`tlen = la0 + lb0;`
			`t = mp_malloc(sizeof(BNS) * tlen);`

			`/* allocate buffer for u and (b0 + b1) */`
			`if (la1 + lb1 < lb0 + lb1 + 1)`
			`ulen = lb0 + lb1 + 1;`
			`else`
			`ulen = la1 + lb1;`
			`u = mp_malloc(sizeof(BNS) * ulen);`

			`/* calculate a0 + a1, store result in t */`
			`tlen = mp_add(t, op1, op1 + x, la0, la1);`

			`/* calculate b0 + b1, store result in u */`
			`ulen = mp_add(u, op2, op2 + x, lb0, lb1);`

			`/* store (a0 + a1) * (b0 + b1) in rop */`

			`r = rop + x; /* multiplied by 2^(BNSBITSx) /`
			`xlen = mp_mul(r, t, u, tlen, ulen);`

			`/* must zero t and u memory, this is required for mp_mul */`

			`/* calculate t = a0 * b0 */`
			`tlen = la0 + lb0;`
			`memset(t, '\0', sizeof(BNS) * tlen);`
			`tlen = mp_mul(t, op1, op2, la0, lb0);`

			`/* calculate u = a1 * b1 */`
			`ulen = la1 + lb1;`
			`memset(u, '\0', sizeof(BNS) * ulen);`
			`ulen = mp_mul(u, op1 + x, op2 + x, la1, lb1);`

			`/* subtract t from partial result */`
			`xlen = mp_sub(r, r, t, xlen, tlen);`

			`/* subtract u form partial result */`
			`xlen = mp_sub(r, r, u, xlen, ulen);`

			`/* add ux^2 to partial result */`

			`r = rop + (x << 1); /* multiplied by x^2 = 2^(BNSBITSx2) */`
			`xlen = len1 + len2;`
			`xlen = mp_add(r, r, u, xlen, ulen);`

			`/* now add t to final result */`
			`xlen = mp_add(rop, rop, t, xlen, tlen);`

			`mp_free(t);`
			`mp_free(u);`

			`if (xlen > 1 && rop[xlen - 1] == 0)`
			`--xlen;`

			`return (xlen);`
			`}`

			`/* Toom method (partially based on GMP documentation)`
			`* Evaluation at k = [ 0 1/2 1 2 oo ]`
			`* U(x) = (U2k + U1)k + U0`
			`* V(x) = (V2k + V1)k + V0`
			`* W(x) = U(x)V(x)`
			`*`
			`* Sample:`
			`* 123 * 456`
			`*`
			`* EVALUATION:`
			`* U(0) = (10+2)0+3 => 3`
			`* U(1) = 1+(2+32)2 => 17`
			`* U(2) = 1+2+3 => 6`
			`* U(3) = (12+2)2+3 => 11`
			`* U(4) = 1+(2+30)0 => 1`
			`*`
			`* V(0) = (40+5)0+6 => 6`
			`* V(1) = 4+(5+62)2 => 38`
			`* V(2) = 4+5+6 => 15`
			`* V(3) = (42+5)2+6 => 32`
			`* V(4) = 4+(5+60)0 => 4`
			`*`
			`* U = [ 3 17 6 11 1 ]`
			`* V = [ 6 38 15 32 4 ]`
			`* W = [ 18 646 90 352 4 ]`
			`*`
			`* After that, we have:`
			`* a = 18 (w0 already known)`
			`* b = 16w0 + 8w1 + 4w2 + 2w3 + w4`
			`* c = w0 + w1 + w2 + w3 + w4`
			`* d = w0 + 2w1 + 4w2 + 8w3 + 16w4`
			`* e = 4 (w4 already known)`
			`*`
			`* INTERPOLATION:`
			`* b = b -16a - e (354)`
			`* c = c - a - e (68)`
			`* d = d - a - 16e (270)`
			`*`
			`* w = (b + d) - 8c = (10w1+8w2+10w3) - (8w1+8w2+8w3) = 2w1+2w3`
			`* w = 2c - w (56)`
			`* b = b/2 = 4w1+w+w3`
			`* b = b-c = 4w1+w+w3 - w1+w2+w3 = 3w1+w2`
			`* c = w/2 (w2 = 28)`
			`* b = b-c = 3w1+c - c = 3w1`
			`* b = b/3 (w1 = 27)`
			`* d = d/2`
			`* d = d-b-w = b+w+4w3 - b-w = 4w3`
			`* d = d/4 (w3 = 13)`
			`*`
			`* RESULT:`
			`* w410^4 + w310³ + w210² + w110 + w0`
			`* 40000 + 13000 + 2800 + 270 + 18`
			`* 10 is the base where the calculation was done`
			`*`
			`* This sample uses small numbers, so it does not show the`
			`* advantage of the method. But for example (in base 10), when squaring`
			`* 123456789012345678901234567890`
			`* The normal method would do 30*30=900 multiplications`
			`* Karatsuba method would do 15153=675 multiplications`
			`* Toom method would do 10105=500 multiplications`
			`* Toom method has a larger overhead if compared with Karatsuba method,`
			`* due to evaluation and interpolation, so it should be used for larger`
			`* numbers, so that the computation time of evaluation/interpolation`
			`* would be smaller than the time spent using other methods.`
			`*`
			`* Note that Karatsuba method can be seen as a special case of`
			`* Toom method, i.e:`
			`* U1U0 * V1V0`
			`* with k = [ 0 1 oo ]`
			`* U = [ U0 U1+U0 U1 ]`
			`* V = [ V0 V1+V0 V1 ]`
			`* W = [ U0V0 (U1+U0)(V1+V0) (U1+V1) ]`
			`*`
			`* w0 = U0*V0`
			`* w = (U1+U0)*(V1+V0)`
			`* w2 = (U1*V1)`
			`*`
			`* w1 = w - w0 - w2`
			`* w2x² + w1x + w0`
			`*`
			`* See Knuth's Seminumerical Algorithms for a sample implemention`
			`* using 4 stacks and k = [ 0 1 2 3 ... ], based on the size of the`
			`* input.`
			`*/`
			`long`
			`mp_toom_mul(BNS rop, BNS op1, BNS *op2, BNI len1, BNI len2)`
			`{`
			`long size, xsize, i;`
			`BNI value; /* used in division */`
			`BNS carry;`
			`BNI x; /* shift count */`
			`BNI l1, l2;`
			`BNI al, bl, cl, dl, el, Ul[3], Vl[3];`
			`BNS a, b, c, d, e, U[3], *V[3];`

			`/* x is the base i.e. 2^(BNSBITSx) /`
			`x = (len1 + len2 + 4) / 6;`
			`l1 = len1 - (x << 1); /* length of remaining piece of op1 */`
			`l2 = len2 - (x << 1); /* length of remaining piece of op2 */`

			`/* allocate memory for storing U and V */`
			`U[0] = mp_malloc(sizeof(BNS) * (x + 2));`
			`V[0] = mp_malloc(sizeof(BNS) * (x + 2));`
			`U[1] = mp_malloc(sizeof(BNS) * (x + 1));`
			`V[1] = mp_malloc(sizeof(BNS) * (x + 1));`
			`U[2] = mp_malloc(sizeof(BNS) * (x + 2));`
			`V[2] = mp_malloc(sizeof(BNS) * (x + 2));`

			`/* EVALUATE U AND V */`

			`/* Numbers are in the format U2x²+U1x+U0 and V2x²+V1x+V0 */`

			`/* U[0] = U2+U12+U04 */`

			`/* store U12 in U[1], this value is used twice /`
			`Ul[1] = mp_lshift(U[1], op1 + x, x, 1);`

			`/* store U04 in U[0] /`
			`Ul[0] = mp_lshift(U[0], op1, x, 2);`
			`/* add U12 to U[0] /`
			`Ul[0] = mp_add(U[0], U[0], U[1], Ul[0], Ul[1]);`
			`/* add U2 to U[0] */`
			`Ul[0] = mp_add(U[0], U[0], op1 + x + x, Ul[0], l1);`

			`/* U[2] = U24+U12+U0 */`

			`/* store U24 in U[2] /`
			`Ul[2] = mp_lshift(U[2], op1 + x + x, l1, 2);`
			`/* add U12 to U[2] /`
			`Ul[2] = mp_add(U[2], U[2], U[1], Ul[2], Ul[1]);`
			`/* add U0 to U[2] */`
			`Ul[2] = mp_add(U[2], U[2], op1, Ul[2], x);`

			`/* U[1] = U2+U1+U0 */`

			`Ul[1] = mp_add(U[1], op1, op1 + x, x, x);`
			`Ul[1] = mp_add(U[1], U[1], op1 + x + x, Ul[1], l1);`


			`/* Evaluate V[x], same code as U[x] */`
			`Vl[1] = mp_lshift(V[1], op2 + x, x, 1);`
			`Vl[0] = mp_lshift(V[0], op2, x, 2);`
			`Vl[0] = mp_add(V[0], V[0], V[1], Vl[0], Vl[1]);`
			`Vl[0] = mp_add(V[0], V[0], op2 + x + x, Vl[0], l2);`
			`Vl[2] = mp_lshift(V[2], op2 + x + x, l2, 2);`
			`Vl[2] = mp_add(V[2], V[2], V[1], Vl[2], Vl[1]);`
			`Vl[2] = mp_add(V[2], V[2], op2, Vl[2], x);`
			`Vl[1] = mp_add(V[1], op2, op2 + x, x, x);`
			`Vl[1] = mp_add(V[1], V[1], op2 + x + x, Vl[1], l2);`


			`/* MULTIPLY U[] AND V[] */`

			`/* calculate (U2+U12+U04) * (V2+V12+V04) */`
			`b = mp_calloc(1, sizeof(BNS) * (Ul[0] * Vl[0]));`
			`bl = mp_mul(b, U[0], V[0], Ul[0], Vl[0]);`
			`mp_free(U[0]);`
			`mp_free(V[0]);`

			`/* calculate (U2+U1+U0) * (V2+V1+V0) */`
			`c = mp_calloc(1, sizeof(BNS) * (Ul[1] * Vl[1]));`
			`cl = mp_mul(c, U[1], V[1], Ul[1], Vl[1]);`
			`mp_free(U[1]);`
			`mp_free(V[1]);`

			`/* calculate (U24+U12+U0) * (V24+V12+V0) */`
			`d = mp_calloc(1, sizeof(BNS) * (Ul[2] * Vl[2]));`
			`dl = mp_mul(d, U[2], V[2], Ul[2], Vl[2]);`
			`mp_free(U[2]);`
			`mp_free(V[2]);`

			`/* calculate U0 * V0 */`
			`a = mp_calloc(1, sizeof(BNS) * (x + x));`
			`al = mp_mul(a, op1, op2, x, x);`

			`/* calculate U2 * V2 */`
			`e = mp_calloc(1, sizeof(BNS) * (l1 + l2));`
			`el = mp_mul(e, op1 + x + x, op2 + x + x, l1, l2);`


			`/* INTERPOLATE COEFFICIENTS */`

			`/* b = b - 16a - e */`
			`size = mp_lshift(rop, a, al, 4);`
			`bl = mp_sub(b, b, rop, bl, size);`
			`bl = mp_sub(b, b, e, bl, el);`

			`/* c = c - a - e*/`
			`cl = mp_sub(c, c, a, cl, al);`
			`cl = mp_sub(c, c, e, cl, el);`

			`/* d = d - a - 16e */`
			`dl = mp_sub(d, d, a, dl, al);`
			`size = mp_lshift(rop, e, el, 4);`
			`dl = mp_sub(d, d, rop, dl, size);`

			`/* w = (b + d) - 8c */`
			`size = mp_add(rop, b, d, bl, dl);`
			`xsize = mp_lshift(rop + size, c, cl, 3); /* rop has enough storage */`
			`size = mp_sub(rop, rop, rop + size, size, xsize);`

			`/* w = 2c - w*/`
			`xsize = mp_lshift(rop + size, c, cl, 1);`
			`size = mp_sub(rop, rop + size, rop, xsize, size);`

			`/* b = b/2 */`
			`bl = mp_rshift(b, b, bl, 1);`

			`/* b = b - c */`
			`bl = mp_sub(b, b, c, bl, cl);`

			`/* c = w / 2 */`
			`cl = mp_rshift(c, rop, size, 1);`

			`/* b = b - c */`
			`bl = mp_sub(b, b, c, bl, cl);`

			`/* b = b/3 */`
			`/* maybe the most expensive calculation */`
			`i = bl - 1;`
			`value = b[i];`
			`b[i] = value / 3;`
			`for (--i; i >= 0; i--) {`
			`carry = value % 3;`
			`value = ((BNI)carry << BNSBITS) + b[i];`
			`b[i] = (BNS)(value / 3);`
			`}`

			`/* d = d/2 */`
			`dl = mp_rshift(d, d, dl, 1);`

			`/* d = d - b - w */`
			`dl = mp_sub(d, d, b, dl, bl);`
			`dl = mp_sub(d, d, rop, dl, size);`

			`/* d = d/4 */`
			`dl = mp_rshift(d, d, dl, 2);`


			`/* STORE RESULT IN ROP */`
			`/* first clear memory used as temporary variable w and 8c */`
			`memset(rop, '\0', sizeof(BNS) * (len1 + len2));`

			`i = x * 4;`
			`xsize = (len1 + len2) - i;`
			`size = mp_add(rop + i, rop + i, e, xsize, el) + i;`
			`i = x * 3;`
			`xsize = size - i;`
			`size = mp_add(rop + i, rop + i, d, xsize, dl) + i;`
			`i = x * 2;`
			`xsize = size - i;`
			`size = mp_add(rop + i, rop + i, c, xsize, cl) + i;`
			`i = x;`
			`xsize = size - i;`
			`size = mp_add(rop + i, rop + i, b, xsize, bl) + i;`
			`size = mp_add(rop, rop, a, size, al);`

			`mp_free(e);`
			`mp_free(d);`
			`mp_free(c);`
			`mp_free(b);`
			`mp_free(a);`

			`if (size > 1 && rop[size - 1] == 0)`
			`--size;`

			`return (size);`
			`}`

			`long`
			`mp_mul(BNS rop, BNS op1, BNS *op2, BNI len1, BNI len2)`
			`{`
			`if (len1 < len2)`
			`MP_SWAP(op1, op2, len1, len2);`

			`if (len1 < KARATSUBA \|\| len2 < KARATSUBA)`
			`return (mp_base_mul(rop, op1, op2, len1, len2));`
			`else if (len1 < TOOM && len2 < TOOM && len2 > ((len1 + 1) >> 1))`
			`return (mp_karatsuba_mul(rop, op1, op2, len1, len2));`
			`else if (len1 >= TOOM && len2 >= TOOM && (len2 + 2) / 3 == (len1 + 2) / 3)`
			`return (mp_toom_mul(rop, op1, op2, len1, len2));`
			`else {`
			`long xsize, psize, isize;`
			`BNS *ptr;`

			`/* adjust index pointer and estimated size of result */`
			`isize = 0;`
			`xsize = len1 + len2;`
			`mp_mul(rop, op1, op2, len2, len2);`
			`/* adjust pointers */`
			`len1 -= len2;`
			`op1 += len2;`

			`/* allocate buffer for intermediate multiplications */`
			`if (len1 > len2)`
			`ptr = mp_calloc(1, sizeof(BNS) * (len2 + len2));`
			`else`
			`ptr = mp_calloc(1, sizeof(BNS) * (len1 + len2));`

			`/* loop multiplying len2 size operands at a time */`
			`while (len1 >= len2) {`
			`isize += len2;`
			`psize = mp_mul(ptr, op1, op2, len2, len2);`
			`mp_add(rop + isize, rop + isize, ptr, xsize - isize, psize);`
			`len1 -= len2;`
			`op1 += len2;`

			`/* multiplication routines require zeroed memory */`
			`memset(ptr, '\0', sizeof(BNS) * (MIN(len1, len2) + len2));`
			`}`

			`/* len1 was not a multiple of len2 */`
			`if (len1) {`
			`isize += len2;`
			`psize = mp_mul(ptr, op2, op1, len2, len1);`
			`mp_add(rop + isize, rop + isize, ptr, xsize, psize);`
			`}`

			`/* adjust result size */`
			`if (rop[xsize - 1] == 0)`
			`--xsize;`

			`mp_free(ptr);`

			`return (xsize);`
			`}`
			`}`