/*
 * Copyright 1995,96 Thierry Bousch
 * Licensed under the Gnu Public License, Version 2
 *
 * $Id: Integer.c,v 2.3 1996/09/14 09:39:37 bousch Exp $
 *
 * Arbitrary big integers -- well, not quite, you are limited to
 * numbers less than base^MAX_INT.
 */

#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "saml.h"
#include "saml-errno.h"
#include "mnode.h"
#include "builtin.h"
#include "mp-arch.h"

/* View a generic node pointer as a big_int. */
#define BI(mn)		((big_int*)(mn))
/* Radix of the representation: every limb stores a value below 10^9. */
#define BASE		1000000000U
/* Stack-allocate room for n limbs (released when the caller returns). */
#define alloca_u32(n)	alloca((n)*sizeof(__u32))
/* Sign of a node (-1, 0 or +1), derived from the sign of its block count. */
#define SIGN(mn)	(BI(mn)->blocks? ((BI(mn)->blocks > 0) ? 1 : -1) : 0)

/* Shared nodes holding 0 and 1; both are created by init_MathType_Integer(). */
static s_mnode *p_integer_zero, *p_integer_one;

/*
 * Storage layout of an Integer node: a sign-and-length word followed by
 * the limbs of the magnitude in base BASE.
 */
typedef struct {
	struct mnode_header hdr;	/* common node header (declared elsewhere) */
	int blocks;	/* number of limbs in use; negated for negative values, 0 for zero */
	__u32 d[0];	/* limbs, least significant first; the most significant one is non-zero */
} big_int;

/*
 * Forward declarations of the routines referenced by the MathType_Integer
 * table and by init_MathType_Integer() before their definitions.
 */
static s_mnode* bigint_build (const char*);
static gr_string* bigint_stringify (s_mnode*);
static s_mnode* bigint_add (s_mnode*, s_mnode*);
static s_mnode* bigint_sub (s_mnode*, s_mnode*);
static s_mnode* bigint_mul (s_mnode*, s_mnode*);
static s_mnode* bigint_div (s_mnode*, s_mnode*);
static s_mnode* bigint_gcd (s_mnode*, s_mnode*);
static int bigint_notzero (s_mnode*);
static int bigint_isneg (s_mnode*);
static int bigint_differ (s_mnode*, s_mnode*);
static int bigint_lessthan (s_mnode*, s_mnode*);
static s_mnode* bigint_zero (s_mnode*);
static s_mnode* bigint_negate (s_mnode*);
static s_mnode* bigint_one (s_mnode*);
static s_mnode* bigint_sqrt (s_mnode*);
static s_mnode* mint2integer (s_mnode*);

/*
 * Method table for the "Integer" type, registered under ST_INTEGER by
 * init_MathType_Integer(). The meaning of each slot is fixed by the
 * unsafe_s_mtype declaration, which is not in this file; NULL entries
 * are slots this type leaves empty.
 */
static unsafe_s_mtype MathType_Integer = {
	"Integer",
	free, bigint_build, bigint_stringify,
	NULL, NULL,
	bigint_add, bigint_sub, bigint_mul, bigint_div, bigint_gcd,
	bigint_notzero, bigint_isneg, NULL, bigint_differ, bigint_lessthan,
	bigint_zero, bigint_negate, bigint_one, NULL,
	bigint_sqrt
};

/*
 * Allocate an ST_INTEGER node large enough for `blocks' limbs.
 * Callers in this file fill in the limbs and then call set_blocks().
 */
static inline s_mnode* create_bigint (int blocks)
{
	s_mnode *node;

	node = __mnalloc(ST_INTEGER, sizeof(big_int) + blocks * sizeof(__u32));
	return node;
}

/*
 * Record the limb count of mn, storing it negated when is_negative is
 * non-zero. Returns mn so the call can be used in a return statement.
 */
static inline s_mnode* set_blocks (s_mnode *mn, int blocks, int is_negative)
{
	big_int *bi = BI(mn);

	if (is_negative)
		bi->blocks = -blocks;
	else
		bi->blocks = blocks;
	return mn;
}

void init_MathType_Integer (void)
{
	register_mtype(ST_INTEGER, &MathType_Integer);
	p_integer_zero = create_bigint(0);
	set_blocks(p_integer_zero, 0, 0);
	p_integer_one = create_bigint(1);
	set_blocks(p_integer_one, 1, 0);
	BI(p_integer_one)->d[0] = 1;
	nb_mnodes_reserved += 2;
	register_CV_routine(ST_MINT, ST_INTEGER, mint2integer);
}

/*
 * Build an Integer node from a machine int.
 *
 * 0 and 1 are served from the shared constants; any other value gets a
 * freshly allocated node whose limbs are the base-BASE digits of |x|,
 * least significant first.
 */
static s_mnode* integer_new (int x)
{
	s_mnode* n;
	int blocks;
	unsigned int ax, tmp;
	__u32 *data;

	if (x == 0)
		return bigint_zero(NULL);
	if (x == 1)
		return bigint_one(NULL);
	/*
	 * abs(INT_MIN) is undefined, so take the magnitude in unsigned
	 * arithmetic, where the negation is well defined.
	 */
	ax = (x < 0) ? 0U - (unsigned int)x : (unsigned int)x;
	/* Count the base-BASE digits of the magnitude */
	for (blocks = 0, tmp = ax; tmp; tmp /= BASE)
		blocks++;
	n = create_bigint(blocks);
	data = BI(n)->d;
	/* Emit the digits, least significant first */
	for (tmp = ax; tmp; tmp /= BASE)
		*data++ = tmp % BASE;
	return set_blocks(n, blocks, (x<0));
}

/*
 * Conversion routine registered for ST_MINT -> ST_INTEGER: print the
 * source node and parse the text back with bigint_build(). Simple rather
 * than fast.
 */
static s_mnode* mint2integer (s_mnode* n)
{
	gr_string *text = mnode_stringify(n);
	s_mnode *converted = bigint_build(text->s);

	free(text);
	return converted;
}

/* Return a new reference to the shared zero; the argument is ignored. */
static s_mnode* bigint_zero (s_mnode* dummy)
{
	(void) dummy;
	return copy_mnode(p_integer_zero);
}

/* Return a new reference to the shared one; the argument is ignored. */
static s_mnode* bigint_one (s_mnode* dummy)
{
	(void) dummy;
	return copy_mnode(p_integer_one);
}

static s_mnode* bigint_build (const char *str)
{
	s_mnode* n;
	__u32 *bid;
	const char *pc;
	int isneg=0, digits, blocks, i, mainblk;

	/* A leading plus or minus is allowed */
	if (str[0] == '+')
		str++;
	else if (str[0] == '-')
		str++, isneg=1;
	/* Now there should be only digits, and at least one */
	if (str[0] == '\0')
		return mnode_error(SE_STRING, "bigint_build");
	for (pc = str; *pc; pc++)
		if (!isdigit((unsigned)(*pc)))
			return mnode_error(SE_STRING, "bigint_build");
	/* Ok, the number is valid. First skip leading zeros */
	while (str[0] == '0')
		str++;
	if ((digits = pc-str) == 0)
		return copy_mnode(p_integer_zero);
	blocks = (digits + 8) / 9;
	n = create_bigint(blocks); bid = BI(n)->d;
	/* pc now points to the end of the string */
	for (i = 0; i < blocks-1; i++) {
		__u32 tmp;
		tmp =          (unsigned char)pc[-9] - '0';
		tmp = 10*tmp + (unsigned char)pc[-8] - '0';
		tmp = 10*tmp + (unsigned char)pc[-7] - '0';
		tmp = 10*tmp + (unsigned char)pc[-6] - '0';
		tmp = 10*tmp + (unsigned char)pc[-5] - '0';
		tmp = 10*tmp + (unsigned char)pc[-4] - '0';
		tmp = 10*tmp + (unsigned char)pc[-3] - '0';
		tmp = 10*tmp + (unsigned char)pc[-2] - '0';
		tmp = 10*tmp + (unsigned char)pc[-1] - '0';
		bid[i] = tmp;
		pc -= 9;
	}
	/* But the last block can contain 1 to 9 digits */
	assert(str+1 <= pc && pc <= str+9);
	mainblk = 0;
	while (str < pc)
		mainblk = 10 * mainblk + (*str++ - '0');
	bid[blocks-1] = mainblk;
	return set_blocks(n, blocks, isneg);
}

/*
 * Render an Integer node as decimal text in a growable string: an
 * optional '-', the most significant limb without padding, then every
 * remaining limb as exactly nine digits.
 */
static gr_string* bigint_stringify (s_mnode* n)
{
	big_int* bi = BI(n);
	int blocks = abs(bi->blocks);
	int i, pos;
	gr_string *grs = new_gr_string(0);
	char buff[10], *mslimb;

	if (blocks == 0)
		return grs_append1(grs, '0');
	/* Not zero. Emit the sign if needed */
	if (bi->blocks < 0)
		grs = grs_append1(grs, '-');
	/* Leading digits (at most 9), printed without zero padding */
	mslimb = u32toa(bi->d[blocks-1]);
	grs = grs_append(grs, mslimb, strlen(mslimb));
	/* Lower limbs, each zero-padded to nine digits */
	for (i = blocks-2; i >= 0; i--) {
		unsigned int limb = bi->d[i];

		for (pos = 8; pos > 0; pos--) {
			buff[pos] = limb%10 + '0';
			limb /= 10;
		}
		buff[0] = limb + '0';
		grs = grs_append(grs, buff, 9);
	}
	return grs;
}

/* Return 1 when n is non-zero, 0 otherwise. */
static int bigint_notzero (s_mnode* n)
{
	return SIGN(n) != 0;
}

/* Return 1 when n is strictly negative, 0 otherwise. */
static int bigint_isneg (s_mnode* n)
{
	return SIGN(n) < 0;
}

/*
 * Compare the magnitudes of b1 and b2, ignoring their signs. The result
 * is negative, zero or positive as |b1| is less than, equal to or greater
 * than |b2|.
 */
static int cmp_bigint (big_int *b1, big_int *b2)
{
	int len1 = abs(b1->blocks);
	int len2 = abs(b2->blocks);
	int idx;

	/* More limbs means a larger magnitude */
	if (len1 != len2)
		return len1 - len2;
	/* Same length: the most significant differing limb decides */
	idx = len1;
	while (idx-- > 0) {
		int delta = (int)(b1->d[idx]) - (int)(b2->d[idx]);

		if (delta != 0)
			return delta;
	}
	return 0;
}

/* Magnitude comparison of two nodes; see cmp_bigint() for the result. */
static inline int bigint_acompare (s_mnode* n1, s_mnode* n2)
{
	big_int *lhs = BI(n1);
	big_int *rhs = BI(n2);

	return cmp_bigint(lhs, rhs);
}

/*
 * Return a new non-negative node holding |n1| + |n2|. Callers fix up the
 * sign afterwards.
 */
static s_mnode* bigint_aadd (s_mnode* n1, s_mnode* n2)
{
	big_int *lhs = BI(n1), *rhs = BI(n2);
	int len1 = abs(lhs->blocks), len2 = abs(rhs->blocks);
	/* One extra limb for a possible final carry */
	int len = (len1 > len2 ? len1 : len2) + 1;
	int pos, carry = 0;
	s_mnode *sum = create_bigint(len);
	__u32 *out = BI(sum)->d;

	for (pos = 0; pos < len; pos++) {
		if (pos < len1)
			carry += lhs->d[pos];
		if (pos < len2)
			carry += rhs->d[pos];
		if (carry < BASE) {
			out[pos] = carry;
			carry = 0;
		} else {
			out[pos] = carry - BASE;
			carry = 1;
		}
	}
	assert(carry == 0);
	/* Drop leading zero limbs */
	for (; len > 0 && out[len-1] == 0; len--)
		;
	return set_blocks(sum, len, 0);
}

/*
 * Return a new non-negative node holding |n1| - |n2|. The caller must
 * ensure |n1| >= |n2| (the final assert checks that no borrow is left).
 * The difference is computed in a stack buffer so that the node can be
 * allocated with its final, trimmed length.
 */
static s_mnode* bigint_asub (s_mnode* n1, s_mnode* n2)
{
	big_int *minuend = BI(n1), *subtrahend = BI(n2);
	int len1 = abs(minuend->blocks), len2 = abs(subtrahend->blocks);
	int len = (len1 > len2 ? len1 : len2);
	int pos, borrow = 0;
	__u32 *scratch = alloca_u32(len);
	s_mnode *diff;

	for (pos = 0; pos < len; pos++) {
		if (pos < len1)
			borrow += minuend->d[pos];
		if (pos < len2)
			borrow -= subtrahend->d[pos];
		if (borrow >= 0) {
			scratch[pos] = borrow;
			borrow = 0;
		} else {
			scratch[pos] = borrow + BASE;
			borrow = -1;
		}
	}
	assert(borrow == 0);
	/* Drop leading zero limbs */
	for (; len > 0 && scratch[len-1] == 0; len--)
		;
	diff = create_bigint(len);
	memcpy(BI(diff)->d, scratch, len * sizeof(__u32));
	return set_blocks(diff, len, 0);
}

/* Return 1 when n1 < n2 as signed integers, 0 otherwise. */
static int bigint_lessthan (s_mnode* n1, s_mnode* n2)
{
	int s1 = SIGN(n1), s2 = SIGN(n2);
	int mag;

	/* Different signs: the sign alone decides */
	if (s1 != s2)
		return (s1 < s2);
	/* Both zero */
	if (s1 == 0)
		return 0;
	/* Same sign: compare magnitudes, reversing the order for negatives */
	mag = bigint_acompare(n1,n2);
	return (s1 < 0) ? (mag > 0) : (mag < 0);
}

/* Return 1 when n1 and n2 are different integers, 0 when they are equal. */
static int bigint_differ (s_mnode* n1, s_mnode* n2)
{
	/*
	 * A different signed block count means a different sign or length;
	 * otherwise the limbs have to be compared.
	 */
	return BI(n1)->blocks != BI(n2)->blocks
	    || bigint_acompare(n1,n2) != 0;
}

/*
 * Signed addition, built on the magnitude helpers bigint_aadd() and
 * bigint_asub(). Returns a new reference in every case.
 */
static s_mnode* bigint_add (s_mnode* n1, s_mnode* n2)
{
	int s1 = SIGN(n1), s2 = SIGN(n2);
	int order, negative;
	s_mnode *n;

	/* Adding zero returns the other operand */
	if (s1 == 0)
		return copy_mnode(n2);
	if (s2 == 0)
		return copy_mnode(n1);

	if (s1 == s2) {
		/* Same sign: add magnitudes, keep the common sign */
		n = bigint_aadd(n1,n2);
		negative = (s1 < 0);
	} else {
		/* Opposite signs: subtract the smaller magnitude from the larger */
		order = bigint_acompare(n1,n2);
		if (order == 0)
			return bigint_zero(NULL);
		if (order > 0) {
			n = bigint_asub(n1, n2);
			negative = (s1 < 0);
		} else {
			n = bigint_asub(n2, n1);
			negative = (s2 < 0);
		}
	}
	if (negative)
		BI(n)->blocks = -BI(n)->blocks;
	return n;
}

/*
 * Signed subtraction n1 - n2, built on the magnitude helpers
 * bigint_aadd() and bigint_asub(). Returns a new reference in every case.
 */
static s_mnode* bigint_sub (s_mnode* n1, s_mnode* n2)
{
	int s1 = SIGN(n1), s2 = SIGN(n2);
	int order, negative;
	s_mnode *n;

	if (s1 == 0)
		return bigint_negate(n2);
	if (s2 == 0)
		return copy_mnode(n1);

	if (s1 != s2) {
		/* Opposite signs: magnitudes add up, the sign is that of n1 */
		n = bigint_aadd(n1,n2);
		negative = (s1 < 0);
	} else {
		/* Same sign: the result is the difference of the magnitudes */
		order = bigint_acompare(n1,n2);
		if (order == 0)
			return bigint_zero(NULL);
		if (order > 0) {
			/* |n1| > |n2|: the result keeps the sign of n1 */
			n = bigint_asub(n1,n2);
			negative = (s1 < 0);
		} else {
			/* |n1| < |n2|: the result has the opposite sign */
			n = bigint_asub(n2,n1);
			negative = (s2 > 0);
		}
	}
	if (negative)
		BI(n)->blocks = -BI(n)->blocks;
	return n;
}
	
/*
 * Return -n. Zero is returned as a new reference to n itself; any other
 * value is copied into a fresh node with the opposite sign.
 */
static s_mnode* bigint_negate (s_mnode* n)
{
	int stored = BI(n)->blocks;
	int len = abs(stored);
	s_mnode *neg;

	if (len == 0)
		return copy_mnode(n);
	neg = create_bigint(len);
	memcpy(BI(neg)->d, BI(n)->d, len * sizeof(__u32));
	/* Same limbs, opposite sign on the block count */
	BI(neg)->blocks = -stored;
	return neg;
}

/*
 * The previous (naive) algorithm, reimplemented with minor changes in
 * bi_trivial_mul(), runs in O(N*M) where N and M are the numbers of blocks
 * of the factors. When N=M it is an O(N^2) algorithm. The following one,
 * described in Knuth, "Seminumerical Algorithms", sect. 4.3.3, formula (2),
 * runs in O(N^(log3/log2)). bi_knuth_mul2() only recurses when N is big
 * enough (N > 8); otherwise it reverts to the trivial method.
 * The formula used here is
 *
 *  (b.U1+U0).(b.V1+V0) == (b^2+b).U1.V1 - b.(U1-U0).(V1-V0) + (b+1).U0.V0
 *
 * which uses three multiplications instead of four.
 */

/*
 * Multiply n1 by the single limb v and return the product as a new node.
 * The result is negative when exactly one of "n1 is negative" and
 * "isneg is non-zero" holds. Callers in this file pass a non-zero v.
 */
static s_mnode* bigint_mul1 (s_mnode* n1, __u32 v, int isneg)
{
	s_mnode *n;
	big_int *b, *b1 = BI(n1);
	int bl1 = abs(b1->blocks), bl;
	__u32 *bd, *b1d, carry, th, tl, nh, nl;

	/* The product needs at most one limb more than n1 */
	bl = bl1 + 1;
	n = create_bigint(bl);
	b = BI(n); bd = b->d; b1d = b1->d;
	carry = 0;
	while (bl1--) {
		/*
		 * NOTE(review): umul_ppmm/add_ssaaaa/udiv_qrnnd come from
		 * mp-arch.h and are presumably the usual double-word
		 * primitives: (th:tl) = limb * v, (nh:nl) = (th:tl) + carry,
		 * then carry = quotient and *bd = remainder of (nh:nl) / BASE.
		 */
		umul_ppmm(th, tl, *b1d, v);
		add_ssaaaa(nh, nl, th, tl, 0, carry);
		udiv_qrnnd(carry, *bd, nh, nl, BASE);
		b1d++; bd++;
	}
	/* Either store the final carry as the top limb or drop that limb */
	if (carry)
	  *bd = carry;
	else
	  --bl;
	return set_blocks(n, bl, (b1->blocks < 0) ^ isneg);
}

/*
 * Schoolbook multiplication of two limb arrays: d1 (l1 limbs) times
 * d2 (l2 limbs). The product is written to d, which must have room for
 * l1+l2 limbs and must not overlap the inputs (it is cleared first).
 */
static void bi_trivial_mul (__u32 *d1, int l1, __u32 *d2, int l2, __u32 *d)
{
	int i, j;
	__u32 th1, tl1, th2, tl2, f, carry;

	memset(d, 0, (l1+l2) * sizeof(__u32));
	for (i = 0; i < l1; i++) {
		f = d1[i];
		carry = 0;
		for (j = 0; j < l2; j++) {
			/*
			 * NOTE(review): macros from mp-arch.h, presumably
			 * (th1:tl1) = f * d2[j]; (th2:tl2) = that plus the
			 * limb already in d[i+j] and the running carry; then
			 * carry = quotient, d[i+j] = remainder modulo BASE.
			 */
			umul_ppmm(th1, tl1, f, d2[j]);
			add_ssaaaa(th2, tl2, th1, tl1, 0, d[i+j] + carry);
			udiv_qrnnd(carry, d[i+j], th2, tl2, BASE);
		}
		/* Row i ends with its carry just above the last limb touched */
		d[i+l2] = carry;
	}
}

/*
 * This helper function expects two N-arrays (d1,d2) of base-BASE (10^9)
 * limbs, where N is a power of two, and puts the result in d, which must
 * be the address of an already allocated 2N-array.
 */

/*
 * Recursive three-multiplication product of two N-limb arrays.
 * With U = d1, V = d2 split into low halves (U0, V0) and high halves
 * (U1, V1) of n = N/2 limbs each, the result is assembled in d as
 *   (b^2+b).U1.V1 + (b+1).U0.V0 - b.(U1-U0).(V1-V0),  b = BASE^n.
 * Temporaries are taken from the stack with alloca.
 */
static void bi_knuth_mul2 (__u32 *d1, __u32 *d2, __u32 *d, int N)
{
	int i, n, sign1, sign2, carry, diff;
	__u32 *d3, *d4, *d5;

#if 1
	if (N <= 8) {
		/* Revert to the trivial method */
		bi_trivial_mul(d1, N, d2, N, d);
		return;
	}
#else
	/*
	 * Disabled alternative base case.
	 * NOTE(review): the partial products are held in __u32, which looks
	 * too narrow for limbs up to BASE-1; check before enabling.
	 */
	if (N == 4) {
		/* Hand-coded 4x4 multiplication */
		__u32 a0, a1, a2, a3, a4, a5, a6;

		a0 = (__u32)d1[0] * d2[0];
		d[0] = a0 % BASE;
		a1 = a0 / BASE + (__u32)d1[0] * d2[1] + (__u32)d1[1] * d2[0];
		d[1] = a1 % BASE;
		a2 = a1 / BASE + (__u32)d1[0] * d2[2] + (__u32)d1[1] * d2[1]
			+ (__u32)d1[2] * d2[0];
		d[2] = a2 % BASE;
		a3 = a2 / BASE + (__u32)d1[0] * d2[3] + (__u32)d1[1] * d2[2]
			+ (__u32)d1[2] * d2[1] + (__u32)d1[3] * d2[0];
		d[3] = a3 % BASE;
		a4 = a3 / BASE + (__u32)d1[1] * d2[3] + (__u32)d1[2] * d2[2]
			+ (__u32)d1[3] * d2[1];
		d[4] = a4 % BASE;
		a5 = a4 / BASE + (__u32)d1[2] * d2[3] + (__u32)d1[3] * d2[2];
		d[5] = a5 % BASE;
		a6 = a5 / BASE + (__u32)d1[3] * d2[3];
		d[6] = a6 % BASE;
		d[7] = a6 / BASE;
		return;
	}
#endif
	n = N/2;
	memset(d, 0, (4*n) * sizeof(__u32));
	/* d3 receives each of the three half-size products in turn */
	d3 = alloca_u32(2*n);
	/* First product: U0.V0 */
	bi_knuth_mul2(d1, d2, d3, n);
	/* Multiply "d3" by BASE^n+1 */
	for (i = 0; i < 2*n; i++) {
		d[i] += d3[i];
		d[i+n] += d3[i];
	}
	/* Propagate the carry */
	carry = 0;
	for (i = 0; i < 4*n; i++) {
		diff = carry + d[i];
		carry = 0;
		/* A limb may hold a small multiple of BASE after the additions */
		while (diff >= BASE)
			diff -= BASE, ++carry;
		d[i] = diff;
	}
	assert(carry == 0);

	/* Second product: U1.V1 */
	bi_knuth_mul2(d1+n, d2+n, d3, n);
	/* Multiply "d3" by BASE^2n+BASE^n */
	for (i = 0; i < 2*n; i++) {
		d[i+n] += d3[i];
		d[i+2*n] += d3[i];
	}
	/* Propagate the carry */
	carry = 0;
	for (i = 0; i < 4*n; i++) {
		diff = carry + d[i];
		carry = 0;
		while (diff >= BASE)
			diff -= BASE, ++carry;
		d[i] = diff;
	}
	assert(carry == 0);

	/* sign1 is the sign of U1-U0 */
	sign1 = 0;
	for (i = n-1; i >= 0; i--) {
		diff = (int)(d1[i+n]) - (int)(d1[i]);
		if (diff) {
			sign1 = (diff > 0) ? 1 : -1;
			break;
		}
	}
	/* U1 == U0: the cross term vanishes and d is already complete */
	if (sign1 == 0)
		return;

	/* sign2 is the sign of V1-V0 */
	sign2 = 0;
	for (i = n-1; i >= 0; i--) {
		diff = (int)(d2[i+n]) - (int)(d2[i]);
		if (diff) {
			sign2 = (diff > 0) ? 1 : -1;
			break;
		}
	}
	/* V1 == V0: same remark */
	if (sign2 == 0)
		return;

	/* Now calculate abs(U1-U0) and abs(V1-V0) */
	d4 = alloca_u32(n);
	carry = 0;
	for (i = 0; i < n; i++) {
		/* Always subtract the smaller half from the larger one */
		if (sign1 > 0)
			diff = (carry + d1[i+n]) - d1[i];
		else
			diff = (carry + d1[i]) - d1[i+n];
		if (diff < 0) {
			d4[i] = diff + BASE;
			carry = -1;
		} else {
			d4[i] = diff;
			carry = 0;
		}
	}
	assert(carry == 0);
	/* carry is zero here, so the next loop starts without a borrow */
	d5 = alloca_u32(n);
	for (i = 0; i < n; i++) {
		if (sign2 > 0)
			diff = (carry + d2[i+n]) - d2[i];
		else
			diff = (carry + d2[i]) - d2[i+n];
		if (diff < 0) {
			d5[i] = diff + BASE;
			carry = -1;
		} else {
			d5[i] = diff;
			carry = 0;
		}
	}
	assert(carry == 0);
	/* Third product: |U1-U0|.|V1-V0| */
	bi_knuth_mul2(d4, d5, d3, n);

	carry = 0;
	if (sign1 != sign2) {
		/* (U1-U0).(V1-V0) is negative: add d3 at offset n */
		for (i = 0; i < 2*n; i++) {
			diff = (carry + d[i+n]) + d3[i];
			if (diff >= BASE) {
				d[i+n] = diff - BASE;
				carry = 1;
			} else {
				d[i+n] = diff;
				carry = 0;
			}
		}
		/* Carry on through the top quarter */
		for (i = 3*n; i < 4*n; i++) {
			diff = carry + d[i];
			if (diff >= BASE) {
				d[i] = diff - BASE;
				carry = 1;
			} else {
				d[i] = diff;
				carry = 0;
			}
		}
	} else {
		/* (U1-U0).(V1-V0) is positive: subtract d3 at offset n */
		for (i = 0; i < 2*n; i++) {
			diff = (carry + d[i+n]) - d3[i];
			if (diff < 0) {
				d[i+n] = diff + BASE;
				carry = -1;
			} else {
				d[i+n] = diff;
				carry = 0;
			}
		}
		/* Borrow on through the top quarter */
		for (i = 3*n; i < 4*n; i++) {
			diff = carry + d[i];
			if (diff < 0) {
				d[i] = diff + BASE;
				carry = -1;
			} else {
				d[i] = diff;
				carry = 0;
			}
		}
	}
	assert(carry == 0);
}

/*
 * Multiply d1 (l1 limbs) by d2 (l2 limbs) into d (l1+l2 limbs).
 * Requires l1 <= l2. Short operands (l1 < 15) go to bi_trivial_mul();
 * otherwise the operands are zero-padded to a power-of-two length and
 * handed to bi_knuth_mul2(), cutting d2 into pieces when it is much
 * longer than d1. Padding buffers are taken from the stack with alloca.
 */
static void bi_knuth_mul (__u32 *d1, int l1, __u32 *d2, int l2, __u32 *d)
{
	__u32 *D1, *D2, *D;
	int i, j, left, shift, carry, pieces, N;

#if 0
	if (l1 > l2) {
		int tmp; __u32 *ptp;
		tmp = l1; l1 = l2; l2 = tmp;
		ptp = d1; d1 = d2; d2 = ptp;
	}
#endif
	assert(l1 <= l2);
	if (l1 < 15) {
		bi_trivial_mul(d1, l1, d2, l2, d);
		return;
	}
	/* N = smallest power of two that is >= l2 */
	for (N = 1; N < l2; N <<= 1)
		;
	assert(l2 <= N && (N&(N-1)) == 0);
	if (l1 <= N/2) {
		/* Break d2 into pieces */
		/* N = smallest power of two that is >= l1 (the piece length) */
		for (N = 1; N < l1; N <<= 1)
			;
		assert(l1 <= N && (N&(N-1)) == 0);
		pieces = (l2 + N - 1) / N;
		D1 = alloca_u32(N);
		D2 = alloca_u32(N);
		/* D1 is d1 zero-padded to N limbs */
		memset(D1, 0, N * sizeof(__u32));
		memcpy(D1, d1, l1 * sizeof(__u32));
		memset(d, 0, (l1+l2) * sizeof(__u32));
		D = alloca_u32(2*N);
		for (i = 0; i < pieces; i++) {
			shift = i*N;
			/* Number of limbs of d2 covered by this piece */
			left = (l2-shift < N)? l2-shift : N;
			if (left < N) {
				/* The last piece may be shorter */
				memset(D2, 0, N*sizeof(__u32));
				memcpy(D2, d2+shift, left*sizeof(__u32));
				bi_knuth_mul2(D1, D2, D, N);
			} else
				bi_knuth_mul2(D1, d2 + shift, D, N);
			/* Now add D shifted to d */
			carry = 0;
			for (j = 0; j < l1+left; j++) {
				carry += d[j+shift] + D[j];
				if (carry >= BASE) {
					d[j+shift] = carry - BASE;
					carry = 1;
				} else {
					d[j+shift] = carry;
					carry = 0;
				}
			}
			if (carry) {
				/* Ripple the leftover carry into the higher limbs */
				j += shift;
				while (++d[j] >= BASE)
					d[j] -= BASE, j++ ;
			}
			assert(j < l1+l2);
		}
	} else {
		/* The operands have roughly the same size */
		D1 = alloca_u32(N);
		D2 = alloca_u32(N);
		D = alloca_u32(2*N);
		/* Zero-pad both operands to N limbs */
		memset(D1, 0, N * sizeof(__u32));
		memcpy(D1, d1, l1 * sizeof(__u32));
		memset(D2, 0, N * sizeof(__u32));
		memcpy(D2, d2, l2 * sizeof(__u32));
		bi_knuth_mul2(D1, D2, D, N);
		/* Only the low l1+l2 limbs of the 2N-limb product are kept */
		memcpy(d, D, (l1+l2) * sizeof(__u32));
	}
}

/*
 * Signed multiplication. Zero and single-limb factors are handled by
 * shortcuts; everything else goes through bi_knuth_mul() with the shorter
 * operand first.
 */
static s_mnode* bigint_mul (s_mnode* n1, s_mnode* n2)
{
	s_mnode* n;
	big_int *b1 = BI(n1), *b2 = BI(n2), *b;
	int bl1 = abs(b1->blocks), bl2 = abs(b2->blocks), bl, v, neg;

	/* Anything times zero is zero */
	if (!bl1 || !bl2)
		return copy_mnode(p_integer_zero);
	/* n1 is a single limb: scale n2 by it */
	if (bl1 == 1) {
		neg = !(b1->blocks > 0);
		v = b1->d[0];
		if (v != 1)
			return bigint_mul1(n2, v, neg);
		return neg ? bigint_negate(n2) : copy_mnode(n2);
	}
	/* n2 is a single limb: scale n1 by it */
	if (bl2 == 1) {
		neg = !(b2->blocks > 0);
		v = b2->d[0];
		if (v != 1)
			return bigint_mul1(n1, v, neg);
		return neg ? bigint_negate(n1) : copy_mnode(n1);
	}
	/* Now the general case */
	bl = bl1 + bl2;
	n = create_bigint(bl);
	b = BI(n);
	/* bi_knuth_mul() wants the shorter operand first */
	if (bl1 > bl2)
		bi_knuth_mul(b2->d, bl2, b1->d, bl1, b->d);
	else
		bi_knuth_mul(b1->d, bl1, b2->d, bl2, b->d);

	/* The number of blocks is either bl or bl-1 */
	if (b->d[bl-1] == 0)
		bl -= 1;
	return set_blocks(n, bl, (b1->blocks<0) ^ (b2->blocks<0));
}

/*
 * Divide n1 by the single limb v and return the quotient as a new node;
 * the remainder is computed but discarded. The quotient is negative when
 * exactly one of "n1 is negative" and "isneg is non-zero" holds.
 * v must be non-zero.
 */
static s_mnode* bigint_div1 (s_mnode* n1, __u32 v, int isneg)
{
	big_int *b, *b1 = BI(n1);
	int bl1 = abs(b1->blocks), bl, i;
	__u32 carry, *b1d, *bd;
	__u32 th, tl, nh, nl;
	s_mnode *n;

	n = create_bigint(bl1);
	b = BI(n); b1d = b1->d; bd = b->d;
	carry = 0;
	/* Short division, from the most significant limb downwards */
	for (i = bl1-1; i >= 0; i--) {
		/*
		 * NOTE(review): macros from mp-arch.h, presumably
		 * (nh:nl) = carry * BASE + b1d[i], then bd[i] = quotient
		 * and carry = remainder of (nh:nl) / v.
		 */
		umul_ppmm(th, tl, carry, BASE);
		add_ssaaaa(nh, nl, th, tl, 0, b1d[i]);
		udiv_qrnnd(bd[i], carry, nh, nl, v);
	}
	/* Now carry contains the remainder of the division */
	bl = bl1;
	/* Drop leading zero limbs of the quotient */
	while (bl > 0 && bd[bl-1] == 0)
		--bl;
	return set_blocks(n, bl, (b1->blocks < 0) ^ isneg);
}

/*
 * Multi-precision division routine. A faithful implementation of the
 * algorithm D given in Knuth, "Seminumerical Algorithms", section 4.3.1.
 */

/*
 * Quotient of n1 by a divisor n2 of at least two limbs (v2 below reads
 * the second most significant limb of the divisor). The quotient of the
 * magnitudes is computed and given a negative sign when the operands
 * have opposite signs; the remainder is discarded.
 */
static s_mnode* bigint_divm (s_mnode* n1, s_mnode* n2)
{
	big_int *b1 = BI(n1), *b2 = BI(n2), *b;
	int bl1 = abs(b1->blocks), bl2 = abs(b2->blocks);
	int tmp, i, j, k, m, bl, qh;
	__u32 carry, *bd, *bd1, *bd2, *bd1o, *bd2o, v1, v2, *U, *V;
	__u32 normf, t1h, t1l, t2h, t2l, t3h, t3l, quot, rem;
	s_mnode *n;

	/* A dividend shorter than the divisor gives a zero quotient */
	if ((m = bl1-bl2) < 0)
		return bigint_zero(NULL);
	bl = m+1;
	n = create_bigint(bl); b = BI(n); bd = b->d;

	/* D1: Normalize dividend and divisor */
	bd1o = b1->d; bd2o = b2->d;
	/* Scale factor that makes the divisor's top limb at least BASE/2 */
	normf = BASE / (bd2o[bl2-1] + 1);
	/* Working copy of the dividend, with one extra high limb */
	bd1 = alloca_u32(bl1 + 1);
	if (normf == 1) {
		/* No scaling needed: use the divisor in place */
		bd2 = bd2o;
		memcpy(bd1, bd1o, bl1*sizeof(__u32));
		bd1[bl1] = 0;
	} else {
		bd2 = alloca_u32(bl2);
		carry = 0;
		/* bd2 = normf * divisor */
		for (i = 0; i < bl2; i++) {
			umul_ppmm(t1h, t1l, normf, bd2o[i]);
			add_ssaaaa(t2h, t2l, t1h, t1l, 0, carry);
			udiv_qrnnd(quot, rem, t2h, t2l, BASE);
			bd2[i] = rem;
			carry = quot;
		}
		assert(carry == 0);
		/* bd1 = normf * dividend; the overflow goes to the extra limb */
		for (i = 0; i < bl1; i++) {
			umul_ppmm(t1h, t1l, normf, bd1o[i]);
			add_ssaaaa(t2h, t2l, t1h, t1l, 0, carry);
			udiv_qrnnd(quot, rem, t2h, t2l, BASE);
			bd1[i] = rem;
			carry = quot;
		}
		bd1[bl1] = carry;
	}
	/* The two leading limbs of the normalized divisor */
	v1 = bd2[bl2-1];
	v2 = bd2[bl2-2];

	/* Main loop D2--D7 */
	for (j = 0; j <= m; j++) {
		/* D3: Calculate qh */
		/* Estimate from the top two dividend limbs divided by v1 */
		umul_ppmm(t1h, t1l, bd1[bl1-j], BASE);
		add_ssaaaa(t2h, t2l, t1h, t1l, 0, bd1[bl1-j-1]);
		udiv_qrnnd(qh, rem, t2h, t2l, v1);
		if (qh == BASE)
			--qh, rem += v1;

		/* Decrement if necessary */
		while ((__u64)v2*qh > (__u64)BASE * rem + bd1[bl1-j-2]) {
			--qh;
			rem += v1;
		}
		assert(0 <= qh && qh < BASE);

		/* D4: Multiply and subtract */
		carry = 0;
		U = &bd1[bl1-j-bl2];
		V = bd2; k = bl2;
		do {
			/* (t2h:t2l) = qh * (*V) + carry, to be taken off *U */
			umul_ppmm(t1h, t1l, qh, *V); V++;
			add_ssaaaa(t2h, t2l, t1h, t1l, 0, carry);
			if (!t2h && *U >= t2l) {
				/* Fits in this limb: no borrow */
				tmp = *U - t2l;
				carry = 0;
			} else {
				/* Split the excess into a borrow and the new limb */
				sub_ddmmss(t3h, t3l, t2h, t2l, 0, *U);
				add_ssaaaa(t2h, t2l, t3h, t3l, 0, BASE-1);
				udiv_qrnnd(quot, rem, t2h, t2l, BASE);
				carry = quot;
				tmp = (BASE-1) - rem;
			}
			*U = tmp; U++;
		} while (--k);
		/* tmp is non-zero when the subtraction went below zero */
		tmp = (*U < carry);
		*U -= carry;

		/* D5: Test remainder */
		if (tmp) {
			/* D6: Add back. */
			--qh; carry = 0;
			U = &bd1[bl1-j-bl2];
			V = bd2; k = bl2;
			do {
				/* This requires BASE < 2^30 */
				tmp = carry + (*U) + (*V++);
				if (tmp >= BASE)
					carry = 1, tmp -= BASE;
				else
					carry = 0;
				*U++ = tmp;
			} while (--k);
			*U += carry;
		}
		/* Store digit */
		bd[bl-1-j] = qh;
	}
	/* Drop leading zero limbs of the quotient */
	while (bl > 0 && bd[bl-1] == 0)
		--bl;
	return set_blocks(n, bl, (b1->blocks < 0) ^ (b2->blocks < 0));
}

/*
 * Signed division n1 / n2 (quotient only). Division by zero yields
 * mnode_error(SE_DIVZERO); single-limb divisors use the fast paths and
 * longer ones go to bigint_divm().
 */
static s_mnode* bigint_div (s_mnode* n1, s_mnode* n2)
{
	big_int *b2 = BI(n2);
	int divisor_blocks = b2->blocks;
	int v;

	if (divisor_blocks == 0)
		return mnode_error(SE_DIVZERO, "bigint_div");
	/* Divisors of two or more limbs need the full algorithm */
	if (divisor_blocks != 1 && divisor_blocks != -1)
		return bigint_divm(n1, n2);

	v = b2->d[0];
	if (v == 1) {
		/* Dividing by +1 or -1 only affects the sign */
		if (divisor_blocks > 0)
			return copy_mnode(n1);
		return bigint_negate(n1);
	}
	return bigint_div1(n1, v, divisor_blocks < 0);
}

/*
 * Square root of a positive integer. This is horribly inefficient; we're
 * using the recursive formula
 *
 *	floor_sqrt(n) = 2.floor_sqrt(n/4)+x  where x is 0 or 1.
 */

/*
 * Integer square root of a strictly positive n, computed recursively:
 * the root of n is either 2*root(n/4) or that value plus one.
 */
static s_mnode* bigint_sqrt_gt0 (s_mnode* n)
{
	big_int *bi = BI(n);
	s_mnode *four, *quarter, *half_root, *low, *high, *high_sq;
	int too_big;

	/* Base case: n is between 1 and 3, so the root is 1 */
	if (bi->blocks == 1 && bi->d[0] < 4)
		return bigint_one(n);

	four = integer_new(4);
	quarter = bigint_div(n, four);
	half_root = bigint_sqrt_gt0(quarter);
	unlink_mnode(quarter);
	unlink_mnode(four);

	/* Candidates: low = 2*root(n/4) and high = low+1 */
	low = bigint_add(half_root, half_root);
	unlink_mnode(half_root);
	high = bigint_add(low, p_integer_one);

	/* Keep high unless its square exceeds n */
	high_sq = bigint_mul(high, high);
	too_big = (bigint_acompare(high_sq,n) > 0);
	unlink_mnode(high_sq);
	if (too_big) {
		unlink_mnode(high);
		return low;
	}
	unlink_mnode(low);
	return high;
}

/*
 * Integer square root. Zero maps to itself, positive values go through
 * bigint_sqrt_gt0(), and negative values yield mnode_error(SE_OODOMAIN).
 */
static s_mnode* bigint_sqrt (s_mnode* n)
{
	int sign = SIGN(n);

	if (sign == 0)
		return copy_mnode(n);
	if (sign == 1)
		return bigint_sqrt_gt0(n);
	return mnode_error(SE_OODOMAIN, "bigint_sqrt");
}

/*
 * Divide b by two in place, rounding down. b must be non-negative (its
 * block count is used directly as the limb count); zero is left alone.
 */
static void halve_bigint (big_int *b)
{
	int len = b->blocks;
	int idx;
	__u32 carry = 0, limb;

	if (len == 0)
		return;			/* Zero is not modified */
	/* A top limb of 0 or 1 becomes 0 once halved: the number shrinks */
	if (b->d[len-1] < 2)
		b->blocks = len-1;
	/* Shift right from the top limb; an odd limb hands BASE/2 downwards */
	for (idx = len-1; idx >= 0; idx--) {
		limb = b->d[idx];
		b->d[idx] = (limb >> 1) + carry;
		carry = (limb&1) ? BASE/2 : 0;
	}
}

/*
 * Greatest common divisor of n1 and n2, always returned non-negative.
 * After the shortcuts for 0 and 1 and an optional Euclidean reduction,
 * the work is done by a binary (halve-and-subtract) GCD on stack copies
 * of the operands.
 */
static s_mnode* bigint_gcd (s_mnode* n1, s_mnode* n2)
{
	s_mnode *n, *n1a, *n2a;
	big_int *b1, *b2, *bs;
	int bl1 = BI(n1)->blocks, bl2 = BI(n2)->blocks;
	int pow2, i, c, carry;

	/* Find the absolute values */
	if (bl1 < 0)
		n1a = mnode_negate(n1), bl1 = -bl1;
	else
		n1a = copy_mnode(n1);
	if (bl2 < 0)
		n2a = mnode_negate(n2), bl2 = -bl2;
	else
		n2a = copy_mnode(n2);
	/* Order the operands so that n1a holds the smaller one */
	if (cmp_bigint(BI(n1a),BI(n2a)) > 0)
		n = n1a, n1a = n2a, n2a = n;
	/* Now we can assume n1 <= n2 */
	bl1 = abs(BI(n1a)->blocks);
	bl2 = abs(BI(n2a)->blocks);
	assert(0 <= bl1 && bl1 <= bl2);
	if (bl1 == 0) {
		/* n1 is zero */
		unlink_mnode(n1a);
		return n2a;
	}
	if (bl1 == 1 && BI(n1a)->d[0] == 1) {
		/* n1 is 1 (absolute values were taken above) */
		unlink_mnode(n2a);
		return n1a;
	}
	/*
	 * If the numbers have very different sizes, it's a good idea to
	 * apply the first step of Euclid's algorithm, i.e., to replace
	 * n2 with n2%n1.
	 * This optimization is very good for the PARI benchmark.
	 */
	if (2*bl1 < bl2) {
		s_mnode *n3, *n4;
		/* n2 % n1 computed as n2 - n1*(n2/n1) */
		n3 = mnode_div(n2a, n1a);
		n4 = mnode_mul(n1a, n3);
		unlink_mnode(n3);
		n3 = mnode_sub(n2a, n4);
		unlink_mnode(n4);
		unlink_mnode(n2a);
		n2a = n3;
		if ((bl2 = abs(BI(n2a)->blocks)) == 0) {
			/* n1 divides n2 exactly, so n1 is the gcd */
			unlink_mnode(n2a);
			return n1a;
		}
	}
	/* Work on stack copies so the operands can be modified in place */
	b1 = alloca(sizeof(big_int)+bl1*sizeof(__u32));
	b2 = alloca(sizeof(big_int)+bl2*sizeof(__u32));
	memcpy(b1, BI(n1a), sizeof(big_int)+bl1*sizeof(__u32));
	memcpy(b2, BI(n2a), sizeof(big_int)+bl2*sizeof(__u32));
	unlink_mnode(n1a);
	unlink_mnode(n2a);
	/* Find the biggest power of two dividing n1 and n2 */
	for (pow2 = 0; ; pow2++) {
		/* BASE is even, so the lowest limb carries the parity */
		if ((b1->d[0] & 1) || (b2->d[0] & 1))
			break;
		halve_bigint(b1);
		halve_bigint(b2);
	}
	/* Main loop */
	while (1) {
		/* Reduce n1 and n2 to odd numbers */
		while ((b1->d[0] & 1) == 0)
			halve_bigint(b1);
		while ((b2->d[0] & 1) == 0)
			halve_bigint(b2);
		/* Equal odd parts: that common value is the odd part of the gcd */
		if ((c = cmp_bigint(b1, b2)) == 0)
			break;
		if (c < 0)
			bs = b1, b1 = b2, b2 = bs;
		/* Now we can assume n1 > n2 */
		/* Replace n1 with n1 - n2, in place */
		carry = 0;
		bl1 = b1->blocks;
		bl2 = b2->blocks;
		for (i = 0; i < bl1; i++) {
			carry += b1->d[i];
			if (i < bl2)
				carry -= b2->d[i];
			if (carry < 0) {
				b1->d[i] = carry + BASE;
				carry = -1;
			} else {
				b1->d[i] = carry;
				carry = 0;
			}
		}
		assert(carry == 0);
		/* Drop leading zero limbs of the difference */
		while (bl1 > 0 && b1->d[bl1-1] == 0)
			--bl1;
		assert(bl1 > 0);
		b1->blocks = bl1;
	}
	/* Multiply n1, say, by the appropriate power of two */
	while (--pow2 >= 0) {
		/* Double b1 in place, limb by limb */
		carry = 0;
		bl1 = b1->blocks;
		for (i = 0; i < bl1; i++)
			if (b1->d[i] < BASE/2) {
				b1->d[i] = 2*b1->d[i] + carry;
				carry = 0;
			} else {
				b1->d[i] = 2*b1->d[i] + carry - BASE;
				carry = 1;
			}
		if (carry) {
			/* The number grew by one limb */
			b1->d[bl1] = 1;
			++(b1->blocks);
		}
	}
	/* Copy the result from the stack into a fresh node */
	bl1 = b1->blocks;
	n = create_bigint(bl1);
	memcpy(BI(n)->d, b1->d, bl1 * sizeof(__u32));
	return set_blocks(n, bl1, 0);
}
