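/*
 * jpeg-hyst.cc --
 *
 *      JPEG block decoder.  Each 8x8 block is Huffman-parsed into a
 *      short sequence of AC codes; blocks with few codes have their
 *      inverse DCT looked up in (or added to) a hash-table cache, and
 *      an optional hysteresis mode re-renders a block only when its
 *      coefficients have moved far enough from the cached copy.
 *      Contains the base decoder plus YUV, gray-scale (colormapped,
 *      optionally pixel-doubled) and differencing variants.
 */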

/*
 * This code is derived from the Independent JPEG Group's JPEG software:
 *
 * Copyright (C) 1991, 1992, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying
 * README.IJPG file.
 */

#include "jpeg.h"

#include <stdlib.h>
#include <stdio.h>
#include <sys/param.h>
#include <netinet/in.h>

extern "C" {
void j_rev_dct(short*);
void init_pre_idct();
}

#include <bstring.h>

#ifdef __alpha
#include <machine/endian.h>
#endif


/*
 * These two macros stolen from nv.
 *
 * Both expand to an expression that assigns to a variable named `t',
 * so an int `t' must be in scope wherever they are used.  Because of
 * that hidden assignment, use at most one expansion per statement:
 * two expansions inside one expression would modify `t' without
 * sequencing.  The shifts by 31 assume a 32-bit int with arithmetic
 * right shift.
 */
/*
 * Sick little macro which will limit x to [0..255] with logical ops.
 * Negative x yields 0 and in-range x is returned unchanged, but x > 255
 * yields a value with all bits set, so callers mask the result with
 * 0xff to obtain 255.
 */
#define UCLIMIT(x) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
/*
 * A variant of above which will limit x to [-128..127].
 * NOTE(review): UCLIMIT is not masked here, so on the high side this
 * evaluates to -1 - 128 rather than 127; it does not appear to be used
 * in this file -- confirm before relying on it.
 */
#define SCLIMIT(x) (UCLIMIT((x)+128)-128)

/*
 * ZAG[i] is the natural-order position of the i'th element of zigzag order.
 * If the incoming data is corrupted, huff_decode_mcu could attempt to
 * reference values beyond the end of the array.  To avoid a wild store,
 * we put some extra zeroes after the real entries.
 *
 * In this file the table is indexed by huffblock() (which also rejects
 * k >= 64 before storing), blkthresh() and diff().
 */
static const int ZAG[] = {
	0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63,
	/* extra entries in case k>63 below */
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};

/*
 * Build a decoder whose geometry, quantization tables and Huffman
 * tables are taken from `c'.  All of the work is done by reconfig().
 */
JpegDecoder::JpegDecoder(const config& c)
{
	this->reconfig(c);
}

/*
 * Load the stream parameters from `c': image size, colour space,
 * quantization tables, per-component sampling factors and table
 * selectors, and the DC/AC Huffman tables.  Finishes by recomputing
 * the MCU layout via interleaved_scan_setup().
 *
 * NOTE(review): each call stores freshly built Huffman lookup tables
 * (see huffbuild()) without releasing whatever the slots held before.
 * The slots are not initialised anywhere in this file, so they cannot
 * safely be freed here; confirm ownership against the class
 * declaration.
 */
void JpegDecoder::reconfig(const config& c)
{
	/*
	 * Declared once for both loops: the loop index of a `for' is only
	 * in scope inside that loop under standard C++ rules.
	 */
	int i;

	width = c.width;
	height = c.height;
	jpeg_color_space = c.cs;
	bcopy(c.qtab, qtab, sizeof(qtab));
	ncomp = c.ncomp;

	/* Copy the component descriptors, tracking the largest factors. */
	max_h_samp_factor = 1;
	max_v_samp_factor = 1;
	for (i = ncomp; --i >= 0; ) {
		comp[i].id = c.comp[i].id;
		int v = comp[i].h_samp_factor = c.comp[i].h_samp_factor;
		if (max_h_samp_factor < v)
			max_h_samp_factor = v;
		v = comp[i].v_samp_factor = c.comp[i].v_samp_factor;
		if (max_v_samp_factor < v)
			max_v_samp_factor = v;
		comp[i].quant_tbl_no = c.comp[i].quant_tbl_no;
		comp[i].dc_tbl_no = c.comp[i].dc_tbl_no;
		comp[i].ac_tbl_no = c.comp[i].ac_tbl_no;
	}

	/* Build a lookup table for every Huffman table that is present. */
	for (i = 0; i < 4; ++i) {
		if (c.dc_huffval[i] != 0)
			dc_hufftab[i] = huffbuild(c.dc_huffbits[i],
						  c.dc_huffval[i]);
		if (c.ac_huffval[i] != 0)
			ac_hufftab[i] = huffbuild(c.ac_huffbits[i],
						  c.ac_huffval[i]);
	}
	interleaved_scan_setup();
}

void JpegDecoder::init()
{
#ifdef ROWE
	init_pre_idct();
#endif
	huffreset();
}

/*
 * Round a up to the nearest multiple of b.
 * Requires a >= 0 and b > 0.
 */
static inline int jround_up(int a, int b)
{
	const int bumped = a + (b - 1);
	return ((bumped / b) * b);
}

/*
 * Derive the MCU layout for an interleaved scan from the image size
 * and the sampling factors: the number of MCUs per row and MCU rows in
 * the scan, the block dimensions of each component within an MCU, and
 * MCU_membership[], which maps every block of an MCU to its component.
 * Aborts if an MCU would need more than MAX_BLOCKS_IN_MCU blocks.
 */
void JpegDecoder::interleaved_scan_setup()
{
	/* An MCU spans 8 * (largest sampling factor) pixels each way. */
	MCU_rows_in_scan = (height + 8 * max_v_samp_factor - 1)
		/ (8 * max_v_samp_factor);
	MCUs_per_row = (width + 8 * max_h_samp_factor - 1)
		/ (8 * max_h_samp_factor);

	blocks_in_MCU = 0;

	for (short ci = 0; ci < ncomp; ++ci) {
		component& p = comp[ci];

		/*
		 * In an interleaved scan the sampling factors are the
		 * number of blocks each component contributes.
		 */
		p.MCU_width = p.h_samp_factor;
		p.MCU_height = p.v_samp_factor;
		p.MCU_blocks = p.MCU_width * p.MCU_height;

		/* Append this component's blocks to the membership map. */
		short remaining = p.MCU_blocks;
		if (blocks_in_MCU + remaining > MAX_BLOCKS_IN_MCU)
			abort();
		for (; remaining > 0; --remaining)
			MCU_membership[blocks_in_MCU++] = ci;
	}
}

/*
 * Paint an 8x8 block with one value.  `dc' is clamped to [0..255],
 * replicated into all four bytes of a word, and stored twice per row
 * for eight rows; consecutive rows are `stride' bytes apart.
 *
 * The stores go through u_word pointers, so `out' and `stride' are
 * assumed to keep each row suitably aligned -- TODO confirm on
 * strict-alignment targets.
 */
void JpegDecoder::fill(int dc, u_char* out, const int stride) const
{
	int t;		/* scratch variable required by UCLIMIT */

	dc = UCLIMIT(dc) & 0xff;
	dc |= dc << 8;
	dc |= dc << 16;

	for (int row = 8; --row >= 0; out += stride) {
		*(u_word*)out = dc;
		*(u_word*)(out + 4) = dc;
	}
}

/*
 * Render an 8x8 block of spatial values.  Every sample bp[i] has `dc'
 * added, is clamped to [0..255] and written as one byte; samples are
 * packed four to a word so that, on either byte order, they land in
 * memory in left-to-right order.  Rows of `bp' are 8 shorts apart and
 * rows of `out' are `stride' bytes apart.
 */
void JpegDecoder::mix(const int dc, const short* bp, u_char* out,
		      const int stride) const
{
/* Clamped sample i of the current row; assigns to the scratch int t. */
#define JPEG_MIX_PIX(i) (UCLIMIT(bp[i] + dc) & 0xff)

	for (int k = 8; --k >= 0; ) {
		u_word o;
		int t;		/* scratch variable required by UCLIMIT */

		/*
		 * One clamp per statement: each expansion assigns to t, so
		 * they must not share an expression.
		 */
#if BYTE_ORDER == LITTLE_ENDIAN
		o = JPEG_MIX_PIX(0);
		o |= JPEG_MIX_PIX(1) << 8;
		o |= JPEG_MIX_PIX(2) << 16;
		o |= JPEG_MIX_PIX(3) << 24;
		*(u_word *)out = o;
		o = JPEG_MIX_PIX(4);
		o |= JPEG_MIX_PIX(5) << 8;
		o |= JPEG_MIX_PIX(6) << 16;
		o |= JPEG_MIX_PIX(7) << 24;
		*(u_word *)(out + 4) = o;
#else
		o = JPEG_MIX_PIX(0) << 24;
		o |= JPEG_MIX_PIX(1) << 16;
		o |= JPEG_MIX_PIX(2) << 8;
		o |= JPEG_MIX_PIX(3);
		*(u_word *)out = o;
		o = JPEG_MIX_PIX(4) << 24;
		o |= JPEG_MIX_PIX(5) << 16;
		o |= JPEG_MIX_PIX(6) << 8;
		o |= JPEG_MIX_PIX(7);
		*(u_word *)(out + 4) = o;
#endif
		bp += 8;
		out += stride;
	}

#undef JPEG_MIX_PIX
}

/*
 * Cache of inverse-DCT results for blocks described by a short code
 * sequence.  decode_block() only caches blocks with fewer than MAXCODE
 * codes, so code[] is large enough for any cached entry.
 */
#define MAXCODE 24
struct hcnode {
	/* next node on the same hash chain */
	struct hcnode *next;
	/* number of valid entries in code[] */
	u_word n;
	/* the code sequence that identifies this block */
	u_word code[MAXCODE];
	/* 8x8 block; holds the inverse DCT once hcenter() has run */
	short block[64];
};

/*
 * Chained hash table.  HASHSIZE must be a power of two because bucket
 * indices are formed by masking with (HASHSIZE - 1).
 */
#define HASHSIZE (2*4096)
struct hcnode *hashtab[HASHSIZE];
/*FIXME*/
/* Fixed pool of nodes; nhc is the number handed out so far. */
#define NHC (2*HASHSIZE)
struct hcnode hcpool[NHC];
static int nhc;

/*
 * Obtain a cache node for a new entry.
 *
 * While the fixed pool still has unused nodes the next one is handed
 * out.  After that, buckets are scanned round-robin (continuing from
 * where the previous scan stopped) and the head of the first non-empty
 * chain is unlinked and reused.
 *
 * If the pool is exhausted and every bucket is empty, no pool node is
 * referenced by the table any more, so the pool is simply restarted;
 * without this the scan would never terminate.
 *
 * Returns a node that is not linked into the hash table.
 */
static struct hcnode *
scavenge()
{
	static int rover = 0;

	/* Fast path: the pool is not used up yet. */
	if (nhc < NHC)
		return (&hcpool[nhc++]);

	/* Visit each bucket once, starting just past the last victim. */
	int i = rover;
	for (int probes = HASHSIZE; --probes >= 0; ) {
		i = (i + 1) & (HASHSIZE - 1);
		struct hcnode *victim = hashtab[i];
		if (victim != 0) {
			rover = i;
			hashtab[i] = victim->next;
			return (victim);
		}
	}

	/* Table is completely empty: recycle the pool from the start. */
	nhc = 1;
	return (&hcpool[0]);
}

/*
 * Hash a sequence of n codes to a bucket index in [0, HASHSIZE).
 * The codes (each offset by 37) are summed, the upper half of the sum
 * is folded into the lower half, and the result is masked to the
 * table size.
 */
static inline int
hchash(int n, u_word *code)
{
	int sum = 0;

	for (int i = n - 1; i >= 0; --i)
		sum += code[i] + 37;

	const int folded = sum ^ (sum >> 16);
	return (folded & (HASHSIZE - 1));
}

/*
 * Search bucket h for a node whose code sequence equals the n codes
 * at `code'.  Returns the node, or a null pointer if there is none.
 */
static inline struct hcnode *
hclookup(int h, int n, u_word *code)
{
	struct hcnode *p = hashtab[h];

	while (p != 0) {
		/* Compare lengths first; only then the codes themselves. */
		if (p->n == n &&
		    bcmp(p->code, code, n * sizeof(*code)) == 0)
			return (p);
		p = p->next;
	}
	return (p);
}

/*
 * Finish a cache node and insert it at the head of bucket h.
 *
 * On entry p->block holds the block's coefficients.  The node is
 * tagged with its n-entry code sequence, the first coefficient is
 * cleared, and the block is replaced by its inverse DCT.
 */
static void
hcenter(struct hcnode *p, int h, int n, u_word *code)
{
	/* Record the key. */
	p->n = n;
	bcopy(code, p->code, n * sizeof(*code));

	/* Transform with the first (DC) coefficient forced to zero. */
	p->block[0] = 0;
	j_rev_dct(p->block);

	/* Link at the front of the chain. */
	p->next = hashtab[h];
	hashtab[h] = p;
}

/*
 * Statistics counters, incremented by JpegDecoder::decode_block() and
 * printed by pstats().
 */
/* cacheable blocks not found in the cache */
int hcmiss;
/* cacheable blocks found in the cache */
int hchit;
/* blocks with no AC codes at all */
int dcblk;
/* blocks with MAXCODE or more codes (never cached) */
int bblk;
/* blocks with fewer than MAXCODE codes (cache candidates) */
int sblk;

/*
 * Count the nodes on the chain starting at p (0 for an empty chain).
 */
static int
length(struct hcnode *p)
{
	int count = 0;

	while (p != 0) {
		++count;
		p = p->next;
	}
	return (count);
}

/*
 * Print cache statistics to stdout: the hit and miss counters, the
 * number of non-empty buckets, the longest chain, and the per-kind
 * block counters.
 */
void
pstats()
{
	int used = 0;		/* buckets with at least one node */
	int longest = 0;	/* length of the longest chain */

	for (int b = 0; b < HASHSIZE; ++b) {
		if (hashtab[b] != 0) {
			++used;
			const int chain = length(hashtab[b]);
			if (chain > longest)
				longest = chain;
		}
	}

	printf("hit\t%d\n", hchit);
	printf("miss\t%d\n", hcmiss);
	printf("bucket\t%d\n", used);
	printf("maxlen\t%d\n", longest);
	printf("dcblk\t%d\n", dcblk);
	printf("bblk\t%d\n", bblk);
	printf("sblk\t%d\n", sblk);
}

/*
 * Remembered coefficients of one 8x8 block, used by the hysteresis
 * and differencing decoders to compare a block against an earlier one.
 */
struct blkcache {
	/* non-zero once block[] has been filled in */
	int init;
	/* dequantized coefficients as produced by huffblock() */
	short block[64];
};

/*
 * Decide whether two coefficient blocks differ significantly.
 *
 * A coefficient counts as changed when its absolute difference exceeds
 * hlevel times the matching entry of `qt' (indexed in zigzag order).
 * A changed first coefficient weighs 5, any other weighs 1.
 *
 * Returns -1 as soon as the accumulated weight exceeds `thresh',
 * otherwise 0.
 */
int
blkthresh(short* b0, short* b1, int thresh, short *qt)
{
	int score = 0;

	for (int i = 63; i >= 0; --i) {
		const int k = ZAG[i];
		const int delta = b0[k] - b1[k];
		const int mag = (delta < 0) ? -delta : delta;

		if (mag > hlevel * qt[i]) {
			score += (i == 0) ? 5 : 1;
			if (score > thresh)
				return (-1);
		}
	}
	return (0);
}

#ifdef notdef
/*
 * Debugging aid (compiled out): for every coefficient where block b
 * differs from block a, print the zigzag index, b's value, the
 * difference and the matching entry of qt.
 */
void
pblk(short *b, short *a, short *qt)
{
	for (int i = 0; i < 64; ++i) {
		int k = ZAG[i];
		int d = b[k] - a[k];
		if (d != 0)
			printf("%d:\t%d\td %d\tq %d\n", i, b[k], d, qt[i]);
	}
}
#endif


int JpegDecoder::decode_block_with_hysteresis(int ci, int blkno,
					       int off, int stride)
{
	int n, h;
	struct hcnode *p;
	short *qt;
	int dc;
	struct blkcache *bc;
	short block[64];
	u_word code[128];

	/*FIXME*/
	component& c = comp[ci];
	qt = qtab[c.quant_tbl_no];

	bc = &dctcache[ci][blkno];

	n = huffparse(ci, code);
	if (n < 0 || huffblock(ci, n, code, block) < 0)
		return (-1);

	if (!bc->init || blkthresh(block, bc->block, hysteresis, qt)) {
		bcopy(block, bc->block, sizeof(block));
		j_rev_dct(block);
		mix(128, block, image[ci] + off, stride);
		bc->init = 1;
	}
	return (0);
}

int JpegDecoder::decode_block(int ci, int blkno, int off, int stride)
{
	int n, h;
	struct hcnode *p;
	short *qt;
	int dc;
	short block[64];
	u_word code[128];

	/*FIXME*/
	if (hysteresis)
		return (decode_block_with_hysteresis(ci, blkno, off, stride));

	/*FIXME*/
	component& c = comp[ci];
	qt = qtab[c.quant_tbl_no];

	n = huffparse(ci, code);
	if (n == 0) {
		dc = c.dc * qt[0];
		fill((dc >> 3) + 128, image[ci] + off, stride);
		++dcblk;
	} else if (n < MAXCODE) {
		++sblk;
		h = hchash(n, code);
		p = hclookup(h, n, code);
		if (p == 0) {
			++hcmiss;
			p = scavenge();
			if (huffblock(ci, n, code, p->block) < 0)
				return (-1);
			hcenter(p, h, n, code);
		} else
			++hchit;
		dc = c.dc * qt[0];
		mix((dc >> 3) + 128, p->block, image[ci] + off, stride);
	} else {
		++bblk;
		if (huffblock(ci, n, code, block) < 0)
			return (-1);
		j_rev_dct(block);
		mix(128, block, image[ci] + off, stride);
	}
	return (0);
}

/*
 * Decode one compressed frame starting at `in' into image[0..2].
 *
 * Each MCU is taken to be two blocks of component 0 followed by one
 * block each of components 1 and 2.  `len' is not used by the active
 * code.
 *
 * Returns 0 on success, -1 as soon as a block fails to decode.
 */
int JpegDecoder::decode(u_char* in, int len)
{
#ifdef notdef
	nbits = 8 * len;
#endif
	inb = in;
	nbb = 0;

	/* Extra advance after a row of MCUs: the other 7 pixel rows. */
	const int luma_skip = 7 * width;/*FIXME*/
	const int chroma_skip = 7 * width / 2;/*FIXME*/

	int luma_off = 0;
	int chroma_off = 0;
	int blkno = 0;

	huffreset();
	for (int row = 0; row < MCU_rows_in_scan; ++row) {
		for (int mcu = 0; mcu < MCUs_per_row; ++mcu) {
			/* FIXME this works only for hsamp = 2, vsamp = 1 */
			if (decode_block(0, blkno, luma_off, width) < 0)
				return (-1);
			luma_off += 8;
			if (decode_block(0, blkno + 1, luma_off, width) < 0)
				return (-1);
			luma_off += 8;

			/* Both chroma blocks share one offset. */
			if (decode_block(1, blkno >> 1, chroma_off,
					 width / 2) < 0)
				return (-1);
			if (decode_block(2, blkno >> 1, chroma_off,
					 width / 2) < 0)
				return (-1);
			chroma_off += 8;

			blkno += 2;
		}
		luma_off += luma_skip;
		chroma_off += chroma_skip;
	}
	return (0);
}

/* Figure F.12: extend sign bit */

#ifdef notdef
/*
 * Table-driven sign extension (compiled out in favour of the inline
 * function below): a value x read from s bits that is below
 * extend_test[s] has extend_offset[s] added to it.
 */
#define huff_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))

static const int extend_test[16] =   /* entry n is 2**(n-1) */
  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };

static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
#else
/* is this really faster? */
/*
 * Sign-extend a value read from the bit stream (Figure F.12).
 *
 * x is the s-bit magnitude field, s >= 1.  If the top bit of the field
 * is set the value is positive and returned unchanged; otherwise it
 * denotes the negative number x - (2**s - 1).
 *
 * Branch-free: `neg' is 1 exactly when the top bit is clear, and is
 * used both to build the sign-extension mask and as the final +1.
 * The mask is formed by negating a non-negative shift result so that
 * no negative value is ever shifted left.
 */
inline int
huff_EXTEND(int x, int s)
{
	const int neg = ((x >> (s - 1)) & 1) ^ 1;
	const int mask = -(neg << s);
	return ((x | mask) + neg);
}
#endif

/*
 * Bit-reader macros.  `bb' is the bit buffer and `nbb' the number of
 * unread bits at its low end.  All of them read input through a
 * variable named `inb' (a u_char pointer) that must be in scope.
 * None of them checks for the end of the input.
 */

/*
 * HUFFRQ(bb): shift 16 more bits of input into the low end of bb and
 * advance inb.  After an input byte of 0xff the following byte is
 * skipped.
 */
#define HUFFRQ(bb) \
 { \
	int v; \
	u_char *cp = inb; \
 \
	bb <<= 16; \
	v = *cp++; \
	if (v == 0xff) ++cp; \
	bb |= v << 8; \
	v = *cp++; \
	if (v == 0xff) ++cp; \
	bb |= v; \
	inb = cp; \
 \
}

/* MASK(s): a value with the low s bits set. */
#define MASK(s) ((1 << (s)) - 1)

/*
 * HUFF_DECODE(ht, nbb, bb, result): decode one Huffman symbol.  The
 * next 16 unread bits index the direct lookup table ht; the entry's
 * high byte is the code length (consumed from nbb) and its low byte
 * is stored in result.
 */
#define HUFF_DECODE(ht, nbb, bb, result) { \
	int s_, v_; \
 \
	if (nbb < 16) { \
		HUFFRQ(bb); \
		nbb += 16; \
	} \
	v_ = (bb >> (nbb - 16)) & 0xffff; \
	s_ = (ht)[v_]; \
	nbb -= (s_ >> 8); \
	result = s_ & 0xff; \
 }

/*
 * GET_BITS(n, nbb, bb, result): consume the next n bits and store them
 * in result.  n is read before result is written, so the same variable
 * may be passed for both.
 */
#define GET_BITS(n, nbb, bb, result) \
{ \
	nbb -= n; \
	if (nbb < 0)  { \
		HUFFRQ(bb); \
		nbb += 16; \
	} \
	(result) = ((bb >> nbb) & MASK(n)); \
}

/* SKIP_BITS(n, nbb, bb): consume the next n bits and discard them. */
#define SKIP_BITS(n, nbb, bb) \
{ \
	nbb -= n; \
	if (nbb < 0)  { \
		HUFFRQ(bb); \
		nbb += 16; \
	} \
}

/*
 * Parse one block's worth of Huffman data for component `ci'.
 *
 * The DC difference is decoded and added to the component's running
 * DC value.  The AC symbols are not expanded into coefficients;
 * instead each is appended to `code' in packed form:
 *
 *   - run/size symbol with size != 0:  (extra bits << 8) | symbol
 *   - 16-zero run (run 15, size 0):    the symbol alone
 *
 * An end-of-block symbol ends parsing and is not stored.  At most 63
 * entries are written, since every stored entry advances the
 * coefficient index by at least one.
 *
 * Returns the number of entries written to `code'.
 */
int JpegDecoder::huffparse(int ci, u_word *code)
{
	component& p = comp[ci];

	/* Work on local copies of the bit-reader state. */
	int bb_ = bb;
	int nbb_ = nbb;
	int s, r;

	/* Section F.2.2.1: decode the DC coefficient difference */
	u_short* ht = dc_hufftab[p.dc_tbl_no];
	HUFF_DECODE(ht, nbb_, bb_, s);
	if (s != 0) {
		GET_BITS(s, nbb_, bb_, r);
		s = huff_EXTEND(r, s);
	}
	/* Convert DC difference to actual value, update last_dc_val */
	s += p.dc;
	p.dc = s;

	/* Section F.2.2.2: decode the AC coefficients */
	ht = ac_hufftab[p.ac_tbl_no];
	int n = 0;
	for (int k = 1; k < 64; ) {
		/* Symbol-1: run length in the high nibble, size in the low */
		int v;
		HUFF_DECODE(ht, nbb_, bb_, v);
		s = v & 15;
		r = v >> 4;
		if (s != 0) {
			k += r;
			/* Symbol-2: s now holds the raw extra bits */
			GET_BITS(s, nbb_, bb_, s);
			*code++ = (s << 8) | v;
			++n;
			++k;
		} else {
			if (r != 15)
				/* end of block */
				break;
			*code++ = v;
			++n;
			k += 16;
		}
	}

	/* Publish the updated bit-reader state. */
	nbb = nbb_;
	bb = bb_;

	return (n);
}

/*
 * Consume one block's worth of Huffman data for component `ci'
 * without producing any output.  The bit stream is advanced exactly as
 * huffparse() would advance it, but the component's running DC value
 * is not updated.
 *
 * Always returns 0.
 */
int JpegDecoder::huffskip(int ci)
{
	component& p = comp[ci];

	/* Work on local copies of the bit-reader state. */
	int bb_ = bb;
	int nbb_ = nbb;
	int s;

	/* Section F.2.2.1: decode the DC coefficient difference */
	u_short* ht = dc_hufftab[p.dc_tbl_no];
	HUFF_DECODE(ht, nbb_, bb_, s);
	if (s != 0) {
		SKIP_BITS(s, nbb_, bb_);
	}

	/* Section F.2.2.2: decode the AC coefficients */
	ht = ac_hufftab[p.ac_tbl_no];
	for (int k = 1; k < 64; ) {
		/* Symbol-1: run length in the high nibble, size in the low */
		int v;
		HUFF_DECODE(ht, nbb_, bb_, v);
		s = v & 15;
		const int r = v >> 4;
		if (s != 0) {
			k += r;
			/* Symbol-2 */
			SKIP_BITS(s, nbb_, bb_);
			++k;
		} else {
			if (r != 15)
				/* end of block */
				break;
			k += 16;
		}
	}

	/* Publish the updated bit-reader state. */
	nbb = nbb_;
	bb = bb_;

	return (0);
}

/*
 * Reset the Huffman decoder state: mark the bit buffer empty and clear
 * the running DC value of the first four components.
 */
void JpegDecoder::huffreset()
{
	nbb = 0;
	for (int ci = 0; ci < 4; ++ci)
		comp[ci].dc = 0;
}

/*
 * Build a direct lookup table for one Huffman table.
 *
 * bits[1..16] gives the number of codes of each length and `vals' the
 * symbol values in code order.  The result is a newly allocated array
 * of 65536 entries indexed by the next 16 bits of input; each entry is
 * (code length << 8) | symbol, and entries matched by no code are 0.
 * The caller owns the returned array.
 *
 * The counts come from the stream description and are not trusted: at
 * most 256 symbols are accepted, and a code that does not fit in its
 * own length (which only happens when the counts are inconsistent) is
 * left out of the table rather than written outside it.
 */
u_short* JpegDecoder::huffbuild(const u_char* bits, const u_char* vals) const
{
	/* Figure C.1: make table of Huffman code length for each symbol */
	/* Note that this is in code-length order. */

	int nsym = 0;
	int huffsize[257];
	for (int codelen = 1; codelen <= 16; ++codelen) {
		/* Stop at 256 so huffsize[] and huffcode[] cannot overflow. */
		for (int i = 1; i <= bits[codelen] && nsym < 256; ++i)
			huffsize[nsym++] = codelen;
	}
	huffsize[nsym] = 0;	/* sentinel ending the last run of sizes */

	/* Figure C.2: generate the codes themselves */
	/* Note that this is in code-length order. */

	int code = 0;
	int si = huffsize[0];
	u_short huffcode[256];
	int p = 0;
	while (p < nsym) {
		while (huffsize[p] == si)
			huffcode[p++] = code++;

		code <<= 1;
		++si;
	}

	/*
	 * Build the direct-map lookup table.
	 */
	u_short *ht = new u_short[65536];
	bzero(ht, 65536 * sizeof(u_short));
	for (int sym = 0; sym < nsym; ++sym) {
		const int len = huffsize[sym];

		/* Skip codes wider than their length: they would index
		   past the end of the table. */
		if ((huffcode[sym] >> len) != 0)
			continue;

		const int nbit = 16 - len;
		const int base = huffcode[sym] << nbit;
		const int map = (len << 8) | vals[sym];
		/*
		 * The low nbit bits are don't cares.
		 * Spin through all possible combos.
		 */
		for (int n = 1 << nbit; --n >= 0; )
			ht[base | n] = map;
	}
	return (ht);
}

/*
 * Expand a parsed code sequence into a dequantized coefficient block.
 *
 * `code' holds the n packed entries produced by huffparse() for
 * component `ci'.  `blk' is cleared, its first element is set from the
 * component's current DC value, and each AC entry is sign-extended,
 * multiplied by the matching quantizer entry and stored in natural
 * (de-zigzagged) order.
 *
 * Returns 0 on success, or -1 if a zero run would place a coefficient
 * beyond the 64th position.
 */
int JpegDecoder::huffblock(int ci, int n, u_word *code, short *blk) const
{
	const component& p = comp[ci];
	const short *qt = qtab[p.quant_tbl_no];

	/* Since zero values are skipped, output area must be zeroed
	   beforehand */
	bzero(blk, 64 * sizeof(*blk));

	/* Descale and output the DC coefficient (assumes ZAG[0] = 0) */
	blk[0] = p.dc * qt[0];

	/* Section F.2.2.2: decode the AC coefficients */
	for (int k = 1; k < 64; ) {
		/*FIXME*/
		if (--n < 0)
			return (0);

		const int v = *code++;
		const int s = v & 0x0f;		/* size of the extra bits */
		const int r = (v >> 4) & 0x0f;	/* preceding zero run */

		if (s == 0) {
			if (r != 15)
				/* end of block */
				break;
			k += 16;
			continue;
		}

		k += r;
		if (k >= 64)
			return (-1);
		/*
		 * Descale coefficient and output in natural
		 * (dezigzagged) order.  The extra bits sit above the
		 * low byte of the packed entry.
		 */
		blk[ZAG[k]] = huff_EXTEND(v >> 8, s) * qt[k];
		++k;
	}
	return (0);
}

/*
 * Construct a decoder that owns its three output planes and, when
 * hysteresis is enabled, one zero-initialised block cache per plane.
 *
 * The cache size is computed from width * height, the same quantity
 * allocimage() uses for the planes; allocimage() keeps that value in a
 * local variable only, so it is recomputed here instead of being read
 * from `imagesize'.
 *
 * NOTE(review): width * height / 64 entries covers the image only when
 * its dimensions are whole multiples of the MCU size -- confirm.
 */
YUVJpegDecoder::YUVJpegDecoder(const config& c) : JpegDecoder(c)
{
	allocimage();

	if (hysteresis) {
		const int n = width * height / 64;
		for (int ci = 0; ci < 3; ++ci) {
			dctcache[ci] = new blkcache[n];
			/*
			 * Clear whole entries: the size of one blkcache,
			 * not of the pointer that refers to it.
			 */
			bzero(dctcache[ci], sizeof(blkcache) * n);
		}
	}
}

/*
 * Allocate the three output planes, each width * height bytes, and
 * clear the fourth plane pointer.  Existing planes are not freed here.
 */
void YUVJpegDecoder::allocimage()
{
	const int npixels = width * height;

	for (int plane = 0; plane < 3; ++plane)
		image[plane] = new u_char[npixels];
	image[3] = 0;
}

YUVJpegDecoder::reconfig(const config& c)
{
	JpegDecoder::reconfig(c);
	delete image[0];
	delete image[1];
	delete image[2];
	allocimage();
}

/*
 * Construct a gray-scale decoder that draws into the caller-supplied
 * buffer `out'.  `clut' is copied into the decoder's colour lookup
 * table and `s' becomes the scale (a negative scale selects the
 * pixel-doubling paths).  When hysteresis is enabled a zero-initialised
 * block cache is allocated for the single decoded component.
 */
GrayJpegDecoder::GrayJpegDecoder(const config& c, u_char* out, int* clut,
				 int s)
	: JpegDecoder(c), scale(s)
{
	image[0] = out;
	bcopy(clut, clut_, sizeof(clut_));

	if (hysteresis) {
		const int n = width * height / 64;
		dctcache[0] = new blkcache[n];
		/*
		 * Clear whole entries: the size of one blkcache, not of
		 * the pointer that refers to it.
		 */
		bzero(dctcache[0], sizeof(blkcache) * n);
	}
}

/*
 * Decode one compressed frame starting at `in', drawing only the first
 * component.  Each MCU is taken to be two blocks of component 0
 * followed by one block each of components 1 and 2; the latter two are
 * parsed and discarded.  With a negative scale every block covers 16
 * output pixels in each direction instead of 8.  `len' is not used.
 *
 * No offset or stride is kept for the discarded components, since
 * nothing is ever drawn for them.
 *
 * Returns 0 on success, -1 as soon as a block fails to decode.
 */
int GrayJpegDecoder::decode(u_char* in, int len)
{
	inb = in;
	nbb = 0;

	/* Output pixels covered by one block, horizontally. */
	const int delta = (scale < 0) ? 16 : 8;

	/* Extra advance after a row of MCUs: the remaining pixel rows. */
	int stride0 = (delta - 1) * width;/*FIXME*/
	/*FIXME*/
	if (scale < 0)
		stride0 *= 2;

	int off0 = 0;
	int blkno = 0;
	huffreset();
	for (int row = MCU_rows_in_scan; --row >= 0; ) {
		for (int mcu = MCUs_per_row; --mcu >= 0; ) {
			/* FIXME this works only for hsamp = 2, vsamp = 1 */
			if (decode_block(0, blkno, off0, width) < 0)
				return (-1);
			off0 += delta;
			if (decode_block(0, blkno + 1, off0, width) < 0)
				return (-1);
			off0 += delta;
			if (huffskip(1) < 0)
				return (-1);
			if (huffskip(2) < 0)
				return (-1);
			blkno += 2;
		}
		off0 += stride0;
	}
	return (0);
}

void GrayJpegDecoder::dmix(const int dc, const short* bp, u_char* out,
			   const int stride) const
{
	register const int *clut = clut_;

	for (register int k = 8; --k >= 0; ) {
		register u_word o;
		register int t;
#if BYTE_ORDER == LITTLE_ENDIAN
		o = clut[(UCLIMIT(bp[0] + dc) & 0xff)];
		o |= clut[(UCLIMIT(bp[1] + dc) & 0xff)] << 16;
		o |= o << 8;
		*(u_word *)out = o;
		*(u_word *)(out + stride) = o;

		o = clut[(UCLIMIT(bp[2] + dc) & 0xff)];
		o |= clut[(UCLIMIT(bp[3] + dc) & 0xff)] << 16;
		o |= o << 8;
		*(u_word *)(out + 4) = o;
		*(u_word *)(out + stride + 4) = o;

		o = clut[(UCLIMIT(bp[4] + dc) & 0xff)];
		o |= clut[(UCLIMIT(bp[5] + dc) & 0xff)] << 16;
		o |= o << 8;
		*(u_word *)(out + 8) = o;
		*(u_word *)(out + stride + 8) = o;

		o = clut[(UCLIMIT(bp[6] + dc) & 0xff)];
		o |= clut[(UCLIMIT(bp[7] + dc) & 0xff)] << 16;
		o |= o << 8;
		*(u_word *)(out + 12) = o;
		*(u_word *)(out + stride + 12) = o;
#else
		o = clut[(UCLIMIT(bp[0] + dc) & 0xff)] << 16;
		o |= clut[(UCLIMIT(bp[1] + dc) & 0xff)];
		o |= o << 8;
		*(u_word *)out = o;
		*(u_word *)(out + stride) = o;

		o = clut[(UCLIMIT(bp[2] + dc) & 0xff)] << 16;
		o |= clut[(UCLIMIT(bp[3] + dc) & 0xff)];
		o |= o << 8;
		*(u_word *)(out + 4) = o;
		*(u_word *)(out + stride + 4) = o;

		o = clut[(UCLIMIT(bp[4] + dc) & 0xff)] << 16;
		o |= clut[(UCLIMIT(bp[5] + dc) & 0xff)];
		o |= o << 8;
		*(u_word *)(out + 8) = o;
		*(u_word *)(out + stride + 8) = o;

		o = clut[(UCLIMIT(bp[6] + dc) & 0xff)] << 16;
		o |= clut[(UCLIMIT(bp[7] + dc) & 0xff)];
		o |= o << 8;
		*(u_word *)(out + 12) = o;
		*(u_word *)(out + stride + 12) = o;
#endif
		bp += 8;
		out += stride << 1;
	}
}

/*
 * Render an 8x8 block through the colour table.
 *
 * With a negative scale the block is drawn at double size by dmix(),
 * using twice the row pitch.  Otherwise every sample has `dc' added,
 * is clamped to [0..255], mapped through the table and written as one
 * byte; rows of `bp' are 8 shorts apart and rows of `out' are `stride'
 * bytes apart.  On alpha a whole row is assembled in one 64-bit word;
 * elsewhere two 32-bit words are used, packed to suit the byte order.
 */
void GrayJpegDecoder::mix(const int dc, const short* bp, u_char* out,
			  const int stride) const
{
	if (scale < 0) {
		dmix(dc, bp, out, stride << 1);
		return;
	}

	const int *clut = clut_;

/* Mapped, clamped sample i of the current row; assigns to scratch int t. */
#define GRAY_MIX_PIX(i) clut[(UCLIMIT(bp[i] + dc) & 0xff)]

	for (int k = 8; --k >= 0; ) {
		int t;		/* scratch variable required by UCLIMIT */

		/*
		 * One lookup per statement: each expansion assigns to t, so
		 * they must not share an expression.
		 */
#ifdef __alpha
		u_long o;
		o = (u_long)GRAY_MIX_PIX(0);
		o |= (u_long)GRAY_MIX_PIX(1) << 8;
		o |= (u_long)GRAY_MIX_PIX(2) << 16;
		o |= (u_long)GRAY_MIX_PIX(3) << 24;
		o |= (u_long)GRAY_MIX_PIX(4) << 32;
		o |= (u_long)GRAY_MIX_PIX(5) << 40;
		o |= (u_long)GRAY_MIX_PIX(6) << 48;
		o |= (u_long)GRAY_MIX_PIX(7) << 56;
		*(u_long *)out = o;
#elif BYTE_ORDER == LITTLE_ENDIAN
		u_word o;
		o = GRAY_MIX_PIX(0);
		o |= GRAY_MIX_PIX(1) << 8;
		o |= GRAY_MIX_PIX(2) << 16;
		o |= GRAY_MIX_PIX(3) << 24;
		*(u_word *)out = o;
		o = GRAY_MIX_PIX(4);
		o |= GRAY_MIX_PIX(5) << 8;
		o |= GRAY_MIX_PIX(6) << 16;
		o |= GRAY_MIX_PIX(7) << 24;
		*(u_word *)(out + 4) = o;
#else
		u_word o;
		o = GRAY_MIX_PIX(0) << 24;
		o |= GRAY_MIX_PIX(1) << 16;
		o |= GRAY_MIX_PIX(2) << 8;
		o |= GRAY_MIX_PIX(3);
		*(u_word *)out = o;
		o = GRAY_MIX_PIX(4) << 24;
		o |= GRAY_MIX_PIX(5) << 16;
		o |= GRAY_MIX_PIX(6) << 8;
		o |= GRAY_MIX_PIX(7);
		*(u_word *)(out + 4) = o;
#endif
		bp += 8;
		out += stride;
	}

#undef GRAY_MIX_PIX
}

/*
 * Paint a 16x16 area with one value: `dc' is clamped to [0..255],
 * mapped through the colour table, replicated into every byte of a
 * word and stored four times per row for sixteen rows, `stride' bytes
 * apart.  The replication relies on table entries fitting in one byte.
 */
void GrayJpegDecoder::dfill(int dc, u_char* out, const int stride) const
{
	int t;		/* scratch variable required by UCLIMIT */

	dc = UCLIMIT(dc) & 0xff;
	dc = clut_[dc];
	dc |= dc << 8;
	dc |= dc << 16;

	for (int row = 16; row > 0; --row) {
		*(u_word*)out = dc;
		*(u_word*)(out + 4) = dc;
		*(u_word*)(out + 8) = dc;
		*(u_word*)(out + 12) = dc;
		out += stride;
	}
}

void GrayJpegDecoder::fill(int xdc, u_char* out, const int stride) const
{
	if (scale < 0) {
		dfill(xdc, out, stride << 1);
		return;
	}
	register int t;
	xdc = UCLIMIT(xdc) & 0xff;
	register u_long dc = clut_[xdc];
	dc |= dc << 8;
	dc |= dc << 16;
#ifdef __alpha
	dc |= dc << 32;
#endif
	*(u_long*)out = dc;
#ifndef __alpha
	*(u_long*)(out + 4) = dc;
#endif
	out += stride;
	*(u_long*)out = dc;
#ifndef __alpha
	*(u_long*)(out + 4) = dc;
#endif
	out += stride;
	*(u_long*)out = dc;
#ifndef __alpha
	*(u_long*)(out + 4) = dc;
#endif
	out += stride;
	*(u_long*)out = dc;
#ifndef __alpha
	*(u_long*)(out + 4) = dc;
#endif
	out += stride;
	*(u_long*)out = dc;
#ifndef __alpha
	*(u_long*)(out + 4) = dc;
#endif
	out += stride;
	*(u_long*)out = dc;
#ifndef __alpha
	*(u_long*)(out + 4) = dc;
#endif
	out += stride;
	*(u_long*)out = dc;
#ifndef __alpha
	*(u_long*)(out + 4) = dc;
#endif
	out += stride;
	*(u_long*)out = dc;
#ifndef __alpha
	*(u_long*)(out + 4) = dc;
#endif
}

/*
 * Construct a differencing decoder: allocate and zero one blkcache per
 * 64 pixels of the image, and replace component 0's quantization table
 * with all ones so that coefficients are compared unscaled.
 */
DiffJpegDecoder::DiffJpegDecoder(const config& c) : JpegDecoder(c)
{
	const int nblk = width * height / 64;
	dcts = new blkcache[nblk];
	bzero(dcts, sizeof(*dcts) * nblk);

	/*
	 * Don't use any quantization.
	 * It just complicates the differencing heuristic.
	 */
	short* const qt = qtab[comp[0].quant_tbl_no];
	for (short* q = qt; q != qt + 64; ++q)
		*q = 1;
}

/*
 * Per-coefficient table used by diff(), indexed by natural-order
 * position.  diff() shifts each absolute difference left by
 * (5 - cost[k]), so a larger entry here means a smaller weight; the
 * entries decrease from 5 at position 0 towards 0 in the opposite
 * corner.
 */
int cost[64] = {
	5, 4, 4, 3, 3, 2, 1, 0,
	4, 4, 3, 3, 2, 1, 1, 0,
	4, 3, 3, 2, 1, 1, 0, 0,
	3, 3, 2, 1, 1, 0, 0, 0,
	3, 2, 1, 1, 0, 0, 0, 0,
	2, 1, 1, 0, 0, 0, 0, 0,
	1, 1, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
};

/*
 * Weighted distance between two coefficient blocks: the sum over all
 * 64 positions of the absolute difference shifted left by
 * (5 - cost[position]).
 */
static int
diff(short* b0, short* b1)
{
	int total = 0;

	/*
	 * FIXME ignore high frequencies for differencing decision
	 */
	for (int i = 63; i >= 0; --i) {
		const int k = ZAG[i];
		const int delta = b0[k] - b1[k];
		const int mag = (delta < 0) ? -delta : delta;

		total += mag << (5 - cost[k]);
#ifdef notdef
		total += mag;
#endif
	}
	return (total);
}

/*
 * Decode the next block of component 0 into bc->block and store its
 * diff() distance from dctcache->block in *dv (narrowed to u_char).
 *
 * Returns 0 on success, -1 if the block could not be decoded.
 */
int DiffJpegDecoder::decode_block_diff(blkcache* bc, blkcache* dctcache,
					u_char* dv)
{
	u_word code[128];

	const int ncodes = huffparse(0, code);
	if (ncodes < 0 || huffblock(0, ncodes, code, bc->block) < 0)
		return (-1);

	*dv = diff(bc->block, dctcache->block);
	return (0);
}

/*
 * Decode one compressed frame starting at `in' and measure how far
 * each block of component 0 is from a reference.
 *
 * Each MCU is taken to be two blocks of component 0 followed by one
 * block each of components 1 and 2; the latter two are parsed and
 * discarded.  Block i is decoded into dcts[i], compared with
 * dctcache[i], and the distance is stored in diffvector[i].  `len' is
 * not used.
 *
 * Returns 0 on success, -1 as soon as a block fails to decode.
 */
int DiffJpegDecoder::decode(u_char* in, int len, blkcache* dctcache,
				  u_char* diffvector)
{
	inb = in;
	nbb = 0;

	huffreset();
	struct blkcache* bc = dcts;
	for (int row = MCU_rows_in_scan; --row >= 0; ) {
		for (int mcu = MCUs_per_row; --mcu >= 0; ) {
			/* FIXME this works only for hsamp = 2, vsamp = 1 */
			if (decode_block_diff(bc++, dctcache++,
					      diffvector++) < 0)
				return (-1);

			if (decode_block_diff(bc++, dctcache++,
					      diffvector++) < 0)
				return (-1);
			if (huffskip(1) < 0)
				return (-1);
			if (huffskip(2) < 0)
				return (-1);
		}
	}
	return (0);
}
