//	warpsharp - edge sharpening filter for VirtualDub
//	Copyright (C) 1998-2001 Avery Lee
//
//	This program is free software; you can redistribute it and/or modify
//	it under the terms of the GNU General Public License as published by
//	the Free Software Foundation; either version 2 of the License, or
//	(at your option) any later version.
//
//	This program is distributed in the hope that it will be useful,
//	but WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//	GNU General Public License for more details.
//
//	You should have received a copy of the GNU General Public License
//	along with this program; if not, write to the Free Software
//	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

#define VDEXT_VIDEO_FILTER
#define VDEXT_MAIN

#include <crtdbg.h>
#include <windows.h>
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <new>
#include <commctrl.h>

#include <ScriptInterpreter.h>
#include <ScriptValue.h>

#include "filter.h"
#include "effect.h"
#include "e_blur.h"
#include "resource.h"

// Builds a 256-entry table of 4-tap cubic interpolation coefficients in
// 2.14 fixed point (16384 == 1.0).
//
//	table		Receives 256*4 ints (1024 total).
//	A			Cubic sharpness parameter. Two good values are -1.0 (original
//				VirtualDub bicubic filter) and -0.75 (closely matches Photoshop).
//	mmx_table	false: entry i holds { y1, y2, y3, y4 } as four plain ints.
//				true:  entry i holds { y2:y1, y2:y1, y4:y3, y4:y3 }, i.e. pairs
//				of signed 16-bit taps packed into each int (high:low), each
//				pair stored twice so a 64-bit load yields two copies.
//
// Entry i corresponds to a fractional position of i/256.
void MakeCubic4Table(int *table, double A, bool mmx_table) throw() {
	for(int i=0; i<256; i++) {
		const double d = (double)i / 256.0;
		int y1, y2, y3, y4, ydiff;

		y1 = (int)floor(0.5 + (        +     A*d -       2.0*A*d*d +       A*d*d*d) * 16384.0);
		y2 = (int)floor(0.5 + (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0);
		y3 = (int)floor(0.5 + (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0);
		y4 = (int)floor(0.5 + (                  +           A*d*d -       A*d*d*d) * 16384.0);

		// Coefficients for all four pixels *must* add up to 1.0 (16384) for
		// consistent unity gain.  Independent rounding of the four taps can
		// leave the sum off by a unit or two; dividing that residual by four
		// would truncate it to zero, so instead fold the whole residual into
		// the dominant centre tap, where the relative error is smallest.

		ydiff = 16384 - y1 - y2 - y3 - y4;
		_ASSERT(ydiff > -16 && ydiff < 16);

		if (i < 128)
			y2 += ydiff;
		else
			y3 += ydiff;

		if (mmx_table) {
			// Pack through unsigned so that negative taps are not left-shifted
			// as signed values; the resulting bit pattern is unchanged.
			const int lo_pair = (int)(((unsigned)y2 << 16) | ((unsigned)y1 & 0xffff));
			const int hi_pair = (int)(((unsigned)y4 << 16) | ((unsigned)y3 & 0xffff));

			table[i*4 + 0] = table[i*4 + 1] = lo_pair;
			table[i*4 + 2] = table[i*4 + 3] = hi_pair;
		} else {
			table[i*4 + 0] = y1;
			table[i*4 + 1] = y2;
			table[i*4 + 2] = y3;
			table[i*4 + 3] = y4;
		}
	}
}

// Cubic coefficient tables (256 entries x 4 ints), filled by MakeCubic4Table()
// from warpsharp_start().  cubic_tab holds plain int taps; cubic_tab_MMX holds
// the packed 16-bit layout consumed by interpolate_cubic().
static int cubic_tab[1024];
static int cubic_tab_MMX[1024];

// Forward declarations for the filter callbacks referenced by the tables
// below.  The last four are defined with internal linkage later in this file,
// so they are declared 'static' here as well; declaring them extern first and
// static later is a linkage mismatch.
int warpsharp_run(const FilterActivation *fa, const FilterFunctions *ff);
int warpsharp2_run(const FilterActivation *fa, const FilterFunctions *ff);
int warpsharp3_run(const FilterActivation *fa, const FilterFunctions *ff);
long warpsharp_param(FilterActivation *fa, const FilterFunctions *ff);
int warpsharp_start(FilterActivation *fa, const FilterFunctions *ff);
int warpsharp_end(FilterActivation *fa, const FilterFunctions *ff);
static void warpsharp_string(const FilterActivation *fa, const FilterFunctions *ff, char *buf);
static int warpsharp_config(FilterActivation *fa, const FilterFunctions *ff, HWND hWnd);
static void warpsharp_script_config(IScriptInterpreter *isi, void *lpVoid, CScriptValue *argv, int argc);
static bool warpsharp_script_line(FilterActivation *fa, const FilterFunctions *ff, char *buf, int buflen);

// Script bindings: both entries route to warpsharp_script_config().  The first
// is named "Config" with signature string "0i"; the second has a NULL name and
// signature "0ii" (presumably an overload of the same name taking two ints --
// confirm against the ScriptFunctionDef documentation).  The list is
// terminated by a NULL entry.
static ScriptFunctionDef warpsharp_func_defs[]={
	{ (ScriptFunctionPtr)warpsharp_script_config, "Config", "0i" },
	{ (ScriptFunctionPtr)warpsharp_script_config, NULL, "0ii" },
	{ NULL },
};

// Script object exposing the function table above; referenced from
// filterDef_warpsharp.
static CScriptObject warpsharp_obj={
	NULL, warpsharp_func_defs
};

// Handle returned by ff->addFilter() in VirtualdubFilterModuleInit2(); passed
// back to ff->removeFilter() in VirtualdubFilterModuleDeinit().
static FilterDefinition *fd_warpsharp;

// 8-bit sample type.
typedef unsigned char Pixel8;

// Per-instance filter state.  sizeof() of this class is handed to the host in
// filterDef_warpsharp, and warpsharp_config() copies it by value.
class warpsharpFilterData {
public:
	unsigned char *bumpbuf;		// edge-strength map, bumppitch * height bytes; allocated in warpsharp_start()
	VEffect *veffBlur;			// blur effect created by VCreateEffectBlurHi() and run over vbmBump
	VBitmap vbmBump;			// 8-bit bitmap wrapped around bumpbuf (placement-constructed in warpsharp_start())
	IFilterPreview *ifp;		// preview interface; set by warpsharp_config() for use by the dialog
	int depth;					// warp strength; dialog slider range 0..1024, displayed as depth*100/256
	int blurlevel;				// dialog slider range 0..3; displayed as blurlevel+1
	int bumppitch;				// bytes per row of bumpbuf: frame width rounded up to a multiple of 8
};


// Filter descriptor registered with the host in VirtualdubFilterModuleInit2().
// The initializer is positional; the field order is fixed by FilterDefinition
// in filter.h (not visible here), so the per-line notes below only name the
// values being supplied.
static struct FilterDefinition filterDef_warpsharp={
	0,0,NULL,
	"warp sharp",					// filter name
	"Tightens edges in an image by warping the image toward edge boundaries. (Version 1.2)\n\n[Assembly optimized][Requires MMX]",
	"Avery Lee",					// author
	NULL,sizeof(warpsharpFilterData),	// size of the per-instance data block
	NULL,NULL,
	warpsharp_run,
	warpsharp_param,
	warpsharp_config,
	warpsharp_string,
	warpsharp_start,
	warpsharp_end,
	&warpsharp_obj,					// script object
	warpsharp_script_line,
};

// Module entry points exported from the DLL with C linkage (unmangled names).
extern "C" int __declspec(dllexport) __cdecl VirtualdubFilterModuleInit2(FilterModule *fm, const FilterFunctions *ff, int& vdfd_ver, int& vdfd_compat);
extern "C" void __declspec(dllexport) __cdecl VirtualdubFilterModuleDeinit(FilterModule *fm, const FilterFunctions *ff);

// Module initialization: registers the warp sharp filter with the host and
// reports the filter-definition API versions this module was built against.
//
//	fm			Module handle supplied by the host; forwarded to addFilter().
//	ff			Host callback table.
//	vdfd_ver	Receives VIRTUALDUB_FILTERDEF_VERSION.
//	vdfd_compat	Receives VIRTUALDUB_FILTERDEF_COMPATIBLE.
//
// Returns 0 on success, or 1 if the host did not return a filter handle.
int __declspec(dllexport) __cdecl VirtualdubFilterModuleInit2(FilterModule *fm, const FilterFunctions *ff, int& vdfd_ver, int& vdfd_compat) {
	fd_warpsharp = ff->addFilter(fm, &filterDef_warpsharp, sizeof(FilterDefinition));

	// Without a handle there is nothing to unregister later, so report the
	// failure instead of claiming a successful load.
	if (!fd_warpsharp)
		return 1;

	vdfd_ver = VIRTUALDUB_FILTERDEF_VERSION;
	vdfd_compat = VIRTUALDUB_FILTERDEF_COMPATIBLE;

	INITIALIZE_VTBLS;

	return 0;
}

// Module shutdown: hands the handle obtained at init time back to the host so
// the filter is unregistered.
void __declspec(dllexport) __cdecl VirtualdubFilterModuleDeinit(FilterModule *fm, const FilterFunctions *ff)
{
	FilterDefinition *const registered = fd_warpsharp;

	ff->removeFilter(registered);
}

////////////////////////////////////////////////////////////

// Parameter callback.  This filter writes into a separate destination frame
// rather than in place, so it asks the host to swap buffers.
long warpsharp_param(FilterActivation *fa, const FilterFunctions *ff)
{
	const long flags = FILTERPARAM_SWAP_BUFFERS;

	return flags;
}

/////////////////////////////////////////////////////////////

// C-linkage flag set from ff->isMMXEnabled() in warpsharp_start().  Nothing in
// this file reads it; presumably it is consumed by another translation unit
// (e.g. the blur effect) -- confirm before removing.
extern "C" {
	char MMX_enabled;
}

// Separable 4x4 cubic interpolation of one 32-bit pixel using MMX.
//
//	src		Top-left pixel of the 4x4 neighbourhood.  Four pixels (byte offsets
//			0, 4, 8, 12) are read from each of four consecutive rows.
//	pitch	Byte distance from one row to the next.
//	horiz	Horizontal taps: 16 bytes in the packed layout MakeCubic4Table()
//			produces with mmx_table=true.
//	vert	Vertical taps, same layout.
//
// Each row is filtered horizontally (rounded with +0x2000, shifted right by
// 14, packed to signed words); the four row results are then filtered
// vertically the same way and packed to bytes with unsigned saturation.  The
// result is returned in EAX.
//
// The function is 'naked': it has no prologue/epilogue and reads its arguments
// directly from [esp+4]..[esp+16].  It modifies EAX, ECX, EDX and MM0-MM7 and
// does NOT execute EMMS; the caller is responsible for that.
//
// NOTE(review): the per-instruction register comments in the second through
// fourth row sections were copied from the first section and still name the
// first section's registers; in the first section the comments on the two
// pmaddwd lines also appear to be swapped (mm0 holds the green/blue sums, mm1
// the alpha/red sums).
Pixel32 __declspec(naked) interpolate_cubic(const Pixel32 *src, ptrdiff_t pitch, const int *horiz, const int *vert) {
	// 0.5 in 2.14 fixed point, in both 32-bit lanes.
	static const __int64 rounder = 0x0000200000002000i64;

	__asm {
		mov		eax,[esp+4]
		mov		ecx,[esp+8]
		mov		edx,[esp+12]

		pxor		mm7,mm7

		movd		mm0,[eax]				;fetch p00
		movd		mm6,[eax+4]				;fetch p01
		punpcklbw	mm0,mm7					;mm0 = [a00][r00][g00][b00]
		movq		mm1,mm0
		punpcklbw	mm6,mm7
		punpcklwd	mm0,mm6					;mm0 = [g01][g00][b01][b00]
		punpckhwd	mm1,mm6					;mm1 = [a01][a00][r01][r00]
		pmaddwd		mm0,[edx]				;mm0 = [a00-01][r00-01]
		pmaddwd		mm1,[edx]				;mm1 = [g00-01][b00-01]

		movd		mm2,[eax+8]				;fetch p00
		movd		mm6,[eax+12]			;fetch p01
		punpcklbw	mm2,mm7					;mm2 = [a02][r02][g02][b02]
		movq		mm3,mm2
		punpcklbw	mm6,mm7
		punpcklwd	mm2,mm6					;mm2 = [g03][g02][b03][b02]
		punpckhwd	mm3,mm6					;mm3 = [a03][a02][r03][r02]
		pmaddwd		mm2,[edx+8]				;mm2 = [a02-03][r02-03]
		pmaddwd		mm3,[edx+8]				;mm3 = [g02-03][b02-03]

		paddd		mm0,mm2
		paddd		mm1,mm3

		paddd		mm0,rounder
		paddd		mm1,rounder

		psrad		mm0,14
		psrad		mm1,14

		packssdw	mm0,mm1					;mm0 = intermediate pixel 0
		add			eax,ecx

		;---------------------------------

		movd		mm1,[eax]				;fetch p00
		movd		mm6,[eax+4]				;fetch p01
		punpcklbw	mm1,mm7					;mm0 = [a00][r00][g00][b00]
		movq		mm2,mm1
		punpcklbw	mm6,mm7
		punpcklwd	mm1,mm6					;mm0 = [g01][g00][b01][b00]
		punpckhwd	mm2,mm6					;mm1 = [a01][a00][r01][r00]
		pmaddwd		mm1,[edx]				;mm0 = [a00-01][r00-01]
		pmaddwd		mm2,[edx]				;mm1 = [g00-01][b00-01]

		movd		mm3,[eax+8]				;fetch p00
		movd		mm6,[eax+12]			;fetch p01
		punpcklbw	mm3,mm7					;mm2 = [a02][r02][g02][b02]
		movq		mm4,mm3
		punpcklbw	mm6,mm7
		punpcklwd	mm3,mm6					;mm2 = [g03][g02][b03][b02]
		punpckhwd	mm4,mm6					;mm3 = [a03][a02][r03][r02]
		pmaddwd		mm3,[edx+8]				;mm2 = [a02-03][r02-03]
		pmaddwd		mm4,[edx+8]				;mm3 = [g02-03][b02-03]

		paddd		mm1,mm3
		paddd		mm2,mm4

		paddd		mm1,rounder
		paddd		mm2,rounder

		psrad		mm1,14
		psrad		mm2,14

		packssdw	mm1,mm2					;mm1 = intermediate pixel 1
		add			eax,ecx

		;---------------------------------

		movd		mm2,[eax]				;fetch p00
		movd		mm6,[eax+4]				;fetch p01
		punpcklbw	mm2,mm7					;mm0 = [a00][r00][g00][b00]
		movq		mm3,mm2
		punpcklbw	mm6,mm7
		punpcklwd	mm2,mm6					;mm0 = [g01][g00][b01][b00]
		punpckhwd	mm3,mm6					;mm1 = [a01][a00][r01][r00]
		pmaddwd		mm2,[edx]				;mm0 = [a00-01][r00-01]
		pmaddwd		mm3,[edx]				;mm1 = [g00-01][b00-01]

		movd		mm4,[eax+8]				;fetch p00
		movd		mm6,[eax+12]			;fetch p01
		punpcklbw	mm4,mm7					;mm2 = [a02][r02][g02][b02]
		movq		mm5,mm4
		punpcklbw	mm6,mm7
		punpcklwd	mm4,mm6					;mm2 = [g03][g02][b03][b02]
		punpckhwd	mm5,mm6					;mm3 = [a03][a02][r03][r02]
		pmaddwd		mm4,[edx+8]				;mm2 = [a02-03][r02-03]
		pmaddwd		mm5,[edx+8]				;mm3 = [g02-03][b02-03]

		paddd		mm2,mm4
		paddd		mm3,mm5

		paddd		mm2,rounder
		paddd		mm3,rounder

		psrad		mm2,14
		psrad		mm3,14

		packssdw	mm2,mm3 				;mm2 = intermediate pixel 2
		add			eax,ecx

		;---------------------------------

		movd		mm3,[eax]				;fetch p00
		movd		mm6,[eax+4]				;fetch p01
		punpcklbw	mm3,mm7					;mm0 = [a00][r00][g00][b00]
		movq		mm4,mm3
		punpcklbw	mm6,mm7
		punpcklwd	mm3,mm6					;mm0 = [g01][g00][b01][b00]
		punpckhwd	mm4,mm6					;mm1 = [a01][a00][r01][r00]
		pmaddwd		mm3,[edx]				;mm0 = [a00-01][r00-01]
		pmaddwd		mm4,[edx]				;mm1 = [g00-01][b00-01]

		movd		mm5,[eax+8]				;fetch p00
		movd		mm6,[eax+12]			;fetch p01
		punpcklbw	mm5,mm7					;mm2 = [a02][r02][g02][b02]
		punpcklbw	mm6,mm7
		movq		mm7,mm5
		punpcklwd	mm5,mm6					;mm2 = [g03][g02][b03][b02]
		punpckhwd	mm7,mm6					;mm3 = [a03][a02][r03][r02]
		pmaddwd		mm5,[edx+8]				;mm2 = [a02-03][r02-03]
		pmaddwd		mm7,[edx+8]				;mm3 = [g02-03][b02-03]

		paddd		mm3,mm5
		paddd		mm4,mm7

		paddd		mm3,rounder
		paddd		mm4,rounder

		psrad		mm3,14
		psrad		mm4,14

		packssdw	mm3,mm4 				;mm2 = intermediate pixel 2

		;---------------------------------

		mov			edx,[esp+16]

		movq		mm4,mm0
		movq		mm5,mm2
		punpcklwd	mm0,mm1
		punpckhwd	mm4,mm1
		punpcklwd	mm2,mm3
		punpckhwd	mm5,mm3

		pmaddwd		mm0,[edx]
		pmaddwd		mm4,[edx]
		pmaddwd		mm2,[edx+8]
		pmaddwd		mm5,[edx+8]

		paddd		mm0,mm2
		paddd		mm4,mm5

		paddd		mm0,rounder
		paddd		mm4,rounder

		psrad		mm0,14
		psrad		mm4,14

		packssdw	mm0,mm4
		packuswb	mm0,mm0

		movd		eax,mm0
		ret

	};
}

// Bilinear interpolation of one 32-bit pixel from a 2x2 neighbourhood.
//
//	src		Top-left pixel; src[0], src[1] and the same two pixels one row
//			down (src + pitch bytes) are read.
//	pitch	Byte distance from one row to the next.
//	horiz	Horizontal weight of the right-hand column; the left column gets
//			256 - horiz.
//	vert	Vertical weight of the lower row; the upper row gets 256 - vert.
//
// Each row is blended horizontally in 16-bit lanes, then scaled by its
// vertical weight keeping the high 16 bits of the product (pmulhuw); the two
// rows are summed and packed to bytes with unsigned saturation.  The result is
// returned in EAX.
//
// The function is 'naked' and reads its arguments from [esp+4]..[esp+16].  It
// uses pshufw and pmulhuw, which are not part of the base MMX instruction set,
// and it does NOT execute EMMS.  Within this file it is referenced only from a
// disabled (#elif 0) branch of warpsharp_run().
Pixel32 __declspec(naked) interpolate_bilinear(const Pixel32 *src, ptrdiff_t pitch, int horiz, int vert) {
	// 256 in each 16-bit lane; used to form the complementary weights.
	static const __int64 inv = 0x0100010001000100i64;

	__asm {
		mov		eax,[esp+4]
		mov		ecx,[esp+8]

		pxor		mm7,mm7

		movd		mm0,[esp+12]
		movd		mm1,[esp+16]
		pshufw		mm0,mm0,00h
		movd		mm2,[eax+ecx+4]
		pshufw		mm1,mm1,00h
		movd		mm3,[eax+ecx]
		punpcklbw	mm2,mm7
		movd		mm4,[eax+4]
		punpcklbw	mm3,mm7
		movd		mm5,[eax]
		punpcklbw	mm4,mm7

		movq		mm6,inv
		pmullw		mm2,mm0

		punpcklbw	mm5,mm7
		pmullw		mm4,mm0

		movq		mm7,mm6
		psubw		mm6,mm0

		psubw		mm7,mm1
		pmullw		mm3,mm6

		pmullw		mm5,mm6

		paddw		mm2,mm3

		pmulhuw		mm2,mm1
		paddw		mm4,mm5

		pmulhuw		mm4,mm7

		paddw		mm2,mm4

		packuswb	mm2,mm2

		movd		eax,mm2
		ret
	};
}

// Start callback: allocates the per-frame working state.
//
// Allocates a zeroed 8-bit edge-strength ("bump") buffer whose row pitch is the
// source width rounded up to a multiple of 8, wraps it in sfd->vbmBump, creates
// the blur effect that runs over it, and builds both cubic coefficient tables.
//
// Returns 0 on success, 1 on failure (no filter data, invalid frame size, out
// of memory, or the blur effect could not be created).  On failure no buffer
// is left allocated and the owning pointers are NULL.
int warpsharp_start(FilterActivation *fa, const FilterFunctions *ff) {
	warpsharpFilterData *sfd;

	MMX_enabled = ff->isMMXEnabled();

	if (!(sfd = (warpsharpFilterData *)fa->filter_data)) return 1;

	// Put the owning pointers into a known state first so that every early
	// return below leaves them safe to release.
	sfd->bumpbuf = NULL;
	sfd->veffBlur = NULL;

	if (fa->src.w <= 0 || fa->src.h <= 0)
		return 1;

	sfd->bumppitch = (fa->src.w+7)&~7;

	const size_t bumpsize = (size_t)sfd->bumppitch * (size_t)fa->src.h;

	// Plain new[] reports failure by throwing, which would make the NULL test
	// dead code; the nothrow form returns NULL as this function expects.
	if (!(sfd->bumpbuf = new(std::nothrow) unsigned char[bumpsize]))
		return 1;

	memset(sfd->bumpbuf, 0, bumpsize);

	// vbmBump lives in raw host-provided storage, so it is constructed in
	// place (and destroyed explicitly in warpsharp_end()).
	(::new(&sfd->vbmBump) VBitmap())->init(sfd->bumpbuf, sfd->bumppitch, fa->src.h, 8);

	if (!(sfd->veffBlur = VCreateEffectBlurHi(&sfd->vbmBump))) {
		// Do not leak the bump buffer when the blur effect is unavailable.
		delete[] sfd->bumpbuf;
		sfd->bumpbuf = NULL;
		return 1;
	}

	MakeCubic4Table(cubic_tab, -0.60, false);
	MakeCubic4Table(cubic_tab_MMX, -0.60, true);

	return 0;
}

// Copies a rectangle of bytes between two pitched buffers.
//
//	dst, dstpitch	Destination start and byte distance between its rows.
//	src, srcpitch	Source start and byte distance between its rows.
//	w				Bytes to copy per row.
//	h				Number of rows.
//
// Rows must not overlap (memcpy is used).  An empty rectangle (w == 0 or
// h == 0) is a no-op; a post-tested loop would decrement an unsigned zero row
// count and wrap around to roughly four billion rows.
void rectcopy(void *dst, ptrdiff_t dstpitch, const void *src, ptrdiff_t srcpitch, unsigned w, unsigned h) {
	if (!w)
		return;

	while(h) {
		memcpy(dst, src, w);

		dst = (char *)dst + dstpitch;
		src = (const char *)src + srcpitch;
		--h;
	}
}

// Copies a dx-by-dy pixel rectangle from (x2,y2) in src to (x1,y1) in dst,
// clipping it against both bitmaps.
//
// Rows are addressed from the end of the buffer (row y maps to memory row
// h - 1 - y), so y coordinates here are in that flipped orientation.
//
// Returns false if the two bitmaps differ in depth; otherwise true, including
// when the clipped rectangle is empty and nothing is copied.
bool bltrect(VBitmap *dst, int x1, int y1, const VBitmap *src, int x2, int y2, int dx, int dy) {
	if (dst->depth != src->depth)
		return false;

	// Clip against the left/top edges.  A negative coordinate means that many
	// leading pixels fall outside that bitmap: shrink the rectangle and step
	// the *other* rectangle's origin forward by the same amount so the two
	// stay aligned.  (Horizontal clipping adjusts dx, vertical clipping dy.)
	if (x1 < 0) { x2 -= x1; dx += x1; x1 = 0; }
	if (y1 < 0) { y2 -= y1; dy += y1; y1 = 0; }
	if (x2 < 0) { x1 -= x2; dx += x2; x2 = 0; }
	if (y2 < 0) { y1 -= y2; dy += y2; y2 = 0; }

	// Clip against the right/bottom edges.
	if (x1+dx > dst->w) { dx = dst->w - x1; }
	if (y1+dy > dst->h) { dy = dst->h - y1; }
	if (x2+dx > src->w) { dx = src->w - x2; }
	if (y2+dy > src->h) { dy = src->h - y2; }

	if (dx>0 && dy>0) {
		const int bpp = dst->depth >> 3;		// bytes per pixel

		rectcopy((char *)dst->data + dst->pitch * (dst->h - (y1+dy)) + bpp*x1, dst->pitch,
			(const char *)src->data + src->pitch * (src->h - (y2+dy)) + bpp*x2, src->pitch,
			bpp*dx, dy);
	}

	return true;
}

int warpsharp_run(const FilterActivation *fa, const FilterFunctions *ff) {
	warpsharpFilterData *sfd = (warpsharpFilterData *)fa->filter_data;
	Pixel32 *src, *dst;
	const PixOffset pitch = fa->src.pitch;
	const int bumppitch = sfd->bumppitch;
	unsigned char *bump;
	int x, y;

	src = fa->src.data;

	for(y=1; y<fa->src.h-1; y++) {
		bump = sfd->bumpbuf + bumppitch*y + 1;
		for(x=0; x<fa->src.w-2; x++) {
			int rdx, gdx, bdx, rdy, gdy, bdy;
			int odx, ody, odz;

#if 1
			const Pixel32 *srcp = src+x;
			static const __int64 coeff = 0x0000003b00b7000ei64;

			__asm {
				mov			eax,srcp
				pxor		mm7,mm7
				mov			ecx,pitch
				movd		mm0,[eax+0]
				movd		mm1,[eax+4]
				punpcklbw	mm0,mm7
				movd		mm2,[eax+8]
				punpcklbw	mm1,mm7
				movd		mm3,[eax+ecx*2+0]
				punpcklbw	mm2,mm7
				movd		mm4,[eax+ecx*2+4]
				punpcklbw	mm3,mm7
				movd		mm5,[eax+ecx*2+8]
				punpcklbw	mm4,mm7
				punpcklbw	mm5,mm7
				paddw		mm1,mm1
				paddw		mm1,mm0
				paddw		mm4,mm4
				paddw		mm4,mm3
				paddw		mm1,mm2
				paddw		mm4,mm5
				movq		mm6,mm1
				psubusw		mm6,mm4
				psubusw		mm4,mm1
				por			mm6,mm4				;mm6 = absolute y difference

				;mm0, mm2, mm3, mm5 still hold corners.

				movd		mm1,[eax+ecx]
				movd		mm4,[eax+ecx+8]
				punpcklbw	mm1,mm7
				punpcklbw	mm4,mm7
				paddw		mm1,mm1
				paddw		mm4,mm4
				paddw		mm1,mm0
				paddw		mm1,mm3
				paddw		mm4,mm2
				paddw		mm4,mm5
				movq		mm5,mm4
				psubusw		mm5,mm1
				psubusw		mm1,mm4
				por			mm5,mm1				;mm5 = absolute x difference

				pmaddwd		mm6,coeff
				pmaddwd		mm5,coeff

				movq		mm0,mm5
				psrlq		mm5,32
				movq		mm1,mm6
				psrlq		mm6,32
				paddd		mm0,mm5				;mm0 = dx
				paddd		mm1,mm6				;mm1 = dy

				movd		eax,mm0
				movd		ecx,mm1
				add			eax,128
				add			ecx,128
				sar			eax,8
				sar			ecx,8
				mov			odx,eax
				mov			ody,ecx
			}
#elif 1
			rdx	= ((src[x + 0 + (pitch>>2)*0]>>16)&0xff) + ((src[x + 0 + (pitch>>2)*1]>>16)&0xff) + ((src[x + 0 + (pitch>>2)*2]>>16)&0xff)
				- ((src[x + 2 + (pitch>>2)*0]>>16)&0xff) - ((src[x + 2 + (pitch>>2)*1]>>16)&0xff) - ((src[x + 2 + (pitch>>2)*2]>>16)&0xff);
			gdx	= ((src[x + 0 + (pitch>>2)*0]>> 8)&0xff) + ((src[x + 0 + (pitch>>2)*1]>> 8)&0xff) + ((src[x + 0 + (pitch>>2)*2]>> 8)&0xff)
				- ((src[x + 2 + (pitch>>2)*0]>> 8)&0xff) - ((src[x + 2 + (pitch>>2)*1]>> 8)&0xff) - ((src[x + 2 + (pitch>>2)*2]>> 8)&0xff);
			bdx	= ((src[x + 0 + (pitch>>2)*0]    )&0xff) + ((src[x + 0 + (pitch>>2)*1]    )&0xff) + ((src[x + 0 + (pitch>>2)*2]    )&0xff)
				- ((src[x + 2 + (pitch>>2)*0]    )&0xff) - ((src[x + 2 + (pitch>>2)*1]    )&0xff) - ((src[x + 2 + (pitch>>2)*2]    )&0xff);

			rdy	= ((src[x + 0 + (pitch>>2)*0]>>16)&0xff) + ((src[x + 1 + (pitch>>2)*0]>>16)&0xff) + ((src[x + 2 + (pitch>>2)*0]>>16)&0xff)
				- ((src[x + 0 + (pitch>>2)*2]>>16)&0xff) - ((src[x + 1 + (pitch>>2)*2]>>16)&0xff) - ((src[x + 2 + (pitch>>2)*2]>>16)&0xff);
			gdy	= ((src[x + 0 + (pitch>>2)*0]>> 8)&0xff) + ((src[x + 1 + (pitch>>2)*0]>> 8)&0xff) + ((src[x + 2 + (pitch>>2)*0]>> 8)&0xff)
				- ((src[x + 0 + (pitch>>2)*2]>> 8)&0xff) - ((src[x + 1 + (pitch>>2)*2]>> 8)&0xff) - ((src[x + 2 + (pitch>>2)*2]>> 8)&0xff);
			bdy	= ((src[x + 0 + (pitch>>2)*0]    )&0xff) + ((src[x + 1 + (pitch>>2)*0]    )&0xff) + ((src[x + 2 + (pitch>>2)*0]    )&0xff)
				- ((src[x + 0 + (pitch>>2)*2]    )&0xff) - ((src[x + 1 + (pitch>>2)*2]    )&0xff) - ((src[x + 2 + (pitch>>2)*2]    )&0xff);

			odx = (abs(rdx)*59 + abs(gdx)*183 + abs(bdx)*14 + 128)>>8;
			ody = (abs(rdy)*59 + abs(gdy)*183 + abs(bdy)*14 + 128)>>8;

#else
			int y00 = ((src[x + 0 + (pitch>>2)*0]>>16)&0xff)*59 + ((src[x + 0 + (pitch>>2)*0]>> 8)&0xff)*183 + ((src[x + 0 + (pitch>>2)*0]    )&0xff)*14;
			int y01 = ((src[x + 1 + (pitch>>2)*0]>>16)&0xff)*59 + ((src[x + 1 + (pitch>>2)*0]>> 8)&0xff)*183 + ((src[x + 1 + (pitch>>2)*0]    )&0xff)*14;
			int y02 = ((src[x + 2 + (pitch>>2)*0]>>16)&0xff)*59 + ((src[x + 2 + (pitch>>2)*0]>> 8)&0xff)*183 + ((src[x + 2 + (pitch>>2)*0]    )&0xff)*14;
			int y10 = ((src[x + 0 + (pitch>>2)*1]>>16)&0xff)*59 + ((src[x + 0 + (pitch>>2)*1]>> 8)&0xff)*183 + ((src[x + 0 + (pitch>>2)*1]    )&0xff)*14;
			int y11 = ((src[x + 1 + (pitch>>2)*1]>>16)&0xff)*59 + ((src[x + 1 + (pitch>>2)*1]>> 8)&0xff)*183 + ((src[x + 1 + (pitch>>2)*1]    )&0xff)*14;
			int y12 = ((src[x + 2 + (pitch>>2)*1]>>16)&0xff)*59 + ((src[x + 2 + (pitch>>2)*1]>> 8)&0xff)*183 + ((src[x + 2 + (pitch>>2)*1]    )&0xff)*14;
			int y20 = ((src[x + 0 + (pitch>>2)*2]>>16)&0xff)*59 + ((src[x + 0 + (pitch>>2)*2]>> 8)&0xff)*183 + ((src[x + 0 + (pitch>>2)*2]    )&0xff)*14;
			int y21 = ((src[x + 1 + (pitch>>2)*2]>>16)&0xff)*59 + ((src[x + 1 + (pitch>>2)*2]>> 8)&0xff)*183 + ((src[x + 1 + (pitch>>2)*2]    )&0xff)*14;
			int y22 = ((src[x + 2 + (pitch>>2)*2]>>16)&0xff)*59 + ((src[x + 2 + (pitch>>2)*2]>> 8)&0xff)*183 + ((src[x + 2 + (pitch>>2)*2]    )&0xff)*14;

			odx = (abs((y00 + y10 + y20) - (y02 + y12 + y22)) + 128) >> 8;
			ody = (abs((y00 + y01 + y02) - (y20 + y21 + y22)) + 128) >> 8;
#endif

//			*bump++ = (int)(sqrt((double)dx*dx + (double)dy*dy)+0.5);
			// min(dx,dy) = dx + 0.5 * (fabs(dy-dx) - (dy-dx))
			// max(dx,dy) = dx + 0.5 * (fabs(dx-dy) - (dx-dy))

			// h ~= dx + dy - 0.5*min(dx,dy)
			//	  = dx + dy - 0.5*(dx + 0.5 * (fabs(dy-dx) - (dy-dx)))
			//	  = dx + dy - 0.5*dx + 0.25 * (fabs(dy-dx) - (dy-dx))
			//	  = 0.5*dx + dy + 0.25 * (fabs(dy-dx) - (dy-dx))
			//	  = 0.5*dx + dy + 0.25 * fabs(dy-dx) - 0.25*dy + 0.25*dx
			//	  = 0.75*(dx + dy) + 0.25 * fabs(dy-dx)

#if 0
			union {
				float f;
				int i;
			} converter;

			__asm emms

//			converter.f = (float)(0.75*(odx + ody) + 0.25*fabs(ody-odx) + (0.5 + (3<<22)));
			converter.f = (float)(sqrt((double)odx*odx + (double)ody*ody) + (0.5 + (3<<22)));
//			converter.f = (float)(abs(odx)+abs(ody) + (0.5 + (3<<22)));

			converter.i = (converter.i & 0xffff);

			if (converter.i > 128)
				converter.i = 128;

			*bump++ = converter.i;
#else

			int level = 3*(odx+ody) + abs(ody-odx);

			if (level > 128)
				level = 128;

			*bump++ = level;

#endif
		}

		src = (Pixel32 *)((char *)src + pitch);
	}

	for(int pass=0; pass<=sfd->blurlevel; ++pass) {
		sfd->veffBlur->run(&sfd->vbmBump);
		sfd->veffBlur->run(&sfd->vbmBump);
		sfd->veffBlur->run(&sfd->vbmBump);
	}

	__asm emms

	bltrect(&fa->dst, 0, 0, &fa->src, 0, 0, fa->src.w, 4);

	src = (Pixel32 *)((char *)fa->src.data + fa->src.pitch*4 + 16);
	dst = (Pixel32 *)((char *)fa->dst.data + fa->dst.pitch*4 + 16);

	int lo_dispy, hi_dispy;

	const int height = fa->src.h;
	const int width = fa->src.w;
	const int depth = sfd->depth * (sfd->blurlevel+1);

	for(y=0; y<height-8; y++) {
		dst[-4] = src[-4];
		dst[-3] = src[-3];
		dst[-2] = src[-2];
		dst[-1] = src[-1];

		lo_dispy = -(3+y)*256;
		hi_dispy = (height-2-y)*256 - 1;
		bump = sfd->bumpbuf + bumppitch * (3+y) + 3;

		int lo_dispx = -3*256;
		int hi_dispx = (width-2)*256 - 1;

		for(x=0; x<width-8; x++) {
			int dispx, dispy;

			// Vector points away from edge

			dispx = bump[1*bumppitch] - bump[2+bumppitch];
			dispy = bump[1] - bump[1+bumppitch*2];

			dispx = ((dispx*depth+8)>>4);
			dispy = ((dispy*depth+8)>>4);

			if (dispx < lo_dispx)
				dispx = lo_dispx;
			else if (dispx > hi_dispx)
				dispx = hi_dispx;

			if (dispy < lo_dispy)
				dispy = lo_dispy;
			else if (dispy > hi_dispy)
				dispy = hi_dispy;

#if 0
			Pixel32 z = (bump[1+bumppitch] * 0x010101);

			dst[x] = z+z - ((z>>7)&0x010101);
#elif 0
			const Pixel32 *src2 = &src[x + (dispx>>8) + (dispy>>8) * (pitch>>2)];

			dst[x] = interpolate_bilinear(src2, pitch, dispx&255, dispy&255);
#elif 1
			Pixel32 p00, p01, p02, p03, p10, p11, p12, p13, p20, p21, p22, p23, p30, p31, p32, p33;
			int ox = dispx&255;
			int oy = dispy&255;

/*
			const int *cubic_horiz = cubic_tab + ox*4;
			const int *cubic_vert = cubic_tab + oy*4;
			const Pixel32 *src2 = &src[x + (dispx>>8) + (dispy>>8) * (pitch>>2)];

			p00 = src2[-1 + -1*(pitch>>2)];
			p01 = src2[ 0 + -1*(pitch>>2)];
			p02 = src2[+1 + -1*(pitch>>2)];
			p03 = src2[+2 + -1*(pitch>>2)];
			p10 = src2[-1 +  0*(pitch>>2)];
			p11 = src2[ 0 +  0*(pitch>>2)];
			p12 = src2[+1 +  0*(pitch>>2)];
			p13 = src2[+2 +  0*(pitch>>2)];
			p20 = src2[-1 + +1*(pitch>>2)];
			p21 = src2[ 0 + +1*(pitch>>2)];
			p22 = src2[+1 + +1*(pitch>>2)];
			p23 = src2[+2 + +1*(pitch>>2)];
			p30 = src2[-1 + +2*(pitch>>2)];
			p31 = src2[ 0 + +2*(pitch>>2)];
			p32 = src2[+1 + +2*(pitch>>2)];
			p33 = src2[+2 + +2*(pitch>>2)];

			int ch0 = cubic_horiz[0];
			int ch1 = cubic_horiz[1];
			int ch2 = cubic_horiz[2];
			int ch3 = cubic_horiz[3];
			int cv0 = cubic_vert[0];
			int cv1 = cubic_vert[1];
			int cv2 = cubic_vert[2];
			int cv3 = cubic_vert[3];

			int r0 = ((int)((p00>>16)&0xff) * ch0 + (int)((p01>>16)&0xff) * ch1 + (int)((p02>>16)&0xff) * ch2 + (int)((p03>>16)&0xff) * ch3 + 128) >> 8;
			int g0 = ((int)((p00>> 8)&0xff) * ch0 + (int)((p01>> 8)&0xff) * ch1 + (int)((p02>> 8)&0xff) * ch2 + (int)((p03>> 8)&0xff) * ch3 + 128) >> 8;
			int b0 = ((int)((p00    )&0xff) * ch0 + (int)((p01    )&0xff) * ch1 + (int)((p02    )&0xff) * ch2 + (int)((p03    )&0xff) * ch3 + 128) >> 8;
			int r1 = ((int)((p10>>16)&0xff) * ch0 + (int)((p11>>16)&0xff) * ch1 + (int)((p12>>16)&0xff) * ch2 + (int)((p13>>16)&0xff) * ch3 + 128) >> 8;
			int g1 = ((int)((p10>> 8)&0xff) * ch0 + (int)((p11>> 8)&0xff) * ch1 + (int)((p12>> 8)&0xff) * ch2 + (int)((p13>> 8)&0xff) * ch3 + 128) >> 8;
			int b1 = ((int)((p10    )&0xff) * ch0 + (int)((p11    )&0xff) * ch1 + (int)((p12    )&0xff) * ch2 + (int)((p13    )&0xff) * ch3 + 128) >> 8;
			int r2 = ((int)((p20>>16)&0xff) * ch0 + (int)((p21>>16)&0xff) * ch1 + (int)((p22>>16)&0xff) * ch2 + (int)((p23>>16)&0xff) * ch3 + 128) >> 8;
			int g2 = ((int)((p20>> 8)&0xff) * ch0 + (int)((p21>> 8)&0xff) * ch1 + (int)((p22>> 8)&0xff) * ch2 + (int)((p23>> 8)&0xff) * ch3 + 128) >> 8;
			int b2 = ((int)((p20    )&0xff) * ch0 + (int)((p21    )&0xff) * ch1 + (int)((p22    )&0xff) * ch2 + (int)((p23    )&0xff) * ch3 + 128) >> 8;
			int r3 = ((int)((p30>>16)&0xff) * ch0 + (int)((p31>>16)&0xff) * ch1 + (int)((p32>>16)&0xff) * ch2 + (int)((p33>>16)&0xff) * ch3 + 128) >> 8;
			int g3 = ((int)((p30>> 8)&0xff) * ch0 + (int)((p31>> 8)&0xff) * ch1 + (int)((p32>> 8)&0xff) * ch2 + (int)((p33>> 8)&0xff) * ch3 + 128) >> 8;
			int b3 = ((int)((p30    )&0xff) * ch0 + (int)((p31    )&0xff) * ch1 + (int)((p32    )&0xff) * ch2 + (int)((p33    )&0xff) * ch3 + 128) >> 8;

			int r = (r0 * cv0 + r1 * cv1 + r2 * cv2 + r3 * cv3 + (1<<19)) >> 20;
			int g = (g0 * cv0 + g1 * cv1 + g2 * cv2 + g3 * cv3 + (1<<19)) >> 20;
			int b = (b0 * cv0 + b1 * cv1 + b2 * cv2 + b3 * cv3 + (1<<19)) >> 20;

			if (r<0) r=0; else if (r>255) r=255;
			if (g<0) g=0; else if (g>255) g=255;
			if (b<0) b=0; else if (b>255) b=255;

			dst[x] = (r<<16) + (g<<8) + b;*/

			const Pixel32 *src2 = &src[x + (dispx>>8) - 1 + ((dispy>>8)-1) * (pitch>>2)];
			const int *cubic_horiz = cubic_tab_MMX + ox*4;
			const int *cubic_vert = cubic_tab_MMX + oy*4;

			dst[x] = interpolate_cubic(src2, pitch, cubic_horiz, cubic_vert);

//			dst[x] = (bump[1 + fa->src.w]*255)>>6;
#elif 1
			Pixel32 p1, p2, p3, p4;
			int ox = (dispx>>4)&15;
			int oy = (dispy>>4)&15;

			p1 = src[x+(dispx>>8) + ((dispy>>8)+0)*(pitch>>2) + 0];
			p2 = src[x+(dispx>>8) + ((dispy>>8)+0)*(pitch>>2) + 1];
			p3 = src[x+(dispx>>8) + ((dispy>>8)+1)*(pitch>>2) + 0];
			p4 = src[x+(dispx>>8) + ((dispy>>8)+1)*(pitch>>2) + 1];

			dst[x]	= (((((p1&0xff00ff)*(16-ox) + (p2&0xff00ff)*ox)*(16-oy) + ((p3&0xff00ff)*(16-ox) + (p4&0xff00ff)*ox)*oy + 0x00800080)>>8)&0xff00ff)
					+ (((((p1&0x00ff00)*(16-ox) + (p2&0x00ff00)*ox)*(16-oy) + ((p3&0x00ff00)*(16-ox) + (p4&0x00ff00)*ox)*oy + 0x00008000)>>8)&0x00ff00);
#else
			dst[x] = ((dispx/2+128)<<8) + (dispy/2+128);
#endif

			++bump;
			lo_dispx -= 256;
			hi_dispx -= 256;
		}

		dst[x] = src[x];
		dst[x+1] = src[x+1];
		dst[x+2] = src[x+2];
		dst[x+3] = src[x+3];

		src = (Pixel32 *)((char *)src + pitch);
		dst = (Pixel32 *)((char *)dst + pitch);
	}

	__asm emms

	bltrect(&fa->dst, 0, fa->src.h-4, &fa->src, 0, fa->src.h-4, fa->src.w, 4);

	return 0;
}

// End callback: releases everything warpsharp_start() created.
//
// Returns 0 on success, 1 if there is no filter data.
int warpsharp_end(FilterActivation *fa, const FilterFunctions *ff) {
	warpsharpFilterData *sfd = (warpsharpFilterData *)fa->filter_data;

	if (!sfd) return 1;

	// The blur effect was created in warpsharp_start() and was never
	// released.  Destroy it before the bitmap it was created for.
	// NOTE(review): assumes VEffect objects are released with 'delete' (i.e.
	// VEffect has a virtual destructor) -- confirm against effect.h.
	delete sfd->veffBlur;
	sfd->veffBlur = NULL;

	delete[] sfd->bumpbuf;
	sfd->bumpbuf = NULL;

	// vbmBump was placement-constructed, so it is destroyed explicitly.
	sfd->vbmBump.~VBitmap();

	return 0;
}


// Dialog procedure for the configuration dialog.
//
// WM_INITDIALOG receives the warpsharpFilterData pointer in lParam, stores it
// in the dialog's user data, and sets up the depth (0..1024) and blur (0..3)
// trackbars plus their numeric labels.  OK ends the dialog with 0, Cancel with
// 1; both close the preview first.  Trackbar movement updates the filter data,
// refreshes the label, and asks the preview to redo the frame.
//
// Returns TRUE for handled messages, FALSE otherwise.
static BOOL APIENTRY warpsharpDlgProc( HWND hDlg, UINT message, UINT wParam, LONG lParam) {
	warpsharpFilterData *mfd = (warpsharpFilterData *)GetWindowLong(hDlg, DWL_USER);

    switch (message)
    {
        case WM_INITDIALOG:
			{
				HWND hwndItem;

				mfd = (warpsharpFilterData *)lParam;
				SetWindowLong(hDlg, DWL_USER, lParam);

				hwndItem = GetDlgItem(hDlg, IDC_DEPTH);

				SendMessage(hwndItem, TBM_SETRANGE, TRUE, MAKELONG(0,1024));
				SendMessage(hwndItem, TBM_SETPOS, (WPARAM)TRUE, mfd->depth);
				SetDlgItemInt(hDlg, IDC_STATIC_DEPTH, MulDiv(mfd->depth, 100, 256), FALSE);

				hwndItem = GetDlgItem(hDlg, IDC_BLUR);

				SendMessage(hwndItem, TBM_SETRANGE, TRUE, MAKELONG(0,3));
				SendMessage(hwndItem, TBM_SETPOS, (WPARAM)TRUE, mfd->blurlevel);
				SetDlgItemInt(hDlg, IDC_STATIC_BLUR, mfd->blurlevel+1, FALSE);

				mfd->ifp->InitButton(GetDlgItem(hDlg, IDC_PREVIEW));

			}
            return (TRUE);

        case WM_COMMAND:
			switch(LOWORD(wParam)) {
            case IDOK:
				mfd->ifp->Close();
				EndDialog(hDlg, 0);
				return TRUE;

			case IDCANCEL:
				mfd->ifp->Close();
                EndDialog(hDlg, 1);
                return TRUE;

			case IDC_PREVIEW:
				mfd->ifp->Toggle(hDlg);
				return TRUE;

			}
			break;

		case WM_HSCROLL:
			if (lParam) {
				const HWND hwndCtl = (HWND)lParam;

				// wParam carries the scroll request code (and, for thumb
				// messages only, a position), so it cannot be compared with
				// the stored setting.  Ask the trackbar for its actual
				// position and react only when that really changed.
				const int pos = (int)SendMessage(hwndCtl, TBM_GETPOS, 0, 0);

				switch(GetWindowLong(hwndCtl, GWL_ID)) {
				case IDC_DEPTH:
					if (pos != mfd->depth) {
						mfd->depth = pos;
						mfd->ifp->RedoFrame();
						SetDlgItemInt(hDlg, IDC_STATIC_DEPTH, MulDiv(mfd->depth, 100, 256), FALSE);
					}
					break;
				case IDC_BLUR:
					if (pos != mfd->blurlevel) {
						mfd->blurlevel = pos;
						mfd->ifp->RedoFrame();
						SetDlgItemInt(hDlg, IDC_STATIC_BLUR, mfd->blurlevel+1, FALSE);
					}
					break;
				}
			}
			return TRUE;
    }
    return FALSE;
}

static void warpsharp_string(const FilterActivation *fa, const FilterFunctions *ff, char *buf) {
	warpsharpFilterData *mfd = (warpsharpFilterData *)fa->filter_data;

	wsprintf(buf, " (depth: %d, blur %dx)", MulDiv(mfd->depth, 100, 256), mfd->blurlevel+1);
}

// Config callback: shows the modal configuration dialog.
//
// The current settings are snapshotted first; if the dialog ends with a
// non-zero result (Cancel ends it with 1) the snapshot is restored.  The
// dialog result is returned unchanged.
static int warpsharp_config(FilterActivation *fa, const FilterFunctions *ff, HWND hWnd)
{
	warpsharpFilterData *const data = (warpsharpFilterData *)fa->filter_data;
	warpsharpFilterData saved = *data;

	// The dialog procedure reaches the preview through the filter data.
	data->ifp = fa->ifp;

	const int result = DialogBoxParam(fa->filter->module->hInstModule, MAKEINTRESOURCE(IDD_FILTER_WARPSHARP), hWnd, warpsharpDlgProc, (LONG)data);

	if (result != 0) {
		*data = saved;
	}

	return result;
}

// Script "Config" handler: the first argument sets depth; the optional second
// argument sets blurlevel, which defaults to 0 when omitted.
static void warpsharp_script_config(IScriptInterpreter *isi, void *lpVoid, CScriptValue *argv, int argc)
{
	FilterActivation *const activation = (FilterActivation *)lpVoid;
	warpsharpFilterData *const settings = (warpsharpFilterData *)activation->filter_data;

	settings->depth = argv[0].asInt();
	settings->blurlevel = (argc > 1) ? argv[1].asInt() : 0;
}

// Script-line callback: writes the Config(...) call that reproduces the
// current settings into buf, which holds buflen bytes.
//
// The output is always NUL-terminated when buflen > 0; if it does not fit it
// is truncated.  Nothing is written when buflen <= 0.  Always returns true.
static bool warpsharp_script_line(FilterActivation *fa, const FilterFunctions *ff, char *buf, int buflen) {
	warpsharpFilterData *mfd = (warpsharpFilterData *)fa->filter_data;

	if (buf && buflen > 0) {
		// _snprintf does not write a terminator when the output fills the
		// buffer, so reserve the last byte and terminate explicitly.
		_snprintf(buf, buflen - 1, "Config(%d, %d)", mfd->depth, mfd->blurlevel);
		buf[buflen - 1] = 0;
	}

	return true;
}
