/*
    2d cleaner filter for VirtualDub -- blends a pixel with pixels
	surrounding it as long as those pixels are similar to the source
	pixel

    Copyright (C) 2000 Jim Casaburi
		Based on code by Avery Lee
		Useful suggestions and much help from Donald A. Graft

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

	The author can be contacted at:
	Jim Casaburi
	casaburi@earthlink.net

	New versions of the source code and the compiled filter can be
	found at http://home.earthlink.net/~casaburi/download/
*/

#define ATHLON

#include "../include/filter.h"
#include "resource.h"
#include <stdio.h>
#include "../include/ScriptInterpreter.h"
#include "../include/ScriptError.h"
#include "../include/ScriptValue.h"

/* Per-instance filter settings (stored in FilterActivation::filter_data). */
typedef struct MyFilterData {
	IFilterPreview		*ifp;	// preview interface; copied from fa->ifp in twodclean_ConfigProc
	bool fDebugNoise;	// "show edges" mode (see twodclean_StringProc)
	bool fInterlace;	// treat the frame as interlaced: neighbours are taken every second row
	int fThreshold;		// per-channel similarity threshold; the dialog accepts 0..255
	int fRadiusx;		// horizontal window radius; the dialog accepts 0..10
	int fRadiusy;		// vertical window radius; the dialog accepts 0..10
} MyFilterData;

/* Forward declarations for the filter callbacks and helpers defined below. */
static int twodclean_run(const FilterActivation *fa, const FilterFunctions *ff);
BOOL CALLBACK twodclean_ConfigDlgProc(HWND hdlg, UINT msg, WPARAM wParam, LPARAM lParam);
int twodclean_ConfigProc(FilterActivation *fa, const FilterFunctions *ff, HWND hwnd);
void twodclean_StringProc(const FilterActivation *fa, const FilterFunctions *ff, char *str);
void twodcleanScriptConfig(IScriptInterpreter *isi, void *lpVoid, CScriptValue *argv, int argc);
void initindices(void);	// rebuilds the neighbour offset tables from radiusx0/radiusy0/pitchm0

/*
 * Script functions exported by this filter: a single "Config" entry handled by
 * twodcleanScriptConfig, which reads four integer arguments (argv[0]..argv[3]).
 * NOTE(review): "0iiii" is presumably the host's signature encoding for
 * "no return value, four ints" -- confirm against ScriptInterpreter.h.
 */
ScriptFunctionDef twodclean_func_defs[]={
    { (ScriptFunctionPtr)twodcleanScriptConfig, "Config", "0iiii" },
    { NULL },	// terminator entry
};

/* Script object referenced from filterDef_2dclean; it carries the function table above. */
CScriptObject twodclean_obj={
    NULL, twodclean_func_defs
};

/* Warning! Only for Athlon, PIII and P4, because it makes use of pmaxub and pminub
This assembler code cannot be used for regions larger than 11x11, because MMX instructions
pcmpgtw and pmaddwd operate with 16-bit signed words. Thus, the sum of 129 pixels with
channel value 255 would lead to overflow. 121 pixels is still OK.

There is one more difference from the original C code: a pixel is not included
in the blend if at least one of its RGB components is beyond the threshold (was: only
the offending channel did not participate). This is because if one channel is outside
the interval, it is highly probable that the pixel corresponds to some detail of the
picture and not to noise. Hence, its other channels are discarded, too. Compared with the
C code, on a PII-350 the speed gain is more than 3x. On an AMD K7T, the gain is 4x.

One more thing: division is rounded to nearest integer (not truncated, as it was originally).

On Athlon it is nearly optimal: - around 4.5 cycles to process one pixel
                                - dependencies are minimized, so that on average
								  > 2 instructions per cycle are executed

Assembly code written by: Jaan Kalda, kalda@ioc.ee */

/* Preview, help, miscellaneous fixes added by Donald Graft. */

/*
 * Global state shared between the C code and the inline assembly in twodclean_run.
 * The assembly repurposes ebp/esp, so it cannot address C locals and uses these
 * globals (and a scratch area on the stack) instead.
 */

// Lookup tables filled by twodclean_init: 128 entries of two ints (8 bytes) each.
// The assembly indexes them by the number of accepted neighbours.
int divtable[256], n_plus_1[256]; //11*11*2
// Neighbour counts per horizontal clipping amount, centred on index 10 (filled by initindices).
char cellcount[21];
// Start pointers into sitevectors per number of clipped left columns (filled by initindices).
int *startindices[11];
// Byte offsets of the window's neighbours, column by column (filled by initindices).
int sitevectors[128];
// The same byte offsets, row by row (filled by initindices); used for border rows.
int sitevectors2[128];
// Scratch offset list built by the "corners" code in twodclean_run.
int sitevectors3[128];
// Parameters the index tables were last built for; twodclean_run rebuilds on change.
int radiusx0, radiusy0, pitchm0;
// esp00/ebp00: saved esp/ebp while the assembly runs; eax00/edi00: saved in "corners";
// prefetchdistance: byte distance for prefetchnta; startpoint/pointstot: offset list and
// neighbour count for the current border row; pointstot0: neighbour count of a full window.
int	esp00, ebp00, prefetchdistance, eax00, edi00, *startpoint, pointstot, pointstot0;
// Size in bytes of the scratch area the assembly reserves on the stack.
#define LOCVARSZ 64
// Byte offsets into that scratch area (relative to the aligned esp).
#define STARTP 4	// not referenced in the code visible here
#define PIXTOT 8	// neighbour count for the current pixel
#define WW 12		// remaining pixel counter for the current row
#define THRSH 16	// threshold qword (8 bytes)
#define SCRATCH 24	// dummy destination for the first, not yet valid, result store
#define FDBN 28		// fDebugNoise flag
#define RADIUSX 32	// horizontal radius
#define MAXX 36		// width - 1 - radiusx
#define DST 40		// current destination pointer
#define PITCH 44	// source pitch in bytes
#define CURRENT 48	// 16 bytes of accept masks (-1/0 dwords) for the pixels in flight

/*
 * Filter run procedure: cleans one frame from fa->src into fa->dst.
 *
 * For every pixel, the neighbours inside the (2*fRadiusx+1) x (2*fRadiusy+1)
 * window are compared against the source pixel; a neighbour is accepted only if
 * all of its channels lie within fThreshold of the source pixel. The output is
 * the rounded average of the source pixel and the accepted neighbours
 * (division is replaced by a multiply with divtable/n_plus_1 entries).
 * With fDebugNoise set, the sum is zeroed when the accepted count does not
 * exceed half of the neighbour count (see mdebugnoise / bdebugnoise).
 *
 * Structure: the C loop walks the rows. Rows whose window fits vertically run
 * the first __asm block ("middle" path). Rows near the first or last line jump to
 * the `borderrows` __asm block, which uses a clipped offset list and rejoins at
 * `lineout`. The __asm block placed after `return 0` holds out-of-line branches
 * of the middle path and is only reached by jumps.
 *
 * The assembly saves ebp/esp in the globals ebp00/esp00, reserves LOCVARSZ bytes
 * of 8-byte-aligned stack and keeps its working variables there.
 *
 * Requires pmaxub/pminub, cmov and prefetch instructions (see the note above the
 * globals); prefetchw is emitted only when ATHLON is defined.
 *
 * Always returns 0.
 */
static int twodclean_run(const FilterActivation *fa, const FilterFunctions *ff) {
	MyFilterData *mfd = (MyFilterData *)fa->filter_data;
	unsigned long h;
	int y, lowx0,lowy0, highy0, Thresh;
	unsigned long *src, *dst; //lowx, highx, lowxx
	int	radiusx, radiusy, lowy, highy, modulo0, width, height, maxx;
	int pitch, pitchm, multfact, multfactm;
	bool fDebugNoise;

	pitch = fa->src.pitch;
	src = (unsigned long *)fa->src.data;
	dst = (unsigned long *)fa->dst.data;
	modulo0 = fa->src.modulo;
	fDebugNoise = mfd->fDebugNoise;
	radiusx = mfd->fRadiusx;
	radiusy = mfd->fRadiusy;
	// number of neighbours in a full window (source pixel excluded)
    pointstot0 = (1 + 2 * radiusy) * (1 + 2 * radiusx) - 1;
	Thresh = mfd->fThreshold;
	// replicate the threshold into the three low bytes; the top byte is 0xFF, so the
	// saturating add/subtract yields the full 0..255 range for that byte
	Thresh |= (Thresh << 8) | (Thresh << 16) | 0xFF000000;
	// step back one row; the assembly adds pitch to the pointer again before use
	src -= pitch>>2;
	if (mfd->fInterlace) {
	// are we dealing with interlaced source?
		multfact = 2;
		pitchm = 2 * pitch;
		multfactm = 0;
		lowy0 = 1 - 2 * radiusy;
		highy0 = 1 + 2 * radiusy ;
	} else {
		multfact = 1;
		multfactm = -1;
		pitchm = pitch;
		lowy0 = 1 - radiusy;
		highy0 = 1 + radiusy;
	}

	// rebuild the offset tables only when a radius or the row step changed
	if ((radiusx0-radiusx) | (radiusy0-radiusy) | (pitchm0-pitchm))	{
		  radiusx0=radiusx;
		  radiusy0=radiusy;
		  pitchm0=pitchm;
		  initindices();
	  }

	prefetchdistance = pitch * (radiusy + 1);
	height = h = fa->src.h;
	y = 0;
	width = fa->src.w -1;
	maxx = width - radiusx;
	lowx0 = -radiusx;
	do {
		lowy =lowy0;
		highy = highy0;
		// window reaches below row 0: clip it and use the tail of the row-major list
		if (y + lowy0 <= 0) {// if it is a bottom line
		   lowy = 1-y ;
		   if (multfactm) pointstot = (highy-lowy + 1) * (1 + 2 * radiusx)-1;
		   else  pointstot = (((highy-lowy)>>1) + 1) * (1 + 2 * radiusx) - 1;
			startpoint = &sitevectors2[pointstot0-pointstot];
		   goto  borderrows;
	    }
		// window reaches past the last row: clip it and use the head of the row-major list
		else if ((y + highy0 > height)) {//if it is a top line
			highy = height - y;
		   if (multfactm) pointstot = (highy-lowy + 1) * (1 + 2 * radiusx)-1;
		   else  pointstot = (((highy-lowy)>>1) + 1) * (1 + 2 * radiusx) - 1;
			startpoint = &sitevectors2[0];
		   goto  borderrows;
		}
		//A ROW IN THE MIDDLE
__asm{
//;we want to use ebp, but MSVC6 refuses to omit stack pointer. So we cannot access local variables here
//; and have to use either global ones or allocate space on stack. The latter is somewhat better,
//; because it allows shorter forms of addressing.

		movd	mm4,[Thresh]
		mov		eax,src //;eax will be the pointer to the source pixel
		mov		edx,maxx  //;reallocate local variables
		mov		ebx,radiusx //;reallocate local variables
		movsx	ecx,fDebugNoise //;reallocate local variables
		mov		edi,pitch //;reallocate local variables
		mov		esi,width //
		mov		ebp00,ebp
		mov		ebp,dst
		mov		esp00,esp
		and		esp,-8  //;align stack on qword boundary
		sub		esp,LOCVARSZ
		punpckldq mm4,mm4
		add		eax,edi //; because above is src -= pitch>>2; we can get rid of both statements
		mov		[esp+MAXX],edx //;this is the index of the last pixel of the middle part of the row
		mov		[esp+RADIUSX],ebx
		mov		[esp+FDBN],ecx
		mov		[esp+PITCH],edi
		mov		[esp+DST],ebp


		lea		edi,[esp+SCRATCH+4] //; where to write the non-valid pixel before the cycle begins
		pxor	mm7,mm7  //; keep 0
		movq	[esp+THRSH],mm4

align 16
mlinestart:
		movd	mm6,[eax] //(1); load the source pixel
		xor		ecx,ecx //(a);0 for cmov
		sub		edx,esi //(b);here esi = w-1, edx (was) = maxx. Now edx = maxx - (w-1)
		punpcklwd mm5,mm4 //(.Q);mm5 =(b,-1,g,-1)
		punpckhwd mm3,mm4 //(.R);mm3 =(r,-1,x,-1) the last result to be stored
				//;XXXXXXXXXXXXXX  eax - the source pixel (unsigned long *) ((char *) src + pitch);
		movq 	mm4,[esp+THRSH] //(2);
		movq	mm1,mm6 //(3); rgb0 in low dword as bytes
		punpckldq mm6,mm6 //(4); duplicate the source into high dword
		cmovge	edx,ecx //(c);index for leftedge, edx <= 0 (edx<0, if  w-1 > width-radiusx)
		pmaddwd	mm5,mm0 //(.S);(b*w+w/2,g*w+w/2)
		pmaddwd	mm3,mm0 //(.T);(r*w+w/2,x*w+w/2)
		movq	mm2,mm4 //(5)
		sub		ebx,esi //(d);ebx was radiusx. So ebx = (radiusx-(w-1)). If ebx > 0 => w-1 < radiusx
		paddusb	mm4,mm6 //(6);Threshold_High
		cmovle	ebx,ecx	//(e);ebx = index for right edge
		psrld	mm5,15 //(.U);0th and 4th bytes are <b> and <g>
		psrld	mm3,15 //(.V)
		psubusb	mm6,mm2 //(7);Threshold_Low
		punpcklbw	mm1,mm7 //(8);mm1=rgb0 as words
		mov		ecx,startindices[40+4*edx]   //(f);if ebx=[ecx+4*edx] then the source pixel is [eax+ebx]
		movsx	edx,cellcount[10+edx+ebx] //(g); NB! assumes width >= 2*radiusx and radiusx <= 10 (declaration: char cellcount[21])
		packuswb mm5,mm3 //(.X);0th, 2nd, and 4th bytes are <b>, <g> and <r>
		mov		[esp+WW],esi //(h); store (w-1) in ww
		mov		[esp+PIXTOT],edx //(i)
		pxor	mm2,mm2 //(9); mm2=0, so that it results on 0 term on first pass [at (I') til (M')]
		movq	[esp+CURRENT],mm7 //(10);
		packuswb mm5,mm5 //(.Y);0th,1st, and 3rd
		movq	[esp+CURRENT+8],mm7 //(10);
		mov		ebx,[ecx+4*edx-4] //(l)
		mov		esi,[ecx+4*edx-8] //(m)
		xor		ebp,ebp //(k);set pixel count to 0
		test	edx,3
		movd	[edi-4],mm5//(.Z)
		movq	mm5,mm1	//(11);mm5=rgb_count
		mov		edi,ebp //(j);set pixel count to 0
		jnz		oddnumerofpairs

evennumerofpairs: //;mm7=0, mm4=Thresh_High mm6=Thresh_Low mm5=rgb(unpacked), edi=ebp=0
		movd	mm0,[eax+ebx]	//(A+); mm0 :rgb of the pixel
		sub		edx,4
		punpckldq	mm0,[eax+esi] //(B+);one more rgb to bits 32-63
		movq	mm1,mm6			//(C) ;[8xTL]
		jz		lastfour
align 16
mloop:
		pmaxub	mm1,mm0			//(D)
		pminub	mm0,mm4			//(E+)
		movq	mm3,mm2			//(I')
		mov		ebx,[ecx+4*edx+4]
		mov		esi,[ecx+4*edx]
		punpcklbw mm2,mm7		//(J+')
		punpckhbw mm3,mm7		//(K')
		psubb	mm0,mm1			//(F+);mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		paddusw	mm5,mm2			//(L+')
		movd	mm2,[eax+ebx]	//(A'+); mm0 :rgb of the pixel
		punpckldq	mm2,[eax+esi] //(B'+);one more rgb to bits 32-63
		sub		ebp,[esp+CURRENT]
		sub		edi,[esp+CURRENT+4]
		pcmpeqd mm0,mm7			//(G+); -1, if all were within limits; 0 otherwise
		paddusw	mm5,mm3			//(M+')
		movq	mm3,mm6			//(C') ;[8xTL]
		movq	[esp+CURRENT],mm0
		pand	mm0,mm1			//(H+); rgb masked
		pmaxub	mm3,mm2			//(D')
		pminub	mm2,mm4			//(E+')
		movq	mm1,mm0			//(I)
		mov		ebx,[ecx+4*edx-4]
		mov		esi,[ecx+4*edx-8]
		punpcklbw mm0,mm7		//(J+)
		punpckhbw mm1,mm7		//(K)
		psubb	mm2,mm3			//(F+');mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		paddusw	mm5,mm0			//(L+)
		movd	mm0,[eax+ebx]	//(A+); mm0 :rgb of the pixel
		punpckldq	mm0,[eax+esi] //(B+);one more rgb to bits 32-63
		sub		ebp,[esp+CURRENT+8]
		sub		edi,[esp+CURRENT+12]
		pcmpeqd mm2,mm7			//(G+'); -1, if all were within limits; 0 otherwise
		paddusw	mm5,mm1			//(M+)
		movq	mm1,mm6			//(C) ;[8xTL]
		sub		edx,4
		movq	[esp+CURRENT+8],mm2
		pand	mm2,mm3			//(H+'); rgb masked
		jg		mloop
lastfour:
		pmaxub	mm1,mm0			//(D)
		pminub	mm0,mm4			//(E+)
		movq	mm3,mm2			//(I')
		mov		ebx,[ecx+4*edx+4]
		mov		esi,[ecx+4*edx]
		punpcklbw mm2,mm7		//(J+')
		punpckhbw mm3,mm7		//(K')
		psubb	mm0,mm1			//(F+);mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		paddusw	mm5,mm2			//(L+')
		movd	mm2,[eax+ebx]	//(A'+); mm0 :rgb of the pixel
		punpckldq	mm2,[eax+esi] //(B'+);one more rgb to bits 32-63
		sub		ebp,[esp+CURRENT]
		sub		edi,[esp+CURRENT+4]
		pcmpeqd mm0,mm7			//(G+); -1, if all were within limits; 0 otherwise
		paddusw	mm5,mm3			//(M+')
		movq	mm3,mm6			//(C') ;[8xTL]
		movq	[esp+CURRENT],mm0
		pand	mm0,mm1			//(H+); rgb masked
		pmaxub	mm3,mm2			//(D')
		pminub	mm2,mm4			//(E+')
		movq	mm1,mm0			//(I)
		punpcklbw mm0,mm7		//(J+)
		punpckhbw mm1,mm7		//(K)
		sub		ebp,[esp+CURRENT+8]
		psubb	mm2,mm3			//(F+');mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		paddusw	mm5,mm0			//(L+)
		sub		edi,[esp+CURRENT+12]
		pcmpeqd mm2,mm7			//(G+'); -1, if all were within limits; 0 otherwise
		paddusw	mm5,mm1			//(M+)
		sub		edx,4
		movd	[esp+CURRENT+8],mm2
		pand	mm3,mm2			//(H+'); rgb masked
final:
		punpckhdq mm2,mm2
		mov		ecx,prefetchdistance
		movd	[esp+CURRENT+12],mm2
		movq	mm2,mm3			//(I')
		punpcklbw mm3,mm7		//(J+')
		sub		ebp,[esp+CURRENT]
		prefetchnta [ecx+eax] //;for PIII and K7 (OK, but without any benefit on PII)
		punpckhbw mm2,mm7		//(K')
		mov		ecx,[esp+CURRENT+8]
		add		ecx,[esp+CURRENT+12]
		sub		edi,[esp+CURRENT+4]
		paddusw	mm5,mm3			//(L+')
		mov		edx,[esp+DST]	//;(i)
		mov		ebx,[esp+PITCH]	//;(ii)
		mov		esi,[esp+WW]			//;(iii)
		add		edx,4			//;(iv)
		add		eax,4

#ifdef ATHLON
		prefetchw [ebx+edx] //- for AMD-K7
#endif
		paddusw	mm5,mm2			//(M+')
//		sub		ebp,[esp+CURRENT+8]
		mov		[esp+DST],edx			//;(vii)
		test	[esp+FDBN],-1  //;(vi)
		mov		ebx,[esp+RADIUSX]	//;(ix)
		jnz		mdebugnoise

		movq	mm3,mm5 		//(.N);(b,g,r,x)
		sub		edi,ecx
		shl		ebp,3			//;(v)
mnodebugnoise:
		mov		ecx,edx
		mov		edx,[esp+MAXX]		//;(xi)
		dec		esi				//;(viii) esi = w-1
		movq 	mm4,[n_plus_1+8*edi+ebp] //(.O);(n+1, n+1, n+1, n+1)
		movq	mm0,[divtable+8*edi+ebp] //(.P);mm0=(w,-w/2,w,-w/2) (w<= 2^14)
		mov		edi,ecx			//;(x)
		jns		mlinestart
			//;THE END OF THE LINE


		punpcklwd mm5,mm4 //;mm5 =(b,-1,g,-1)
		punpckhwd mm3,mm4 //;mm3 =(r,-1,x,-1)
		pmaddwd	mm5,mm0 //;(b*w+w/2,g*w+w/2)
		pmaddwd	mm3,mm0 //;(r*w+w/2,x*w+w/2)
		psrld	mm5,15 //;0th and 4th bytes are <b> and <g>
		psrld	mm3,15
		packuswb mm5,mm3 //;0th, 2nd, and 4th bytes are <b>, <g> and <r>
		packuswb mm5,mm5 //;0th,1st, and 3rd
		movd	[edi-4],mm5

lineout:

		mov		ebp,ebp00
		mov		esp,esp00
}

		 //END THE ROW IN THE MIDDLE

		// advance both pointers by one row (pitch is in bytes, the pointers are 32-bit pixels)
		src += pitch>>2;
		dst += fa->dst.pitch>>2;
		y++;
	} while (--h);
	__asm { emms}

return 0;

	// Out-of-line branches of the middle-row loop; reached only via jumps from the block above.
	__asm{
mdebugnoise:
		sub		edi,ecx
		mov		ecx,[esp+PIXTOT]
		lea		ebx,[edi+ebp]
		shr		ecx,1
		cmp		ebx,ecx
		ja		mnodebugnoise0
		pxor	mm5,mm5
		pxor	mm3,mm3
mnodebugnoise0:
		shl		ebp,3			//;(v)
		mov		ebx,[esp+RADIUSX]	//;(ix)
		movq	mm3,mm5 		//(.N);(b,g,r,x)
		jmp		mnodebugnoise
oddnumerofpairs:
		test	edx,1
		jnz		oddnumerofcells
		sub		edx,2
		jz		lasttwo //;maybe there was just two cells?
		movd	mm2,[eax+ebx]	//(A'+); mm0 :rgb of the pixel
		punpckldq	mm2,[eax+esi] //(B'+);one more rgb to bits 32-63
		movq	mm3,mm6			//(C') ;[8xTL]
		pmaxub	mm3,mm2			//(D')
		pminub	mm2,mm4			//(E+')
		mov		ebx,[ecx+4*edx-4]
		mov		esi,[ecx+4*edx-8]
		psubb	mm2,mm3			//(F+');mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		movd	mm0,[eax+ebx]	//(A+); mm0 :rgb of the pixel
		punpckldq	mm0,[eax+esi] //(B+);one more rgb to bits 32-63
		pcmpeqd mm2,mm7			//(G+'); -1, if all were within limits; 0 otherwise
		movq	mm1,mm6			//(C) ;[8xTL]
		sub		edx,4
		movq	[esp+CURRENT+8],mm2
		pand	mm2,mm3			//(H+'); rgb masked
		jg		mloop
		jmp 	lastfour
lasttwo:
		movd	mm2,[eax+ebx]	//(A'+); mm0 :rgb of the pixel
		punpckldq	mm2,[eax+esi] //(B'+);one more rgb to bits 32-63
		movq	mm3,mm6			//(C') ;[8xTL]
		pmaxub	mm3,mm2			//(D')
		pminub	mm2,mm4			//(E+')
		psubb	mm2,mm3			//(F+');mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		pcmpeqd mm2,mm7			//(G+'); -1, if all were within limits; 0 otherwise
		movd	[esp+CURRENT+8],mm2
		pand	mm3,mm2			//(H+'); rgb masked
		jmp		final
oddnumerofcells:
		inc		edx //;add the source pixel,too, so that the coun will be even
		dec		edi //;this would lead to incorrect pixel count, here we make adjustment
		pxor	mm5,mm5 //; delete the source pixel from the total sum, it will be added later!
		xor		esi,esi		//; point to the pixel itself!
		test	edx,2
		jz		evennumerofpairs
		jmp		oddnumerofpairs
}

// Rows whose window is clipped vertically: uses startpoint/pointstot set by the C code
// above and rejoins the row loop at lineout.
borderrows:
__asm{
		movd	mm4,[Thresh]
		mov		eax,src
		mov		edx,maxx  //;do it at the end also
		mov		ebx,radiusx //;do it at the end also
		movsx		ecx,fDebugNoise
		mov		edi,pitch
		mov		esi,width //
		mov		ebp00,ebp
		mov		ebp,dst
		mov		esp00,esp
		and		esp,-8  //;align stack on qword boundary
		sub		esp,LOCVARSZ
		punpckldq mm4,mm4
		add		eax,edi
		mov		[esp+MAXX],edx
		mov		[esp+RADIUSX],ebx
		mov		[esp+FDBN],ecx
		mov		[esp+PITCH],edi
		mov		[esp+DST],ebp


		lea		edi,[esp+SCRATCH+4] //; where to write the non-valid pixel before the cycle begins
		pxor	mm7,mm7  //; keep 0
		movq	[esp+THRSH],mm4
align 16
blinestart:

		punpcklwd mm5,mm4 //(.Q);mm5 =(b,-1,g,-1)
		punpckhwd mm3,mm4 //(.R);mm3 =(r,-1,x,-1) the last result to be stored
		movd	mm6,[eax] //(1); load the source pixel
		xor		ecx,ecx //(a);0 for cmov
		sub		edx,esi //(b);here esi = w-1, edx (was) = maxx. Now edx = maxx - (w-1)
				//;XXXXXXXXXXXXXX  eax - the source pixel (unsigned long *) ((char *) src + pitch);
		movq 	mm4,[esp+THRSH] //(2);
		pmaddwd	mm5,mm0 //(.S);(b*w+w/2,g*w+w/2)
		pmaddwd	mm3,mm0 //(.T);(r*w+w/2,x*w+w/2)
		movq	mm1,mm6 //(3); rgb0 in low dword as bytes
		punpckldq mm6,mm6 //(4); duplicate the source into high dword
		movq	mm2,mm4 //(5)

		cmovge	edx,ecx //(c);index for leftedge, edx <= 0 (edx<0, if  w-1 > width-radiusx)
		sub		ebx,esi //(d);ebx was radiusx. So ebx = (radiusx-(w-1)). If ebx > 0 => w-1 < radiusx

		psrld	mm5,15 //(.U);0th and 4th bytes are <b> and <g>
		psrld	mm3,15 //(.V)
		cmovle	ebx,ecx	//(e);ebx = index for right edge
		paddusb	mm4,mm6 //(6);Threshold_High
		psubusb	mm6,mm2 //(7);Threshold_Low
		punpcklbw	mm1,mm7 //(8);mm1=rgb0 as words
		cmp		edx,ebx
		mov		ecx,startpoint   //(f);if ebx=[ecx+4*edx] then the source pixel is [eax+ebx]
		packuswb mm5,mm3 //(.X);0th, 2nd, and 4th bytes are <b>, <g> and <r>
		mov		[esp+WW],esi //(h); store (w-1) in ww

		jne		corners  // can modify esi,ebx,ebp
		mov		edx,pointstot //(g);
fromcorners:
		packuswb mm5,mm5  //(.Y);0th,1st, and 3rd
		pxor	mm2,mm2 //(9); mm2=0, so that it results on 0 term on first pass [at (I') til (M')]
		mov		[esp+PIXTOT],edx //(i)
		movq	[esp+CURRENT],mm7 //(10);
		movd	[edi-4],mm5//(.Z)
		movq	mm5,mm1	//(11);mm5=rgb_count
		xor		edi,edi //(j);set pixel count to 0
		xor		ebp,ebp //(k);set pixel count to 0
		mov		ebx,[ecx+4*edx-4] //(l)
		mov		esi,[ecx+4*edx-8] //(m)
		test	edx,1
		jnz		boddnumerofcells
		test	edx,2
		jnz		boddnumerofpairs

bevennumerofpairs: //;mm7=0, mm4=Thresh_High mm6=Thresh_Low mm5=rgb(unpacked), edi=ebp=0
		movd	mm0,[eax+ebx]	//(A+); mm0 :rgb of the pixel
		sub		edx,4
		punpckldq	mm0,[eax+esi] //(B+);one more rgb to bits 32-63
		movq	mm1,mm6			//(C) ;[8xTL]
		jz		blastfour
align 16
bloop:
		pmaxub	mm1,mm0			//(D)
		pminub	mm0,mm4			//(E+)
		movq	mm3,mm2			//(I')
		mov		ebx,[ecx+4*edx+4]
		mov		esi,[ecx+4*edx]
		punpcklbw mm2,mm7		//(J+')
		punpckhbw mm3,mm7		//(K')
		psubb	mm0,mm1			//(F+);mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		paddusw	mm5,mm2			//(L+')
		movd	mm2,[eax+ebx]	//(A'+); mm0 :rgb of the pixel
		punpckldq	mm2,[eax+esi] //(B'+);one more rgb to bits 32-63
		sub		ebp,[esp+CURRENT]
		sub		edi,[esp+CURRENT+4]
		pcmpeqd mm0,mm7			//(G+); -1, if all were within limits; 0 otherwise
		paddusw	mm5,mm3			//(M+')
		movq	mm3,mm6			//(C') ;[8xTL]
		movq	[esp+CURRENT],mm0
		pand	mm0,mm1			//(H+); rgb masked
		pmaxub	mm3,mm2			//(D')
		pminub	mm2,mm4			//(E+')
		movq	mm1,mm0			//(I)
		mov		ebx,[ecx+4*edx-4]
		mov		esi,[ecx+4*edx-8]
		punpcklbw mm0,mm7		//(J+)
		punpckhbw mm1,mm7		//(K)
		psubb	mm2,mm3			//(F+');mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		paddusw	mm5,mm0			//(L+)
		movd	mm0,[eax+ebx]	//(A+); mm0 :rgb of the pixel
		punpckldq	mm0,[eax+esi] //(B+);one more rgb to bits 32-63
		sub		ebp,[esp+CURRENT]
		sub		edi,[esp+CURRENT+4]
		pcmpeqd mm2,mm7			//(G+'); -1, if all were within limits; 0 otherwise
		paddusw	mm5,mm1			//(M+)
		movq	mm1,mm6			//(C) ;[8xTL]
		sub		edx,4
		movq	[esp+CURRENT],mm2
		pand	mm2,mm3			//(H+'); rgb masked
		jg		bloop
blastfour:
		pmaxub	mm1,mm0			//(D)
		pminub	mm0,mm4			//(E+)
		movq	mm3,mm2			//(I')
		mov		ebx,[ecx+4*edx+4]
		mov		esi,[ecx+4*edx]
		punpcklbw mm2,mm7		//(J+')
		punpckhbw mm3,mm7		//(K')
		psubb	mm0,mm1			//(F+);mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		paddusw	mm5,mm2			//(L+')
		movd	mm2,[eax+ebx]	//(A'+); mm0 :rgb of the pixel
		punpckldq	mm2,[eax+esi] //(B'+);one more rgb to bits 32-63
		sub		ebp,[esp+CURRENT]
		sub		edi,[esp+CURRENT+4]
		pcmpeqd mm0,mm7			//(G+); -1, if all were within limits; 0 otherwise
		paddusw	mm5,mm3			//(M+')
		movq	mm3,mm6			//(C') ;[8xTL]
		movq	[esp+CURRENT],mm0
		pand	mm0,mm1			//(H+); rgb masked
		pmaxub	mm3,mm2			//(D')
		pminub	mm2,mm4			//(E+')
		movq	mm1,mm0			//(I)
		punpcklbw mm0,mm7		//(J+)
		punpckhbw mm1,mm7		//(K)
		sub		ebp,[esp+CURRENT]
		psubb	mm2,mm3			//(F+');mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		paddusw	mm5,mm0			//(L+)
		sub		edi,[esp+CURRENT+4]
		pcmpeqd mm2,mm7			//(G+'); -1, if all were within limits; 0 otherwise
		paddusw	mm5,mm1			//(M+)
		sub		edx,4
		movq	[esp+CURRENT],mm2
		pand	mm2,mm3			//(H+'); rgb masked
bfinal:
		movq	mm3,mm2			//(I')
		punpcklbw mm2,mm7		//(J+')
		sub		ebp,[esp+CURRENT]
		prefetchnta [ecx+eax] //;for PIII and K7 (OK, but without any benefit on PII)
			//;prefetch [ecx+2*esi] //- for AMD-K6 and K7
		punpckhbw mm3,mm7		//(K')
		sub		edi,[esp+CURRENT+4]
		paddusw	mm5,mm2			//(L+')
		mov		edx,[esp+DST]	//;(i)
		mov		ebx,[esp+PITCH]	//;(ii)
		mov		esi,[esp+WW]			//;(iii)
		add		edx,4			//;(iv)
		add		eax,4
#ifdef ATHLON
	prefetchw [ebx+edx] //- for AMD-K7
#endif
		paddusw	mm5,mm3			//(M+')
		shl		ebp,3			//;(v)
		test	[esp+FDBN],-1  //;(vi)
		mov		[esp+DST],edx			//;(vii)
		jnz		bdebugnoise
		dec		esi				//;(viii) esi = w-1
		mov		ebx,[esp+RADIUSX]	//;(ix)
		movq	mm3,mm5 		//(.N);(b,g,r,x)
		movq 	mm4,[n_plus_1+8*edi+ebp] //(.O);(n+1, n+1, n+1, n+1)
		movq	mm0,[divtable+8*edi+ebp] //(.P);mm0=(w,-w/2,w,-w/2) (w<= 2^14)
bnodebugnoise:
		mov		edi,edx			//;(x)
		mov		edx,[esp+MAXX]		//;(xi)
		jns		blinestart
			//;THE END OF THE LINE


		mov		ebx,[esp+PITCH]

		punpcklwd mm5,mm4 //;mm5 =(b,-1,g,-1)
		punpckhwd mm3,mm4 //;mm3 =(r,-1,x,-1)
		pmaddwd	mm5,mm0 //;(b*w+w/2,g*w+w/2)
		pmaddwd	mm3,mm0 //;(r*w+w/2,x*w+w/2)
		psrld	mm5,15 //;0th and 4th bytes are <b> and <g>
		psrld	mm3,15
		packuswb mm5,mm3 //;0th, 2nd, and 4th bytes are <b>, <g> and <r>
		packuswb mm5,mm5 //;0th,1st, and 3rd
		movd	[edi-4],mm5
		jmp		lineout


bdebugnoise:
		mov		ecx,[esp+PIXTOT]
		movq	mm0,[divtable+8*edi+ebp]
		movq 	mm4,[n_plus_1+8*edi+ebp]
		shr		ebp,2
		lea		edi,[2*edi+ebp]
		cmp		edi,ecx
		ja		bnodebugnoise0
		pxor	mm5,mm5
		pxor	mm3,mm3
bnodebugnoise0:
		dec		esi				//;(viii) esi = w-1
		movq	mm3,mm5 		//(.N);(b,g,r,x)
		mov		ebx,[esp+RADIUSX]	//;(ix)
		jmp		bnodebugnoise



corners:
		mov		esi,startindices[40+4*edx]
		movsx	ebx,cellcount[10+edx+ebx]
		mov		eax00,eax
		mov		edi00,edi
		mov		eax,pointstot
		mov		ebp,[ecx]		//esi - the smallest address (inlcusive)
		mov		ecx,[ecx+4*eax-4] //ebx - the largest address (exclusive)
		dec		ebp
		xor		edx,edx
cornloop:
		mov		eax,[esi+4*ebx-4]
		mov		edi,ebp
		mov		sitevectors3[4*edx],eax
		sub		edi,eax //if eax > boundmin-1, then sign
		sub		eax,ecx  //if eax < boundmax , then sign
		sar		edi,31
		sar		eax,31
		and		edi,eax
		sub		edx,edi
		dec		ebx
		jns		cornloop

		mov		ecx,offset sitevectors3
		mov		eax,eax00
		mov		edi,edi00
		jmp		fromcorners
boddnumerofpairs:
		sub		edx,2
		jz		blasttwo //;maybe there was just two cells?
		movd	mm2,[eax+ebx]	//(A'+); mm0 :rgb of the pixel
		punpckldq	mm2,[eax+esi] //(B'+);one more rgb to bits 32-63
		movq	mm3,mm6			//(C') ;[8xTL]
		pmaxub	mm3,mm2			//(D')
		pminub	mm2,mm4			//(E+')
		mov		ebx,[ecx+4*edx-4]
		mov		esi,[ecx+4*edx-8]
		psubb	mm2,mm3			//(F+');mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		movd	mm0,[eax+ebx]	//(A+); mm0 :rgb of the pixel
		punpckldq	mm0,[eax+esi] //(B+);one more rgb to bits 32-63
		pcmpeqd mm2,mm7			//(G+'); -1, if all were within limits; 0 otherwise
		movq	mm1,mm6			//(C) ;[8xTL]
		sub		edx,4
		movq	[esp+CURRENT],mm2
		pand	mm2,mm3			//(H+'); rgb masked
		jg		bloop
		jmp 	blastfour
blasttwo:
		movd	mm2,[eax+ebx]	//(A'+); mm0 :rgb of the pixel
		punpckldq	mm2,[eax+esi] //(B'+);one more rgb to bits 32-63
		movq	mm3,mm6			//(C') ;[8xTL]
		pmaxub	mm3,mm2			//(D')
		pminub	mm2,mm4			//(E+')
		psubb	mm2,mm3			//(F+');mm0(r,g or b) == 0, if within limits, if so, mm0 is rgb
		pcmpeqd mm2,mm7			//(G+'); -1, if all were within limits; 0 otherwise
		movq	[esp+CURRENT],mm2
		pand	mm2,mm3			//(H+'); rgb masked
		jmp		bfinal
boddnumerofcells:
		inc		edx //;add the source pixel,too, so that the coun will be even
		dec		edi //;this would lead to incorrect pixel count, here we make adjustment
		pxor	mm5,mm5 //; delete the source pixel from the total sum, it will be added later!
		xor		esi,esi		//; point to the pixel itself!
		test	edx,2
		jz		bevennumerofpairs
		jmp		boddnumerofpairs
}


}

/*
 * Dialog procedure for the configuration dialog (IDD_2DCLEAN).
 *
 * WM_INITDIALOG stores the MyFilterData pointer passed in lParam as the dialog's
 * user data and initialises the controls from it. WM_COMMAND applies edits to
 * that structure immediately and asks the preview to redo the frame. Invalid
 * edit-box contents are rejected by writing the previous value back.
 *
 * The dialog ends with result 0 on OK and 1 on Cancel (see twodclean_ConfigProc).
 *
 * Returns TRUE when the message was handled, FALSE otherwise.
 */
BOOL CALLBACK twodclean_ConfigDlgProc(HWND hdlg, UINT msg, WPARAM wParam, LPARAM lParam) {
	MyFilterData *mfd = (MyFilterData *)GetWindowLong(hdlg, DWL_USER);

	switch(msg) {
	case WM_INITDIALOG:
		SetWindowLong(hdlg, DWL_USER, lParam);
		mfd = (MyFilterData *)lParam;

		CheckDlgButton(hdlg, IDC_DEBUGNOISE, mfd->fDebugNoise?BST_CHECKED:BST_UNCHECKED);
		CheckDlgButton(hdlg, IDC_INTERLACE, mfd->fInterlace?BST_CHECKED:BST_UNCHECKED);
		SetDlgItemInt(hdlg, IDC_THRESHOLD, mfd->fThreshold, FALSE);
		SetDlgItemInt(hdlg, IDC_RADIUSX, mfd->fRadiusx, FALSE);
		SetDlgItemInt(hdlg, IDC_RADIUSY, mfd->fRadiusy, FALSE);
		mfd->ifp->InitButton(GetDlgItem(hdlg, IDPREVIEW));
		return TRUE;

	case WM_COMMAND:
		// Controls may send notifications before WM_INITDIALOG has stored the
		// data pointer. Nothing can be applied then, so ignore everything
		// except Cancel, which needs no data.
		if (mfd == NULL && LOWORD(wParam) != IDCANCEL)
			return FALSE;

		switch(LOWORD(wParam)) {
		case IDPREVIEW:
			mfd->ifp->Toggle(hdlg);
			break;
		case IDHELP:
			{
			static const char helpfile[] = "plugins\\2dcleanopt.html";
			char prog[256];
			char path[256];
			LPTSTR ptr = NULL;
			DWORD len;

			// Build "<directory of the host executable>\plugins\2dcleanopt.html".
			// Give up quietly if a path is unavailable, truncated, or too long
			// for the buffer once the help file name is appended.
			len = GetModuleFileName(NULL, prog, sizeof(prog));
			if (len == 0 || len >= sizeof(prog))
				return TRUE;
			len = GetFullPathName(prog, sizeof(path), path, &ptr);
			if (len == 0 || len >= sizeof(path) || ptr == NULL)
				return TRUE;
			// sizeof(helpfile) includes the terminating NUL
			if ((size_t)(ptr - path) + sizeof(helpfile) > sizeof(path))
				return TRUE;
			*ptr = 0;	// cut off the executable's file name, keep the directory
			strcat(path, helpfile);
			ShellExecute(hdlg, "open", path, NULL, NULL, SW_SHOWNORMAL);
			return TRUE;
			}
		case IDOK:
			mfd->ifp->Close();
			EndDialog(hdlg, 0);
			return TRUE;
		case IDCANCEL:
			EndDialog(hdlg, 1);
			return FALSE;
		case IDC_DEBUGNOISE:
			mfd->fDebugNoise = !!IsDlgButtonChecked(hdlg, IDC_DEBUGNOISE);
			mfd->ifp->RedoFrame();
			return TRUE;
		case IDC_INTERLACE:
			mfd->fInterlace = !!IsDlgButtonChecked(hdlg, IDC_INTERLACE);
			mfd->ifp->RedoFrame();
			return TRUE;
		case IDC_RADIUSX:
			if (HIWORD(wParam) == EN_UPDATE) {
				unsigned long radiusx;
				BOOL success;

				radiusx = GetDlgItemInt(hdlg, IDC_RADIUSX, &success, FALSE);
				// The window may hold at most 11x11 = 121 pixels and must
				// contain at least one neighbour.
				if (!success || ((2 * radiusx + 1) * (2 * mfd->fRadiusy + 1) > 121)
					         || (radiusx == 0 && mfd->fRadiusy == 0)
							 || radiusx > 10)
				{
					SetDlgItemInt(hdlg, IDC_RADIUSX, mfd->fRadiusx, FALSE);
					return TRUE;
				}
				mfd->fRadiusx = radiusx;
				mfd->ifp->RedoFrame();
			}
			return TRUE;
		case IDC_RADIUSY:
			if (HIWORD(wParam) == EN_UPDATE) {
				unsigned long radiusy;
				BOOL success;

				radiusy = GetDlgItemInt(hdlg, IDC_RADIUSY, &success, FALSE);
				if (!success || ((2 * mfd->fRadiusx + 1) * (2 * radiusy + 1) > 121)
					         || (mfd->fRadiusx == 0 && radiusy == 0)
							 || radiusy > 10)
				{
					SetDlgItemInt(hdlg, IDC_RADIUSY, mfd->fRadiusy, FALSE);
					return TRUE;
				}
				mfd->fRadiusy = radiusy;
				mfd->ifp->RedoFrame();
			}
			return TRUE;
		case IDC_THRESHOLD:
			if (HIWORD(wParam) == EN_UPDATE) {
				long threshold;
				BOOL success;

				threshold = GetDlgItemInt(hdlg, IDC_THRESHOLD, &success, FALSE);
				if (!success || threshold > 255) {
					SetDlgItemInt(hdlg, IDC_THRESHOLD, mfd->fThreshold, FALSE);
					return TRUE;
				}
				mfd->fThreshold = threshold;
				mfd->ifp->RedoFrame();
			}
			return TRUE;
		}
		break;
	}

	return FALSE;
}


/*
 * Filter configuration entry point: shows the modal settings dialog.
 *
 * The dialog edits the filter data in place, so a copy is taken first. If the
 * dialog returns a nonzero result (Cancel ends it with 1) the copy is restored
 * and TRUE is returned; a zero result (OK) keeps the edits and returns FALSE.
 */
int twodclean_ConfigProc(FilterActivation *fa, const FilterFunctions *ff, HWND hwnd) {
	MyFilterData *const settings = (MyFilterData *)fa->filter_data;
	const MyFilterData saved = *settings;

	// the dialog procedure drives the preview through this interface
	settings->ifp = fa->ifp;

	if (!DialogBoxParam(fa->filter->module->hInstModule, MAKEINTRESOURCE(IDD_2DCLEAN),
		hwnd, twodclean_ConfigDlgProc, (LPARAM)settings))
	{
		return FALSE;
	}

	// nonzero dialog result: discard whatever the dialog changed
	*settings = saved;
	return TRUE;
}

void twodclean_StringProc(const FilterActivation *fa, const FilterFunctions *ff, char *str) {
         const char *modes[2][2]={
			 " (normal",
			 " (interlaced",
			 " (show edges",
			 " (show edges+interlaced"
		 };
         MyFilterData *mfd = (MyFilterData *)fa->filter_data;
		 sprintf(str, "%s, thr %d, area %dx%d)",
			 modes[mfd->fDebugNoise][mfd->fInterlace], mfd->fThreshold,
			 mfd->fRadiusx * 2 + 1, mfd->fRadiusy * 2 + 1);
     }

/*
 * Script handler for Config(interlace, threshold, radiusx, radiusy).
 *
 * lpVoid is the FilterActivation whose filter data receives the settings;
 * argv holds the four integer arguments. fDebugNoise is always cleared.
 *
 * Threshold and radii are applied only when they satisfy the same limits the
 * dialog enforces: threshold 0..255, each radius 0..10, not both radii zero,
 * and a window of at most 121 pixels. Out-of-range values are ignored and the
 * previous setting is kept, because the run procedure's fixed-size tables and
 * 16-bit accumulators are only valid within these limits.
 */
void twodcleanScriptConfig(IScriptInterpreter *isi, void *lpVoid, CScriptValue *argv, int argc) {
    FilterActivation *fa = (FilterActivation *)lpVoid;
    MyFilterData *mfd = (MyFilterData *)fa->filter_data;
    const int threshold = argv[1].asInt();
    const int radiusx = argv[2].asInt();
    const int radiusy = argv[3].asInt();

    mfd->fInterlace = !!argv[0].asInt();
    mfd->fDebugNoise = 0;

    if (threshold >= 0 && threshold <= 255)
        mfd->fThreshold = threshold;

    // the radii are validated as a pair because the limit is on the window area
    if (radiusx >= 0 && radiusx <= 10
        && radiusy >= 0 && radiusy <= 10
        && (radiusx != 0 || radiusy != 0)
        && (2 * radiusx + 1) * (2 * radiusy + 1) <= 121)
    {
        mfd->fRadiusx = radiusx;
        mfd->fRadiusy = radiusy;
    }
}

/*
 * Writes the script line that reproduces the current settings into buf
 * (at most buflen bytes), in the form "Config(interlace, threshold, radiusx, radiusy)".
 * The result is always NUL-terminated when buflen > 0; _snprintf alone leaves
 * the buffer unterminated if the text does not fit.
 *
 * Always returns true.
 */
bool twodcleanFssProc(FilterActivation *fa, const FilterFunctions *ff, char *buf, int buflen) {
    MyFilterData *mfd = (MyFilterData *)fa->filter_data;

    _snprintf(buf, buflen, "Config(%d, %d, %d, %d)",
        mfd->fInterlace,
		mfd->fThreshold,
		mfd->fRadiusx,
		mfd->fRadiusy);

    // force termination in case the output was truncated
    if (buflen > 0)
        buf[buflen - 1] = 0;

    return true;
}

/* Start hook: nothing to prepare; always returns 0. */
static int twodclean_start(FilterActivation *fa, const FilterFunctions *ff) {
	(void)fa;
	(void)ff;
	return 0;
}

/*
 * Filter init procedure: sets the default settings (threshold 10, radius 2x2)
 * and fills the lookup tables that let the assembly replace the division by
 * the pixel count with a multiplication.
 *
 * For each count c = 1..128 the tables hold two identical ints (one 8-byte entry):
 *   divtable: low word 0x7fff / c, high word half of that value
 *   n_plus_1: c in both the low and the high word
 *
 * Always returns 0.
 */
int twodclean_init(FilterActivation *fa, const FilterFunctions *ff) {
	MyFilterData *settings = (MyFilterData *)fa->filter_data;
	int count;

	settings->fThreshold = 10;
	settings->fRadiusx = 2;
	settings->fRadiusy = 2;

	for (count = 1; count <= 128; ++count) {
		const int slot = 2 * (count - 1);
		const int both = count | (count << 16);
		int recip = (0x7fff) / count;

		// put recip >> 1 into the high word, keep recip in the low word
		recip |= (recip << 15) & 0xffff0000;

		divtable[slot] = recip;
		divtable[slot + 1] = recip;
		n_plus_1[slot] = both;
		n_plus_1[slot + 1] = both;
	}
	return 0;
}

/*
 * Rebuilds the neighbour tables used by twodclean_run from the globals
 * radiusx0, radiusy0 and pitchm0 (row step in bytes).
 *
 *   sitevectors   byte offsets of all window pixels except the centre,
 *                 ordered column by column (left to right)
 *   sitevectors2  the same offsets ordered row by row
 *   cellcount     cellcount[10 +/- c] = neighbour count when c columns are
 *                 clipped off one side of the window
 *   startindices  startindices[10 - c] = start of the column-major list after
 *                 skipping the c leftmost columns
 *
 * The centre pixel is skipped by testing both coordinates for zero; testing
 * their sum would also drop any neighbour whose row offset happens to equal -i.
 *
 * Assumes radiusx0 <= 10 and a window of at most 121 pixels (table sizes).
 */
void initindices(){
	int i, j, k;
	const int span = radiusy0 * pitchm0;	// byte offset of the outermost row
	const int rows = 2 * radiusy0 + 1;		// window height in rows

	k = 0;
	for (i = -radiusx0; i <= radiusx0; i++)
		for (j = -span; j <= span; j += pitchm0) {
			if (i != 0 || j != 0) {
				sitevectors[k++] = j + 4 * i;
			}
		}

	k = 0;
	for (j = -span; j <= span; j += pitchm0)
		for (i = -radiusx0; i <= radiusx0; i++) {
			if (i != 0 || j != 0) {
				sitevectors2[k++] = j + 4 * i;
			}
		}

	// full window minus the centre pixel; each clipped column removes `rows` cells
	k = (2 * radiusx0 + 1) * rows - 1;
	cellcount[10] = k;
	for (i = 1; i <= radiusx0; i++) {
		k -= rows;
		cellcount[10 + i] = cellcount[10 - i] = k;
	}

	for (i = 0; i <= radiusx0; i++)
		startindices[10 - i] = &sitevectors[i * rows];
	return;
}

/*
 * Filter registration record passed to ff->addFilter() in VirtualdubFilterModuleInit2.
 * NOTE(review): the initializer is positional; the field roles noted below are
 * inferred from the values supplied here -- confirm against FilterDefinition in filter.h.
 */
FilterDefinition filterDef_2dclean={
	0,0,NULL,
	"2d cleaner optimized (0.9)",	// filter name
	"Structure-preserving spatial averager.\n"	// description text
	"Original filter by Jim Casaburi.\n"
	"Optimized for P3/4 and Athlon by Jaan Kalda.\n"
	"Preview and help added by Donald Graft.\n",
	"Jim Casaburi",NULL,	// author
	sizeof(MyFilterData),	// size of the per-instance filter data
	twodclean_init,NULL,	// init callback; the following slot is unused
	twodclean_run,			// per-frame processing callback
	NULL,
	twodclean_ConfigProc,	// configuration dialog callback
	twodclean_StringProc,	// settings description callback
	NULL,
	NULL,
	&twodclean_obj,          // script_obj
    twodcleanFssProc,        // fssProc

};

/* Exported module entry points called by the host, declared with C linkage. */
extern "C" int __declspec(dllexport) __cdecl VirtualdubFilterModuleInit2(FilterModule *fm, const FilterFunctions *ff, int& vdfd_ver, int& vdfd_compat);
     extern "C" void __declspec(dllexport) __cdecl VirtualdubFilterModuleDeinit(FilterModule *fm, const FilterFunctions *ff);

     // Handle returned by ff->addFilter(); passed to ff->removeFilter() on deinit.
     static FilterDefinition *fd_tutorial;

     /*
      * Module initialisation: registers the filter with the host and reports
      * the filter-definition version this module was built against.
      * Returns 0 on success, 1 if the filter could not be added.
      */
     int __declspec(dllexport) __cdecl VirtualdubFilterModuleInit2(FilterModule *fm, const FilterFunctions *ff, int& vdfd_ver, int& vdfd_compat) {
         fd_tutorial = ff->addFilter(fm, &filterDef_2dclean, sizeof(FilterDefinition));
         if (fd_tutorial == NULL) {
             // registration failed; the version outputs are left untouched
             return 1;
         }

         vdfd_compat = VIRTUALDUB_FILTERDEF_COMPATIBLE;
         vdfd_ver    = VIRTUALDUB_FILTERDEF_VERSION;
         return 0;
     }

     /* Module teardown: unregisters the filter that VirtualdubFilterModuleInit2 added. */
     void __declspec(dllexport) __cdecl VirtualdubFilterModuleDeinit(FilterModule *fm, const FilterFunctions *ff) {
         ff->removeFilter(fd_tutorial);
     }


