/*
NRS - "Noise reduction suite" filter for VirtualDub
Copyright (C) 2003 Antonio Foranna

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation.
	
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
		
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
			
The author can be contacted at:
kreel@tiscali.it
*/

#include <crtdbg.h>
#include<math.h>
#include "Struct.h"
#include "Define.h"
#include "Run.h"

//************************************************************************************************

/*
	Fills the 256-entry lookup table pLogThreshold with thresholds for the
	version that compares squared distances in RGB space.

	Each entry holds 3*t*t, where t is a level between Min and Max. The level
	number grows roughly logarithmically with the table index:
	entries 0..2 use level 0, 3..4 level 1, 5..8 level 2, 9..16 level 3,
	17..32 level 4, 33..64 level 5, 65..128 level 6, 129..255 level 7.
	Level k is Min+k*(Max-Min)/(THRESHOLDTABLELEN-1), rounded to nearest.

	mfd            not used by this function
	pLogThreshold  table to fill (256 DWORDs); nothing is done when NULL
	Min, Max       range of t; nothing is done when Max<=Min.
	               Min==254 fills the whole table with 256*256*3 instead.
*/
void SetLogThresholdSpace(MyFilterData *mfd, DWORD *pLogThreshold, BYTE Min, BYTE Max)
{
static const WORD bucketEnd[]={2,4,8,16,32,64,128,256}; // last table index served by each level
const float step=(float)(Max-Min)/(THRESHOLDTABLELEN-1);
WORD idx, level=0;
DWORD value;

	if(!pLogThreshold || Max<=Min)
		return;

	if(Min==254)
	{
		for(idx=0; idx<256; idx++)
			pLogThreshold[idx]=256L*256*3;
		return;
	}

	for(idx=0; idx<256; idx++)
	{
		value=(WORD)(Min+step*level+0.5); // 0.5 to round value
		_ASSERT(value<256);
		pLogThreshold[idx]=value*value*3;
		// move to the next level once the end of the current bucket has been stored
		if(idx>=bucketEnd[level])
			level++;
	}
}
//------------------------------------------------------------------------------------------------

/*
	Fills the 256-entry lookup table pLogThreshold with thresholds for the
	version that compares absolute values (MMX-SSE code).

	Each entry holds the level t replicated into all four bytes of the DWORD.
	The level number grows roughly logarithmically with the table index:
	entries 0..2 use level 0, 3..4 level 1, 5..8 level 2, 9..16 level 3,
	17..32 level 4, 33..64 level 5, 65..128 level 6, 129..255 level 7.
	Level k is Min+k*(Max-Min)/(THRESHOLDTABLELEN-1), rounded to nearest.

	mfd            not used by this function
	pLogThreshold  table to fill (256 DWORDs); nothing is done when NULL
	Min, Max       range of t; nothing is done when Max<=Min.
	               Min==254 fills the whole table with 255 in every byte.
*/
void SetLogThresholdABS(MyFilterData *mfd, DWORD *pLogThreshold, BYTE Min, BYTE Max)
{
WORD i,j;
WORD tLog[]={2,4,8,16,32,64,128,256};
float step=(float)(Max-Min)/(THRESHOLDTABLELEN-1);
DWORD tmp;

	if(!pLogThreshold)
		return;

	if(Max<=Min)
		return;

	if(Min==254)
		for(i=0; i<256; i++)
			// The low byte must be 255 like the other three: OR-ing in 256 only
			// touched bit 8 (already set by 255<<8) and left the low byte at 0.
			pLogThreshold[i]=(255L<<24) | (255L<<16) | (255L<<8) | 255;
	else
		for(i=0, j=0; i<256; i++)
		{
			tmp=(WORD)(Min+step*j+0.5); // 0.5 to round value
			_ASSERT(tmp<256);
			pLogThreshold[i]=(tmp<<24) | (tmp<<16) | (tmp<<8) | tmp;
			// move to the next level once the end of the current bucket has been stored
			if(i>=tLog[j])
				j++;
		}
}
//------------------------------------------------------------------------------------------------

/*
	Fills the 256-entry lookup table pLogThreshold with squared thresholds.
	Used by the regular (non-MMX) code, which compares squares to avoid
	computing absolute values.

	Each entry holds t*t, where t is a level between Min and Max. The level
	number grows roughly logarithmically with the table index:
	entries 0..2 use level 0, 3..4 level 1, 5..8 level 2, 9..16 level 3,
	17..32 level 4, 33..64 level 5, 65..128 level 6, 129..255 level 7.
	Level k is Min+k*(Max-Min)/(THRESHOLDTABLELEN-1), rounded to nearest.

	mfd            not used by this function
	pLogThreshold  table to fill (256 WORDs); nothing is done when NULL
	Min, Max       range of t; nothing is done when Max<=Min.
	               Min==254 fills the whole table with 255*255 instead.
*/
void SetLogThresholdSquares(MyFilterData *mfd, WORD *pLogThreshold, BYTE Min, BYTE Max)
{
static const WORD bucketEnd[]={2,4,8,16,32,64,128,256}; // last table index served by each level
const float step=(float)(Max-Min)/(THRESHOLDTABLELEN-1);
WORD idx, level=0;
WORD value;

	if(!pLogThreshold || Max<=Min)
		return;

	if(Min==254)
	{
		for(idx=0; idx<256; idx++)
			pLogThreshold[idx]=(WORD)(255*255);
		return;
	}

	for(idx=0; idx<256; idx++)
	{
		value=(WORD)(Min+step*level+0.5); // 0.5 to round value
		_ASSERT(value<256);
		pLogThreshold[idx]=value*value;
		// move to the next level once the end of the current bucket has been stored
		if(idx>=bucketEnd[level])
			level++;
	}
}
//************************************************************************************************

//	Function pointer used in RGB2Y() to convert one line of pixels to luminance.
//	InitDNR() points it at RGB2YLineMMX or RGB2YLine (depending on g_MMXenabled)
//	when multi-pass DNR is in use; it is not assigned anywhere else in this file.
void (*pRGB2YLine)(BYTE *Y, BYTE *rgb, DWORD pixels);

/*
	Converts a line of 32-bit pixels to a line of 8-bit luminance (Y) values.

	Y       destination, one byte per pixel
	rgb     source, four bytes per pixel; byte 0 is weighted 29, byte 1 is
	        weighted 150, byte 2 is weighted 77 and byte 3 is ignored
	pixels  number of pixels to convert; 0 does nothing

	Y = (byte2*77 + byte1*150 + byte0*29) >> 8
*/
void RGB2YLine(BYTE *Y, BYTE *rgb, DWORD pixels)
{
	// Test the counter before the first pixel: a do/while(--pixels) loop would
	// wrap around for pixels==0 and run over both buffers.
	for(; pixels; pixels--, rgb+=4)
		*Y++=(BYTE)((rgb[2]*77L+rgb[1]*150L+rgb[0]*29L)>>8);
}
//------------------------------------------------------------------------------------------------

/*
	MMX version of the above function
*/
void RGB2YLineMMX(BYTE *Y, BYTE *rgb, DWORD pixels)
{
/*	if(pixels&1)
	{
		*Y++=(BYTE)((*(rgb+2)*77L+*(rgb+1)*150L+*rgb*29L)>>8);
		rgb+=4;
		if(!--pixels)
			return;
	}
*/
const __int64	Ilum= 0x004d0096001d0000i64; //((__int64)77<<48) | (150<<32) | (29<<16);
const __int64	Iff00=0xff00;
	_asm
	{
		mov		ecx, pixels
		mov		esi, rgb
		mov		edi, Y
		movq	mm1, Ilum		; mm7= 0 77 150 29
		movq	mm3, Iff00
		pxor	mm0, mm0

		mov			eax, ecx
		and			eax, 1
		jz			do_loop

		movd		mm7, [esi]
		pslld		mm7, 8
		punpcklbw	mm7, mm0

		pmaddwd		mm7, mm1		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm2, mm7		; mm2= b*29 ?
		paddd		mm7, mm2		; mm7= r*77+g*150+b*29 ?
		psrlq		mm7, 40			; mm7= luminance1

		movd		eax, mm7		; eax= luminance1 luminance2
		add			esi, 4
		mov			[edi], al

		add		edi, 1
		sub		ecx, 1
		jz		end_loop

align 16
	do_loop:

		movq		mm7, [esi]
		pslld		mm7, 8
		punpckhbw	mm6, mm7
		punpcklbw	mm7, mm0
		psrlw		mm6, 8

		pmaddwd		mm7, mm1		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm2, mm7		; mm2= b*29 ?
		paddd		mm7, mm2		; mm7= r*77+g*150+b*29 ?
		psrlq		mm7, 40			; mm7= luminance1

		pmaddwd		mm6, mm1		; mm6= r*77+g*150 b*29+0*0
		punpckldq	mm2, mm6		; mm2= b*29 ?
		paddd		mm6, mm2		; mm7= r*77+g*150+b*29 ?
		psrlq		mm6, 32			; mm7= luminance2 0

		pand		mm6, mm3
		por			mm6, mm7
		movd		eax, mm6		; eax= luminance1 luminance2
		add			esi, 8
		mov			[edi], ax

		add		edi, 2
		sub		ecx, 2
		jnz		do_loop

	end_loop:

		emms
	}
}
//------------------------------------------------------------------------------------------------

/*
	Stores the luminance of the old frame (mfd->old_data) into mfd->Yold.

	Both buffers are entered at offset mfd->src_shift and walked with a row
	stride of mfd->wnc; each of the dst_heigth rows has dst_width pixels
	converted through the pRGB2YLine function pointer.
*/
void RGB2Y(MyFilterData *mfd)
{
const WORD	cols=(WORD)mfd->dst_width,
			stride=mfd->wnc;
WORD	rows=(WORD)mfd->dst_heigth;
BYTE	*lumaRow=mfd->Yold+mfd->src_shift;
DWORD	*pixelRow=(DWORD *)mfd->old_data+mfd->src_shift;

	for(; rows; rows--)
	{
		pRGB2YLine(lumaRow,(BYTE *)pixelRow,cols);
		lumaRow+=stride;
		pixelRow+=stride;
	}
}



//************************************************************************************************
//									INIT FUNC
//************************************************************************************************



/*
	Prepares the DNR (dynamic noise reduction) state kept in *mfd.
	Nothing is done unless src is non-NULL and mfd->Use_DNR is set.

	When mfd->NewImage is set:
	  - rebuilds the DNR threshold table (after forcing MinThresholdDNR to be
	    below MaxThresholdDNR),
	  - reallocates the edge threshold table and the work buffers
	    (sav_data, old_data, DiffAcc, Yold, EdgesMask),
	  - requests a reset of the old frame and clears NewImage.

	When mfd->ResetOldData is set:
	  - clears DiffAcc,
	  - copies src into old_data (through the gamma table when Use_gamma is set),
	  - recomputes SCD_NumOfPixels,
	  - for multi-pass DNR: sets the weights, selects the luminance line
	    converter, fills the edge threshold table and computes Yold,
	  - clears ResetOldData.

	Always returns 0.
*/
BYTE InitDNR(MyFilterData *mfd, DWORD *src)
{
	if(src && mfd->Use_DNR && mfd->NewImage)
	{
		DELETE_ARRAY(mfd->pLogThresholdDNR);
		CREATE_ARRAY(mfd->pLogThresholdDNR,DWORD,256);

		// SetLogThresholdSpace() leaves the table untouched unless Min<Max, so repair the range first.
		if(mfd->MinThresholdDNR>=mfd->MaxThresholdDNR)
		{
			if(!mfd->MaxThresholdDNR)
			{
				// Max-1 would wrap below zero here; use the smallest valid range instead
				mfd->MinThresholdDNR=0;
				mfd->MaxThresholdDNR=1;
			}
			else
				mfd->MinThresholdDNR=mfd->MaxThresholdDNR-1;
		}
		SetLogThresholdSpace(mfd,mfd->pLogThresholdDNR,mfd->MinThresholdDNR,mfd->MaxThresholdDNR);

		DELETE_ARRAY(mfd->pLogThresholdEdgesDNR);
		CREATE_ARRAY(mfd->pLogThresholdEdgesDNR,WORD,256);

		DELETE_ARRAY(mfd->sav_data);
		DELETE_ARRAY(mfd->old_data);
		DELETE_ARRAY(mfd->DiffAcc);
		CREATE_ARRAY(mfd->sav_data,BYTE,mfd->sizeSrc);
		CREATE_ARRAY(mfd->old_data,BYTE,mfd->sizeSrc);
		// DiffAccSize is a byte count; the array holds one DWORD per 4 bytes
		mfd->DiffAccSize=mfd->sizeDst;
		CREATE_ARRAY(mfd->DiffAcc,DWORD,(mfd->DiffAccSize>>2));
		DELETE_ARRAY(mfd->Yold);
		CREATE_ARRAY(mfd->Yold,BYTE,(mfd->sizeSrc>>2));
		DELETE_ARRAY(mfd->EdgesMask);
		CREATE_ARRAY(mfd->EdgesMask,BYTE,(mfd->sizeSrc>>2));

		mfd->ResetOldData=mfd->NewImage;
		mfd->NewImage=0;
	}

	if(src && mfd->Use_DNR && mfd->ResetOldData)
	{
		memset(mfd->DiffAcc,0,mfd->DiffAccSize);
		if(!mfd->Use_gamma)
			memcpy(mfd->old_data,src,mfd->sizeSrc);
		else
		{
		// the gamma table is looked up only here, so pGamma is not touched when gamma is off
		const BYTE	*gamma=mfd->pGamma->GammaCorrection;
		const BYTE	*bsrc=(const BYTE *)src;
		BYTE	*bold=(BYTE *)mfd->old_data;
		DWORD	nop=mfd->sizeSrc>>2; // number of 4-byte pixels

			while(nop--)
			{
				bold[0]=gamma[bsrc[0]];
				bold[1]=gamma[bsrc[1]];
				bold[2]=gamma[bsrc[2]];
				// byte 3 is copied as is, like the memcpy path does, so that
				// old_data never holds an unwritten byte
				bold[3]=bsrc[3];
				bold+=4;
				bsrc+=4;
			}
		}
		mfd->SCD_NumOfPixels=((mfd->sizeDst>>2)*mfd->SCD_threshold)/100;
		if(mfd->UseDnrMultiPass)
		{
		// EdgesThreshold-2 .. EdgesThreshold+2, kept inside 0..255: the narrowing
		// to BYTE would otherwise wrap near both ends, giving Max<=Min and an
		// edge table that SetLogThresholdSquares() never fills.
		// Assumes EdgesThreshold itself is within 0..255.
		long	edgeMin=(long)mfd->EdgesThreshold-2,
				edgeMax=(long)mfd->EdgesThreshold+2;

			if(edgeMin<0)
				edgeMin=0;
			if(edgeMax>255)
				edgeMax=255;

			mfd->DnrMultiPassWeight=256/9;
			mfd->DnrMultiPassRest=256-mfd->DnrMultiPassWeight*9;

			pRGB2YLine=g_MMXenabled ? RGB2YLineMMX : RGB2YLine;
			// the squared table is used by both the MMX and the regular code path
			SetLogThresholdSquares(mfd,mfd->pLogThresholdEdgesDNR,(BYTE)edgeMin,(BYTE)edgeMax);
			RGB2Y(mfd);
		}
		mfd->ResetOldData=0;
	}
	return 0;
}



//************************************************************************************************
//									INLINE FUNCS
//************************************************************************************************



/*
	Apply DNR to one pixel.

	bsrc              new pixel (4 bytes)
	bdst              output pixel (4 bytes)
	bold              pixel of the previous frame (4 bytes); updated with the output
	DiffAcc           accumulated difference for this pixel
	pLogThresholdDNR  threshold table, indexed by the luminance of the new pixel

	The squared distance between the first three bytes of new and old pixel is
	added to *DiffAcc and compared with the threshold:
	  - above the threshold: the new pixel is kept, *DiffAcc is cleared, returns 1;
	  - otherwise: the output is the per-byte average of old and new pixel
	    (rounded up), half of the distance is added to *DiffAcc, returns 0.
	In both cases the output pixel is stored into *bold, as DNRline() does.
*/
inline BYTE DNRColor(	BYTE *bsrc, BYTE *bdst, BYTE *bold, DWORD *DiffAcc, const DWORD *pLogThresholdDNR)
{
const long	d0=bsrc[0]-(long)bold[0],
			d1=bsrc[1]-(long)bold[1],
			d2=bsrc[2]-(long)bold[2];
const DWORD	Diff=(DWORD)(d0*d0+d1*d1+d2*d2);
const BYTE	Lumi=(BYTE)((bsrc[2]*77L+bsrc[1]*150L+bsrc[0]*29L)>>8);
const DWORD	srcPix=*(DWORD *)bsrc,
			oldPix=*(DWORD *)bold;
BYTE	changed;

	if(*DiffAcc+Diff>pLogThresholdDNR[Lumi])
	{
		*(DWORD *)bdst=srcPix;
		*DiffAcc=0;
		changed=1;
	}
	else
	{
		// per-byte (old+src+1)>>1 on all four bytes at once, without carries between bytes
		*(DWORD *)bdst=((oldPix&0xfefefefe)>>1) + ((srcPix>>1)&0x7f7f7f7f) + ((oldPix|srcPix)&0x01010101);
		*DiffAcc+=Diff>>1;
		changed=0;
	}
	// Remember the output in both cases: returning early on a changed pixel
	// used to leave the old frame holding the previous value.
	*(DWORD *)bold=*(DWORD *)bdst;

	return changed;
}



//************************************************************************************************
//								FUNCS TO PROCESS ROWS
//************************************************************************************************



/*
	Apply DNR to a row of pixels.

	src               new row
	dst               output row
	old               row of the previous frame; updated with the output
	DiffAcc           accumulated differences, one per pixel
	pLogThresholdDNR  threshold table, indexed by the luminance of the new pixel
	Width             number of pixels in the row; 0 does nothing
	mfd, dummy2, dummy3  not used by this function

	For every pixel the squared distance between the first three bytes of new
	and old pixel is added to the accumulator and compared with the threshold:
	  - above the threshold: the new pixel is kept and the accumulator cleared;
	  - otherwise: the output is the per-byte average of old and new pixel
	    (rounded up) and half of the distance is added to the accumulator.

	Returns the number of pixels that were kept unblended.
*/
DWORD DNRline(DWORD *src, DWORD *dst, DWORD *old, DWORD *DiffAcc, const DWORD *pLogThresholdDNR, DWORD Width, const MyFilterData *mfd, DWORD dummy2, BYTE *dummy3)
{
DWORD	SCD=0; // same width as the return type, so the count cannot wrap at 65536

	// Test the counter before the first pixel: a do/while(--Width) loop would
	// wrap around for Width==0 and run over all the buffers.
	for(; Width; Width--, src++, dst++, old++, DiffAcc++)
	{
	const BYTE	*bsrc=(const BYTE *)src,
				*bold=(const BYTE *)old;
	const long	d0=bsrc[0]-(long)bold[0],
				d1=bsrc[1]-(long)bold[1],
				d2=bsrc[2]-(long)bold[2];
	const DWORD	Diff=(DWORD)(d0*d0+d1*d1+d2*d2);
	const BYTE	Lumi=(BYTE)((bsrc[2]*77L+bsrc[1]*150L+bsrc[0]*29L)>>8);

		if(*DiffAcc+Diff>pLogThresholdDNR[Lumi])
		{
			*dst=*src;
			*DiffAcc=0;
			SCD++;
		}
		else
		{
			// per-byte (old+src+1)>>1 on all four bytes at once, without carries between bytes
			*dst=((*old&0xfefefefe)>>1) + ((*src>>1)&0x7f7f7f7f) + ((*old|*src)&0x01010101);
			*DiffAcc+=Diff>>1;
		}
		// the output becomes the old pixel for the next frame
		*old=*dst;
	}

	return SCD;
}
//************************************************************************************************

/*
	Apply DNR to a row of pixels.
	Edges in old frame aren't blended by filter.

	src               new row
	dst               output row
	old               row of the previous frame; receives a copy of the output row
	DiffAc            accumulators, one per pixel; used here as signed values
	pLogThresholdDNR  threshold table
	Width             number of pixels in the row; 0 does nothing
	mfd               supplies EdgesMask, src_shift, wnc, Interlaced, dst_heigth
	                  and pLogThresholdEdgesDNR
	yOffset           index of the row inside the image
	Yold              luminance of the previous frame at the start of this row

	The first and the last pixel of the row are handled by DNRColor(); the
	pixels in between use the edge mask and the luminance tests below.
	A one-pixel row is handled by a single DNRColor() call.

	Returns the number of pixels that were kept unblended.
*/

DWORD DNRlineMP(DWORD *src, DWORD *dst, DWORD *old, DWORD *DiffAc, const DWORD *pLogThresholdDNR, DWORD Width, const MyFilterData *mfd, DWORD yOffset, BYTE *Yold)
{
long	*DiffAcc=(long *)DiffAc;
BYTE	*bEdgesMask=mfd->EdgesMask+mfd->src_shift+yOffset*mfd->wnc;
const WORD	*pLogThresholdEdgesDNR=mfd->pLogThresholdEdgesDNR;
const DWORD	Interlaced=mfd->Interlaced;
const long	ImageWidth=mfd->wnc<<Interlaced;
// Yold[-TopLeft] is the up-left neighbour when there is a row above, else the left one;
// Yold[BotRight] is the down-right neighbour when there is a row below, else the right one.
const long	TopLeft=yOffset>Interlaced ? ImageWidth+1 : 1,
		BotRight=(yOffset+((DWORD)1+Interlaced))<(WORD)mfd->dst_heigth ? ImageWidth+1 : 1;
DWORD	xOffset;
DWORD	tsrc,told,
		Rs,Gs,Bs;
long	Ro,Go,Bo;
DWORD	SCD=0;
short	Lumi;
short	tYold;

	// An empty row has nothing to filter (the pixel counter below would wrap around).
	if(!Width)
		return 0;

	// A one-pixel row has no separate last pixel: filter it once and stop,
	// instead of running DNRColor() a second time past the end of the row.
	if(Width==1)
	{
		SCD+=DNRColor((BYTE *)src,(BYTE *)dst,(BYTE *)old,(DWORD *)DiffAcc,pLogThresholdDNR);
		*old=*dst;
		return SCD;
	}

	// first pixel of the row
	SCD+=DNRColor((BYTE *)src,(BYTE *)dst,(BYTE *)old,(DWORD *)DiffAcc,pLogThresholdDNR);
	src++;
	dst++;
	old++;
	DiffAcc++;
	Yold++;
	bEdgesMask++;

	// pixels 1 .. Width-2
	for(xOffset=Width-2; xOffset; xOffset--)
	{
		tsrc=*src;
		Rs=(tsrc>>16)&0xff;
		Gs=(tsrc>>8)&0xff;
		Bs=tsrc&0xff;
		told=*old;
		Ro=(told>>16)&0xff;
		Go=(told>>8)&0xff;
		Bo=told&0xff;
		tYold=*Yold;
		Lumi=(short)((Rs*77L+Gs*150L+Bs*29L)>>8);

		// don't blend edges in old frame. Edges are computed using an algorithm by me:
		// the mask is 1 (no edge) when the squared luminance difference of each of the
		// four pairs of opposite neighbours stays within the edge threshold, else 0.
		*bEdgesMask=((Yold[-TopLeft]  -(short)Yold[BotRight])  *(Yold[-TopLeft]  -(short)Yold[BotRight])  )<=(long)pLogThresholdEdgesDNR[tYold] &&
					((Yold[-TopLeft+1]-(short)Yold[BotRight-1])*(Yold[-TopLeft+1]-(short)Yold[BotRight-1]))<=(long)pLogThresholdEdgesDNR[tYold] &&
					((Yold[-TopLeft+2]-(short)Yold[BotRight-2])*(Yold[-TopLeft+2]-(short)Yold[BotRight-2]))<=(long)pLogThresholdEdgesDNR[tYold] &&
					((Yold[-1]        -(short)Yold[1])         *(Yold[-1]        -(short)Yold[1])         )<=(long)pLogThresholdEdgesDNR[tYold];

		// prepare bEdgesMask to expand near edges; needed if TopLeft==1 (no row above),
		// because bEdgesMask[-TopLeft+2] is then the next, not yet computed, entry
		bEdgesMask[1]=255;

		if( // expand near edges
			!(bEdgesMask[-1] && *bEdgesMask && bEdgesMask[-TopLeft] && bEdgesMask[-TopLeft+1] && bEdgesMask[-TopLeft+2]) ||
			// this line is needed to catch changes across frames.
			// Here I use luminance because it could be unaffected by "snow effect";
			// furthermore the square is applied on *DiffAcc so negative values can be added to the sum ==> the convergence speed towards threshold is slower.
			(DWORD)((DiffAcc[0]+(Lumi-tYold))*(DiffAcc[0]+(Lumi-tYold))*3)>pLogThresholdDNR[(Lumi+tYold)>>1] ||
			// this is needed to catch changes taking colours into account.
			(DWORD)((Rs-Ro)*(Rs-Ro)+(Gs-Go)*(Gs-Go)+(Bs-Bo)*(Bs-Bo))>pLogThresholdDNR[(Lumi+tYold)>>1])
		{
			// expand current edge
			if(!*bEdgesMask)
			{
				dst[-1]=src[-1];
				DiffAcc[-1]=0;

				if(yOffset>Interlaced)
				{
					// NOTE(review): the row above is ImageWidth pixels back in src/dst but
					// Width entries back in DiffAcc - confirm this matches how the caller
					// lays out DiffAcc, in particular for interlaced images.
					dst[-ImageWidth-1]=src[-ImageWidth-1];
					*(DiffAcc-Width-1)=0;

					dst[-ImageWidth]=src[-ImageWidth];
					*(DiffAcc-Width)=0;

					dst[-ImageWidth+1]=src[-ImageWidth+1];
					*(DiffAcc-Width+1)=0;
				}
			}

			*dst=tsrc;
			*DiffAcc=0;
			SCD++;
		}
		else
			if((Lumi-tYold)*(Lumi-tYold)>1) // average pixels
			{
				*dst=((told>>1)&0x7f7f7f7f) + ((tsrc>>1)&0x7f7f7f7f) + ((told|tsrc)&0x01010101);
				// update accum using distance between new pixel and old one
				DiffAcc[0]+=(Lumi-tYold)>>1;
			}
			else // fix pixel
			{
				*dst=told;
				DiffAcc[0]+=Lumi-tYold;
			}

		src++;
		dst++;
		old++;
		DiffAcc++;
		Yold++;
		bEdgesMask++;
	}

	// last pixel of the row
	SCD+=DNRColor((BYTE *)src,(BYTE *)dst,(BYTE *)old,(DWORD *)DiffAcc,pLogThresholdDNR);

	// old and dst point to the last pixel here: copy the whole output row into the old frame
	memcpy(old-Width+1,dst-Width+1,Width<<2);

	return SCD;
}
//************************************************************************************************

/*
	MMX version of DNRlineMP().
	Not yet implemented: there is no MMX code for this filter yet, so the row
	is filtered by the regular DNRlineMP() code. The previous placeholder had
	an empty asm block and filtered only the first two or three pixels of the
	row, leaving the rest of dst untouched.

	Parameters and return value are those of DNRlineMP().
	Rows of zero or one pixel are handled here and never reach DNRlineMP().
*/
DWORD DNRlineMP_MMX(DWORD *src, DWORD *dst, DWORD *old, DWORD *DiffAc, const DWORD *pLogThresholdDNR, DWORD Width, const MyFilterData *mfd, DWORD yOffset, BYTE *Yold)
{
DWORD	SCD;

	if(!Width)
		return 0;

	if(Width==1)
	{
		SCD=DNRColor((BYTE *)src,(BYTE *)dst,(BYTE *)old,DiffAc,pLogThresholdDNR);
		// keep the old frame in step with the output, as the full-row path does
		*old=*dst;
		return SCD;
	}

	return DNRlineMP(src,dst,old,DiffAc,pLogThresholdDNR,Width,mfd,yOffset,Yold);
}
//************************************************************************************************

/*
	Apply DNR to a row of pixels.
	MMX version
*/
DWORD DNRlineMMX(DWORD *src, DWORD *dst, DWORD *old, DWORD *DiffAcc, const DWORD *pLogThresholdDNR, DWORD Width, const MyFilterData *mfd, DWORD dummy2, BYTE *dummy3)
{
static DWORD ebp_sav;

const __int64	Ilum= 0x004d0096001d0000i64; //((__int64)77<<48) | (150<<32) | (29<<16);
static __int64	Iff	= 0xffffffffffffffffi64;

DWORD SCD;

	__asm
	{
		push		edi
		push		esi

		mov			esi, src
		mov			edi, dst
		mov			edx, old
		mov			ebx, DiffAcc
		mov			ecx, Width
		pxor		mm1, mm1		; mm0= SCD
		movq		mm4, Ilum
;		movq		mm5, Iff

		mov			eax, pLogThresholdDNR
		mov			ebp_sav, ebp
		mov			ebp, eax

align 16
	main_init:

		pxor		mm0, mm0
		punpcklbw	mm0, [esi]
		psllq		mm0, 8			; mm0= rgb0

		pxor		mm2, mm2
		punpcklbw	mm2, [edx]
		psllq		mm2, 8			; mm2= rgb0

// compute threshold

		movq		mm7, mm4		; mm7= 77 150 29 0
		pmaddwd		mm7, mm0		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm6, mm7		; mm6= b*29 ?
		paddd		mm7, mm6		; mm7= r*77+g*150+b*29 ?
		psrlq		mm7, 40			; mm7= luminance
		movd		eax, mm7		; eax= luminance
		punpckldq	mm3, [ebp+eax*4]; mm3= threshold ?

// compute new color

		movq		mm7, mm2
		paddw		mm2, mm0
		psrlw		mm2, 1			; (old+src)/2

// compute which color to keep

		psubw		mm7, mm0		; mm7= diff1 diff2 diff3 0
		pmaddwd		mm7, mm7		; mm7= d1*d1+d2*d2 d3*d3+0*0
		punpckldq	mm6, mm7		; mm6= d3*d3 ?
		paddd		mm6, mm7		; mm6= d1*d1+d2*d2+d3*d3 ?

		punpckldq	mm7, [ebx]		; mm7= DiffAcc ?
		movq		mm5, mm6
		paddd		mm6, mm7		; mm6= DiffAcc+(d1*d1+d2*d2+d3*d3) ?
		psrld		mm5, 1
		paddd		mm5, mm7		; mm6= DiffAcc+(d1*d1+d2*d2+d3*d3) ?

		pcmpgtd		mm6, mm3		; if(d1*d1+d2*d2+d3*d3>tThreshold)
		punpckhdq	mm6, mm6		; duplicate the mask

		movq		mm3, mm6
		psrlq		mm3, 63
		paddd		mm1, mm3		; if(d1*d1+d2*d2+d3*d3>tThreshold) SCD++

		pand		mm0, mm6		; if(d1*d1+d2*d2+d3*d3>tThreshold) src
		pxor		mm6, Iff		; mm6= !mm6
		pand		mm2, mm6		; if(d1*d1+d2*d2+d3*d3<=tThreshold) (old+src)/2
		por			mm0, mm2		; mm0= rgb0
		psrlq		mm0, 16			; mm0= 0rgb
		packuswb	mm0, mm0		; mm0= 0rgb

// update accumulator and write result

		pand		mm5, mm6		; if(d1*d1+d2*d2+d3*d3>tThreshold) DiffAcc=0
		psrlq		mm5, 32
		movd		[ebx], mm5		; update DiffAcc

		movd		[edi], mm0		; *dst= 0rgb
		movd		[edx], mm0		; *old= 0rgb

// process next pixel

		add			esi, 4			; src++
		add			edi, 4			; dst++
		add			edx, 4			; old++
		add			ebx, 4			; DiffAcc++

		sub			ecx, 1			; Width--
		jnz			main_init		; if(Width) goto main_init

// END

		mov			ebp, ebp_sav

		movd		SCD, mm1

		pop			esi
		pop			edi
		emms
	}
	return SCD;
}
