/*
NRS - "Noise reduction suite" filter for VirtualDub
Copyright (C) 2003 Antonio Foranna

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation.
	
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
		
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
			
The author can be contacted at:
kreel@tiscali.it
*/

#include <windows.h>
#include <crtdbg.h>
#include "Struct.h"
#include "Define.h"
#include "DNR.h"



/*
	Build every lookup table needed by the smart blur (SB) stage.

	mfd->radius is forced to at least 1. When mfd->Use_SB is set the function
	(re)allocates and fills:
	  - SharpenStrength2 : sharpen gain in 1/8 units (SharpenPixel shifts the
	                       product right by 3), 0 when SharpenEdges is off
	  - pLogThresholdSB  : 256 DWORD thresholds, filled by SetLogThresholdABS
	                       (MMX) or SetLogThresholdSpace (plain C)
	  - LUTminx/LUTmaxx  : per column, how many pixels of the window fit on the
	                       left / right of the pixel
	  - LUTminy/LUTmaxy  : per row, how many rows of the window fit above /
	                       below the pixel, counted in rows of the same field
	  - pWeight          : diameter*diameter weights (one __int64 holding four
	                       identical words per weight in the MMX layout, one
	                       BYTE per weight otherwise)
	  - pScaleWeight     : 257 reciprocals 0xffff/j (entry 0 is 0), again
	                       replicated four times per entry for MMX

	Always returns 0.

	NOTE(review): _255_Area is 255-diameter*diameter stored in a WORD, so it
	wraps for radius>7 - presumably the UI limits the radius; confirm.
*/
BYTE InitSB(MyFilterData *mfd)
{
	if(!mfd->radius)
		mfd->radius=1;

	if(mfd->Use_SB)
	{
	short	i,j,rowsBelow;
	short	radius=mfd->radius,
			diameter=(radius*2+1);

		// 8 means unity gain: positive strengths multiply it, negative ones divide it
		if(mfd->SharpenStrength>=0)
			mfd->SharpenStrength2=8*(mfd->SharpenStrength+1);
		else
			mfd->SharpenStrength2=8/(-mfd->SharpenStrength+1);
		if(!mfd->SharpenEdges)
			mfd->SharpenStrength2=0;

		DELETE_ARRAY(mfd->pLogThresholdSB);
		CREATE_ARRAY(mfd->pLogThresholdSB,DWORD,256);

		// keep MinThresholdSB strictly below MaxThresholdSB
		if(mfd->MinThresholdSB>=mfd->MaxThresholdSB)
		{
			if(!mfd->MinThresholdSB)
				mfd->MaxThresholdSB=1;
			else
				mfd->MinThresholdSB=mfd->MaxThresholdSB-1;
		}
		if(g_MMXenabled)
			SetLogThresholdABS(mfd,mfd->pLogThresholdSB,mfd->MinThresholdSB,mfd->MaxThresholdSB);
		else
			SetLogThresholdSpace(mfd,mfd->pLogThresholdSB,mfd->MinThresholdSB,mfd->MaxThresholdSB);

		DELETE_ARRAY(mfd->LUTminx);
		DELETE_ARRAY(mfd->LUTmaxx);
		DELETE_ARRAY(mfd->LUTminy);
		DELETE_ARRAY(mfd->LUTmaxy);
		CREATE_ARRAY(mfd->LUTminx,BYTE,mfd->dst_width);
		CREATE_ARRAY(mfd->LUTmaxx,BYTE,mfd->dst_width);
		CREATE_ARRAY(mfd->LUTminy,BYTE,mfd->dst_heigth);
		CREATE_ARRAY(mfd->LUTmaxy,BYTE,mfd->dst_heigth);

		// horizontal extent of the window, clipped at the left/right image border
		for(i=0; i<mfd->dst_width; i++)
		{
			mfd->LUTminx[i]=(BYTE)((i>=radius) ? radius : i);
			mfd->LUTmaxx[i]=(BYTE)((i+radius<mfd->dst_width) ? radius : mfd->dst_width-1-i);
		}
		// vertical extent of the window, clipped at the top/bottom image border.
		// With interlaced material the window only visits rows of the same field,
		// so both limits are counted in field rows (every 2nd frame row).
		for(i=0; i<mfd->dst_heigth; i++)
		{
			j=i>>mfd->Interlaced;											// same-field rows above row i
			rowsBelow=(short)((mfd->dst_heigth-1-i)>>mfd->Interlaced);	// same-field rows below row i
			mfd->LUTminy[i]=(BYTE)((j>=radius) ? radius : j);
			// The original tested j+radius against the full frame height, which
			// (for interlaced input) let the window run past the last image row.
			mfd->LUTmaxy[i]=(BYTE)((rowsBelow>=radius) ? radius : rowsBelow);
		}

	long	cnt=0,
			Idx;
	WORD	_255_Area=255-diameter*diameter;

		DELETE_ARRAY(mfd->pWeight);
		DELETE_ARRAY(mfd->pScaleWeight);

		if(g_MMXenabled)
		{
			CREATE_ARRAY(mfd->pWeight,BYTE,diameter*diameter*sizeof(__int64));
		__int64	*pWeight=(__int64 *)mfd->pWeight;

			// pass 1: raw pyramid weights (diameter-1 in the centre, 0 at the
			// corners, halved when radius>3) and their total
			mfd->WeightMax=(BYTE)diameter;
			for(j=-(short)radius; j<=radius; j++)
				for(i=-(short)radius; i<=radius; i++)
				{
					Idx=i+radius+(j+radius)*diameter;
					pWeight[Idx]=mfd->WeightMax-(i>=0 ? i : -i)-(j>=0 ? j : -j)-1;
					if(radius>3)
						pWeight[Idx]=pWeight[Idx]>>1;
					cnt+=(WORD)pWeight[Idx];
				}
			// pass 2: rescale so that every weight is >=1 and the total stays
			// below 256, then replicate the weight into the four words of the qword
			mfd->SumOfWeights=0;
			for(j=-radius; j<=radius; j++)
				for(i=-radius; i<=radius; i++)
				{
					Idx=i+radius+(j+radius)*diameter;
					pWeight[Idx]=1+(BYTE)((pWeight[Idx]*_255_Area)/cnt);
					mfd->SumOfWeights+=(WORD)pWeight[Idx];
					pWeight[Idx]=(pWeight[Idx]<<48) | (pWeight[Idx]<<32) | (pWeight[Idx]<<16) | pWeight[Idx];
				}
			mfd->WeightMax=(BYTE)(pWeight[radius+radius*diameter]&0xff);
			mfd->pCenterOfWeight=(BYTE *)(pWeight+radius+radius*diameter);
			mfd->_255DivMaskArea=255.0f/(mfd->SumOfWeights-mfd->WeightMax);

			// reciprocal table: 0xffff/j in each of the four words
			CREATE_ARRAY(mfd->pScaleWeight,BYTE,257*sizeof(__int64));
		__int64	*pScaleWeight=(__int64 *)mfd->pScaleWeight;
		__int64	tmp;
			for(j=1; j<=256; j++)
			{
				tmp=0xffff/j;
				pScaleWeight[j]=(tmp<<48) | (tmp<<32) | (tmp<<16) | tmp;
			}
			pScaleWeight[0]=0;
		}
		else
		{
			CREATE_ARRAY(mfd->pWeight,BYTE,diameter*diameter);
		BYTE	*pWeight=mfd->pWeight;

			// pass 1: raw pyramid weights and their total (see MMX branch)
			mfd->WeightMax=(BYTE)diameter;
			for(j=-(short)radius; j<=radius; j++)
				for(i=-(short)radius; i<=radius; i++)
				{
					Idx=i+radius+(j+radius)*diameter;
					pWeight[Idx]=mfd->WeightMax-(i>=0 ? i : -i)-(j>=0 ? j : -j)-1;
					if(radius>3)
						pWeight[Idx]=pWeight[Idx]>>1;
					cnt+=pWeight[Idx];
				}
			// pass 2: rescale so that every weight is >=1 and the total stays below 256
			mfd->SumOfWeights=0;
			for(j=-radius; j<=radius; j++)
				for(i=-radius; i<=radius; i++)
				{
					Idx=i+radius+(j+radius)*diameter;
					pWeight[Idx]=1+(BYTE)((pWeight[Idx]*_255_Area)/cnt);
					mfd->SumOfWeights+=pWeight[Idx];
				}
			mfd->WeightMax=(BYTE)pWeight[radius+radius*diameter];
			mfd->pCenterOfWeight=(BYTE *)(pWeight+radius+radius*diameter);
			mfd->_255DivMaskArea=255.0f/(mfd->SumOfWeights-mfd->WeightMax);

			// reciprocal table: 0xffff/j, one WORD per entry
			CREATE_ARRAY(mfd->pScaleWeight,BYTE,257*sizeof(WORD));
		WORD	*pScaleWeight=(WORD *)mfd->pScaleWeight;
			for(j=1; j<=256; j++)
				pScaleWeight[j]=0xffff/j;
			pScaleWeight[0]=0;
		}
		// the accumulated weight is used as an index into pScaleWeight
		_ASSERT(mfd->SumOfWeights<256);
	}
	return 0;
}



//************************************************************************************************
//									INLINE FUNCS
//************************************************************************************************

/*
	Unsharp-mask one pixel.

	SumR/SumG/SumB are the channel sums of the neighbourhood for bytes 0/1/2 of
	the pixel and Mul is the 16.16 reciprocal of the number of summed pixels, so
	(Sum*Mul)>>16 is the blurred value. The difference between the source pixel
	and that blurred value is scaled by Strength/8 and added back to the source;
	the result is passed through CLIP and stored in bdst[0..2].
*/
inline void SharpenPixel(BYTE *bsrc, BYTE *bdst, long SumR, long SumG, long SumB, DWORD Mul, WORD Strength)
{
long	Blurred[3];
short	Boost[3];
int		ch;

	// blurred value of each channel
	Blurred[0]=(SumR*Mul)>>16;
	Blurred[1]=(SumG*Mul)>>16;
	Blurred[2]=(SumB*Mul)>>16;

	// amount to add to each channel (Strength is expressed in 1/8 units)
	for(ch=0; ch<3; ch++)
		Boost[ch]=((bsrc[ch]-Blurred[ch])*Strength)>>3;

	// store the clipped result only after every channel has been read
	for(ch=0; ch<3; ch++)
		bdst[ch]=CLIP(bsrc[ch]+Boost[ch]);
}


/*
	Apply the thresholded, weighted blur to one pixel (plain C version).

	bsrc / bdst       : source pixel and destination pixel (4 bytes per pixel,
	                    only bytes 0..2 are read and written)
	bCenterOfWeight   : weight that belongs to the centre of the window
	                    (one BYTE per weight)
	pScaleWeight      : table of 0xffff/n reciprocals, indexed by a sum of
	                    weights or by a pixel count
	minx,maxx         : pixels of the window on the left / right of bsrc
	miny,maxy         : rows of the window above / below bsrc
	radius            : not used here
	diameter          : row stride of the weight table
	interlaced        : 0 or 1; shifts both the image stride and the weight
	                    stride so that only every 2nd row is visited
	ImageWidthx4      : source row stride in bytes
	SharpenStrength   : gain handed to SharpenPixel, 0 disables sharpening
	pLogThresholdSB   : 256 thresholds indexed by the luminance of bsrc

	A neighbour contributes to the blur only if its squared RGB distance from
	bsrc does not exceed the threshold. If sharpening is enabled and the
	accepted weight is below 128, the blurred result is replaced by
	SharpenPixel() applied to the plain average of the whole window.

	Returns the sum of the accepted weights.

	NOTE(review): with interlaced=1 the weight stride is doubled while the table
	built by InitSB only holds `diameter` rows - verify that callers pass values
	that keep bWeight inside the table.
*/
inline DWORD ComputeBlur(	BYTE *bsrc, BYTE *bdst, BYTE *bCenterOfWeight, WORD *pScaleWeight,
							BYTE minx, BYTE maxx, BYTE miny, BYTE maxy, BYTE radius, BYTE diameter, BYTE interlaced,
							DWORD ImageWidthx4, WORD SharpenStrength, const DWORD *pLogThresholdSB)
{
BYTE	*bNear, *bNearMaxX, *bWeight;
BYTE	*bWt, *bNt;
DWORD	R=0,G=0,B=0,cnt=0;			// weighted sums of the accepted neighbours
long	R2=0,G2=0,B2=0,cnt2=0;		// plain sums of every neighbour (for the unsharp mask)
// The centre pixel and its threshold are the same for every neighbour, so they
// are fetched once instead of once per window pixel.
const int	Src0=bsrc[0], Src1=bsrc[1], Src2=bsrc[2];
const DWORD	Threshold=pLogThresholdSB[(BYTE)((Src2*77L+Src1*150L+Src0*29L)>>8)];
int		d0,d1,d2;
DWORD	w;

	maxy+=miny+1;					// maxy= number of rows in the window
	maxx=(minx+maxx+1)<<2;			// maxx= width of a window row in bytes
	ImageWidthx4<<=interlaced;
	diameter<<=interlaced;
	bNt=bNear=bsrc-(minx<<2)-miny*ImageWidthx4;		// top-left pixel of the window
	bNearMaxX=bNear+maxx;
	bWt=bWeight=bCenterOfWeight-minx-miny*(WORD)diameter;	// weight of that pixel
	do
	{
		do
		{
			R2+=bNear[0];
			G2+=bNear[1];
			B2+=bNear[2];
			cnt2++;

			d0=Src0-bNear[0];
			d1=Src1-bNear[1];
			d2=Src2-bNear[2];
			if((DWORD)(d0*d0+d1*d1+d2*d2)<=Threshold)
			{
				w=*bWeight;
				R+=bNear[0]*w;
				G+=bNear[1]*w;
				B+=bNear[2]*w;
				cnt+=w;
			}
			_ASSERT(*bWeight);
			bWeight++;
			bNear+=4;
		}while(bNear<bNearMaxX);

		// next row of the window and of the weight table
		bNt=bNear=bNt+ImageWidthx4;
		bNearMaxX=bNear+maxx;
		bWt=bWeight=bWt+diameter;
	}while(--maxy);

// scale result and store it ---------------------------------------------------------------------

	bdst[0]=(BYTE)((R*pScaleWeight[cnt])>>16);
	bdst[1]=(BYTE)((G*pScaleWeight[cnt])>>16);
	bdst[2]=(BYTE)((B*pScaleWeight[cnt])>>16);

// optionally unsharp it -------------------------------------------------------------------------

	if(SharpenStrength && cnt<128)
		SharpenPixel(bsrc,bdst,R2,G2,B2,pScaleWeight[cnt2],SharpenStrength);

	return cnt;
}
//************************************************************************************************

/*
	Apply the thresholded, weighted blur to one pixel (MMX version).

	bsrc / bdst            : source pixel and destination pixel (one DWORD each)
	bCenterOfWeight        : weight of the window centre; the table holds one
	                         __int64 per weight (the weight repeated in 4 words)
	pScaleWeight           : reciprocal table, read 8 bytes per entry
	pLogThresholdSB        : 256 DWORD thresholds indexed by the luminance of bsrc
	interlaced             : shift applied to ImageWidthx4 when stepping a row
	minx,maxx,miny,maxy    : extent of the window around bsrc
	radius                 : not used here
	diameter               : row length of the weight table (in weights)
	minxminydiameter       : number of weights between the first weight of the
	                         window and the centre weight
	minxminyImageWidthx4   : number of bytes between the first pixel of the
	                         window and bsrc
	ImageWidthx4           : source row stride in bytes

	A neighbour is accepted when every byte of |neighbour-bsrc| is <= the
	matching byte of the threshold DWORD. Accepted neighbours are multiplied by
	their weight and accumulated in mm2 together with the weight itself (lowest
	word). The result is scaled with pScaleWeight[sum of weights] and written
	to *bdst.

	Returns the sum of the accepted weights.

	NOTE(review): pmulhuw is not part of the base MMX instruction set (it came
	with SSE / extended MMX) - confirm that g_MMXenabled implies its presence.
*/
inline DWORD BlurMMX(	DWORD *bsrc, DWORD *bdst, BYTE *bCenterOfWeight, WORD *pScaleWeight, const DWORD *pLogThresholdSB, BYTE interlaced,
						BYTE minx, BYTE maxx, BYTE miny, BYTE maxy, BYTE radius, BYTE diameter,
						DWORD minxminydiameter, DWORD minxminyImageWidthx4, DWORD ImageWidthx4)
{
DWORD	cnt;
//const	DWORD	*bNear=bsrc-minxminyImageWidth;//((minx<<2)+(miny<<interlaced)*ImageWidthx4);
const __int64	*dWeight=(__int64 *)bCenterOfWeight-minxminydiameter;//(minx+miny*diameter);
const BYTE	MaskWidth=minx+maxx+1;
const BYTE	MaskHeight=miny+maxy+1;
// bytes to skip at the end of a window row to reach the next one, in the image and in the weight table
const DWORD	xGapNear=(ImageWidthx4<<interlaced)-(MaskWidth<<2),
			xGapWeight=(diameter-MaskWidth)<<3; // Weight is (__int64 *)
// both loop counters packed in one register: width in bits 8..15 (ch), height in bits 0..7 (cl)
const DWORD MaskWidth_MaskHeight=((short)MaskWidth<<8) | MaskHeight;

// I256 is not referenced by the code below
const DWORD I1	= 0x0000000000000001i64,
			I256	= 0x0000000000000100i64;
const __int64 Ilum	= 0x004d0096001d0000i64; //((__int64)77<<48) | (150<<32) | (29<<16);
//const __int64 Iw	= 0x00ff00ff00ff00ffi64;

	__asm
	{
		push		edi
		push		esi
		push		ebx

// main init -------------------------------------------------------------------------------------

		mov			esi, bsrc
		pxor		mm0, mm0
		punpcklbw	mm0, [esi]
		psllq		mm0, 8			; mm0= rgb0

// compute threshold
// (the two dword sums of pmaddwd are added together and the total is shifted
//  right by 8: that is the same luminance index the C version computes)

		mov			edx, pLogThresholdSB
		movq		mm7, Ilum		; mm7= 77 150 29 0
		pmaddwd		mm7, mm0		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm6, mm7		; mm6= d3*d3 ?
		paddd		mm7, mm6		; mm7= d1*d1+d2*d2+d3*d3
		psrlq		mm7, 40			; mm7= luminance
		movd		eax, mm7		; eax= luminance
		movd		mm3, [edx+eax*4]; mm3= threshold

// x_loop init -----------------------------------------------------------------------------------

		sub			esi, minxminyImageWidthx4	; esi= bNear
		mov			ebx, xGapWeight
		mov			ecx, MaskWidth_MaskHeight
		mov			edx, dWeight
		mov			edi, xGapNear

		xor			eax, eax
		mov			al, ch

		movd		mm1, I1		; mm1= 1
		pxor		mm2, mm2	; mm2= raccum gaccum baccum waccum
		pxor		mm5, mm5
		packuswb	mm0, mm0	; mm0= ? rgb0

// go right

align 16
	x_loop:

		movd		mm4, [esi]	; mm4= ?rgb
		pslld		mm4, 8		; mm4= rgb0
		add			esi, 4		; go ahead 1 pixel

		// absolute difference per byte: (a-b saturated) | (b-a saturated)
		movq		mm7, mm0	; mm7= src
		movq		mm6, mm4	; mm6= rgb0
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0

;		movq		mm6, mm7
;		pminub		mm7, mm3
;		pcmpeqd		mm7, mm6	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)
		// the saturated subtraction leaves 0 in every byte that is within the threshold
		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		// the constant 1 in the low byte turns into the weight after the multiply
		por			mm4, mm1	; mm4= 0 rgb1
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001

		pmullw		mm4, [edx]	; mm4= r*w g*w b*w 1*w
		add			edx, 8		; go ahead 1 pixel
		paddusw		mm2, mm4	; mm2= raccum gaccum baccum waccum

;		dec			eax
		sub			eax, 1
		jnz			x_loop		; if(minx+maxx+1)

// go down

		add			edx, ebx	; bWeight+=xGapWeight
		add			esi, edi	; bNear+=xGapNear
		mov			al, ch		; eax= minx+maxx+1

;		dec			cl
		sub			cl, 1
		jnz			x_loop		; if(miny+maxy+1)

// scale result and store it ---------------------------------------------------------------------
// (InitSB fills the table with 0xffff/w, replicated in the four words of each entry)

		movd		eax, mm2
		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		movq		mm0, [ecx+eax*8]	; mm6= 0x7fff/w
		pmulhuw		mm0, mm2			; src*0x7fff/w
		packuswb	mm0, mm0			; mm0= 0000rgb?
		psrlq		mm0, 8				; mm6= 0rgb

		mov			ecx, bdst
		movd		[ecx], mm0

		// return value: the accumulated weight
		mov			cnt, eax

		pop			ebx
		pop			esi
		pop			edi
		emms
	}
	return cnt;
}



//************************************************************************************************
//								FUNCS TO PROCESS ROWS
//************************************************************************************************


/*void UnsharpLine(	DWORD *src, DWORD *dst, DWORD *bCenterOfWeight, const DWORD *pLogThresholdSB,
					BYTE diameter, BYTE diametery, DWORD minydiameter, DWORD minyImageWidthx4,
					DWORD xGapNear, DWORD in_width, MyFilterData *mfd, DWORD yOffset, BYTE *dummy1, BYTE *dummy2);
*/

/*
	Render one row of the edge map (plain C version).
	Every pixel is blurred with ComputeBlur(); the weight accepted by the blur
	is then turned into a grey level (the more weight, the darker) and written
	over bytes 0..2 of the destination pixel. Edges are used to set the
	strength of the filter.

	diametery, minydiameter, minyImageWidthx4 and xGapNear are not used by
	this version. dst_width is the number of pixels to process and must not
	be 0.
*/
void BlurShowEdgesLine(	DWORD *src, DWORD *dst, DWORD *bCenterOfWeight, const DWORD *pLogThresholdSB,
						BYTE diameter, BYTE diametery, DWORD minydiameter, DWORD minyImageWidthx4,
						DWORD xGapNear, DWORD dst_width, MyFilterData *mfd, BYTE *pMiny, BYTE *pMaxy)
{
const WORD	SrcPitchPixels=mfd->wnc;
const DWORD	SrcPitchBytes=SrcPitchPixels<<2;
const BYTE	FieldShift=mfd->Interlaced;
const BYTE	BlurRadius=mfd->radius;
WORD	*pReciprocal=(WORD*)mfd->pScaleWeight;
WORD	Gain=mfd->SharpenStrength2;
BYTE	*pLeft=mfd->LUTminx,
		*pRight=mfd->LUTmaxx;

	for(;;)
	{
	BYTE	*pOut=(BYTE *)dst;
	DWORD	Accepted=ComputeBlur((BYTE *)src,pOut,(BYTE *)bCenterOfWeight,pReciprocal,
								*pLeft,*pRight,*pMiny,*pMaxy,BlurRadius,diameter,FieldShift,
								SrcPitchBytes,Gain,pLogThresholdSB);
	// weight accepted besides the centre pixel, rescaled to 0..255 and inverted
	BYTE	Grey=(BYTE)(255-(BYTE)((Accepted-mfd->WeightMax)*mfd->_255DivMaskArea));

		pOut[2]=Grey;
		pOut[1]=Grey;
		pOut[0]=Grey;

		src++;
		dst++;
		pLeft++;
		pRight++;

		if(!--dst_width)
			break;
	}
}
//************************************************************************************************

/*
	Apply the blur to one row of pixels (plain C version).
	The row length is taken from mfd->dst_width (truncated to a WORD) and must
	not be 0; the horizontal window limits come from mfd->LUTminx/LUTmaxx, the
	vertical ones from *pMiny/*pMaxy.

	diametery, minydiameter, minyImageWidthx4, xGapNear, dummy1 and yOffset
	are not used by this version.
*/
void BlurLine(	DWORD *src, DWORD *dst, DWORD *bCenterOfWeight, const DWORD *pLogThresholdSB,
				BYTE diameter, BYTE diametery, DWORD minydiameter, DWORD minyImageWidthx4,
				DWORD xGapNear, DWORD dummy1, MyFilterData *mfd, DWORD yOffset, BYTE *pMiny, BYTE *pMaxy)
{
const WORD	SrcPitchPixels=mfd->wnc;
const DWORD	SrcPitchBytes=SrcPitchPixels<<2;
const BYTE	FieldShift=mfd->Interlaced;
const BYTE	BlurRadius=mfd->radius;
WORD	*pReciprocal=(WORD *)mfd->pScaleWeight;
WORD	Gain=mfd->SharpenStrength2;
WORD	Remaining=(WORD)mfd->dst_width;
BYTE	*pLeft=mfd->LUTminx,
		*pRight=mfd->LUTmaxx;

	for(;;)
	{
		ComputeBlur((BYTE *)src,(BYTE *)dst,(BYTE *)bCenterOfWeight,pReciprocal,
					*pLeft,*pRight,*pMiny,*pMaxy,BlurRadius,diameter,FieldShift,
					SrcPitchBytes,Gain,pLogThresholdSB);

		src++;
		dst++;
		pLeft++;
		pRight++;

		if(!--Remaining)
			break;
	}
}
//************************************************************************************************

/*
	Render one row of the edge map (MMX version).
	Every pixel is blurred with BlurMMX(); the weight accepted by the blur is
	then turned into a grey level (the more weight, the darker) and written
	over bytes 0..2 of the destination pixel. Edges are used to set the
	strength of the filter.

	minydiameter and minyImageWidthx4 are the vertical parts of the offsets
	BlurMMX needs; the horizontal parts are added here for every pixel.
	diametery and xGapNear are not used. dst_width is the number of pixels to
	process and must not be 0.
*/
void BlurShowEdgesLineMMX(	DWORD *src, DWORD *dst, DWORD *bCenterOfWeight, const DWORD *pLogThresholdSB,
							BYTE diameter, BYTE diametery, DWORD minydiameter, DWORD minyImageWidthx4,
							DWORD xGapNear, DWORD dst_width, MyFilterData *mfd, BYTE *pMiny, BYTE *pMaxy)
{
const WORD	SrcPitchPixels=mfd->wnc;
const BYTE	FieldShift=mfd->Interlaced;
const DWORD	SrcPitchBytes=SrcPitchPixels<<2;
const BYTE	BlurRadius=mfd->radius;
WORD	*pReciprocal=(WORD*)mfd->pScaleWeight;
BYTE	*pLeft=mfd->LUTminx,
		*pRight=mfd->LUTmaxx;

	for(;;)
	{
	DWORD	Accepted=BlurMMX(src,dst,(BYTE *)bCenterOfWeight,pReciprocal,pLogThresholdSB,FieldShift,
							*pLeft,*pRight,*pMiny,*pMaxy,BlurRadius,diameter,
							*pLeft+minydiameter,(*pLeft<<2)+minyImageWidthx4,SrcPitchBytes);
	// weight accepted besides the centre pixel, rescaled to 0..255 and inverted
	BYTE	Grey=(BYTE)(255-(BYTE)((Accepted-mfd->WeightMax)*mfd->_255DivMaskArea));
	BYTE	*pOut=(BYTE *)dst;

		pOut[2]=Grey;
		pOut[1]=Grey;
		pOut[0]=Grey;

		src++;
		dst++;
		pLeft++;
		pRight++;

		if(!--dst_width)
			break;
	}
}
//************************************************************************************************

/*
	Apply the thresholded, weighted blur to a whole row of pixels (MMX version).

	src / dst            : first source / destination pixel of the row
	bCenterOfWeight      : weight of the window centre (one __int64 per weight)
	pLogThresholdSB      : 256 DWORD thresholds indexed by pixel luminance
	diameter             : width of the full window; radius is diameter>>1
	diametery            : window height minus 1 for this row
	minydiameter         : number of weights between the first weight used for
	                       the first pixel of the row and the centre weight
	minyImageWidthx4     : number of bytes between the first window pixel of
	                       the first pixel of the row and src
	xGapNear             : bytes added to the source pointer after each window
	                       row to reach the next one (value for the first pixel)
	in_width             : number of pixels processed with the full window width
	mfd                  : only mfd->pScaleWeight is read
	dummy2,dummy3,dummy4 : not used

	The row is processed in three phases:
	  1. left columns  - `radius` pixels; the window starts at the pixel itself
	                     and gets one column wider on the left for each pixel
	  2. mid pixels    - `in_width` pixels with the full window, two neighbours
	                     per iteration plus one final single neighbour
	  3. right columns - `radius` pixels; the window loses one column on the
	                     right for each pixel

	The current src, dst and loop counter are kept on the stack. While running,
	the asm updates pWeight, minyImageWidthx4 and MaskWidth_MaskHeight in memory
	(the latter although it is declared const).

	NOTE(review): the three phase loops test their counter after the body, so a
	radius or an in_width of 0 wraps around - callers presumably guarantee both
	are >=1; confirm.
	NOTE(review): pmulhuw is not part of the base MMX instruction set (it came
	with SSE / extended MMX).
*/
void BlurLineMMX(	DWORD *src, DWORD *dst, DWORD *bCenterOfWeight, const DWORD *pLogThresholdSB,
					BYTE diameter, BYTE diametery,
					DWORD minydiameter, DWORD minyImageWidthx4, DWORD xGapNear, DWORD in_width, MyFilterData *mfd, DWORD dummy2, BYTE *dummy3, BYTE *dummy4)
{
const __int64	Ilum	= 0x004d0096001d0000i64, //((__int64)77<<48) | (150<<32) | (29<<16);
				I1		= 0x0000000100000001i64;

const __int64	*pWeight=(__int64 *)bCenterOfWeight-minydiameter;
const __int64	*pScaleWeight=(__int64	*)mfd->pScaleWeight;

const DWORD	radius=diameter>>1;
// window size for the first pixel of the row: it has no neighbours on its left
const BYTE	MaskWidth=(BYTE)radius+1,
			MaskHeight=diametery+1;
// loop counters packed in one register: width in bits 8..15 (ch), height in bits 0..7 (cl)
const DWORD MaskWidth_MaskHeight=((short)MaskWidth<<8) | MaskHeight,
			diameter_MaskHeight=(((short)diameter-1)<<8) | MaskHeight; // diameter-1 because a couple of pixel are processed at once
const DWORD	xGapWeight=(diameter-MaskWidth)<<3; // Weight is (__int64 *)

	__asm
	{
		push		edi
		push		esi
		push		ebx

// main init -------------------------------------------------------------------------------------

		push		src
		push		dst
;		push		in_width

		mov			esi, src
		mov			edi, xGapNear
		mov			ebx, xGapWeight;
		movq		mm1, I1			; mm1= 1
		pxor		mm5, mm5


//******************//
// left column init ------------------------------------------------------------------------------
//******************//

		push		radius

align 16
	left_corner:

		pxor		mm0, mm0
		punpcklbw	mm0, [esi]
		psllq		mm0, 8			; mm0= rgb0

// compute threshold
// (the two dword sums of pmaddwd are added together and the total is shifted
//  right by 8: that is the luminance used as index of pLogThresholdSB)

		mov			edx, pLogThresholdSB
		movq		mm7, Ilum		; mm7= 77 150 29 0
		pmaddwd		mm7, mm0		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm6, mm7		; mm6= d3*d3 ?
		paddd		mm7, mm6		; mm7= d1*d1+d2*d2+d3*d3
		psrlq		mm7, 40			; mm7= luminance
		movd		eax, mm7		; eax= luminance
		movd		mm3, [edx+eax*4]; mm3= threshold

// x_loop init -----------------------------------------------------------------------------------

		sub			esi, minyImageWidthx4	; esi= bNear
		mov			ecx, MaskWidth_MaskHeight
		mov			edx, pWeight

		xor			eax, eax
		mov			al, ch

		pxor		mm2, mm2	; mm2= raccum gaccum baccum waccum
		packuswb	mm0, mm0	; mm0= ? rgb0

// go right

align 16
	x_loop_lc:

		movd		mm4, [esi]	; mm4= ?rgb
		pslld		mm4, 8		; mm4= 0 rgb0
		add			esi, 4		; go ahead 1 pixel

		// absolute difference per byte: (a-b saturated) | (b-a saturated)
		movq		mm7, mm0	; mm7= src
		movq		mm6, mm4	; mm7= src
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0

		// the saturated subtraction leaves 0 in every byte that is within the threshold
		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		// the constant 1 in the low byte turns into the weight after the multiply
		por			mm4, mm1	; mm4= 0 rgb1
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001

		pmullw		mm4, [edx]	; mm4= r*w g*w b*w 1*w
		add			edx, 8		; go ahead 1 pixel
		paddusw		mm2, mm4	; mm2= raccum gaccum baccum waccum

;		dec			eax
		sub			eax, 1
		jnz			x_loop_lc	; if(minx+maxx+1)

// go down

		add			edx, ebx	; bWeight+=xGapWeight
		add			esi, edi	; bNear+=xGapNear
		mov			al, ch		; eax= minx+maxx+1

;		dec			cl
		sub			cl, 1
		jnz			x_loop_lc	; if(miny+maxy+1)

// scale result and store it ---------------------------------------------------------------------
/*
		movd		eax, mm2
		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		movq		mm6, [ecx+eax*8]	; mm6= 0x7fff/w
		movq		mm0, mm2
		pmulhuw		mm2, mm6			; src*0x7fff/w
		psllw		mm2, 1				; mm2= rgb?
		pmullw		mm0, mm6			; src*0x7fff/w
		psrlw		mm0, 15				; mm0= rgb?
		por			mm0, mm2			; mm0= rgb?
		packuswb	mm0, mm0			; mm0= 0000rgb?
		psrlq		mm0, 8				; mm6= 0rgb
*/
		// (InitSB fills the table with 0xffff/w, replicated in the four words of each entry)
		movd		eax, mm2
		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		movq		mm0, [ecx+eax*8]	; mm6= 0x7fff/w
		pmulhuw		mm0, mm2			; src*0x7fff/w
		packuswb	mm0, mm0			; mm0= 0000rgb?
		psrlq		mm0, 8				; mm6= 0rgb

// process next pixel ----------------------------------------------------------------------------
// the next pixel has one more neighbour on its left: start one weight and one
// pixel earlier and make the window one column wider

		mov			edx, pWeight
		sub			edx, 8		; pWeight-=8
		mov			pWeight, edx
		mov			eax, minyImageWidthx4
		add			eax, 4
		mov			minyImageWidthx4, eax
		mov			ecx, MaskWidth_MaskHeight
		add			ecx, 0x100
		mov			MaskWidth_MaskHeight, ecx

		pop			eax			; radius
		pop			ecx			; dst
		pop			esi			; src

		add			esi, 4		; src+=4
		movd		[ecx], mm0	; *dst= 0rgb
		add			ecx, 4		; dst+=4

		sub			edi, 4		; xGapNear-=4
		sub			ebx, 8		; xGapWeight-=8

		// push does not touch the flags, so jnz still tests the result of dec
		dec			eax			; radius--
		push		esi
		push		ecx
		push		eax
		jnz			left_corner

//******************//
// mid pixels ------------------------------------------------------------------------------------
//******************//

		pop			eax				; radius
		push		in_width

align 16
	main_init:

;		pxor		mm0, mm0
;		punpcklbw	mm0, [esi]
;		psllq		mm0, 8			; mm0= rgb0
		movd		mm0, [esi]		; mm0= 0rgb
		punpcklbw	mm0, mm5
		psllq		mm0, 16			; mm0= rgb0

// compute threshold

		mov			edx, pLogThresholdSB
		movq		mm7, Ilum		; mm7= 77 150 29 0
		pmaddwd		mm7, mm0		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm6, mm7		; mm6= d3*d3 ?
		paddd		mm7, mm6		; mm7= d1*d1+d2*d2+d3*d3
		psrlq		mm7, 40			; mm7= luminance
		movd		eax, mm7		; eax= luminance
		movd		mm3, [edx+eax*4]; mm3= threshold
		punpckldq	mm3, mm3		; duplicate the mask placed into the low DWORD

// x_loop init -------------------------------------------------------------------------------------

		sub			esi, minyImageWidthx4	; esi= bNear
		mov			ecx, diameter_MaskHeight
		mov			edx, pWeight
;		mov			ebx, xGapWeight

		xor			eax, eax
		mov			al, ch		; eax= minx+maxx

		pxor		mm2, mm2	; mm2= raccum gaccum baccum waccum
		packuswb	mm0, mm0	; mm0= ? rgb0
		punpckldq	mm0, mm0	; mm6= rgb0 rgb0

// process pixels into radius

align 16
	x_loop:

// process a pair of pixels

		movq		mm4, [esi]	; mm4= ?rgb ?rgb
		add			esi, 8		; go ahead a pair of pixels
		pslld		mm4, 8		; mm4= rgb0 rgb0

		movq		mm7, mm0	; mm7= src src
		movq		mm6, mm4	; mm6= rgb0 rgb0
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0 |d1-d1| |d2-d2| |d3-d3| 0

		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold) for both pixels

		por			mm4, mm1	; mm4= rgb1 rgb1
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		// second pixel of the pair: its bytes land in the high byte of each
		// word of mm6 and are moved down by the psrlw below
		punpckhbw	mm6, mm4
		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001
		pmullw		mm4, [edx]	; mm6= r*w g*w b*w 1*w
		paddw		mm2, mm4	; mm2= raccum gaccum baccum waccum

		psrlw		mm6, 8		; mm6= 00rr 00gg 00bb 0001
		pmullw		mm6, [edx+8]; mm4= r*w g*w b*w 1*w
		paddw		mm2, mm6	; mm2= raccum gaccum baccum waccum
		add			edx, 16		; go ahead a pair of pixels

// check for jump

;		dec			eax
		sub			eax, 2
		jnz			x_loop		; if(minx+maxx+1)

// add last pixel in the row

		movd		mm4, [esi]	; mm4= 0 ?rgb
		add			esi, 4		; go ahead 1 pixel
		pslld		mm4, 8		; mm4= 0 rgb0

		movq		mm7, mm0	; mm7= src
		movq		mm6, mm4	; mm7= src
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0

		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		por			mm4, mm1	; mm4= 0 rgb1
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001

		pmullw		mm4, [edx]	; mm7= r*w g*w b*w 1*w
;		movq		mm6, [edx]	; mm4= wwww
		add			edx, 8		; go ahead 1 pixel
;		pmullw		mm4, mm6	; mm7= r*w g*w b*w 1*w
		paddusw		mm2, mm4	; mm2= raccum gaccum baccum waccum

// go down
// (the window spans the whole weight row here, so only the source pointer needs a gap)

		add			esi, edi	; bNear+=xGapNear
;		add			edx, ebx	; bNear+=xGapWeight
		mov			al, ch		; eax= minx+maxx

;		dec			cl
		sub			cl, 1
		jnz			x_loop		; if(miny+maxy+1)

// scale result and store it ---------------------------------------------------------------------

		movd		eax, mm2
		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		movq		mm0, [ecx+eax*8]	; mm6= 0x7fff/w
		pmulhuw		mm0, mm2			; src*0x7fff/w
		packuswb	mm0, mm0			; mm0= 0000rgb?
		psrlq		mm0, 8				; mm6= 0rgb

// process next pixel ----------------------------------------------------------------------------

		pop			eax			; in_width
		pop			ecx			; dst
		pop			esi			; src

		add			esi, 4		; src+=4
		movd		[ecx], mm0	; *dst= 0rgb
		add			ecx, 4		; dst+=4

		dec			eax			; in_width--
		push		esi
		push		ecx
		push		eax
		jnz			main_init

//******************//
// right column init ------------------------------------------------------------------------------
//******************//

		pop			eax			; in_width
		push		radius

		add			edi, 4		; xGapNear+=4
		add			ebx, 8		; xGapWeight+=8

align 16
	right_corner:

		pxor		mm0, mm0
		punpcklbw	mm0, [esi]
		psllq		mm0, 8			; mm0= rgb0

// compute threshold

		mov			edx, pLogThresholdSB
		movq		mm7, Ilum		; mm7= 77 150 29 0
		pmaddwd		mm7, mm0		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm6, mm7		; mm6= d3*d3 ?
		paddd		mm7, mm6		; mm7= d1*d1+d2*d2+d3*d3
		psrlq		mm7, 40			; mm7= luminance
		movd		eax, mm7		; eax= luminance
		movd		mm3, [edx+eax*4]; mm3= threshold

// x_loop init -----------------------------------------------------------------------------------

		// the window loses one column on the right for every pixel of this phase
		mov			ecx, MaskWidth_MaskHeight
		sub			ecx, 0x100
		mov			MaskWidth_MaskHeight, ecx

		sub			esi, minyImageWidthx4	; esi= bNear
		mov			ecx, MaskWidth_MaskHeight
		mov			edx, pWeight

		xor			eax, eax
		mov			al, ch

		pxor		mm2, mm2	; mm2= raccum gaccum baccum waccum
		packuswb	mm0, mm0	; mm0= ? rgb0

// go right

align 16
	x_loop_rc:

		movd		mm4, [esi]	; mm4= ?rgb
		pslld		mm4, 8		; mm4= 0 rgb0
		add			esi, 4		; go ahead 1 pixel

		movq		mm7, mm0	; mm7= src
		movq		mm6, mm4	; mm7= src
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0

		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		por			mm4, mm1	; mm4= 0 rgb1
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001

		pmullw		mm4, [edx]	; mm4= r*w g*w b*w 1*w
;		movq		mm6, [edx]	; mm4= wwww
		add			edx, 8		; go ahead 1 pixel
;		pmullw		mm4, mm6	; mm7= r*w g*w b*w 1*w
		paddusw		mm2, mm4	; mm2= raccum gaccum baccum waccum

;		dec			eax
		sub			eax, 1
		jnz			x_loop_rc	; if(minx+maxx+1)

// go down

		add			edx, ebx	; bWeight+=xGapWeight
		add			esi, edi	; bNear+=xGapNear
		mov			al, ch		; eax= minx+maxx+1

;		dec			cl
		sub			cl, 1
		jnz			x_loop_rc	; if(miny+maxy+1)

// scale result and store it ---------------------------------------------------------------------

		movd		eax, mm2
		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		movq		mm0, [ecx+eax*8]	; mm6= 0x7fff/w
		pmulhuw		mm0, mm2			; src*0x7fff/w
		packuswb	mm0, mm0			; mm0= 0000rgb?
		psrlq		mm0, 8				; mm6= 0rgb

// process next pixel ----------------------------------------------------------------------------

		pop			eax			; radius
		pop			ecx			; dst
		pop			esi			; src

		add			esi, 4		; src+=4
		movd		[ecx], mm0	; *dst= 0rgb
		add			ecx, 4		; dst+=4

		add			edi, 4		; xGapNear+=4
		add			ebx, 8		; xGapWeight+=8

		dec			eax			; radius--
		push		esi
		push		ecx
		push		eax
		jnz			right_corner

// restore stack ---------------------------------------------------------------------------------
// (drop the loop counter, dst and src that are still on the stack)

		pop			ecx
		pop			ecx
		pop			ecx

		pop			ebx
		pop			esi
		pop			edi
		emms
	}
}
//************************************************************************************************

/*
	Apply Blur to a row of pixels while sharpening edges
	MMX version
*/
void BlurSharpenLineMMX(DWORD *src, DWORD *dst, DWORD *bCenterOfWeight, const DWORD *pLogThresholdSB,
						BYTE diameter, BYTE diametery, DWORD minydiameter, DWORD minyImageWidthx4,
						DWORD xGapNear, DWORD in_width, MyFilterData *mfd, DWORD yOffset, BYTE *dummy1, BYTE *dummy2)
{
WORD	max=0x7fff-255;
const __int64	Ilum	= 0x004d0096001d0000i64, //((__int64)77<<48) | (150<<32) | (29<<16);
				I1		= 0x0000000100000001i64,
				I128	= 0x0080008000800080i64,
				Iff		= 0xffffffffffffffffi64,
				Istrength=((__int64)mfd->SharpenStrength2<<48)|((__int64)mfd->SharpenStrength2<<32)|((__int64)mfd->SharpenStrength2<<16)|mfd->SharpenStrength2,
				Imax	= ((__int64)max<<48)|((__int64)max<<32)|((__int64)max<<16)|max;

const __int64	*pWeight=(__int64 *)bCenterOfWeight-minydiameter;
const __int64	*pScaleWeight=(__int64	*)mfd->pScaleWeight;

const DWORD	radius=diameter>>1;
const BYTE	MaskWidth=(BYTE)radius+1,
			MaskHeight=diametery+1;
const DWORD MaskWidth_MaskHeight=((short)MaskWidth<<8) | MaskHeight,
			diameter_MaskHeight=(((short)diameter-1)<<8) | MaskHeight;
const DWORD	xGapWeight=(diameter-MaskWidth)<<3; // Weigth is (__int64 *)

	__asm
	{
		push		edi
		push		esi
		push		ebx

// main init -------------------------------------------------------------------------------------

		push		src
		push		dst
;		push		in_width

		mov			esi, src
		mov			edi, xGapNear
		mov			ebx, xGapWeight;
		movq		mm1, I1			; mm1= 1
		pxor		mm5, mm5


//******************//
// left column init ------------------------------------------------------------------------------
//******************//

		push		radius

align 16
	left_corner:

		pxor		mm1, mm1		; accumulator for unsharp mask

		pxor		mm0, mm0
		punpcklbw	mm0, [esi]
		psllq		mm0, 8			; mm0= rgb0

// compute threshold

		mov			edx, pLogThresholdSB
		movq		mm7, Ilum		; mm7= 77 150 29 0
		pmaddwd		mm7, mm0		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm6, mm7		; mm6= d3*d3 ?
		paddd		mm7, mm6		; mm7= d1*d1+d2*d2+d3*d3
		psrlq		mm7, 40			; mm7= luminance
		movd		eax, mm7		; eax= luminance
		movd		mm3, [edx+eax*4]; mm3= threshold

// x_loop init -----------------------------------------------------------------------------------

		sub			esi, minyImageWidthx4	; esi= bNear
		mov			ecx, MaskWidth_MaskHeight
		mov			edx, pWeight

		xor			eax, eax
		mov			al, ch

		pxor		mm2, mm2	; mm2= raccum gaccum baccum waccum
		packuswb	mm0, mm0	; mm0= ? rgb0

// go right

align 16
	x_loop_lc:

		movd		mm4, [esi]	; mm4= ?rgb
		pslld		mm4, 8		; mm4= 0 rgb0
		add			esi, 4		; go ahead 1 pixel

		movq		mm7, mm0	; mm7= src
		movq		mm6, mm4	; mm7= src
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0

		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		por			mm4, I1		; mm4= 0 rgb1
;		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001
		pmullw		mm4, [edx]	; mm7= r*w g*w b*w 1*w
		add			edx, 8		; go ahead 1 pixel

		paddusw		mm1, mm4
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)
		paddusw		mm2, mm4	; mm2= raccum gaccum baccum waccum

;		dec			eax
		sub			eax, 1
		jnz			x_loop_lc	; if(minx+maxx+1)

// go down

		add			edx, ebx	; bWeight+=xGapWeight
		add			esi, edi	; bNear+=xGapNear
		mov			al, ch		; eax= minx+maxx+1

;		dec			cl
		sub			cl, 1
		jnz			x_loop_lc	; if(miny+maxy+1)

// add current pixel and store result ------------------------------------------------------------

		// mask to choose SB or UM
		movq		mm4, I128	; mm4= w
		pcmpgtw		mm4, mm2	; mask= if(128>w) Use unsharp mask
		punpcklwd	mm4, mm4	; mm6= ? ? mask mask
		punpckldq	mm4, mm4	; mm6= mask mask mask mask

		// blurred pixel for SB
		movd		eax, mm2
		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		pmulhuw		mm2, [ecx+eax*8]	; (src*0x7fff/w)>>16

		// blurred pixel for UM
		movd		eax, mm1
;		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		pmulhuw		mm1, [ecx+eax*8]	; (src*0x7fff/w)>>16

		// sharpen using unsharp mask
		punpcklbw	mm0, mm5		; mm0= rgb0
		psubw		mm1, mm0		; mm2= src-blurred pixel=d1
		pmullw		mm1, Istrength	; mm2= d1*strength d1*strength d1*strength ?
		psraw		mm1, 3			; mm2= d1 d1 d1 ?
		psubw		mm0, mm1		; if(d1>0) mm0+=d1*strength
		paddsw		mm0, Imax		; saturate if>255
		psubusw		mm0, Imax		; saturate if<0

		// result
		pand		mm0, mm4	; if(128>w) apply UM
		pxor		mm4, Iff
		pand		mm2, mm4
		por			mm0, mm2	; else mm0= dst
		packuswb	mm0, mm0	; mm0= 00000rgb
		psrld		mm0, 8				; mm1= rgb?

// process next pixel ----------------------------------------------------------------------------

		mov			edx, pWeight
		sub			edx, 8		; pWeight-=8
		mov			pWeight, edx
		mov			eax, minyImageWidthx4
		add			eax, 4
		mov			minyImageWidthx4, eax
		mov			ecx, MaskWidth_MaskHeight
		add			ecx, 0x100
		mov			MaskWidth_MaskHeight, ecx

		pop			eax			; radius
		pop			ecx			; dst
		pop			esi			; src

		add			esi, 4		; src+=4
		movd		[ecx], mm0	; *dst= 0rgb
		add			ecx, 4		; dst+=4

		sub			edi, 4		; xGapNear-=4
		sub			ebx, 8		; xGapWeight-=8

		dec			eax			; radius--
		push		esi
		push		ecx
		push		eax
		jnz			left_corner

//******************//
// mid pixels ------------------------------------------------------------------------------------
//******************//

		pop			eax				; radius
		push		in_width

align 16
	main_init:

		pxor		mm1, mm1		; accumulator for unsharp mask

;		pxor		mm0, mm0
;		punpcklbw	mm0, [esi]
;		psllq		mm0, 8			; mm0= rgb0
		movd		mm0, [esi]		; mm0= 0rgb
		punpcklbw	mm0, mm5
		psllq		mm0, 16			; mm0= rgb0

// compute threshold

		mov			edx, pLogThresholdSB
		movq		mm7, Ilum		; mm7= 77 150 29 0
		pmaddwd		mm7, mm0		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm6, mm7		; mm6= d3*d3 ?
		paddd		mm7, mm6		; mm7= d1*d1+d2*d2+d3*d3
		psrlq		mm7, 40			; mm7= luminance
		movd		eax, mm7		; eax= luminance
		movd		mm3, [edx+eax*4]; mm3= threshold
		punpckldq	mm3, mm3		; duplicate the mask placed into the low DWORD

// x_loop init -------------------------------------------------------------------------------------

		sub			esi, minyImageWidthx4	; esi= bNear
		mov			ecx, diameter_MaskHeight
		mov			edx, pWeight
;		mov			ebx, xGapWeight

		xor			eax, eax
		mov			al, ch		; eax= minx+maxx

		pxor		mm2, mm2	; mm2= raccum gaccum baccum waccum
		packuswb	mm0, mm0	; mm0= ? rgb0
		punpckldq	mm0, mm0	; mm6= rgb0 rgb0

// process pixels into radius

align 16
	x_loop:

// process a pair of pixels

		movq		mm4, [esi]	; mm4= ?rgb ?rgb
		add			esi, 8		; go ahead a pair of pixels
		pslld		mm4, 8		; mm4= rgb0 rgb0

		movq		mm7, mm0	; mm7= src src
		movq		mm6, mm4	; mm6= rgb0 rgb0
;		movq		mm6, mm0	; mm6= src src
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0 |d1-d1| |d2-d2| |d3-d3| 0
;		pmaxub		mm7, mm4
;		pminub		mm6, mm4
;		psubb		mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0 |d1-d1| |d2-d2| |d3-d3| 0

;		movq		mm6, mm7
;		pminub		mm7, mm3
;		pcmpeqd		mm7, mm6	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold) for both pixels
		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold) for both pixels

		por			mm4, I1		; mm4= rgb1 rgb1
;		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)
		movq		mm3, mm7
		punpcklbw	mm7, mm7
		punpckhbw	mm3, mm3

		punpckhbw	mm6, mm4
		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001
		pmullw		mm4, [edx]	; mm6= r*w g*w b*w 1*w

		paddw		mm1, mm4
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)
		paddw		mm2, mm4	; mm2= raccum gaccum baccum waccum

		psrlw		mm6, 8		; mm6= 00rr 00gg 00bb 0001
		pmullw		mm6, [edx+8]; mm4= r*w g*w b*w 1*w
		add			edx, 16		; go ahead a pair of pixels

		paddw		mm1, mm6
		pand		mm6, mm3	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)
		paddw		mm2, mm6	; mm2= raccum gaccum baccum waccum

// check for jump

;		dec			eax
		sub			eax, 2
		jnz			x_loop		; if(minx+maxx+1)

// add last pixel in the row

		movd		mm4, [esi]	; mm4= 0 ?rgb
		add			esi, 4		; go ahead 1 pixel
		pslld		mm4, 8		; mm4= 0 rgb0

		movq		mm7, mm0	; mm7= src
		movq		mm6, mm4	; mm7= src
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0

		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)
		punpcklbw	mm7, mm7

		por			mm4, I1		; mm4= 0 rgb1
;		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001
		pmullw		mm4, [edx]	; mm7= r*w g*w b*w 1*w
		add			edx, 8		; go ahead 1 pixel

		paddusw		mm1, mm4
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)
		paddusw		mm2, mm4	; mm2= raccum gaccum baccum waccum

// go down

		add			esi, edi	; bNear+=xGapNear
;		add			edx, ebx	; bNear+=xGapWeight
		mov			al, ch		; eax= minx+maxx

;		dec			cl
		sub			cl, 1
		jnz			x_loop		; if(miny+maxy+1)

// scale result and store it ---------------------------------------------------------------------

		// mask to choose SB or UM
		movq		mm4, I128			; mm4= w
		pcmpgtw		mm4, mm2			; mask= if(128>w) Use unsharp mask
		punpcklwd	mm4, mm4			; mm4= ? ? mask mask
		punpckldq	mm4, mm4			; mm4= mask mask mask mask

		// blurred pixel for SB
		movd		eax, mm2
		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		pmulhuw		mm2, [ecx+eax*8]	; mm2= (src*0x7fff/w)>>16

		// blurred pixel for UM
		movd		eax, mm1
;		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		pmulhuw		mm1, [ecx+eax*8]	; mm1= (src*0x7fff/w)>>16

		// sharpen using unsharp mask
		punpcklbw	mm0, mm5			; mm0= rgb0
		psubw		mm1, mm0			; mm1= src-blurred pixel=d1
		pmullw		mm1, Istrength		; mm1= d1*strength d1*strength d1*strength ?
		psraw		mm1, 3				; mm1= d1 d1 d1 ?
		psubw		mm0, mm1			; if(d1>0) mm0+=d1*strength
		paddsw		mm0, Imax			; saturate if>255
		psubusw		mm0, Imax			; saturate if<0

		// result
		pand		mm0, mm4			; if(128>w) apply UM
		pxor		mm4, Iff
		pand		mm2, mm4
		por			mm0, mm2			; else mm0= dst
		packuswb	mm0, mm0			; mm0= 00000rgb
		psrld		mm0, 8				; mm1= rgb?
/*
		punpcklbw	mm0, mm5	; mm0= rgb0
		movq		mm3, mm0
		psubw		mm3, mm1	; mm3= src-blurred pixel=d1
		movq		mm7, mm3	; mm7= d1
		psubw		mm1, mm0	; mm1= blurred pixel-src=d2
		pcmpgtw		mm7, mm5	; mask= if(d1>0)
		pand		mm3, mm7	; if(d1>0)
		pxor		mm7, Iff	; mask= !mask
		pand		mm1, mm7	; if(d1<=0)
		pmullw		mm3, Istrength	;	mm3= d1*256 d1*256 d1*256 ?
		psrlw		mm3, 8			;	mm3= d1 d1 d1 ?
		pmullw		mm1, Istrength	;	mm1= d2*256 d2*256 d2*256 ?
		psrlw		mm1, 8			;	mm1= d2 d2 d2 ?
		paddusb		mm0, mm3		; if(d1>0) mm0+=d1
		psubusb		mm0, mm1		; else mm0-=d2
*/
// process next pixel ----------------------------------------------------------------------------

		pop			eax			; in_width
		pop			ecx			; dst
		pop			esi			; src

		add			esi, 4		; src+=4
		movd		[ecx], mm0	; *dst= 0rgb
		add			ecx, 4		; dst+=4

		dec			eax			; in_width--
		push		esi
		push		ecx
		push		eax
		jnz			main_init

//******************//
// right column init ------------------------------------------------------------------------------
//******************//

		pop			eax			; in_width
		push		radius

		add			edi, 4		; xGapNear+=4
		add			ebx, 8		; xGapWeight+=8

align 16
	right_corner:

		pxor		mm1, mm1		; accumulator for unsharp mask

		pxor		mm0, mm0
		punpcklbw	mm0, [esi]
		psllq		mm0, 8			; mm0= rgb0

// compute threshold

		mov			edx, pLogThresholdSB
		movq		mm7, Ilum		; mm7= 77 150 29 0
		pmaddwd		mm7, mm0		; mm7= r*77+g*150 b*29+0*0
		punpckldq	mm6, mm7		; mm6= d3*d3 ?
		paddd		mm7, mm6		; mm7= d1*d1+d2*d2+d3*d3
		psrlq		mm7, 40			; mm7= luminance
		movd		eax, mm7		; eax= luminance
		movd		mm3, [edx+eax*4]; mm3= threshold

// x_loop init -----------------------------------------------------------------------------------

		mov			ecx, MaskWidth_MaskHeight
		sub			ecx, 0x100
		mov			MaskWidth_MaskHeight, ecx

		sub			esi, minyImageWidthx4	; esi= bNear
		mov			ecx, MaskWidth_MaskHeight
		mov			edx, pWeight

		xor			eax, eax
		mov			al, ch

		pxor		mm2, mm2	; mm2= raccum gaccum baccum waccum
		packuswb	mm0, mm0	; mm0= ? rgb0

// go right

align 16
	x_loop_rc:

		movd		mm4, [esi]	; mm4= ?rgb
		pslld		mm4, 8		; mm4= 0 rgb0
		add			esi, 4		; go ahead 1 pixel

		movq		mm7, mm0	; mm7= src
		movq		mm6, mm4	; mm7= src
		psubusb		mm7, mm4
		psubusb		mm6, mm0
		por			mm7, mm6	; mm7= |d1-d1| |d2-d2| |d3-d3| 0

		psubusb		mm7, mm3
		pcmpeqd		mm7, mm5	; mask= if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		por			mm4, I1		; mm4= 0 rgb1
;		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)

		punpcklbw	mm4, mm5	; mm7= 00rr 00gg 00bb 0001
		pmullw		mm4, [edx]	; mm4= r*w g*w b*w 1*w
		add			edx, 8		; go ahead 1 pixel

		paddw		mm1, mm4
		pand		mm4, mm7	; if(|d1-d1|&|d2-d2|&|d3-d3|<=tThreshold)
		paddusw		mm2, mm4	; mm2= raccum gaccum baccum waccum

;		dec			eax
		sub			eax, 1
		jnz			x_loop_rc	; if(minx+maxx+1)

// go down

		add			edx, ebx	; bWeight+=xGapWeight
		add			esi, edi	; bNear+=xGapNear
		mov			al, ch		; eax= minx+maxx+1

;		dec			cl
		sub			cl, 1
		jnz			x_loop_rc	; if(miny+maxy+1)

// add current pixel and store result ------------------------------------------------------------

		// mask to choose SB or UM
		movq		mm4, I128	; mm4= w
		pcmpgtw		mm4, mm2	; mask= if(128>w) Use unsharp mask
		punpcklwd	mm4, mm4	; mm6= ? ? mask mask
		punpckldq	mm4, mm4	; mm6= mask mask mask mask

		// blurred pixel for SB
		movd		eax, mm2
		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		pmulhuw		mm2, [ecx+eax*8]	; (src*0x7fff/w)>>16

		// blurred pixel for UM
		movd		eax, mm1
;		mov			ecx, pScaleWeight
		and			eax, 0xffff			; eax= w
		pmulhuw		mm1, [ecx+eax*8]	; (src*0x7fff/w)>>16

		// sharpen using unsharp mask
		punpcklbw	mm0, mm5		; mm0= rgb0
		psubw		mm1, mm0		; mm2= src-blurred pixel=d1
		pmullw		mm1, Istrength	; mm2= d1*strength d1*strength d1*strength ?
		psraw		mm1, 3			; mm2= d1 d1 d1 ?
		psubw		mm0, mm1		; if(d1>0) mm0+=d1*strength
		paddsw		mm0, Imax		; saturate if>255
		psubusw		mm0, Imax		; saturate if<0

		// result
		pand		mm0, mm4	; if(128>w) apply UM
		pxor		mm4, Iff
		pand		mm2, mm4
		por			mm0, mm2	; else mm0= dst
		packuswb	mm0, mm0	; mm0= 00000rgb
		psrld		mm0, 8				; mm1= rgb?

// process next pixel ----------------------------------------------------------------------------

		pop			eax			; radius
		pop			ecx			; dst
		pop			esi			; src

		add			esi, 4		; src+=4
		movd		[ecx], mm0	; *dst= 0rgb
		add			ecx, 4		; dst+=4

		add			edi, 4		; xGapNear+=4
		add			ebx, 8		; xGapWeight+=8

		dec			eax			; radius--
		push		esi
		push		ecx
		push		eax
		jnz			right_corner

// restore stack ---------------------------------------------------------------------------------

		pop			ecx
		pop			ecx
		pop			ecx

		pop			ebx
		pop			esi
		pop			edi
		emms
	}
}
