/*
High Quality Smart Smoother Filter for VirtualDub -- performs structure
retaining noise reduction/smoothing.
Copyright (C) 1999-2000 Donald A. Graft
Copyright (C) 2001-2002 Klaus Post

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation.
	
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
		
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
			
The author can be contacted at:
Klaus Post
kp@interact.dk
*/

#include <windows.h>
#include <math.h>
#include <commctrl.h>
#include <stdio.h>
#include <crtdbg.h>

#include "ScriptInterpreter.h"
#include "ScriptError.h"
#include "ScriptValue.h"

#include "resource.h"
#include "filter.h"
#define WEIGHTMAX 65535					// Full weight in the plain code's fixed-point sums (the centre pixel gets WEIGHTMAX minus the neighbours' weights). Do not change - the plain code relies on this value.
#define WEIGHTBITS 16						// Number of fraction bits shifted out when the plain code writes a blended pixel. Do not change - must match WEIGHTMAX.

//#define PRECISE_AVERAGE 1    // Enable for rounded (instead of truncated) results in the plain code. No effect on the MMX code.

// Clear any earlier definitions so the settings below are the only ones in effect.
#undef USE_MMX
#undef ATHLON

// MMX inline assembler settings
#define USE_MMX                              // When defined, RunProc hands non-interlaced frames to the MMX routines; when undefined, the plain integer code is always used.
#define ATHLON 															// When defined, the assembler uses prefetch/prefetchw; when undefined, it uses prefetchnta instead.


// MMX frame processors, defined outside this section. When USE_MMX is defined and
// the interlaced flag is off, RunProc calls RunProcMMX for blend mode 1 and
// RunProcMMXAvg otherwise.
int RunProcMMX(const FilterActivation *fa, const FilterFunctions *ff);
int RunProcMMXAvg(const FilterActivation *fa, const FilterFunctions *ff);

///////////////////////////////////////////////////////////////////////////

// Filter callbacks. Except for ScriptConfig (registered through func_defs),
// these are installed in the filterDef_tutorial definition below.
int RunProc(const FilterActivation *fa, const FilterFunctions *ff);
int StartProc(FilterActivation *fa, const FilterFunctions *ff);
int EndProc(FilterActivation *fa, const FilterFunctions *ff);
long ParamProc(FilterActivation *fa, const FilterFunctions *ff);
int InitProc(FilterActivation *fa, const FilterFunctions *ff);
int ConfigProc(FilterActivation *fa, const FilterFunctions *ff, HWND hwnd);
void StringProc(const FilterActivation *fa, const FilterFunctions *ff, char *str);
void ScriptConfig(IScriptInterpreter *isi, void *lpVoid, CScriptValue *argv, int argc);
bool FssProc(FilterActivation *fa, const FilterFunctions *ff, char *buf, int buflen);

///////////////////////////////////////////////////////////////////////////
// Per-pixel accumulator used by the plain (non-MMX) code in RunProc.
// The member order matters: RunProc initialises these records by writing four
// consecutive ints (red, green, blue, divisor) through an int pointer.
typedef struct RGBDiv {
	// Accumulated red contribution.
	int r;
	// Accumulated green contribution.
	int g;
	// Accumulated blue contribution.
	int b;
	// Accumulated divisor: the sum of the weights (or the pixel count) added so far.
	int div;
} RGBDiv;
// Per-pixel accumulator for the MMX code, stored as two 64-bit members for proper 8-byte alignment.
// The assembler addresses it by byte offset (+0, +8 and +12), so the layout must not change.
typedef struct RGBDivMMX {
	// Per the register comments in MMX_asm_blend_ATH: green sum in the upper dword, red sum in the lower dword.
	__int64 rg;
	// Per the same comments: weight sum in the upper dword (byte offset 12), blue sum in the lower dword.
	__int64 bw;
} RGBDivMMX;

// Single 64-bit per-pixel record; buildtables allocates one per pixel plus one spare.
// NOTE(review): the code that reads and writes it is outside this section - confirm the packing there.
typedef struct RGBDivMMX64 {
	__int64 rg;
} RGBDivMMX64;

// Wrapper around one 64-bit value.
// NOTE(review): presumably the packed threshold handed to the MMX code (which takes an __int64 *T) - confirm against the caller.
typedef struct T_val {
  __int64 T;
} T_val;

// Per-instance filter state: the user settings plus the tables and work buffers
// that buildtables allocates and EndProc releases.
typedef struct MyFilterData {
	// Preview interface pointer. NOTE(review): not used in this section - presumably set by ConfigProc.
	IFilterPreview		*ifp;
	// Weight scale; buildtables multiplies the weights by window_scale/256.
	int 				window_scale;
	// Window diameter in pixels; the window radius is diameter>>1.
	int 				diameter;
	// Per-channel difference limit below which a neighbouring pixel is blended.
	int					threshold;
	// 0 or 1. When set, RunProc always uses the plain code and writes a grey image derived from the divisor.
	int					interlaced;
	// 0 = average the passing pixels, 1 = distance-weighted blend.
	int         blendmode;
	// 0 or 1. When set, pixels are compared using a weighted sum of the channel differences.
	int					testgrey;
	// Non-zero: additionally weight each neighbour through difftable by its colour difference.
	int					diffweight;
	// Packed snapshot of the settings the tables were last built for (set to -1 by EndProc).
	int					tabletests;
	// Differences below mdiff*3 receive the full difftable weight of 256.
	int					mdiff;
	// Integer distance weights, scaled from dweight by buildtables.
  int  *weight;
	// Per-pixel accumulators for the plain code (one per source pixel).
	RGBDiv			*data;
	// Per-pixel accumulators for the MMX code (one per source pixel).
	RGBDivMMX			*mmxdata;
	// Per-pixel 64-bit records (one per source pixel plus one). NOTE(review): user not visible in this section.
	RGBDivMMX64			*mmxdata2;
	// 768-entry table mapping a summed colour difference to a weight of 1..256.
	int		 *difftable;
	// Unscaled floating-point distance weights (1/distance).
  double      *dweight;
} MyFilterData;


// Writes the current settings into buf as a script call of the form
// "Config(diameter, threshold, interlaced, window_scale, blendmode, testgrey,
// diffweight, mdiff)" - the same argument order ScriptConfig reads.
//
// fa     - filter activation whose filter_data holds the MyFilterData settings.
// ff     - unused.
// buf    - destination buffer.
// buflen - size of buf in bytes.
//
// Always returns true.
bool FssProc(FilterActivation *fa, const FilterFunctions *ff, char *buf, int buflen) {
	MyFilterData *mfd = (MyFilterData *)fa->filter_data;
	
	_snprintf(buf, buflen, "Config(%d, %d, %d, %d, %d, %d, %d, %d)",
		mfd->diameter,
		mfd->threshold,
		mfd->interlaced,
		mfd->window_scale,
		mfd->blendmode,		
		mfd->testgrey,
		mfd->diffweight,
		mfd->mdiff);

	// _snprintf does not append a terminator when the formatted text fills or
	// exceeds the buffer, so terminate explicitly to keep buf a valid string.
	if (buflen > 0) {
		buf[buflen - 1] = '\0';
	}
	
	return true;
}

// Script entry point for "Config": copies the eight integer arguments into the
// filter's settings. The argument order matches the string written by FssProc.
//
// isi    - script interpreter (unused).
// lpVoid - the FilterActivation of the filter being configured.
// argv   - script arguments; indices 0..7 are read.
// argc   - argument count (unused).
void ScriptConfig(IScriptInterpreter *isi, void *lpVoid, CScriptValue *argv, int argc) {
	FilterActivation *activation = (FilterActivation *)lpVoid;
	MyFilterData *settings = (MyFilterData *)activation->filter_data;

	settings->diameter = argv[0].asInt();
	settings->threshold = argv[1].asInt();
	// The two flags are normalised to 0 or 1.
	settings->interlaced = (argv[2].asInt() != 0) ? 1 : 0;
	settings->window_scale = argv[3].asInt();
	settings->blendmode = argv[4].asInt();
	settings->testgrey = (argv[5].asInt() != 0) ? 1 : 0;
	settings->diffweight = argv[6].asInt();
	settings->mdiff = argv[7].asInt();
}

// Script functions offered by this filter: a single "Config" entry handled by
// ScriptConfig, followed by the NULL entry that ends the list.
ScriptFunctionDef func_defs[] = {
	{
		(ScriptFunctionPtr)ScriptConfig,
		"Config",
		"0iiiiiiii"
	},
	{ NULL },
};

// Script object referenced by the filter definition; it carries the function
// list above and leaves the first member NULL.
CScriptObject script_obj = {
	NULL,
	func_defs
};

// Filter definition for this filter: display strings, instance-data size and the
// callback table. The initialiser is positional, so the order of the entries
// below must not change.
struct FilterDefinition filterDef_tutorial = {
	
	NULL, NULL, NULL,		// next, prev, module
	// name - debug builds are marked as such
#ifdef _DEBUG
		"smart smoother HiQuality (2.11 [DEBUG])",	// name
#else
		"smart smoother HiQuality (2.11)",	// name
#endif
	// desc - exactly one of the three strings below is compiled in, depending on
	// which optimisation macros are defined
#ifdef USE_MMX
  #ifdef ATHLON
		"Smooth/blur while preserving structure. Quality improved.\nBased on smart smart smoother by Donald Graft\n\n[MMX, MMX+, Athlon optimized]",
  #else
		"Smooth/blur while preserving structure. Quality improved.\nBased on smart smart smoother by Donald Graft\n\n[MMX, MMX+ optimized]",
  #endif
#else
    "Smooth/blur while preserving structure. Quality improved.\nBased on smart smart smoother by Donald Graft\n",
#endif

		// (end of desc)
		"Klaus Post", 		// maker
		NULL,					// private_data
		sizeof(MyFilterData),	// inst_data_size
		
		InitProc,				// initProc
		NULL,					// deinitProc
		RunProc,				// runProc
		ParamProc,				// paramProc
		ConfigProc, 			// configProc
		StringProc, 			// stringProc
		StartProc,				// startProc
		EndProc,				// endProc
		
		&script_obj,			// script_obj
		FssProc,				// fssProc
		
};

//  A window is constructed around each pixel:
//        -----
//        -----           - = Already processed pixels. Their values are already blended into the current pixel (in xxxxsum)
//        --P**           P = Current pixel
//        %%%++           * = Remaining pixels on the same line as the current pixel
//        %%%++           + = Pixels on the lines below, right of the current pixel
//                        % = Pixels on the lines below, from the current pixel's column leftwards

//  Pipe:
//  1) * pixels are checked.
//  2) First line of % pixels are checked. 
//  3) First line of + pixels are checked.

//  Basic pass theory:
//  If any pixel in a row doesn't pass the threshold, the rest of the row isn't checked

//	Precision: No bits should be lost, because accumulated figures are kept as 16.16 fixed point (16 fraction bits), which should be more than enough.
//						 A half fraction (1/512th pixel) will be lost (rounded down) in the weighted average. Define PRECISE_AVERAGE to avoid this (silly though).

//	NOTE: The divisor may not be bigger than 65536 - that makes a 256x256 matrix the largest theoretically possible, but that would lead to massive rounding errors.



// Builds the lookup tables for the current settings and, when necessary,
// (re)allocates the tables and the per-pixel work buffers.
//
// The buffers are reallocated when the diameter differs from the one recorded
// in mfd->tabletests, or when any buffer is missing (as after EndProc).
//
// Weight table layout (weight / dweight):
//   [0 .. radius-1]   pixels to the right of the current pixel on its own row,
//                     weight 1/distance
//   then 'radius' rows of (2*radius+1) entries for the rows below, weight
//                     1/sqrt(h*h + w*w)
// The integer weights are scaled so that the full symmetric window sums to
// window_scale/256 of WEIGHTMAX (or of 32767 for the non-interlaced MMX path).
//
// difftable maps a summed colour difference (0..767) to a weight of 1..256:
// differences below mdiff*3 get 256, after which the weight falls linearly;
// entries that would fall outside 1..255 are stored as 256.
//
// mfd - filter state; tables, buffers and tabletests are updated.
// fa  - activation; only the source width and height are read.
//
// Returns 0 on success and 1 on failure (negative diameter or failed allocation).
int buildtables(MyFilterData *mfd,const FilterActivation *fa) {
	const int framesize=mfd->diameter;
	const int xtraframe=framesize>>1;   // Window radius
	if (xtraframe<0) {
		return 1;                         // A negative diameter cannot be turned into a table size
	}
	// A row below the current pixel spans 2*radius+1 pixels. Sizing the table
	// with this (rather than the diameter) keeps the fill loop in bounds when
	// the diameter is even; for odd diameters the two are equal.
	const int rowlen=2*xtraframe+1;
	const int tablesize=xtraframe+xtraframe*rowlen;
	int i;

	const bool missing=!(mfd->weight&&mfd->dweight&&mfd->difftable&&mfd->data&&mfd->mmxdata&&mfd->mmxdata2);
	if (missing || (mfd->tabletests&0xf)!=(mfd->diameter)) {
		delete[] mfd->weight;    mfd->weight=NULL;
		delete[] mfd->dweight;   mfd->dweight=NULL;
		delete[] mfd->difftable; mfd->difftable=NULL;
		delete[] mfd->data;      mfd->data=NULL;
		delete[] mfd->mmxdata;   mfd->mmxdata=NULL;
		delete[] mfd->mmxdata2;  mfd->mmxdata2=NULL;
		// Allocate weight and difftable back to back - this increases the
		// probability that they end up close together in memory (better cache use).
		mfd->weight=new int[tablesize];
		mfd->difftable=new int[256*3];
		mfd->data=new RGBDiv[fa->src.w*fa->src.h];
		mfd->mmxdata=new RGBDivMMX[fa->src.w*fa->src.h];
		mfd->mmxdata2=new RGBDivMMX64[fa->src.w*fa->src.h+1];
		mfd->dweight=new double[tablesize];
	}
	if (!(mfd->weight&&mfd->dweight&&mfd->data&&mfd->difftable&&mfd->mmxdata&&mfd->mmxdata2)) {
		_ASSERT(FALSE);
		return 1;
	}
	// Remember which settings the tables were built for (checked by RunProc).
	mfd->tabletests=(mfd->diameter)|((mfd->window_scale)<<8)|(mfd->blendmode<<17)|(mfd->threshold<<24)|(mfd->mdiff<<4)|(mfd->interlaced<<23);

	// Fill '*' area: same row, right of the current pixel
	for (i=0;i<xtraframe;i++) {
		mfd->dweight[i]=1.0/((double)i+1.0);
	}
	// Fill the rows below the current pixel
	for (int h=1;h<=xtraframe;h++) {
		for (int w=-xtraframe;w<=xtraframe;w++) {
			_ASSERT(i<tablesize);
			mfd->dweight[i++]=1.0/sqrt((double)(h*h+w*w));
		}
	}
	_ASSERT(i==tablesize);

	// The table covers only half of the pixels that influence the current one;
	// the window is symmetrical, so the full sum is twice the table's sum.
	double matrixsum=0;
	for (i=0;i<tablesize;i++) {
		matrixsum+=mfd->dweight[i];
	}
	matrixsum*=2.0;

	// Scale the weights so that the whole window sums to the chosen maximum.
	double scale;
#ifdef USE_MMX
	if (!mfd->interlaced) {
		scale=((double)mfd->window_scale/256.0)*((double)32767/matrixsum);   // Force size to 15 bits
	} else {
		scale=((double)mfd->window_scale/256.0)*((double)WEIGHTMAX/matrixsum);
	}
#else
	scale=((double)mfd->window_scale/256.0)*((double)WEIGHTMAX/matrixsum);
#endif
	for (i=0;i<tablesize;i++) {
		mfd->weight[i]=(int)(mfd->dweight[i]*scale);   // Round down to ensure overflow does not occur
	}

	// Build the diff-weight table. When threshold does not exceed mdiff there is
	// no range to ramp over; a zero step keeps every entry at 256 and avoids
	// dividing by zero when the two are equal.
	const int ramp=mfd->threshold-mfd->mdiff;
	const double stepsub=(ramp>0) ? 256.0/((double)ramp*3.0) : 0.0;
	double basestep=256.0;
	for (i=0;i<768;i++) {
		if ((mfd->mdiff*3)>i) {
			mfd->difftable[i]=256;
		} else {
			const int val=(int)(basestep+0.5);
			mfd->difftable[i]=(val<1 || val>255) ? 256 : val;
			basestep-=stepsub;
		}
	}

	return 0;
}


// Start callback: builds the tables and work buffers for the current settings.
// Returns the result of buildtables (0 on success, non-zero on failure).
int StartProc(FilterActivation *fa, const FilterFunctions *ff) {
	return buildtables((MyFilterData *)fa->filter_data, fa);
}

// End callback: releases every table and work buffer, clears the pointers and
// marks the recorded table settings as invalid. Always returns 0.
int EndProc(FilterActivation *fa, const FilterFunctions *ff) {
	MyFilterData *const state = (MyFilterData *)fa->filter_data;

	// delete[] on a null pointer does nothing, so no per-pointer checks are needed.
	delete[] state->weight;
	state->weight = NULL;

	delete[] state->dweight;
	state->dweight = NULL;

	delete[] state->difftable;
	state->difftable = NULL;

	delete[] state->data;
	state->data = NULL;

	delete[] state->mmxdata;
	state->mmxdata = NULL;

	delete[] state->mmxdata2;
	state->mmxdata2 = NULL;

	state->tabletests = -1;
	return 0;
}

// Parameter callback: gives the destination the same offset as the source and
// returns 0, which requests in-place operation.
long ParamProc(FilterActivation *fa, const FilterFunctions *ff)
{
	const long run_in_place = 0;

	fa->dst.offset = fa->src.offset;
	return run_in_place;
}


int RunProc(const FilterActivation *fa, const FilterFunctions *ff) {
  MyFilterData *mfd = (MyFilterData *)fa->filter_data;
  const long		pitch = (fa->src.pitch)>>2;
  const PixDim	w = fa->src.w;
  const PixDim	h = fa->src.h;
  Pixel32 *src, *dst;
	// Check if tables are up-to-date
	if (mfd->tabletests!=((mfd->diameter)|((mfd->window_scale)<<8)|(mfd->blendmode<<17)|(mfd->threshold<<24)|(mfd->mdiff<<4)|(mfd->interlaced<<23))) {
		if (buildtables(mfd,fa)) {
      _ASSERT(FALSE);
			return 1;
		}
	}
  int *weight=mfd->weight;
	int *difftable=mfd->difftable;
  int testgrey=mfd->testgrey;
  int blurpixels;
	int diffweight=mfd->diffweight;
	if (mfd->blendmode==0) {
		blurpixels=FALSE;
	} else if (mfd->blendmode==1) {
		blurpixels=TRUE;
	}
#ifdef USE_MMX
	if (!mfd->interlaced) {
		if (blurpixels) return RunProcMMX(fa,ff);
		return RunProcMMXAvg(fa,ff);
	}
#endif


  if (!(mfd->weight&&mfd->dweight&&mfd->data)) return 1;
  src = fa->src.data;
  dst=src;
	RGBDiv *data = mfd->data;
	if (blurpixels) {
		memset(data, 0, w*h*sizeof(RGBDiv));
	} else {		
		for (int yi=0;yi<h;yi++) {
			Pixel32 *tmpp=(Pixel32*)&src[pitch*yi];
			int *tlu=(int *)&data[w*yi];
			for (int xi=0;xi<w;xi++) {
				int cpix=*(tmpp+xi);  
				if (diffweight) {
					*tlu++=(cpix&0xff)<<8;
					*tlu++=(cpix&0xff00);
					*tlu++=(cpix&0xff0000)>>8;
					*tlu++=256;
				} else {
					*tlu++=(cpix&0xff);
					*tlu++=(cpix&0xff00)>>8;
					*tlu++=(cpix&0xff0000)>>16;
					*tlu++=1;
				}
			}
		}
	}
	
  int framesize=mfd->diameter;
  int xtraframe=framesize>>1;   // Frame left of pixel
  int pws=(xtraframe*framesize+xtraframe);
  
  int greythresh=mfd->threshold*3;
	int threshold=mfd->threshold;
  int pixcheck=w*h;  // Not used in release (only on assertions)
	
	
  for (int y=0; y<h;y++) {
    for (int x=0; x<w;x++) {
      // test rest of row
      int offset=y*pitch;  // To be used when referring to src or dst
			int unpoffset=y*w;   // Unpitched offset. To be used for redsum, greensum,  bluesum & divisor
      int xtest=x+xtraframe+1;
      if (xtest>=w) {  // last pixel (not included)
        xtest=w-1;
      }
			//			unsigned char *pixp=(unsigned char*)&src[(offset+x)];
			Pixel32 *npixp=(Pixel32*)&src[(offset+x)];
      int tred;	//  test pixel red (P)
      int tgreen; // test pixel green
      int tblue; // test pixel blue
			Pixel32 cpix=*npixp;
			tred=cpix&0xff;	//  test pixel red (P) 
			tgreen=(cpix&0xff00)>>8;
			tblue=(cpix&0xff0000)>>16;
			
			
      int cred=data[unpoffset+x].r;   //color already been added to this pixel (red)
      int cgreen=data[unpoffset+x].g;//  ^ (green)
      int cblue=data[unpoffset+x].b;  //  ^ (blue)
      int cdiv=data[unpoffset+x].div;   // Current divisor (for !blendpixels)
			npixp++;  // First testpixel (Right of current pixel)
			
      for (int tx=x+1;tx<xtest;tx++) {
        int pixelpasses=0;		// does this pixel pass the threshold?
				int delta;
				Pixel32 testpix=*npixp;
        if (!testgrey) {
          int deltar=abs((testpix&0xff) - tred);
          int deltag=abs(((testpix&0xff00)>>8) - tgreen);
          int deltab=abs(((testpix&0xff0000)>>16) - tblue);
					if ((deltar<threshold) && (deltag<threshold) && (deltab<threshold) ) {
						pixelpasses=1;
						if (diffweight) delta=deltar+deltag+deltab;
					}
        } else {
          delta=(55*abs((testpix&0xff) - tred)+ 549*abs(((testpix&0xff00)>>8) - tgreen)+ 163*abs(((testpix&0xff0000)>>16) - tblue))>>8;  // Optimize to only use one shift down
          if (delta<greythresh) {
            pixelpasses=1;
          }
        }// end if testcoloronly
        if (pixelpasses) {
          int exactoffset=unpoffset+tx;  // offset of testpixel  (unpitched)
					if (blurpixels) {
						int cweight;
						if (diffweight) {
							cweight=(weight[tx-x-1]*difftable[delta])>>8;  // Weight of this pixel (0 to 65536) of this pixel compared to the other.
						} else {
							cweight=weight[tx-x-1];  // Weight of this pixel (0 to 65536) of this pixel compared to the other.
						}
						// Add into this pixel
						cred+=(testpix&0xff)*cweight;
						cgreen+=((testpix&0xff00)>>8)*cweight;
						cblue+=((testpix&0xff0000)>>16)*cweight;
						cdiv+=cweight;
						
            // Add into other pixel
            data[exactoffset].r+=(tred*cweight);
            data[exactoffset].g+=(tgreen*cweight);
            data[exactoffset].b+=(tblue*cweight);
            data[exactoffset].div+=cweight;

          } else {
						if (diffweight) {
							int cweight=difftable[delta];
							// Add into this pixel						
							cred+=(testpix&0xff)*cweight;
							cgreen+=((testpix&0xff00)>>8)*cweight;
							cblue+=((testpix&0xff0000)>>16)*cweight;
							cdiv+=cweight;
							// Add into other pixel
							data[exactoffset].r+=tred*cweight;
							data[exactoffset].g+=tgreen*cweight;
							data[exactoffset].b+=tblue*cweight;
							data[exactoffset].div+=cweight;
						} else {
							// Add into this pixel						
							cred+=(testpix&0xff);
							cgreen+=((testpix&0xff00)>>8);
							cblue+=((testpix&0xff0000)>>16);
							cdiv++;
							// Add into other pixel
							data[exactoffset].r+=tred;
							data[exactoffset].g+=tgreen;
							data[exactoffset].b+=tblue;
							data[exactoffset].div+=1;
            } 
          }          
					npixp++;
        } else { // pixel does not pass
          tx=xtest;  // Jump out of loop
        } // End if pixelpasses
      }// end for tx
      
      // Proceed further down!
      int endy=xtraframe;
      if (y+endy+1>h) endy=h-y-1;
      for (int tdown=0;tdown<endy;tdown++) {
        offset+=pitch;  // Offset the test pixel down a line
        unpoffset+=w;  // Offset the test pixel down a line
        int downoffset=xtraframe+tdown*framesize;  // Calc offset in frame
        int startx=x-xtraframe;  // 
        if (startx<0) {
          startx=0;
        }
        int pixelpasses=1;		// does this pixel pass the threshold?
        // CENTER TO LEFT HERE  (% area)  (includes x)
				npixp=((Pixel32*)&src[offset+x]);
        for (int tx=x;tx>=startx&&pixelpasses;tx--) {
					int delta;
					pixelpasses=0;
					Pixel32 testpix=*npixp;
					if (!testgrey) {
						int deltar=abs((testpix&0xff) - tred);
						int deltag=abs(((testpix&0xff00)>>8) - tgreen);
						int deltab=abs(((testpix&0xff0000)>>16) - tblue);
						if ((deltar<threshold) && (deltag<threshold) && (deltab<threshold) ) {
							pixelpasses=1;
							if (diffweight) delta=deltar+deltag+deltab;
						}
					} else {
						delta=(55*abs((testpix&0xff) - tred)+ 549*abs(((testpix&0xff00)>>8) - tgreen)+ 163*abs(((testpix&0xff0000)>>16) - tblue))>>8;  // Optimize to only use one shift down
						if (delta<greythresh) {
							pixelpasses=1;
						}
					}// end if testcoloronly
					if (pixelpasses) {
						npixp--;
						int exactoffset=unpoffset+tx;  // offset of testpixel  (unpitched)
						if (blurpixels) {
							int cweight;
							if (diffweight) {
								cweight=(weight[downoffset+(tx-startx)]*difftable[delta])>>8;  // Weight of this pixel (0 to 65536) of this pixel compared to the other.
							} else {
								cweight=weight[downoffset+(tx-startx)];  // Weight of this pixel (0 to 65536) of this pixel compared to the other.
							}
							// Add into this pixel
							cred+=(testpix&0xff)*cweight;
							cgreen+=((testpix&0xff00)>>8)*cweight;
							cblue+=((testpix&0xff0000)>>16)*cweight;
							cdiv+=cweight;
							
							// Add into other pixel
							data[exactoffset].r+=(tred*cweight);
							data[exactoffset].g+=(tgreen*cweight);
							data[exactoffset].b+=(tblue*cweight);
							data[exactoffset].div+=cweight;
							
						} else {
							if (diffweight) {
								int cweight=difftable[delta];
								// Add into this pixel						
								cred+=(testpix&0xff)*cweight;
								cgreen+=((testpix&0xff00)>>8)*cweight;
								cblue+=((testpix&0xff0000)>>16)*cweight;
								cdiv+=cweight;
								// Add into other pixel
								data[exactoffset].r+=tred*cweight;
								data[exactoffset].g+=tgreen*cweight;
								data[exactoffset].b+=tblue*cweight;
								data[exactoffset].div+=cweight;
							} else {
								// Add into this pixel						
								cred+=(testpix&0xff);
								cgreen+=((testpix&0xff00)>>8);
								cblue+=((testpix&0xff0000)>>16);
								cdiv++;
								// Add into other pixel
								data[exactoffset].r+=tred;
								data[exactoffset].g+=tgreen;
								data[exactoffset].b+=tblue;
								data[exactoffset].div+=1;
							} 
						}          
            // write to passmap
          } else  { // pixel does not pass
						if (tx==x) endy=tdown;  // Stop all further checking downwards if doesn't pass pixel on axis
          } // End pixelpass (pixel check)
        } // End for tx
        
        // CENTER TO RIGHT SIDE HERE  (+ area)
        startx=x+1;  //
        if (startx>=w) {
          startx=w-1;
        }
        
        pixelpasses=1;		// does this pixel pass the threshold?
				npixp=((Pixel32*)&src[offset+startx]);
        
        for (tx=startx;tx<xtest&&pixelpasses;tx++) {
					int delta;
					pixelpasses=0;
					Pixel32 testpix=*npixp;
					//						testp=(unsigned char*)&src[offset+tx];
					//            tpix=src[tx+offset];	// The pixel to check against
					if (!testgrey) {
						int deltar=abs((testpix&0xff) - tred);
						int deltag=abs(((testpix&0xff00)>>8) - tgreen);
						int deltab=abs(((testpix&0xff0000)>>16) - tblue);
						if ((deltar<threshold) && (deltag<threshold) && (deltab<threshold) ) {
							pixelpasses=1;
							if (diffweight) delta=deltar+deltag+deltab;
						}
					} else {
						delta=(55*abs((testpix&0xff) - tred)+ 549*abs(((testpix&0xff00)>>8) - tgreen)+ 163*abs(((testpix&0xff0000)>>16) - tblue))>>8;  
						if (delta<greythresh) {
							pixelpasses=1;
						}
					}// end if testcoloronly  
					if (pixelpasses) {
						npixp++;
						int exactoffset=unpoffset+tx;  // offset of testpixel  (unpitched)
						if (blurpixels) {
							int cweight;
								if (diffweight) {
									cweight=(weight[downoffset+(tx-startx)]*difftable[delta])>>8;  // Weight of this pixel (0 to 65536) of this pixel compared to the other.
								} else {
									cweight=weight[downoffset+(tx-startx)];  // Weight of this pixel (0 to 65536) of this pixel compared to the other.
								}
							// Add into this pixel
							cred+=(testpix&0xff)*cweight;
							cgreen+=((testpix&0xff00)>>8)*cweight;
							cblue+=((testpix&0xff0000)>>16)*cweight;
							cdiv+=cweight;
							
							// Add into other pixel
							data[exactoffset].r+=(tred*cweight);
							data[exactoffset].g+=(tgreen*cweight);
							data[exactoffset].b+=(tblue*cweight);
							data[exactoffset].div+=cweight;
							
						} else {
							if (diffweight) {
								int cweight=difftable[delta];
								// Add into this pixel						
								cred+=(testpix&0xff)*cweight;
								cgreen+=((testpix&0xff00)>>8)*cweight;
								cblue+=((testpix&0xff0000)>>16)*cweight;
								cdiv+=cweight;
								// Add into other pixel
								data[exactoffset].r+=tred*cweight;
								data[exactoffset].g+=tgreen*cweight;
								data[exactoffset].b+=tblue*cweight;
								data[exactoffset].div+=cweight;
							} else {
								// Add into this pixel						
								cred+=(testpix&0xff);
								cgreen+=((testpix&0xff00)>>8);
								cblue+=((testpix&0xff0000)>>16);
								cdiv++;
								// Add into other pixel
								data[exactoffset].r+=tred;
								data[exactoffset].g+=tgreen;
								data[exactoffset].b+=tblue;
								data[exactoffset].div+=1;
							} // end if !bluepixels
						}  // end if pixelpasses
          }
        } // End for tx
      } // End for tdown
			// Plot final pixel
			if(blurpixels) { 
				int invcdiv=(WEIGHTMAX)-cdiv;
				if (!mfd->interlaced) {
					cred+=tred*invcdiv;
					cgreen+=tgreen*invcdiv;
					cblue+=tblue*invcdiv;
				} else {
					cred=cgreen=cblue=cdiv<<8;
				}
#ifdef PRECISE_AVERAGE 
				dst[y*pitch+x]=(((cred+(1<<(WEIGHTBITS-1)))>>WEIGHTBITS)&0xff) | (((cgreen+(1<<(WEIGHTBITS-1)))>>(WEIGHTBITS-8))&0xff00) | (((cblue+(WEIGHTBITS-1)))&0xff0000);
#else
				dst[y*pitch+x]=(cred>>WEIGHTBITS) | ((cgreen>>(WEIGHTBITS-8))&0xff00) | ((cblue)&0xff0000);
#endif				
      } else {
				if (mfd->interlaced) {
					int dciv=(mfd->diameter*mfd->diameter);
					if (diffweight) dciv<<=8;
					int de_pix=((cdiv<<8)/dciv)-1;
          dst[y*pitch+x]=(de_pix | (de_pix<<8) | (de_pix<<16));
				} else {
        switch (cdiv) {
#ifdef PRECISE_AVERAGE 
        case 1:
          dst[y*pitch+x]=(cred | (cgreen<<8) | (cblue<<16));
          break;
        case 2:
          dst[y*pitch+x]=(cred+1)>>1 |
            (((cgreen+1)>>1)<<8) |
            (((cblue+1)>>1)<<16);                    
          break;
        case 4:
          dst[y*pitch+x]=(cred+2)>>2 |
            (((cgreen+2)>>2)<<8) |
            (((cblue+2)>>2)<<16);                    
          break;
        case 8:
          dst[y*pitch+x]=(cred+4)>>3 |
            (((cgreen+4)>>3)<<8) |
            (((cblue+4)>>3)<<16);                    
          break;
        case 16:
          dst[y*pitch+x]=(cred+8)>>4 |
            (((cgreen+8)>>4)<<8) |
            (((cblue+8)>>4)<<16);                    
          break;
        case 32:
          dst[y*pitch+x]=(cred+16)>>5 |
            (((cgreen+16)>>5)<<8) |
            (((cblue+16)>>5)<<16);                    
          break;
        case 64:
          dst[y*pitch+x]=(cred+32)>>6 |
            (((cgreen+32)>>6)<<8) |
            (((cblue+32)>>6)<<16);                    
          break;
				case 128:
          dst[y*pitch+x]=(cred+64)>>7 |
            (((cgreen+64)>>7)<<8) |
            (((cblue+64)>>7)<<16);                    
          break;
					
        default:
					
          dst[y*pitch+x]=(cred+(cdiv>>1))/cdiv |
            (((cgreen+(cdiv>>1))/cdiv)<<8) |
            (((cblue+(cdiv>>1))/cdiv)<<16);                    
#else
        case 1:
          dst[y*pitch+x]=(cred | (cgreen<<8) | (cblue<<16));
          break;
        case 2:
          dst[y*pitch+x]=(cred)>>1 |
            (((cgreen)>>1)<<8) |
            (((cblue)>>1)<<16);                    
          break;
        case 4:
          dst[y*pitch+x]=(cred)>>2 |
            (((cgreen)>>2)<<8) |
            (((cblue)>>2)<<16);                    
          break;
        case 8:
          dst[y*pitch+x]=(cred)>>3 |
            (((cgreen)>>3)<<8) |
            (((cblue)>>3)<<16);                    
          break;
        case 16:
          dst[y*pitch+x]=(cred)>>4 |
            (((cgreen)>>4)<<8) |
            (((cblue)>>4)<<16);                    
          break;
        case 32:
          dst[y*pitch+x]=(cred)>>5 |
            (((cgreen)>>5)<<8) |
            (((cblue)>>5)<<16);                    
          break;
        case 64:
          dst[y*pitch+x]=(cred)>>6 |
            (((cgreen)>>6)<<8) |
            (((cblue)>>6)<<16);                    
          break;
				case 128:
          dst[y*pitch+x]=(cred)>>7 |
            (((cgreen)>>7)<<8) |
            (((cblue)>>7)<<16);                    
          break;
					
        default:
					
          dst[y*pitch+x]=(cred)/cdiv |
            (((cgreen)/cdiv)<<8) |
            (((cblue)/cdiv)<<16);                    
					
#endif	
				}
        }// end case
      }// end if      
  }	 // end for x
  
} // end for y

return 0;
}



/*
	Pack, unpack and shifts cannot be paired (u/v ok)
	MMX multiplier unit cannot pair (pmull, pmulh, pmadd) (u/v ok)
	MMX instructions which access either memory or the integer register file can be issued in the U-pipe only.
	The MMX destination register of the U-pipe instruction should not match the source or destination register of the V-pipe instruction
*/

/*
	More optimized version, but cannot run in plain MMX mode
	Optimized for Athlon pipeline
*/
void MMX_asm_blend_ATH(unsigned long *src,RGBDivMMX *data, int beginx ,int endx, int endy, __int64 *T, int stride, int *weighttable, int *difftable, int diffweight,int winrad, int width) {
// Pointer to Source pixel, Pointer to RGB Data, x left, x right, y length,threshold, width stride, pointer to weighttable, pointer to difftable, boolean difftable, int window radius (xtraframe)
/*;	mm0: current pixel green32(accum) red32(accum) 
;	mm1: current pixel cweight(accum) blue32(accum) 
;	mm2: currentpixel red8 green8 blue8
;	mm3: testpixel red8 green8 blue8
;	mm4: temp
;	mm5: temp
;	mm6: temp
;	mm7: temp

;	RGB = RGB values
;	rgb	= RGB fraction values
;	w = weight values

	; Note: Inner loop is somewhat pairable - mostly because of dependency and lack of registers and much use of shifter unit. 
	; Instructions that are pairble with above command are marked wirk (p^)
*/
	static const __int64 I1 = 0x0001000000000000i64;
	static const __int64 I2 = 0x000000ff00ff00ffi64;
	static const __int64 I3 = 0x0000000000ffffffi64;
	static const __int64 I4 = 0x0000ffffffffffffi64;
	static const int I5 = 32767;
	static const __int64 I8 = 0x00000000ffff0000i64;
	static const __int64 I9 = 0x000000000000ffffi64;
  __int64 temp=0;
  __int64 temp2=0;
  int *i_temp;
  int *i_temp2;
	__asm {
#ifdef ATHLON
    prefetch [data]
    prefetch [src]
#else
    prefetchnta [data]
    prefetchnta [src]
#endif
  }
	static const __int64 I10 = 0x0000ffff0000ffffi64;

  i_temp = ((int*)&temp);
  i_temp2 = ((int*)&temp2);


	unsigned long *org_src=src;
	__int64 t_weight;
	unsigned short *p_t_weight = (unsigned short*)&t_weight;
	RGBDivMMX *org_data=data;
	int *org_weighttable=weighttable;
	static const __int64 i64_0xffffff = (__int64)0xffffff;

	__asm {    
	mov edx,src;
	mov eax,data;
  mov edi, i_temp
  mov esi, i_temp2
											 ; Store current pixel in mm2
	PUNPCKLBW mm2, [edx]	; Unpack lower bytes to words | mm2 = 0000 BB00 GG00 RR00
	movq mm0,[eax] 			; contains correct pixels and offsets		| mm0: 00GG gggg 00RR rrrr  U-pipe
	pand mm2, I4				; remove unneeded information | mm2=0000 BBXX GGXX RRXX (np^) - V-pipe
	movq mm1,[eax+8] 		; contains correct pixels and offsets		| mm1: 0000 wwww 00BB rrrr (np^) U-pipe
	PSRLW mm2,8									; Cleanup mm2  | mm2 = 0000 00BB 00GG 00RR
	}
	int i;

	__asm {
		mov         [i],1				; Move 1 into i
		jmp         afterloop1		;
		align 16
goloop1:
		mov         edx, [i]		; Move i into edx
		add         edx,1				; Add 1 to i
		mov         [i],edx			; Move edx back into i

afterloop1:
		mov         eax,[i]			; Move i into eax
		cmp         eax,[endx]	; Is eax(i) greater than endx
		jg          outloop1		; Jump out of loop if true

		mov ecx,src			; 
    mov eax,[T]
		add ecx,4				
		add data,16
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		mov					src,ecx			; Add source offset  src must be retained
		add weighttable,4
    movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]	; Unpack lower bytes to words | mm3 = 00xx BBxx GGxx RRxx V
		movq mm4, mm2				; make a copy of mm2 (p^) U-pipe
		PSRLW mm3,8					; Cleanup mm3  | mm3 = 00XX 00BB 00GG 00RR (p^) V-pipe
		movq mm7, [eax]					; Move threhold into mm7 (p^) U-pipe
    pand mm3,mm6
		movq mm5, mm3				; make a copy of mm3 (np^) V-pipe
		psubusb mm4, mm3		; compute difference one way (p^) U-pipe
		psubusb mm5, mm2		; compute difference the other way (p^) V-pipe
		por mm4, mm5				; or them together              | mm4 =0000 00BB 00GG 00RR  - absolute difference between the colors (np^) V
    pxor mm5,mm5
    movq mm6,mm4
												; Test if any values goes beyond the threshold.
		PCMPGTW mm4,mm7			; is any RGB-values in mm4 greater than Absolute difference? (np^)		V
    pxor mm7,mm7
		psadbw mm5,mm4			; Any past threshold?
    movd eax, mm5		    ; Past threshold = eax

		psadbw mm7,mm6			; Difference
		movd [temp2],mm7		; Absolute difference = esi

		cmp  eax,0				; compare eax to 0
		jg outloop1					; If one value is greater than threshold 

#ifdef ATHLON
		prefetchw [data]
#else
		prefetchnta [data]
#endif
		mov eax,diffweight		; Move value of diffweight into eax   OPT ME!
    mov edx,[esi]
		cmp eax,0							; Compare eax to 0
		jne do_diffweight1

		mov eax,weighttable
		pxor mm5,mm5
		movd mm4,[eax]	; Move weight into mm4 | mm4=0000 0000 0000 WWWW
		mov ebx,[eax]
		jmp diff_go_on1

		align 16
do_diffweight1:
		shl edx,2						; Mult by 4 bytes up (for proper offset in lookup table)
		add edx,difftable	  ; Add pointer to difftable
#ifdef ATHLON
		prefetch [edx]
#else
	  prefetchnta [edx]
#endif
		mov eax,weighttable				; Get pixel weight
		mov ecx,[eax]				; Get pixel weight
		mov ebx,[edx]				; Get proper diffweight
		pxor mm5,mm5
		imul ebx,ecx					; Multiply weights together
		shr ebx,8							; Shift down, so diffweight is moved into fraction area
		movd mm4,ebx								; Move weight into mm4 | mm4=0000 0000 0000 WWWW
		align 16
diff_go_on1:
		por mm3,[I1]								; Add one to get weight mm3 = 0001 00BB 00GG 00RR
		pshufw mm4,mm4,17						; Also add Weight to upper qword 00010001=17 | mm4=  WWWW 0000 WWWW 0000
		punpckHWD mm5,mm3						; mm5=0001 0000 00BB 0000
		pxor mm6,mm6
		pmaddwd mm5,mm4							; mm5=0000 WWWW 00BB bbbb

		punpckLWD mm6,mm3						; mm6=00GG 0000 00RR 0000
		pmaddwd mm6,mm4							; mm3=00GG gggg 00RR rrrr
		paddd mm1,mm5
		PADDD mm0,mm6								; add mm5 to mm0 
		; Add into current pixel

		mov edx, data

		pxor mm5,mm5
		pxor mm6,mm6
		punpckHWD mm5, mm2
		movq mm7,[edx] 							; Move currently weighed infomation for testpixel into mm7 | mm7= 00GG gggg 00RR rrrr
		punpckLWD mm6, mm2					; low			
		pmaddwd mm5,mm4							; high mm5=0000 0000 00BB bbbb
		movq mm3,[edx+8] 						; high Move currently weighed infomation for testpixel into mm4 
		pmaddwd mm6,mm4							; low mm4=00GG gggg 00RR rrrr
		paddd mm5,mm3								; high
		paddd mm7,mm6								; low
		add [edx+12],ebx
		movq [edx],mm7
		movd [edx+8],mm5


		jmp goloop1
outloop1:
 //;*************************** LOOP 2 *************************;//
}


		int ydown;

	__asm {
		align 16
		mov [ydown],1;
		jmp ydownafterloop
		align 16
ydowngoloop:
		mov         edx, [ydown]		; Move ydown into edx
		add         edx,1				; Add 1 to i
		mov         [ydown],edx			; Move edx back into i
ydownafterloop:
		mov         eax,[ydown]			; Move ydown into eax
		cmp         eax,[endy]	; Is eax(ydown) greater than endy
		jg          end_pixel		; Jump out of loop if true
		mov ebx, [ydown]
		mov eax,width
		imul eax,ebx
		mov ecx, org_data			; move data one row down
		add ecx, eax
		mov data,ecx

		mov eax,winrad
		mov ecx, org_weighttable			; move Weighttable one row down
		imul eax, ebx
		add ecx, eax
		mov weighttable,ecx

		mov eax,stride
		mov ecx, org_src			; move Image Source one row down
		imul eax, ebx
		add ecx, eax
		mov src,ecx

		; // XLOOP
		mov         [i],0				; Move 1 into i
		jmp         afterloop2		;
		align 16
goloop2:
		mov         edx, [i]		; Move i into edx
		add         edx,1				; Add 1 to i
		mov         [i],edx			; Move edx back into i

		mov ecx,src		
		add ecx,4
		add data,16
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		add weighttable,4
		mov src,ecx
afterloop2:
		mov         eax,[i]			; Move i into eax
		cmp         eax,[endx]	; Is eax(i) greater than endx
		jg          outloop2		; Jump out of loop if true

    mov eax,[T]		
    movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]	; Unpack lower bytes to words | mm3 = 00xx BBxx GGxx RRxx V
		movq mm4, mm2				; make a copy of mm2 (p^) U-pipe
		PSRLW mm3,8					; Cleanup mm3  | mm3 = 00XX 00BB 00GG 00RR (p^) V-pipe
		movq mm7, [eax]					; Move threhold into mm7 (p^) U-pipe
    pand mm3,mm6
		movq mm5, mm3				; make a copy of mm3 (np^) V-pipe
		psubusb mm4, mm3		; compute difference one way (p^) U-pipe
		psubusb mm5, mm2		; compute difference the other way (p^) V-pipe
		por mm4, mm5				; or them together              | mm4 =0000 00BB 00GG 00RR  - absolute difference between the colors (np^) V
    pxor mm5,mm5
    movq mm6,mm4
												; Test if any values goes beyond the threshold.
		PCMPGTW mm4,mm7			; is any RGB-values in mm4 greater than Absolute difference? (np^)		V
    pxor mm7,mm7
		psadbw mm5,mm4			; Any past threshold?
    movd eax, mm5		    ; Past threshold = eax

		psadbw mm7,mm6			; Difference
		movd [temp2],mm7		; Absolute difference = esi

		cmp  eax,0				; compare eax to 0
		jg outloop2					; If one value is greater than threshold 

#ifdef ATHLON
		prefetchw [data]
#else
		prefetchnta [data]
#endif

		mov eax,diffweight		; Move value of diffweight into eax   OPT ME!
    mov edx,[esi]
		cmp eax,0							; Compare eax to 0
		jne do_diffweight2

		mov eax,weighttable
		pxor mm5,mm5
		movd mm4,[eax]	; Move weight into mm4 | mm4=0000 0000 0000 WWWW
		mov ebx,[eax]
		jmp diff_go_on2

		align 16
do_diffweight2:
		shl edx,2						; Mult by 4 bytes up (for proper offset in lookup table)
		add edx,difftable	  ; Add pointer to difftable


#ifdef ATHLON
		prefetch [edx]
#else
	  prefetchnta [edx]
#endif
		mov eax,weighttable				; Get pixel weight
		mov ecx,[eax]				; Get pixel weight
		mov ebx,[edx]				; Get proper diffweight
		pxor mm5,mm5
		imul ebx,ecx					; Multiply weights together
		shr ebx,8							; Shift down, so diffweight is moved into fraction area
		movd mm4,ebx								; Move weight into mm4 | mm4=0000 0000 0000 WWWW
		align 16
diff_go_on2:
		por mm3,[I1]								; Add one to get weight mm3 = 0001 00BB 00GG 00RR
		pshufw mm4,mm4,17						; Also add Weight to upper qword 00010001=17 | mm4=  WWWW 0000 WWWW 0000
		punpckHWD mm5,mm3						; mm5=0001 0000 00BB 0000
		pxor mm6,mm6
		pmaddwd mm5,mm4							; mm5=0000 WWWW 00BB bbbb

		punpckLWD mm6,mm3						; mm6=00GG 0000 00RR 0000
		pmaddwd mm6,mm4							; mm3=00GG gggg 00RR rrrr
		paddd mm1,mm5
		PADDD mm0,mm6								; add mm5 to mm0 
		; Add into current pixel

		mov edx, data

		pxor mm5,mm5
		pxor mm6,mm6
		punpckHWD mm5, mm2
		movq mm7,[edx] 							; Move currently weighed infomation for testpixel into mm7 | mm7= 00GG gggg 00RR rrrr
		punpckLWD mm6, mm2					; low			
		pmaddwd mm5,mm4							; high mm5=0000 0000 00BB bbbb
		movq mm3,[edx+8] 						; high Move currently weighed infomation for testpixel into mm4 
		pmaddwd mm6,mm4							; low mm4=00GG gggg 00RR rrrr
		paddd mm5,mm3								; high
		paddd mm7,mm6								; low
		add [edx+12],ebx
		movq [edx],mm7
		movd [edx+8],mm5
		jmp goloop2
outloop2:

		//;*************************** LOOP 3 *************************;//
		; // XLOOP
		mov ebx, [ydown]
		mov eax,width
		imul eax,ebx
		mov ecx, org_data			; move data one row down
		add ecx, eax
		mov data,ecx

		mov eax,winrad
		mov ecx, org_weighttable			; move Weighttable one row down
		imul eax, ebx
		add ecx, eax
		mov weighttable,ecx

		mov eax,stride
		mov ecx, org_src			; move Image Source one row down
		imul eax, ebx
		add ecx, eax
		mov src,ecx

		mov         [i],1				; Move 1 into i
		jmp         afterloop3		;
		align 16
goloop3:
		mov         edx, [i]		; Move i into edx
		add         edx,1				; Add 1 to i
		mov         [i],edx			; Move edx back into i

afterloop3:
		mov         eax,[i]			; Move i into eax
		cmp         eax,[beginx]	; Is eax(i) greater than endx
		jg          outloop3		; Jump out of loop if true

		mov ecx,src		
    mov eax,[T]
		sub ecx,4
		sub weighttable,4
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		sub data,16
		mov src,ecx
		
    movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]	; Unpack lower bytes to words | mm3 = 00xx BBxx GGxx RRxx V
		movq mm4, mm2				; make a copy of mm2 (p^) U-pipe
		PSRLW mm3,8					; Cleanup mm3  | mm3 = 00XX 00BB 00GG 00RR (p^) V-pipe
		movq mm7, [eax]					; Move threhold into mm7 (p^) U-pipe
    pand mm3,mm6
		movq mm5, mm3				; make a copy of mm3 (np^) V-pipe
		psubusb mm4, mm3		; compute difference one way (p^) U-pipe
		psubusb mm5, mm2		; compute difference the other way (p^) V-pipe
		por mm4, mm5				; or them together              | mm4 =0000 00BB 00GG 00RR  - absolute difference between the colors (np^) V
    pxor mm5,mm5
    movq mm6,mm4
												; Test if any values goes beyond the threshold.
		PCMPGTW mm4,mm7			; is any RGB-values in mm4 greater than Absolute difference? (np^)		V
    pxor mm7,mm7
		psadbw mm5,mm4			; Any past threshold?
    movd eax, mm5		    ; Past threshold = eax

		psadbw mm7,mm6			; Difference
		movd [temp2],mm7		; Absolute difference = esi

		cmp  eax,0				; compare eax to 0
		jg outloop3					; If one value is greater than threshold 
#ifdef ATHLON
		prefetchw [data]
#else
		prefetchnta [data]
#endif

		mov eax,diffweight		; Move value of diffweight into eax   OPT ME!
    mov edx,[esi]
		cmp eax,0							; Compare eax to 0
		jne do_diffweight3

		mov eax,weighttable
		pxor mm5,mm5
		movd mm4,[eax]	; Move weight into mm4 | mm4=0000 0000 0000 WWWW
		mov ebx,[eax]
		jmp diff_go_on3

		align 16
do_diffweight3:
		shl edx,2						; Mult by 4 bytes up (for proper offset in lookup table)
		add edx,difftable	  ; Add pointer to difftable


#ifdef ATHLON
		prefetch [edx]
#else
	  prefetchnta [edx]
#endif
		mov eax,weighttable				; Get pixel weight
		mov ecx,[eax]				; Get pixel weight
		mov ebx,[edx]				; Get proper diffweight
		pxor mm5,mm5
		imul ebx,ecx					; Multiply weights together
		shr ebx,8							; Shift down, so diffweight is moved into fraction area
		movd mm4,ebx								; Move weight into mm4 | mm4=0000 0000 0000 WWWW
		align 16
diff_go_on3:
		por mm3,[I1]								; Add one to get weight mm3 = 0001 00BB 00GG 00RR
		pshufw mm4,mm4,17						; Also add Weight to upper qword 00010001=17 | mm4=  WWWW 0000 WWWW 0000
		punpckHWD mm5,mm3						; mm5=0001 0000 00BB 0000
		pxor mm6,mm6
		pmaddwd mm5,mm4							; mm5=0000 WWWW 00BB bbbb

		punpckLWD mm6,mm3						; mm6=00GG 0000 00RR 0000
		pmaddwd mm6,mm4							; mm3=00GG gggg 00RR rrrr
		paddd mm1,mm5
		PADDD mm0,mm6								; add mm5 to mm0 
		; Add into current pixel

		mov edx, data

		pxor mm5,mm5
		pxor mm6,mm6
		punpckHWD mm5, mm2
		movq mm7,[edx] 							; Move currently weighed infomation for testpixel into mm7 | mm7= 00GG gggg 00RR rrrr
		punpckLWD mm6, mm2					; low			
		pmaddwd mm5,mm4							; high mm5=0000 0000 00BB bbbb
		movq mm3,[edx+8] 						; high Move currently weighed infomation for testpixel into mm4 
		pmaddwd mm6,mm4							; low mm4=00GG gggg 00RR rrrr
		paddd mm5,mm3								; high
		paddd mm7,mm6								; low
		add [edx+12],ebx
		movq [edx],mm7
		movd [edx+8],mm5
		jmp goloop3
outloop3:
		
		jmp ydowngoloop
}


	__asm {
; ***************************  END   *************************

		align 16
end_pixel:
		align 16
		movq mm5,mm1
		movq mm3,mm2				; mm3 = 0000 00BB 00GG 00RR
												; Reverse weight 
		PSRLQ mm5,32				; Move weight down to bit 0 | mm5= 0000 0000 0000 WWWW  
		mov edx,[I5]				; Move maxweight into edx
    movd [edi],mm5
		mov eax,[edi]				; eax=weight

;		cmp eax,0           ; Bail out, if no pixels has been added
;   jle endall
		pxor mm3,mm3
    sub edx,eax					; Subtract weight (eax) from max_weight (edx) store in edx

		movd mm4,edx								; Move weight into mm4 | mm4=0000 0000 0000 WWWW
		punpckLWD mm3,mm2						; mm5=00GG 0000 00RR 0000
		pxor mm5,mm5
		pshufw mm4,mm4,17						; Also add Weight to upper qword 00010001=17 | mm4=  WWWW 0000 WWWW 0000
		punpckHWD mm5,mm2						; mm5=0000 0000 00BB 0000
		pmaddwd mm3,mm4							; mm3=00GG gggg 00RR rrrr
		pmaddwd mm5,mm4							; mm3=0000 0000 00BB bbbb
		paddd mm0,mm3
		paddd mm1,mm5
		
		; Add into current pixel


		; Pack mm0 and mm1 and store
		PSRLD mm0,15						; Shift packed down weightmax bits | mm0 = 0000 00BB 0000 00RR (np^)
		PSRLD mm1,15						; Shift packed down weightmax bits | mm1 = 0000 00WW 0000 00BB (np^)
		pand mm1,[I9]						; Remove WW | mm1 = 0000 0000 0000 00BB (np^)  (if we don't care about what's written into upper 8 bit - remove this)
		PACKSSDW mm0,mm1				; Pack doublewords into words | mm0 = 0000 00BB 00GG 00RR (np) 
		PACKUSWB mm0,mm0				; Pack words into byte | mm0= 0000 0000 00BB GGRR
		mov eax,org_src
		movd [eax],mm0 					; Store processed pixel (finally!)
;endall:
	}
/*  
	if (0) {
		__asm {
    assert:
        int 3
	  }
  }
  */
}


/*
	More optimized version, but cannot run in plain MMX mode
	Optimized for Athlon pipeline
*/

 //;*************************** Average WDW *************************;//

void MMX_asm_avg_wdw_ATH(unsigned long *src,RGBDivMMX *data, int beginx ,int endx, int endy, __int64 *T, int stride, int *difftable, int diffweight,int winrad, int width) {
/*
	Difference-weighted average of ONE pixel, written for the MSVC inline
	assembler (MMX plus pshufw/psadbw and, with ATHLON defined, 3DNow! prefetch).

	Parameters (as used by the code below):
		src        Pointer to the pixel being filtered (4 bytes per pixel).
		           The result is written back to this address.
		data       Accumulator entry belonging to *src. Every entry is accessed
		           as 16 bytes: qword at +0, dword at +8 and dword weight at +12.
		           (Layout inferred from the access offsets; RGBDivMMX itself is
		           declared elsewhere - confirm.)
		beginx     Maximum number of pixels scanned to the LEFT on rows below.
		endx       Maximum number of pixels scanned to the RIGHT.
		endy       Number of rows scanned BELOW the current row.
		T          Pointer to a 64-bit per-channel threshold (one word per channel).
		stride     Byte offset between two rows of the source image.
		difftable  Weight lookup table, indexed by the sum of the absolute
		           channel differences between the two pixels.
		diffweight Not referenced by this function.
		winrad     Not referenced by this function.
		width      Byte offset between two rows of the accumulator buffer.

	How it works:
		The current row is scanned to the right, then every row below is scanned
		to the right (starting directly below) and to the left. A scan stops at
		the first pixel where any channel differs by more than the threshold.
		For each accepted neighbour the weight W = difftable[sum of differences]
		is looked up; neighbour*W is added to the accumulators of the current
		pixel (mm0/mm1) and current*W is added to the accumulator entry of the
		neighbour in data. Rows above are never visited here, so presumably
		their contribution was stored in data by earlier calls - confirm against
		the caller.
		Finally each channel sum is divided by the total weight and the packed
		pixel is stored to the original src address.

	Register use:
		mm0: current pixel, two 32-bit sums (high: green, low: red)
		mm1: current pixel, high dword: weight sum, low dword: blue sum
		mm2: current pixel as four words 0000 00BB 00GG 00RR
		mm3: test pixel as four words (reused as scratch after the multiply)
		mm4-mm7: temporaries
		(Channel names follow the original author's comments.)

	Notes:
		- pmaddwd multiplies SIGNED words, so the table weights presumably never
		  exceed 32767 - confirm against the code that builds difftable.
		- The total weight is assumed to be non-zero when end_pixel is reached;
		  a zero weight would fault in DIV.
		- No EMMS is executed here; presumably the caller does that.
		- NOTE(review): in MSVC inline assembler "[data]" / "[src]" name the
		  stack slot of the variable, so the prefetch hints on them presumably
		  touch the pointer variable rather than the memory it points to.
*/
	static const __int64 I1 = 0x0001000000000000i64;	// word 3 = 1: multiplier that yields the plain weight
	static const __int64 I2 = 0x000000ff00ff00ffi64;	// keeps the three colour words, clears word 3
	static const __int64 I4 = 0x0000ffffffffffffi64;	// clears word 3 before the shift
	__int64 temp=0;				// scratch used to move the weight from MMX to ebx
	__int64 temp2=0;			// scratch used to move the difference sum from MMX to edx
	int *i_temp;
	int *i_temp2;
	__asm {
#ifdef ATHLON
		prefetch [data]
		prefetch [src]
#else
		prefetchnta [data]
		prefetchnta [src]
#endif
	}

	i_temp = ((int*)&temp);
	i_temp2 = ((int*)&temp2);

	unsigned long *org_src=src;		// address of the pixel being filtered
	RGBDivMMX *org_data=data;		// accumulator entry of that pixel

	__asm {
		mov edx,src
		mov eax,data
		mov edi, i_temp				; edi -> temp  (kept for the whole function)
		mov esi, i_temp2			; esi -> temp2 (kept for the whole function)
		PUNPCKLBW mm2, [edx]		; pixel bytes land in the high byte of each word
		movq mm0,[eax]				; mm0 = stored green/red sums of the current pixel
		pand mm2, I4				; clear word 3
		movq mm1,[eax+8]			; mm1 = stored weight/blue sums of the current pixel
		PSRLW mm2,8					; mm2 = 0000 00BB 00GG 00RR
	}
	int i;

	//;*************************** WDW LOOP 1: current row, to the right *************************;//
	__asm {
		mov         [i],1				; i = 1
		jmp         afterloop1
		align 16
goloop1:
		mov         edx, [i]
		add         edx,1
		mov         [i],edx				; i = i + 1

afterloop1:
		mov         eax,[i]
		cmp         eax,[endx]
		jg          outloop1			; done when i > endx

		mov ecx,src
		mov eax,[T]						; eax = pointer to the threshold
		add ecx,4						; next source pixel
		add data,16						; next accumulator entry
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		mov src,ecx
		movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]			; unpack test pixel, bytes in the high half of each word
		movq mm4, mm2					; copy of current pixel
		PSRLW mm3,8						; mm3 = 00XX 00BB 00GG 00RR
		movq mm7, [eax]					; mm7 = threshold
		pand mm3,mm6					; mm3 = 0000 00BB 00GG 00RR
		movq mm5, mm3					; copy of test pixel
		psubusb mm4, mm3				; current - test, saturated at 0
		psubusb mm5, mm2				; test - current, saturated at 0
		por mm4, mm5					; mm4 = absolute difference per channel
		pxor mm5,mm5
		movq mm6,mm4					; keep the differences for the table lookup
		PCMPGTW mm4,mm7					; FFFF in every word whose difference exceeds the threshold
		pxor mm7,mm7
		psadbw mm5,mm4					; sum of the flag bytes
		movd eax, mm5					; eax != 0 means at least one channel is past the threshold

		psadbw mm7,mm6					; sum of the absolute channel differences
		movd [temp2],mm7				; stored so it can be read back through esi

		cmp  eax,0
		jg outloop1						; stop this scan at the first rejected pixel

#ifdef ATHLON
		prefetchw [data]
#else
		prefetchnta [data]
#endif

		mov edx,[esi]					; edx = difference sum
		shl edx,2						; times 4 bytes per table entry
		add edx,difftable				; edx = address of the weight
		pxor mm5,mm5

		movd mm4,[edx]					; mm4 = 0000 0000 0000 WWWW
		por mm3,[I1]					; mm3 = 0001 00BB 00GG 00RR
		pshufw mm4,mm4,17				; mm4 = WWWW 0000 WWWW 0000
		mov ebx,[edx]					; ebx = weight, added to the neighbour weight below
		punpckHWD mm5,mm3				; mm5 = 0001 0000 00BB 0000
		pxor mm6,mm6
		pmaddwd mm5,mm4					; mm5 = weight | blue*weight

		punpckLWD mm6,mm3				; mm6 = 00GG 0000 00RR 0000
		pmaddwd mm6,mm4					; mm6 = green*weight | red*weight
		paddd mm1,mm5					; add to weight/blue sums of the current pixel
		PADDD mm0,mm6					; add to green/red sums of the current pixel

		mov edx, data					; edx = accumulator entry of the test pixel

		pxor mm5,mm5
		pxor mm6,mm6
		punpckHWD mm5, mm2				; mm5 = 0000 0000 00BB 0000 (current pixel)
		movq mm7,[edx]					; stored green/red sums of the test pixel
		punpckLWD mm6, mm2				; mm6 = 00GG 0000 00RR 0000 (current pixel)
		pmaddwd mm5,mm4					; mm5 = 0 | blue*weight
		movq mm3,[edx+8]				; stored weight/blue sums of the test pixel
		pmaddwd mm6,mm4					; mm6 = green*weight | red*weight
		paddd mm5,mm3
		paddd mm7,mm6
		add [edx+12],ebx				; weight sum of the test pixel += weight
		movq [edx],mm7
		movd [edx+8],mm5				; only the blue dword is written back

		jmp goloop1
outloop1:
	}

	int ydown;

	__asm {
		align 16
		mov [ydown],1					; ydown = 1
		jmp ydownafterloop
		align 16
ydowngoloop:
		mov         edx, [ydown]
		add         edx,1
		mov         [ydown],edx			; ydown = ydown + 1
ydownafterloop:
		mov         eax,[ydown]
		cmp         eax,[endy]
		jg          end_pixel			; done when ydown > endy
		mov ebx, [ydown]
		mov eax,width
		imul eax,ebx
		mov ecx, org_data
		add ecx, eax
		mov data,ecx					; data = org_data + width*ydown

		mov eax,stride
		mov ecx, org_src
		imul eax, ebx
		add ecx, eax
		mov src,ecx						; src = org_src + stride*ydown, also left in ecx

		//;*************************** WDW LOOP 2: row below, to the right *************************;//
		mov         [i],0				; i = 0: the first pass tests the pixel directly below
		jmp         afterloop2
		align 16
goloop2:
		mov         edx, [i]
		add         edx,1
		mov         [i],edx				; i = i + 1

		mov ecx,src
		add ecx,4						; next source pixel
		add data,16						; next accumulator entry
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		mov src,ecx
afterloop2:
		mov         eax,[i]
		cmp         eax,[endx]
		jg          outloop2			; done when i > endx

		mov eax,[T]						; eax = pointer to the threshold
		movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]			; unpack test pixel
		movq mm4, mm2					; copy of current pixel
		PSRLW mm3,8						; mm3 = 00XX 00BB 00GG 00RR
		movq mm7, [eax]					; mm7 = threshold
		pand mm3,mm6					; mm3 = 0000 00BB 00GG 00RR
		movq mm5, mm3					; copy of test pixel
		psubusb mm4, mm3				; current - test, saturated at 0
		psubusb mm5, mm2				; test - current, saturated at 0
		por mm4, mm5					; mm4 = absolute difference per channel
		pxor mm5,mm5
		movq mm6,mm4					; keep the differences for the table lookup
		PCMPGTW mm4,mm7					; FFFF in every word whose difference exceeds the threshold
		pxor mm7,mm7
		psadbw mm5,mm4					; sum of the flag bytes
		movd eax, mm5					; eax != 0 means at least one channel is past the threshold

		psadbw mm7,mm6					; sum of the absolute channel differences
		movd [temp2],mm7				; stored so it can be read back through esi

		cmp  eax,0
		jg outloop2						; stop this scan at the first rejected pixel

#ifdef ATHLON
		prefetchw [data]
#else
		prefetchnta [data]
#endif

		mov edx,[esi]					; edx = difference sum
		shl edx,2						; times 4 bytes per table entry
		add edx,difftable				; edx = address of the weight
		pxor mm5,mm5

		movd mm4,[edx]					; mm4 = 0000 0000 0000 WWWW
		por mm3,[I1]					; mm3 = 0001 00BB 00GG 00RR
		pshufw mm4,mm4,17				; mm4 = WWWW 0000 WWWW 0000
		mov ebx,[edx]					; ebx = weight
		punpckHWD mm5,mm3				; mm5 = 0001 0000 00BB 0000
		pxor mm6,mm6
		pmaddwd mm5,mm4					; mm5 = weight | blue*weight

		punpckLWD mm6,mm3				; mm6 = 00GG 0000 00RR 0000
		pmaddwd mm6,mm4					; mm6 = green*weight | red*weight
		paddd mm1,mm5					; add to weight/blue sums of the current pixel
		PADDD mm0,mm6					; add to green/red sums of the current pixel

		mov edx, data					; edx = accumulator entry of the test pixel

		pxor mm5,mm5
		pxor mm6,mm6
		punpckHWD mm5, mm2				; mm5 = 0000 0000 00BB 0000 (current pixel)
		movq mm7,[edx]					; stored green/red sums of the test pixel
		punpckLWD mm6, mm2				; mm6 = 00GG 0000 00RR 0000 (current pixel)
		pmaddwd mm5,mm4					; mm5 = 0 | blue*weight
		movq mm3,[edx+8]				; stored weight/blue sums of the test pixel
		pmaddwd mm6,mm4					; mm6 = green*weight | red*weight
		paddd mm5,mm3
		paddd mm7,mm6
		add [edx+12],ebx				; weight sum of the test pixel += weight
		movq [edx],mm7
		movd [edx+8],mm5				; only the blue dword is written back
		jmp goloop2
outloop2:

		//;*************************** WDW LOOP 3: row below, to the left *************************;//
		mov ebx, [ydown]
		mov eax,width
		imul eax,ebx
		mov ecx, org_data
		add ecx, eax
		mov data,ecx					; back to data = org_data + width*ydown

		mov eax,stride
		mov ecx, org_src
		imul eax, ebx
		add ecx, eax
		mov src,ecx						; back to src = org_src + stride*ydown

		mov         [i],1				; i = 1
		jmp         afterloop3
		align 16
goloop3:
		mov         edx, [i]
		add         edx,1
		mov         [i],edx				; i = i + 1

afterloop3:
		mov         eax,[i]
		cmp         eax,[beginx]
		jg          outloop3			; done when i > beginx

		mov ecx,src
		mov eax,[T]						; eax = pointer to the threshold
		sub ecx,4						; previous source pixel
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		sub data,16						; previous accumulator entry
		mov src,ecx

		movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]			; unpack test pixel
		movq mm4, mm2					; copy of current pixel
		PSRLW mm3,8						; mm3 = 00XX 00BB 00GG 00RR
		movq mm7, [eax]					; mm7 = threshold
		pand mm3,mm6					; mm3 = 0000 00BB 00GG 00RR
		movq mm5, mm3					; copy of test pixel
		psubusb mm4, mm3				; current - test, saturated at 0
		psubusb mm5, mm2				; test - current, saturated at 0
		por mm4, mm5					; mm4 = absolute difference per channel
		pxor mm5,mm5
		movq mm6,mm4					; keep the differences for the table lookup
		PCMPGTW mm4,mm7					; FFFF in every word whose difference exceeds the threshold
		pxor mm7,mm7
		psadbw mm5,mm4					; sum of the flag bytes
		movd eax, mm5					; eax != 0 means at least one channel is past the threshold

		psadbw mm7,mm6					; sum of the absolute channel differences
		movd [temp2],mm7				; stored so it can be read back through esi

		cmp  eax,0
		jg outloop3						; stop this scan at the first rejected pixel

#ifdef ATHLON
		prefetchw [data]
#else
		prefetchnta [data]
#endif

		mov edx,[esi]					; edx = difference sum
		shl edx,2						; times 4 bytes per table entry
		add edx,difftable				; edx = address of the weight
		pxor mm5,mm5

		movd mm4,[edx]					; mm4 = 0000 0000 0000 WWWW
		por mm3,[I1]					; mm3 = 0001 00BB 00GG 00RR
		pshufw mm4,mm4,17				; mm4 = WWWW 0000 WWWW 0000
		mov ebx,[edx]					; ebx = weight
		punpckHWD mm5,mm3				; mm5 = 0001 0000 00BB 0000
		pxor mm6,mm6
		pmaddwd mm5,mm4					; mm5 = weight | blue*weight

		punpckLWD mm6,mm3				; mm6 = 00GG 0000 00RR 0000
		pmaddwd mm6,mm4					; mm6 = green*weight | red*weight
		paddd mm1,mm5					; add to weight/blue sums of the current pixel
		PADDD mm0,mm6					; add to green/red sums of the current pixel

		mov edx, data					; edx = accumulator entry of the test pixel

		pxor mm5,mm5
		pxor mm6,mm6
		punpckHWD mm5, mm2				; mm5 = 0000 0000 00BB 0000 (current pixel)
		movq mm7,[edx]					; stored green/red sums of the test pixel
		punpckLWD mm6, mm2				; mm6 = 00GG 0000 00RR 0000 (current pixel)
		pmaddwd mm5,mm4					; mm5 = 0 | blue*weight
		movq mm3,[edx+8]				; stored weight/blue sums of the test pixel
		pmaddwd mm6,mm4					; mm6 = green*weight | red*weight
		paddd mm5,mm3
		paddd mm7,mm6
		add [edx+12],ebx				; weight sum of the test pixel += weight
		movq [edx],mm7
		movd [edx+8],mm5				; only the blue dword is written back
		jmp goloop3
outloop3:

		jmp ydowngoloop
	}

	//;*************************** WDW END: divide the sums and store *************************;//
	__asm {
		align 16
end_pixel:
		movq mm5,mm1
		PSRLQ mm5,32				; mm5 = 0000 0000 WWWW WWWW (total weight)
		movd [edi],mm5
		mov ebx,[edi]				; ebx = total weight (assumed non-zero)
		movd eax,mm0				; eax = red sum
		; DIV is unsigned and divides EDX:EAX, so the high half must be zero.
		; The previous CDQ sign-extended EAX: a sum with bit 31 set gave
		; EDX = FFFFFFFF and a quotient that cannot fit, i.e. a divide fault.
		xor edx,edx
		div ebx						; eax = red average
		psrlq mm0,32				; bring the green sum down
		mov ecx,eax					; ecx = 000000RR
		movd eax,mm1				; eax = blue sum
		xor edx,edx
		div ebx						; eax = blue average
		shl eax,16					; eax = 00BB0000
		or ecx,eax					; ecx = 00BB00RR
		movd eax,mm0				; eax = green sum
		xor edx,edx
		div ebx						; eax = green average
		mov edx,org_src
		shl eax,8					; eax = 0000GG00
		or ecx, eax					; ecx = 00BBGGRR
		mov [edx],ecx				; store the filtered pixel over the source pixel
	}
}


/********************** AVERAGE ***************************/
void MMX_asm_average_ATH(unsigned long *src,RGBDivMMX64 *data, int beginx ,int endx, int endy, __int64 *T, int stride, int *difftable, int diffweight,int winrad, int width) {
/*
	Plain (unweighted) thresholded average of ONE pixel, written for the MSVC
	inline assembler (MMX plus psadbw and, with ATHLON defined, 3DNow! prefetch).

	Parameters (as used by the code below):
		src        Pointer to the pixel being filtered (4 bytes per pixel).
		           The averaged pixel is written back to this address.
		data       Accumulator entry belonging to *src; every entry is accessed
		           as one 8-byte qword of four 16-bit words. The pointer is
		           rounded up to the next 8-byte boundary before use.
		beginx     Maximum number of pixels scanned to the LEFT on rows below.
		endx       Maximum number of pixels scanned to the RIGHT.
		endy       Number of rows scanned BELOW the current row.
		T          Pointer to a 64-bit per-channel threshold (one word per channel).
		stride     Byte offset between two rows of the source image.
		difftable  Not referenced by this function.
		diffweight Not referenced by this function.
		winrad     Only used in a multiply whose result is discarded.
		width      Byte offset between two rows of the accumulator buffer.

	How it works:
		The current row is scanned to the right, then every row below is scanned
		to the right (starting directly below) and to the left. A scan stops at
		the first pixel where any channel differs from the current pixel by more
		than the threshold. For each accepted neighbour, the neighbour plus a
		count of 1 is added to the running sum in mm0, and the current pixel
		plus a count of 1 is added to the accumulator entry of the neighbour.
		Rows above are never visited here, so presumably their contribution was
		stored in data by earlier calls - confirm against the caller.
		At end_pixel every channel sum is scaled by 65536/count and the packed
		result is stored to the original src address.

	Register use:
		mm0: current pixel sums as four 16-bit words: count, blue, green, red
		mm2: current pixel as four words 0000 00BB 00GG 00RR
		mm3: test pixel as four words
		mm4-mm7: temporaries
		(Channel names follow the original author's comments.)

	Notes:
		- The sums are added with paddw, i.e. 16-bit wrapping adds.
		- The count is assumed to be non-zero at end_pixel (the entries are
		  presumably seeded with a count of 1 elsewhere - confirm); a zero count
		  would fault in DIV.
		- No EMMS is executed here; presumably the caller does that.
		- I3, I5, I8, I9, I10, i64_0xffffff, t_weight and p_t_weight are declared
		  but not referenced in this function. temp/temp2 are only reached
		  through edi/esi, which are overwritten at end_pixel before being read.
*/
	static const __int64 I1 = 0x0001000000000000i64;
	static const __int64 I2 = 0x000000ff00ff00ffi64;
	static const __int64 I3 = 0x0000000000ffffffi64;
	static const __int64 I4 = 0x0000ffffffffffffi64;
	static const int I5 = 32767;
	static const __int64 I8 = 0x00000000ffff0000i64;
	static const __int64 I9 = 0x000000000000ffffi64;
  __int64 temp=0;
  __int64 temp2=0;
  int *i_temp;
  int *i_temp2;

	
	// Prefetch hints only; they do not change any result.
	// NOTE(review): in MSVC inline assembler "[data]" / "[src]" name the stack
	// slot of the variable, so these hints presumably touch the pointer
	// variables rather than the memory they point to - confirm.
	__asm {
#ifdef ATHLON
    prefetch [data]
    prefetchw [src]
#else
    prefetchnta [data]
    prefetchnta [src]
#endif
  }
	static const __int64 I10 = 0x0000ffff0000ffffi64;

  i_temp = ((int*)&temp);
  i_temp2 = ((int*)&temp2);


	unsigned long *org_src=src;
	__int64 t_weight;
	unsigned short *p_t_weight = (unsigned short*)&t_weight;
	RGBDivMMX64 *org_data=data;
	static const __int64 i64_0xffffff = (__int64)0xffffff;

	// Setup: round org_data and data up to the next 8-byte boundary (the
	// and/add pair always advances, even for an already aligned pointer),
	// load the current pixel into mm2 as four words and its stored sums
	// into mm0.
	__asm {    
	mov edx,src;
	mov eax,org_data;  Align data  (remember!!)
	and eax,-8
	add eax,8
	mov org_data,eax
	mov eax,data;
	and eax,-8
	add eax,8
	mov data,eax
  mov edi, i_temp
  mov esi, i_temp2
											 ; Store current pixel in mm2
	PUNPCKLBW mm2, [edx]	; Unpack lower bytes to words | mm2 = 0000 BB00 GG00 RR00
	movq mm0,[eax] 			; contains correct pixels and offsets		| mm0: 00GG gggg 00RR rrrr  U-pipe
	pand mm2, I4				; remove unneeded information | mm2=0000 BBXX GGXX RRXX (np^) - V-pipe
	PSRLW mm2,8									; Cleanup mm2  | mm2 = 0000 00BB 00GG 00RR
	}
	int i;

	// LOOP 1: scan the current row to the right, i = 1..endx.
	// Every step advances src by 4 bytes and data by 8 bytes.
	__asm {
		mov         [i],1				; Move 1 into i
		jmp         afterloop1		;
		align 16
goloop1:
		mov         edx, [i]		; Move i into edx
		add         edx,1				; Add 1 to i
		mov         [i],edx			; Move edx back into i

afterloop1:
		mov         eax,[i]			; Move i into eax
		cmp         eax,[endx]	; Is eax(i) greater than endx
		jg          outloop1		; Jump out of loop if true

		mov ecx,src			; 
    mov eax,[T]
		add ecx,4				
		add data,8
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		mov					src,ecx			; Add source offset  src must be retained
		// Threshold test: mm4 becomes the per-channel absolute difference
		// between the test pixel (mm3) and the current pixel (mm2). pcmpgtw
		// flags every word above the threshold and psadbw sums the flag
		// bytes, so eax is non-zero when at least one channel is rejected.
		// NOTE(review): the first of the two "pxor mm6,mm6" is redundant.
    movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]	; Unpack lower bytes to words | mm3 = 00xx BBxx GGxx RRxx V
		movq mm4, mm2				; make a copy of mm2 (p^) U-pipe
		PSRLW mm3,8					; Cleanup mm3  | mm3 = 00XX 00BB 00GG 00RR (p^) V-pipe
		movq mm7, [eax]					; Move threhold into mm7 (p^) U-pipe
    pand mm3,mm6
		pxor mm6,mm6				; Clear mm6
		movq mm5, mm3				; make a copy of mm3 (np^) V-pipe
		psubusb mm4, mm3		; compute difference one way (p^) U-pipe
		psubusb mm5, mm2		; compute difference the other way (p^) V-pipe 
		por mm4, mm5				; or them together              | mm4 =0000 00BB 00GG 00RR  - absolute difference between the colors (np^) V
		pxor mm6,mm6				; Clear mm5

												; Test if any values goes beyond the threshold.
		PCMPGTW mm4,mm7			; is any RGB-values in mm4 greater than Absolute difference? (np^)		V
		psadbw mm6,mm4			;
		movd eax, mm6				;
    movq mm4,mm3    ;tested pixel
    movq mm5,mm2   ; This pixel
		cmp  eax,0					; compare eax to 0
		// The scan ends at the first pixel with ANY channel past the threshold.
		jg outloop1					; If all values are greater than threshold 

#ifdef ATHLON
		prefetchw [data]
#else
		prefetchnta [data]
#endif

		// Accepted: mm0 += test pixel with count 1; the accumulator entry of
		// the test pixel at [data] += current pixel with count 1.
		mov edx, data
		por mm4,[I1]								; Add one to get weight mm3 = 0001 0000 0000 0000
    movq mm6,[edx]
		por mm5,[I1]								; Add one to get weight mm3 = 0001 0000 0000 0000
    paddw mm0,mm4
    paddw mm6,mm5
    movq [edx],mm6
    jmp goloop1

outloop1:
//;********************** AVERAGE LOOP 2 ***************************/
}


		int ydown;

	// Row loop: ydown = 1..endy. For every row, data is set to
	// org_data + width*ydown and src to org_src + stride*ydown (both products
	// are used as byte offsets), then the row is scanned right and left.
	__asm {
		align 16
		mov [ydown],1;
		jmp ydownafterloop
ydowngoloop:
		mov         edx, [ydown]		; Move ydown into edx
		add         edx,1				; Add 1 to i
		mov         [ydown],edx			; Move edx back into i
ydownafterloop:
		mov         eax,[ydown]			; Move ydown into eax
		cmp         eax,[endy]	; Is eax(ydown) greater than endy
		jg          end_pixel		; Jump out of loop if true
		mov ebx, [ydown]
		mov eax,width
		imul eax,ebx
		mov ecx, org_data			; move data one row down
		add ecx, eax
		mov data,ecx

		// NOTE(review): the winrad*ydown product below is overwritten by the
		// following "mov eax,stride" and is never used.
		mov eax,winrad
		imul eax, ebx

		mov eax,stride
		mov ecx, org_src			; move Image Source one row down
		imul eax, ebx
		add ecx, eax
		mov src,ecx

		// LOOP 2: scan row ydown to the right, i = 0..endx. i starts at 0
		// here, so the first pass tests the pixel directly below, using the
		// address left in ecx by the row setup above.
		; // XLOOP
		mov         [i],0				; Move 1 into i
		jmp         afterloop2		;
goloop2:
		mov         edx, [i]		; Move i into edx
		add         edx,1				; Add 1 to i
		mov         [i],edx			; Move edx back into i

		mov ecx,src		
		add ecx,4
		add data,8
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		mov src,ecx
afterloop2:
		mov         eax,[i]			; Move i into eax
		cmp         eax,[endx]	; Is eax(i) greater than endx
		jg          outloop2		; Jump out of loop if true

		// Same threshold test as in loop 1.
    mov eax,[T]		
    movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]	; Unpack lower bytes to words | mm3 = 00xx BBxx GGxx RRxx V
		movq mm4, mm2				; make a copy of mm2 (p^) U-pipe
		PSRLW mm3,8					; Cleanup mm3  | mm3 = 00XX 00BB 00GG 00RR (p^) V-pipe
		movq mm7, [eax]					; Move threhold into mm7 (p^) U-pipe
    pand mm3,mm6
		pxor mm6,mm6				; Clear mm6
		movq mm5, mm3				; make a copy of mm3 (np^) V-pipe
		psubusb mm4, mm3		; compute difference one way (p^) U-pipe
		psubusb mm5, mm2		; compute difference the other way (p^) V-pipe 
		por mm4, mm5				; or them together              | mm4 =0000 00BB 00GG 00RR  - absolute difference between the colors (np^) V
		pxor mm6,mm6				; Clear mm5

												; Test if any values goes beyond the threshold.
		PCMPGTW mm4,mm7			; is any RGB-values in mm4 greater than Absolute difference? (np^)		V
		psadbw mm6,mm4			;
		movd eax, mm6				;
    movq mm4,mm3    ;tested pixel
    movq mm5,mm2   ; This pixel
		cmp  eax,0					; compare eax to 0
		jg outloop2					; If all values are greater than threshold 

#ifdef ATHLON
		prefetchw [data]
#else
		prefetchnta [data]
#endif

		// Same mutual accumulation as in loop 1.
		mov edx, data
		por mm4,[I1]								; Add one to get weight mm3 = 0001 0000 0000 0000
    movq mm6,[edx]
		por mm5,[I1]								; Add one to get weight mm3 = 0001 0000 0000 0000
    paddw mm0,mm4
    paddw mm6,mm5
    movq [edx],mm6
    jmp goloop2
outloop2:

//;********************** AVERAGE LOOP 3 ***************************/
		// LOOP 3: reset data and src to the start of row ydown, then scan to
		// the left, i = 1..beginx, stepping src by -4 and data by -8 before
		// each test.
		; // XLOOP
		mov ebx, [ydown]
		mov eax,width
		imul eax,ebx
		mov ecx, org_data			; move data one row down
		add ecx, eax
		mov data,ecx

		mov eax,stride
		mov ecx, org_src			; move Image Source one row down
		imul eax, ebx
		add ecx, eax
		mov src,ecx

		mov         [i],1				; Move 1 into i
		jmp         afterloop3		;
goloop3:
		mov         edx, [i]		; Move i into edx
		add         edx,1				; Add 1 to i
		mov         [i],edx			; Move edx back into i

afterloop3:
		// The bound compared against here is beginx, not endx.
		mov         eax,[i]			; Move i into eax
		cmp         eax,[beginx]	; Is eax(i) greater than endx
		jg          outloop3		; Jump out of loop if true

		mov ecx,src		
    mov eax,[T]
		sub ecx,4
#ifdef ATHLON
		prefetch [ecx]
#else
		prefetchnta [ecx]
#endif
		sub data,8
		mov src,ecx
		
		// Same threshold test as in loop 1.
    movq mm6,[I2]
		PUNPCKLBW mm3, [ecx]	; Unpack lower bytes to words | mm3 = 00xx BBxx GGxx RRxx V
		movq mm4, mm2				; make a copy of mm2 (p^) U-pipe
		PSRLW mm3,8					; Cleanup mm3  | mm3 = 00XX 00BB 00GG 00RR (p^) V-pipe
		movq mm7, [eax]					; Move threhold into mm7 (p^) U-pipe
    pand mm3,mm6
		pxor mm6,mm6				; Clear mm6
		movq mm5, mm3				; make a copy of mm3 (np^) V-pipe
		psubusb mm4, mm3		; compute difference one way (p^) U-pipe
		psubusb mm5, mm2		; compute difference the other way (p^) V-pipe 
		por mm4, mm5				; or them together              | mm4 =0000 00BB 00GG 00RR  - absolute difference between the colors (np^) V
		pxor mm6,mm6				; Clear mm5

												; Test if any values goes beyond the threshold.
		PCMPGTW mm4,mm7			; is any RGB-values in mm4 greater than Absolute difference? (np^)		V
		psadbw mm6,mm4			;
		movd eax, mm6				;
    movq mm4,mm3    ;tested pixel
    movq mm5,mm2   ; This pixel
		cmp  eax,0					; compare eax to 0
		jg outloop3					; If all values are greater than threshold 

#ifdef ATHLON
		prefetch [data]
#else
		prefetchnta [data]
#endif

		// Same mutual accumulation as in loop 1.
		mov edx, data
		por mm4,[I1]								; Add one to get weight mm3 = 0001 0000 0000 0000
    movq mm6,[edx]
		por mm5,[I1]								; Add one to get weight mm3 = 0001 0000 0000 0000
    paddw mm0,mm4
    paddw mm6,mm5
    movq [edx],mm6
		jmp goloop3
		align 16
outloop3:
		
		jmp ydowngoloop
}

	// Final step. mm0 holds four 16-bit words: word 0 and 1 go to edi (red in
	// the low half, green in the high half), word 2 and 3 go to esi (blue in
	// the low half, count in the high half). eax = 65536/count is computed
	// once; every channel sum is multiplied by it and the bits above bit 15
	// of the product are the channel average. The result is packed as
	// 00BBGGRR and stored over the pixel at org_src.
	// CDQ leaves edx = 0 here because eax was just loaded with 65536.
	__asm {
//;********************** AVERAGE END ***************************/
	align 16
end_pixel:
	; edi=
		align 16
		movd edi,mm0
		psrlq mm0,32
		movd esi,mm0
    mov eax,65536
		mov ebx,esi
		shr ebx,16      ;edx=w
    CDQ
    div ebx         ; eax = 65536/divisor
    mov ebx,edi   ; ebx=R
    and ebx,0xffff ; ebx=r
    mov edx,edi   ; edx=G
    imul ebx,eax    ; ebx=R
    shr edx,16      ; edx=g
    shr ebx,16      ; ebx=r
    imul edx,eax    ; edx=G
    mov ecx,esi   ; ecx=B
    shr edx,16      ; ebx=g
    and ecx,0xffff ; ecx=b
    shl edx,8       ; edx=gg00
    imul ecx,eax    ; ecx=B
    or ebx,edx      ; ebx=0000GGRR
    and ecx,0xff0000
		mov eax,org_src
    or ebx, ecx     ; ebx=00BBGGRR
    mov [eax],ebx
	}
/*  
	if (0) {
		__asm {
assert:
		nop
		}
	_ASSERT(FALSE);
	}
*/  
}


/*
 * MMX_asm_pre_avg - expand every 32-bit source pixel into four 16-bit words.
 *
 * For each pixel the output receives, in order: byte 0, byte 1 and byte 2 of
 * the pixel (each zero-extended to 16 bits) followed by the constant 1.  The C
 * tail loop at the end of each row shows the exact layout; the assembler loop
 * builds the same four words for four pixels per iteration.
 *
 *   src    - first pixel of the first row.
 *   data   - output buffer, indexed as data[y*width + x].
 *   stride - distance between consecutive source rows, counted in 32-bit
 *            pixels (it is added to an unsigned int pointer).
 *   width  - pixels per row.
 *   height - number of rows.
 *
 * The assembler loop handles the first (width & ~3) pixels of a row; the
 * remaining (width & 3) pixels are handled in C.
 *
 * NOTE(review): the assembler path rounds its output pointer up to the next
 * 8-byte boundary with "and ebx,-8 / add ebx,8", which advances the pointer by
 * 8 bytes even when it is already aligned, whereas the C tail loop writes at
 * the unadjusted &data[y*width+lwidth].  The original comment says an extra
 * qword is allocated for this; whether the consumers expect the shifted
 * layout cannot be seen from here - confirm before changing either path.
 */
void MMX_asm_pre_avg(unsigned int *src,RGBDivMMX64 *data,int stride, int width, int height) {
	__int64 I1=0x0100000000000000;   // OR-ed into the top word; the later PSRLW by 8 turns it into 0x0001
	__int64 I2=0x0000ff00ff00ff00;   // keeps the high byte of the three low words, clears the top word
	int fraction=width&3;    // fraction = w%4
	int lwidth=width-fraction;    // width handled by assembler loop
	int lwidth_bytes=lwidth*4;    // same, in source bytes; loop bound for the ecx offset
	unsigned short *tlu;
	unsigned int *t_src=src;      // start of the current source row
	for (int y=0;y<height;y++) {

		// Start of this row in the output buffer (adjusted again inside the asm block).
		tlu=(unsigned short *)&data[y*width];
/*		for (int x=0;x<lwidth;x++) {   // Performs the same in plain mode - Debug purposes only
				int cpix=*(t_src+x);
					*tlu++=(cpix&0xff);
					*tlu++=(cpix&0xff00)>>8;
					*tlu++=(cpix&0xff0000)>>16;
					*tlu++=1;
		}
*/
		// eax=src
		// ebx=data
		// ecx:src offset
		// edx:data offset
		
	__asm {

		movq mm7,[I2]
		movq mm6,[I1]
		mov eax,t_src			; Align data to 8 byte boundary (extra qword is allocated - beware - nasty hack)
		mov ebx,tlu
		mov ecx,0
		and ebx,-8
		add ebx,8
		mov tlu,ebx
		mov edx,0
		jmp afterloop
		align 16
goloop:
		add ecx,16
		add edx,32
afterloop:
		cmp       ecx,[lwidth_bytes]	; Is eax(i) greater than endx
		jge       outloop		; Jump out of loop if true

#ifdef ATHLON
		prefetch [eax+ecx]
		prefetchw [ebx+edx]
#endif
		; Processes four pixels at the time - Completely pairable!!
		movq mm0,[eax+ecx]		;2 pixels
		movq mm1,[eax+ecx+8]	;4 pixels
		PUNPCKLBW mm2,mm0			;mm2=XXxx BBxx GGxx RRxx
		movq mm4,mm0
		PUNPCKLBW mm3,mm1			;mm3=XXxx BBxx GGxx RRxx
		movq mm5,mm1
		PSRLQ mm4,32						;mm4=mm0 upper pixel
		pand mm2,mm7					;mm2=0000 BB00 GG00	RR00
		PSRLQ mm5,32						;mm5=mm1 upper pixel		
		pand mm3,mm7					;mm4=0000 BB00 GG00	RR00
		PUNPCKLBW mm4,mm4			;mm4=XXxx BBxx GGxx RRxx
		por mm3,mm6						;mm3=0100 BB00 GG00 RR00
		PUNPCKLBW mm5,mm5			;mm5=XXxx BBxx GGxx RRxx
		por mm2,mm6						;mm2=0100 BB00 GG00 RR00 
		pand mm4,mm7					;mm4=0000 BB00 GG00	RR00
		PSRLW mm2,8						;mm2=0001 00BB 00GG 00RR mm2 ready
		pand mm5,mm7					;mm4=0000 BB00 GG00	RR00
		PSRLW mm3,8						;mm3=0001 00BB 00GG 00RR mm3 ready
		por mm4,mm6						;mm2=0100 BB00 GG00 RR00
		por mm5,mm6						;mm2=0100 BB00 GG00 RR00
		PSRLW mm4,8						;mm2=0001 00BB 00GG 00RR mm4 ready
		movq [ebx+edx],mm2
		PSRLW mm5,8						;mm2=0001 00BB 00GG 00RR mm5 ready
		movq [ebx+8+edx],mm4
		movq [ebx+16+edx],mm3
		movq [ebx+24+edx],mm5
		jmp goloop
outloop:				
		} // end __asm	
		// The asm block stored its adjusted pointer into tlu; it is overwritten here,
		// so the tail pixels go to the unadjusted position in data.
		tlu=(unsigned short *)&data[y*width+lwidth];  // Fill up if any pixels not moved
		for (int x=0;x<fraction;x++) {
				int cpix=*(t_src+x+lwidth);
					*tlu++=(cpix&0xff);
					*tlu++=(cpix&0xff00)>>8;
					*tlu++=(cpix&0xff0000)>>16;
					*tlu++=1;
		}

/*	
		
	if (0) {
		__asm {
assert:
		nop
		}
	_ASSERT(FALSE);
	}
*/
		t_src+=stride;   // advance to the next source row
	} // end for y
}


/*
 * MMX_asm_pre_wdw_avg - expand every 32-bit source pixel into four 32-bit values.
 *
 * For each pixel the output receives, in order: byte 0, byte 1 and byte 2 of
 * the pixel, each multiplied by 256, followed by the constant 256.  The C
 * branch that handles an odd trailing pixel shows the exact layout; the
 * assembler loop produces the same four values for two pixels per iteration.
 *
 *   src    - first pixel of the first row.
 *   data   - output buffer, indexed as data[y*width + x].
 *   stride - distance between consecutive source rows, counted in 32-bit
 *            pixels (it is added to an unsigned int pointer).
 *   width  - pixels per row.
 *   height - number of rows.
 *
 * NOTE(review): the assembler loop advances the output by 32 bytes per two
 * pixels, so it assumes sizeof(RGBDivMMX) == 16 - confirm against the struct
 * declaration.
 */
void MMX_asm_pre_wdw_avg(unsigned int *src,RGBDivMMX *data,int stride, int width, int height) {
	__int64 I1=0x0000010000000000;   // puts 0x0100 (256) into the third 16-bit word, i.e. the second dword
	__int64 I2=0x0000ff00ff00ff00;   // keeps the high byte of the three low words, clears the top word
	int fraction=width&1;    // fraction = w%2
	int lwidth=width-fraction;    // width handled by assembler loop
	int lwidth_bytes=lwidth*4;    // same, in source bytes; loop bound for the ecx offset
	unsigned short *tlu;
	unsigned int *t_src=src;      // start of the current source row
	for (int y=0;y<height;y++) {

	// Start of this row in the output buffer.
	tlu=(unsigned short *)&data[y*width];
		// eax=src
		// ebx=data
		// ecx:src offset
		// edx:data offset
		
	__asm {

		movq mm7,[I2]
		movq mm6,[I1]
		mov eax,t_src		
		mov ebx,tlu
		mov ecx,0
		mov edx,0
		jmp afterloop
		align 16
goloop:
		add ecx,8
		add edx,32
afterloop:
		cmp       ecx,[lwidth_bytes]	; Is eax(i) greater than endx
		jge       outloop		; Jump out of loop if true

#ifdef ATHLON
		prefetch [eax+ecx]
		prefetchw [ebx+edx]
#endif
		; Processes two pixels at the time - Very pairable (2 non-pairable)
		movq mm2,[eax+ecx]		;2 pixels
		PUNPCKLBW mm1,mm2			;mm1=xxxx BBxx GGxx RRxx p1
		PUNPCKHBW mm2,mm2			;mm2=xxxx BBxx GGxx RRxx p2 (np^)
		pand mm1,mm7					;mm1=0000 BB00 GG00 RR00 p1
		pand mm2,mm7					;mm2=0000 BB00 GG00 RR00 p2
		pshufw mm3,mm1,254				; mm3=0000 0000 0000 BB00 p1 s=11111110 = 254
		pshufw mm4,mm2,254				; mm4=0000 0000 0000 BB00 p2 s=11111110 = 254 (np^)
		por mm3,mm6						; mm3=0000 0100 0000 BB00 p1 mm3 ready
		por mm4,mm6						; mm4=0000 0100 0000 BB00 p2 mm4 ready
		movq [ebx+edx+8],mm3		
		pshufw mm1,mm1,220				; mm1=0000 GG00 0000 RR00 p1 s=11011100 = 220  mm1 ready
		movq [ebx+edx+24],mm4		
		pshufw mm2,mm2,220				; mm2=0000 GG00 0000 RR00 p2 s=11011100 = 220	 mm2 ready
		movq [ebx+edx],mm1
		movq [ebx+edx+16],mm2		
		jmp goloop
outloop:				
		} // end __asm	
		// An odd width leaves one pixel per row that the two-pixel asm loop did not touch.
		if (fraction) {  // pixel missing
			int cpix=*(t_src+lwidth);
			int *tlu2=(int *)&data[y*width+lwidth];
			*tlu2++=(cpix&0xff)<<8;
			*tlu2++=(cpix&0xff00);
			*tlu2++=(cpix&0xff0000)>>8;
			*tlu2++=256;
		}

	
/*		
	if (0) {
		__asm {
assert:
		nop
		}
	_ASSERT(FALSE);
	}
*/
		t_src+=stride;   // advance to the next source row
	} // end for y
}


/*
 * RunProcMMX - run the MMX blend kernel over every pixel of the source frame.
 *
 * Rebuilds the lookup tables when the packed settings key no longer matches
 * mfd->tabletests, optionally clears the per-pixel work buffer, then calls
 * MMX_asm_blend_ATH once per pixel with the filter window clipped to the
 * frame borders.
 *
 *   fa - filter activation; fa->filter_data holds the MyFilterData settings
 *        and fa->src describes the frame that is processed.
 *   ff - host function table (not used here).
 *
 * Returns 0 on success, 1 when the tables cannot be built or a required
 * table/buffer is missing.
 */
int RunProcMMX(const FilterActivation *fa, const FilterFunctions *ff) {
	MyFilterData *mfd = (MyFilterData *)fa->filter_data;
	const long		pitch = (fa->src.pitch)>>2;   // row pitch in 32-bit pixels (presumably src.pitch is in bytes)
	const PixDim	w = fa->src.w;
	const PixDim	h = fa->src.h;

	// Check if tables are up-to-date
	if (mfd->tabletests!=((mfd->diameter)|((mfd->window_scale)<<8)|(mfd->blendmode<<17)|(mfd->threshold<<24)|(mfd->mdiff<<4))) {
		if (buildtables(mfd,fa)) {
			_ASSERT(FALSE);
			return 1;
		}
	}
	if (!(mfd->weight&&mfd->dweight&&mfd->data)) return 1;

	// Fetch the work buffer only after the table check above, so a pointer
	// that buildtables() may have replaced is never used stale.
	RGBDivMMX *data = mfd->mmxdata;
	if (!data) return 1;

	// Previously this flag was left uninitialized for any blend mode other
	// than 0 or 1 (the dialog can select 2) and was then read.  Clearing the
	// buffer is the safe choice for those modes: it only costs time.
	const int blurpixels = (mfd->blendmode != 0);

	Pixel32 *src = fa->src.data;
	if (blurpixels) {
		// NOTE(review): the size uses sizeof(RGBDiv) although data is RGBDivMMX;
		// kept as in the original - confirm both structs have the same size.
		memset(data, 0, w*h*sizeof(RGBDiv));
	}

	// Threshold replicated into the three low 16-bit words for the MMX compare.
	const int threshold=mfd->threshold;
	T_val *t=new T_val;
	if (!t) return 1;
	t->T=(__int64)threshold|(__int64)(threshold<<16)|((__int64)(threshold)<<32);

	const int framesize=mfd->diameter;
	const int xtraframe=framesize>>1;   // Frame left of pixel

	for (int y=0; y<h;y++) {
		const int offset=y*pitch;  // To be used when referring to src
		const int unpoffset=y*w;   // Unpitched offset. To be used for the work buffer

		// Rows available below the current one, clipped to the frame.
		int endy=xtraframe-1;
		if (y+endy+1>h) endy=h-y-1;

		for (int x=0; x<w;x++) {
			int xright=xtraframe;  // last pixel of xright must be checked.
			if (xright+x>=w) {
				xright=w-x-1;
			}

			int xleft=xtraframe;  // last pixel of xleft must be checked.
			if (x-xleft<0) {
				xleft=x;
			}
			MMX_asm_blend_ATH((unsigned long *)&src[offset+x] , &data[unpoffset+x], xleft ,xright, endy, &t->T, pitch*4, mfd->weight , mfd->difftable, mfd->diffweight, framesize*4, w*16);
		}	 // end for x
	} // end for y

	delete t; t= NULL;
	__asm emms   // leave MMX state so the FPU is usable again
	return 0;
}



/*
 * RunProcMMXAvg - run the MMX averaging kernels over every pixel of the frame.
 *
 * Rebuilds the lookup tables when the packed settings key no longer matches
 * mfd->tabletests, pre-fills the work buffer from the source frame, then
 * calls one of two kernels once per pixel with the filter window clipped to
 * the frame borders:
 *   - diffweight set:   MMX_asm_pre_wdw_avg + MMX_asm_avg_wdw_ATH on mfd->mmxdata
 *   - diffweight clear: MMX_asm_pre_avg     + MMX_asm_average_ATH on mfd->mmxdata2
 *
 *   fa - filter activation; fa->filter_data holds the MyFilterData settings
 *        and fa->src describes the frame that is processed.
 *   ff - host function table (not used here).
 *
 * Returns 0 on success, 1 when the tables cannot be built or a required
 * table/buffer is missing.
 */
int RunProcMMXAvg(const FilterActivation *fa, const FilterFunctions *ff) {
	MyFilterData *mfd = (MyFilterData *)fa->filter_data;
	const long		pitch = (fa->src.pitch)>>2;   // row pitch in 32-bit pixels (presumably src.pitch is in bytes)
	const PixDim	w = fa->src.w;
	const PixDim	h = fa->src.h;

	// Check if tables are up-to-date
	if (mfd->tabletests!=((mfd->diameter)|((mfd->window_scale)<<8)|(mfd->blendmode<<17)|(mfd->threshold<<24)|(mfd->mdiff<<4))) {
		if (buildtables(mfd,fa)) {
			_ASSERT(FALSE);
			return 1;
		}
	}
	if (!(mfd->weight&&mfd->dweight&&mfd->data)) return 1;

	// Read the buffers only after the table check, so the pre-fill and the
	// per-pixel kernel always work on the same, current pointer.
	const int diffweight=mfd->diffweight;
	RGBDivMMX *data = mfd->mmxdata;
	RGBDivMMX64 *shortdata = mfd->mmxdata2;
	if (diffweight ? (data==NULL) : (shortdata==NULL)) return 1;

	Pixel32 *src = fa->src.data;

	// Seed the work buffer with the expanded source pixels.
	if (diffweight) {
		MMX_asm_pre_wdw_avg((unsigned  int*)src,data,pitch,w, h);
	} else {
		MMX_asm_pre_avg((unsigned  int*)src,shortdata,pitch,w, h);
	}

	// Threshold replicated into the three low 16-bit words for the MMX compare.
	const int threshold=mfd->threshold;
	T_val *t=new T_val;
	if (!t) return 1;
	t->T=(__int64)threshold|(__int64)(threshold<<16)|((__int64)(threshold)<<32);

	const int framesize=mfd->diameter;
	const int xtraframe=framesize>>1;   // Frame left of pixel

	for (int y=0; y<h;y++) {
		const int offset=y*pitch;  // To be used when referring to src
		const int unpoffset=y*w;   // Unpitched offset. To be used for the work buffers

		// Rows available below the current one, clipped to the frame.
		int endy=xtraframe-1;
		if (y+endy+1>h) endy=h-y-1;

		for (int x=0; x<w;x++) {
			int xright=xtraframe;  // last pixel of xright must be checked.
			if (xright+x>=w) {
				xright=w-x-1;
			}

			int xleft=xtraframe;  // last pixel of xleft must be checked.
			if (x-xleft<0) {
				xleft=x;
			}

			if (!diffweight) {
				MMX_asm_average_ATH((unsigned long *)&src[offset+x] , &shortdata[unpoffset+x], xleft ,xright, endy, &t->T, pitch*4, mfd->difftable, mfd->diffweight, framesize*4, w*8);
			} else {
				MMX_asm_avg_wdw_ATH((unsigned long *)&src[offset+x] , &data[unpoffset+x], xleft ,xright, endy, &t->T, pitch*4, mfd->difftable, mfd->diffweight, framesize*4, w*16);
			}
		}	 // end for x
	} // end for y

	delete t; t= NULL;
	__asm emms   // leave MMX state so the FPU is usable again
	return 0;
}

/*
	YUV2 processing - design notes

	First try. NOTE: Y1 is NOT tested against Y2.

	Data store struct:
		(for each pixel in image):   Int64 = (int)Y, (short)weight, (short)weight2
		(for each pixel in image/2): Int64 = (int)U, (int)V

	For each pixel:
		Move 2 pixels (32 bits) into mm2: VYUY
		Unpack into 4 words: 00VV 00YY00 UU00 YY00

	For each pixel in window:
		Test if UV passes. If not, move on.
		Test if YY 1 passes. If so, add Y1 and UV to the current pixel. If not, move on.
		Weight1=(frameweight*Y-difference)/256
		Test if YY 2 passes. If so, add Y1 and UV to the current (and tested) pixel. If not, move on.
		Weight2=(frameweight*Y-difference)/256

	Finalize pixel (each pixel):
		UV-weight=(weight1+weight2)/2

	Second try.

	Data store struct:
		(for each pixel in image): Int64,Int64 = (int)Y, (int)U, (int)V, (int)weight

	Even pixels:
		For each Y pixel:
			(notes end here)

*/

/*

  for (int y=0; y<h;y++) {
    int offset=y*pitch;  // To be used when referring to src or dst
		int unpoffset=y*w;   // Unpitched offset. To be used for redsum, greensum,  bluesum & divisor
    for (int x=0; x<w;x++) {
			Pixel32 testpix=*npixp;
			if (x&1) {
				int cY=(testpix&0xff000000)>>24;
				int cU=(testpix&0xff0000)>>16;
				int cV=(testpix&0xff);

			} else {
				int cY=(testpix&0xff000000)>>8;
				int cU=(testpix&0xff0000)>>16;
				int cV=(testpix&0xff);
				boolean moveon=true;
				npixp++;  // Next pixel is in next index
				x++;
				while (moveon) {
					// test odd first
					int tY=(testpix&0xff000000)>>24;
					int tU=(testpix&0xff0000)>>16;
					int tV=(testpix&0xff);

					int deltaY=abs(tY - cY);
					int deltaU=abs(tU - cU);
					int deltaV=abs(tV - cV);
					if ((deltaY>Ythreshold) || (deltaU>threshold) || (deltaV>threshold) ) {
						moveon=false;
					} else {
						int weight=difftable[deltaY]*weighttable[offset+x];
						
					}
				} // end while moveon
			} // End if x&1
		}
	}	

*/
// Module entry points, exported from the DLL with C linkage.
// VirtualdubFilterModuleInit2 registers the filter definition with the host;
// VirtualdubFilterModuleDeinit removes it again.
extern "C" int __declspec(dllexport) __cdecl VirtualdubFilterModuleInit2(FilterModule *fm, const FilterFunctions *ff, int& vdfd_ver, int& vdfd_compat);
extern "C" void __declspec(dllexport) __cdecl VirtualdubFilterModuleDeinit(FilterModule *fm, const FilterFunctions *ff);

// Handle returned by ff->addFilter() during module init; handed back to
// ff->removeFilter() during module deinit.
static FilterDefinition *fd_tutorial;

/*
 * Module init: registers filterDef_tutorial with the host and reports the
 * filter-definition versions this module was built against.
 *
 * Returns 0 on success, 1 when the host's addFilter() returns no handle.
 * vdfd_ver and vdfd_compat are written only on success.
 */
int __declspec(dllexport) __cdecl VirtualdubFilterModuleInit2(FilterModule *fm, const FilterFunctions *ff, int& vdfd_ver, int& vdfd_compat) {
	fd_tutorial = ff->addFilter(fm, &filterDef_tutorial, sizeof(FilterDefinition));
	if (fd_tutorial == NULL) {
		return 1;
	}

	vdfd_ver = VIRTUALDUB_FILTERDEF_VERSION;
	vdfd_compat = VIRTUALDUB_FILTERDEF_COMPATIBLE;
	return 0;
}

/*
 * Module deinit: removes the filter that VirtualdubFilterModuleInit2
 * registered.  Does nothing when no registration handle is held (init failed
 * or deinit already ran), and clears the handle so it cannot be removed twice.
 */
void __declspec(dllexport) __cdecl VirtualdubFilterModuleDeinit(FilterModule *fm, const FilterFunctions *ff) {
	if (fd_tutorial == NULL) {
		return;
	}
	ff->removeFilter(fd_tutorial);
	fd_tutorial = NULL;
}

/*
 * InitProc - load the default settings into the filter's MyFilterData.
 * Always returns 0.
 */
int InitProc(FilterActivation *fa, const FilterFunctions *ff) {
	MyFilterData *const settings = (MyFilterData *)fa->filter_data;

	// Window size and strength.
	settings->diameter     = 5;
	settings->threshold    = 50;
	settings->window_scale = 254;
	settings->mdiff        = 0;

	// Mode switches.
	settings->blendmode  = 1;
	settings->diffweight = TRUE;
	settings->interlaced = FALSE;
	settings->testgrey   = FALSE;

	// The RunProc functions compare this against a key packed from the
	// settings and rebuild their tables on mismatch; -1 is the initial value.
	settings->tabletests = -1;

	return 0;
}

BOOL CALLBACK ConfigDlgProc(HWND hdlg, UINT msg, WPARAM wParam, LPARAM lParam) {
	MyFilterData *mfd = (MyFilterData *)GetWindowLong(hdlg, DWL_USER);
	
	switch(msg) {
		case WM_INITDIALOG:
			SetWindowLong(hdlg, DWL_USER, lParam);
			mfd = (MyFilterData *)lParam;
			HWND hWnd;
			
			CheckDlgButton(hdlg, IDC_INTERLACED, mfd->interlaced ? BST_CHECKED : BST_UNCHECKED);
			
			hWnd = GetDlgItem(hdlg, IDC_SDIAMETER);
			SendMessage(hWnd, TBM_SETRANGE, (WPARAM)TRUE, MAKELONG(1, 6));
			SendMessage(hWnd, TBM_SETPOS, (WPARAM)TRUE, mfd->diameter/2);
			SetDlgItemInt(hdlg, IDC_DIAMETER, mfd->diameter, FALSE);
			
			hWnd = GetDlgItem(hdlg, IDC_STHRESHOLD);
			SendMessage(hWnd, TBM_SETRANGE, (WPARAM)TRUE, MAKELONG(0, 200));
			SendMessage(hWnd, TBM_SETPOS, (WPARAM)TRUE, mfd->threshold);
			SetDlgItemInt(hdlg, IDC_THRESHOLD, mfd->threshold, FALSE);
			
			hWnd = GetDlgItem(hdlg, IDC_SAMOUNT);
			SendMessage(hWnd, TBM_SETRANGE, (WPARAM)TRUE, MAKELONG(1, 254));
			SendMessage(hWnd, TBM_SETPOS, (WPARAM)TRUE, mfd->window_scale);
			SetDlgItemInt(hdlg, IDC_AMOUNT, mfd->window_scale, FALSE);
			
      mfd->ifp->InitButton(GetDlgItem(hdlg, IDPREVIEW));
      CheckDlgButton(hdlg, IDC_TESTGREY, mfd->testgrey ? BST_CHECKED : BST_UNCHECKED);
			// OLD AVERAGE TRANSPARENT
			CheckDlgButton(hdlg, IDC_OLD, mfd->blendmode == 0 ? BST_CHECKED : BST_UNCHECKED);
			CheckDlgButton(hdlg, IDC_AVERAGE, mfd->blendmode == 1 ? BST_CHECKED : BST_UNCHECKED);
			CheckDlgButton(hdlg, IDC_TRANSPARENT, mfd->blendmode == 2 ? BST_CHECKED : BST_UNCHECKED);
			CheckDlgButton(hdlg, IDC_DIFF, mfd->diffweight ? BST_CHECKED : BST_UNCHECKED);
			CheckDlgButton(hdlg, IDC_INTERLACED, mfd->interlaced ? BST_CHECKED : BST_UNCHECKED);

			hWnd = GetDlgItem(hdlg, IDC_SMDIFF);
			SendMessage(hWnd, TBM_SETRANGE, (WPARAM)TRUE, MAKELONG(0, mfd->threshold));
			SendMessage(hWnd, TBM_SETPOS, (WPARAM)TRUE, mfd->mdiff);
			SetDlgItemInt(hdlg, IDC_MDIFF, mfd->mdiff, FALSE);
			
			if (mfd->blendmode==0) {
				hWnd = GetDlgItem(hdlg, IDC_SAMOUNT);
				EnableWindow(hWnd, FALSE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTMINUS);
				EnableWindow(hWnd, FALSE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTPLUS);
				EnableWindow(hWnd, FALSE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNT);
				EnableWindow(hWnd, FALSE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTTEXT);
				EnableWindow(hWnd, FALSE);
			} else if ((mfd->blendmode==1) || (mfd->blendmode==2)) {
				if (mfd->blendmode==1) {
					EnableWindow(GetDlgItem(hdlg, IDC_DIFF), TRUE);
				} else {
					EnableWindow(GetDlgItem(hdlg, IDC_DIFF), FALSE);
				}
				hWnd = GetDlgItem(hdlg, IDC_SAMOUNT);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTMINUS);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTPLUS);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNT);
				EnableWindow(hWnd, TRUE);
			}
				int enabled;
				if (mfd->diffweight) {
					enabled=TRUE;
				} else {
					enabled=FALSE;
				}
				hWnd = GetDlgItem(hdlg, IDC_SMDIFF);
				EnableWindow(hWnd, enabled);
				hWnd = GetDlgItem(hdlg, IDC_MDIFFMINUS);
				EnableWindow(hWnd, enabled);
				hWnd = GetDlgItem(hdlg, IDC_MDIFFPLUS);
				EnableWindow(hWnd, enabled);
				hWnd = GetDlgItem(hdlg, IDC_MDIFF);
				EnableWindow(hWnd, enabled);
				hWnd = GetDlgItem(hdlg, IDC_MDIFFTEXT);
				EnableWindow(hWnd, enabled);
			
			return TRUE;
		case WM_HSCROLL:
			if ((HWND) lParam == GetDlgItem(hdlg, IDC_SDIAMETER))
			{
				int diameter = SendMessage(GetDlgItem(hdlg, IDC_SDIAMETER), TBM_GETPOS, 0, 0) * 2 + 1;
				if (diameter != mfd->diameter)
				{
					mfd->diameter = diameter;
					SetDlgItemInt(hdlg, IDC_DIAMETER, mfd->diameter, FALSE);
					mfd->ifp->RedoFrame();
				}
			}
			else if ((HWND) lParam == GetDlgItem(hdlg, IDC_STHRESHOLD))
			{
				int threshold = SendMessage(GetDlgItem(hdlg, IDC_STHRESHOLD), TBM_GETPOS, 0, 0);
				if (threshold != mfd->threshold)
				{
					mfd->threshold = threshold;
					if (mfd->threshold == 0) mfd->threshold = 1;					
					SetDlgItemInt(hdlg, IDC_THRESHOLD, mfd->threshold, FALSE);

					if (mfd->mdiff>mfd->threshold) mfd->mdiff=mfd->threshold;
					hWnd = GetDlgItem(hdlg, IDC_SMDIFF);
					SendMessage(hWnd, TBM_SETRANGE, (WPARAM)TRUE, MAKELONG(0, mfd->threshold));
					SendMessage(hWnd, TBM_SETPOS, (WPARAM)TRUE, mfd->mdiff);
					SetDlgItemInt(hdlg, IDC_MDIFF, mfd->mdiff, FALSE);

					mfd->ifp->RedoFrame();
				}
			}
			else if ((HWND) lParam == GetDlgItem(hdlg, IDC_SAMOUNT))
			{
				int amount = SendMessage(GetDlgItem(hdlg, IDC_SAMOUNT), TBM_GETPOS, 0, 0);
				if (amount != mfd->window_scale)
				{
					mfd->window_scale = amount;
					if (mfd->window_scale == 0) mfd->window_scale = 1;
					SetDlgItemInt(hdlg, IDC_AMOUNT, mfd->window_scale, FALSE);
					mfd->ifp->RedoFrame();
				}
			}
			else if ((HWND) lParam == GetDlgItem(hdlg, IDC_SMDIFF))
			{
				int amount = SendMessage(GetDlgItem(hdlg, IDC_SMDIFF), TBM_GETPOS, 0, 0);
				if (amount != mfd->mdiff)
				{
					mfd->mdiff = amount;
					SetDlgItemInt(hdlg, IDC_MDIFF, mfd->mdiff, FALSE);
					mfd->ifp->RedoFrame();
				}
			}
			break;
		case WM_COMMAND:
			switch(LOWORD(wParam))
			{
			case IDPREVIEW:
				mfd->ifp->Toggle(hdlg);
				break;
			case IDOK:
				EndDialog(hdlg, 0);
				return TRUE;
			case IDHELP:
				{
					char prog[256];
					char path[256];
					LPTSTR ptr;
					GetModuleFileName(NULL, prog, 255);
					GetFullPathName(prog, 255, path, &ptr);
					*ptr = 0;
					strcat(path, "plugins\\smoothhiq.html");
					ShellExecute(hdlg, "open", path, NULL, NULL, SW_SHOWNORMAL);
					return TRUE;
				}
			case IDCANCEL:
				EndDialog(hdlg, 1);
				return TRUE;
			case IDC_INTERLACED:
				mfd->interlaced = !!IsDlgButtonChecked(hdlg, IDC_INTERLACED);
				mfd->ifp->RedoFrame();
				break;
			case IDC_TESTGREY:
				mfd->testgrey = !!IsDlgButtonChecked(hdlg, IDC_TESTGREY);
				mfd->ifp->RedoFrame();
				break;
			case IDC_DIFF:
				mfd->diffweight = !!IsDlgButtonChecked(hdlg, IDC_DIFF);
				mfd->ifp->RedoFrame();
				int enabled;
				if (mfd->diffweight) {
					enabled=TRUE;
				} else {
					enabled=FALSE;
				}
				hWnd = GetDlgItem(hdlg, IDC_SMDIFF);
				EnableWindow(hWnd, enabled);
				hWnd = GetDlgItem(hdlg, IDC_MDIFFMINUS);
				EnableWindow(hWnd, enabled);
				hWnd = GetDlgItem(hdlg, IDC_MDIFFPLUS);
				EnableWindow(hWnd, enabled);
				hWnd = GetDlgItem(hdlg, IDC_MDIFF);
				EnableWindow(hWnd, enabled);
				hWnd = GetDlgItem(hdlg, IDC_MDIFFTEXT);
				EnableWindow(hWnd, enabled);
				break;
			case IDC_DIAMETERPLUS:
				if (mfd->diameter < 13)
				{
					mfd->diameter += 2;
					SetDlgItemInt(hdlg, IDC_DIAMETER, mfd->diameter, FALSE);
					SendMessage(GetDlgItem(hdlg, IDC_SDIAMETER), TBM_SETPOS, (WPARAM)TRUE, mfd->diameter/2);
					mfd->ifp->RedoFrame();
				}
				break;
			case IDC_DIAMETERMINUS:
				if (mfd->diameter > 3)
				{
					mfd->diameter -= 2;
					SetDlgItemInt(hdlg, IDC_DIAMETER, mfd->diameter, FALSE);
					SendMessage(GetDlgItem(hdlg, IDC_SDIAMETER), TBM_SETPOS, (WPARAM)TRUE, mfd->diameter/2);
					mfd->ifp->RedoFrame();
				}
				break;
			case IDC_THRESHOLDPLUS:
				if (mfd->threshold < 200)
				{
					mfd->threshold += 1;
					SetDlgItemInt(hdlg, IDC_THRESHOLD, mfd->threshold, FALSE);
					SendMessage(GetDlgItem(hdlg, IDC_STHRESHOLD), TBM_SETPOS, (WPARAM)TRUE, mfd->threshold);
					if (mfd->mdiff>mfd->threshold) mfd->mdiff=mfd->threshold;
					hWnd = GetDlgItem(hdlg, IDC_SMDIFF);
					SendMessage(hWnd, TBM_SETRANGE, (WPARAM)TRUE, MAKELONG(0, mfd->threshold));
					SendMessage(hWnd, TBM_SETPOS, (WPARAM)TRUE, mfd->mdiff);
					SetDlgItemInt(hdlg, IDC_MDIFF, mfd->mdiff, FALSE);
					mfd->ifp->RedoFrame();
				}
				break;
			case IDC_THRESHOLDMINUS:
				if (mfd->threshold > 0)
				{
					mfd->threshold -= 1;
					SetDlgItemInt(hdlg, IDC_THRESHOLD, mfd->threshold, FALSE);
					SendMessage(GetDlgItem(hdlg, IDC_STHRESHOLD), TBM_SETPOS, (WPARAM)TRUE, mfd->threshold);

					if (mfd->mdiff>mfd->threshold) mfd->mdiff=mfd->threshold;
					hWnd = GetDlgItem(hdlg, IDC_SMDIFF);
					SendMessage(hWnd, TBM_SETRANGE, (WPARAM)TRUE, MAKELONG(0, mfd->threshold));
					SendMessage(hWnd, TBM_SETPOS, (WPARAM)TRUE, mfd->mdiff);
					SetDlgItemInt(hdlg, IDC_MDIFF, mfd->mdiff, FALSE);

					mfd->ifp->RedoFrame();
				}
				break;
			case IDC_AMOUNTPLUS:
				if (mfd->threshold < 254)
				{
					mfd->window_scale += 1;
					SetDlgItemInt(hdlg, IDC_AMOUNT, mfd->window_scale, FALSE);
					SendMessage(GetDlgItem(hdlg, IDC_SAMOUNT), TBM_SETPOS, (WPARAM)TRUE, mfd->window_scale);
					mfd->ifp->RedoFrame();
				}
				break;
			case IDC_AMOUNTMINUS:
				if (mfd->window_scale > 0)
				{
					mfd->window_scale -= 1;
					SetDlgItemInt(hdlg, IDC_AMOUNT, mfd->window_scale,FALSE);
					SendMessage(GetDlgItem(hdlg, IDC_AMOUNT), TBM_SETPOS, (WPARAM)TRUE, mfd->window_scale);
					mfd->ifp->RedoFrame();
				}
				break;
			case IDC_MDIFFPLUS:
				if (mfd->mdiff < mfd->threshold)
				{
					mfd->threshold += 1;
					SetDlgItemInt(hdlg, IDC_MDIFF, mfd->mdiff, FALSE);
					SendMessage(GetDlgItem(hdlg, IDC_SMDIFF), TBM_SETPOS, (WPARAM)TRUE, mfd->mdiff);
					mfd->ifp->RedoFrame();
				}
				break;
			case IDC_MDIFFMINUS:
				if (mfd->mdiff > 0)
				{
					mfd->mdiff -= 1;
					SetDlgItemInt(hdlg, IDC_MDIFF, mfd->mdiff,FALSE);
					SendMessage(GetDlgItem(hdlg, IDC_MDIFF), TBM_SETPOS, (WPARAM)TRUE, mfd->mdiff);
					mfd->ifp->RedoFrame();
				}
				break;
      case IDC_OLD:
        mfd->blendmode=0;
				hWnd = GetDlgItem(hdlg, IDC_SAMOUNT);
				EnableWindow(hWnd, FALSE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTMINUS);
				EnableWindow(hWnd, FALSE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTPLUS);
				EnableWindow(hWnd, FALSE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNT);
				EnableWindow(hWnd, FALSE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTTEXT);
				EnableWindow(hWnd, FALSE);

				mfd->ifp->RedoFrame();
        break;
      case IDC_AVERAGE:
        mfd->blendmode=1;
				hWnd = GetDlgItem(hdlg, IDC_SAMOUNT);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTMINUS);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTPLUS);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNT);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTTEXT);
				EnableWindow(hWnd, TRUE);

				mfd->ifp->RedoFrame();
        break;
      case IDC_TRANSPARENT:
        mfd->blendmode=2;
				hWnd = GetDlgItem(hdlg, IDC_SAMOUNT);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTMINUS);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTPLUS);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNT);
				EnableWindow(hWnd, TRUE);
				hWnd = GetDlgItem(hdlg, IDC_AMOUNTTEXT);
				EnableWindow(hWnd, TRUE);
				EnableWindow(GetDlgItem(hdlg, IDC_DIFF), FALSE);
				mfd->ifp->RedoFrame();
        break;
			}
			break;
	}
	
	return FALSE;
}

/*
 * ConfigProc - show the modal configuration dialog for this filter instance.
 *
 * A snapshot of the settings is taken before the dialog opens.  ConfigDlgProc
 * edits the live settings directly, so a non-zero dialog result (Cancel ends
 * the dialog with 1) is undone by copying the snapshot back.
 *
 * Returns FALSE when the dialog result was 0 (OK), TRUE otherwise.
 */
int ConfigProc(FilterActivation *fa, const FilterFunctions *ff, HWND hwnd)
{
	MyFilterData *settings = (MyFilterData *) fa->filter_data;
	MyFilterData snapshot = *settings;

	// The dialog needs the preview interface; it is set after the snapshot,
	// so restoring the snapshot also restores the previous ifp value.
	settings->ifp = fa->ifp;

	if (DialogBoxParam(fa->filter->module->hInstModule,
		MAKEINTRESOURCE(IDD_FILTER), hwnd,
		ConfigDlgProc, (LPARAM) settings) == 0)
	{
		return FALSE;
	}

	*settings = snapshot;
	return TRUE;
}

void StringProc(const FilterActivation *fa, const FilterFunctions *ff, char *str) {
	MyFilterData *mfd = (MyFilterData *)fa->filter_data;
	
	sprintf(str, " (diam %d, thr %d, amount %d)",
				mfd->diameter, mfd->threshold, mfd->window_scale);
}
