// xBRZ image upscaler/filter - modified for the Kega Fusion Emulator
// Version 1.1

// Original xbrz.cpp :
// ****************************************************************************
// * This file is part of the HqMAME project. It is distributed under         *
// * GNU General Public License: http://www.gnu.org/licenses/gpl.html         *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
// *                                                                          *
// * Additionally and as a special exception, the author gives permission     *
// * to link the code of this program with the MAME library (or with modified *
// * versions of MAME that use the same license as MAME), and distribute      *
// * linked combinations including the two. You must obey the GNU General     *
// * Public License in all respects for all of the code used other than MAME. *
// * If you modify this file, you may extend this exception to your version   *
// * of the file, but you are not obligated to do so. If you do not wish to   *
// * do so, delete this exception statement from your version.                *
// ****************************************************************************

// rpi-plugin creation and C++11 unraveling and some cleaning of the original code
// by "milo1012" (milo1012 AT freenet DOT de)
// -> no fundamental code changes besides speedups where possible and
//    removing lambda functions, some preprocessor mess, <cstdint> (uint32_t)
//    and compressing whitespace (tabs instead of spaces, bracket style)
// (do we really need lambda functions for this small code and break compatibility
//  with the majority of the decent, well-tried, but older C++ compilers out there?
//  I don't think so, we can even get faster w/o it)
//
// detailed changes (Version 1.1):
// - aggressive speedup by removing unnecessary sub-routines for color
//   distance function and converting to float -> double precision is overkill
//   for an 8 bit resolution color model -> still faster, even on modern architectures
//   -> same function: we probably don't want to calculate three subtractions
//   and two divisions with known constants in every call -> unfold/precalc it
// - removal of elaborate fillBlock() and sub functions -> in place seems faster
// - removal of std::max and std::min -> unnecessary: "std::max()" == "(a<b)?b:a"
//   -> "((x-1<0)?(0):(x-1))" can be changed to "x-((x>0)?(1):(0))" or similar
//   -> definitely less code produced, theoret. faster
// - removal of ScalerCfg() and cfg vars -> we use xBRZ defaults anyway
// - removal of safety checks at scaleImage() start
// - float compared as int where possible -> measurably faster on 32-bit
//   (maybe not in 64-bit, but we don't have that for rpi anyway)
// - aligned struct Kernel_4x4 on 16 byte boundaries
//   -> might be faster on some SSE machines (doesn't hurt)
// - using original Kernel_4x4 struct instead of Kernel_3x3 for scalePixel() and all necessary subfunctions
//   -> avoiding the unnecessary nested copy for each pixel/kernel speeds things up by a percent or so
// - moving preProcBuffer to a newly created char-array
//   -> writing directly to a framebuffer/texture more than necessary (not "filling" it for real)
//      can cause enormous slowdowns (depending on render type, but definitely for Direct3D)
//   -> also CPU cache can benefit better
// - using an output buffer for each line and copy it to output frame afterwards,
//   -> line-wise with memcpy instead of direct manipulation in framebuffer/texture
// ... overall ~15-30 % faster than original code (depending on machine and compiler)
//
// currently only for Windows
// -> for other systems: replace the stuff in DllMain() and change or remove
//    the threading in scale() and RenderPluginOutput() first


//////////////
// main config: scaling factor (2,3,4) and threading
// don't use other values for _SCALER_ as above (5 not implemented)
// NOTE(review): the place where _SCALER_ selects the scaler policy is not in this part of the file - confirm there
const unsigned char _SCALER_ = 4;
// number of threads = image slices scaled parallel in xBRZ - src image size
// must be a multiple of it! - prob. no more gain above 4 slices
const unsigned char NUM_SLICE = 4;
// defined/undefined = enable/disable threaded scaling
// as written, the #define is cancelled by the #undef directly below it, so threading is DISABLED;
// remove or comment out the #undef line to enable it
#define _XBRZ_MT_
#undef _XBRZ_MT_
// use lookup table for 24 -> 16/15 (reverse) conversion for a ~5 percent speedup
// -> would be 64 MiB(!) memory for every plugin version loaded, therefore optional
// same toggle pattern as above: the #undef leaves the reverse LUT DISABLED, in which case
// scaleImage() converts each output pixel with shifts and masks instead
#define _XBRZ_RLUT_
#undef _XBRZ_RLUT_
//////////////
// max width - needed for otherwise non-const stack-array init for line output buffer
// worst case for kega: 320px -> 384 for overhead and alignment (multiple of 128 - cacheline etc.)
// results in 24 kiB buffer for 4x scaler, 13.5 for 3x, 6 for 2x
// (= MAXW * scale * scale * sizeof(unsigned int), see out_tmp in scaleImage())
// MAXW also sizes the per-column preProcBuffer in scaleImage(), so the source width must not exceed it
#define MAXW 384
//////////////


#if (defined _WINDOWS || defined _WIN32)
	#define WIN32_LEAN_AND_MEAN
	//#define NOMINMAX // if using std::max or std::min
	#include <windows.h>
	//#undef NOMINMAX
#endif
#include <complex> // for sqrt() - <cmath> also works (identical internal stdlib function)
#include <cstring> // for memcpy()
//#include <algorithm> // if using std::max or std::min
//////////////
// FORCE_INLINE: strongest inlining request the compiler offers;
// falls back to plain "inline" for compilers other than MSVC and GCC-compatible ones
#ifdef _MSC_VER
#define FORCE_INLINE __forceinline
#elif defined __GNUC__
#define FORCE_INLINE __attribute__((always_inline)) inline
#else
#define FORCE_INLINE inline
#endif
//////////////
// ALIGNED16: type attribute requesting 16 byte alignment (used for struct Kernel_4x4);
// expands to nothing for unknown compilers, i.e. the alignment is an optimization only
// note: the GCC variant additionally specifies "packed"
#ifdef _MSC_VER
#define ALIGNED16 __declspec(align(16))
#elif defined __GNUC__
#define ALIGNED16 __attribute__((aligned (16),packed))
#else
#define ALIGNED16
#endif


////////////////////////////////////////////////////////////////////////////////
// Kega stuff
// stand-ins for the Windows types used below when <windows.h> is not available
// NOTE(review): "unsigned long" is 64 bit on LP64 platforms, whereas the Windows DWORD is 32 bit
//               - confirm the sizes before relying on these typedefs for a port
#if !(defined _WINDOWS || defined _WIN32)
	typedef unsigned long DWORD;
	typedef unsigned short WORD;
	typedef unsigned char BYTE;
	typedef void* HMODULE;
#endif

// per-frame parameter block handed to RenderPluginOutput()
// the layout is dictated by the host application - do not reorder or resize members
// NOTE(review): the member descriptions below are inferred from the names only; the code that
//               reads this struct is not in this part of the file - verify against the host's plugin API
typedef struct {
	unsigned long Size;     // presumably sizeof this struct, for versioning
	unsigned long Flags;    // presumably RPI_* flags describing the current pixel format
	void* SrcPtr;           // presumably the source frame
	unsigned long SrcPitch; // presumably the source row stride (unit to be confirmed)
	unsigned long SrcW;     // presumably the source width in pixels
	unsigned long SrcH;     // presumably the source height in pixels
	void* DstPtr;           // presumably the destination frame/texture
	unsigned long DstPitch; // presumably the destination row stride (unit to be confirmed)
	unsigned long DstW;     // presumably the available destination width
	unsigned long DstH;     // presumably the available destination height
	unsigned long OutW;     // presumably filled in by the plugin: width actually written
	unsigned long OutH;     // presumably filled in by the plugin: height actually written
} RENDER_PLUGIN_OUTP;

// signature of the per-frame render callback (matches RenderPluginOutput() declared below)
typedef void (*RENDPLUG_Output)(RENDER_PLUGIN_OUTP *);

// plugin description; a pointer to it is the return type of RenderPluginGetInfo()
// the layout is dictated by the host application - do not reorder or resize members
typedef struct {
	char Name[60];          // plugin name (fixed 60 byte buffer)
	unsigned long Flags;    // presumably a combination of the RPI_* values defined below - TODO confirm
	HMODULE Handle;         // module handle; NOTE(review): who sets it is not visible in this part of the file
	RENDPLUG_Output Output; // render callback
} RENDER_PLUGIN_INFO;

// signature of the info query entry point (matches RenderPluginGetInfo() declared below)
typedef RENDER_PLUGIN_INFO*(*RENDPLUG_GetInfo)(void);

// plugin API constants
// NOTE(review): meanings are inferred from the names (MMX usage, supported pixel formats 555/565/888,
//               output scale factor) - verify against the host's plugin API documentation
// observation: RPI_OUT_SCL1..4 are the numbers 1..4 stored in bits 16-18, not independent bit flags
//              (RPI_OUT_SCL3 == RPI_OUT_SCL1 | RPI_OUT_SCL2), so compare them after masking instead of testing bits
#define	RPI_VERSION		0x02
#define	RPI_MMX_USED	0x000000100
#define	RPI_MMX_REQD	0x000000200
#define	RPI_555_SUPP	0x000000400
#define	RPI_565_SUPP	0x000000800
#define	RPI_888_SUPP	0x000001000
#define	RPI_DST_WIDE	0x000008000
#define	RPI_OUT_SCL1	0x000010000
#define	RPI_OUT_SCL2	0x000020000
#define	RPI_OUT_SCL3	0x000030000
#define	RPI_OUT_SCL4	0x000040000

// the two entry points exported from the DLL; extern "C" keeps the exported names unmangled
// they are only declared on Windows - see the porting note in the file header for other systems
#if (defined _WINDOWS || defined _WIN32)
	extern "C" __declspec(dllexport) RENDER_PLUGIN_INFO* RenderPluginGetInfo(void);
	extern "C" __declspec(dllexport) void RenderPluginOutput(RENDER_PLUGIN_OUTP* rpo);
#endif
////////////////////////////////////////////////////////////////////////////////
// end Kega stuff


////////////////////////////////////////////////////////////////////////////////
// global vars
// state shared between the plugin entry points and the scaler
// NOTE(review): these are assigned outside this part of the file (presumably per frame in RenderPluginOutput()) - confirm
WORD* picture_input; // presumably the current source frame (16 bit pixels)
WORD* picture_outdst; // presumably the current destination frame (16 bit pixels)
RENDER_PLUGIN_INFO MyRPI; // presumably the info block handed out by RenderPluginGetInfo()
unsigned int mask_g; // bitmask for 24 -> 15/16 conversion (either 555 or 565); applied to the green channel before ">> 5" in scaleImage()
char shift_r; // shift for 24 -> 15/16 conversion (either 555 or 565); right-shift applied to the red channel in scaleImage()
unsigned int picture_srcheight;
unsigned int picture_srcwidth;
unsigned long picture_dstpitch; // destination row stride; scaleImage() adds it to a WORD*, i.e. it counts 16 bit pixels, not bytes
unsigned long picture_srcpitch; // source row stride; scaleImage() adds it to a WORD*, i.e. it counts 16 bit pixels, not bytes
unsigned int* _table16_24; // color-LUT for 565
unsigned int* _table15_24; // color-LUT for 555
unsigned short* _table24_16; // reverse color-LUT for 565
unsigned short* _table24_15; // reverse color-LUT for 555
unsigned int* colortable; // color-LUT; scaleImage() indexes it with the 16 bit source pixel to get the 24 bit color
unsigned short* colortable_o; // reverse color-LUT; indexed with the 24 bit color, only used if _XBRZ_RLUT_ is defined
// bit pattern of the float 30.0F stored as int: distYCbCr_c() compares the bit pattern of a
// (non-negative) color distance against it with an integer compare
int equalColorTolerance_ = 0x41f00000; // =30.F for fast int-compare (32 bit)
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////


// forward declaration of the scaler entry point (defined outside this part of the file)
// with _XBRZ_MT_ defined it has the Win32 thread-procedure signature (DWORD WINAPI f(LPVOID)),
// otherwise it is a plain function without parameters
namespace xbrz {
#ifdef _XBRZ_MT_
	DWORD WINAPI scale(LPVOID);
#else
	void scale();
#endif
} // namespace


namespace {

//channel masks for colors stored as 0x00RRGGBB
const unsigned int redMask   = 0x00ff0000U;
const unsigned int greenMask = 0x0000ff00U;
const unsigned int blueMask  = 0x000000ffU;

//clock-wise rotation in steps of 90 degrees
//the numeric values matter: MatrixRotation steps down to the previous rotation with "rotDeg - 1"
enum RotationDegree {
	ROT_0   = 0,
	ROT_90  = 1,
	ROT_180 = 2,
	ROT_270 = 3
};

//blend decision for one corner of a pixel
//attention: BlendType must fit into the value range of 2 bit!!!
//(four of them are packed into one byte, see getTopL()/setTopL() and friends)
enum BlendType {
	BLEND_NONE = 0,
	BLEND_NORMAL,   //a normal indication to blend
	BLEND_DOMINANT  //a strong indication to blend
	//no comma after the last enumerator: a trailing comma is only valid since C++11,
	//and this file is meant to compile with older compilers
};

//blend decisions for the four pixels F, G, J, K surrounding one kernel center,
//as produced by preProcessCorners()
struct BlendResult {
	BlendType blend_f;
	BlendType blend_g;
	BlendType blend_j;
	BlendType blend_k;
};

//4x4 pixel neighbourhood, one 24 bit color per member, row by row (kernel for preprocessing step)
struct ALIGNED16 Kernel_4x4 {
	unsigned int a, b, c, d; //1st row
	unsigned int e, f, g, h; //2nd row
	unsigned int i, j, k, l; //3rd row
	unsigned int m, n, o, p; //4th row
};

//3x3 pixel neighbourhood, row by row
struct Kernel_3x3 {
	unsigned int a, b, c; //1st row
	unsigned int d, e, f; //2nd row
	unsigned int g, h, i; //3rd row
};




//blend color "col" over destination "dst" with opacity N / M, channel by channel
//each channel is weighted in place (no shifting down); this works because the 8 upper bits
//of the 32 bit value are free, so the weighted red channel cannot overflow for the opacities used
template <unsigned int N, unsigned int M> inline
void alphaBlend(unsigned int& dst, unsigned int col) {
	const unsigned int keep = M - N; //weight of the old destination color
	const unsigned int red   = (((col & redMask  ) * N + (dst & redMask  ) * keep) / M) & redMask;
	const unsigned int green = (((col & greenMask) * N + (dst & greenMask) * keep) / M) & greenMask;
	const unsigned int blue  = (((col & blueMask ) * N + (dst & blueMask ) * keep) / M) & blueMask;
	dst = red | green | blue;
}


//compile time mapping of matrix coordinates through a rotation:
//MatrixRotation<rotDeg, I, J, N>::I_old / ::J_old are the (row, col) indices in the unrotated
//N x N matrix that correspond to (I, J) after rotating by rotDeg
template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
struct MatrixRotation;


//end of recursion: without rotation the coordinates stay as they are
template <size_t I, size_t J, size_t N>
struct MatrixRotation<ROT_0, I, J, N> {
	static const size_t I_old = I;
	static const size_t J_old = J;
};


//general case: undo one 90 degree step on top of the result for the next smaller rotation
template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
struct MatrixRotation {
	private:
		typedef MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N> PrevRotation;
	public:
		static const size_t I_old = N - 1 - PrevRotation::J_old; //old coordinates before rotation!
		static const size_t J_old =         PrevRotation::I_old;
};


//view on an N x N block of output pixels as seen after rotating it by rotDeg
//ref<I, J>() addresses (row I, col J) of the rotated block; the index translation to the
//real memory position is done at compile time by MatrixRotation
template <size_t N, RotationDegree rotDeg>
class OutputMatrix {
	public:
		//"out" is the top-left pixel of the block, "outWidth" the distance between two rows in pixels
		OutputMatrix(unsigned int* out, int outWidth)
			: out_(out), outWidth_(outWidth) {}

		template <size_t I, size_t J>
		unsigned int& ref() const {
			typedef MatrixRotation<rotDeg, I, J, N> Unrotated;
			return *(out_ + Unrotated::J_old + Unrotated::I_old * outWidth_);
		}

	private:
		unsigned int* out_;
		const int outWidth_;
};


//color distance between two 0x00RRGGBB pixels, measured in YCbCr space
//returns a value >= 0; 0 for identical colors
//the channels are extracted with shifts and masks, so the result does not depend on the
//byte order of the host (reading the bytes through a char pointer only works on little-endian)
FORCE_INLINE
float distYCbCr(const unsigned int& pix1, const unsigned int& pix2) {
	//if (pix1 == pix2) //about 8% perf boost
		//return 0;
	//http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
	//YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
	const int r_diff = static_cast<int>((pix1 >> 16) & 0xff) - static_cast<int>((pix2 >> 16) & 0xff); //we may delay division by 255 to after matrix multiplication
	const int g_diff = static_cast<int>((pix1 >>  8) & 0xff) - static_cast<int>((pix2 >>  8) & 0xff); //
	const int b_diff = static_cast<int>( pix1        & 0xff) - static_cast<int>( pix2        & 0xff); //subtraction for int is noticeably faster than for double!
	//ITU-R BT.709 conversion
	//0.5389F and 0.635F are the precalculated factors 0.5 / (1 - 0.0722) and 0.5 / (1 - 0.2126)
	const float y   = 0.2126F * r_diff + 0.7152F * g_diff + 0.0722F * b_diff; //[!], analog YCbCr!
	const float c_b = (b_diff - y) * 0.5389F;
	const float c_r = (r_diff - y) * 0.635F;
	//we skip division by 255 to have similar range like other distance functions
	return std::sqrt(y*y + c_b*c_b +  c_r*c_r);
}


//"colors are equal" test: true if the distance between both pixels is below the tolerance
//the comparison is done on the bit patterns (equalColorTolerance_ holds the bit pattern of the
//tolerance); for non-negative floats this orders exactly like a float compare
//the bit pattern is fetched with memcpy: reading a float through an int pointer breaks the
//aliasing rules and may be miscompiled by optimizing compilers
FORCE_INLINE
bool distYCbCr_c(const unsigned int& pix1, const unsigned int& pix2) {
	const float dist = distYCbCr(pix1, pix2);
	int distBits;
	std::memcpy(&distBits, &dist, sizeof(distBits));
	return distBits < equalColorTolerance_;
}


/*
input kernel area naming convention:
-----------------
| A | B | C | D |
----|---|---|---|
| E | F | G | H |   //evaluate the four corners between F, G, J, K
----|---|---|---|   //input pixel is at position F
| I | J | K | L |
----|---|---|---|
| M | N | O | P |
-----------------
*/
//detect blend direction for the corner shared by the pixels F, G, J, K of the kernel
//compares the color gradient along the diagonal J-G with the one along F-K:
// - J-G weaker than F-K: F and K are marked for blending
// - F-K weaker than J-G: J and G are marked for blending
//a pixel is only marked if it differs from both of its neighbours on the other diagonal;
//the mark is BLEND_DOMINANT if the weaker gradient is still weaker after scaling it by 3.6,
//BLEND_NORMAL otherwise
//the gradients are sums of non-negative distances, so comparing their bit patterns as int gives
//the same order as comparing the floats; the patterns are fetched with memcpy because reading a
//float through an int pointer breaks the aliasing rules
FORCE_INLINE
BlendResult preProcessCorners(const Kernel_4x4& ker) { //result: F, G, J, K corners of "GradientType"
	BlendResult result = {};
	//nothing to blend if the 2x2 center consists of two equal rows or two equal columns
	if ((ker.f == ker.g && ker.j == ker.k) || (ker.f == ker.j && ker.g == ker.k))
		return result;
	//the distance of identical colors is 0: skip the calculation for them
	//the center diagonal counts four times
	const float jg =
		((ker.i == ker.f) ? 0 : distYCbCr(ker.i, ker.f)) +
		((ker.f == ker.c) ? 0 : distYCbCr(ker.f, ker.c)) +
		((ker.n == ker.k) ? 0 : distYCbCr(ker.n, ker.k)) +
		((ker.k == ker.h) ? 0 : distYCbCr(ker.k, ker.h)) +
		((ker.j == ker.g) ? 0 : 4 * distYCbCr(ker.j, ker.g));
	const float fk =
		((ker.e == ker.j) ? 0 : distYCbCr(ker.e, ker.j)) +
		((ker.j == ker.o) ? 0 : distYCbCr(ker.j, ker.o)) +
		((ker.b == ker.g) ? 0 : distYCbCr(ker.b, ker.g)) +
		((ker.g == ker.l) ? 0 : distYCbCr(ker.g, ker.l)) +
		((ker.f == ker.k) ? 0 : 4 * distYCbCr(ker.f, ker.k));
	int jgBits;
	int fkBits;
	std::memcpy(&jgBits, &jg, sizeof(jgBits));
	std::memcpy(&fkBits, &fk, sizeof(fkBits));
	if (jgBits < fkBits) { //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
		const float jgScaled = jg * 3.6F;
		int jgScaledBits;
		std::memcpy(&jgScaledBits, &jgScaled, sizeof(jgScaledBits));
		const bool dominantGradient = jgScaledBits < fkBits;
		if (ker.f != ker.g && ker.f != ker.j)
			result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
		if (ker.k != ker.j && ker.k != ker.g)
			result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
	}
	else if (fkBits < jgBits) {
		const float fkScaled = fk * 3.6F;
		int fkScaledBits;
		std::memcpy(&fkScaledBits, &fkScaled, sizeof(fkScaledBits));
		const bool dominantGradient = fkScaledBits < jgBits;
		if (ker.j != ker.f && ker.j != ker.k)
			result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
		if (ker.g != ker.f && ker.g != ker.k)
			result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
	}
	return result;
}


//four blend types are compressed into a single byte, 2 bit each:
//bits 0-1 = top left, bits 2-3 = top right, bits 4-5 = bottom right, bits 6-7 = bottom left
inline BlendType getTopL(unsigned char b) {
	return static_cast<BlendType>(b & 0x3);
}
inline BlendType getTopR(unsigned char b) {
	return static_cast<BlendType>((b >> 2) & 0x3);
}
inline BlendType getBottomR(unsigned char b) {
	return static_cast<BlendType>((b >> 4) & 0x3);
}
inline BlendType getBottomL(unsigned char b) {
	return static_cast<BlendType>((b >> 6) & 0x3);
}
//store the blend type of one corner in the packed byte
//the bits are only or-ed in, never cleared: the buffer is assumed to be initialized before preprocessing!
inline void setTopL(unsigned char& b, BlendType bt) {
	b = static_cast<unsigned char>(b | bt);
}
inline void setTopR(unsigned char& b, BlendType bt) {
	b = static_cast<unsigned char>(b | (bt << 2));
}
inline void setBottomR(unsigned char& b, BlendType bt) {
	b = static_cast<unsigned char>(b | (bt << 4));
}
inline void setBottomL(unsigned char& b, BlendType bt) {
	b = static_cast<unsigned char>(b | (bt << 6));
}


//rotate the four packed corner entries of a blend byte: the byte is rotated left
//by one 2 bit entry per 90 degree step; without rotation it is returned unchanged
template <RotationDegree rotDeg> inline
unsigned char rotateBlendInfo(unsigned char b) {
	return b;
}
template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) {
	const unsigned int bits = b;
	return static_cast<unsigned char>((bits << 2) | (bits >> 6));
}
template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) {
	const unsigned int bits = b;
	return static_cast<unsigned char>((bits << 4) | (bits >> 4));
}
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) {
	const unsigned int bits = b;
	return static_cast<unsigned char>((bits << 6) | (bits >> 2));
}


//accessors for a 3x3 kernel as seen after rotating it by rotDeg
//the 3x3 kernel is stored in the members a..i of a Kernel_4x4 (scaleImage() repacks the
//neighbourhood that way before calling scalePixel()); the members j..p are not read here
//layout of the stored 3x3 kernel:
//    a b c
//    d e f
//    g h i
//get_X<rotDeg>(ker) returns the pixel found at position X of the rotated kernel;
//the center "e" is the same for every rotation

//primary templates: used for ROT_0, every position maps to itself
template <RotationDegree rotDeg> unsigned int inline get_a(const Kernel_4x4& ker) { return ker.a; }
template <RotationDegree rotDeg> unsigned int inline get_b(const Kernel_4x4& ker) { return ker.b; }
template <RotationDegree rotDeg> unsigned int inline get_c(const Kernel_4x4& ker) { return ker.c; }
template <RotationDegree rotDeg> unsigned int inline get_d(const Kernel_4x4& ker) { return ker.d; }
template <RotationDegree rotDeg> unsigned int inline get_e(const Kernel_4x4& ker) { return ker.e; }
template <RotationDegree rotDeg> unsigned int inline get_f(const Kernel_4x4& ker) { return ker.f; }
template <RotationDegree rotDeg> unsigned int inline get_g(const Kernel_4x4& ker) { return ker.g; }
template <RotationDegree rotDeg> unsigned int inline get_h(const Kernel_4x4& ker) { return ker.h; }
template <RotationDegree rotDeg> unsigned int inline get_i(const Kernel_4x4& ker) { return ker.i; }

//ROT_90: the top row of the rotated kernel is the stored left column, read bottom to top (g, d, a)
template <> inline unsigned int get_a<ROT_90>(const Kernel_4x4& ker) { return ker.g; }
template <> inline unsigned int get_b<ROT_90>(const Kernel_4x4& ker) { return ker.d; }
template <> inline unsigned int get_c<ROT_90>(const Kernel_4x4& ker) { return ker.a; }
template <> inline unsigned int get_d<ROT_90>(const Kernel_4x4& ker) { return ker.h; }
template <> inline unsigned int get_e<ROT_90>(const Kernel_4x4& ker) { return ker.e; }
template <> inline unsigned int get_f<ROT_90>(const Kernel_4x4& ker) { return ker.b; }
template <> inline unsigned int get_g<ROT_90>(const Kernel_4x4& ker) { return ker.i; }
template <> inline unsigned int get_h<ROT_90>(const Kernel_4x4& ker) { return ker.f; }
template <> inline unsigned int get_i<ROT_90>(const Kernel_4x4& ker) { return ker.c; }

//ROT_180: the stored kernel read in reverse order (i, h, g, ... a)
template <> inline unsigned int get_a<ROT_180>(const Kernel_4x4& ker) { return ker.i; }
template <> inline unsigned int get_b<ROT_180>(const Kernel_4x4& ker) { return ker.h; }
template <> inline unsigned int get_c<ROT_180>(const Kernel_4x4& ker) { return ker.g; }
template <> inline unsigned int get_d<ROT_180>(const Kernel_4x4& ker) { return ker.f; }
template <> inline unsigned int get_e<ROT_180>(const Kernel_4x4& ker) { return ker.e; }
template <> inline unsigned int get_f<ROT_180>(const Kernel_4x4& ker) { return ker.d; }
template <> inline unsigned int get_g<ROT_180>(const Kernel_4x4& ker) { return ker.c; }
template <> inline unsigned int get_h<ROT_180>(const Kernel_4x4& ker) { return ker.b; }
template <> inline unsigned int get_i<ROT_180>(const Kernel_4x4& ker) { return ker.a; }

//ROT_270: the top row of the rotated kernel is the stored right column, read top to bottom (c, f, i)
template <> inline unsigned int get_a<ROT_270>(const Kernel_4x4& ker) { return ker.c; }
template <> inline unsigned int get_b<ROT_270>(const Kernel_4x4& ker) { return ker.f; }
template <> inline unsigned int get_c<ROT_270>(const Kernel_4x4& ker) { return ker.i; }
template <> inline unsigned int get_d<ROT_270>(const Kernel_4x4& ker) { return ker.b; }
template <> inline unsigned int get_e<ROT_270>(const Kernel_4x4& ker) { return ker.e; }
template <> inline unsigned int get_f<ROT_270>(const Kernel_4x4& ker) { return ker.h; }
template <> inline unsigned int get_g<ROT_270>(const Kernel_4x4& ker) { return ker.a; }
template <> inline unsigned int get_h<ROT_270>(const Kernel_4x4& ker) { return ker.d; }
template <> inline unsigned int get_i<ROT_270>(const Kernel_4x4& ker) { return ker.g; }


/*
input kernel area naming convention:
-------------
| A | B | C |
----|---|---|
| D | E | F | //input pixel is at position E
----|---|---|
| G | H | I |
-------------
*/
//blend one corner of the output block of a single source pixel
//the kernel and the blend info are looked at after rotating them by rotDeg, so the corner handled
//is always the bottom-right one of the rotated view; calling this for all four rotations covers
//all four corners
// ker       - 3x3 neighbourhood stored in the members a..i of a Kernel_4x4, input pixel at "e"
// target    - top-left pixel of the Scaler::scale x Scaler::scale output block
// trgWidth  - distance between two rows of the output block in pixels
// blendInfo - result of preprocessing all four corners of pixel "e" (packed, see getTopL() etc.)
//the distances compared as int below are non-negative floats, for which the bit patterns order
//exactly like the values; the patterns are fetched with memcpy because reading a float through
//an int pointer breaks the aliasing rules
template <class Scaler, RotationDegree rotDeg>
FORCE_INLINE //perf: quite worth it!
void scalePixel(const Kernel_4x4& ker,unsigned int* target, int trgWidth,
	unsigned char blendInfo //result of preprocessing all four corners of pixel "e"
	) {
	const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
	if (getBottomR(blend) < BLEND_NORMAL)
		return; //nothing to blend in this corner
	const unsigned int ee = get_e<rotDeg>(ker);
	const unsigned int ff = get_f<rotDeg>(ker);
	const unsigned int hh = get_h<rotDeg>(ker);
	const unsigned int gg = get_g<rotDeg>(ker);
	const unsigned int cc = get_c<rotDeg>(ker);
	const unsigned int ii = get_i<rotDeg>(ker);
	//"equal" below means: identical, or distance below the tolerance (distYCbCr_c)
	bool doLineBlend = true;
	//make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
	if (getTopR(blend) != BLEND_NONE && !(ee == gg || distYCbCr_c(ee, gg))) //but support double-blending for 90 corners
		doLineBlend = false;
	if (getBottomL(blend) != BLEND_NONE && !(ee == cc || distYCbCr_c(ee, cc)))
		doLineBlend = false;
	//no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
	if ((gg == hh || distYCbCr_c(gg, hh)) &&
		(hh == ii || distYCbCr_c(hh, ii)) &&
		(ii == ff || distYCbCr_c(ii, ff)) &&
		(ff == cc || distYCbCr_c(ff, cc)) &&
		!(ee == ii || distYCbCr_c(ee, ii)))
		doLineBlend = false;
	const unsigned int px = ((ee == ff)?(0):(distYCbCr(ee, ff))) <= ((ee == hh)?(0):(distYCbCr(ee, hh))) ? ff : hh; //choose most similar color
	OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
	if (!doLineBlend) {
		Scaler::blendCorner(px, out);
		return;
	}
	const float fg = (ff == gg) ? 0 : distYCbCr(ff, gg); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
	const float hc = (hh == cc) ? 0 : distYCbCr(hh, cc); //
	const float fgScaled = fg * 2.2F;
	const float hcScaled = hc * 2.2F;
	int fgBits;
	int hcBits;
	int fgScaledBits;
	int hcScaledBits;
	std::memcpy(&fgBits, &fg, sizeof(fgBits));
	std::memcpy(&hcBits, &hc, sizeof(hcBits));
	std::memcpy(&fgScaledBits, &fgScaled, sizeof(fgScaledBits));
	std::memcpy(&hcScaledBits, &hcScaled, sizeof(hcScaledBits));
	const bool haveShallowLine = ee != gg && get_d<rotDeg>(ker) != gg && fgScaledBits <= hcBits; //2.2 * fg <= hc
	const bool haveSteepLine   = ee != cc && get_b<rotDeg>(ker) != cc && hcScaledBits <= fgBits; //2.2 * hc <= fg
	if (haveShallowLine) {
		if (haveSteepLine)
			Scaler::blendLineSteepAndShallow(px, out);
		else
			Scaler::blendLineShallow(px, out);
	}
	else {
		if (haveSteepLine)
			Scaler::blendLineSteep(px, out);
		else
			Scaler::blendLineDiagonal(px,out);
	}
}


/*
scale the rows [yFirst, yLast) of a 16 bit source image by the factor Scaler::scale

src       - source pixels; row y starts at src + picture_srcpitch * y (global pitch, counted in WORDs)
trg       - destination; receives Scaler::scale output rows per processed source row, beginning at trg
            itself and advancing by picture_dstpitch WORDs per output row (no offset for yFirst is added here)
srcWidth  - pixels per source row; must be 1..MAXW because the line buffers below have a fixed size
srcHeight - number of rows of the source image; used to clamp the kernel at the bottom edge
yFirst    - first source row to process (clamped to 0)
yLast     - one past the last source row to process (clamped to srcHeight)

each source pixel is expanded to a 24 bit color via the global "colortable", the blending is done on
24 bit colors in a local line buffer, and the finished lines are converted back to 16 bit while they
are copied to trg (via "colortable_o" if _XBRZ_RLUT_ is defined, else via mask_g / shift_r)
nothing is written if the parameters are out of range
*/
template <class Scaler> //scaler policy: see "Scaler2x" reference implementation
void scaleImage(const WORD* src, WORD* trg, int srcWidth, int srcHeight, int yFirst, int yLast) {
	//safety checks: preProcBuffer and out_tmp are fixed-size stack arrays, a wider source would
	//overrun them; rows outside of the image would be read from beyond the source buffer
	//(checked once per call, so there is no measurable cost)
	if (yFirst < 0)
		yFirst = 0;
	if (yLast > srcHeight)
		yLast = srcHeight;
	if (yFirst >= yLast || srcWidth <= 0 || srcWidth > MAXW)
		return;
	const int trgWidth = MAXW * Scaler::scale; // not srcWidth * Scaler::scale -> slight performance increase (due to cache alignment?)
	//one byte of packed corner blend info per column, carried over from row to row
	unsigned char preProcBuffer[MAXW] = {0};
	//Scaler::scale output lines of 24 bit colors for the current source row; row distance is trgWidth
	unsigned int out_tmp[MAXW * Scaler::scale * Scaler::scale];
	//initialize preprocessing buffer for first row: detect upper left and right corner blending
	//this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
	Kernel_4x4 ker4 = {}; //perf: initialization is negligible
	if (yFirst > 0) {
		const int y = yFirst - 1;
		const WORD* s_0  = src + picture_srcpitch * y; //center line
		const WORD* s_m1 = ((y)?(s_0 - picture_srcpitch):(s_0));
		const WORD* s_p1 = ((y+1<srcHeight)?(s_0 + picture_srcpitch):(s_0));
		const WORD* s_p2 = ((y+2<srcHeight)?(s_p1 + picture_srcpitch):(s_p1));
		for (int x = 0; x < srcWidth; ++x) {
			//neighbour columns, clamped to the image
			const int x_m1 = ((x)?(x-1):(0));
			const int x_p1 = ((x+1<srcWidth)?(x+1):(x));
			const int x_p2 = ((x_p1+1<srcWidth)?(x_p1+1):(x_p1));
			ker4.a = colortable[s_m1[x_m1]]; //read sequentially from memory as far as possible
			ker4.b = colortable[s_m1[x]];
			ker4.c = colortable[s_m1[x_p1]];
			ker4.d = colortable[s_m1[x_p2]];
			ker4.e = colortable[s_0[x_m1]];
			ker4.f = colortable[s_0[x]];
			ker4.g = colortable[s_0[x_p1]];
			ker4.h = colortable[s_0[x_p2]];
			ker4.i = colortable[s_p1[x_m1]];
			ker4.j = colortable[s_p1[x]];
			ker4.k = colortable[s_p1[x_p1]];
			ker4.l = colortable[s_p1[x_p2]];
			//ker4.m is not used
			ker4.n = colortable[s_p2[x]];
			ker4.o = colortable[s_p2[x_p1]];
			//ker4.p is not used
			const BlendResult res = preProcessCorners(ker4);
			/*
			preprocessing blend result:
			---------
			| F | G |   //evaluate corner between F, G, J, K
			----|---|   //input pixel is at position F
			| J | K |
			---------
			*/
			setTopR(preProcBuffer[x], res.blend_j);
			if (x + 1 < srcWidth)
				setTopL(preProcBuffer[x + 1], res.blend_k);
		}
	}
	//------------------------------------------------------------------------------------
	for (int y = yFirst; y < yLast; ++y) {
		unsigned int* out = out_tmp; //the line buffer is reused for every source row
		const WORD* s_0  = src + picture_srcpitch * y; //center line
		const WORD* s_m1 = ((y)?(s_0 - picture_srcpitch):(s_0));
		const WORD* s_p1 = ((y+1<srcHeight)?(s_0 + picture_srcpitch):(s_0));
		const WORD* s_p2 = ((y+2<srcHeight)?(s_p1 + picture_srcpitch):(s_p1));
		unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
		for (int x = 0; x < srcWidth; ++x, out += Scaler::scale) {
			//all those bounds checks have only insignificant impact on performance!
			const int x_m1 = ((x)?(x-1):(0));
			const int x_p1 = ((x+1<srcWidth)?(x+1):(x));
			const int x_p2 = ((x_p1+1<srcWidth)?(x_p1+1):(x_p1));
			//evaluate the four corners on bottom-right of current pixel
			unsigned char blend_xy = 0; { //for current (x, y) position
				ker4.a = colortable[s_m1[x_m1]]; //read sequentially from memory as far as possible
				ker4.b = colortable[s_m1[x]];
				ker4.c = colortable[s_m1[x_p1]];
				ker4.d = colortable[s_m1[x_p2]];
				ker4.e = colortable[s_0[x_m1]];
				ker4.f = colortable[s_0[x]];
				ker4.g = colortable[s_0[x_p1]];
				ker4.h = colortable[s_0[x_p2]];
				ker4.i = colortable[s_p1[x_m1]];
				ker4.j = colortable[s_p1[x]];
				ker4.k = colortable[s_p1[x_p1]];
				ker4.l = colortable[s_p1[x_p2]];
				//ker4.m is not used
				ker4.n = colortable[s_p2[x]];
				ker4.o = colortable[s_p2[x_p1]];
				//ker4.p is not used
				const BlendResult res = preProcessCorners(ker4);
				/*
				preprocessing blend result:
				---------
				| F | G |   //evaluate corner between F, G, J, K
				----|---|   //current input pixel is at position F
				| J | K |
				---------
				*/
				blend_xy = preProcBuffer[x];
				setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
				setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
				preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
				blend_xy1 = 0;
				setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
				if (x + 1 < srcWidth) //set 3rd known corner for (x + 1, y)
					setBottomL(preProcBuffer[x + 1], res.blend_g);
			}
			//fill block of size scale * scale with the color of the current pixel (kernel position F)
			unsigned int* blockRow = out;
			for (int by = 0; by < Scaler::scale; ++by, blockRow += trgWidth)
				for (int bx = 0; bx < Scaler::scale; ++bx)
					blockRow[bx] = ker4.f;
			//blend four corners of current pixel
			if (blend_xy != 0) { //good 20% perf-improvement
				//repack the 3x3 neighbourhood of the current pixel into the members a..i, which is
				//the layout scalePixel() and get_a()..get_i() expect (a, b, c are already in place);
				//every line reads a member that has not been overwritten yet, so the order matters
				ker4.d = ker4.e;
				ker4.e = ker4.f;
				ker4.f = ker4.g;
				ker4.g = ker4.i;
				ker4.h = ker4.j;
				ker4.i = ker4.k;
				scalePixel<Scaler, ROT_0  >(ker4, out, trgWidth, blend_xy);
				scalePixel<Scaler, ROT_90 >(ker4, out, trgWidth, blend_xy);
				scalePixel<Scaler, ROT_180>(ker4, out, trgWidth, blend_xy);
				scalePixel<Scaler, ROT_270>(ker4, out, trgWidth, blend_xy);
			}
		}
		//convert the finished lines back to 16 bit and copy them to the output frame
		out = out_tmp;
		for(int i=0; i<Scaler::scale; i++) {
			for(int j=0; j<srcWidth*Scaler::scale; j++) {
				#ifdef _XBRZ_RLUT_
					trg[j] = colortable_o[out[j]];
				#else
					trg[j] = ((0x000000F8 & out[j]) >> 3) | ((mask_g & out[j]) >> 5) | ((0x00F80000 & out[j]) >> shift_r);
				#endif
			}
			trg += picture_dstpitch;
			out += trgWidth;
		}
	}
}


//scaler policy for factor 2: blend patterns applied to the 2x2 output block of one pixel
//"out" is an OutputMatrix; ref<row, col>() addresses the block in the rotated view, so every
//pattern is written for the bottom-right corner only
struct Scaler2x {
	static const int scale = 2;

	template <class Matrix>
	static void blendLineShallow(unsigned int col, Matrix& out) {
		alphaBlend<1, 4>(out.template ref<1, 0>(), col);
		alphaBlend<3, 4>(out.template ref<1, 1>(), col);
	}

	template <class Matrix>
	static void blendLineSteep(unsigned int col, Matrix& out) {
		alphaBlend<1, 4>(out.template ref<0, 1>(), col);
		alphaBlend<3, 4>(out.template ref<1, 1>(), col);
	}

	template <class Matrix>
	static void blendLineSteepAndShallow(unsigned int col, Matrix& out) {
		alphaBlend<1, 4>(out.template ref<1, 0>(), col);
		alphaBlend<1, 4>(out.template ref<0, 1>(), col);
		alphaBlend<5, 6>(out.template ref<1, 1>(), col); //[!] fixes 7/8 used in xBR
	}

	template <class Matrix>
	static void blendLineDiagonal(unsigned int col, Matrix& out) {
		alphaBlend<1, 2>(out.template ref<1, 1>(), col);
	}

	//model a round corner
	template <class Matrix>
	static void blendCorner(unsigned int col, Matrix& out) {
		alphaBlend<21, 100>(out.template ref<1, 1>(), col); //exact: 1 - pi/4 = 0.2146018366
	}
};


//scaler policy for factor 3: blend patterns applied to the 3x3 output block of one pixel
//"out" is an OutputMatrix; ref<row, col>() addresses the block in the rotated view, so every
//pattern is written for the bottom-right corner only
struct Scaler3x {
	static const int scale = 3;

	template <class Matrix>
	static void blendLineShallow(unsigned int col, Matrix& out) {
		alphaBlend<1, 4>(out.template ref<2, 0>(), col);
		alphaBlend<1, 4>(out.template ref<1, 2>(), col);
		alphaBlend<3, 4>(out.template ref<2, 1>(), col);
		out.template ref<2, 2>() = col;
	}

	template <class Matrix>
	static void blendLineSteep(unsigned int col, Matrix& out) {
		alphaBlend<1, 4>(out.template ref<0, 2>(), col);
		alphaBlend<1, 4>(out.template ref<2, 1>(), col);
		alphaBlend<3, 4>(out.template ref<1, 2>(), col);
		out.template ref<2, 2>() = col;
	}

	template <class Matrix>
	static void blendLineSteepAndShallow(unsigned int col, Matrix& out) {
		alphaBlend<1, 4>(out.template ref<2, 0>(), col);
		alphaBlend<1, 4>(out.template ref<0, 2>(), col);
		alphaBlend<3, 4>(out.template ref<2, 1>(), col);
		alphaBlend<3, 4>(out.template ref<1, 2>(), col);
		out.template ref<2, 2>() = col;
	}

	template <class Matrix>
	static void blendLineDiagonal(unsigned int col, Matrix& out) {
		alphaBlend<1, 8>(out.template ref<1, 2>(), col);
		alphaBlend<1, 8>(out.template ref<2, 1>(), col);
		alphaBlend<7, 8>(out.template ref<2, 2>(), col);
	}

	//model a round corner
	//the neighbours <2, 1> and <1, 2> would only get 0.01413008627 each -> negligible, left out
	template <class Matrix>
	static void blendCorner(unsigned int col, Matrix& out) {
		alphaBlend<45, 100>(out.template ref<2, 2>(), col); //exact: 0.4545939598
	}
};


// Blending rules for the 4x scaler. Every rule receives the colour to blend
// in (col) and an output accessor (out); pixels are addressed through
// out.template ref<I, J>() with I and J in 0..3.
struct Scaler4x {
	static const int scale = 4;

	// Shallow line: two 1/4 blends, two 3/4 blends, two full overwrites.
	template <class OutputMatrix>
	static void blendLineShallow(unsigned int col, OutputMatrix& out) {
		alphaBlend<1, 4>(out.template ref<3, 0>(), col);
		alphaBlend<1, 4>(out.template ref<2, 2>(), col);
		alphaBlend<3, 4>(out.template ref<3, 1>(), col);
		alphaBlend<3, 4>(out.template ref<2, 3>(), col);
		out.template ref<3, 2>() = col;
		out.template ref<3, 3>() = col;
	}

	// Steep line: same weights as the shallow rule with I and J swapped.
	template <class OutputMatrix>
	static void blendLineSteep(unsigned int col, OutputMatrix& out) {
		alphaBlend<1, 4>(out.template ref<0, 3>(), col);
		alphaBlend<1, 4>(out.template ref<2, 2>(), col);
		alphaBlend<3, 4>(out.template ref<1, 3>(), col);
		alphaBlend<3, 4>(out.template ref<3, 2>(), col);
		out.template ref<2, 3>() = col;
		out.template ref<3, 3>() = col;
	}

	// Steep and shallow line at once.
	template <class OutputMatrix>
	static void blendLineSteepAndShallow(unsigned int col, OutputMatrix& out) {
		alphaBlend<3, 4>(out.template ref<3, 1>(), col);
		alphaBlend<3, 4>(out.template ref<1, 3>(), col);
		alphaBlend<1, 4>(out.template ref<3, 0>(), col);
		alphaBlend<1, 4>(out.template ref<0, 3>(), col);
		alphaBlend<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR
		// the three pixels around the corner are replaced outright
		out.template ref<2, 3>() = col;
		out.template ref<3, 2>() = col;
		out.template ref<3, 3>() = col;
	}

	// Diagonal line: 1/2 on <3, 2> and <2, 3>, corner <3, 3> replaced.
	template <class OutputMatrix>
	static void blendLineDiagonal(unsigned int col, OutputMatrix& out) {
		alphaBlend<1, 2>(out.template ref<3, 2>(), col);
		alphaBlend<1, 2>(out.template ref<2, 3>(), col);
		out.template ref<3, 3>() = col;
	}

	// Corner: models a round corner using the corner pixel and its two neighbours.
	template <class OutputMatrix>
	static void blendCorner(unsigned int col, OutputMatrix& out) {
		alphaBlend<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563
		alphaBlend<9, 100>(out.template ref<3, 2>(), col);  //exact: 0.08677704501
		alphaBlend<9, 100>(out.template ref<2, 3>(), col);  //exact: 0.08677704501
	}
};


} // namespace


////////////////////////////////////////////////////////////////////////////////
// Main caller
#ifdef _XBRZ_MT_
	// Multi-threaded worker (thread-procedure signature): upscales one
	// horizontal slice of the source picture.
	// lpParam points to an unsigned char holding the slice index.
	// Always returns 0.
	DWORD WINAPI xbrz::scale(LPVOID lpParam) {
		const unsigned char sliceIdx = *static_cast<unsigned char*>(lpParam);

		// Rows per slice. An odd input height adds one row per slice, so only
		// the last slice comes out shorter (one line less)
		// -> NUM_SLICE should be even
		const int rowsPerSlice = (picture_srcheight/NUM_SLICE) + (picture_srcheight % 2);

		// destination pointer advanced to the first output row of this slice
		WORD* sliceDst = picture_outdst;
		sliceDst += rowsPerSlice * sliceIdx * _SCALER_ * picture_dstpitch;

		int firstRow = rowsPerSlice * sliceIdx;
		int lastRow = rowsPerSlice * (sliceIdx+1);
		// the final slice always runs to the bottom of the picture
		if(sliceIdx+1 == NUM_SLICE)
			lastRow = picture_srcheight;

		switch(_SCALER_) {
			case 2:
				scaleImage<Scaler2x>(picture_input, sliceDst, picture_srcwidth, picture_srcheight, firstRow, lastRow);
				break;
			case 3:
				scaleImage<Scaler3x>(picture_input, sliceDst, picture_srcwidth, picture_srcheight, firstRow, lastRow);
				break;
			default: // 4 or anything else
				scaleImage<Scaler4x>(picture_input, sliceDst, picture_srcwidth, picture_srcheight, firstRow, lastRow);
				break;
		}
		return 0;
	}
#else
	// Single-threaded variant: upscales all rows 0 .. picture_srcheight in
	// one call, picking the scaler rule set from the compile-time _SCALER_.
	void xbrz::scale() {
		switch(_SCALER_) {
			case 2:
				scaleImage<Scaler2x>(picture_input, picture_outdst, picture_srcwidth, picture_srcheight, 0, picture_srcheight);
				break;
			case 3:
				scaleImage<Scaler3x>(picture_input, picture_outdst, picture_srcwidth, picture_srcheight, 0, picture_srcheight);
				break;
			default: // 4 or anything else
				scaleImage<Scaler4x>(picture_input, picture_outdst, picture_srcwidth, picture_srcheight, 0, picture_srcheight);
				break;
		}
	}
#endif
////////////////////////////////////////////////////////////////////////////////





////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Kega


// Kega helper function
// Minimal strcpy replacement for the plugin.
// Copies the NUL-terminated string `in`, terminator included, to `out`.
// No bounds checking is done: `out` must be large enough to hold `in`.
// `in` is const so that string literals (what every caller in this file
// passes) bind without the literal -> char* conversion, which is ill-formed
// since C++11. Callers holding a plain char* still work unchanged.
static void rpi_strcpy(char* out, const char* in) {
	// copy byte by byte; the loop ends right after the terminator was written
	while((*out++ = *in++) != '\0') {
	}
}


extern "C" RENDER_PLUGIN_INFO* RenderPluginGetInfo(void) {
	// Fills in the plugin descriptor MyRPI and returns a pointer to it.
	//
	// Naming rules for the plugin name (max 60 chars):
	//   Name (Original Author)
	// Capitalisation etc. must be correct. If the original author's name
	// does not fit, shorten the plugin name instead,
	// BUT DO NOT GO OVER 60 CHARS.
	//  "............................................................"

	// Version number plus the supported-format flags.
	MyRPI.Flags = RPI_VERSION | RPI_555_SUPP | RPI_565_SUPP;

	// Output-scale flag and display name follow the compiled-in factor;
	// multi-threaded builds carry an "MT" tag in the name.
	switch(_SCALER_) {
		case 2:
			MyRPI.Flags |= RPI_OUT_SCL2;
			#ifdef _XBRZ_MT_
				rpi_strcpy(&MyRPI.Name[0], "2xBRZ MT (Zenju)");
			#else
				rpi_strcpy(&MyRPI.Name[0], "2xBRZ (Zenju)");
			#endif
			break;
		case 3:
			MyRPI.Flags |= RPI_OUT_SCL3;
			#ifdef _XBRZ_MT_
				rpi_strcpy(&MyRPI.Name[0], "3xBRZ MT (Zenju)");
			#else
				rpi_strcpy(&MyRPI.Name[0], "3xBRZ (Zenju)");
			#endif
			break;
		default: // 4 or anything else
			MyRPI.Flags |= RPI_OUT_SCL4;
			#ifdef _XBRZ_MT_
				rpi_strcpy(&MyRPI.Name[0], "4xBRZ MT (Zenju)");
			#else
				rpi_strcpy(&MyRPI.Name[0], "4xBRZ (Zenju)");
			#endif
			break;
	}

	// Any other setup would go here.
	return &MyRPI;
}


// Render entry point: upscales the source picture described by rpo into the
// destination buffer by the factor _SCALER_.
// If the destination is too small for the scaled picture, nothing is done
// and rpo->OutW / rpo->OutH are left untouched.
extern "C" void RenderPluginOutput(RENDER_PLUGIN_OUTP* rpo) {
	// Make sure I can use this renderer - in this case, width/height checks.
	if(((rpo->SrcW*_SCALER_)<=rpo->DstW) && ((rpo->SrcH*_SCALER_)<=rpo->DstH)) {
		// true -> 555 pixel format (RPI_565_SUPP not set), false -> 565
		bool VideoFormat = !(rpo->Flags & RPI_565_SUPP);
		// pitches are stored in WORD units
		picture_dstpitch = rpo->DstPitch/sizeof(WORD);
		picture_srcpitch = rpo->SrcPitch/sizeof(WORD);
		picture_srcheight = rpo->SrcH;
		picture_srcwidth = rpo->SrcW;
		picture_input = (WORD*)rpo->SrcPtr;
		picture_outdst = (WORD*)rpo->DstPtr;
		colortable = VideoFormat ? _table15_24 : _table16_24; // point to the correct color-LUT
		colortable_o = VideoFormat ? _table24_15 : _table24_16;
		// green mask / red shift matching the format
		// (presumably used by the non-LUT 24 -> 16 bit packing elsewhere in the file)
		mask_g = VideoFormat ? 0x0000F800 : 0x0000FC00;
		shift_r = VideoFormat ? 9 : 8;
		//////////////////////////////////////////////////
		#ifdef _XBRZ_MT_
			HANDLE hThread_[NUM_SLICE];
			DWORD threadID_[NUM_SLICE];
			unsigned char slice_[NUM_SLICE];
			DWORD started_ = 0; // number of valid handles packed at the front of hThread_
			// all slice indices are set before the first worker is started
			for(unsigned char c = 0; c < NUM_SLICE; c++)
				slice_[c] = c;
			for(int i = 0; i < NUM_SLICE; i++) {
				HANDLE h = CreateThread(NULL, 0, xbrz::scale, &slice_[i], 0, &threadID_[i]);
				if(h != NULL)
					hThread_[started_++] = h;
				else
					// Thread creation failed: render this slice on the calling thread so
					// the frame stays complete. A NULL handle must not reach the wait
					// below, or it would fail at once while other workers still run.
					xbrz::scale(&slice_[i]);
			}
			// the wait requires at least one handle
			if(started_ > 0)
				WaitForMultipleObjects(started_, hThread_, TRUE, INFINITE);
			for(DWORD i = 0; i < started_; i++)
				CloseHandle(hThread_[i]);
		#else
			xbrz::scale();
		#endif
		//////////////////////////////////////////////////
		// report the size of the picture that was produced
		rpo->OutW=rpo->SrcW*_SCALER_;
		rpo->OutH=rpo->SrcH*_SCALER_;
	}
}



////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

// DLL entry point.
// On process attach the colour look-up tables are allocated and filled:
//   _table16_24 / _table15_24 : 565 / 555 pixel -> 0x00RRGGBB
//   _table24_16 / _table24_15 : 0x00RRGGBB -> 565 / 555 pixel (only with _XBRZ_RLUT_)
// On process detach the tables are freed again. Always returns TRUE.
extern "C" BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReason, LPVOID lpReserved) {
	if(dwReason == DLL_PROCESS_ATTACH) {
		_table16_24 = new unsigned int[65536];
		_table15_24 = new unsigned int[32768];
		// 565: R = bits 11-15, G = bits 5-10, B = bits 0-4
		for(unsigned int i = 0; i < 65536; i++) {
			unsigned char r = (i >> 11) & 0x1F;
			unsigned char g = (i >>  5) & 0x3F;
			unsigned char b = i & 0x1F;
			// widen to 8 bit by repeating the top bits in the low bits
			r = (r << 3) | (r >> 2);
			g = (g << 2) | (g >> 4);
			b = (b << 3) | (b >> 2);
			_table16_24[i] = (r << 16) | (g << 8) | (b << 0);
		}
		// 555: R = bits 10-14, G = bits 5-9, B = bits 0-4
		for(unsigned int i = 0; i < 32768; i++) {
			unsigned char r = (i >> 10) & 0x1F;
			unsigned char g = (i >>  5) & 0x1F;
			unsigned char b = i & 0x1F;
			r = (r << 3) | (r >> 2);
			g = (g << 3) | (g >> 2);
			b = (b << 3) | (b >> 2);
			_table15_24[i] = (r << 16) | (g << 8) | (b << 0);
		}
		#ifdef _XBRZ_RLUT_
			// reverse tables, one entry per 24 bit colour
			unsigned int tsize = 1024*1024*16;
			_table24_16 = new unsigned short[tsize];
			_table24_15 = new unsigned short[tsize];
			for(unsigned int i = 0; i < tsize; i++) {
				_table24_16[i] = ((0x000000F8 & i) >> 3) | ((0x0000FC00 & i) >> 5) | ((0x00F80000 & i) >> 8);
			}
			for(unsigned int i = 0; i < tsize; i++) {
				// Green's top five bits (mask 0xF800) sit at bits 11-15 and have to land
				// on bits 5-9 of the 555 pixel, i.e. move down by 6. A shift of 5 would
				// put them on bits 6-10 and overlap red's lowest bit.
				_table24_15[i] = ((0x000000F8 & i) >> 3) | ((0x0000F800 & i) >> 6) | ((0x00F80000 & i) >> 9);
			}
		#endif
	}
	else if(dwReason == DLL_PROCESS_DETACH) {
		delete[] _table16_24;
		delete[] _table15_24;
		#ifdef _XBRZ_RLUT_
			delete[] _table24_16;
			delete[] _table24_15;
		#endif
	}
	return TRUE;
}

