// ****************************************************************************
// * This file is part of the xBRZ project. It is distributed under           *
// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0          *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
// ****************************************************************************

#include "xBRZ_rpi.h"
#include <vector>
#include <algorithm>
#include <cassert>
#include <xbrz.h>
#include <xbrz_tools.h>
#include <ppl.h>


//file-local constants and state shared by the plugin entry points below
namespace
{
//factor handed to xbrz::scale(); has to agree with the RPI_OUT_SCL4 flag reported by RenderPluginGetInfo()
const size_t scalingFactor = 4;

//16-bit pixel layout of the frame buffers: selects the matching xbrz 16 <-> 32 bit conversion functions
enum class RgbType
{
    RGB555,
    RGB565,
};

#ifndef NDEBUG
//debug builds only: id of the thread that called RenderPluginOutput() first (0: not recorded yet);
//used to assert that all later calls come from the same thread
DWORD threadIdLast = 0;
#endif
}


//Returns the plugin descriptor: scaler name "xBRZ" plus the capability flags.
//The descriptor is built once during the first call; every call returns the address of the same static instance.
RENDER_PLUGIN_INFO* kega::RenderPluginGetInfo()
{
    const auto buildInfo = []() -> RENDER_PLUGIN_INFO
    {
        const char scalerName[] = "xBRZ";

        RENDER_PLUGIN_INFO info = {}; //start with everything zeroed

        //the name buffer must be able to hold the name including its terminating null
        static_assert(sizeof(info.Name) >= sizeof(scalerName), "");
        ::memcpy(info.Name, scalerName, sizeof(scalerName)); //sizeof => terminating null is copied, too

        info.Flags = RPI_VERSION |
                     RPI_MMX_USED | RPI_MMX_REQD |
                     RPI_555_SUPP | RPI_565_SUPP |
                     RPI_OUT_SCL4; //keep in sync with scalingFactor
        return info;
    };

    static RENDER_PLUGIN_INFO pluginInfo = buildInfo();
    return &pluginInfo;
}


//Render callback: scales the 16-bit source frame described by "rpo" with xBRZ and writes the 16-bit result to rpo->DstPtr.
//
//Processing steps:
//  1. expand the RGB555/RGB565 source pixels into a 32-bit work buffer
//  2. run xbrz::scale() (factor: scalingFactor) on slices of source rows in parallel tasks
//  3. copy the scaled image into the target via xbrz::nearestNeighborScale(), converting back to 16 bit on the fly
//
//If any source/target dimension is not positive the function asserts (debug) and returns without producing output.
//On success rpo->OutW and rpo->OutH are set to the target dimensions.
//The work buffers are function-local statics: debug builds assert that all calls come from a single thread.
void kega::RenderPluginOutput(RENDER_PLUGIN_OUTP* rpo)
{
#ifndef NDEBUG
    //make sure this DLL is called by a single thread only with all these statics lying around...
    const DWORD callerThreadId = ::GetCurrentThreadId();
    if (threadIdLast == 0)
        threadIdLast = callerThreadId; //first call: remember the caller
    assert(threadIdLast == callerThreadId);
#endif

    //work buffers persist across calls
    static std::vector<uint32_t> bufSrc;  //source image expanded to 32 bit
    static std::vector<uint32_t> bufXbrz; //output of xbrz::scale()

    const bool dimensionsValid = rpo->SrcW > 0 && rpo->SrcH > 0 &&
                                 rpo->DstW > 0 && rpo->DstH > 0;
    if (!dimensionsValid)
    {
        assert(false);
        return;
    }

    //source image
    const int srcWidth  = rpo->SrcW;
    const int srcHeight = rpo->SrcH;
    const int srcPitch  = rpo->SrcPitch;

    //target image
    const int trgWidth  = rpo->DstW;
    const int trgHeight = rpo->DstH;
    const int trgPitch  = rpo->DstPitch;

    //intermediate xBRZ image
    const int xbrzWidth  = srcWidth  * scalingFactor;
    const int xbrzHeight = srcHeight * scalingFactor;

    assert(trgWidth  == xbrzWidth);  //should be true due to RPI_OUT_SCL4
    assert(trgHeight == xbrzHeight); //but don't rely on it => xbrz::nearestNeighborScale()

    //--------- determine 16 bit pixel format -----------------
    const bool is555 = (rpo->Flags & RPI_555_SUPP) != 0;
    assert(is555 || (rpo->Flags & RPI_565_SUPP)); //exactly the two formats announced in RenderPluginGetInfo()
    const RgbType rgbType = is555 ? RgbType::RGB555 : RgbType::RGB565;

    //perf: replacing the two converters below by static lookup tables brought no improvement
    //(and a static table would hard-code the rgbType seen during the first call)
    const auto convertRgb16To32 = [rgbType](uint16_t pix) -> uint32_t
    {
        if (rgbType == RgbType::RGB555)
            return xbrz::rgb555to888(pix);
        if (rgbType == RgbType::RGB565)
            return xbrz::rgb565to888(pix);
        return 0;
    };

    const auto convertRgb32To16 = [rgbType](uint32_t pix) -> uint16_t
    {
        if (rgbType == RgbType::RGB555)
            return xbrz::rgb888to555(pix);
        if (rgbType == RgbType::RGB565)
            return xbrz::rgb888to565(pix);
        return 0;
    };

    //--------- 16 to 32 bit color conversion -----------------
    bufSrc.resize(srcWidth * srcHeight);
    {
        const char* const srcBytes = static_cast<const char*>(rpo->SrcPtr);
        auto out = bufSrc.begin();

        for (int y = 0; y < srcHeight; ++y) //parallelize? probably not worth it
        {
            //source rows are srcPitch bytes apart, the work buffer is densely packed
            const uint16_t* const rowFirst = reinterpret_cast<const uint16_t*>(srcBytes + srcPitch * y);
            out = std::transform(rowFirst, rowFirst + srcWidth, out, convertRgb16To32);
        }
    }

    bufXbrz.resize(xbrzWidth * xbrzHeight);

    //raw pointers: help VS compiler a little + support capture by value
    const uint32_t* srcPtr  = &bufSrc [0];
    uint32_t*       xbrzPtr = &bufXbrz[0];
    uint16_t*       trgPtr  = static_cast<uint16_t*>(rpo->DstPtr);

    //--------- xBRZ scaling -----------------
    const int rowsPerTask = 16; //granularity 1 has noticeable overhead for xBRZ

    concurrency::task_group scaleTasks;

    for (int yFirst = 0; yFirst < srcHeight; yFirst += rowsPerTask)
    {
        const int yLast = std::min(yFirst + rowsPerTask, srcHeight); //last slice may be shorter

        scaleTasks.run([=]
        {
            xbrz::scale(scalingFactor, srcPtr, xbrzPtr, srcWidth, srcHeight, xbrz::ColorFormat::RGB, xbrz::ScalerCfg(), yFirst, yLast);
        });
    }
    scaleTasks.wait(); //all slices must be complete before the result is read below

    //--------- 32 to 16 bit color conversion -----------------

    //parallelize? slight pessimization!
    //integrate into scaling threads using SliceType::SOURCE? medium pessimization!
    //-> note: this is not related to Concurrency::MaxConcurrency
    xbrz::nearestNeighborScale(xbrzPtr, xbrzWidth, xbrzHeight, xbrzWidth * sizeof(uint32_t),
                               trgPtr,  trgWidth,  trgHeight,  trgPitch,
                               xbrz::SliceType::TARGET, 0, trgHeight, convertRgb32To16);

    //report the dimensions that were actually rendered
    rpo->OutW = trgWidth;
    rpo->OutH = trgHeight;
}
