// ****************************************************************************
// * This file is part of the xBRZ project. It is distributed under           *
// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0          *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
// ****************************************************************************

#include "hq.h"
#include <vector>
#include "xbrz.h"

#ifdef USE_TBB
#include <tbb/task_scheduler_init.h>
#include <tbb/parallel_for.h>
#else
#include <ppl.h>
#endif

namespace
{
//number of source rows handed to one parallel task; granularity 1 has noticeable overhead for xBRZ
const size_t TASK_GRANULARITY = 8;

//Scale the image "src" (srcWidth x srcHeight, 32 bits per pixel) into "trg" by "factor".
//The source rows are cut into slices of TASK_GRANULARITY rows and each slice is passed to
//xbrz::scale (color format RGB) as a [yFirst, yLast) row range; slices are dispatched through
//tbb::parallel_for when USE_TBB is defined and through concurrency::parallel_for (PPL) otherwise.
void parallelScaleXBRZ(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg)
{
#ifdef USE_TBB
    const tbb::blocked_range<int> allRows(0, srcHeight, TASK_GRANULARITY);

    tbb::parallel_for(allRows, [=, &cfg](const tbb::blocked_range<int>& slice)
    {
        const int yFirst = slice.begin();
        const int yLast  = slice.end();
        xbrz::scale(factor, src, trg, srcWidth, srcHeight, xbrz::ColorFormat::RGB, cfg, yFirst, yLast);
    });
#else
    const int rowsPerTask = static_cast<int>(TASK_GRANULARITY);

    concurrency::parallel_for(0, srcHeight, rowsPerTask, [&](const int& yFirst)
    {
        //the final slice may reach past srcHeight when srcHeight is not a multiple of the granularity
        //NOTE(review): presumably xbrz::scale clamps yLast to srcHeight - confirm against xbrz.h
        const int yLast = static_cast<int>(yFirst + TASK_GRANULARITY);
        xbrz::scale(factor, src, trg, srcWidth, srcHeight, xbrz::ColorFormat::RGB, cfg, yFirst, yLast);
    });
#endif
}


//Lookup table mapping every 16-bit pixel value (5 bits red, 6 bits green, 5 bits blue)
//to a 32-bit value with red in bits 19-23, green in bits 10-15 and blue in bits 3-7.
uint32_t LUT16to32[256 * 256];

//Fills LUT16to32 once during static initialization through the global instance "dummy".
struct InitLut16to32
{
    InitLut16to32()
    {
        const uint32_t entryCount = 256 * 256;

        //the counter must be wider than 16 bits, otherwise it could never reach entryCount
        uint32_t pix = 0;
        while (pix != entryCount)
        {
            const uint32_t red   = (pix & 0xF800) << 8; //top 5 bits    -> bits 19-23
            const uint32_t green = (pix & 0x07E0) << 5; //middle 6 bits -> bits 10-15
            const uint32_t blue  = (pix & 0x001F) << 3; //low 5 bits    -> bits 3-7

            LUT16to32[pix] = red | green | blue; //channels occupy disjoint bits
            ++pix;
        }
    }
} dummy;


//Upscale a 16-bit frame by "scale" with xBRZ and write the 32-bit result to dstPtr.
//
//  scale          xBRZ scaling factor; the public wrappers in this file pass 2, 3, 4 and 5
//  srcPtr         first byte of the source image; pixels are read as uint16_t and expanded through LUT16to32
//  srcPitch       ignored: overwritten below with a fixed 1024 bytes per source row
//  dstPtr         first byte of the target image; receives (width * scale) x (height * scale) uint32_t pixels,
//                 with rows spaced by a fixed pitch of 1024 * 16 bytes
//  width, height  source dimensions in pixels
//
//Calls with a non-positive scale, width or height return without touching dstPtr.
//The staging buffers are function-local statics, so their storage is reused from one call to the next.
//NOTE(review): concurrent calls would share these buffers without any locking - confirm callers are single-threaded.
void scaleXbrz(int scale, unsigned char* srcPtr, unsigned long srcPitch, unsigned char* dstPtr, int width, int height)
{
    //a negative dimension would turn into an enormous size_t request in resize() below => bail out early
    if (scale <= 0 || width <= 0 || height <= 0)
        return;

    srcPitch = 512 * sizeof(uint16_t);  //it looks like pete's plugin hardcodes 1024 bytes input and 4096 output pitch!!!
    const unsigned long dstPitch = 1024 * 16; //NOTE(review): 16384 bytes, not the 4096 mentioned above - confirm against the host plugin

    //do all size arithmetic in size_t: the equivalent int products can overflow for large frames
    const size_t srcW   = static_cast<size_t>(width);
    const size_t srcH   = static_cast<size_t>(height);
    const size_t factor = static_cast<size_t>(scale);
    const size_t trgW   = srcW * factor;
    const size_t trgH   = srcH * factor;

    static std::vector<uint32_t> bufSrc;
    static std::vector<uint32_t> bufDst;
    bufSrc.resize(srcW * srcH);
    bufDst.resize(trgW * trgH);

    //expand the pitched 16-bit source rows into the tightly packed 32-bit staging buffer
    uint32_t* srcOut = bufSrc.data();
    for (size_t y = 0; y < srcH; ++y)
    {
        const uint16_t* srcRow = reinterpret_cast<const uint16_t*>(srcPtr + srcPitch * y);
        for (size_t x = 0; x < srcW; ++x)
            *srcOut++ = LUT16to32[srcRow[x]];
    }

    parallelScaleXBRZ(factor, bufSrc.data(), bufDst.data(), width, height, xbrz::ScalerCfg());

    //copy the tightly packed scaled image into the pitched target rows
    const uint32_t* scaled = bufDst.data();
    for (size_t y = 0; y < trgH; ++y)
    {
        uint32_t* trgRow = reinterpret_cast<uint32_t*>(dstPtr + dstPitch * y);
        for (size_t x = 0; x < trgW; ++x)
            trgRow[x] = *scaled++;
    }
}
}


//2x entry point: forwards all arguments unchanged to scaleXbrz with a scaling factor of 2.
void scale2xbrz(unsigned char* srcPtr, unsigned long srcPitch, unsigned char* dstPtr, int width, int height)
{
    const int factor = 2;
    scaleXbrz(factor, srcPtr, srcPitch, dstPtr, width, height);
}


//3x entry point: forwards all arguments unchanged to scaleXbrz with a scaling factor of 3.
void scale3xbrz(unsigned char* srcPtr, unsigned long srcPitch, unsigned char* dstPtr, int width, int height)
{
    const int factor = 3;
    scaleXbrz(factor, srcPtr, srcPitch, dstPtr, width, height);
}

//4x entry point: forwards all arguments unchanged to scaleXbrz with a scaling factor of 4.
void scale4xbrz(unsigned char* srcPtr, unsigned long srcPitch, unsigned char* dstPtr, int width, int height)
{
    const int factor = 4;
    scaleXbrz(factor, srcPtr, srcPitch, dstPtr, width, height);
}

//5x entry point: forwards all arguments unchanged to scaleXbrz with a scaling factor of 5.
void scale5xbrz(unsigned char* srcPtr, unsigned long srcPitch, unsigned char* dstPtr, int width, int height)
{
    const int factor = 5;
    scaleXbrz(factor, srcPtr, srcPitch, dstPtr, width, height);
}