/*******************************************************************************
* Copyright 2016 Intel Corporation.
*
*
* This software and the related documents are Intel copyrighted materials, and your use of them is governed by
* the express license under which they were provided to you ('License'). Unless the License provides otherwise,
* you may not use, modify, copy, publish, distribute, disclose or transmit this software or the related
* documents without Intel's prior written permission.
* This software and the related documents are provided as is, with no express or implied warranties, other than
* those that are expressly stated in the License.
*******************************************************************************/

/* Intel(R) Integrated Performance Primitives (Intel(R) IPP) */

#include "prfilterbrd_tl.h"
/*
 * Decides how a destination volume is cut into tiles for threaded filtering.
 *
 *   dstRoiSize   - full destination volume.
 *   kernelVolume - accepted for interface compatibility; not used by this routine.
 *   numThreads   - number of worker threads; 0 is treated as 1.
 *   pCubeVolume  - [out] size of a regular tile.
 *   pLastVolume  - [out] size of the trailing tile.
 *   splitImage   - [out] tile counts per axis; preset to 1 x 1 x 1 here and only
 *                  handed on to ipprSplitToCubes_LT when the volume is split.
 *
 * The volume is kept whole (one tile equal to dstRoiSize) when there is a single
 * thread, or when the depth is smaller than TILE_S. Otherwise the splitting is
 * delegated to ipprSplitToCubes_LT with a requested tile of full width and height
 * and TILE_S planes of depth.
 */
static void ownrGetFilterBorderCubeSize(IpprVolumeL dstRoiSize, IpprVolumeL kernelVolume, Ipp32u numThreads, IpprVolumeL* pCubeVolume, IpprVolumeL* pLastVolume, IpprPointL* splitImage)
{
    IpprVolumeL cubeVolume;

    (void)kernelVolume; /* unused by the current splitting heuristic */

    /* A zero thread count would make the division below undefined; fall back to one thread. */
    if (numThreads == 0)
        numThreads = 1;

    cubeVolume.width = dstRoiSize.width;
    cubeVolume.height = dstRoiSize.height;
    cubeVolume.depth  = dstRoiSize.depth / (IppSizeL)numThreads;
    /* Never consider a per-thread slab thinner than TILE_S if the volume can hold one. */
    if(cubeVolume.depth < TILE_S && TILE_S <= dstRoiSize.depth)
        cubeVolume.depth = TILE_S;
    (*splitImage).x = (*splitImage).y = (*splitImage).z = 1;
    if (((numThreads == 1) || (cubeVolume.depth < TILE_S)) && (dstRoiSize.depth))
    {
        /* Single tile: regular and trailing tile both cover the whole volume. */
        (*pLastVolume).width  = (*pCubeVolume).width  = dstRoiSize.width;
        (*pLastVolume).height = (*pCubeVolume).height = dstRoiSize.height;
        (*pLastVolume).depth  = (*pCubeVolume).depth  = dstRoiSize.depth;
    }
    else
    {
        cubeVolume.depth  = TILE_S;
        cubeVolume.width  = dstRoiSize.width;
        cubeVolume.height = dstRoiSize.height;
        /* split the volume to cubes */
        ipprSplitToCubes_LT(dstRoiSize, cubeVolume, splitImage, pCubeVolume, pLastVolume);
    }
}

/*
 * Computes the spec and work-buffer sizes needed by the threaded 3D border filter.
 *
 *   kernelVolume - filter kernel size; every dimension must be positive.
 *   dstRoiVolume - destination volume; every dimension must be positive.
 *   dataType     - image data type, forwarded to ipprFilterBorderGetSize_L.
 *   kernelType   - kernel data type, forwarded to ipprFilterBorderGetSize_L.
 *   numChannels  - channel count, forwarded to ipprFilterBorderGetSize_L.
 *   pSpecSizeL   - [out] spec size: the single-thread spec size plus sizeof(FilterBorderInfo).
 *   pBufferSize  - [out] buffer size: the single-tile buffer size times the thread count.
 *
 * Returns ippStsNullPtrErr for a null output pointer, ippStsSizeErr for a
 * non-positive dimension, otherwise the status of ipprFilterBorderGetSize_L.
 * The outputs are written only when that status is non-negative.
 */
IPPFUN(IppStatus, ipprFilterBorderGetSize_LT, (IpprVolumeL kernelVolume, IpprVolumeL dstRoiVolume, IppDataType dataType, IppDataType kernelType, int numChannels, IppSizeL* pSpecSizeL, IppSizeL* pBufferSize))
{
    /* Start from one thread, like the filter entry points do, so the value is
       defined even if ippGetNumThreads_LT leaves it untouched. */
    Ipp32s      numThreads = 1;
    IpprVolumeL pTileSize = { 0, 0, 0 }, pLastSize = { 0, 0, 0 };
    IppSizeL    pSpecSize;
    IpprPointL  splitImage = { 0, 0, 0 };
    IppStatus   status = ippStsNoErr;
    IppSizeL    width  = dstRoiVolume.width, pBufSize;
    IppSizeL    height = dstRoiVolume.height;
    IppSizeL    depth  = dstRoiVolume.depth;

    if (pSpecSizeL == 0 || pBufferSize == 0) return ippStsNullPtrErr;
    if (width <= 0 || height <= 0 || depth <= 0) return ippStsSizeErr;
    if (kernelVolume.width <= 0 || kernelVolume.height <= 0 || kernelVolume.depth <= 0) return ippStsSizeErr;

    ippGetNumThreads_LT(&numThreads);

    ownrGetFilterBorderCubeSize(dstRoiVolume, kernelVolume, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* Size the per-thread buffer for the larger of the regular and trailing tile. */
    if (pLastSize.width  < pTileSize.width)   pLastSize.width  = pTileSize.width;
    if (pLastSize.height < pTileSize.height)  pLastSize.height = pTileSize.height;
    if (pLastSize.depth  < pTileSize.depth)   pLastSize.depth  = pTileSize.depth;
    status = ipprFilterBorderGetSize_L(kernelVolume, pLastSize, dataType, kernelType, numChannels, &pSpecSize, &pBufSize);
    if (status >= 0){
        /* The LT spec is a FilterBorderInfo header followed by the single-thread spec. */
        *pSpecSizeL  = pSpecSize + sizeof(FilterBorderInfo);
        /* One scratch region per thread. */
        *pBufferSize = pBufSize * ((IppSizeL)numThreads);
    }
    return status;
}
/*
 * Initializes a threaded filter spec for a 16s kernel.
 *
 * The LT spec starts with a FilterBorderInfo header (kernel volume and kernel
 * type) followed by the single-thread spec, which ipprFilterBorderInit_16s_L
 * fills in.
 *
 * Returns ippStsNullPtrErr when pSpecLT is null, otherwise the status of
 * ipprFilterBorderInit_16s_L. The header is written before that call.
 */
IPPFUN(IppStatus, ipprFilterBorderInit_16s_LT, (const Ipp16s* pKernel, IpprVolumeL  kernelVolume, int divisor, IppDataType dataType, int numChannels, IpprFilterBorderSpec_LT* pSpecLT))
{
    FilterBorderInfo*     pFilterBorderInfo;
    IpprFilterBorderSpec* pSpecL;

    if (pSpecLT == 0)     return ippStsNullPtrErr;

    /* Derive the sub-pointers only after the null check: offsetting a null pointer is undefined. */
    pFilterBorderInfo = (FilterBorderInfo*)pSpecLT;
    pSpecL            = (IpprFilterBorderSpec*)((Ipp8u*)pSpecLT + sizeof(FilterBorderInfo));

    pFilterBorderInfo->kernelVolume = kernelVolume;
    pFilterBorderInfo->kernelType   = ipp16s;
    return ipprFilterBorderInit_16s_L(pKernel, kernelVolume, divisor, dataType, numChannels, pSpecL);
}
/*
 * Initializes a threaded filter spec for a 32f kernel.
 *
 * The LT spec starts with a FilterBorderInfo header (kernel volume and kernel
 * type) followed by the single-thread spec, which ipprFilterBorderInit_32f_L
 * fills in.
 *
 * Returns ippStsNullPtrErr when pSpecLT is null, otherwise the status of
 * ipprFilterBorderInit_32f_L. The header is written before that call.
 */
IPPFUN(IppStatus, ipprFilterBorderInit_32f_LT, (const Ipp32f* pKernel, IpprVolumeL  kernelVolume, IppDataType dataType, int numChannels, IpprFilterBorderSpec_LT* pSpecLT))
{
    FilterBorderInfo*     pFilterBorderInfo;
    IpprFilterBorderSpec* pSpecL;

    if (pSpecLT == 0)     return ippStsNullPtrErr;

    /* Derive the sub-pointers only after the null check: offsetting a null pointer is undefined. */
    pFilterBorderInfo = (FilterBorderInfo*)pSpecLT;
    pSpecL            = (IpprFilterBorderSpec*)((Ipp8u*)pSpecLT + sizeof(FilterBorderInfo));

    pFilterBorderInfo->kernelVolume = kernelVolume;
    pFilterBorderInfo->kernelType = ipp32f;
    return ipprFilterBorderInit_32f_L(pKernel, kernelVolume, dataType, numChannels, pSpecL);
}
/*
 * Initializes a threaded filter spec for a 64f kernel.
 *
 * The LT spec starts with a FilterBorderInfo header (kernel volume and kernel
 * type) followed by the single-thread spec, which ipprFilterBorderInit_64f_L
 * fills in.
 *
 * Returns ippStsNullPtrErr when pSpecLT is null, otherwise the status of
 * ipprFilterBorderInit_64f_L. The header is written before that call.
 */
IPPFUN(IppStatus, ipprFilterBorderInit_64f_LT, (const Ipp64f* pKernel, IpprVolumeL  kernelVolume, IppDataType dataType, int numChannels, IpprFilterBorderSpec_LT* pSpecLT))
{
    FilterBorderInfo*     pFilterBorderInfo;
    IpprFilterBorderSpec* pSpecL;

    if (pSpecLT == 0)     return ippStsNullPtrErr;

    /* Derive the sub-pointers only after the null check: offsetting a null pointer is undefined. */
    pFilterBorderInfo = (FilterBorderInfo*)pSpecLT;
    pSpecL            = (IpprFilterBorderSpec*)((Ipp8u*)pSpecLT + sizeof(FilterBorderInfo));

    pFilterBorderInfo->kernelVolume = kernelVolume;
    pFilterBorderInfo->kernelType = ipp64f;
    return ipprFilterBorderInit_64f_L(pKernel, kernelVolume, dataType, numChannels, pSpecL);
}

/*
 * Per-tile worker of the threaded 8u filter; ipprFilterBorder_8u_C1V_LT passes it
 * to ippParallelFor_LT.
 *
 *   t   - linear tile index, decoded with x varying fastest, then y, then z.
 *   arg - pointer to the ipprFilterBorder_8u_LT_Str describing the whole job.
 *
 * Returns the status of ipprFilterBorder_8u_C1V_L for this tile.
 */
IppStatus ipprFilterBorder_8u_C1V_LT_Fun(IppSizeL t, void *arg)
{
    ipprFilterBorder_8u_LT_Str *job = (ipprFilterBorder_8u_LT_Str *)arg;

    /* Unpack the job description. */
    const Ipp8u   *srcBase       = (const Ipp8u *)job->pSrc;
    IppSizeL       srcPlaneStep  = job->srcPlaneStep;
    IppSizeL       srcStep       = job->srcStep;
    Ipp8u         *dstBase       = job->pDst;
    IppSizeL       dstPlaneStep  = job->dstPlaneStep;
    IppSizeL       dstStep       = job->dstStep;
    IpprBorderType callerBorder  = job->border;
    Ipp8u         *borderValue   = job->borderValue;
    const IpprFilterBorderSpec_LT *spec = (const IpprFilterBorderSpec_LT *)job->pSpec;
    Ipp8u         *scratchBase   = job->pBuffer;
    IppSizeL       scratchStride = job->bufSize;
    IppSizeL       channels      = job->numChannels;
    IpprPointL     grid          = job->splitImage;
    IpprVolumeL    tile          = job->pTileSize;
    IpprVolumeL    tail          = job->pLastSize;

    /* Position of this tile inside the tile grid. */
    IppSizeL tilesPerPlane = grid.x * grid.y;
    IppSizeL col   = t % grid.x;
    IppSizeL row   = (t % tilesPerPlane) / grid.x;
    IppSizeL plane = t / tilesPerPlane;

    /* Index of the final tile on each axis, narrowed through int as the comparisons require. */
    IppSizeL lastCol   = (int)(grid.x - 1);
    IppSizeL lastRow   = (int)(grid.y - 1);
    IppSizeL lastPlane = (int)(grid.z - 1);

    /* First sample of the tile: x offset in elements, row and plane offsets in bytes. */
    Ipp8u *srcBytes = (Ipp8u *)(srcBase + col * tile.width * channels);
    srcBytes += row * tile.height * srcStep;
    srcBytes += plane * tile.depth * srcPlaneStep;
    Ipp8u *dstBytes = (Ipp8u *)(dstBase + col * tile.width * channels);
    dstBytes += row * tile.height * dstStep;
    dstBytes += plane * tile.depth * dstPlaneStep;
    Ipp8u *tileSrc = (Ipp8u *)srcBytes;
    Ipp8u *tileDst = (Ipp8u *)dstBytes;

    /* The final tile on an axis uses the trailing size when one is recorded. */
    IpprVolumeL roi;
    roi.width  = (tail.width  && (col   == lastCol))   ? tail.width  : tile.width;
    roi.height = (tail.height && (row   == lastRow))   ? tail.height : tile.height;
    roi.depth  = (tail.depth  && (plane == lastPlane)) ? tail.depth  : tile.depth;

    /* Add in-memory flags on top of the caller's border type for every split axis:
       one flag for the first tile, the opposite one for the last, both for tiles
       in between. Presumably these tell the filter that real samples exist beyond
       that face - confirm against the IPP border documentation. */
    int flags = (int)callerBorder;
    if (grid.y > 1)
    {
        if (row == 0)            flags |= (int)ipprBorderInMemBottom;
        else if (row == lastRow) flags |= (int)ipprBorderInMemTop;
        else                     flags |= (int)ipprBorderInMemBottom | (int)ipprBorderInMemTop;
    }
    if (grid.x > 1)
    {
        if (col == 0)            flags |= (int)ipprBorderInMemRight;
        else if (col == lastCol) flags |= (int)ipprBorderInMemLeft;
        else                     flags |= (int)ipprBorderInMemRight | (int)ipprBorderInMemLeft;
    }
    if (grid.z > 1)
    {
        if (plane == 0)              flags |= (int)ipprBorderInMemBack;
        else if (plane == lastPlane) flags |= (int)ipprBorderInMemFront;
        else                         flags |= (int)ipprBorderInMemBack | (int)ipprBorderInMemFront;
    }

    /* Each thread works in its own slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = scratchBase + scratchStride * threadIdx;

    return ipprFilterBorder_8u_C1V_L(tileSrc, srcPlaneStep, srcStep, tileDst, dstPlaneStep, dstStep, roi, (IpprBorderType)flags, borderValue, (IpprFilterBorderSpec*)spec, scratch);
}
/*
 * Per-tile worker of the threaded 16u filter; ipprFilterBorder_16u_C1V_LT passes it
 * to ippParallelFor_LT.
 *
 *   t   - linear tile index, decoded with x varying fastest, then y, then z.
 *   arg - pointer to the ipprFilterBorder_16u_LT_Str describing the whole job.
 *
 * Returns the status of ipprFilterBorder_16u_C1V_L for this tile.
 */
IppStatus ipprFilterBorder_16u_C1V_LT_Fun(IppSizeL t, void *arg)
{
    ipprFilterBorder_16u_LT_Str *job = (ipprFilterBorder_16u_LT_Str *)arg;

    /* Unpack the job description. */
    const Ipp16u  *srcBase       = (const Ipp16u *)job->pSrc;
    IppSizeL       srcPlaneStep  = job->srcPlaneStep;
    IppSizeL       srcStep       = job->srcStep;
    Ipp16u        *dstBase       = job->pDst;
    IppSizeL       dstPlaneStep  = job->dstPlaneStep;
    IppSizeL       dstStep       = job->dstStep;
    IpprBorderType callerBorder  = job->border;
    Ipp16u        *borderValue   = job->borderValue;
    const IpprFilterBorderSpec_LT *spec = (const IpprFilterBorderSpec_LT *)job->pSpec;
    Ipp8u         *scratchBase   = job->pBuffer;
    IppSizeL       scratchStride = job->bufSize;
    IppSizeL       channels      = job->numChannels;
    IpprPointL     grid          = job->splitImage;
    IpprVolumeL    tile          = job->pTileSize;
    IpprVolumeL    tail          = job->pLastSize;

    /* Position of this tile inside the tile grid. */
    IppSizeL tilesPerPlane = grid.x * grid.y;
    IppSizeL col   = t % grid.x;
    IppSizeL row   = (t % tilesPerPlane) / grid.x;
    IppSizeL plane = t / tilesPerPlane;

    /* Index of the final tile on each axis, narrowed through int as the comparisons require. */
    IppSizeL lastCol   = (int)(grid.x - 1);
    IppSizeL lastRow   = (int)(grid.y - 1);
    IppSizeL lastPlane = (int)(grid.z - 1);

    /* First sample of the tile: x offset in elements, row and plane offsets in bytes. */
    Ipp8u *srcBytes = (Ipp8u *)(srcBase + col * tile.width * channels);
    srcBytes += row * tile.height * srcStep;
    srcBytes += plane * tile.depth * srcPlaneStep;
    Ipp8u *dstBytes = (Ipp8u *)(dstBase + col * tile.width * channels);
    dstBytes += row * tile.height * dstStep;
    dstBytes += plane * tile.depth * dstPlaneStep;
    Ipp16u *tileSrc = (Ipp16u *)srcBytes;
    Ipp16u *tileDst = (Ipp16u *)dstBytes;

    /* The final tile on an axis uses the trailing size when one is recorded. */
    IpprVolumeL roi;
    roi.width  = (tail.width  && (col   == lastCol))   ? tail.width  : tile.width;
    roi.height = (tail.height && (row   == lastRow))   ? tail.height : tile.height;
    roi.depth  = (tail.depth  && (plane == lastPlane)) ? tail.depth  : tile.depth;

    /* Add in-memory flags on top of the caller's border type for every split axis:
       one flag for the first tile, the opposite one for the last, both for tiles
       in between. Presumably these tell the filter that real samples exist beyond
       that face - confirm against the IPP border documentation. */
    int flags = (int)callerBorder;
    if (grid.y > 1)
    {
        if (row == 0)            flags |= (int)ipprBorderInMemBottom;
        else if (row == lastRow) flags |= (int)ipprBorderInMemTop;
        else                     flags |= (int)ipprBorderInMemBottom | (int)ipprBorderInMemTop;
    }
    if (grid.x > 1)
    {
        if (col == 0)            flags |= (int)ipprBorderInMemRight;
        else if (col == lastCol) flags |= (int)ipprBorderInMemLeft;
        else                     flags |= (int)ipprBorderInMemRight | (int)ipprBorderInMemLeft;
    }
    if (grid.z > 1)
    {
        if (plane == 0)              flags |= (int)ipprBorderInMemBack;
        else if (plane == lastPlane) flags |= (int)ipprBorderInMemFront;
        else                         flags |= (int)ipprBorderInMemBack | (int)ipprBorderInMemFront;
    }

    /* Each thread works in its own slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = scratchBase + scratchStride * threadIdx;

    return ipprFilterBorder_16u_C1V_L(tileSrc, srcPlaneStep, srcStep, tileDst, dstPlaneStep, dstStep, roi, (IpprBorderType)flags, borderValue, (IpprFilterBorderSpec*)spec, scratch);
}
/*
 * Per-tile worker of the threaded 16s filter; ipprFilterBorder_16s_C1V_LT passes it
 * to ippParallelFor_LT.
 *
 *   t   - linear tile index, decoded with x varying fastest, then y, then z.
 *   arg - pointer to the ipprFilterBorder_16s_LT_Str describing the whole job.
 *
 * Returns the status of ipprFilterBorder_16s_C1V_L for this tile.
 */
IppStatus ipprFilterBorder_16s_C1V_LT_Fun(IppSizeL t, void *arg)
{
    ipprFilterBorder_16s_LT_Str *job = (ipprFilterBorder_16s_LT_Str *)arg;

    /* Unpack the job description. */
    const Ipp16s  *srcBase       = (const Ipp16s *)job->pSrc;
    IppSizeL       srcPlaneStep  = job->srcPlaneStep;
    IppSizeL       srcStep       = job->srcStep;
    Ipp16s        *dstBase       = job->pDst;
    IppSizeL       dstPlaneStep  = job->dstPlaneStep;
    IppSizeL       dstStep       = job->dstStep;
    IpprBorderType callerBorder  = job->border;
    Ipp16s        *borderValue   = job->borderValue;
    const IpprFilterBorderSpec_LT *spec = (const IpprFilterBorderSpec_LT *)job->pSpec;
    Ipp8u         *scratchBase   = job->pBuffer;
    IppSizeL       scratchStride = job->bufSize;
    IppSizeL       channels      = job->numChannels;
    IpprPointL     grid          = job->splitImage;
    IpprVolumeL    tile          = job->pTileSize;
    IpprVolumeL    tail          = job->pLastSize;

    /* Position of this tile inside the tile grid. */
    IppSizeL tilesPerPlane = grid.x * grid.y;
    IppSizeL col   = t % grid.x;
    IppSizeL row   = (t % tilesPerPlane) / grid.x;
    IppSizeL plane = t / tilesPerPlane;

    /* Index of the final tile on each axis, narrowed through int as the comparisons require. */
    IppSizeL lastCol   = (int)(grid.x - 1);
    IppSizeL lastRow   = (int)(grid.y - 1);
    IppSizeL lastPlane = (int)(grid.z - 1);

    /* First sample of the tile: x offset in elements, row and plane offsets in bytes. */
    Ipp8u *srcBytes = (Ipp8u *)(srcBase + col * tile.width * channels);
    srcBytes += row * tile.height * srcStep;
    srcBytes += plane * tile.depth * srcPlaneStep;
    Ipp8u *dstBytes = (Ipp8u *)(dstBase + col * tile.width * channels);
    dstBytes += row * tile.height * dstStep;
    dstBytes += plane * tile.depth * dstPlaneStep;
    Ipp16s *tileSrc = (Ipp16s *)srcBytes;
    Ipp16s *tileDst = (Ipp16s *)dstBytes;

    /* The final tile on an axis uses the trailing size when one is recorded. */
    IpprVolumeL roi;
    roi.width  = (tail.width  && (col   == lastCol))   ? tail.width  : tile.width;
    roi.height = (tail.height && (row   == lastRow))   ? tail.height : tile.height;
    roi.depth  = (tail.depth  && (plane == lastPlane)) ? tail.depth  : tile.depth;

    /* Add in-memory flags on top of the caller's border type for every split axis:
       one flag for the first tile, the opposite one for the last, both for tiles
       in between. Presumably these tell the filter that real samples exist beyond
       that face - confirm against the IPP border documentation. */
    int flags = (int)callerBorder;
    if (grid.y > 1)
    {
        if (row == 0)            flags |= (int)ipprBorderInMemBottom;
        else if (row == lastRow) flags |= (int)ipprBorderInMemTop;
        else                     flags |= (int)ipprBorderInMemBottom | (int)ipprBorderInMemTop;
    }
    if (grid.x > 1)
    {
        if (col == 0)            flags |= (int)ipprBorderInMemRight;
        else if (col == lastCol) flags |= (int)ipprBorderInMemLeft;
        else                     flags |= (int)ipprBorderInMemRight | (int)ipprBorderInMemLeft;
    }
    if (grid.z > 1)
    {
        if (plane == 0)              flags |= (int)ipprBorderInMemBack;
        else if (plane == lastPlane) flags |= (int)ipprBorderInMemFront;
        else                         flags |= (int)ipprBorderInMemBack | (int)ipprBorderInMemFront;
    }

    /* Each thread works in its own slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = scratchBase + scratchStride * threadIdx;

    return ipprFilterBorder_16s_C1V_L(tileSrc, srcPlaneStep, srcStep, tileDst, dstPlaneStep, dstStep, roi, (IpprBorderType)flags, borderValue, (IpprFilterBorderSpec*)spec, scratch);
}
/*
 * Per-tile worker of the threaded 32f filter; ipprFilterBorder_32f_C1V_LT passes it
 * to ippParallelFor_LT.
 *
 *   t   - linear tile index, decoded with x varying fastest, then y, then z.
 *   arg - pointer to the ipprFilterBorder_32f_LT_Str describing the whole job.
 *
 * Returns the status of ipprFilterBorder_32f_C1V_L for this tile.
 */
IppStatus ipprFilterBorder_32f_C1V_LT_Fun(IppSizeL t, void *arg)
{
    ipprFilterBorder_32f_LT_Str *job = (ipprFilterBorder_32f_LT_Str *)arg;

    /* Unpack the job description. */
    const Ipp32f  *srcBase       = (const Ipp32f *)job->pSrc;
    IppSizeL       srcPlaneStep  = job->srcPlaneStep;
    IppSizeL       srcStep       = job->srcStep;
    Ipp32f        *dstBase       = job->pDst;
    IppSizeL       dstPlaneStep  = job->dstPlaneStep;
    IppSizeL       dstStep       = job->dstStep;
    IpprBorderType callerBorder  = job->border;
    Ipp32f        *borderValue   = job->borderValue;
    const IpprFilterBorderSpec_LT *spec = (const IpprFilterBorderSpec_LT *)job->pSpec;
    Ipp8u         *scratchBase   = job->pBuffer;
    IppSizeL       scratchStride = job->bufSize;
    IppSizeL       channels      = job->numChannels;
    IpprPointL     grid          = job->splitImage;
    IpprVolumeL    tile          = job->pTileSize;
    IpprVolumeL    tail          = job->pLastSize;

    /* Position of this tile inside the tile grid. */
    IppSizeL tilesPerPlane = grid.x * grid.y;
    IppSizeL col   = t % grid.x;
    IppSizeL row   = (t % tilesPerPlane) / grid.x;
    IppSizeL plane = t / tilesPerPlane;

    /* Index of the final tile on each axis, narrowed through int as the comparisons require. */
    IppSizeL lastCol   = (int)(grid.x - 1);
    IppSizeL lastRow   = (int)(grid.y - 1);
    IppSizeL lastPlane = (int)(grid.z - 1);

    /* First sample of the tile: x offset in elements, row and plane offsets in bytes. */
    Ipp8u *srcBytes = (Ipp8u *)(srcBase + col * tile.width * channels);
    srcBytes += row * tile.height * srcStep;
    srcBytes += plane * tile.depth * srcPlaneStep;
    Ipp8u *dstBytes = (Ipp8u *)(dstBase + col * tile.width * channels);
    dstBytes += row * tile.height * dstStep;
    dstBytes += plane * tile.depth * dstPlaneStep;
    Ipp32f *tileSrc = (Ipp32f *)srcBytes;
    Ipp32f *tileDst = (Ipp32f *)dstBytes;

    /* The final tile on an axis uses the trailing size when one is recorded. */
    IpprVolumeL roi;
    roi.width  = (tail.width  && (col   == lastCol))   ? tail.width  : tile.width;
    roi.height = (tail.height && (row   == lastRow))   ? tail.height : tile.height;
    roi.depth  = (tail.depth  && (plane == lastPlane)) ? tail.depth  : tile.depth;

    /* Add in-memory flags on top of the caller's border type for every split axis:
       one flag for the first tile, the opposite one for the last, both for tiles
       in between. Presumably these tell the filter that real samples exist beyond
       that face - confirm against the IPP border documentation. */
    int flags = (int)callerBorder;
    if (grid.y > 1)
    {
        if (row == 0)            flags |= (int)ipprBorderInMemBottom;
        else if (row == lastRow) flags |= (int)ipprBorderInMemTop;
        else                     flags |= (int)ipprBorderInMemBottom | (int)ipprBorderInMemTop;
    }
    if (grid.x > 1)
    {
        if (col == 0)            flags |= (int)ipprBorderInMemRight;
        else if (col == lastCol) flags |= (int)ipprBorderInMemLeft;
        else                     flags |= (int)ipprBorderInMemRight | (int)ipprBorderInMemLeft;
    }
    if (grid.z > 1)
    {
        if (plane == 0)              flags |= (int)ipprBorderInMemBack;
        else if (plane == lastPlane) flags |= (int)ipprBorderInMemFront;
        else                         flags |= (int)ipprBorderInMemBack | (int)ipprBorderInMemFront;
    }

    /* Each thread works in its own slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = scratchBase + scratchStride * threadIdx;

    return ipprFilterBorder_32f_C1V_L(tileSrc, srcPlaneStep, srcStep, tileDst, dstPlaneStep, dstStep, roi, (IpprBorderType)flags, borderValue, (IpprFilterBorderSpec*)spec, scratch);
}
/*
 * Per-tile worker of the threaded 64f filter; ipprFilterBorder_64f_C1V_LT passes it
 * to ippParallelFor_LT.
 *
 *   t   - linear tile index, decoded with x varying fastest, then y, then z.
 *   arg - pointer to the ipprFilterBorder_64f_LT_Str describing the whole job.
 *
 * Returns the status of ipprFilterBorder_64f_C1V_L for this tile.
 */
IppStatus ipprFilterBorder_64f_C1V_LT_Fun(IppSizeL t, void *arg)
{
    ipprFilterBorder_64f_LT_Str *job = (ipprFilterBorder_64f_LT_Str *)arg;

    /* Unpack the job description. */
    const Ipp64f  *srcBase       = (const Ipp64f *)job->pSrc;
    IppSizeL       srcPlaneStep  = job->srcPlaneStep;
    IppSizeL       srcStep       = job->srcStep;
    Ipp64f        *dstBase       = job->pDst;
    IppSizeL       dstPlaneStep  = job->dstPlaneStep;
    IppSizeL       dstStep       = job->dstStep;
    IpprBorderType callerBorder  = job->border;
    Ipp64f        *borderValue   = job->borderValue;
    const IpprFilterBorderSpec_LT *spec = (const IpprFilterBorderSpec_LT *)job->pSpec;
    Ipp8u         *scratchBase   = job->pBuffer;
    IppSizeL       scratchStride = job->bufSize;
    IppSizeL       channels      = job->numChannels;
    IpprPointL     grid          = job->splitImage;
    IpprVolumeL    tile          = job->pTileSize;
    IpprVolumeL    tail          = job->pLastSize;

    /* Position of this tile inside the tile grid. */
    IppSizeL tilesPerPlane = grid.x * grid.y;
    IppSizeL col   = t % grid.x;
    IppSizeL row   = (t % tilesPerPlane) / grid.x;
    IppSizeL plane = t / tilesPerPlane;

    /* Index of the final tile on each axis, narrowed through int as the comparisons require. */
    IppSizeL lastCol   = (int)(grid.x - 1);
    IppSizeL lastRow   = (int)(grid.y - 1);
    IppSizeL lastPlane = (int)(grid.z - 1);

    /* First sample of the tile: x offset in elements, row and plane offsets in bytes. */
    Ipp8u *srcBytes = (Ipp8u *)(srcBase + col * tile.width * channels);
    srcBytes += row * tile.height * srcStep;
    srcBytes += plane * tile.depth * srcPlaneStep;
    Ipp8u *dstBytes = (Ipp8u *)(dstBase + col * tile.width * channels);
    dstBytes += row * tile.height * dstStep;
    dstBytes += plane * tile.depth * dstPlaneStep;
    Ipp64f *tileSrc = (Ipp64f *)srcBytes;
    Ipp64f *tileDst = (Ipp64f *)dstBytes;

    /* The final tile on an axis uses the trailing size when one is recorded. */
    IpprVolumeL roi;
    roi.width  = (tail.width  && (col   == lastCol))   ? tail.width  : tile.width;
    roi.height = (tail.height && (row   == lastRow))   ? tail.height : tile.height;
    roi.depth  = (tail.depth  && (plane == lastPlane)) ? tail.depth  : tile.depth;

    /* Add in-memory flags on top of the caller's border type for every split axis:
       one flag for the first tile, the opposite one for the last, both for tiles
       in between. Presumably these tell the filter that real samples exist beyond
       that face - confirm against the IPP border documentation. */
    int flags = (int)callerBorder;
    if (grid.y > 1)
    {
        if (row == 0)            flags |= (int)ipprBorderInMemBottom;
        else if (row == lastRow) flags |= (int)ipprBorderInMemTop;
        else                     flags |= (int)ipprBorderInMemBottom | (int)ipprBorderInMemTop;
    }
    if (grid.x > 1)
    {
        if (col == 0)            flags |= (int)ipprBorderInMemRight;
        else if (col == lastCol) flags |= (int)ipprBorderInMemLeft;
        else                     flags |= (int)ipprBorderInMemRight | (int)ipprBorderInMemLeft;
    }
    if (grid.z > 1)
    {
        if (plane == 0)              flags |= (int)ipprBorderInMemBack;
        else if (plane == lastPlane) flags |= (int)ipprBorderInMemFront;
        else                         flags |= (int)ipprBorderInMemBack | (int)ipprBorderInMemFront;
    }

    /* Each thread works in its own slice of the shared buffer. */
    int threadIdx = 0;
    ippGetThreadIdx_LT(&threadIdx);
    Ipp8u *scratch = scratchBase + scratchStride * threadIdx;

    return ipprFilterBorder_64f_C1V_L(tileSrc, srcPlaneStep, srcStep, tileDst, dstPlaneStep, dstStep, roi, (IpprBorderType)flags, borderValue, (IpprFilterBorderSpec*)spec, scratch);
}

/*
 * Threaded 3D border filter for single-channel 8u volumes.
 *
 * Splits roiVolume into tiles with ownrGetFilterBorderCubeSize and filters them
 * through ippParallelFor_LT; with one thread or a single tile it calls
 * ipprFilterBorder_8u_C1V_L directly on the whole volume.
 *
 *   pSrc, srcPlaneStep, srcStep - source volume and its plane / row steps.
 *   pDst, dstPlaneStep, dstStep - destination volume and its plane / row steps.
 *   roiVolume                   - volume to process; every dimension must be positive.
 *   borderType, borderValue     - border handling, forwarded to the single-thread filter.
 *   pSpecLT                     - LT spec: a FilterBorderInfo header followed by the
 *                                 single-thread spec.
 *   pBuffer                     - work buffer, sliced per thread in the tiled path.
 *
 * Returns ippStsNullPtrErr / ippStsSizeErr for invalid arguments, a negative
 * status from ipprFilterBorderGetSize_L if the per-tile buffer size cannot be
 * determined, otherwise the status of the filtering call.
 */
IPPFUN(IppStatus, ipprFilterBorder_8u_C1V_LT, (const Ipp8u*  pSrc, IppSizeL srcPlaneStep, IppSizeL srcStep, Ipp8u*  pDst, IppSizeL dstPlaneStep, IppSizeL dstStep, IpprVolumeL roiVolume, IpprBorderType borderType, const Ipp8u  borderValue[1], const IpprFilterBorderSpec_LT* pSpecLT, Ipp8u* pBuffer))
{
    IppStatus statusAll = ippStsNoErr;
    IppSizeL  numChannels = 1;
    int    numThreads = 1;

    FilterBorderInfo *pFilterBorderInfo;
    IpprFilterBorderSpec* pSpec;
    IpprVolumeL kernelVolume;
    IpprPointL splitImage = { 0, 0, 0 };
    IpprVolumeL pTileSize = { 0, 0, 0 }, pLastSize = { 0, 0, 0 };
    if (pSrc == 0 || pDst == 0)     return ippStsNullPtrErr;
    if (roiVolume.width <= 0 || roiVolume.height <= 0 || roiVolume.depth <= 0) return ippStsSizeErr;
    if (pSpecLT == 0 || pBuffer == 0)     return ippStsNullPtrErr;

    /* The LT spec is a FilterBorderInfo header followed by the single-thread spec. */
    pFilterBorderInfo = (FilterBorderInfo*)pSpecLT;
    pSpec = (IpprFilterBorderSpec*)((Ipp8u*)pSpecLT + sizeof(FilterBorderInfo));
    kernelVolume = pFilterBorderInfo->kernelVolume;

    ippGetNumThreads_LT(&numThreads);
    ownrGetFilterBorderCubeSize(roiVolume, kernelVolume, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* The trailing tile is never treated as smaller than a regular tile. */
    if (pLastSize.width  < pTileSize.width)   pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)  pLastSize.height = pTileSize.height;
    if (pLastSize.depth  < pTileSize.depth)   pLastSize.depth = pTileSize.depth;

    if ((numThreads == 1) || (splitImage.x*splitImage.y*splitImage.z == 1))
    {
        /* Intel IPP function call */
        statusAll = ipprFilterBorder_8u_C1V_L(pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, roiVolume, borderType, borderValue, pSpec, pBuffer);
    }
    else
    {
        IppSizeL numTiles = splitImage.x*splitImage.y*splitImage.z;
        ipprFilterBorder_8u_LT_Str ts;
        IppSizeL specSize, bufSize;
        IppStatus status;
        /* bufSize is the per-thread stride inside pBuffer. */
        status = ipprFilterBorderGetSize_L(kernelVolume, pLastSize, ipp8u, pFilterBorderInfo->kernelType, numChannels, &specSize, &bufSize);
        /* On failure bufSize is not reliable; do not start the workers with it. */
        if (status < 0) return status;
        fBrdThreadingStructureEncode_8u_LT((Ipp8u *)pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, borderType, (Ipp8u *)borderValue, (IpprFilterBorderSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void*)&ts, ipprFilterBorder_8u_C1V_LT_Fun);
    }
    return statusAll;
}
/*
 * Threaded 3D border filter for single-channel 16u volumes.
 *
 * Splits roiVolume into tiles with ownrGetFilterBorderCubeSize and filters them
 * through ippParallelFor_LT; with one thread or a single tile it calls
 * ipprFilterBorder_16u_C1V_L directly on the whole volume.
 *
 *   pSrc, srcPlaneStep, srcStep - source volume and its plane / row steps.
 *   pDst, dstPlaneStep, dstStep - destination volume and its plane / row steps.
 *   roiVolume                   - volume to process; every dimension must be positive.
 *   borderType, borderValue     - border handling, forwarded to the single-thread filter.
 *   pSpecLT                     - LT spec: a FilterBorderInfo header followed by the
 *                                 single-thread spec.
 *   pBuffer                     - work buffer, sliced per thread in the tiled path.
 *
 * Returns ippStsNullPtrErr / ippStsSizeErr for invalid arguments, a negative
 * status from ipprFilterBorderGetSize_L if the per-tile buffer size cannot be
 * determined, otherwise the status of the filtering call.
 */
IPPFUN(IppStatus, ipprFilterBorder_16u_C1V_LT, (const Ipp16u* pSrc, IppSizeL srcPlaneStep, IppSizeL srcStep, Ipp16u* pDst, IppSizeL dstPlaneStep, IppSizeL dstStep, IpprVolumeL roiVolume, IpprBorderType borderType, const Ipp16u borderValue[1], const IpprFilterBorderSpec_LT* pSpecLT, Ipp8u* pBuffer))
{
    IppStatus statusAll = ippStsNoErr;
    IppSizeL  numChannels = 1;
    int    numThreads = 1;

    FilterBorderInfo *pFilterBorderInfo;
    IpprFilterBorderSpec* pSpec;
    IpprVolumeL kernelVolume;
    IpprPointL splitImage = { 0, 0, 0 };
    IpprVolumeL pTileSize = { 0, 0, 0 }, pLastSize = { 0, 0, 0 };
    if (pSrc == 0 || pDst == 0)     return ippStsNullPtrErr;
    if (roiVolume.width <= 0 || roiVolume.height <= 0 || roiVolume.depth <= 0) return ippStsSizeErr;
    if (pSpecLT == 0 || pBuffer == 0)     return ippStsNullPtrErr;

    /* The LT spec is a FilterBorderInfo header followed by the single-thread spec. */
    pFilterBorderInfo = (FilterBorderInfo*)pSpecLT;
    pSpec = (IpprFilterBorderSpec*)((Ipp8u*)pSpecLT + sizeof(FilterBorderInfo));
    kernelVolume = pFilterBorderInfo->kernelVolume;

    ippGetNumThreads_LT(&numThreads);
    ownrGetFilterBorderCubeSize(roiVolume, kernelVolume, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* The trailing tile is never treated as smaller than a regular tile. */
    if (pLastSize.width  < pTileSize.width)   pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)  pLastSize.height = pTileSize.height;
    if (pLastSize.depth  < pTileSize.depth)   pLastSize.depth = pTileSize.depth;

    if ((numThreads == 1) || (splitImage.x*splitImage.y*splitImage.z == 1))
    {
        /* Intel IPP function call */
        statusAll = ipprFilterBorder_16u_C1V_L(pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, roiVolume, borderType, borderValue, pSpec, pBuffer);
    }
    else
    {
        IppSizeL numTiles = splitImage.x*splitImage.y*splitImage.z;
        ipprFilterBorder_16u_LT_Str ts;
        IppSizeL specSize, bufSize;
        IppStatus status;
        /* bufSize is the per-thread stride inside pBuffer. */
        status = ipprFilterBorderGetSize_L(kernelVolume, pLastSize, ipp16u, pFilterBorderInfo->kernelType, numChannels, &specSize, &bufSize);
        /* On failure bufSize is not reliable; do not start the workers with it. */
        if (status < 0) return status;
        fBrdThreadingStructureEncode_16u_LT((Ipp16u *)pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, borderType, (Ipp16u *)borderValue, (IpprFilterBorderSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void*)&ts, ipprFilterBorder_16u_C1V_LT_Fun);
    }
    return statusAll;
}
/*
 * Threaded 3D border filter for single-channel 16s volumes.
 *
 * Splits roiVolume into tiles with ownrGetFilterBorderCubeSize and filters them
 * through ippParallelFor_LT; with one thread or a single tile it calls
 * ipprFilterBorder_16s_C1V_L directly on the whole volume.
 *
 *   pSrc, srcPlaneStep, srcStep - source volume and its plane / row steps.
 *   pDst, dstPlaneStep, dstStep - destination volume and its plane / row steps.
 *   roiVolume                   - volume to process; every dimension must be positive.
 *   borderType, borderValue     - border handling, forwarded to the single-thread filter.
 *   pSpecLT                     - LT spec: a FilterBorderInfo header followed by the
 *                                 single-thread spec.
 *   pBuffer                     - work buffer, sliced per thread in the tiled path.
 *
 * Returns ippStsNullPtrErr / ippStsSizeErr for invalid arguments, a negative
 * status from ipprFilterBorderGetSize_L if the per-tile buffer size cannot be
 * determined, otherwise the status of the filtering call.
 */
IPPFUN(IppStatus, ipprFilterBorder_16s_C1V_LT, (const Ipp16s* pSrc, IppSizeL srcPlaneStep, IppSizeL srcStep, Ipp16s* pDst, IppSizeL dstPlaneStep, IppSizeL dstStep, IpprVolumeL roiVolume, IpprBorderType borderType, const Ipp16s borderValue[1], const IpprFilterBorderSpec_LT* pSpecLT, Ipp8u* pBuffer))
{
    IppStatus statusAll = ippStsNoErr;
    IppSizeL  numChannels = 1;
    int    numThreads = 1;

    FilterBorderInfo *pFilterBorderInfo;
    IpprFilterBorderSpec* pSpec;
    IpprVolumeL kernelVolume;
    IpprPointL splitImage = { 0, 0, 0 };
    IpprVolumeL pTileSize = { 0, 0, 0 }, pLastSize = { 0, 0, 0 };
    if (pSrc == 0 || pDst == 0)     return ippStsNullPtrErr;
    if (roiVolume.width <= 0 || roiVolume.height <= 0 || roiVolume.depth <= 0) return ippStsSizeErr;
    if (pSpecLT == 0 || pBuffer == 0)     return ippStsNullPtrErr;

    /* The LT spec is a FilterBorderInfo header followed by the single-thread spec. */
    pFilterBorderInfo = (FilterBorderInfo*)pSpecLT;
    pSpec = (IpprFilterBorderSpec*)((Ipp8u*)pSpecLT + sizeof(FilterBorderInfo));
    kernelVolume = pFilterBorderInfo->kernelVolume;

    ippGetNumThreads_LT(&numThreads);
    ownrGetFilterBorderCubeSize(roiVolume, kernelVolume, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* The trailing tile is never treated as smaller than a regular tile. */
    if (pLastSize.width  < pTileSize.width)   pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)  pLastSize.height = pTileSize.height;
    if (pLastSize.depth  < pTileSize.depth)   pLastSize.depth = pTileSize.depth;

    if ((numThreads == 1) || (splitImage.x*splitImage.y*splitImage.z == 1))
    {
        /* Intel IPP function call */
        statusAll = ipprFilterBorder_16s_C1V_L(pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, roiVolume, borderType, borderValue, pSpec, pBuffer);
    }
    else
    {
        IppSizeL numTiles = splitImage.x*splitImage.y*splitImage.z;
        ipprFilterBorder_16s_LT_Str ts;
        IppSizeL specSize, bufSize;
        IppStatus status;
        /* bufSize is the per-thread stride inside pBuffer. */
        status = ipprFilterBorderGetSize_L(kernelVolume, pLastSize, ipp16s, pFilterBorderInfo->kernelType, numChannels, &specSize, &bufSize);
        /* On failure bufSize is not reliable; do not start the workers with it. */
        if (status < 0) return status;
        fBrdThreadingStructureEncode_16s_LT((Ipp16s *)pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, borderType, (Ipp16s *)borderValue, (IpprFilterBorderSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void*)&ts, ipprFilterBorder_16s_C1V_LT_Fun);
    }
    return statusAll;
}
/*
 * Threaded 3D border filter for single-channel 32f volumes.
 *
 * Splits roiVolume into tiles with ownrGetFilterBorderCubeSize and filters them
 * through ippParallelFor_LT; with one thread or a single tile it calls
 * ipprFilterBorder_32f_C1V_L directly on the whole volume.
 *
 *   pSrc, srcPlaneStep, srcStep - source volume and its plane / row steps.
 *   pDst, dstPlaneStep, dstStep - destination volume and its plane / row steps.
 *   roiVolume                   - volume to process; every dimension must be positive.
 *   borderType, borderValue     - border handling, forwarded to the single-thread filter.
 *   pSpecLT                     - LT spec: a FilterBorderInfo header followed by the
 *                                 single-thread spec.
 *   pBuffer                     - work buffer, sliced per thread in the tiled path.
 *
 * Returns ippStsNullPtrErr / ippStsSizeErr for invalid arguments, a negative
 * status from ipprFilterBorderGetSize_L if the per-tile buffer size cannot be
 * determined, otherwise the status of the filtering call.
 */
IPPFUN(IppStatus, ipprFilterBorder_32f_C1V_LT, (const Ipp32f* pSrc, IppSizeL srcPlaneStep, IppSizeL srcStep, Ipp32f* pDst, IppSizeL dstPlaneStep, IppSizeL dstStep, IpprVolumeL roiVolume, IpprBorderType borderType, const Ipp32f borderValue[1], const IpprFilterBorderSpec_LT* pSpecLT, Ipp8u* pBuffer))
{
    IppStatus statusAll = ippStsNoErr;
    IppSizeL  numChannels = 1;
    int    numThreads = 1;

    FilterBorderInfo *pFilterBorderInfo;
    IpprFilterBorderSpec* pSpec;
    IpprVolumeL kernelVolume;
    IpprPointL splitImage = { 0, 0, 0 };
    IpprVolumeL pTileSize = { 0, 0, 0 }, pLastSize = { 0, 0, 0 };
    if (pSrc == 0 || pDst == 0)     return ippStsNullPtrErr;
    if (roiVolume.width <= 0 || roiVolume.height <= 0 || roiVolume.depth <= 0) return ippStsSizeErr;
    if (pSpecLT == 0 || pBuffer == 0)     return ippStsNullPtrErr;

    /* The LT spec is a FilterBorderInfo header followed by the single-thread spec. */
    pFilterBorderInfo = (FilterBorderInfo*)pSpecLT;
    pSpec = (IpprFilterBorderSpec*)((Ipp8u*)pSpecLT + sizeof(FilterBorderInfo));
    kernelVolume = pFilterBorderInfo->kernelVolume;

    ippGetNumThreads_LT(&numThreads);
    ownrGetFilterBorderCubeSize(roiVolume, kernelVolume, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* The trailing tile is never treated as smaller than a regular tile. */
    if (pLastSize.width  < pTileSize.width)   pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)  pLastSize.height = pTileSize.height;
    if (pLastSize.depth  < pTileSize.depth)   pLastSize.depth = pTileSize.depth;

    if ((numThreads == 1) || (splitImage.x*splitImage.y*splitImage.z == 1))
    {
        /* Intel IPP function call */
        statusAll = ipprFilterBorder_32f_C1V_L(pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, roiVolume, borderType, borderValue, pSpec, pBuffer);
    }
    else
    {
        IppSizeL numTiles = splitImage.x*splitImage.y*splitImage.z;
        ipprFilterBorder_32f_LT_Str ts;
        IppSizeL specSize, bufSize;
        IppStatus status;
        /* bufSize is the per-thread stride inside pBuffer. */
        status = ipprFilterBorderGetSize_L(kernelVolume, pLastSize, ipp32f, pFilterBorderInfo->kernelType, numChannels, &specSize, &bufSize);
        /* On failure bufSize is not reliable; do not start the workers with it. */
        if (status < 0) return status;
        fBrdThreadingStructureEncode_32f_LT((Ipp32f *)pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, borderType, (Ipp32f *)borderValue, (IpprFilterBorderSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void*)&ts, ipprFilterBorder_32f_C1V_LT_Fun);
    }
    return statusAll;
}
/*
 * Threaded 3D border filter for single-channel 64f volumes.
 *
 * Splits roiVolume into tiles with ownrGetFilterBorderCubeSize and filters them
 * through ippParallelFor_LT; with one thread or a single tile it calls
 * ipprFilterBorder_64f_C1V_L directly on the whole volume.
 *
 *   pSrc, srcPlaneStep, srcStep - source volume and its plane / row steps.
 *   pDst, dstPlaneStep, dstStep - destination volume and its plane / row steps.
 *   roiVolume                   - volume to process; every dimension must be positive.
 *   borderType, borderValue     - border handling, forwarded to the single-thread filter.
 *   pSpecLT                     - LT spec: a FilterBorderInfo header followed by the
 *                                 single-thread spec.
 *   pBuffer                     - work buffer, sliced per thread in the tiled path.
 *
 * Returns ippStsNullPtrErr / ippStsSizeErr for invalid arguments, a negative
 * status from ipprFilterBorderGetSize_L if the per-tile buffer size cannot be
 * determined, otherwise the status of the filtering call.
 */
IPPFUN(IppStatus, ipprFilterBorder_64f_C1V_LT, (const Ipp64f* pSrc, IppSizeL srcPlaneStep, IppSizeL srcStep, Ipp64f* pDst, IppSizeL dstPlaneStep, IppSizeL dstStep, IpprVolumeL roiVolume, IpprBorderType borderType, const Ipp64f borderValue[1], const IpprFilterBorderSpec_LT* pSpecLT, Ipp8u* pBuffer))
{
    IppStatus statusAll = ippStsNoErr;
    IppSizeL  numChannels = 1;
    int    numThreads = 1;

    FilterBorderInfo *pFilterBorderInfo;
    IpprFilterBorderSpec* pSpec;
    IpprVolumeL kernelVolume;
    IpprPointL splitImage = { 0, 0, 0 };
    IpprVolumeL pTileSize = { 0, 0, 0 }, pLastSize = { 0, 0, 0 };
    if (pSrc == 0 || pDst == 0)     return ippStsNullPtrErr;
    if (roiVolume.width <= 0 || roiVolume.height <= 0 || roiVolume.depth <= 0) return ippStsSizeErr;
    if (pSpecLT == 0 || pBuffer == 0)     return ippStsNullPtrErr;

    /* The LT spec is a FilterBorderInfo header followed by the single-thread spec. */
    pFilterBorderInfo = (FilterBorderInfo*)pSpecLT;
    pSpec = (IpprFilterBorderSpec*)((Ipp8u*)pSpecLT + sizeof(FilterBorderInfo));
    kernelVolume = pFilterBorderInfo->kernelVolume;

    ippGetNumThreads_LT(&numThreads);
    ownrGetFilterBorderCubeSize(roiVolume, kernelVolume, numThreads, &pTileSize, &pLastSize, &splitImage);
    /* The trailing tile is never treated as smaller than a regular tile. */
    if (pLastSize.width  < pTileSize.width)   pLastSize.width = pTileSize.width;
    if (pLastSize.height < pTileSize.height)  pLastSize.height = pTileSize.height;
    if (pLastSize.depth  < pTileSize.depth)   pLastSize.depth = pTileSize.depth;

    if ((numThreads == 1) || (splitImage.x*splitImage.y*splitImage.z == 1))
    {
        /* Intel IPP function call */
        statusAll = ipprFilterBorder_64f_C1V_L(pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, roiVolume, borderType, borderValue, pSpec, pBuffer);
    }
    else
    {
        IppSizeL numTiles = splitImage.x*splitImage.y*splitImage.z;
        ipprFilterBorder_64f_LT_Str ts;
        IppSizeL specSize, bufSize;
        IppStatus status;
        /* bufSize is the per-thread stride inside pBuffer. */
        status = ipprFilterBorderGetSize_L(kernelVolume, pLastSize, ipp64f, pFilterBorderInfo->kernelType, numChannels, &specSize, &bufSize);
        /* On failure bufSize is not reliable; do not start the workers with it. */
        if (status < 0) return status;
        fBrdThreadingStructureEncode_64f_LT((Ipp64f *)pSrc, srcPlaneStep, srcStep, pDst, dstPlaneStep, dstStep, borderType, (Ipp64f *)borderValue, (IpprFilterBorderSpec_LT *)pSpec, pBuffer, bufSize, numChannels, splitImage, pTileSize, pLastSize, &ts);
        statusAll = ippParallelFor_LT(numTiles, (void*)&ts, ipprFilterBorder_64f_C1V_LT_Fun);
    }
    return statusAll;
}


