Add part of OpenCV (dnn layer implementation sources)

This commit is contained in:
Tang1705
2020-01-27 20:20:56 +08:00
parent 0c4ac1d8bb
commit a71fa47620
6518 changed files with 3122580 additions and 0 deletions

View File

@@ -0,0 +1,424 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2016, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
/*
Implementation of Batch Normalization layer.
*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/batch_norm.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Batch Normalization layer. The constructor folds the stored mean/variance
// (and the optional scale/shift blobs) into per-channel weights_/bias_
// vectors, so inference reduces to an affine transform per channel:
//   y = weights_[c] * x + bias_[c]
class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
{
public:
    Mat weights_, bias_;          // folded per-channel scale and shift (1 x n, CV_32F)
    UMat umat_weight, umat_bias;  // lazily-created OpenCL copies of the above
    mutable int dims;             // rank of the last input seen by getMemoryShapes

    BatchNormLayerImpl(const LayerParams& params)
        : dims(-1)
    {
        setParamsFrom(params);
        CV_Assert(blobs.size() >= 2);  // blobs[0] = mean, blobs[1] = variance

        hasWeights = params.get<bool>("has_weight", false);
        hasBias = params.get<bool>("has_bias", false);
        useGlobalStats = params.get<bool>("use_global_stats", true);
        if(params.get<bool>("scale_bias", false))
            hasWeights = hasBias = true;
        epsilon = params.get<float>("eps", 1E-5);

        size_t n = blobs[0].total();
        CV_Assert(blobs[1].total() == n &&
                  blobs[0].isContinuous() && blobs[1].isContinuous() &&
                  blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);

        // Caffe keeps a scale factor for the running statistics in blobs[2];
        // the stored mean/variance must be divided by it.
        float varMeanScale = 1.f;
        if (!hasWeights && !hasBias && blobs.size() > 2 && useGlobalStats) {
            CV_Assert(blobs.size() == 3); CV_CheckTypeEQ(blobs[2].type(), CV_32FC1, "");
            varMeanScale = blobs[2].at<float>(0);
            if (varMeanScale != 0)
                varMeanScale = 1/varMeanScale;
        }

        const int biasBlobIndex = blobs.size() - 1;
        const int weightsBlobIndex = biasBlobIndex - hasBias;
        if( hasWeights )
        {
            CV_Assert((size_t)weightsBlobIndex < blobs.size());
            const Mat& w = blobs[weightsBlobIndex];
            CV_Assert(w.isContinuous() && w.type() == CV_32F && w.total() == (size_t)n);
        }
        if( hasBias )
        {
            CV_Assert((size_t)biasBlobIndex < blobs.size());
            // Bug fix: validate the bias blob; previously this re-checked
            // blobs[weightsBlobIndex], leaving the bias blob unvalidated.
            const Mat& b = blobs[biasBlobIndex];
            CV_Assert(b.isContinuous() && b.type() == CV_32F && b.total() == (size_t)n);
        }

        const float* meanData = blobs[0].ptr<float>();
        const float* stdData = blobs[1].ptr<float>();
        const float* weightsData = hasWeights ? blobs[weightsBlobIndex].ptr<float>() : 0;
        const float* biasData = hasBias ? blobs[biasBlobIndex].ptr<float>() : 0;

        weights_.create(1, (int)n, CV_32F);
        bias_.create(1, (int)n, CV_32F);
        float* dstWeightsData = weights_.ptr<float>();
        float* dstBiasData = bias_.ptr<float>();

        // Fold normalization into a single multiply-add per channel:
        //   w = gamma / sqrt(var + eps),  b = beta - w * mean
        for (size_t i = 0; i < n; ++i)
        {
            float w = (hasWeights ? weightsData[i] : 1.0f) / sqrt(stdData[i] * varMeanScale + epsilon);
            dstWeightsData[i] = w;
            dstBiasData[i] = (hasBias ? biasData[i] : 0.0f) - w * meanData[i] * varMeanScale;
        }
    }

    void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
    {
        scale = weights_;
        shift = bias_;
    }

    // Fold the scale/shift of a following layer into this one when its
    // parameters are per-channel or scalar; returns false otherwise.
    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
    {
        Mat w, b;
        top->getScaleShift(w, b);
        if (w.empty() && b.empty())
            return false;

        const int numChannels = weights_.total();
        const int numFusedWeights = w.total();
        const int numFusedBias = b.total();

        if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w.empty()) ||
            (numFusedBias != numChannels && numFusedBias != 1 && !b.empty()))
            return false;

        if (!w.empty())
        {
            w = w.reshape(1, 1);
            if (numFusedWeights == 1)
            {
                multiply(weights_, w.at<float>(0), weights_);
                multiply(bias_, w.at<float>(0), bias_);
            }
            else
            {
                multiply(weights_, w, weights_);
                multiply(bias_, w, bias_);
            }
        }
        if (!b.empty())
        {
            b = b.reshape(1, 1);
            if (numFusedBias == 1)
                add(bias_, b.at<float>(0), bias_);
            else
                add(bias_, b, bias_);  // b is already a 1xN row here
        }
        return true;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        dims = inputs[0].size();
        if (!useGlobalStats && inputs[0][0] != 1)
            CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1");
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        return true;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return (backendId == DNN_BACKEND_OPENCV) ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide()) ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine() && (preferableTarget == DNN_TARGET_CPU || dims == 4));
    }

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        bool use_half = (inputs_.depth() == CV_16S);
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        CV_Assert(blobs.size() >= 2);
        CV_Assert(inputs.size() == 1);

        // 2D fp16 inputs are not handled here; fall back to the generic path.
        if (use_half && inputs[0].dims == 2)
            return false;

        if (umat_weight.empty())
        {
            weights_.copyTo(umat_weight);
            bias_.copyTo(umat_bias);
        }

        UMat &inpBlob = inputs[0];
        int groups = inpBlob.size[0];
        int channels = inpBlob.size[1];
        int planeSize = 1;
        for (int i = 2; i < inpBlob.dims; i++) {  // int index: dims is int (was size_t)
            planeSize *= inpBlob.size[i];
        }

        String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            if (inpBlob.dims == 2)
            {
                UMat& src = inputs[ii];
                UMat& dst = outputs[ii];
                multiply(src, weights_, dst);
                add(dst, bias_, dst);
            }
            else
            {
                MatShape s = shape(groups * channels, planeSize);
                UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
                UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
                // Vector width 8/4/1 depending on plane-size divisibility.
                int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
                String buildopt = format("-DNUM=%d", number) + opts;
                String kname = format("batch_norm%d", number);
                if (number == 1)
                    buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
                else
                    buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
                ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
                if (kernel.empty())
                    return false;
                size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
                kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
                kernel.set(1, (int)s[0]);
                kernel.set(2, (int)s[1]);
                kernel.set(3, (int)channels);
                kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
                kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
                kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
                bool ret = kernel.run(2, global, NULL, false);
                if (!ret)
                    return false;
            }
        }
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        // fp16 tensors: convert to fp32, run, convert back.
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(blobs.size() >= 2);
        CV_Assert(inputs.size() == 1);

        Mat &inpBlob = inputs[0];
        int planeSize = 1;
        for (int i = 2; i < inpBlob.dims; i++) {  // int index: dims is int (was size_t)
            planeSize *= inpBlob.size[i];
        }

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            Mat &outBlob = outputs[ii];

            for(int num = 0; num < outBlob.size[0]; num++)
            {
                for (int n = 0; n < outBlob.size[1]; n++)
                {
                    float w = weights_.at<float>(n);
                    float b = bias_.at<float>(n);
                    Mat inpBlobPlane(1, planeSize, CV_32F, inpBlob.ptr<float>(num, n));
                    Mat outBlobPlane(1, planeSize, CV_32F, outBlob.ptr<float>(num, n));
                    // convertTo applies the affine transform alpha*x + beta.
                    inpBlobPlane.convertTo(outBlobPlane, CV_32F, w, b);
                }
            }
        }
    }

    void forwardSlice(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
    {
        for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
        {
            int i = 0;
            float w = weights_.at<float>(cn);
            float b = bias_.at<float>(cn);
#if CV_SIMD128
            // Process 16 floats per iteration using 4 SIMD registers.
            v_float32x4 wV = v_setall_f32(w), bV = v_setall_f32(b);
            for( ; i <= len - 16; i += 16 )
            {
                v_float32x4 x0 = v_load(srcptr + i);
                v_float32x4 x1 = v_load(srcptr + i + 4);
                v_float32x4 x2 = v_load(srcptr + i + 8);
                v_float32x4 x3 = v_load(srcptr + i + 12);
                x0 = v_muladd(x0, wV, bV);
                x1 = v_muladd(x1, wV, bV);
                x2 = v_muladd(x2, wV, bV);
                x3 = v_muladd(x3, wV, bV);
                v_store(dstptr + i, x0);
                v_store(dstptr + i + 4, x1);
                v_store(dstptr + i + 8, x2);
                v_store(dstptr + i + 12, x3);
            }
#endif
            // Scalar tail (and the whole loop when SIMD is unavailable).
            for( ; i < len; i++ )
                dstptr[i] = w * srcptr[i] + b;
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::BatchNormOp>(preferableTarget, std::move(context->stream), weights_, bias_);
    }
#endif

    virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
    {
        switch (node->backendId)
        {
            case DNN_BACKEND_HALIDE:
            {
#ifdef HAVE_HALIDE
                auto base = node.dynamicCast<HalideBackendNode>();
                Halide::Func& input = base->funcs.back();
                Halide::Var x("x"), y("y"), c("c"), n("n");
                Halide::Func top = attachHalide(input(x, y, c, n));
                return Ptr<BackendNode>(new HalideBackendNode(base, top));
#endif  // HAVE_HALIDE
                break;
            }
        }
        return Ptr<BackendNode>();
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> input = halideBuffer(inputs[0]);
        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = attachHalide(input(x, y, c, n));
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_HALIDE
    // attachHalide can work both with Halide::Buffer and Halide::Func. In the
    // second case it will be a fusion.
    Halide::Func attachHalide(const Halide::Expr& input)
    {
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Var x("x"), y("y"), c("c"), n("n");

        const int numChannels = weights_.total();
        auto weights = wrapToHalideBuffer(weights_, {numChannels});
        auto bias = wrapToHalideBuffer(bias_, {numChannels});
        top(x, y, c, n) = input * weights(c) + bias(c);
        return top;
    }
#endif  // HAVE_HALIDE

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
        const size_t numChannels = weights_.total();
        addConstantData("weights", wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C), ieLayer);
        addConstantData("biases", wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C), ieLayer);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        // Broadcastable constant shape: 1 x C x 1 x ... x 1
        std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
        shape[1] = weights_.total();
        auto weight = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), weights_.data);
        auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), bias_.data);
        auto scale_node = std::make_shared<ngraph::op::v1::Multiply>(ieInpNode, weight, ngraph::op::AutoBroadcastType::NUMPY);
        auto scale_shift = std::make_shared<ngraph::op::v1::Add>(scale_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
        return Ptr<BackendNode>(new InfEngineNgraphNode(scale_shift));
    }
#endif  // HAVE_DNN_NGRAPH

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning

        // Roughly one multiply, one add, one store per element.
        int64 flops = 0;
        for(size_t i = 0; i < inputs.size(); i++)
        {
            flops += 3*total(inputs[i]);
        }
        return flops;
    }

private:
    bool useGlobalStats;
};
// Factory: builds the concrete batch-norm implementation from layer params.
Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
{
    Ptr<BatchNormLayer> layer(new BatchNormLayerImpl(params));
    return layer;
}
} // namespace dnn
} // namespace cv

View File

@@ -0,0 +1,190 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/reshape.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Identity ("blank") layer: copies its inputs to outputs unchanged.
// Used e.g. for Dropout at inference time.
class BlankLayerImpl CV_FINAL : public BlankLayer
{
public:
    BlankLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
    }

    // Runs on CPU/OpenCL, CUDA, and both Inference Engine backends.
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine());
    }

    // Output shapes mirror input shapes (delegated to the base class).
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        return true;
    }

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        for (int i = 0, n = outputs.size(); i < n; ++i)
        {
            // Compare device handles to skip the copy when input and output
            // share the same buffer (in-place execution).
            void *src_handle = inputs[i].handle(ACCESS_READ);
            void *dst_handle = outputs[i].handle(ACCESS_WRITE);
            if (src_handle != dst_handle)
                inputs[i].copyTo(outputs[i]);
        }
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        // Copy only when input/output do not already alias the same data.
        for (int i = 0, n = outputs.size(); i < n; ++i)
            if (outputs[i].data != inputs[i].data)
                inputs[i].copyTo(outputs[i]);
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        // Identity is expressed as a no-op reshape on the CUDA backend.
        return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
    }
#endif

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
        std::vector<size_t> dims = input->getDims();
        CV_Assert(!dims.empty());

        InferenceEngine::Builder::Layer ieLayer(name);
        ieLayer.setName(name);
        // Myriad uses a Copy layer; other targets emulate identity with a
        // single-output Split.
        if (preferableTarget == DNN_TARGET_MYRIAD)
        {
            ieLayer.setType("Copy");
        }
        else
        {
            ieLayer.setType("Split");
            ieLayer.getParameters()["axis"] = dims.size() - 1;
            ieLayer.getParameters()["out_sizes"] = dims[0];
        }
        ieLayer.setInputPorts({InferenceEngine::Port(dims)});
        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        // Identity via a single-input Concat node.
        ngraph::NodeVector inp{ieInpNode};
        auto blank = std::make_shared<ngraph::op::Concat>(inp, 0);
        return Ptr<BackendNode>(new InfEngineNgraphNode(blank));
    }
#endif  // HAVE_DNN_NGRAPH
};
// Factory. Caffe's Faster-RCNN fork
// (https://github.com/rbgirshick/caffe-fast-rcnn/tree/faster-rcnn) marks its
// Dropout layers with scale_train=false; emulate those with a Power layer
// that rescales activations at test time. Everything else is an identity.
Ptr<Layer> BlankLayer::create(const LayerParams& params)
{
    const bool scaleTrain = params.get<bool>("scale_train", true);
    if (scaleTrain)
        return Ptr<BlankLayer>(new BlankLayerImpl(params));

    float scale = 1 - params.get<float>("dropout_ratio", 0.5f);
    CV_Assert(scale > 0);

    LayerParams powerParams;
    powerParams.name = params.name;
    powerParams.type = "Power";
    powerParams.set("scale", scale);
    return PowerLayer::create(powerParams);
}
}
}

View File

@@ -0,0 +1,373 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/concat.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Concatenation layer: joins inputs along a configurable axis. With
// padding=true, non-concat dimensions may differ and smaller inputs are
// centered inside a zero-filled output.
class ConcatLayerImpl CV_FINAL : public ConcatLayer
{
public:
    ConcatLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        axis = params.get<int>("axis", 1);
        padding = params.get<bool>("padding", false);
    }

    // Output shape: max (padding) or equal (no padding) over non-concat axes,
    // sum of input sizes along the concat axis.
    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                 const int requiredOutputs,
                                 std::vector<MatShape> &outputs,
                                 std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() > 0);
        outputs.resize(1, inputs[0]);
        int cAxis = clamp(axis, inputs[0]);  // resolve negative axis

        int axisSum = 0;
        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape curShape = inputs[i];

            if (padding)
            {
                // Output is the elementwise maximum of all input shapes.
                for (int curAxis = 0; curAxis < outputs[0].size(); curAxis++)
                {
                    outputs[0][curAxis] = std::max(outputs[0][curAxis], curShape[curAxis]);
                }
            }
            else
            {
                // All non-concat dimensions must match exactly.
                CV_Assert(curShape.size() == outputs[0].size());
                for (int curAxis = 0; curAxis < outputs[0].size(); curAxis++)
                {
                    if (curAxis != cAxis && outputs[0][curAxis] != curShape[curAxis])
                        CV_Error(Error::StsBadSize, "Inconsistent shape for ConcatLayer");
                }
            }

            axisSum += curShape[cAxis];
        }

        outputs[0][cAxis] = axisSum;
        return false;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding) ||  // By channels
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine() && !padding) ||
               (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding);
    }

    // Parallel CPU path for the common case: 4D tensors concatenated along
    // the channel axis. Builds a flat table of per-(batch,channel) source
    // plane pointers, then copies planes into the output in stripes.
    class ChannelConcatInvoker : public ParallelLoopBody
    {
    public:
        std::vector<Mat>* inputs;           // non-owning; set by run()
        Mat* output;                        // non-owning; set by run()
        int nstripes;
        std::vector<const float*> chptrs;   // [batch x channel] -> source plane

        static void run(std::vector<Mat>& inputs, Mat& output, int nstripes)
        {
            ChannelConcatInvoker cc;
            cc.inputs = &inputs;
            cc.output = &output;
            cc.nstripes = nstripes;

            size_t i, ninputs = inputs.size();
            int nchannels = 0, batchsz = output.size[0];
            for( i = 0; i < ninputs; i++ )
            {
                Mat& inp = inputs[i];
                // Inputs must be continuous 4D and match the output in every
                // dimension except channels.
                CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
                           inp.dims == 4 && inp.size[0] == output.size[0] &&
                           inp.size[2] == output.size[2] &&
                           inp.size[3] == output.size[3] );
                nchannels += inp.size[1];
            }
            CV_Assert( nchannels == output.size[1] );
            CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );

            cc.chptrs.resize(nchannels*batchsz);

            // Fill the pointer table; ofs is the running channel offset of
            // the current input inside the concatenated output.
            int ofs = 0;
            for( i = 0; i < ninputs; i++)
            {
                Mat& inp = inputs[i];
                for( int j = 0; j < batchsz; j++ )
                    for( int k = 0; k < inp.size[1]; k++ )
                    {
                        const float* ptr = inp.ptr<float>(j, k);
                        cc.chptrs[ofs + j*nchannels + k] = ptr;
                    }
                ofs += inp.size[1];
            }

            parallel_for_(Range(0, nstripes), cc, nstripes);
        }

        ChannelConcatInvoker() : inputs(0), output(0), nstripes(0) {}

        void operator()(const Range& r) const CV_OVERRIDE
        {
            // The output is treated as a flat sequence of planes; each stripe
            // copies a contiguous slice of that sequence.
            size_t planeSize = (size_t)output->size[2]*output->size[3];
            size_t nch = chptrs.size();
            size_t total = nch*planeSize;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(total, r.end*stripeSize);
            const float** ptrs = (const float**)&chptrs[0];
            float* outptr = output->ptr<float>();
            size_t blockSize0 = 1 << 16;  // copy in <=64K-element chunks

            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
            {
                size_t ch = ofs0/planeSize;          // which source plane
                size_t ofs = ofs0 - ch*planeSize;    // offset within the plane
                size_t blockSize = std::min(blockSize0, planeSize - ofs);
                memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
                ofs0 += blockSize;
            }
        }
    };

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        bool use_half = (inps.depth() == CV_16S);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        int cAxis = clamp(axis, inputs[0].dims);
        if (padding)
            return false;  // padded concat is not implemented in OpenCL

        int bottom_concat_axis;
        int concat_size = total(shape(inputs[0]), cAxis + 1);   // elems after the axis
        int top_concat_axis = outputs[0].size[cAxis];
        int num_concats = total(shape(inputs[0]), 0, cAxis);    // elems before the axis
        int offset_concat_axis = 0;                             // running output offset
        UMat& outMat = outputs[0];
        String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
        String kname = format("concat_%s", use_half ? "half" : "float");

        // One kernel launch per input, each writing its slice of the output.
        for (size_t i = 0; i < inputs.size(); i++)
        {
            ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
            if (kernel.empty())
                return false;

            UMat& inpMat = inputs[i];
            bottom_concat_axis = inputs[i].size[cAxis];
            size_t nthreads = inputs[i].total();

            kernel.set(0, (int)nthreads);
            kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat));
            kernel.set(2, (int)num_concats);
            kernel.set(3, (int)concat_size);
            kernel.set(4, (int)top_concat_axis);
            kernel.set(5, (int)bottom_concat_axis);
            kernel.set(6, (int)offset_concat_axis);
            kernel.set(7, ocl::KernelArg::PtrWriteOnly(outMat));

            if (!kernel.run(1, &nthreads, NULL, false))
                return false;

            offset_concat_axis += bottom_concat_axis;
        }

        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        int cAxis = clamp(axis, inputs[0].dims);
        Mat& outMat = outputs[0];

        if (padding)
            outMat.setTo(0);  // zero-fill so uncovered regions stay zero

        if( cAxis == 1 && outMat.dims == 4 && !padding)
        {
            // Fast path: channel-wise concat of 4D tensors, parallelized.
            int nstripes = getNumThreads();
            ChannelConcatInvoker::run(inputs, outMat, nstripes);
        }
        else
        {
            // Generic path: copy each input into its ROI of the output.
            // With padding, non-concat dimensions are centered.
            std::vector<Range> ranges(outputs[0].dims, Range::all());

            ranges[cAxis].start = 0;
            for (size_t i = 0; i < inputs.size(); i++)
            {
                ranges[cAxis].end = ranges[cAxis].start + inputs[i].size[cAxis];
                for (int j = 0; j < outMat.dims; ++j)
                {
                    if (j == cAxis) continue;
                    ranges[j].start = (outMat.size[j] - inputs[i].size[j]) / 2;
                    ranges[j].end = ranges[j].start + inputs[i].size[j];
                }
                inputs[i].copyTo(outMat(&ranges[0]));
                ranges[cAxis].start = ranges[cAxis].end;
            }
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
        auto concat_axis = clamp(axis, input_wrapper->getRank());
        return make_cuda_node<cuda4dnn::ConcatOp>(preferableTarget, std::move(context->stream), concat_axis, padding);
    }
#endif

    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_VULKAN
        vkcom::Tensor in = VkComTensor(input[0]);
        int cAxis = clamp(axis, in.dimNum());
        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpConcat(cAxis));
        return Ptr<BackendNode>(new VkComBackendNode(input, op));
#endif  // HAVE_VULKAN
        return Ptr<BackendNode>();
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        // Build a nested select() over the channel index: each input owns the
        // channel range [previous offset, offset).
        int offset = inputBuffers[0].channels();
        Halide::Expr topExpr = select(c < offset,
                                      inputBuffers[0](x, y, c, n),
                                      inputBuffers[1](x, y, c - offset, n));
        for (int i = 2; i < input.size(); ++i)
        {
            offset += inputBuffers[i - 1].channels();
            topExpr = select(c < offset, topExpr,
                             inputBuffers[i](x, y, c - offset, n));
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

        InferenceEngine::Builder::ConcatLayer ieLayer(name);
        ieLayer.setAxis(clamp(axis, input->getDims().size()));
        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert(inputs.size() == nodes.size());
        ngraph::NodeVector inp_nodes;
        for (auto& node : nodes) {
            inp_nodes.push_back(node.dynamicCast<InfEngineNgraphNode>()->node);
        }

        InferenceEngine::DataPtr data = ngraphDataNode(inputs[0]);
        auto concat = std::make_shared<ngraph::op::Concat>(inp_nodes, clamp(axis, data->getDims().size()));
        return Ptr<BackendNode>(new InfEngineNgraphNode(concat));
    }
#endif  // HAVE_DNN_NGRAPH
};
// Factory: builds the concrete concat implementation from layer params.
Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)
{
    Ptr<ConcatLayer> layer(new ConcatLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,106 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "../op_inf_engine.hpp"
#include "../op_cuda.hpp"
#include "layers_common.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/const.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
// Constant layer: takes no inputs and always emits its single stored blob.
class ConstLayerImpl CV_FINAL : public ConstLayer
{
public:
    ConstLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        CV_Assert(blobs.size() == 1);  // exactly one constant tensor
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
               backendId == DNN_BACKEND_CUDA;
    }

    // No inputs allowed; the single output has the stored blob's shape.
    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                 const int requiredOutputs,
                                 std::vector<MatShape> &outputs,
                                 std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.empty());
        outputs.assign(1, shape(blobs[0]));
        return false;
    }

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> outputs;
        outs.getUMatVector(outputs);
        // fp16 targets get a converted copy; otherwise copy as-is.
        if (outs.depth() == CV_16S)
            convertFp16(blobs[0], outputs[0]);
        else
            blobs[0].copyTo(outputs[0]);
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        std::vector<Mat> outputs;
        outputs_arr.getMatVector(outputs);
        blobs[0].copyTo(outputs[0]);
    }

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::ConstLayer ieLayer(name);
        ieLayer.setData(wrapToInfEngineBlob(blobs[0]));
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        CV_Assert(blobs.size() == 1);
        return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blobs[0]);
    }
#endif
};
Ptr<Layer> ConstLayer::create(const LayerParams& params)
{
    // Factory entry point: build the implementation and hand it back as a Layer.
    Ptr<Layer> layer(new ConstLayerImpl(params));
    return layer;
}
}} // namespace cv::dnn

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,146 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/crop_and_resize.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
// Crops box-shaped regions out of a feature map and bilinearly resizes each
// crop to a fixed (outHeight x outWidth) grid, one output slice per box.
class CropAndResizeLayerImpl CV_FINAL : public CropAndResizeLayer
{
public:
    CropAndResizeLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        CV_Assert_N(params.has("width"), params.has("height"));
        // NOTE(review): values are read as float and truncated into int members;
        // presumably exporters store them as floats -- confirm before switching
        // to get<int>.
        outWidth = params.get<float>("width");
        outHeight = params.get<float>("height");
    }
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        // inputs[0]: NCHW feature map (batch must be 1), inputs[1]: boxes.
        CV_Assert_N(inputs.size() == 2, inputs[0].size() == 4);
        if (inputs[0][0] != 1)
            CV_Error(Error::StsNotImplemented, "");
        outputs.resize(1, MatShape(4));
        outputs[0][0] = inputs[1][2]; // Number of bounding boxes.
        outputs[0][1] = inputs[0][1]; // Number of channels.
        outputs[0][2] = outHeight;
        outputs[0][3] = outWidth;
        return false;
    }
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
    }
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
        // fp16 tensors are handled by the generic fallback path.
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        Mat& inp = inputs[0];
        Mat& out = outputs[0];
        // Boxes arrive as rows of 7 floats; columns 3..6 are left/top/right/bottom
        // (the scaling by (inpHeight-1)/(inpWidth-1) below implies coordinates
        // normalized to [0, 1] -- confirm against the producer layer).
        Mat boxes = inputs[1].reshape(1, inputs[1].total() / 7);
        const int numChannels = inp.size[1];
        const int inpHeight = inp.size[2];
        const int inpWidth = inp.size[3];
        const int inpSpatialSize = inpHeight * inpWidth;
        const int outSpatialSize = outHeight * outWidth;
        CV_Assert_N(inp.isContinuous(), out.isContinuous());
        for (int b = 0; b < boxes.rows; ++b)
        {
            float* outDataBox = out.ptr<float>(b);
            float left = boxes.at<float>(b, 3);
            float top = boxes.at<float>(b, 4);
            float right = boxes.at<float>(b, 5);
            float bottom = boxes.at<float>(b, 6);
            float boxWidth = right - left;
            float boxHeight = bottom - top;
            // Step (in input pixels) between adjacent output samples.
            float heightScale = boxHeight * static_cast<float>(inpHeight - 1) / (outHeight - 1);
            float widthScale = boxWidth * static_cast<float>(inpWidth - 1) / (outWidth - 1);
            for (int y = 0; y < outHeight; ++y)
            {
                // Source row (fractional); y0/y0+1 bracket it for interpolation.
                float input_y = top * (inpHeight - 1) + y * heightScale;
                int y0 = static_cast<int>(input_y);
                const float* inpData_row0 = inp.ptr<float>(0, 0, y0);
                // Clamp to the last row when y0 is at the bottom edge.
                const float* inpData_row1 = (y0 + 1 < inpHeight) ? (inpData_row0 + inpWidth) : inpData_row0;
                for (int x = 0; x < outWidth; ++x)
                {
                    float input_x = left * (inpWidth - 1) + x * widthScale;
                    int x0 = static_cast<int>(input_x);
                    int x1 = std::min(x0 + 1, inpWidth - 1); // clamp right neighbour
                    float* outData = outDataBox + y * outWidth + x;
                    const float* inpData_row0_c = inpData_row0;
                    const float* inpData_row1_c = inpData_row1;
                    // Bilinear blend of the 4 neighbours, walking channel planes
                    // via the per-channel strides (inpSpatialSize/outSpatialSize).
                    for (int c = 0; c < numChannels; ++c)
                    {
                        *outData = inpData_row0_c[x0] +
                            (input_y - y0) * (inpData_row1_c[x0] - inpData_row0_c[x0]) +
                            (input_x - x0) * (inpData_row0_c[x1] - inpData_row0_c[x0] +
                            (input_y - y0) * (inpData_row1_c[x1] - inpData_row0_c[x1] - inpData_row1_c[x0] + inpData_row0_c[x0]));
                        inpData_row0_c += inpSpatialSize;
                        inpData_row1_c += inpSpatialSize;
                        outData += outSpatialSize;
                    }
                }
            }
        }
        // Fewer boxes than output slots: fill the unused tail with the value a
        // zero-area box at the origin would produce.
        if (boxes.rows < out.size[0])
        {
            // left = top = right = bottom = 0
            std::vector<cv::Range> dstRanges(4, Range::all());
            dstRanges[0] = Range(boxes.rows, out.size[0]);
            out(dstRanges).setTo(inp.ptr<float>(0, 0, 0)[0]);
        }
    }
#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::CropAndResizeOp>(preferableTarget, std::move(context->stream));
    }
#endif
private:
    int outWidth, outHeight;  // fixed size of every resized crop
};
Ptr<Layer> CropAndResizeLayer::create(const LayerParams& params)
{
    // Construct the implementation and transfer ownership to the caller.
    Ptr<CropAndResizeLayer> layer(new CropAndResizeLayerImpl(params));
    return layer;
}
} // namespace dnn
} // namespace cv

View File

@@ -0,0 +1,998 @@
/*M ///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
#include <float.h>
#include <string>
#include "../nms.inl.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#include <ngraph/op/experimental/layers/detection_output.hpp>
#endif
namespace cv
{
namespace dnn
{
namespace util
{
// Axis-aligned box given by its corner coordinates, with an optional cached
// area ("size") that is valid only while has_size() reports true.
class NormalizedBBox
{
public:
    float xmin, ymin, xmax, ymax;

    NormalizedBBox() : xmin(0.f), ymin(0.f), xmax(0.f), ymax(0.f)
    {
        clear_size();
    }

    // Cached area; callers must check has_size() first.
    float size() const { return size_; }
    bool has_size() const { return has_size_; }

    void set_size(float value)
    {
        has_size_ = true;
        size_ = value;
    }

    void clear_size()
    {
        has_size_ = false;
        size_ = 0.f;
    }

private:
    bool has_size_;
    float size_;
};
// Strict-weak ordering that ranks (score, payload) pairs by score,
// highest score first; the payload never participates in the comparison.
template <typename T>
static inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
                                        const std::pair<float, T>& pair2)
{
    return pair2.first < pair1.first;
}
static inline float caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);
static inline float caffe_norm_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);
} // namespace
class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
{
public:
unsigned _numClasses;           // number of classification labels in the confidence input
bool _shareLocation;            // true: one set of box predictions is shared by all classes
int _numLocClasses;             // 1 when _shareLocation, otherwise _numClasses (see ctor)
int _backgroundLabelId;         // label skipped when collecting detections
cv::String _codeType;           // bbox encoding: "CORNER" or "CENTER_SIZE" (see getCodeType)
bool _varianceEncodedInTarget;  // true: predictions already include the prior variance
int _keepTopK;                  // max detections kept per image after NMS; -1 disables the cap
float _confidenceThreshold;     // minimum score forwarded into NMS
float _nmsThreshold;            // overlap threshold passed to NMSFast_
int _topK;                      // per-class candidate cap forwarded to NMSFast_ (-1 presumably unlimited; see nms.inl.hpp)
// Whenever predicted bounding boxes are represented in YXHW instead of XYWH layout.
bool _locPredTransposed;
// It's true whenever predicted bounding boxes and proposals are normalized to [0, 1].
bool _bboxesNormalized;
bool _clip;                     // clip decoded boxes to the image/clip bounds
bool _groupByClasses;           // true: keep output rows grouped per class instead of globally score-sorted
enum { _numAxes = 4 };
static const std::string _layerName;  // used only to compose error messages
// Maps a class label (-1 when locations are shared) to its decoded boxes.
typedef std::map<int, std::vector<util::NormalizedBBox> > LabelBBox;
// Copy parameter `parameterName` into `result`; returns false when the
// parameter is absent from `params`.
bool getParameterDict(const LayerParams &params,
                      const std::string &parameterName,
                      DictValue& result)
{
    if (!params.has(parameterName))
        return false;
    result = params.get(parameterName);
    return true;
}
// Typed parameter lookup. A missing required parameter raises StsBadArg;
// a missing optional one yields `defaultValue`.
template<typename T>
T getParameter(const LayerParams &params,
               const std::string &parameterName,
               const size_t &idx=0,
               const bool required=true,
               const T& defaultValue=T())
{
    DictValue dictValue;
    const bool found = getParameterDict(params, parameterName, dictValue);
    if (!found)
    {
        if (!required)
            return defaultValue;
        const std::string message = _layerName + " layer parameter does not contain " +
                                    parameterName + " parameter.";
        CV_Error(Error::StsBadArg, message);
    }
    return dictValue.get<T>(idx);
}
void getCodeType(const LayerParams &params)
{
String codeTypeString = toLowerCase(params.get<String>("code_type"));
if (codeTypeString == "center_size")
_codeType = "CENTER_SIZE";
else
_codeType = "CORNER";
}
// Reads the layer configuration. num_classes, share_location, keep_top_k and
// nms_threshold are required; every other parameter falls back to a default.
DetectionOutputLayerImpl(const LayerParams &params)
{
    _numClasses = getParameter<unsigned>(params, "num_classes");
    _shareLocation = getParameter<bool>(params, "share_location");
    _numLocClasses = _shareLocation ? 1 : _numClasses;
    _backgroundLabelId = getParameter<int>(params, "background_label_id");
    _varianceEncodedInTarget = getParameter<bool>(params, "variance_encoded_in_target", 0, false, false);
    _keepTopK = getParameter<int>(params, "keep_top_k");
    _confidenceThreshold = getParameter<float>(params, "confidence_threshold", 0, false, 0);
    _topK = getParameter<int>(params, "top_k", 0, false, -1);
    _locPredTransposed = getParameter<bool>(params, "loc_pred_transposed", 0, false, false);
    _bboxesNormalized = getParameter<bool>(params, "normalized_bbox", 0, false, true);
    _clip = getParameter<bool>(params, "clip", 0, false, false);
    _groupByClasses = getParameter<bool>(params, "group_by_classes", 0, false, true);
    getCodeType(params);
    // Parameters used in nms.
    _nmsThreshold = getParameter<float>(params, "nms_threshold");
    CV_Assert(_nmsThreshold > 0.);
    setParamsFrom(params);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
    if (backendId == DNN_BACKEND_OPENCV)
        return true;
    // The Inference Engine paths only handle non-transposed, normalized boxes.
    const bool isInfEngine =
        backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
        backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
    return isInfEngine && !_locPredTransposed && _bboxesNormalized;
}
// Validates the three inputs (0: location predictions, 1: confidences,
// 2: prior boxes) and reserves a worst-case output of keep_top_k * batch rows.
bool getMemoryShapes(const std::vector<MatShape> &inputs,
                     const int requiredOutputs,
                     std::vector<MatShape> &outputs,
                     std::vector<MatShape> &internals) const CV_OVERRIDE
{
    const int num = inputs[0][0];
    CV_Assert(inputs.size() >= 3);
    CV_Assert(num == inputs[1][0]);
    // Each prior contributes 4 coordinates along the third prior axis.
    int numPriors = inputs[2][2] / 4;
    CV_Assert((numPriors * _numLocClasses * 4) == total(inputs[0], 1));
    CV_Assert(int(numPriors * _numClasses) == total(inputs[1], 1));
    // Priors carry one extra channel of variances unless variance is encoded
    // in the targets themselves.
    CV_Assert(inputs[2][1] == 1 + (int)(!_varianceEncodedInTarget));
    // num() and channels() are 1.
    // Since the number of bboxes to be kept is unknown before nms, we manually
    // set it to maximal number of detections, [keep_top_k] parameter multiplied by batch size.
    // Each row is a 7 dimension std::vector, which stores
    // [image_id, label, confidence, xmin, ymin, xmax, ymax]
    outputs.resize(1, shape(1, 1, _keepTopK * num, 7));
    return false;
}
#ifdef HAVE_OPENCL
// Decode all bboxes in a batch
// OpenCL bbox decode for the whole batch: runs a decode kernel into a
// temporary fp32 UMat, then reads the result back into per-image LabelBBox
// maps. Returns false (caller falls back to CPU) for unknown code types or
// kernel failures.
bool ocl_DecodeBBoxesAll(UMat& loc_mat, UMat& prior_mat,
                         const int num, const int numPriors, const bool share_location,
                         const int num_loc_classes, const int background_label_id,
                         const cv::String& code_type, const bool variance_encoded_in_target,
                         const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
{
    UMat outmat = UMat(loc_mat.dims, loc_mat.size, CV_32F);
    size_t nthreads = loc_mat.total();
    String kernel_name;
    if (code_type == "CORNER")
        kernel_name = "DecodeBBoxesCORNER";
    else if (code_type == "CENTER_SIZE")
        kernel_name = "DecodeBBoxesCENTER_SIZE";
    else
        return false;
    // NOTE(review): the kernel is launched once per batch item with identical
    // arguments even though nthreads already spans the whole tensor -- the
    // extra launches look redundant; confirm against the kernel source.
    for (int i = 0; i < num; ++i)
    {
        ocl::Kernel kernel(kernel_name.c_str(), ocl::dnn::detection_output_oclsrc);
        kernel.set(0, (int)nthreads);
        kernel.set(1, ocl::KernelArg::PtrReadOnly(loc_mat));
        kernel.set(2, ocl::KernelArg::PtrReadOnly(prior_mat));
        kernel.set(3, (int)variance_encoded_in_target);
        kernel.set(4, (int)numPriors);
        kernel.set(5, (int)share_location);
        kernel.set(6, (int)num_loc_classes);
        kernel.set(7, (int)background_label_id);
        kernel.set(8, (int)clip);
        kernel.set(9, (int)_locPredTransposed);
        kernel.set(10, ocl::KernelArg::PtrWriteOnly(outmat));
        if (!kernel.run(1, &nthreads, NULL, false))
            return false;
    }
    all_decode_bboxes.clear();
    all_decode_bboxes.resize(num);
    // Copy the decoded coordinates from the device buffer into host-side maps.
    {
        Mat mat = outmat.getMat(ACCESS_READ);
        const float* decode_data = mat.ptr<float>();
        for (int i = 0; i < num; ++i)
        {
            LabelBBox& decode_bboxes = all_decode_bboxes[i];
            for (int c = 0; c < num_loc_classes; ++c)
            {
                // -1 keys the single shared box set when share_location is on.
                int label = share_location ? -1 : c;
                decode_bboxes[label].resize(numPriors);
                for (int p = 0; p < numPriors; ++p)
                {
                    int startIdx = p * num_loc_classes * 4;
                    util::NormalizedBBox& bbox = decode_bboxes[label][p];
                    bbox.xmin = decode_data[startIdx + c * 4];
                    bbox.ymin = decode_data[startIdx + c * 4 + 1];
                    bbox.xmax = decode_data[startIdx + c * 4 + 2];
                    bbox.ymax = decode_data[startIdx + c * 4 + 3];
                }
            }
        }
    }
    return true;
}
// Reshapes the confidence input and transposes each image's slice from
// prediction-major to class-major, appending one (numClasses x
// numPredsPerClass) Mat per image to confPreds.
void ocl_GetConfidenceScores(const UMat& inp1, const int num,
                             const int numPredsPerClass, const int numClasses,
                             std::vector<Mat>& confPreds)
{
    int shape[] = { numClasses, numPredsPerClass };
    for (int i = 0; i < num; i++)
        confPreds.push_back(Mat(2, shape, CV_32F));
    // View the whole batch as (num*numPredsPerClass) x numClasses.
    shape[0] = num * numPredsPerClass;
    shape[1] = inp1.total() / shape[0];
    UMat umat = inp1.reshape(1, 2, &shape[0]);
    for (int i = 0; i < num; ++i)
    {
        // Rows belonging to image i, all class columns.
        Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };
        transpose(umat(ranges), confPreds[i]);
    }
}
// OpenCL forward path: decode boxes on the device, then run NMS and output
// assembly on the host. Returns false to request the CPU fallback.
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
    std::vector<UMat> inputs;
    std::vector<UMat> outputs;
    outs.getUMatVector(outputs);
    bool use_half = (inps.depth() == CV_16S);
    if (use_half)
    {
        // Promote fp16 inputs to fp32 working copies.
        std::vector<UMat> orig_inputs;
        inps.getUMatVector(orig_inputs);
        inputs.resize(orig_inputs.size());
        for (size_t i = 0; i < orig_inputs.size(); i++)
            convertFp16(orig_inputs[i], inputs[i]);
    }
    else
    {
        inps.getUMatVector(inputs);
    }
    std::vector<LabelBBox> allDecodedBBoxes;
    std::vector<Mat> allConfidenceScores;
    int num = inputs[0].size[0];
    // extract predictions from input layers
    {
        int numPriors = inputs[2].size[2] / 4;
        // Retrieve all confidences
        ocl_GetConfidenceScores(inputs[1], num, numPriors, _numClasses, allConfidenceScores);
        // Decode all loc predictions to bboxes
        bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,
                                       _shareLocation, _numLocClasses, _backgroundLabelId,
                                       _codeType, _varianceEncodedInTarget, _clip,
                                       allDecodedBBoxes);
        if (!ret)
            return false;
    }
    // Per-image NMS; numKept counts the surviving detections.
    size_t numKept = 0;
    std::vector<std::map<int, std::vector<int> > > allIndices;
    for (int i = 0; i < num; ++i)
    {
        numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
    }
    if (numKept == 0)
    {
        outputs[0].setTo(0);
        return true;
    }
    // fp16 output: stage results in a zeroed fp32 buffer, convert at the end.
    UMat umat = use_half ? UMat::zeros(4, outputs[0].size, CV_32F) : outputs[0];
    if (!use_half)
        umat.setTo(0);
    // If there are valid detections
    if (numKept > 0)
    {
        Mat mat = umat.getMat(ACCESS_WRITE);
        float* outputsData = mat.ptr<float>();
        size_t count = 0;
        for (int i = 0; i < num; ++i)
        {
            // Rows are packed back-to-back; each detection occupies 7 floats.
            count += outputDetections_(i, &outputsData[count * 7],
                                       allDecodedBBoxes[i], allConfidenceScores[i],
                                       allIndices[i], _groupByClasses);
        }
        CV_Assert(count == numKept);
    }
    if (use_half)
    {
        UMat half_umat;
        convertFp16(umat, half_umat);
        outs.assign(std::vector<UMat>(1, half_umat));
    }
    return true;
}
#endif
// CPU forward: decode location predictions against the priors, apply
// per-image NMS, and write [image_id, label, conf, xmin, ymin, xmax, ymax]
// rows into the single output blob.
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());
    // The OpenCL path is only taken for normalized boxes.
    if (_bboxesNormalized)
    {
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
    }
    if (inputs_arr.depth() == CV_16S)
    {
        forward_fallback(inputs_arr, outputs_arr, internals_arr);
        return;
    }
    std::vector<Mat> inputs, outputs;
    inputs_arr.getMatVector(inputs);
    outputs_arr.getMatVector(outputs);
    std::vector<LabelBBox> allDecodedBBoxes;
    std::vector<Mat> allConfidenceScores;
    int num = inputs[0].size[0];
    // extract predictions from input layers
    {
        int numPriors = inputs[2].size[2] / 4;
        const float* locationData = inputs[0].ptr<float>();
        const float* confidenceData = inputs[1].ptr<float>();
        const float* priorData = inputs[2].ptr<float>();
        // Retrieve all location predictions
        std::vector<LabelBBox> allLocationPredictions;
        GetLocPredictions(locationData, num, numPriors, _numLocClasses,
                          _shareLocation, _locPredTransposed, allLocationPredictions);
        // Retrieve all confidences
        GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);
        // Retrieve all prior bboxes
        std::vector<util::NormalizedBBox> priorBBoxes;
        std::vector<std::vector<float> > priorVariances;
        GetPriorBBoxes(priorData, numPriors, _bboxesNormalized, priorBBoxes, priorVariances);
        // Decode all loc predictions to bboxes
        util::NormalizedBBox clipBounds;
        if (_clip)
        {
            // Unnormalized boxes clip against the real image size, which must
            // arrive as a 4th input.
            CV_Assert(_bboxesNormalized || inputs.size() >= 4);
            clipBounds.xmin = clipBounds.ymin = 0.0f;
            if (_bboxesNormalized)
                clipBounds.xmax = clipBounds.ymax = 1.0f;
            else
            {
                // Input image sizes;
                CV_Assert(inputs[3].dims == 4);
                clipBounds.xmax = inputs[3].size[3] - 1;
                clipBounds.ymax = inputs[3].size[2] - 1;
            }
        }
        DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
                        _shareLocation, _numLocClasses, _backgroundLabelId,
                        _codeType, _varianceEncodedInTarget, _clip, clipBounds,
                        _bboxesNormalized, allDecodedBBoxes);
    }
    size_t numKept = 0;
    std::vector<std::map<int, std::vector<int> > > allIndices;
    for (int i = 0; i < num; ++i)
    {
        numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
    }
    outputs[0].setTo(0);
    // If there is no detections
    if (numKept == 0)
        return;
    float* outputsData = outputs[0].ptr<float>();
    size_t count = 0;
    for (int i = 0; i < num; ++i)
    {
        count += outputDetections_(i, &outputsData[count * 7],
                                   allDecodedBBoxes[i], allConfidenceScores[i],
                                   allIndices[i], _groupByClasses);
    }
    CV_Assert(count == numKept);
    // Sync results back due changed output shape.
    outputs_arr.assign(outputs);
}
// Writes one image's kept detections as 7-float rows starting at
// `outputsData`, returning the number of rows written. The first pass builds
// a destination permutation (globally score-sorted unless groupByClasses),
// the second pass emits the rows through that permutation.
size_t outputDetections_(
    const int i, float* outputsData,
    const LabelBBox& decodeBBoxes, Mat& confidenceScores,
    const std::map<int, std::vector<int> >& indicesMap,
    bool groupByClasses
)
{
    std::vector<int> dstIndices;
    std::vector<std::pair<float, int> > allScores;
    for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
    {
        int label = it->first;
        if (confidenceScores.rows <= label)
            CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
        // NOTE(review): this binds a Mat row through Mat's vector conversion,
        // copying the row into a temporary each iteration.
        const std::vector<float>& scores = confidenceScores.row(label);
        const std::vector<int>& indices = it->second;
        const int numAllScores = allScores.size();
        allScores.reserve(numAllScores + indices.size());
        for (size_t j = 0; j < indices.size(); ++j)
        {
            // The payload is the detection's running ordinal across labels.
            allScores.push_back(std::make_pair(scores[indices[j]], numAllScores + j));
        }
    }
    if (!groupByClasses)
        std::sort(allScores.begin(), allScores.end(), util::SortScorePairDescend<int>);
    // dstIndices[ordinal] = output row for that detection.
    dstIndices.resize(allScores.size());
    for (size_t j = 0; j < dstIndices.size(); ++j)
    {
        dstIndices[allScores[j].second] = j;
    }
    size_t count = 0;
    for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
    {
        int label = it->first;
        if (confidenceScores.rows <= label)
            CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
        const std::vector<float>& scores = confidenceScores.row(label);
        // Shared locations are stored under the -1 key.
        int locLabel = _shareLocation ? -1 : label;
        LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel);
        if (label_bboxes == decodeBBoxes.end())
            CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", locLabel));
        const std::vector<int>& indices = it->second;
        for (size_t j = 0; j < indices.size(); ++j, ++count)
        {
            int idx = indices[j];
            int dstIdx = dstIndices[count];
            const util::NormalizedBBox& decode_bbox = label_bboxes->second[idx];
            // Row layout: [image_id, label, confidence, xmin, ymin, xmax, ymax].
            outputsData[dstIdx * 7] = i;
            outputsData[dstIdx * 7 + 1] = label;
            outputsData[dstIdx * 7 + 2] = scores[idx];
            outputsData[dstIdx * 7 + 3] = decode_bbox.xmin;
            outputsData[dstIdx * 7 + 4] = decode_bbox.ymin;
            outputsData[dstIdx * 7 + 5] = decode_bbox.xmax;
            outputsData[dstIdx * 7 + 6] = decode_bbox.ymax;
        }
    }
    return count;
}
// Runs per-class NMS for one image and appends the surviving prior indices
// (keyed by class) to allIndices. When keep_top_k is set and exceeded, only
// the globally highest-scoring keep_top_k detections survive. Returns the
// number of detections kept for this image.
size_t processDetections_(
    const LabelBBox& decodeBBoxes, Mat& confidenceScores,
    std::vector<std::map<int, std::vector<int> > >& allIndices
)
{
    std::map<int, std::vector<int> > indices;
    size_t numDetections = 0;
    for (int c = 0; c < (int)_numClasses; ++c)
    {
        if (c == _backgroundLabelId)
            continue; // Ignore background class.
        if (c >= confidenceScores.rows)
            CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c));
        // Mat row converted into a value vector (copies the row).
        const std::vector<float> scores = confidenceScores.row(c);
        // Shared locations live under key -1; per-class under the class id.
        int label = _shareLocation ? -1 : c;
        LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label);
        if (label_bboxes == decodeBBoxes.end())
            CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
        // Normalized and pixel-space boxes need different overlap formulas.
        if (_bboxesNormalized)
            NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,
                     indices[c], util::caffe_norm_box_overlap);
        else
            NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,
                     indices[c], util::caffe_box_overlap);
        numDetections += indices[c].size();
    }
    // Enforce the global per-image cap (_keepTopK == -1 disables it).
    if (_keepTopK > -1 && numDetections > (size_t)_keepTopK)
    {
        std::vector<std::pair<float, std::pair<int, int> > > scoreIndexPairs;
        for (std::map<int, std::vector<int> >::iterator it = indices.begin();
             it != indices.end(); ++it)
        {
            int label = it->first;
            const std::vector<int>& labelIndices = it->second;
            if (label >= confidenceScores.rows)
                CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
            const std::vector<float>& scores = confidenceScores.row(label);
            for (size_t j = 0; j < labelIndices.size(); ++j)
            {
                size_t idx = labelIndices[j];
                CV_Assert(idx < scores.size());
                scoreIndexPairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
            }
        }
        // Keep outputs k results per image.
        std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(),
                  util::SortScorePairDescend<std::pair<int, int> >);
        scoreIndexPairs.resize(_keepTopK);
        // Regroup the survivors back into a label -> indices map.
        std::map<int, std::vector<int> > newIndices;
        for (size_t j = 0; j < scoreIndexPairs.size(); ++j)
        {
            int label = scoreIndexPairs[j].second.first;
            int idx = scoreIndexPairs[j].second.second;
            newIndices[label].push_back(idx);
        }
        allIndices.push_back(newIndices);
        return (size_t)_keepTopK;
    }
    else
    {
        allIndices.push_back(indices);
        return numDetections;
    }
}
// **************************************************************
// Utility functions
// **************************************************************
// Compute bbox size
// Area of `bbox`. Degenerate boxes (max < min on either axis) yield 0; a
// cached size is reused when present. Unnormalized boxes measure extent in
// whole pixels, hence the +1 on each side.
static float BBoxSize(const util::NormalizedBBox& bbox, bool normalized)
{
    if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin)
        return 0; // invalid box

    if (bbox.has_size())
        return bbox.size(); // reuse the cached area

    const float width = bbox.xmax - bbox.xmin;
    const float height = bbox.ymax - bbox.ymin;
    return normalized ? width * height
                      : (width + 1) * (height + 1);
}
// Decode a bbox according to a prior bbox
// Decode a bbox according to a prior bbox.
// When variance is not encoded in the target, each predicted coordinate is
// first scaled by the prior's variance. CORNER coding adds offsets to the
// prior's corners; CENTER_SIZE treats (xmin, ymin) as a center shift relative
// to the prior's size and (xmax, ymax) as log-scale size factors.
template<bool variance_encoded_in_target>
static void DecodeBBox(
    const util::NormalizedBBox& prior_bbox, const std::vector<float>& prior_variance,
    const cv::String& code_type,
    const bool clip_bbox, const util::NormalizedBBox& clip_bounds,
    const bool normalized_bbox, const util::NormalizedBBox& bbox,
    util::NormalizedBBox& decode_bbox)
{
    float bbox_xmin = variance_encoded_in_target ? bbox.xmin : prior_variance[0] * bbox.xmin;
    float bbox_ymin = variance_encoded_in_target ? bbox.ymin : prior_variance[1] * bbox.ymin;
    float bbox_xmax = variance_encoded_in_target ? bbox.xmax : prior_variance[2] * bbox.xmax;
    float bbox_ymax = variance_encoded_in_target ? bbox.ymax : prior_variance[3] * bbox.ymax;
    if (code_type == "CORNER")
    {
        // Offsets applied directly to the prior's corners.
        decode_bbox.xmin = prior_bbox.xmin + bbox_xmin;
        decode_bbox.ymin = prior_bbox.ymin + bbox_ymin;
        decode_bbox.xmax = prior_bbox.xmax + bbox_xmax;
        decode_bbox.ymax = prior_bbox.ymax + bbox_ymax;
    }
    else if (code_type == "CENTER_SIZE")
    {
        float prior_width = prior_bbox.xmax - prior_bbox.xmin;
        float prior_height = prior_bbox.ymax - prior_bbox.ymin;
        if (!normalized_bbox)
        {
            // Pixel-space boxes are inclusive on both ends.
            prior_width += 1.0f;
            prior_height += 1.0f;
        }
        float prior_center_x = prior_bbox.xmin + prior_width * .5;
        float prior_center_y = prior_bbox.ymin + prior_height * .5;
        float decode_bbox_center_x, decode_bbox_center_y;
        float decode_bbox_width, decode_bbox_height;
        // (xmin, ymin) shift the center; (xmax, ymax) scale the size in log space.
        decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
        decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
        decode_bbox_width = exp(bbox_xmax) * prior_width;
        decode_bbox_height = exp(bbox_ymax) * prior_height;
        decode_bbox.xmin = decode_bbox_center_x - decode_bbox_width * .5;
        decode_bbox.ymin = decode_bbox_center_y - decode_bbox_height * .5;
        decode_bbox.xmax = decode_bbox_center_x + decode_bbox_width * .5;
        decode_bbox.ymax = decode_bbox_center_y + decode_bbox_height * .5;
    }
    else
        CV_Error(Error::StsBadArg, "Unknown type.");
    if (clip_bbox)
    {
        // Clip the util::NormalizedBBox.
        decode_bbox.xmin = std::max(std::min(decode_bbox.xmin, clip_bounds.xmax), clip_bounds.xmin);
        decode_bbox.ymin = std::max(std::min(decode_bbox.ymin, clip_bounds.ymax), clip_bounds.ymin);
        decode_bbox.xmax = std::max(std::min(decode_bbox.xmax, clip_bounds.xmax), clip_bounds.xmin);
        decode_bbox.ymax = std::max(std::min(decode_bbox.ymax, clip_bounds.ymax), clip_bounds.ymin);
    }
    // Refresh the cached area for the decoded coordinates.
    decode_bbox.clear_size();
    decode_bbox.set_size(BBoxSize(decode_bbox, normalized_bbox));
}
// Decode a set of bboxes according to a set of prior bboxes
// Decode a set of bboxes against their priors. prior_bboxes, prior_variances
// and bboxes are parallel arrays of equal length; results land in
// decode_bboxes (resized here).
// Fix: loop indices are size_t, matching num_bboxes, instead of the previous
// int counters (signed/unsigned comparison, truncation on huge inputs).
static void DecodeBBoxes(
    const std::vector<util::NormalizedBBox>& prior_bboxes,
    const std::vector<std::vector<float> >& prior_variances,
    const cv::String& code_type, const bool variance_encoded_in_target,
    const bool clip_bbox, const util::NormalizedBBox& clip_bounds,
    const bool normalized_bbox, const std::vector<util::NormalizedBBox>& bboxes,
    std::vector<util::NormalizedBBox>& decode_bboxes)
{
    CV_Assert(prior_bboxes.size() == prior_variances.size());
    CV_Assert(prior_bboxes.size() == bboxes.size());
    size_t num_bboxes = prior_bboxes.size();
    CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4);
    decode_bboxes.clear(); decode_bboxes.resize(num_bboxes);
    // The variance flag is a template argument of DecodeBBox, so branch once
    // outside the loop rather than per element.
    if (variance_encoded_in_target)
    {
        for (size_t i = 0; i < num_bboxes; ++i)
            DecodeBBox<true>(prior_bboxes[i], prior_variances[i], code_type,
                             clip_bbox, clip_bounds, normalized_bbox,
                             bboxes[i], decode_bboxes[i]);
    }
    else
    {
        for (size_t i = 0; i < num_bboxes; ++i)
            DecodeBBox<false>(prior_bboxes[i], prior_variances[i], code_type,
                              clip_bbox, clip_bounds, normalized_bbox,
                              bboxes[i], decode_bboxes[i]);
    }
}
// Decode all bboxes in a batch
// Decode all bboxes in a batch: one LabelBBox per image, one entry per
// location class (keyed -1 when locations are shared), skipping the
// background class.
static void DecodeBBoxesAll(const std::vector<LabelBBox>& all_loc_preds,
                            const std::vector<util::NormalizedBBox>& prior_bboxes,
                            const std::vector<std::vector<float> >& prior_variances,
                            const int num, const bool share_location,
                            const int num_loc_classes, const int background_label_id,
                            const cv::String& code_type, const bool variance_encoded_in_target,
                            const bool clip, const util::NormalizedBBox& clip_bounds,
                            const bool normalized_bbox, std::vector<LabelBBox>& all_decode_bboxes)
{
    CV_Assert(all_loc_preds.size() == num);
    all_decode_bboxes.clear();
    all_decode_bboxes.resize(num);
    for (int i = 0; i < num; ++i)
    {
        // Decode predictions into bboxes.
        const LabelBBox& loc_preds = all_loc_preds[i];
        LabelBBox& decode_bboxes = all_decode_bboxes[i];
        for (int c = 0; c < num_loc_classes; ++c)
        {
            // Shared locations are stored once under key -1.
            int label = share_location ? -1 : c;
            if (label == background_label_id)
                continue; // Ignore background class.
            LabelBBox::const_iterator label_loc_preds = loc_preds.find(label);
            if (label_loc_preds == loc_preds.end())
                CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
            DecodeBBoxes(prior_bboxes, prior_variances,
                         code_type, variance_encoded_in_target, clip, clip_bounds,
                         normalized_bbox, label_loc_preds->second, decode_bboxes[label]);
        }
    }
}
// Get prior bounding boxes from prior_data
// prior_data: 1 x 2 x num_priors * 4 x 1 blob.
// num_priors: number of priors.
// prior_bboxes: stores all the prior bboxes in the format of util::NormalizedBBox.
// prior_variances: stores all the variances needed by prior bboxes.
// Unpack the prior blob (1 x 2 x num_priors*4 x 1): the first channel holds
// the prior corners, the second the 4 encoding variances of each prior.
static void GetPriorBBoxes(const float* priorData, const int& numPriors,
                           bool normalized_bbox, std::vector<util::NormalizedBBox>& priorBBoxes,
                           std::vector<std::vector<float> >& priorVariances)
{
    priorBBoxes.clear(); priorBBoxes.resize(numPriors);
    priorVariances.clear(); priorVariances.resize(numPriors);
    for (int i = 0; i < numPriors; ++i)
    {
        // Corners of prior i live in the first channel.
        const float* corners = priorData + i * 4;
        util::NormalizedBBox& bbox = priorBBoxes[i];
        bbox.xmin = corners[0];
        bbox.ymin = corners[1];
        bbox.xmax = corners[2];
        bbox.ymax = corners[3];
        bbox.set_size(BBoxSize(bbox, normalized_bbox));

        // Variances of prior i follow all the corners (second channel).
        const float* vars = priorData + (numPriors + i) * 4;
        priorVariances[i].assign(vars, vars + 4);
    }
}
// Get location predictions from loc_data.
// loc_data: num x num_preds_per_class * num_loc_classes * 4 blob.
// num: the number of images.
// num_preds_per_class: number of predictions per class.
// num_loc_classes: number of location classes. It is 1 if share_location is
// true; and is equal to number of classes needed to predict otherwise.
// share_location: if true, all classes share the same location prediction.
// loc_pred_transposed: if true, represent four bounding box values as
// [y,x,height,width] or [x,y,width,height] otherwise.
// loc_preds: stores the location prediction, where each item contains
// location prediction for an image.
// Unpack per-image location predictions from the flat locData buffer into
// LabelBBox maps (key -1 when locations are shared). Layout per image:
// [prediction p][location class c][4 coords], with the coordinate order
// swapped to YXHW when locPredTransposed is set.
static void GetLocPredictions(const float* locData, const int num,
                              const int numPredsPerClass, const int numLocClasses,
                              const bool shareLocation, const bool locPredTransposed,
                              std::vector<LabelBBox>& locPreds)
{
    locPreds.clear();
    if (shareLocation)
    {
        CV_Assert(numLocClasses == 1);
    }
    locPreds.resize(num);
    // Advance locData by one image's worth of predictions per iteration.
    for (int i = 0; i < num; ++i, locData += numPredsPerClass * numLocClasses * 4)
    {
        LabelBBox& labelBBox = locPreds[i];
        for (int p = 0; p < numPredsPerClass; ++p)
        {
            int startIdx = p * numLocClasses * 4;
            for (int c = 0; c < numLocClasses; ++c)
            {
                int label = shareLocation ? -1 : c;
                if (labelBBox.find(label) == labelBBox.end())
                {
                    labelBBox[label].resize(numPredsPerClass);
                }
                util::NormalizedBBox& bbox = labelBBox[label][p];
                if (locPredTransposed)
                {
                    // Stored as [ymin, xmin, ymax, xmax].
                    bbox.ymin = locData[startIdx + c * 4];
                    bbox.xmin = locData[startIdx + c * 4 + 1];
                    bbox.ymax = locData[startIdx + c * 4 + 2];
                    bbox.xmax = locData[startIdx + c * 4 + 3];
                }
                else
                {
                    // Stored as [xmin, ymin, xmax, ymax].
                    bbox.xmin = locData[startIdx + c * 4];
                    bbox.ymin = locData[startIdx + c * 4 + 1];
                    bbox.xmax = locData[startIdx + c * 4 + 2];
                    bbox.ymax = locData[startIdx + c * 4 + 3];
                }
            }
        }
    }
}
// Get confidence predictions from conf_data.
// conf_data: num x num_preds_per_class * num_classes blob.
// num: the number of images.
// num_preds_per_class: number of predictions per class.
// num_classes: number of classes.
// conf_preds: stores the confidence prediction, where each item contains
// confidence prediction for an image.
static void GetConfidenceScores(const float* confData, const int num,
const int numPredsPerClass, const int numClasses,
std::vector<Mat>& confPreds)
{
int shape[] = { numClasses, numPredsPerClass };
for (int i = 0; i < num; i++)
confPreds.push_back(Mat(2, shape, CV_32F));
for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses)
{
Mat labelScores = confPreds[i];
for (int c = 0; c < numClasses; ++c)
{
for (int p = 0; p < numPredsPerClass; ++p)
{
labelScores.at<float>(c, p) = confData[p * numClasses + c];
}
}
}
}
// Compute the jaccard (intersection over union IoU) overlap between two bboxes.
template<bool normalized>
static float JaccardOverlap(const util::NormalizedBBox& bbox1,
const util::NormalizedBBox& bbox2)
{
util::NormalizedBBox intersect_bbox;
intersect_bbox.xmin = std::max(bbox1.xmin, bbox2.xmin);
intersect_bbox.ymin = std::max(bbox1.ymin, bbox2.ymin);
intersect_bbox.xmax = std::min(bbox1.xmax, bbox2.xmax);
intersect_bbox.ymax = std::min(bbox1.ymax, bbox2.ymax);
float intersect_size = BBoxSize(intersect_bbox, normalized);
if (intersect_size > 0)
{
float bbox1_size = BBoxSize(bbox1, normalized);
float bbox2_size = BBoxSize(bbox2, normalized);
return intersect_size / (bbox1_size + bbox2_size - intersect_size);
}
else
{
return 0.;
}
}
#ifdef HAVE_INF_ENGINE
// Build an Inference Engine (NN Builder API) DetectionOutput layer mirroring
// this layer's parameters. Input wrappers are unused: IE connects the three
// ports (box logits, class scores, priors) itself.
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
    InferenceEngine::Builder::DetectionOutputLayer ieLayer(name);
    ieLayer.setNumClasses(_numClasses);
    ieLayer.setShareLocation(_shareLocation);
    ieLayer.setBackgroudLabelId(_backgroundLabelId);  // (sic) misspelling is the actual IE API name
    ieLayer.setNMSThreshold(_nmsThreshold);
    // IE requires a positive top_k; fall back to keep_top_k when top_k is unset.
    ieLayer.setTopK(_topK > 0 ? _topK : _keepTopK);
    ieLayer.setKeepTopK(_keepTopK);
    ieLayer.setConfidenceThreshold(_confidenceThreshold);
    ieLayer.setVariantEncodedInTarget(_varianceEncodedInTarget);  // (sic) IE API name
    ieLayer.setCodeType("caffe.PriorBoxParameter." + _codeType);
    ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(3));
    // Parameters without dedicated setters go through the generic map.
    InferenceEngine::Builder::Layer l = ieLayer;
    l.getParameters()["eta"] = std::string("1.0");
    l.getParameters()["clip"] = _clip;
    return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
#endif  // HAVE_INF_ENGINE
#ifdef HAVE_DNN_NGRAPH
// Build an nGraph DetectionOutput node from the three input nodes
// (box logits, class predictions, prior boxes) and this layer's parameters.
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
    CV_Assert(nodes.size() == 3);  // DetectionOutput always takes exactly 3 inputs
    auto& box_logits  = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
    auto& class_preds = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
    auto& proposals   = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
    ngraph::op::DetectionOutputAttrs attrs;
    attrs.num_classes = _numClasses;
    attrs.background_label_id = _backgroundLabelId;
    // nGraph requires a positive top_k; fall back to keep_top_k when unset.
    attrs.top_k = _topK > 0 ? _topK : _keepTopK;
    attrs.variance_encoded_in_target = _varianceEncodedInTarget;
    attrs.keep_top_k = {_keepTopK};
    attrs.nms_threshold = _nmsThreshold;
    attrs.confidence_threshold = _confidenceThreshold;
    attrs.share_location = _shareLocation;
    attrs.clip_before_nms = _clip;
    attrs.code_type = std::string{"caffe.PriorBoxParameter." + _codeType};
    attrs.normalized = true;  // priors are expected in normalized coordinates
    auto det_out = std::make_shared<ngraph::op::DetectionOutput>(box_logits, class_preds,
                                                                 proposals, attrs);
    return Ptr<BackendNode>(new InfEngineNgraphNode(det_out));
}
#endif  // HAVE_DNN_NGRAPH
};
// IoU for boxes in absolute (pixel) coordinates — Caffe convention
// (non-normalized size computation). Thin wrapper used by the NMS utilities.
float util::caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)
{
    return DetectionOutputLayerImpl::JaccardOverlap<false>(a, b);
}
// IoU for boxes in normalized [0, 1] coordinates. Thin wrapper used by the
// NMS utilities.
float util::caffe_norm_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)
{
    return DetectionOutputLayerImpl::JaccardOverlap<true>(a, b);
}
// Out-of-class definition of the static layer-type name used in diagnostics.
const std::string DetectionOutputLayerImpl::_layerName = std::string("DetectionOutput");
// Factory: construct a DetectionOutput layer from serialized parameters.
Ptr<DetectionOutputLayer> DetectionOutputLayer::create(const LayerParams &params)
{
    return makePtr<DetectionOutputLayerImpl>(params);
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,780 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/eltwise.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Element-wise layer: combines two or more input blobs into a single output
// using SUM (optionally with per-input coefficients), PROD, MAX or DIV.
// Inputs normally share the same shape; several "channels modes" relax the
// channel-count requirement (see OutputChannelsMode).
class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
{
public:
    // Supported element-wise operations.
    enum EltwiseOp
    {
        PROD = 0,
        SUM = 1,
        MAX = 2,
        DIV = 3
    } op;
    // Per-input scale factors for SUM; empty means all coefficients are 1.
    std::vector<float> coeffs;

    // How the output channel count relates to the inputs' channel counts.
    enum OutputChannelsMode
    {
        ELTWISE_CHANNNELS_SAME = 0, //!< number of channels from inputs must be the same and equal to output's number of channels
        ELTWISE_CHANNNELS_INPUT_0,  //!< number of channels from inputs may be different,
                                    //!< output's number of channels is equal to number of channels of first input
                                    //!< number of channels of other inputs should not be greater than number of channels of first input
        ELTWISE_CHANNNELS_INPUT_0_TRUNCATE, //!< number of channels from inputs may be different,
                                    //!< output's number of channels is equal to number of channels of first input
                                    //!< there is restriction on number of channels of other inputs
                                    //!< extra channels of other inputs is ignored
        ELTWISE_CHANNNELS_USE_MAX,  //!< number of channels from inputs may be different,
                                    //!< output's number of channels is equal to maximal number of input channels
                                    //!< @note supported operation: `SUM`
    } channelsModeInput;

    mutable OutputChannelsMode channelsMode;  //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal)
    mutable /*size_t*/int outputChannels;     // resolved in getMemoryShapes()

    // Parse operation type, optional coefficients and channels mode from
    // the serialized layer parameters.
    EltwiseLayerImpl(const LayerParams& params)
        : outputChannels(0)
    {
        setParamsFrom(params);
        op = SUM;  // default operation
        if (params.has("operation"))
        {
            String operation = toLowerCase(params.get<String>("operation"));
            if (operation == "prod")
                op = PROD;
            else if (operation == "sum")
                op = SUM;
            else if (operation == "max")
                op = MAX;
            else if (operation == "div")
                op = DIV;
            else
                CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
        }
        if (params.has("coeff"))
        {
            DictValue paramCoeff = params.get("coeff");
            int i, n = paramCoeff.size();
            coeffs.resize(n);
            for (i = 0; i < n; i++)
            {
                coeffs[i] = paramCoeff.get<float>(i);
            }
        }
        channelsModeInput = ELTWISE_CHANNNELS_SAME;
        if (params.has("output_channels_mode"))
        {
            String v = toLowerCase(params.get<String>("output_channels_mode"));
            if (v == "same")
            {
                channelsModeInput = ELTWISE_CHANNNELS_SAME;
            }
            else if (v == "input_0")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
            }
            else if (v == "input_0_truncate")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
            }
            else if (v == "max_input_channels")
            {
                channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
                if (op != SUM)
                    CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
            }
            else
                CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
        }
        channelsMode = channelsModeInput;

        // TODO Must have checks for other unknown options
    }

    // Backend availability: CUDA and OpenCV always; Halide except DIV;
    // IE/nGraph only when all inputs share the channel count.
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && op != DIV) ||  // TODO: not implemented, see PR #15811
               ((((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()))
                  || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && channelsMode == ELTWISE_CHANNNELS_SAME));
    }

    // Validate input shapes against the configured channels mode and derive
    // the single output shape. Also collapses channelsMode to
    // ELTWISE_CHANNNELS_SAME when all inputs turn out to have equal channels.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() >= 2);
        CV_Assert(inputs[0].size() >= 2);
        CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
        CV_Assert(op == SUM || coeffs.size() == 0);  // coefficients only make sense for SUM
        int dims = inputs[0].size();
        // Number of channels in output shape is determined by the first input tensor.
        bool variableChannels = false;
        int numChannels = inputs[0][1];
        for (size_t i = 1; i < inputs.size(); i++)
        {
            CV_Assert(inputs[0][0] == inputs[i][0]);  // batch sizes are equal
            int input_channels = inputs[i][1];
            if (numChannels != input_channels)
                variableChannels = true;
            if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
            {
                CV_Assert(numChannels == input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
            {
                CV_Assert(numChannels >= input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
            {
                // nothing to check
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
            {
                numChannels = std::max(numChannels, input_channels);
            }
            else
            {
                CV_Assert(0 && "Internal error");
            }
            // All non-channel dimensions must match exactly.
            for (size_t j = 2; j < dims; j++)
                CV_Assert(inputs[0][j] == inputs[i][j]);
        }
        channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
        outputChannels = numChannels;
        outputs.assign(1, inputs[0]);
        outputs[0][1] = numChannels;
        return false;
    }

    // Parallel CPU implementation. Work is split into `nstripes` stripes over
    // (batch x plane) elements; channels are iterated inside each stripe so
    // that per-input channel broadcasting can be handled.
    class EltwiseInvoker : public ParallelLoopBody
    {
        EltwiseLayerImpl& self;
        std::vector<const Mat*> srcs;       // inputs, possibly reordered (see run)
        std::vector<int> srcNumChannels;    // channel count of each (reordered) input
        int nsrcs;
        Mat* dst;
        std::vector<float> coeffs;          // reordered alongside srcs
        int nstripes;
        const ActivationLayer* activ;       // optional fused activation
        int channels;
        size_t planeSize;                   // H*W (product of dims >= 2)

        EltwiseInvoker(EltwiseLayerImpl& self_)
            : self(self_)
            , nsrcs(0), dst(0), nstripes(0), activ(0), channels(0)
            , planeSize(0)
        {}

    public:
        // Validate inputs, set up the invoker and run it in parallel.
        static void run(EltwiseLayerImpl& self,
                        const Mat* srcs, int nsrcs, Mat& dst,
                        int nstripes)
        {
            const EltwiseOp op = self.op;
            CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, "");
            CV_CheckTypeEQ(dst.type(), CV_32FC1, "");
            CV_Assert(dst.isContinuous());
            CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
            CV_CheckGE(nsrcs, 2, "");
            CV_Assert(self.outputChannels == dst.size[1]);

            EltwiseInvoker p(self);
            p.srcs.resize(nsrcs);
            p.srcNumChannels.resize(nsrcs);
            p.coeffs = self.coeffs;  // can be sorted

            bool sortInputs = false;
            for( int i = 0; i < nsrcs; i++ )
            {
                p.srcs[i] = &srcs[i];
                CV_CheckEQ(srcs[i].dims, dst.dims, "");
                CV_Assert(srcs[i].isContinuous());
                CV_Assert(srcs[i].type() == dst.type());
                p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;

                if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
                {
                    CV_Assert(srcs[i].size == dst.size);
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
                {
                    CV_Assert(op == SUM);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else
                {
                    CV_Assert(0 && "Internal error");
                }

                if (sortInputs)
                {
                    // Sort srcs and coefficients in the desc order by number of channels
                    // (insertion sort step), so the kernel can process the widest
                    // inputs first and skip narrower ones per channel.
                    for (int j = i; j >= 1; j--)
                    {
                        if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
                        {
                            std::swap(p.srcs[j - 1], p.srcs[j]);
                            std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
                            if (!p.coeffs.empty())
                                std::swap(p.coeffs[j - 1], p.coeffs[j]);
                        }
                        else
                            break;
                    }
                }
            }

            p.nsrcs = nsrcs;
            p.dst = &dst;
            p.nstripes = nstripes;
            p.channels = (dst.dims >= 4 ? dst.size[1] : 1);

            p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
            CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");

            // Drop the coefficient array entirely when all coefficients are 1.
            bool simpleCoeffs = true;
            if (op == SUM && !p.coeffs.empty())
            {
                CV_CheckEQ(p.coeffs.size(), (size_t)nsrcs, "");
                for (size_t i = 0; i < p.coeffs.size(); i++)
                {
                    if (p.coeffs[i] != 1)
                    {
                        simpleCoeffs = false;
                        break;
                    }
                }
            }
            if (simpleCoeffs)
                p.coeffs.clear();
            p.activ = self.activ.get();

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        // Process one range of stripes. Each stripe covers a contiguous chunk
        // of (batch x plane) positions; all channels are computed per chunk.
        void operator()(const Range& r) const CV_OVERRIDE
        {
            const EltwiseOp op = self.op;
            size_t total = dst->size[0]*planeSize;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
            float* dstptr0 = dst->ptr<float>();
            int blockSize0 = 1 << 12;  // process in blocks to keep data in cache

            for (size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                int sampleIdx = (int)(ofs / planeSize);
                int delta = (int)ofs - sampleIdx * planeSize;  // offset within the plane
                int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                if( blockSize <= 0 )
                    break;
                ofs += blockSize;

                for (int c = 0; c < channels; c++)
                {
                    size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize;
                    float* dstptr = dstptr0 + dstIdx;

                    // process first two inputs
                    {
                        const float* srcptr0 = srcs[0]->ptr<float>() + dstIdx;

                        const int inputIdx = 1;
                        int src1_channels = srcNumChannels[inputIdx];
                        if (c >= src1_channels)
                        {
                            // no data from second input: copy (or scale) the first
                            if (!coeffsptr || coeffsptr[0] == 1.0f)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j];
                                }
                            }
                            else
                            {
                                float c0 = coeffsptr[0];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = c0*srcptr0[j];
                                }
                            }
                        }
                        else
                        {
                            size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
                            const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;

                            if (op == PROD)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j] * srcptrI[j];
                                }
                            }
                            else if (op == DIV)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j] / srcptrI[j];
                                }
                            }
                            else if (op == MAX)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
                                }
                            }
                            else if (op == SUM)
                            {
                                if (!coeffsptr || (coeffsptr[0] == 1.0f && coeffsptr[1] == 1.0f))
                                {
                                    for (int j = 0; j < blockSize; j++)
                                    {
                                        dstptr[j] = srcptr0[j] + srcptrI[j];
                                    }
                                }
                                else
                                {
                                    float c0 = coeffsptr[0];
                                    float c1 = coeffsptr[1];
                                    for (int j = 0; j < blockSize; j++)
                                    {
                                        dstptr[j] = c0*srcptr0[j] + c1*srcptrI[j];
                                    }
                                }
                            }
                            else
                                CV_Error(Error::StsInternal, "");
                        }
                    }

                    // aggregate other inputs (3+) into dst in place
                    for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++)
                    {
                        int srcI_channels = srcNumChannels[inputIdx];
                        if (c >= srcI_channels)
                            continue;  // no data from second input
                        size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
                        const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;

                        if (op == PROD)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] *= srcptrI[j];
                            }
                        }
                        else if (op == DIV)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] /= srcptrI[j];
                            }
                        }
                        else if (op == MAX)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] = std::max(dstptr[j], srcptrI[j]);
                            }
                        }
                        else if (op == SUM)
                        {
                            if (!coeffsptr || coeffsptr[inputIdx] == 1.0f)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] += srcptrI[j];
                                }
                            }
                            else
                            {
                                float cI = coeffsptr[inputIdx];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] += cI * srcptrI[j];
                                }
                            }
                        }
                        else
                            CV_Error(Error::StsInternal, "");
                    }
                }

                // Apply the fused activation across all channels of this block.
                if( activ )
                {
                    float* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
                    activ->forwardSlice(ptr, ptr, blockSize, planeSize, 0, channels);
                }
            }
        }
    };

#ifdef HAVE_OPENCL
    // OpenCL path. Only ELTWISE_CHANNNELS_SAME is supported; FP16 is only
    // supported for SUM. Returns false to fall back to the CPU path.
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
            return false;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        switch (op)
        {
            case SUM:
                {
                    int channels = total(shape(outputs[0]), 0, 2);
                    int plane_size = total(shape(outputs[0]), 2);
                    if (channels % 4 == 0 && plane_size % 4 == 0)
                    {
                        // Vectorized kernel path; each pass folds one more input
                        // into the output (-DLOOP=i selects accumulate behavior).
                        size_t localsize[] = { 128 };
                        size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
                        String opts;
                        if (inputs_.depth() == CV_16S)
                            opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
                        else
                            opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";

                        for (int i = 0; i < (inputs.size() - 1); ++i)
                        {
                            String buildopt = format("-DLOOP=%d", i) + opts;
                            ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
                            int idx = 0;
                            // NOTE(review): `inpMat` is computed but never used, and the
                            // kernel operands below are always inputs[0]/inputs[1]
                            // regardless of `i` — for 3+ inputs this looks suspicious;
                            // verify against the op_sum4 kernel in eltwise.cl.
                            UMat inpMat = (i == 0) ? inputs[0] : UMat();
                            float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
                            float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[1]));
                            kernel.set(idx++, (int)plane_size);
                            kernel.set(idx++, (float)coeff1);
                            kernel.set(idx++, (float)coeff2);
                            kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
                            bool ret = kernel.run(1, globalsize, localsize, false);
                            if (!ret)
                                return false;
                        }
                    }
                    else
                    {
                        // Generic fallback via cv::multiply/cv::add (FP32 only).
                        if (inputs_.depth() == CV_16S)
                            return false;

                        float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
                        float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
                        UMat mul0, mul1;
                        multiply(coeff1, inputs[0], mul0);
                        multiply(coeff2, inputs[1], mul1);
                        add(mul0, mul1, outputs[0]);
                        for (int i = 2; i < inputs.size(); ++i)
                        {
                            float coeff = coeffs.empty() ? 1.f : coeffs[i];
                            multiply(coeff, inputs[i], mul0);
                            add(mul0, outputs[0], outputs[0]);
                        }
                    }
                }
                break;
            case PROD:
                multiply(inputs[0], inputs[1], outputs[0]);
                for (int i = 2; i < inputs.size(); ++i)
                    multiply(inputs[i], outputs[0], outputs[0]);
                break;
            case DIV:
                divide(inputs[0], inputs[1], outputs[0]);
                for (int i = 2; i < inputs.size(); ++i)
                    divide(outputs[0], inputs[i], outputs[0]);
                break;
            case MAX:
                max(inputs[0], inputs[1], outputs[0]);
                for (int i = 2; i < inputs.size(); ++i)
                    max(inputs[i], outputs[0], outputs[0]);
                break;
            default:
                return false;
        }
        return true;
    }
#endif

    // CPU (and OpenCL dispatch) forward pass. FP16 inputs are handled by the
    // generic fallback which converts to FP32.
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(outputs.size() == 1);
        const int nstripes = getNumThreads();
        EltwiseInvoker::run(*this,
                            &inputs[0], (int)inputs.size(), outputs[0],
                            nstripes);
    }

#ifdef HAVE_CUDA
    // Map this layer to the cuda4dnn eltwise primitive.
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto op_ = [this] {
            switch (op) {
            case MAX: return cuda4dnn::EltwiseOpType::MAX;
            case SUM: return cuda4dnn::EltwiseOpType::SUM;
            case PROD: return cuda4dnn::EltwiseOpType::PRODUCT;
            case DIV: return cuda4dnn::EltwiseOpType::DIV;
            }
            return cuda4dnn::EltwiseOpType::SUM;  // unreachable; silences warnings
        }();

        return make_cuda_node<cuda4dnn::EltwiseOp>(preferableTarget, std::move(context->stream), op_, coeffs);
    }
#endif

    // Build a Halide expression combining all input buffers element-wise.
    // DIV is excluded by supportBackend().
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Expr topExpr;
        std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
        switch (op)
        {
            case SUM:
                if (coeffs.empty())
                {
                    topExpr = inputBuffers[0](x, y, c, n) +
                              inputBuffers[1](x, y, c, n);
                    for (int i = 2; i < inputBuffers.size(); ++i)
                        topExpr += inputBuffers[i](x, y, c, n);
                }
                else
                {
                    topExpr = coeffs[0] * inputBuffers[0](x, y, c, n) +
                              coeffs[1] * inputBuffers[1](x, y, c, n);
                    for (int i = 2; i < inputBuffers.size(); ++i)
                        topExpr += coeffs[i] * inputBuffers[i](x, y, c, n);
                }
                break;
            case PROD:
                topExpr = inputBuffers[0](x, y, c, n) *
                          inputBuffers[1](x, y, c, n);
                for (int i = 2; i < inputBuffers.size(); ++i)
                    topExpr *= inputBuffers[i](x, y, c, n);
                break;
            case DIV:
                topExpr = inputBuffers[0](x, y, c, n) /
                          inputBuffers[1](x, y, c, n);
                for (int i = 2; i < inputBuffers.size(); ++i)
                    topExpr /= inputBuffers[i](x, y, c, n);
                break;
            case MAX:
                topExpr = max(inputBuffers[0](x, y, c, n),
                              inputBuffers[1](x, y, c, n));
                for (int i = 2; i < inputBuffers.size(); ++i)
                    topExpr = max(topExpr, inputBuffers[i](x, y, c, n));
                break;
            default:
                return Ptr<BackendNode>();
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_INF_ENGINE
    // Map this layer to an Inference Engine (NN Builder API) Eltwise layer.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::Builder::EltwiseLayer ieLayer(name);

        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));

        if (op == SUM)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM);
        else if (op == PROD)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
        else if (op == DIV)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::DIV);
        else if (op == MAX)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX);
        else
            CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");

        InferenceEngine::Builder::Layer l = ieLayer;
        if (!coeffs.empty())
            l.getParameters()["coeff"] = coeffs;

        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    // Build an nGraph subgraph: optionally scale each input by its
    // coefficient, then fold inputs pairwise with the selected op.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        if (!coeffs.empty()) {
            auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
            curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
        }

        for (size_t i = 1; i < nodes.size(); i++)
        {
            auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
            if (!coeffs.empty()) {
                auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[i]);
                next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
            }
            switch (op) {
            case SUM:  curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
            case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
            case DIV:  curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
            case MAX:  curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
            default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
            }
        }
        return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
    }
#endif  // HAVE_DNN_NGRAPH

    // One op per element per input (approximation).
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs);  // suppress unused variable warning
        CV_Assert(inputs.size());

        // FIXIT: handle inputs with different number of channels
        long flops = inputs.size() * total(inputs[0]);

        return flops;
    }

    // Fuse a following activation layer into this one (applied in-place after
    // the element-wise op). Only one activation can be fused.
    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (activ.empty() || layer.empty())
        {
            activ = layer;
            return !activ.empty();
        }
        else
            return false;
    }

    Ptr<ActivationLayer> activ;  // fused activation, may be empty
};
// Factory: construct an Eltwise layer from serialized parameters.
Ptr<EltwiseLayer> EltwiseLayer::create(const LayerParams& params)
{
    return makePtr<EltwiseLayerImpl>(params);
}
}
}

View File

@@ -0,0 +1,238 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <float.h>
#include <algorithm>
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/reshape.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Flatten layer: collapses the axes in [axis, end_axis] of each input blob
// into a single dimension. The forward pass is a zero-copy reshape whenever
// input and output share the underlying buffer.
class FlattenLayerImpl CV_FINAL : public FlattenLayer
{
public:
    // Read the axis range from the serialized parameters.
    // Negative values are resolved against the input rank in finalize()/
    // getMemoryShapes() via clamp().
    FlattenLayerImpl(const LayerParams &params)
    {
        _startAxis = params.get<int>("axis", 1);
        _endAxis = params.get<int>("end_axis", -1);
        setParamsFrom(params);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine());
    }

    // Compute the flattened output shape: dims before startAxis are kept,
    // [startAxis, endAxis] collapse into one dimension, dims after endAxis
    // are kept. All inputs must share the same shape.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() > 0);
        for (size_t i = 1; i < inputs.size(); i++)
        {
            CV_Assert(inputs[i] == inputs[0]);
        }

        int numAxes = inputs[0].size();
        int startAxis = clamp(_startAxis, numAxes);  // resolve negative axes
        int endAxis = clamp(_endAxis, numAxes);

        CV_Assert(startAxis >= 0);
        CV_Assert(endAxis >= startAxis && endAxis < (int)numAxes);

        size_t flattenedDimensionSize = total(inputs[0], startAxis, endAxis + 1);

        MatShape outputShapeVec;
        for (int i = 0; i < startAxis; i++)
        {
            outputShapeVec.push_back(inputs[0][i]);
        }
        outputShapeVec.push_back(flattenedDimensionSize);
        for (size_t i = endAxis + 1; i < numAxes; i++)
        {
            outputShapeVec.push_back(inputs[0][i]);
        }
        // NOTE(review): output rank is capped at 4 here — presumably a
        // limitation of downstream consumers/backends; confirm before lifting.
        CV_Assert(outputShapeVec.size() <= 4);

        outputs.resize(inputs.size(), outputShapeVec);

        return true;
    }

    // Resolve negative axis indices once the actual input rank is known.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);

        int numAxes = inputs[0].dims;
        _startAxis = clamp(_startAxis, numAxes);
        _endAxis = clamp(_endAxis, numAxes);
    }

#ifdef HAVE_OPENCL
    // OpenCL path: flatten is a metadata-only reshape of each input UMat.
    bool forward_ocl(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        std::vector<UMat> inpvec;
        std::vector<UMat> outputs;

        inputs_arr.getUMatVector(inpvec);
        outputs_arr.getUMatVector(outputs);

        std::vector<UMat*> inputs(inpvec.size());
        for (int i = 0; i < inpvec.size(); i++)
            inputs[i] = &inpvec[i];

        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape outShape = shape(outputs[i]);
            UMat& output = outputs_arr.getUMatRef(i);
            output = inputs[i]->reshape(1, (int)outShape.size(), &outShape[0]);  // zero-copy
        }

        return true;
    }
#endif

    // CPU forward: copy only when input and output buffers differ;
    // otherwise the reshape already aliased the data.
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
                   outputs_arr.isUMatVector(),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape outShape = shape(outputs[i]);
            if (inputs[i].data != outputs[i].data)
            {
                inputs[i].reshape(1, (int)outShape.size(), &outShape[0]).copyTo(outputs[i]);
            }
        }
    }

#ifdef HAVE_CUDA
    // CUDA path: flatten is implemented as the generic reshape primitive.
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
    }
#endif

#ifdef HAVE_INF_ENGINE
    // Map to the Inference Engine (NN Builder API) generic "Flatten" layer.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::Builder::Layer ieLayer(name);
        ieLayer.setName(name);
        ieLayer.setType("Flatten");
        ieLayer.getParameters()["axis"] = (size_t)_startAxis;
        ieLayer.getParameters()["end_axis"] = _endAxis;  // Do not cast to size_t because it might be negative.
        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    // nGraph path: compute the flattened shape from the static input shape
    // and emit a Reshape node with a constant target shape.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        std::vector<size_t> dims = ieInpNode->get_shape();

        int numAxes = dims.size();
        int startAxis = clamp(_startAxis, numAxes);
        int endAxis = clamp(_endAxis, numAxes);

        CV_Assert(startAxis >= 0);
        CV_Assert(endAxis >= startAxis && endAxis < numAxes);

        int64_t flattenedDimensionSize = std::accumulate(dims.begin() + startAxis,
                                         dims.begin() + endAxis + 1, 1, std::multiplies<size_t>());

        std::vector<int64_t> outputShapeVec(dims.begin(), dims.begin() + startAxis);
        outputShapeVec.push_back(flattenedDimensionSize);
        outputShapeVec.insert(outputShapeVec.end(), dims.begin() + endAxis + 1, dims.end());

        auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
                     ngraph::Shape({outputShapeVec.size()}), outputShapeVec.data());
        auto reshape = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, shape, true);
        return Ptr<BackendNode>(new InfEngineNgraphNode(reshape));
    }
#endif  // HAVE_DNN_NGRAPH

    // Axis range to flatten; negative values are permitted until finalize().
    int _startAxis;
    int _endAxis;
};
// Factory method: builds a Flatten layer instance from serialized parameters.
Ptr<FlattenLayer> FlattenLayer::create(const LayerParams& params)
{
    Ptr<FlattenLayer> layer(new FlattenLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,538 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/inner_product.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Fully-connected (InnerProduct) layer implementation.
// The input is viewed as a 2D matrix [outerSize x innerSize] (axes before
// `axis` form the outer part) and the layer computes dst = src * W^T (+ bias),
// optionally followed by a fused activation. Dispatches to OpenCL, CUDA,
// Halide and Inference Engine / nGraph backends when available.
class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
{
public:
    enum { VEC_ALIGN = 8 };  // weight rows are zero-padded to a multiple of 8 floats for SIMD

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNInnerProduct<float> > innerProductOp;  // cached OpenCL GEMM primitive
    std::vector<UMat> umat_blobs;                     // FP32 device copies of the blobs
    std::vector<UMat> half_blobs;                     // FP16 device copies (half-precision targets)
#endif

    // Reads num_output / bias_term / axis from params, reshapes the weight
    // blob to [numOutput x innerSize] and, when the row length is not a
    // multiple of VEC_ALIGN, re-allocates it inside a zero-padded buffer so
    // vectorized kernels may read whole aligned chunks past the row end.
    FullyConnectedLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        CV_Assert(1 <= blobs.size() && blobs.size() <= 2);

        int numOutput = params.get<int>("num_output");
        int innerSize = (int)blobs[0].total() / numOutput;
        bias = params.get<bool>("bias_term", true);
        axis = params.get<int>("axis", 1);
        CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
        CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));

        weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
        int vecsize = weightsMat.cols;
        if( vecsize % VEC_ALIGN != 0 )
        {
            // weightsMat becomes a view of the first `vecsize` columns of the
            // aligned buffer; the padding columns stay zero.
            int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
            Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
            Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
            wpadding.setTo(Scalar::all(0.));
            weightsMat = weightsBuf.colRange(0, vecsize);
            blobs[0].copyTo(weightsMat);
        }

        if (bias)
            biasMat = blobs[1] = blobs[1].reshape(1, 1);
        else
            biasMat = Mat::zeros(1, numOutput, weightsMat.type());  // zero bias when bias_term=false
    }

    // Output shape keeps the input axes before the (clamped) axis and appends
    // a single dimension of numOutput.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1);
        CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
        CV_Assert(blobs[0].dims == 2);

        int cAxis = clamp(axis, inputs[0]);
        int numOutput = blobs[0].size[0];
        MatShape outShape(cAxis + 1);
        for (int i = 0; i < cAxis; ++i)
            outShape[i] = inputs[0][i];
        outShape.back() = numOutput;

        outputs.resize(inputs.size(), outShape);
        CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
        return false;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        // Halide and Inference Engine paths only handle the default axis == 1.
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1) ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine() && axis == 1);
    }

    // Fuses a following activation layer into this one; also allows resetting
    // it with an empty layer. Returns whether an activation is attached.
    virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (activ.empty() || layer.empty())
        {
            activ = layer;
            return !activ.empty();
        }
        else
            return false;
    }

    // CPU kernel: dst = src * W^T + b, parallelized by splitting the flat
    // (#samples x #outputs) work range into `nstripes` stripes.
    class FullyConnected : public ParallelLoopBody
    {
    public:
        FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false) {}

        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
                        Mat& dstMat, const ActivationLayer* activ, int nstripes)
        {
            CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
                       dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
                       srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
                       srcMat.type() == CV_32F &&
                       (biasMat.empty() || (biasMat.type() == srcMat.type() &&
                                            biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );

            FullyConnected p;

            p.srcMat = &srcMat;
            p.weights = &weights;
            p.biasMat = &biasMat;
            p.dstMat = &dstMat;
            p.nstripes = nstripes;
            p.activ = activ;
            // CPU capability flags are probed once per run, not per stripe.
            p.useAVX = checkHardwareSupport(CPU_AVX);
            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        void operator()(const Range& r) const CV_OVERRIDE
        {
            int valign = FullyConnectedLayerImpl::VEC_ALIGN;
            int nsamples = srcMat->rows;
            int nw0 = weights->rows;
            int k, vecsize = srcMat->cols;
            int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
            // Work items are (sample, output) pairs laid out in a flat range.
            size_t total = (size_t)nsamples*nw0;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
            size_t wstep = weights->step1();
            // Aligned, zero-padded scratch copy of the current input row so
            // the SIMD loops can read past the logical row end safely.
            AutoBuffer<float> srcbuf(vecsize_aligned + valign);
            float* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(float)));

            for( k = vecsize; k < vecsize_aligned; k++ )
                sptr[k] = 0.f;

            for( size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                // Decode flat offset into (sample row, first output index).
                int sampleIdx = (int)(ofs / nw0);
                int delta = (int)(ofs - (size_t)sampleIdx*nw0);
                const float* sptr_ = srcMat->ptr<float>(sampleIdx);
                const float* wptr = weights->ptr<float>(delta);
                float* dptr = dstMat->ptr<float>(sampleIdx) + delta;
                const float* biasptr = biasMat->ptr<float>() + delta;
                // Outputs to produce for this sample within this stripe.
                int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));

                memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));

            #if CV_TRY_AVX512_SKX
                if( useAVX512 )
                    opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
            #if CV_TRY_AVX2
                if( useAVX2 )
                    opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
            #if CV_TRY_AVX
                if( useAVX )
                    opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
                {
                    int i = 0;

            #if CV_SIMD128
                    // Universal-intrinsics path: 4 output neurons per iteration.
                    for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
                    {
                        v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
                        v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);

                        for( k = 0; k < vecsize; k += 4 )
                        {
                            v_float32x4 v = v_load_aligned(sptr + k);
                            vs0 += v*v_load_aligned(wptr + k);
                            vs1 += v*v_load_aligned(wptr + wstep + k);
                            vs2 += v*v_load_aligned(wptr + wstep*2 + k);
                            vs3 += v*v_load_aligned(wptr + wstep*3 + k);
                        }

                        v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
                        s += v_load(biasptr + i);
                        v_store(dptr + i, s);
                    }
            #endif

                    // Scalar tail (and full fallback when SIMD is unavailable).
                    for( ; i < nw; i++, wptr += wstep )
                    {
                        float s0=biasptr[i];

                        for( k = 0; k < vecsize; k++ )
                        {
                            float v = sptr[k];
                            s0 += v*wptr[k];
                        }
                        dptr[i] = s0;
                    }
                }

                // Fused activation applied in place on the freshly written span.
                if(activ)
                    activ->forwardSlice(dptr, dptr, 1, 1, delta, delta + nw);

                ofs += nw;
            }
        }

        const Mat *srcMat, *weights, *biasMat;
        const ActivationLayer* activ;  // optional fused activation (may be null)
        Mat* dstMat;
        int nstripes;
        bool useAVX;
        bool useAVX2;
        bool useAVX512;
    };

#ifdef HAVE_OPENCL
    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
    {
        // Drop cached OpenCL state; it is rebuilt lazily on the next forward_ocl().
        innerProductOp.release();
        umat_blobs.clear();
        half_blobs.clear();
    }

    // OpenCL path: tries the OCL4DNN inner-product primitive first and falls
    // back to a plain GEMM on failure. Returns true when the outputs were
    // produced on the device.
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        bool use_half = (inps.depth() == CV_16S);  // FP16 tensors are transported as CV_16S
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        int axisCan = clamp(axis, inputs[0].dims);
        int numOutput = blobs[0].size[0];
        int innerSize = blobs[0].size[1];
        int outerSize = total(shape(inputs[0]), 0, axisCan);
        bool ret = true;

        if (innerProductOp.empty())
        {
            // First call: upload the blobs and create the OCL4DNN primitive.
            size_t n = blobs.size();
            umat_blobs.resize(n);
            for (int i = 0; i < n; i++) blobs[i].copyTo(umat_blobs[i]);

            OCL4DNNInnerProductConfig config;
            config.num_output = numOutput;
            config.bias_term = bias;
            config.M = outerSize;
            config.K = innerSize;
            config.use_half = use_half;

            if (use_half)
            {
                half_blobs.resize(umat_blobs.size());
                for (int i = 0; i < umat_blobs.size(); i++)
                {
                    if (!umat_blobs[i].empty())
                        convertFp16(umat_blobs[i], half_blobs[i]);
                }
            }

            innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
        }

        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape inshape, outshape;
            inshape = shape(outerSize, innerSize);
            outshape = shape(outerSize, numOutput);

            UMat srcMat, dstMat;
            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);

            if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
                                         (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
                                         dstMat))
            {
                ret = false;
                break;
            }

            // NOTE(review): for FP32 with outerSize > 1 the bias row is added
            // here via gemm rather than inside the primitive -- presumably the
            // primitive only fuses bias for the other configurations; confirm
            // against OCL4DNNInnerProduct.
            if (!use_half && bias && (outerSize > 1))
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
                cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
            }
        }

        if (ret) return true;

        // Fallback: dst = src * W^T (+ bias) via cv::gemm, converting FP16
        // buffers to FP32 around the computation.
        UMat& weights = umat_blobs[0];
        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape inshape, outshape;
            inshape = shape(outerSize, innerSize);
            outshape = shape(outerSize, numOutput);

            UMat srcMat, dstMat, srcMat_fp32, dstMat_fp32;
            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);

            if (use_half)
            {
                convertFp16(srcMat, srcMat_fp32);
                convertFp16(dstMat, dstMat_fp32);
            }
            else
            {
                srcMat_fp32 = srcMat;
                dstMat_fp32 = dstMat;
            }

            cv::gemm(srcMat_fp32, weights, 1, noArray(), 0, dstMat_fp32, GEMM_2_T);

            if (bias)
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
                cv::gemm(biasOnesMat, biases, 1, dstMat_fp32, 1, dstMat_fp32, 0);
            }

            if (use_half)
            {
                convertFp16(srcMat_fp32, srcMat);
                convertFp16(dstMat_fp32, dstMat);
            }
        }

        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        // FP16 inputs on the CPU path go through the generic fallback
        // (convert to FP32, forward, convert back).
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> input, output;
        inputs_arr.getMatVector(input);
        outputs_arr.getMatVector(output);

        int axisCan = clamp(axis, input[0].dims);
        int outerSize = input[0].total(0, axisCan);

        for (size_t i = 0; i < input.size(); i++)
        {
            // Flatten each tensor to [outerSize x innerSize] for the 2D kernel.
            Mat srcMat = input[i].reshape(1, outerSize);
            Mat dstMat = output[i].reshape(1, outerSize);

            const int nstripes = getNumThreads();
            FullyConnected::run(srcMat, weightsMat, biasMat, dstMat, activ.get(), nstripes);
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();

        // Axes from this index onwards are flattened before the device GEMM.
        auto flatten_start_axis = clamp(axis, input_wrapper->getRank());

        auto biasMat_ = bias ? biasMat : Mat();
        return make_cuda_node<cuda4dnn::InnerProductOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), flatten_start_axis, weightsMat, biasMat_);
    }
#endif

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        int inW, inH, inC, inN, outC = blobs[0].size[0];
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
        auto weights = wrapToHalideBuffer(blobs[0], {inW, inH, inC, outC});

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        // Reduce over the whole spatial x channel extent of the input.
        Halide::RDom r(0, inW, 0, inH, 0, inC);
        Halide::Expr topExpr = sum(inputBuffer(r.x, r.y, r.z, n) *
                                   weights(r.x, r.y, r.z, c));
        if (bias)
        {
            Halide::Buffer<float> bias = wrapToHalideBuffer(blobs[1], {outC});
            topExpr += bias(c);
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::FullyConnectedLayer ieLayer(name);

        const int outNum = blobs[0].size[0];
        ieLayer.setOutputNum(outNum);

        InferenceEngine::Builder::Layer l = ieLayer;
        addConstantData("weights", wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW), l);
        if (bias)
            addConstantData("biases", wrapToInfEngineBlob(blobs[1], {(size_t)outNum}, InferenceEngine::Layout::C), l);

        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    // nGraph path: reshape the input to [batch x innerSize], MatMul with the
    // transposed weights and broadcast-add the bias row.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        int batch = ieInpNode->get_shape()[0];
        std::vector<size_t> data = {(size_t)batch, (size_t)blobs[0].size[1]};
        auto new_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, data.data());
        auto inp = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, new_shape, true);

        std::vector<size_t> weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
        auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, weight_shape, blobs[0].data);
        // transpose_b = true: weights are stored [numOutput x innerSize].
        auto matmul = std::make_shared<ngraph::op::MatMul>(inp, ieWeights, false, true);

        if (bias) {
            // blobs[1] was reshaped to a single row in the constructor,
            // hence size[1] is the number of outputs.
            auto bias_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
                                              ngraph::Shape{(size_t)blobs[1].size[1]}, blobs[1].data);
            auto fc = std::make_shared<ngraph::op::v1::Add>(matmul, bias_node, ngraph::op::AutoBroadcastType::NUMPY);
            return Ptr<BackendNode>(new InfEngineNgraphNode(fc));
        }
        return Ptr<BackendNode>(new InfEngineNgraphNode(matmul));
    }
#endif  // HAVE_DNN_NGRAPH

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(inputs); // suppress unused variable warning

        long flops = 0;
        int innerSize = blobs[0].size[1];
        // Heuristic: ~3 ops per weight per output element.
        for(int i = 0; i < outputs.size(); i++)
        {
            flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);
        }

        return flops;
    }

    bool bias;                   // whether a bias term is applied
    Mat weightsMat, biasMat;     // [numOutput x innerSize] weights; 1 x numOutput bias row
    Ptr<ActivationLayer> activ;  // fused activation layer, may be empty
};
// Factory method: builds an InnerProduct (fully-connected) layer instance.
Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
{
    Ptr<InnerProductLayer> layer(new FullyConnectedLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,243 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
namespace cv
{
namespace dnn
{
namespace util
{
// Concatenates a parameter-name base with a suffix (e.g. "kernel" + "_h").
std::string makeName(const std::string& str1, const std::string& str2)
{
    std::string joined(str1);
    joined += str2;
    return joined;
}
// Reads a (height, width)-style parameter from LayerParams into `parameter`.
// Lookup priority: the explicit "<nameBase>_h"/"<nameBase>_w" pair first, then
// the list/scalar parameter `nameAll` (falling back to `nameBase` when nameAll
// is empty); a single scalar is broadcast to both dimensions. Returns false
// only when the parameter is absent and no default was requested.
bool getParameter(const LayerParams &params, const std::string& nameBase, const std::string& nameAll,
                  std::vector<size_t>& parameter, bool hasDefault = false, const std::vector<size_t>& defaultValue = std::vector<size_t>(2, 0))
{
    std::string nameH = makeName(nameBase, std::string("_h"));
    std::string nameW = makeName(nameBase, std::string("_w"));
    std::string nameAll_ = nameAll;
    if (nameAll_ == "")
        nameAll_ = nameBase;

    if (params.has(nameH) && params.has(nameW))
    {
        // Explicit per-dimension values win; both must be non-negative.
        CV_Assert(params.get<int>(nameH) >= 0 && params.get<int>(nameW) >= 0);
        parameter.push_back(params.get<int>(nameH));
        parameter.push_back(params.get<int>(nameW));
        return true;
    }
    else
    {
        if (params.has(nameAll_))
        {
            DictValue param = params.get(nameAll_);
            for (int i = 0; i < param.size(); i++) {
                CV_Assert(param.get<int>(i) >= 0);
                parameter.push_back(param.get<int>(i));
            }
            if (parameter.size() == 1)
                parameter.resize(2, parameter[0]);  // broadcast scalar to h and w
            return true;
        }
        else
        {
            if (hasDefault)
            {
                parameter = defaultValue;
                return true;
            }
            else
            {
                return false;
            }
        }
    }
}
// Reads the kernel spatial dimensions ("kernel_size", or the "kernel_h" /
// "kernel_w" pair) into `kernel` and validates that every dimension is
// strictly positive. Raises CV_Error(StsBadArg) when no kernel parameter is
// present at all.
void getKernelSize(const LayerParams &params, std::vector<size_t>& kernel)
{
    if (!util::getParameter(params, "kernel", "kernel_size", kernel))
        CV_Error(cv::Error::StsBadArg, "kernel_size (or kernel_h and kernel_w) not specified");

    // size_t index instead of the previous `int` avoids the signed/unsigned
    // comparison against kernel.size().
    for (size_t i = 0; i < kernel.size(); i++)
        CV_Assert(kernel[i] > 0);
}
// Fills pads_begin/pads_end and strides from LayerParams.
// Explicit pad_t/pad_l/pad_b/pad_r take precedence; otherwise the "pad"
// parameter is read with default 0. A "pad" list of 4+ entries is split into
// begin-pads (first half) and end-pads (second half); shorter lists mean
// symmetric padding. Also reads the optional "pad_mode" string and validates
// that all strides are positive.
void getStrideAndPadding(const LayerParams &params, std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end,
                         std::vector<size_t>& strides, cv::String& padMode, size_t kernel_size = 2)
{
    if (params.has("pad_l") && params.has("pad_t") && params.has("pad_r") && params.has("pad_b")) {
        CV_Assert(params.get<int>("pad_t") >= 0 && params.get<int>("pad_l") >= 0 &&
                  params.get<int>("pad_b") >= 0 && params.get<int>("pad_r") >= 0);
        pads_begin.push_back(params.get<int>("pad_t"));
        pads_begin.push_back(params.get<int>("pad_l"));
        pads_end.push_back(params.get<int>("pad_b"));
        pads_end.push_back(params.get<int>("pad_r"));
    }
    else {
        util::getParameter(params, "pad", "pad", pads_begin, true, std::vector<size_t>(kernel_size, 0));
        if (pads_begin.size() < 4)
            pads_end = pads_begin;  // symmetric padding
        else
        {
            // First half of the list = begin pads, second half = end pads.
            pads_end = std::vector<size_t>(pads_begin.begin() + pads_begin.size() / 2, pads_begin.end());
            pads_begin.resize(pads_begin.size() / 2);
        }
        CV_Assert(pads_begin.size() == pads_end.size());
    }

    util::getParameter(params, "stride", "stride", strides, true, std::vector<size_t>(kernel_size, 1));

    padMode = "";
    if (params.has("pad_mode"))
    {
        padMode = params.get<String>("pad_mode");
    }

    for (int i = 0; i < strides.size(); i++)
        CV_Assert(strides[i] > 0);
}
}
// Parses pooling kernel/stride/padding parameters. In global-pooling mode the
// kernel is implied by the input size, so an explicit kernel is an error and
// pads/strides must be their neutral values (0 / 1).
void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, bool &globalPooling,
                            std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end,
                            std::vector<size_t>& strides, cv::String &padMode)
{
    globalPooling = params.has("global_pooling") &&
                    params.get<bool>("global_pooling");

    if (globalPooling)
    {
        util::getStrideAndPadding(params, pads_begin, pads_end, strides, padMode);
        if(params.has("kernel_h") || params.has("kernel_w") || params.has("kernel_size"))
        {
            CV_Error(cv::Error::StsBadArg, "In global_pooling mode, kernel_size (or kernel_h and kernel_w) cannot be specified");
        }
        for (int i = 0; i < pads_begin.size(); i++) {
            if (pads_begin[i] != 0 || pads_end[i] != 0)
                CV_Error(cv::Error::StsBadArg, "In global_pooling mode, pads must be = 0");
        }
        for (int i = 0; i < strides.size(); i++) {
            if (strides[i] != 1)
                CV_Error(cv::Error::StsBadArg, "In global_pooling mode, strides must be = 1");
        }
    }
    else
    {
        util::getKernelSize(params, kernel);
        util::getStrideAndPadding(params, pads_begin, pads_end, strides, padMode, kernel.size());
    }
}
// Parses convolution kernel, stride, padding, dilation and "adj" parameters
// (the latter presumably the output-padding adjustment used by
// deconvolution). Dilation defaults to 1 and "adj" to 0 per kernel dimension;
// dilations must be strictly positive.
void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
                                std::vector<size_t>& pads_end, std::vector<size_t>& strides,
                                std::vector<size_t>& dilations, cv::String &padMode, std::vector<size_t>& adjust_pads)
{
    util::getKernelSize(params, kernel);
    util::getStrideAndPadding(params, pads_begin, pads_end, strides, padMode, kernel.size());
    util::getParameter(params, "dilation", "dilation", dilations, true, std::vector<size_t>(kernel.size(), 1));
    util::getParameter(params, "adj", "adj", adjust_pads, true, std::vector<size_t>(kernel.size(), 0));

    for (int i = 0; i < dilations.size(); i++)
        CV_Assert(dilations[i] > 0);
}
// From TensorFlow code:
// Total padding on rows and cols is
// Pr = (R' - 1) * S + Kr - R
// Pc = (C' - 1) * S + Kc - C
// where (R', C') are output dimensions, (R, C) are input dimensions, S
// is stride, (Kr, Kc) are filter dimensions.
// We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
// and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means
// we pad more on the right and bottom than on the top and left.
// Computes per-axis output sizes for conv/pooling under the TensorFlow-style
// SAME / VALID padding modes (see the derivation in the comment above);
// throws for any other pad mode. NOTE(review): the expressions mix int with
// size_t, so the arithmetic is performed unsigned -- an input smaller than
// the dilated kernel extent would wrap rather than yield a negative size;
// callers presumably guarantee this cannot happen.
void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
                          const std::vector<size_t>& stride, const String &padMode,
                          const std::vector<size_t>& dilation, std::vector<int>& out)
{
    if (padMode == "VALID")
    {
        // out = ceil((inp - dilated_kernel_extent + 1) / stride)
        for (int i = 0; i < inp.size(); i++)
            out.push_back((inp[i] - dilation[i] * (kernel[i] - 1) - 1 + stride[i]) / stride[i]);
    }
    else if (padMode == "SAME")
    {
        // out = ceil(inp / stride)
        for (int i = 0; i < inp.size(); i++)
            out.push_back((inp[i] - 1 + stride[i]) / stride[i]);
    }
    else
    {
        CV_Error(Error::StsError, "Unsupported padding mode");
    }
}
// Computes per-axis begin/end paddings for the SAME and VALID modes; for any
// other (explicit-padding) mode the pads are left untouched. VALID always
// yields zero padding. SAME assigns floor(total_pad / 2) to both sides, and
// only when stride <= kernel (larger strides keep zero padding).
void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
                         const std::vector<size_t>& strides, const String &padMode,
                         std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end)
{
    if (padMode == "SAME" || padMode == "VALID")
    {
        pads_begin.assign(kernel.size(), 0);
        pads_end.assign(kernel.size(), 0);
    }
    if (padMode == "SAME")
    {
        CV_Assert_N(kernel.size() == strides.size(), kernel.size() == inp.size());
        for (int i = 0; i < pads_begin.size(); i++) {
            // There are test cases with stride > kernel.
            if (strides[i] <= kernel[i])
            {
                // Half of the total SAME padding, rounded down, on each side.
                int pad = (kernel[i] - 1 - (inp[i] - 1 + strides[i]) % strides[i]) / 2;
                pads_begin[i] = pads_end[i] = pad;
            }
        }
    }
}
}
}

View File

@@ -0,0 +1,79 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
// dispatched AVX/AVX2 optimizations
#include "./layers_common.simd.hpp"
#include "layers/layers_common.simd_declarations.hpp"
#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/ocl4dnn.hpp"
#endif
namespace cv
{
namespace dnn
{
void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
std::vector<size_t>& pads_end, std::vector<size_t>& strides, std::vector<size_t>& dilations,
cv::String &padMode, std::vector<size_t>& adjust_pads);
void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, bool &globalPooling,
std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end, std::vector<size_t>& strides, cv::String &padMode);
void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
const std::vector<size_t>& stride, const String &padMode,
const std::vector<size_t>& dilation, std::vector<int>& out);
void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
const std::vector<size_t>& strides, const String &padMode,
std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
}
}
#endif

View File

@@ -0,0 +1,485 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void fastConv( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput );
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
// Computes a block of convolution outputs as dot products between weight rows
// and the im2row-transformed input buffer, three output channels at a time,
// with an optional fused (leaky-)ReLU. AVX2 implementation with an AVX-512
// fast path for long dot products.
//
// weights/wstep   : weight matrix, one row per output channel, row stride wstep (floats)
// bias            : per-output-channel bias values
// rowbuf          : im2row buffer; blockSize rows of vecsize_aligned floats each
// output/outShape : output tensor (N,C,H,W); one plane is outShape[2]*outShape[3] floats
// blockSize       : number of output spatial positions processed in this call
// vecsize         : logical dot-product length; vecsize_aligned is the padded
//                   rowbuf row stride (the SIMD loops assume 8-float alignment)
// relu            : if non-NULL, per-channel negative slopes of a leaky ReLU
// initOutput      : true -> out = bias + dot; false -> out += dot (accumulate)
void fastConv( const float* weights, size_t wstep, const float* bias,
               const float* rowbuf, float* output, const int* outShape,
               int blockSize, int vecsize, int vecsize_aligned,
               const float* relu, bool initOutput )
{
    int outCn = outShape[1];
    size_t outPlaneSize = outShape[2]*outShape[3];
    float r0 = 1.f, r1 = 1.f, r2 = 1.f;
    __m128 vr0 = _mm_set1_ps(1.f), vr1 = vr0, vr2 = vr0, z = _mm_setzero_ps();

    // now compute dot product of the weights
    // and im2row-transformed part of the tensor
    for( int i = 0; i < outCn; i += 3 )
    {
        const float* wptr0 = weights + i*wstep;
        const float* wptr1 = wptr0 + wstep;
        const float* wptr2 = wptr1 + wstep;
        float* outptr0 = output + i*outPlaneSize;
        float* outptr1 = outptr0 + outPlaneSize;
        float* outptr2 = outptr1 + outPlaneSize;
        // NOTE(review): bias[i+1]/bias[i+2] are read before the range check
        // below; the caller is presumably expected to pad the bias buffer so
        // these reads stay in bounds — confirm against the call site.
        float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];

        // When outCn is not a multiple of 3, alias the out-of-range rows onto
        // the last valid one so the same 3-row kernel can be reused; the
        // duplicated stores just rewrite identical values.
        if( i+2 >= outCn )
        {
            wptr2 = wptr1;
            outptr2 = outptr1;
            bias2 = bias1;
            if( i+1 >= outCn )
            {
                wptr2 = wptr1 = wptr0;
                outptr2 = outptr1 = outptr0;
                bias2 = bias1 = bias0;
            }
        }

        if( relu )
        {
            // Per-channel negative slopes, clamped the same way as the rows above.
            r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2];
            if( i+2 >= outCn )
            {
                r2 = r1;
                if( i+1 >= outCn )
                    r2 = r1 = r0;
            }
            vr0 = _mm_set1_ps(r0);
            vr1 = _mm_set1_ps(r1);
            vr2 = _mm_set1_ps(r2);
        }

        int j = 0;
        // Main loop: 4 output positions x 3 output channels per iteration,
        // i.e. 12 independent accumulators.
        for( ; j <= blockSize - 4; j += 4 )
        {
            int k = 0;
            const float* rptr = rowbuf + j*vecsize_aligned;

            __m256 vs00 = _mm256_setzero_ps(), vs01 = _mm256_setzero_ps(),
                   vs02 = _mm256_setzero_ps(), vs03 = _mm256_setzero_ps(),
                   vs10 = _mm256_setzero_ps(), vs11 = _mm256_setzero_ps(),
                   vs12 = _mm256_setzero_ps(), vs13 = _mm256_setzero_ps(),
                   vs20 = _mm256_setzero_ps(), vs21 = _mm256_setzero_ps(),
                   vs22 = _mm256_setzero_ps(), vs23 = _mm256_setzero_ps();

#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling
            if (vecsize >= 32)
            {
                // 16-float-wide accumulation for the bulk of the dot product.
                __m512 vs00_5 = _mm512_setzero_ps(), vs01_5 = _mm512_setzero_ps(),
                       vs02_5 = _mm512_setzero_ps(), vs03_5 = _mm512_setzero_ps(),
                       vs10_5 = _mm512_setzero_ps(), vs11_5 = _mm512_setzero_ps(),
                       vs12_5 = _mm512_setzero_ps(), vs13_5 = _mm512_setzero_ps(),
                       vs20_5 = _mm512_setzero_ps(), vs21_5 = _mm512_setzero_ps(),
                       vs22_5 = _mm512_setzero_ps(), vs23_5 = _mm512_setzero_ps();

                for (; k <= vecsize - 16; k += 16, rptr += 16)
                {
                    __m512 w0 = _mm512_loadu_ps(wptr0 + k);
                    __m512 w1 = _mm512_loadu_ps(wptr1 + k);
                    __m512 w2 = _mm512_loadu_ps(wptr2 + k);
                    __m512 r0 = _mm512_loadu_ps(rptr);

                    vs00_5 = _mm512_fmadd_ps(w0, r0, vs00_5);
                    vs10_5 = _mm512_fmadd_ps(w1, r0, vs10_5);
                    vs20_5 = _mm512_fmadd_ps(w2, r0, vs20_5);

                    r0 = _mm512_loadu_ps(rptr + vecsize_aligned);
                    vs01_5 = _mm512_fmadd_ps(w0, r0, vs01_5);
                    vs11_5 = _mm512_fmadd_ps(w1, r0, vs11_5);
                    vs21_5 = _mm512_fmadd_ps(w2, r0, vs21_5);

                    r0 = _mm512_loadu_ps(rptr + vecsize_aligned*2);
                    vs02_5 = _mm512_fmadd_ps(w0, r0, vs02_5);
                    vs12_5 = _mm512_fmadd_ps(w1, r0, vs12_5);
                    vs22_5 = _mm512_fmadd_ps(w2, r0, vs22_5);

                    r0 = _mm512_loadu_ps(rptr + vecsize_aligned*3);
                    vs03_5 = _mm512_fmadd_ps(w0, r0, vs03_5);
                    vs13_5 = _mm512_fmadd_ps(w1, r0, vs13_5);
                    vs23_5 = _mm512_fmadd_ps(w2, r0, vs23_5);
                }
                /*
                 * now fold the 512 bit accumulator vectors into 256 bit vectors so that the AVX2 code can finish
                 * the tail of the vector
                 */
                vs00 = _mm256_add_ps( _mm512_extractf32x8_ps(vs00_5, 0), _mm512_extractf32x8_ps(vs00_5, 1));
                vs10 = _mm256_add_ps( _mm512_extractf32x8_ps(vs10_5, 0), _mm512_extractf32x8_ps(vs10_5, 1));
                vs20 = _mm256_add_ps( _mm512_extractf32x8_ps(vs20_5, 0), _mm512_extractf32x8_ps(vs20_5, 1));
                vs01 = _mm256_add_ps( _mm512_extractf32x8_ps(vs01_5, 0), _mm512_extractf32x8_ps(vs01_5, 1));
                vs11 = _mm256_add_ps( _mm512_extractf32x8_ps(vs11_5, 0), _mm512_extractf32x8_ps(vs11_5, 1));
                vs21 = _mm256_add_ps( _mm512_extractf32x8_ps(vs21_5, 0), _mm512_extractf32x8_ps(vs21_5, 1));
                vs02 = _mm256_add_ps( _mm512_extractf32x8_ps(vs02_5, 0), _mm512_extractf32x8_ps(vs02_5, 1));
                vs12 = _mm256_add_ps( _mm512_extractf32x8_ps(vs12_5, 0), _mm512_extractf32x8_ps(vs12_5, 1));
                vs22 = _mm256_add_ps( _mm512_extractf32x8_ps(vs22_5, 0), _mm512_extractf32x8_ps(vs22_5, 1));
                vs03 = _mm256_add_ps( _mm512_extractf32x8_ps(vs03_5, 0), _mm512_extractf32x8_ps(vs03_5, 1));
                vs13 = _mm256_add_ps( _mm512_extractf32x8_ps(vs13_5, 0), _mm512_extractf32x8_ps(vs13_5, 1));
                vs23 = _mm256_add_ps( _mm512_extractf32x8_ps(vs23_5, 0), _mm512_extractf32x8_ps(vs23_5, 1));
            }
#endif
            // AVX2 tail (or the whole dot product when AVX-512 is unavailable),
            // 8 floats per step; uses aligned loads, so rowbuf/weights rows must
            // be 32-byte aligned with stride vecsize_aligned.
            for (; k < vecsize; k += 8, rptr += 8 )
            {
                __m256 w0 = _mm256_load_ps(wptr0 + k);
                __m256 w1 = _mm256_load_ps(wptr1 + k);
                __m256 w2 = _mm256_load_ps(wptr2 + k);
                __m256 r0 = _mm256_load_ps(rptr);

                vs00 = _mm256_fmadd_ps(w0, r0, vs00);
                vs10 = _mm256_fmadd_ps(w1, r0, vs10);
                vs20 = _mm256_fmadd_ps(w2, r0, vs20);

                r0 = _mm256_load_ps(rptr + vecsize_aligned);
                vs01 = _mm256_fmadd_ps(w0, r0, vs01);
                vs11 = _mm256_fmadd_ps(w1, r0, vs11);
                vs21 = _mm256_fmadd_ps(w2, r0, vs21);

                r0 = _mm256_load_ps(rptr + vecsize_aligned*2);
                vs02 = _mm256_fmadd_ps(w0, r0, vs02);
                vs12 = _mm256_fmadd_ps(w1, r0, vs12);
                vs22 = _mm256_fmadd_ps(w2, r0, vs22);

                r0 = _mm256_load_ps(rptr + vecsize_aligned*3);
                vs03 = _mm256_fmadd_ps(w0, r0, vs03);
                vs13 = _mm256_fmadd_ps(w1, r0, vs13);
                vs23 = _mm256_fmadd_ps(w2, r0, vs23);
            }

            // Horizontal reduction: after the hadd tree plus the cross-lane
            // permute/add, the low 128 bits of t0/t1/t2 hold the 4 finished dot
            // products (positions j..j+3) for each of the 3 output channels.
            __m256 t0 = _mm256_hadd_ps(_mm256_hadd_ps(vs00, vs01), _mm256_hadd_ps(vs02, vs03));
            __m256 t1 = _mm256_hadd_ps(_mm256_hadd_ps(vs10, vs11), _mm256_hadd_ps(vs12, vs13));
            __m256 t2 = _mm256_hadd_ps(_mm256_hadd_ps(vs20, vs21), _mm256_hadd_ps(vs22, vs23));

            t0 = _mm256_add_ps(t0, _mm256_permute2f128_ps(t0, t0, 1));
            t1 = _mm256_add_ps(t1, _mm256_permute2f128_ps(t1, t1, 1));
            t2 = _mm256_add_ps(t2, _mm256_permute2f128_ps(t2, t2, 1));

            __m128 s0, s1, s2;
            // Seed the result either with the bias (fresh output) or with the
            // previously stored partial sums (accumulation across K-blocks).
            if( initOutput )
            {
                s0 = _mm_set1_ps(bias0);
                s1 = _mm_set1_ps(bias1);
                s2 = _mm_set1_ps(bias2);
            }
            else
            {
                s0 = _mm_loadu_ps(outptr0 + j);
                s1 = _mm_loadu_ps(outptr1 + j);
                s2 = _mm_loadu_ps(outptr2 + j);
            }

            s0 = _mm_add_ps(s0, _mm256_castps256_ps128(t0));
            s1 = _mm_add_ps(s1, _mm256_castps256_ps128(t1));
            s2 = _mm_add_ps(s2, _mm256_castps256_ps128(t2));

            if( relu )
            {
                // Branchless leaky ReLU: where s <= 0, replace s with s*slope.
                // The xor/andnot trick blends s and s*vr using the comparison mask.
                __m128 m0 = _mm_cmp_ps(s0, z, _CMP_GT_OS);
                __m128 m1 = _mm_cmp_ps(s1, z, _CMP_GT_OS);
                __m128 m2 = _mm_cmp_ps(s2, z, _CMP_GT_OS);
                s0 = _mm_xor_ps(s0, _mm_andnot_ps(m0, _mm_xor_ps(_mm_mul_ps(s0, vr0), s0)));
                s1 = _mm_xor_ps(s1, _mm_andnot_ps(m1, _mm_xor_ps(_mm_mul_ps(s1, vr1), s1)));
                s2 = _mm_xor_ps(s2, _mm_andnot_ps(m2, _mm_xor_ps(_mm_mul_ps(s2, vr2), s2)));
            }

            _mm_storeu_ps(outptr0 + j, s0);
            _mm_storeu_ps(outptr1 + j, s1);
            _mm_storeu_ps(outptr2 + j, s2);
        }

        // Scalar tail for the remaining (< 4) output positions.
        for( ; j < blockSize; j++ )
        {
            const float* rptr = rowbuf + j*vecsize_aligned;
            float s00, s10, s20;

            if( initOutput )
            {
                s00 = bias0;
                s10 = bias1;
                s20 = bias2;
            }
            else
            {
                s00 = outptr0[j];
                s10 = outptr1[j];
                s20 = outptr2[j];
            }

            for( int k = 0; k < vecsize; k++ )
            {
                float r0 = rptr[k];
                s00 += wptr0[k]*r0;
                s10 += wptr1[k]*r0;
                s20 += wptr2[k]*r0;
            }

            if( relu )
            {
                s00 = s00 > 0.f ? s00 : s00*r0;
                s10 = s10 > 0.f ? s10 : s10*r1;
                s20 = s20 > 0.f ? s20 : s20*r2;
            }

            outptr0[j] = s00;
            outptr1[j] = s10;
            outptr2[j] = s20;
        }
    }
    // Clear the upper halves of the YMM registers to avoid AVX->SSE
    // transition penalties in subsequent non-VEX code.
    _mm256_zeroupper();
}
// dst = vec * weights^t + bias
// Matrix-vector product used by fully-connected layers: each of the nvecs
// rows of `weights` (row stride wstep, in floats) is dotted with `vec`
// (length vecsize, which the 8-wide aligned loads assume is padded to a
// multiple of 8) and bias[i] is added. Rows are processed eight at a time
// with AVX2 FMA, followed by a one-row-at-a-time tail loop.
void fastGEMM1T( const float* vec, const float* weights,
                 size_t wstep, const float* bias,
                 float* dst, int nvecs, int vecsize )
{
    int i = 0;

    // Main loop: 8 weight rows per iteration, one accumulator per row.
    for( ; i <= nvecs - 8; i += 8 )
    {
        const float* wptr = weights + i*wstep;
        __m256 vs0 = _mm256_setzero_ps(), vs1 = _mm256_setzero_ps(),
               vs2 = _mm256_setzero_ps(), vs3 = _mm256_setzero_ps(),
               vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(),
               vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps();

        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
        {
            __m256 v = _mm256_load_ps(vec + k);

            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
            vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
            vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2);
            vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3);
            vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4);
            vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5);
            vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6);
            vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7);
        }

        // Horizontal reduction: after the hadd tree and the cross-lane add,
        // the low 128 bits of s0/s1 hold the 8 finished dot products.
        __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3));
        __m256 s1 = _mm256_hadd_ps(_mm256_hadd_ps(vs4, vs5), _mm256_hadd_ps(vs6, vs7));

        s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
        s1 = _mm256_add_ps(s1, _mm256_permute2f128_ps(s1, s1, 1));

        s0 = _mm256_add_ps(s0, _mm256_castps128_ps256(_mm_loadu_ps(bias + i)));
        s1 = _mm256_add_ps(s1, _mm256_castps128_ps256(_mm_loadu_ps(bias + i + 4)));

        _mm_storeu_ps(dst + i, _mm256_castps256_ps128(s0));
        _mm_storeu_ps(dst + i + 4, _mm256_castps256_ps128(s1));
    }

    // Tail: remaining rows processed one at a time (still 8-wide inside).
    float temp = 0.f;
    for( ; i < nvecs; i++ )
    {
        const float* wptr = weights + i*wstep;
        __m256 vs0 = _mm256_setzero_ps();

        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
        {
            __m256 v = _mm256_load_ps(vec + k);
            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
        }

        __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0);
        s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
        _mm_store_ss(&temp, _mm256_castps256_ps128(s0));
        dst[i] = temp + bias[i];
    }

    // Avoid AVX->SSE transition penalties in subsequent non-VEX code.
    _mm256_zeroupper();
}
// Dense GEMM: C(ma x nb) = A(ma x na) * B(na x nb), with row strides
// astep/bstep/cstep given in floats. Columns are processed 32 at a time with
// AVX-512 when available, then 16 at a time with AVX2, then a scalar tail.
// Rows are processed four at a time; for the last partial group the row
// pointers are clamped (std::min) onto the final row, so that row is simply
// recomputed a few times instead of needing a separate tail path.
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
               size_t bstep, float* cptr, size_t cstep,
               int ma, int na, int nb )
{
    int n = 0;

#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling
    // 32 columns x 4 rows per iteration (two 16-wide accumulators per row).
    for( ; n <= nb - 32; n += 32 )
    {
        for( int m = 0; m < ma; m += 4 )
        {
            const float* aptr0 = aptr + astep*m;
            const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
            const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
            const float* aptr3 = aptr + astep*std::min(m+3, ma-1);

            float* cptr0 = cptr + cstep*m;
            float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
            float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
            float* cptr3 = cptr + cstep*std::min(m+3, ma-1);

            __m512 d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps();
            __m512 d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps();
            __m512 d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps();
            __m512 d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps();

            for( int k = 0; k < na; k++ )
            {
                // Broadcast one A element per row, FMA against two B vectors.
                __m512 a0 = _mm512_set1_ps(aptr0[k]);
                __m512 a1 = _mm512_set1_ps(aptr1[k]);
                __m512 a2 = _mm512_set1_ps(aptr2[k]);
                __m512 a3 = _mm512_set1_ps(aptr3[k]);

                __m512 b0 = _mm512_loadu_ps(bptr + k*bstep + n);
                __m512 b1 = _mm512_loadu_ps(bptr + k*bstep + n + 16);

                d00 = _mm512_fmadd_ps(a0, b0, d00);
                d01 = _mm512_fmadd_ps(a0, b1, d01);
                d10 = _mm512_fmadd_ps(a1, b0, d10);
                d11 = _mm512_fmadd_ps(a1, b1, d11);
                d20 = _mm512_fmadd_ps(a2, b0, d20);
                d21 = _mm512_fmadd_ps(a2, b1, d21);
                d30 = _mm512_fmadd_ps(a3, b0, d30);
                d31 = _mm512_fmadd_ps(a3, b1, d31);
            }

            _mm512_storeu_ps(cptr0 + n, d00);
            _mm512_storeu_ps(cptr0 + n + 16, d01);
            _mm512_storeu_ps(cptr1 + n, d10);
            _mm512_storeu_ps(cptr1 + n + 16, d11);
            _mm512_storeu_ps(cptr2 + n, d20);
            _mm512_storeu_ps(cptr2 + n + 16, d21);
            _mm512_storeu_ps(cptr3 + n, d30);
            _mm512_storeu_ps(cptr3 + n + 16, d31);
        }
    }
#endif

    // 16 columns x 4 rows per iteration with AVX2.
    for( ; n <= nb - 16; n += 16 )
    {
        for( int m = 0; m < ma; m += 4 )
        {
            const float* aptr0 = aptr + astep*m;
            const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
            const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
            const float* aptr3 = aptr + astep*std::min(m+3, ma-1);

            float* cptr0 = cptr + cstep*m;
            float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
            float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
            float* cptr3 = cptr + cstep*std::min(m+3, ma-1);

            __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
            __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
            __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
            __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();

            for( int k = 0; k < na; k++ )
            {
                __m256 a0 = _mm256_set1_ps(aptr0[k]);
                __m256 a1 = _mm256_set1_ps(aptr1[k]);
                __m256 a2 = _mm256_set1_ps(aptr2[k]);
                __m256 a3 = _mm256_set1_ps(aptr3[k]);

                __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n);
                __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8);

                d00 = _mm256_fmadd_ps(a0, b0, d00);
                d01 = _mm256_fmadd_ps(a0, b1, d01);
                d10 = _mm256_fmadd_ps(a1, b0, d10);
                d11 = _mm256_fmadd_ps(a1, b1, d11);
                d20 = _mm256_fmadd_ps(a2, b0, d20);
                d21 = _mm256_fmadd_ps(a2, b1, d21);
                d30 = _mm256_fmadd_ps(a3, b0, d30);
                d31 = _mm256_fmadd_ps(a3, b1, d31);
            }

            _mm256_storeu_ps(cptr0 + n, d00);
            _mm256_storeu_ps(cptr0 + n + 8, d01);
            _mm256_storeu_ps(cptr1 + n, d10);
            _mm256_storeu_ps(cptr1 + n + 8, d11);
            _mm256_storeu_ps(cptr2 + n, d20);
            _mm256_storeu_ps(cptr2 + n + 8, d21);
            _mm256_storeu_ps(cptr3 + n, d30);
            _mm256_storeu_ps(cptr3 + n + 8, d31);
        }
    }

    // Scalar tail for the remaining (< 16) columns.
    for( ; n < nb; n++ )
    {
        for( int m = 0; m < ma; m++ )
        {
            const float* aptr0 = aptr + astep*m;
            float* cptr0 = cptr + cstep*m;
            float d0 = 0.f;

            for( int k = 0; k < na; k++ )
                d0 += aptr0[k]*bptr[k*bstep + n];

            cptr0[n] = d0;
        }
    }
    // Avoid AVX->SSE transition penalties in subsequent non-VEX code.
    _mm256_zeroupper();
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

View File

@@ -0,0 +1,522 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/hal/hal.hpp"
#include <algorithm>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/lrn.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Local Response Normalization (LRN) layer implementation.
// Supports two modes: normalization across channels (ACROSS_CHANNELS) and
// within a channel's spatial window (WITHIN_CHANNEL), computing
// dst = src / (bias + alpha/normFactor * sum(src^2))^beta
// over a window of `size` elements. Backends: OpenCV CPU (parallelized),
// OpenCL (ocl4dnn), CUDA, Halide, Vulkan, Inference Engine and nGraph.
class LRNLayerImpl CV_FINAL : public LRNLayer
{
public:
    // Reads norm_region/local_size/alpha/beta/bias/norm_by_size from the
    // layer parameters; rejects non-positive or even local_size values.
    LRNLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        type = -1;
        String nrmType = params.get<String>("norm_region", "ACROSS_CHANNELS");
        if (nrmType == "ACROSS_CHANNELS")
            type = CHANNEL_NRM;
        else if (nrmType == "WITHIN_CHANNEL")
            type = SPATIAL_NRM;
        else
            CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\"");

        size = params.get<int>("local_size", 5);
        if (size % 2 != 1 || size <= 0)
            CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size");

        alpha = params.get<double>("alpha", 1);
        beta = params.get<double>("beta", 0.75);
        bias = params.get<double>("bias", 1);
        normBySize = params.get<bool>("norm_by_size", true);
    }

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNLRN<float> > lrnOp;
#endif

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        // The IE backends require an integer-valued k (bias) parameter;
        // nGraph additionally supports only the across-channels mode.
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) {
            return bias == (int)bias;
        }
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
            return type == CHANNEL_NRM && bias == (int)bias;
        }
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               backendId == DNN_BACKEND_HALIDE ||
               (backendId == DNN_BACKEND_VKCOM && haveVulkan() && (size % 2 == 1) && (type == CHANNEL_NRM));
    }

#ifdef HAVE_OPENCL
    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
    {
        // Drop the cached OpenCL primitive; it is rebuilt lazily on the next
        // forward_ocl() call with the (possibly changed) input geometry.
        lrnOp.release();
    }

    // OpenCL forward path via ocl4dnn; returns false to fall back to CPU.
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        bool use_half = (inps.depth() == CV_16S);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        if (lrnOp.empty())
        {
            OCL4DNNLRNConfig config;
            config.lrn_type = type == CHANNEL_NRM ?
                              LRNParameter_NormRegion_ACROSS_CHANNELS :
                              LRNParameter_NormRegion_WITHIN_CHANNEL;

            CHECK_EQ(size % 2, 1)<< "LRN only supports odd values for local_size";
            config.local_size = size;
            config.alpha = alpha;
            config.beta = beta;
            config.k = bias;
            CHECK_EQ(4, inputs[0].dims) << "Input must have 4 axes, "
                     << "corresponding to (num, channels, height, width)";
            config.batch_size = inputs[0].size[0];
            config.channels = inputs[0].size[1];
            config.height = inputs[0].size[2];
            config.width = inputs[0].size[3];
            config.norm_by_size = normBySize;
            config.use_half = use_half;

            lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
        }

        if (!lrnOp->Forward(inputs[0], outputs[0]))
            return false;

        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_Assert(inputs_arr.total() == outputs_arr.total());

        // Try the OpenCL path first when the preferable target is an OCL one.
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        // FP16 tensors go through the generic fallback, which converts to FP32.
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(inputs.size() == outputs.size());

        for (int i = 0; i < inputs.size(); i++)
        {
            CV_Assert(inputs[i].dims == 4);

            Mat &src = inputs[i];
            Mat &dst = outputs[i];

            switch (type)
            {
                case CHANNEL_NRM:
                    channelNormalization(src, dst);
                    break;
                case SPATIAL_NRM:
                    spatialNormalization(src, dst);
                    break;
                default:
                    CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer");
                    break;
            }
        }
    }

    // Parallel body computing across-channel LRN. The (N x H*W) pixel space is
    // split into stripes; for each pixel the running sum of squares over the
    // channel window is updated incrementally as the window slides.
    class ChannelLRN : public ParallelLoopBody
    {
    public:
        ChannelLRN(const float* src, float* dst, int channels, int ksize,
                   float alpha1, float bias1, float beta1,
                   size_t planeSize, int nsamples, int nstripes)
        {
            src_ = src; dst_ = dst;
            channels_ = channels;
            ksize_ = ksize;
            alpha1_ = alpha1; bias1_ = bias1; beta1_ = beta1;
            planeSize_ = planeSize; nsamples_ = nsamples; nstripes_ = nstripes;
        }

        void operator()(const Range& r) const CV_OVERRIDE
        {
            int nsamples = nsamples_, nstripes = nstripes_;
            size_t planeSize = planeSize_, planeSize_n = planeSize * nsamples;
            size_t elemsPerStripe = (planeSize_n + nstripes - 1)/nstripes;
            size_t rstart = r.start*elemsPerStripe;
            size_t rend = r.end == nstripes ? planeSize_n : r.end*elemsPerStripe;
            rstart = std::min(rstart, planeSize_n);
            rend = std::min(rend, planeSize_n);
            float alpha1 = alpha1_, bias1 = bias1_, beta1 = beta1_;
            int k, channels = channels_, ksize = ksize_;

            // buf holds one pixel's channel values with ksize+1 zero elements
            // of padding on each side, so the sliding window never reads out
            // of range; acc holds the per-channel normalization factors.
            AutoBuffer<float> buf_((channels + ksize + 1)*2);
            float* acc = buf_.data();
            float* buf = acc + channels + ksize + 1;
            for( k = 0; k <= ksize; k++ )
                buf[-k-1] = buf[channels + k] = 0.f;

            for( size_t ofs = rstart; ofs < rend; )
            {
                int sampleIdx = (int)(ofs/planeSize);
                if( sampleIdx >= nsamples )
                    break;
                size_t ofs0 = ofs - sampleIdx*planeSize;
                size_t ofs1 = std::min(planeSize - ofs0, rend - ofs) + ofs;
                const float* src = src_ + sampleIdx*planeSize*channels + ofs0;
                float* dst = dst_ + sampleIdx*planeSize*channels + ofs0;

                for( ; ofs < ofs1; ofs++, src++, dst++ )
                {
                    // Gather this pixel's values across all channels.
                    for( k = 0; k < channels; k++ )
                        buf[k] = src[k*planeSize];
                    // Initial sum of squares over the leading half-window.
                    float s = 0;
                    for( k = 0; k < ksize; k++ )
                        s += buf[k]*buf[k];
                    for( k = 0; k < channels; k++ )
                    {
                        // Slide the window by one channel:
                        // add x1^2 (entering) and subtract x0^2 (leaving),
                        // written as (x1+x0)*(x1-x0); clamp at 0 to absorb
                        // floating-point cancellation error.
                        float x1 = buf[k + ksize];
                        float x0 = buf[k - ksize - 1];
                        s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
                        acc[k] = (float)(alpha1*s + bias1);
                    }

                    // pow(acc, beta1) computed as exp(beta1 * log(acc));
                    // beta1 is passed in as -beta, so this yields the divisor's
                    // reciprocal and a multiply suffices below.
                    hal::log32f(acc, acc, channels);
                    for( k = 0; k < channels; k++ )
                        acc[k] *= beta1;
                    hal::exp32f(acc, acc, channels);

                    for( k = 0; k < channels; k++ )
                        dst[k*planeSize] = buf[k]*acc[k];
                }
            }
        }

        const float* src_;
        float* dst_;
        float alpha1_, bias1_, beta1_;
        size_t planeSize_;
        int channels_, ksize_, nsamples_, nstripes_;
    };

    void channelNormalization(Mat &srcBlob, Mat &dstBlob)
    {
        int num = srcBlob.size[0];
        int channels = srcBlob.size[1];
        int ksize = (size - 1) / 2;
        int sizeNormFactor = normBySize ? size : 1;
        size_t planeSize = srcBlob.size[2]*srcBlob.size[3];

        int nstripes = std::max(getNumThreads(), 1);

        // Note: beta is negated so the parallel body can multiply by
        // pow(..., -beta) instead of dividing.
        ChannelLRN clrn(srcBlob.ptr<float>(), dstBlob.ptr<float>(), channels,
                        ksize, alpha/sizeNormFactor, bias, -beta, planeSize, num, nstripes);
        parallel_for_(Range(0, nstripes), clrn, nstripes);
    }

    // Box filter of squared values over a size x size window, zero-padded at
    // the borders; building block of the within-channel mode.
    void sqrBoxFilter_(const Mat &src, Mat &dst)
    {
        Mat srcRawWrapper(src.rows, src.cols, src.type(), src.data, src.step[0]);
        cv::sqrBoxFilter(srcRawWrapper, dst, dst.depth(), Size(size, size), Point(-1, -1), false, BORDER_CONSTANT);
    }

    void spatialNormalization(Mat &srcBlob, Mat &dstBlob)
    {
        int num = srcBlob.size[0];
        int channels = srcBlob.size[1];
        int sizeNormFactor = normBySize ? size*size : 1;

        Mat srcMat = srcBlob;
        Mat dstMat = dstBlob;

        // dst = src / (bias + alpha/normFactor * boxsum(src^2))^beta,
        // computed plane by plane.
        for (int n = 0; n < num; n++)
        {
            for (int cn = 0; cn < channels; cn++)
            {
                Mat src = getPlane(srcMat, n, cn);
                Mat dst = getPlane(dstMat, n, cn);

                sqrBoxFilter_(src, dst);

                dst.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
                cv::pow(dst, beta, dst);
                cv::divide(src, dst, dst);
            }
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        cuda4dnn::LRNType type_;
        if (type == CHANNEL_NRM)
            type_ = cuda4dnn::LRNType::ACROSS_CHANNELS;
        else if (type == SPATIAL_NRM)
            type_ = cuda4dnn::LRNType::WITHIN_CHANNEL;
        else
            CV_Error(Error::StsNotImplemented, "Unknown normalization region");

        // cuDNN always normalizes by the window size, so when norm_by_size is
        // off the scaling is folded into alpha to compensate.
        float alphaSize = alpha;
        if (!normBySize) {
            switch (type) {
                case CHANNEL_NRM: alphaSize = alpha * size; break;
                case SPATIAL_NRM: alphaSize = alpha * size * size; break;
            }
        }

        // The CUDA primitive sizes its workspace for the largest input.
        std::size_t largestInputSize = 0;
        for(auto& wrapper : inputs) {
            auto input_wrapper = wrapper.dynamicCast<CUDABackendWrapper>();
            auto shape = input_wrapper->getShape();
            largestInputSize = std::max<std::size_t>(
                largestInputSize,
                std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<int>())
            );
        }

        return make_cuda_node<cuda4dnn::LRNOp>(preferableTarget,
            std::move(context->cudnn_handle), type_, size, alphaSize, beta, bias, largestInputSize);
    }
#endif

    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_VULKAN
        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpLRN(size / 2, bias, alpha, beta, normBySize));
        return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
#endif
        return Ptr<BackendNode>();
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        // Halide formulation: square the input, zero-pad the borders, sum
        // over the window with an RDom, then divide by pow(base, beta).
        float alphaSize = alpha;
        if (normBySize)
            alphaSize /= (type == CHANNEL_NRM ? size : size * size);
        int width, height, channels, numImgs;
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        getCanonicalSize(inputBuffer, &width, &height, &channels, &numImgs);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Func padded_sq(name + "_padded_sq");
        Halide::Func sq("sq");
        sq(x, y, c, n) = inputBuffer(x, y, c, n) * inputBuffer(x, y, c, n);

        Halide::Func bounded =
            Halide::BoundaryConditions::constant_exterior(sq, 0, 0, width,
                                                          0, height,
                                                          0, channels,
                                                          0, numImgs);
        padded_sq(x, y, c, n) = bounded(x, y, c, n);

        Halide::Expr base;
        if (type == CHANNEL_NRM)
        {
            Halide::RDom r((1 - size) / 2, size);
            base = alphaSize * sum(padded_sq(x, y, c + r, n));
        }
        else // SPATIAL_NRM
        {
            Halide::RDom r((1 - size) / 2, size, (1 - size) / 2, size);
            base = alphaSize * sum(padded_sq(x + r.x, y + r.y, c, n));
        }
        base += static_cast<float>(bias);
        top(x, y, c, n) = inputBuffer(x, y, c, n) / pow(base, beta);
        return Ptr<BackendNode>(new HalideBackendNode({ padded_sq, top }));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
                                      const std::vector<Mat> &outputs,
                                      int targetId) const CV_OVERRIDE
    {
#ifdef  HAVE_HALIDE
        if (targetId != DNN_TARGET_CPU)
        {
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
            return;
        }
        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

        Halide::Var x("x"), y("y"), c("c"), n("n"), yo("yo"), yi("yi"), tile("tile");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs[1];
        Halide::Func& padded_sq = node.dynamicCast<HalideBackendNode>()->funcs[0];

        // Tiny outputs are not worth scheduling; default order is fine.
        if (outW < 8 || outH <= 2)
            return;

        top.reorder(x, c, y, n)
           .split(y, yo, yi, 2)
           .fuse(yo, n, tile)
           .parallel(tile)
           .unroll(yi)
           .vectorize(x, 8);
        padded_sq.store_at(top, tile)
                 .compute_at(top, yi);
#endif  // HAVE_HALIDE
    }

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        // IE normalizes by size internally; fold the factor into alpha when
        // norm_by_size is disabled (mirrors the CUDA path above).
        float alphaSize = alpha;
        if (!normBySize)
            alphaSize *= (type == SPATIAL_NRM ? size*size : size);

        InferenceEngine::Builder::NormLayer ieLayer(name);
        ieLayer.setSize(size);
        ieLayer.setAlpha(alphaSize);
        ieLayer.setBeta(beta);
        ieLayer.setAcrossMaps(type == CHANNEL_NRM);

        InferenceEngine::Builder::Layer l = ieLayer;
        l.getParameters()["k"] = bias;
        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        float alphaSize = alpha;
        if (!normBySize)
            alphaSize *= (type == SPATIAL_NRM ? size*size : size);

        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        auto lrn = std::make_shared<ngraph::op::LRN>(ieInpNode, (double)alphaSize, (double)beta, (double)bias, (size_t)size);
        return Ptr<BackendNode>(new InfEngineNgraphNode(lrn));
    }
#endif  // HAVE_DNN_NGRAPH

    // Rough per-input FLOP estimate for profiling/scheduling purposes.
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        CV_Assert(inputs.size() > 0);
        long flops = 0;

        for(int i = 0; i < inputs.size(); i++)
        {
            if (type == CHANNEL_NRM)
            {
                int channels = inputs[i][1];
                int ksize = (size - 1) / 2;

                flops += inputs[i][0]*(std::min(ksize, channels)*2*total(inputs[i], 2) + channels*4*total(inputs[i], 2));

                if (ksize < channels)
                {
                    flops += (size + 2*(channels - size))*total(inputs[i], 2);
                }
            }
            else
            {
                flops += total(inputs[i])*(2*size*size + 2);
            }
        }
        return flops;
    }

private:
    enum Type
    {
        CHANNEL_NRM,   // normalize across channels
        SPATIAL_NRM    // normalize within a channel's spatial window
    };
};
/** Factory method: constructs the LRN layer implementation from layer parameters. */
Ptr<LRNLayer> LRNLayer::create(const LayerParams& params)
{
    Ptr<LRNLayer> layer = makePtr<LRNLayerImpl>(params);
    return layer;
}
}
}

View File

@@ -0,0 +1,196 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2016, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
/*
Implementation of Batch Normalization layer.
*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/max_unpooling.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Max-unpooling layer: scatters the pooled values back into a larger output
// tensor at the positions recorded by the matching max-pooling layer
// (inputs: pooled data, pooled indices, and optionally a shape reference).
// All non-maximum output positions are set to zero.
class MaxUnpoolLayerImpl CV_FINAL : public MaxUnpoolLayer
{
public:
    // Reads the originating pooling geometry (kernel/pad/stride) from the
    // layer parameters; all six values are required.
    MaxUnpoolLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        poolKernel = Size(params.get<int>("pool_k_w"), params.get<int>("pool_k_h"));
        poolPad = Size(params.get<int>("pool_pad_w"), params.get<int>("pool_pad_h"));
        poolStride = Size(params.get<int>("pool_stride_w"), params.get<int>("pool_stride_h"));
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        // Halide path cannot express padded unpooling, hence the pad check.
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && !poolPad.width && !poolPad.height);
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 2 || inputs.size() == 3);
        CV_Assert(total(inputs[0]) == total(inputs[1]));

        MatShape outShape;
        if (inputs.size() == 2)
        {
            // Invert the pooling output-size formula to recover the
            // pre-pooling spatial dimensions.
            outShape = inputs[0];
            outShape[2] = (outShape[2] - 1) * poolStride.height + poolKernel.height - 2 * poolPad.height;
            outShape[3] = (outShape[3] - 1) * poolStride.width + poolKernel.width - 2 * poolPad.width;
        }
        else
            // Third input provides the exact target shape.
            outShape = inputs[2];

        outputs.clear();
        outputs.push_back(outShape);

        return false;
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        // FP16 tensors go through the generic fallback (converted to FP32).
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(inputs.size() == 2 || inputs.size() == 3);
        Mat& input = inputs[0];
        Mat& indices = inputs[1];

        CV_Assert(input.total() == indices.total());
        CV_Assert(input.size[0] == 1);  // CPU path supports batch size 1 only
        CV_Assert(input.isContinuous());

        for(int i_n = 0; i_n < outputs.size(); i_n++)
        {
            Mat& outBlob = outputs[i_n];
            // Zero everything first; only max positions receive values.
            outBlob.setTo(0);
            CV_Assert(input.size[1] == outBlob.size[1]);
            int outPlaneTotal = outBlob.size[2]*outBlob.size[3];

            for (int i_c = 0; i_c < input.size[1]; i_c++)
            {
                Mat outPlane = getPlane(outBlob, 0, i_c);
                int wh_area = input.size[2]*input.size[3];
                const float* inptr = input.ptr<float>(0, i_c);
                // Indices are stored as floats (flat offsets within a plane)
                // and truncated to int below.
                const float* idxptr = indices.ptr<float>(0, i_c);
                float* outptr = outPlane.ptr<float>();

                for(int i_wh = 0; i_wh < wh_area; i_wh++)
                {
                    int index = idxptr[i_wh];
                    if (!(0 <= index && index < outPlaneTotal))
                    {
                        // Dump full diagnostics before failing: out-of-range
                        // indices here usually indicate a mismatched
                        // pooling/unpooling pair in the imported model.
                        std::cerr
                            << "i_n=" << i_n << std::endl
                            << "i_c=" << i_c << std::endl
                            << "i_wh=" << i_wh << std::endl
                            << "index=" << index << std::endl
                            << "maxval=" << inptr[i_wh] << std::endl
                            << "outPlaneTotal=" << outPlaneTotal << std::endl
                            << "input.size=" << input.size << std::endl
                            << "indices.size=" << indices.size << std::endl
                            << "outBlob=" << outBlob.size << std::endl
                            ;
                        CV_Assert(0 <= index && index < outPlaneTotal);
                    }
                    outptr[index] = inptr[i_wh];
                }
            }
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        cuda4dnn::MaxUnpoolingConfiguration config;
        auto& window_size = config.window_size;
        window_size.resize(2);
        window_size[0] = poolKernel.height;
        window_size[1] = poolKernel.width;

        auto& strides = config.strides;
        strides.resize(2);
        strides[0] = poolStride.height;
        strides[1] = poolStride.width;

        auto& pads_begin = config.pads_begin;
        pads_begin.resize(2);
        pads_begin[0] = poolPad.height;
        pads_begin[1] = poolPad.width;

        return make_cuda_node<cuda4dnn::MaxUnpoolingOp>(preferableTarget, std::move(context->stream), config);
    }
#endif

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        // Meaningless operation if false because if kernel > stride
        // it is not deterministic and if kernel < stride we just
        // skip a part of input data (you'd better change your model).
        if (poolKernel.width != poolStride.width ||
            poolKernel.height != poolStride.height)
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for maximum unpooling "
                     "is not support cases when kernel != stride");

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Buffer<float> inputBuffer = halideBuffer(input[0]);
        Halide::Buffer<float> indices = halideBuffer(input[1]);

        // Each output pixel checks whether its flat index matches the stored
        // argmax of its pooling cell; if so it takes the pooled value, else 0.
        Halide::Expr pooledX = x / poolKernel.width;
        Halide::Expr pooledY = y / poolKernel.height;

        const int outW = inputBuffer.width() * poolKernel.width;
        top(x, y, c, n) = select(y * outW + x == indices(pooledX, pooledY, c, n),
                                 inputBuffer(pooledX, pooledY, c, n), 0.0f);
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }
};
/** Factory method: constructs the max-unpooling layer implementation from layer parameters. */
Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params)
{
    Ptr<MaxUnpoolLayer> layer = makePtr<MaxUnpoolLayerImpl>(params);
    return layer;
}
}
}

View File

@@ -0,0 +1,416 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/math_functions.hpp"
#include "opencl_kernels_dnn.hpp"
#endif
namespace cv
{
namespace dnn
{
// Mean-Variance Normalization (MVN) layer.
// The input is logically reshaped to rows by flattening leading dimensions
// (dims [0, 1) when acrossChannels, else [0, 2)); every row then gets its
// mean subtracted and, when normVariance is set, is divided by
// sqrt(variance + eps). A following scale/shift layer (batch-norm style)
// and a ReLU may be fused into this layer via setActivation().
class MVNLayerImpl CV_FINAL : public MVNLayer
{
public:
    MVNLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        normVariance = params.get<bool>("normalize_variance", true);
        acrossChannels = params.get<bool>("across_channels", false);
        eps = params.get<double>("eps", 1e-9);
        // Fusion flags start cleared; they are set later by setActivation().
        fuse_batch_norm = false;
        fuse_relu = false;
        relu_slope = 0.f;
        zeroDev = false;
    }

    // Per-channel scale/shift taken from a fused scale/batch-norm layer.
    Mat scale, shift;
#ifdef HAVE_OPENCL
    // Device-side copies of scale/shift, uploaded lazily in forward_ocl.
    UMat umat_scale, umat_shift;
#endif
    bool fuse_batch_norm;

    Ptr<ReLULayer> activ_relu;
    float relu_slope;
    bool fuse_relu;
    bool zeroDev; // TODO: Doesn't considered in Intel's Inference Engine backend.

    // Attempts to fuse the next layer: first a scale/shift producer
    // (any layer reporting non-empty getScaleShift), then — only on the
    // OpenCL target — a ReLU. Returns true if the layer was fused.
    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (!layer.empty() && !fuse_relu && !fuse_batch_norm)
        {
            layer->getScaleShift(scale, shift);
            fuse_batch_norm = !scale.empty() || !shift.empty();
            return fuse_batch_norm;
        }
        if (!layer.empty() && preferableTarget == DNN_TARGET_OPENCL)
        {
            activ_relu = layer.dynamicCast<ReLULayer>();
            if( !activ_relu.empty() )
                relu_slope = activ_relu->negativeSlope;
        }
        fuse_relu = !activ_relu.empty();
        return fuse_relu;
    }

    // Detects the degenerate case where every "row" holds a single value
    // (standard deviation would be zero) and drops stale OpenCL buffers.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);
        int splitDim = (acrossChannels) ? 1 : 2;
        int i, newRows = 1;
        for( i = 0; i < splitDim; i++ )
            newRows *= inputs[0].size[i];
        zeroDev = inputs[0].total() == newRows;
#ifdef HAVE_OPENCL
        umat_scale.release();
        umat_shift.release();
#endif
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
#ifdef HAVE_INF_ENGINE
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
            // zeroDev rows and large eps on MYRIAD are not handled by IE.
            return !zeroDev && (preferableTarget != DNN_TARGET_MYRIAD || eps <= 1e-7f);
        else
#endif  // HAVE_INF_ENGINE
            return backendId == DNN_BACKEND_OPENCV;
    }

#ifdef HAVE_OPENCL
    // Vectorized OpenCL path (two fused kernels) used when normVariance is
    // set and both the row count and the row length are multiples of 4.
    bool fast_forward_ocl(std::vector<UMat> &inputs, std::vector<UMat> &outputs)
    {
        // Lazily upload the fused scale/shift to the device.
        if (umat_scale.empty() && !scale.empty())
            scale.copyTo(umat_scale);
        if (umat_shift.empty() && !shift.empty())
            shift.copyTo(umat_shift);
        UMat& bnorm_weight = umat_scale;
        UMat& bnorm_bias = umat_shift;

        const unsigned LOCAL_SIZE = 128;
        bool use_half = (inputs[0].depth() == CV_16S);
        String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float",
                             use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4",
                             LOCAL_SIZE
                             );

        int splitDim = (acrossChannels) ? 1 : 2;
        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            UMat &inpMat = inputs[inpIdx];
            UMat &outMat = outputs[inpIdx];
            int newRows = total(shape(inpMat), 0, splitDim);
            CV_Assert(newRows != 0);

            MatShape s = shape(newRows, inpMat.total() / newRows);
            UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
            UMat tmpMat = UMat(s[0], s[1], CV_32F);
            float alpha = 1.0f / s[1];

            // Kernel 1: per-row mean (4 rows per work-group).
            String buildopt = "-DNUM=4" + opts;
            ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN_FUSE");
            size_t localsize[] = { LOCAL_SIZE };
            size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };

            int argId = 0;
            k.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
            k.set(argId++, (int)s[1]);
            k.set(argId++, alpha);
            k.set(argId++, ocl::KernelArg::PtrWriteOnly(meanMat));
            k.set(argId++, ocl::KernelArg::PtrWriteOnly(tmpMat));
            bool ret = k.run(1, globalsize, localsize, false);
            if (!ret)
                return false;

            // Kernel 2: normalization fused with optional scale/shift + ReLU.
            buildopt += format(" %s %s", (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
                               (fuse_relu) ? "-DFUSE_RELU" : "");
            ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MVN_FUSE");
            argId = 0;
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(tmpMat));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(meanMat));
            k1.set(argId++, (int)s[1]);
            k1.set(argId++, (float)alpha);
            k1.set(argId++, (float)eps);
            k1.set(argId++, (float)relu_slope);
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
            ret = k1.run(1, globalsize, localsize, false);
            if (!ret)
                return false;
        }
        return true;
    }

    // General OpenCL path: per-row mean (and variance) via GEMV against a
    // ones-vector, then one normalization kernel. FP32 only.
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        if (umat_scale.empty() && !scale.empty())
            scale.copyTo(umat_scale);
        if (umat_shift.empty() && !shift.empty())
            shift.copyTo(umat_shift);
        UMat& bnorm_weight = umat_scale;
        UMat& bnorm_bias = umat_shift;

        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        int splitDim = (acrossChannels) ? 1 : 2;
        int row_size = total(shape(inputs[0]), 0, splitDim);
        int plane_size = total(shape(inputs[0]), splitDim);
        // Prefer the fully fused fast path when the shape allows it.
        if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
            return fast_forward_ocl(inputs, outputs);

        // FP16 inputs are only handled by the fast path above.
        if (inputs[0].depth() == CV_16S)
            return false;

        String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");

        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            UMat &inpMat = inputs[inpIdx];
            UMat &outMat = outputs[inpIdx];
            int newRows = total(shape(inpMat), 0, splitDim);
            CV_Assert(newRows != 0);

            MatShape s = shape(newRows, inpMat.total() / newRows);
            UMat oneMat = UMat::ones(s[1], 1, CV_32F);
            UMat meanMat = UMat(s[0], 1, CV_32F);
            UMat devMat = UMat(s[0], 1, CV_32F);
            UMat tmpMat = UMat(s[0], s[1], CV_32F);
            float alpha = 1.0f / s[1];

            // Row means: (1/N) * inpMat * ones.
            bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, s[0], s[1], alpha,
                                                   inpMat, 0, oneMat, 0, 0.0f, meanMat, 0);
            if (!ret)
                return false;

            // Vector width chosen from the row length's divisibility.
            int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
            size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
            String buildopt = format("-DNUM=%d", number) + opts;
            if (normVariance)
            {
                // tmpMat <- squared deviations, then row-reduce to devMat.
                String kname = format("calc_mean%d", number);
                ocl::Kernel kernel(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN");
                if (kernel.empty())
                    return false;

                kernel.set(0, ocl::KernelArg::PtrReadOnly(inpMat));
                kernel.set(1, (int)s[0]);
                kernel.set(2, (int)s[1]);
                kernel.set(3, ocl::KernelArg::PtrReadOnly(meanMat));
                kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmpMat));
                ret = kernel.run(2, global, NULL, false);
                if (!ret)
                    return false;

                ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, s[0], s[1], alpha,
                                                  tmpMat, 0, oneMat, 0, 0.0f, devMat, 0);
                if (!ret)
                    return false;
            }

            // Final normalization (optionally fused with scale/shift + ReLU).
            String kname = format("mvn%d", number);
            buildopt += format("%s%s%s -DKERNEL_MVN", (normVariance) ? " -DNORM_VARIANCE" : "",
                               (fuse_batch_norm) ? " -DFUSE_BATCH_NORM" : "",
                               (fuse_relu) ? " -DFUSE_RELU" : "");
            ocl::Kernel kernel1(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt);
            if (kernel1.empty())
                return false;
            kernel1.set(0, ocl::KernelArg::PtrReadOnly(inpMat));
            kernel1.set(1, (int)s[0]);
            kernel1.set(2, (int)s[1]);
            kernel1.set(3, (float)eps);
            kernel1.set(4, ocl::KernelArg::PtrReadOnly(meanMat));
            kernel1.set(5, ocl::KernelArg::PtrReadOnly(devMat));
            kernel1.set(6, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            kernel1.set(7, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            kernel1.set(8, (int)inpMat.size[1]);
            kernel1.set(9, (float)relu_slope);
            kernel1.set(10, ocl::KernelArg::PtrWriteOnly(outMat));
            ret = kernel1.run(2, global, NULL, false);
            if (!ret)
                return false;
        }
        return true;
    }
#endif

    // CPU forward (with OpenCL dispatch). FP16 inputs fall back to the
    // generic convert-run-convert path.
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
            Mat &inpBlob = inputs[inpIdx];
            Mat &outBlob = outputs[inpIdx];

            int splitDim = (acrossChannels) ? 1 : 2;
            int i, newRows = 1;
            for( i = 0; i < splitDim; i++ )
                newRows *= inpBlob.size[i];
            Mat inpMat = inpBlob.reshape(1, newRows);
            Mat outMat = outBlob.reshape(1, newRows);

            if ( inpBlob.total() == newRows )
            {
                // MVN is applied to single values at an every row.
                if (shift.empty())
                {
                    outBlob.setTo(0);
                }
                else
                {
                    for ( i = 0; i < newRows; i++ )
                    {
                        outMat.row(i).setTo(((float*)shift.data)[i]);
                    }
                }
                // NOTE(review): this returns from forward() and skips any
                // remaining inputs; 'continue' looks intended — confirm.
                return;
            }

            Scalar mean, dev;
            for ( i = 0; i < newRows; i++)
            {
                Mat inpRow = inpMat.row(i);
                Mat outRow = outMat.row(i);
                // Fused per-row scale/shift (identity when out of range).
                float weight = 1.f;
                float bias = 0.f;
                if (fuse_batch_norm)
                {
                    weight = i < scale.cols ? ((float*)scale.data)[i] : weight;
                    bias = i < shift.cols ? ((float*)shift.data)[i] : bias;
                }
                cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray());
                double alpha = 1;
                if (normVariance)
                {
                    alpha = 1 / std::sqrt(eps + dev[0]*dev[0]);
                }
                // Fold mean subtraction, normalization and the fused affine
                // transform into a single convertTo (y = a*x + b).
                double normalizationScale = 1.0;
                double normalizationShift = 0.0;
                if (fuse_batch_norm)
                {
                    normalizationScale = alpha * weight;
                    normalizationShift = -mean[0] * normalizationScale + bias;
                }
                else
                {
                    normalizationScale = alpha;
                    normalizationShift = -mean[0] * alpha;
                }
                inpRow.convertTo(outRow, outRow.type(), normalizationScale, normalizationShift);
            }
        }
    }

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::MVNLayer ieLayer(name);
        ieLayer.setAcrossChannels(acrossChannels);
        ieLayer.setNormalize(normVariance);
        ieLayer.setEpsilon(eps);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        auto mvn = std::make_shared<ngraph::op::MVN>(ieInpNode, acrossChannels, normVariance, eps);
        return Ptr<BackendNode>(new InfEngineNgraphNode(mvn));
    }
#endif  // HAVE_DNN_NGRAPH

    // Rough cost model: ~6 ops per element plus 3 per reduced row element.
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        long flops = 0;
        for(int i = 0; i < inputs.size(); i++)
        {
            flops += 6*total(inputs[i]) + 3*total(inputs[i], 0, normVariance ? 2 : 1);
        }
        return flops;
    }
};
// Factory: builds the concrete implementation and hands ownership to a
// reference-counted cv::Ptr.
Ptr<MVNLayer> MVNLayer::create(const LayerParams& params)
{
    Ptr<MVNLayer> layer(new MVNLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,396 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/normalize_bbox.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
// Lp-normalization layer (SSD's "Normalize" / NormalizeBBox).
// Divides the input by its Lp norm computed over the axis range
// [startAxis, endAxis], optionally multiplying the result by learned
// per-channel (or single shared) scale weights stored in blobs[0].
class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer
{
public:
    NormalizeBBoxLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        pnorm = params.get<float>("p", 2);
        epsilon = params.get<float>("eps", 1e-10f);
        acrossSpatial = params.get<bool>("across_spatial", true);
        startAxis = params.get<int>("start_axis", 1);
        // "across_spatial" and an explicit "end_axis" are mutually exclusive.
        CV_Assert(!params.has("across_spatial") || !params.has("end_axis"));
        endAxis = params.get<int>("end_axis", acrossSpatial ? -1 : startAxis);
        CV_Assert(pnorm > 0);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        {
            // IE backends only implement the L2 case.
            if (pnorm != 2)
                return false;

            return preferableTarget == DNN_TARGET_MYRIAD ? !acrossSpatial : startAxis == 1;
        }
        return backendId == DNN_BACKEND_OPENCV ||
               (backendId == DNN_BACKEND_CUDA && (pnorm == 1 || pnorm == 2));
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1);
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        // One scratch buffer shaped like a single sample of the input.
        internals.resize(1, inputs[0]);
        internals[0][0] = 1;  // Batch size.
        return true;
    }

    // Resolves the -1 axis sentinels against the actual input rank and
    // recomputes acrossSpatial accordingly.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);
        CV_Assert(inputs.size() == 1);
        endAxis = endAxis == -1 ? (inputs[0].dims - 1) : endAxis;
        startAxis = startAxis == -1 ? (inputs[0].dims - 1) : startAxis;
        acrossSpatial = (startAxis == 1 && endAxis == inputs[0].dims - 1);
    }

#ifdef HAVE_OPENCL
    // OpenCL path built from cv:: array ops on UMats (no custom kernels).
    // NOTE(review): `norm` and `scale` below are CPU Mats mixed into the
    // UMat pipeline — presumably relying on transparent UMat/Mat interop;
    // confirm this is intentional.
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

        if (inputs_.depth() == CV_16S)
            return false;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);
        internals_.getUMatVector(internals);

        CV_Assert(inputs.size() == 1 && outputs.size() == 1);
        CV_Assert(inputs[0].total() == outputs[0].total());

        const UMat& inp0 = inputs[0];
        UMat& buffer = internals[0];
        startAxis = clamp(startAxis, inp0.dims);
        endAxis = clamp(endAxis, inp0.dims);

        // Decompose the blob as num x numPlanes x planeSize around the
        // normalized axis range.
        size_t num = total(shape(inp0.size), 0, startAxis);
        size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1);
        size_t planeSize = inp0.total() / (num * numPlanes);
        MatShape s = shape(1, inputs[0].total());
        UMat inp = inputs[0].reshape(1, s.size(), &s[0]).reshape(1, num);
        UMat out = outputs[0].reshape(1, s.size(), &s[0]).reshape(1, num);
        for (size_t i = 0; i < num; ++i)
        {
            s = shape(numPlanes, planeSize);
            UMat src = inp.row(i).reshape(1, s.size(), &s[0]);
            UMat dst = out.row(i).reshape(1, s.size(), &s[0]);

            UMat abs_mat;
            absdiff(src, cv::Scalar::all(0), abs_mat);
            pow(abs_mat, pnorm, buffer);

            if (planeSize == 1)
            {
                // add eps to avoid overflow
                float absSum = sum(buffer)[0] + epsilon;
                float norm = pow(absSum, 1.0f / pnorm);
                multiply(src, 1.0f / norm, dst);
            }
            else
            {
                Mat norm;
                reduce(buffer, norm, 0, REDUCE_SUM);
                norm += epsilon;

                // compute inverted norm to call multiply instead divide
                cv::pow(norm, -1.0f / pnorm, norm);

                repeat(norm, numPlanes, 1, buffer);
                multiply(src, buffer, dst);
            }

            if (!blobs.empty())
            {
                // scale the output
                Mat scale = blobs[0];
                if (scale.total() == 1)
                {
                    // _scale: 1 x 1
                    multiply(dst, scale.at<float>(0, 0), dst);
                }
                else
                {
                    // _scale: _channels x 1
                    CV_Assert(scale.total() == numPlanes);
                    repeat(scale, 1, dst.cols, buffer);
                    multiply(dst, buffer, dst);
                }
            }
        }
        return true;
    }
#endif

    // CPU forward: same decomposition as the OCL path, iterating samples
    // via raw float pointers advanced by numPlanes*planeSize.
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        CV_Assert(inputs.size() == 1 && outputs.size() == 1);
        CV_Assert(inputs[0].total() == outputs[0].total());

        const Mat& inp0 = inputs[0];
        Mat& buffer = internals[0];
        startAxis = clamp(startAxis, inp0.dims);
        endAxis = clamp(endAxis, inp0.dims);

        const float* inpData = inp0.ptr<float>();
        float* outData = outputs[0].ptr<float>();

        size_t num = total(shape(inp0.size), 0, startAxis);
        size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1);
        CV_Assert(num * numPlanes != 0);
        size_t planeSize = inp0.total() / (num * numPlanes);
        for (size_t n = 0; n < num; ++n)
        {
            // Views over the current sample (no copies).
            Mat src = Mat(numPlanes, planeSize, CV_32F, (void*)inpData);
            Mat dst = Mat(numPlanes, planeSize, CV_32F, (void*)outData);
            cv::pow(abs(src), pnorm, buffer);

            if (planeSize == 1)
            {
                // add eps to avoid overflow
                float absSum = sum(buffer)[0] + epsilon;
                float norm = pow(absSum, 1.0f / pnorm);
                multiply(src, 1.0f / norm, dst);
            }
            else
            {
                Mat norm;
                reduce(buffer, norm, 0, REDUCE_SUM);
                norm += epsilon;

                // compute inverted norm to call multiply instead divide
                cv::pow(norm, -1.0f / pnorm, norm);

                repeat(norm, numPlanes, 1, buffer);
                multiply(src, buffer, dst);
            }

            if (!blobs.empty())
            {
                // scale the output
                Mat scale = blobs[0];
                if (scale.total() == 1)
                {
                    // _scale: 1 x 1
                    dst *= scale.at<float>(0, 0);
                }
                else
                {
                    // _scale: _channels x 1
                    CV_Assert(scale.total() == numPlanes);
                    repeat(scale, 1, dst.cols, buffer);
                    multiply(dst, buffer, dst);
                }
            }
            inpData += numPlanes * planeSize;
            outData += numPlanes * planeSize;
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        if(pnorm != 1 && pnorm != 2)
            CV_Error(Error::StsNotImplemented, "Unsupported normalization mode");

        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
        auto input_shape = input_wrapper->getShape();

        NormalizeConfiguration<float> config;
        config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
        config.axis_start = clamp(startAxis, input_shape.size());
        config.axis_end = clamp(endAxis, input_shape.size()) + 1; /* +1 because NormalizeOp follows [start, end) convention */
        config.norm = pnorm;
        config.eps = epsilon;

        const auto& weightsMat = blobs.empty() ? Mat() : blobs[0];
        return make_cuda_node<cuda4dnn::NormalizeOp>(preferableTarget, std::move(context->stream), weightsMat, config);
    }
#endif

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
        std::vector<size_t> dims = input->getDims();
        if (dims.size() == 4)
        {
            // 4D inputs map onto IE's Normalize primitive.
            InferenceEngine::Builder::NormalizeLayer ieLayer(name);

            ieLayer.setChannelShared(false);
            ieLayer.setAcrossMaps(acrossSpatial);
            ieLayer.setEpsilon(epsilon);

            InferenceEngine::Builder::Layer l = ieLayer;
            const int numChannels = dims[1];
            InferenceEngine::Blob::Ptr weights;
            if (blobs.empty())
            {
                // No learned scale: supply all-ones weights.
                weights = InferenceEngine::make_shared_blob<float>({
                              InferenceEngine::Precision::FP32,
                              {(size_t)numChannels}, InferenceEngine::Layout::C
                          });
                weights->allocate();

                Mat weightsMat = infEngineBlobToMat(weights).reshape(1, numChannels);
                Mat(numChannels, 1, CV_32F, Scalar(1)).copyTo(weightsMat);
                l.getParameters()["channel_shared"] = false;
            }
            else
            {
                CV_Assert(numChannels == blobs[0].total());
                weights = wrapToInfEngineBlob(blobs[0], {(size_t)numChannels}, InferenceEngine::Layout::C);
                l.getParameters()["channel_shared"] = blobs[0].total() == 1;
            }
            addConstantData("weights", weights, l);
            l.getParameters()["across_spatial"] = acrossSpatial;
            return Ptr<BackendNode>(new InfEngineBackendNode(l));
        }
        else
        {
            // Non-4D inputs fall back to IE's GRN (global response norm).
            InferenceEngine::Builder::GRNLayer ieLayer(name);
            ieLayer.setBeta(epsilon);

            InferenceEngine::Builder::Layer l = ieLayer;
            l.getParameters()["bias"] = epsilon;

            return Ptr<BackendNode>(new InfEngineBackendNode(l));
        }
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        const size_t batch = ieInpNode->get_shape()[0];
        const size_t numChannels = ieInpNode->get_shape()[1];

        // Normalize over the channel axis only, or over all axes.
        std::vector<int64_t> axes_data;
        if (!acrossSpatial) {
            axes_data.push_back(1);
        } else {
            axes_data.resize(ieInpNode->get_shape().size());
            std::iota(axes_data.begin(), axes_data.end(), 0);
        }
        auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_data.size()}, axes_data);
        auto norm = std::make_shared<ngraph::op::NormalizeL2>(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD);

        CV_Assert(blobs.empty() || numChannels == blobs[0].total());
        std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
        shape[0] = blobs.empty() ? 1 : batch;
        shape[1] = numChannels;
        std::shared_ptr<ngraph::op::Constant> weight;
        if (blobs.empty())
        {
            std::vector<float> ones(numChannels, 1);
            weight = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), ones.data());
        }
        else
        {
            // weight->get_shape().size() > 1 ~> channel_shared = false
            weight = std::make_shared<ngraph::op::Constant>(
                         ngraph::element::f32, ngraph::Shape(shape), blobs[0].data);
        }
        auto mul = std::make_shared<ngraph::op::v1::Multiply>(norm, weight, ngraph::op::AutoBroadcastType::NUMPY);
        return Ptr<BackendNode>(new InfEngineNgraphNode(mul));
    }
#endif  // HAVE_DNN_NGRAPH

private:
    int startAxis, endAxis;
};
// Factory: builds the concrete implementation and hands ownership to a
// reference-counted cv::Ptr.
Ptr<NormalizeBBoxLayer> NormalizeBBoxLayer::create(const LayerParams &params)
{
    Ptr<NormalizeBBoxLayer> layer(new NormalizeBBoxLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,277 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
/*
Implementation of padding layer, which adds paddings to input blob.
*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <vector>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/padding.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class PaddingLayerImpl CV_FINAL : public PaddingLayer
{
public:
PaddingLayerImpl(const LayerParams &params)
{
setParamsFrom(params);
paddingValue = params.get<float>("value", 0);
inputDims = params.get<int>("input_dims", -1);
paddingType = params.get<String>("type", "constant");
CV_Assert(params.has("paddings"));
const DictValue& paddingsParam = params.get("paddings");
CV_Assert((paddingsParam.size() & 1) == 0);
paddings.resize(paddingsParam.size() / 2);
for (int i = 0; i < paddings.size(); ++i)
{
paddings[i].first = paddingsParam.get<int>(i * 2); // Pad before.
paddings[i].second = paddingsParam.get<int>(i * 2 + 1); // Pad after.
CV_Assert_N(paddings[i].first >= 0, paddings[i].second >= 0);
}
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
const MatShape& inpShape = inputs[0];
CV_Assert(inpShape.size() >= paddings.size());
CV_Assert(inputDims == -1 || inpShape.size() == inputDims || inpShape.size() > paddings.size());
outputs.resize(1, inpShape);
int offset = (inputDims == -1 ? 0 : (inpShape.size() > inputDims ? 1 : 0));
for (int i = 0; i < paddings.size(); ++i)
{
outputs[0][offset + i] = inpShape[offset + i] + paddings[i].first + paddings[i].second;
}
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
// Compute dstRanges.
const MatSize& inpShape = inputs[0].size;
if (inputDims != -1 && inputs[0].dims != inputDims)
{
paddings.insert(paddings.begin(), std::make_pair(0, 0));
}
dstRanges.resize(paddings.size());
for (int i = 0; i < paddings.size(); ++i)
{
dstRanges[i].start = paddings[i].first;
dstRanges[i].end = paddings[i].first + inpShape[i];
}
// Add the rest of dimensions.
for (int i = dstRanges.size(); i < inputs[0].dims; ++i)
{
dstRanges.push_back(Range::all());
paddings.push_back(std::make_pair(0, 0));
}
inputDims = -1; // Next time paddings are filled for all the dimensions.
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) &&
(preferableTarget != DNN_TARGET_MYRIAD ||
(dstRanges.size() == 4 && paddings[0].first == 0 && paddings[0].second == 0));
#endif
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && dstRanges.size() == 4);
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
if (paddingType == "constant")
{
if (inputs_arr.depth() == CV_16S)
{
std::vector<float> paddingValue_fp32(1, paddingValue);
std::vector<int16_t> paddingValue_fp16(1);
cv::convertFp16(paddingValue_fp32, paddingValue_fp16);
outputs[0].setTo(paddingValue_fp16[0]);
}
else
outputs[0].setTo(paddingValue);
inputs[0].copyTo(outputs[0](dstRanges));
}
else if (paddingType == "reflect")
{
CV_Assert(inputs.size() == 1);
CV_Assert(outputs.size() == 1);
CV_Assert(inputs[0].dims == 4);
CV_Assert(outputs[0].dims == 4);
if (inputs[0].size[0] != outputs[0].size[0] || inputs[0].size[1] != outputs[0].size[1])
CV_Error(Error::StsNotImplemented, "Only spatial reflection padding is supported.");
const int inpHeight = inputs[0].size[2];
const int inpWidth = inputs[0].size[3];
const int outHeight = outputs[0].size[2];
const int outWidth = outputs[0].size[3];
const int padTop = dstRanges[2].start;
const int padBottom = outHeight - dstRanges[2].end;
const int padLeft = dstRanges[3].start;
const int padRight = outWidth - dstRanges[3].end;
CV_CheckLT(padTop, inpHeight, ""); CV_CheckLT(padBottom, inpHeight, "");
CV_CheckLT(padLeft, inpWidth, ""); CV_CheckLT(padRight, inpWidth, "");
for (size_t n = 0; n < inputs[0].size[0]; ++n)
{
for (size_t ch = 0; ch < inputs[0].size[1]; ++ch)
{
copyMakeBorder(getPlane(inputs[0], n, ch),
getPlane(outputs[0], n, ch),
padTop, padBottom, padLeft, padRight,
BORDER_REFLECT_101);
}
}
}
else
CV_Error(Error::StsNotImplemented, "Unknown padding type: " + paddingType);
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
cuda4dnn::PaddingType ptype;
if (paddingType == "constant")
ptype = PaddingType::CONSTANT;
else if (paddingType == "reflect")
ptype = PaddingType::REFLECTION101;
else
CV_Error(Error::StsNotImplemented, "Unsupported padding mode");
return make_cuda_node<cuda4dnn::PaddingOp>(preferableTarget, std::move(context->stream), ptype, paddingValue, dstRanges);
}
#endif
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
int inW, inH, inC, inN;
int minN = std::max(dstRanges[0].start, 0);
int minC = std::max(dstRanges[1].start, 0);
int minY = std::max(dstRanges[2].start, 0);
int minX = std::max(dstRanges[3].start, 0);
Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::Func padded =
Halide::BoundaryConditions::constant_exterior(inputBuffer, paddingValue);
top(x, y, c, n) = padded(x - minX, y - minY, c - minC, n - minN);
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
#ifdef HAVE_INF_ENGINE
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::Layer ieLayer(name);
ieLayer.setName(name);
ieLayer.setType("Pad");
std::vector<int> begins(paddings.size(), 0), ends(paddings.size(), 0);
for (int i = 0; i < paddings.size(); ++i)
{
begins[i] = paddings[i].first;
ends[i] = paddings[i].second;
}
ieLayer.getParameters()["pads_begin"] = begins;
ieLayer.getParameters()["pads_end"] = ends;
ieLayer.getParameters()["pad_mode"] = paddingType;
if (paddingType == "constant")
ieLayer.getParameters()["pad_value"] = paddingValue;
ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif
#ifdef HAVE_DNN_NGRAPH
    // Builds an nGraph Pad node. For "constant" mode the fill value is passed
    // as a scalar Constant; any other mode is mapped to REFLECT (SYMMETRIC is
    // noted but not distinguished here).
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        std::vector<int64_t> begins(paddings.size(), 0), ends(paddings.size(), 0);
        // size_t index: paddings.size() is unsigned, avoids a sign-compare warning.
        for (size_t i = 0; i < paddings.size(); ++i)
        {
            begins[i] = static_cast<int64_t>(paddings[i].first);
            ends[i] = static_cast<int64_t>(paddings[i].second);
        }
        auto padding_below = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{begins.size()}, begins.data());
        auto padding_above = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{ends.size()}, ends.data());
        auto pad_mode = paddingType == "constant" ? ngraph::op::PadMode::CONSTANT : ngraph::op::PadMode::REFLECT; // SYMMETRIC
        auto arg_pad_value = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{}, &paddingValue);
        // Constant mode requires the 5-argument Pad overload with the fill value.
        auto pad = paddingType == "constant" ?
            std::make_shared<ngraph::op::v1::Pad>(ieInpNode, padding_below, padding_above, arg_pad_value, pad_mode) :
            std::make_shared<ngraph::op::v1::Pad>(ieInpNode, padding_below, padding_above, pad_mode);
        return Ptr<BackendNode>(new InfEngineNgraphNode(pad));
    }
#endif
private:
    std::vector<std::pair<int, int> > paddings;  // Per-axis (pad before, pad after) amounts.
    std::vector<Range> dstRanges;                // Destination ranges of the copied input inside the padded output.
    int inputDims;                               // Expected input rank; used to align `paddings` with actual dims.
    float paddingValue;                          // Fill value for "constant" padding mode.
    std::string paddingType;                     // Padding mode name, e.g. "constant" or "reflect".
};
// Factory: constructs the padding layer implementation from serialized params.
Ptr<PaddingLayer> PaddingLayer::create(const LayerParams &params)
{
    return makePtr<PaddingLayerImpl>(params);
}
}
}

View File

@@ -0,0 +1,450 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#include <float.h>
#include <algorithm>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/permute.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class PermuteLayerImpl CV_FINAL : public PermuteLayer
{
public:
void checkNeedForPermutation()
{
_needsPermute = false;
for (size_t i = 0; i < _numAxes; ++i)
{
if (_order[i] != i)
{
_needsPermute = true;
break;
}
}
}
PermuteLayerImpl(const LayerParams &params)
: _count(0), _needsPermute(false), _numAxes(0)
{
if (!params.has("order"))
{
return;
}
DictValue paramOrder = params.get("order");
_numAxes = paramOrder.size();
for (size_t i = 0; i < _numAxes; i++)
{
int currentOrder = paramOrder.get<int>(i);
if (currentOrder < 0 || currentOrder > _numAxes)
{
CV_Error(Error::StsBadArg,
format("Orders of dimensions in Permute layer parameter"
"must be in [0...%zu]", _numAxes - 1));
}
if (std::find(_order.begin(), _order.end(), currentOrder) != _order.end())
{
CV_Error(Error::StsBadArg,
"Permute layer parameter contains duplicated orders.");
}
_order.push_back(currentOrder);
}
setParamsFrom(params);
checkNeedForPermutation();
}
    // Reports which inference backends can execute this layer.
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine()) ||
               (backendId == DNN_BACKEND_VKCOM && haveVulkan());
    }
    // Computes output shapes: each output is the input shape with its axes
    // reordered by _order. Returns false (outputs do not alias inputs) unless
    // the permutation is the identity.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        if(!_needsPermute)
        {
            // Identity: defer to the base class (in-place pass-through shapes).
            Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
            return true;
        }

        CV_Assert(inputs.size() > 0);
        CV_Assert((int)_numAxes == inputs[0].size());

        MatShape shapeBefore = inputs[0], shapeAfter;
        for (size_t i = 0; i < _numAxes; i++)
        {
            // Output dim i takes the size of input dim _order[i].
            shapeAfter.push_back(shapeBefore[_order[i]]);
        }

        outputs.clear();

        for (size_t i = 0; i < inputs.size(); i++)
        {
            // A permutation never changes the total element count.
            CV_Assert(total(inputs[i]) == total(shapeAfter));
            outputs.push_back(shapeAfter);
        }

        return false;
    }
    // Precomputes row-major strides for the input and output shapes; used by
    // the generic (non-4D) permutation loop in forward(). Also caches the total
    // element count in _count.
    void computeStrides(const MatShape &shapeBefore, const MatShape &shapeAfter)
    {
        _oldStride.resize(_numAxes);
        _newStride.resize(_numAxes);

        // Innermost axis has stride 1; outer strides accumulate right-to-left.
        _oldStride[_numAxes - 1] = 1;
        _newStride[_numAxes - 1] = 1;

        for(int i = _numAxes - 2; i >= 0; i--)
        {
            _oldStride[i] = _oldStride[i + 1] * shapeBefore[i + 1];
            _newStride[i] = _newStride[i + 1] * shapeAfter[i + 1];
        }

        _count = _oldStride[0] * shapeBefore[0];
    }
    // Called once shapes are known: recomputes strides and (for OpenCL) uploads
    // the order/stride tables to device-side UMats used by the kernel.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        if(!_needsPermute)
        {
            return;  // Identity permutation: nothing to precompute.
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(inputs.size() > 0);
        const Mat& inp0 = inputs[0];
        CV_Assert((int)_numAxes == inp0.dims);

        computeStrides(shape(inputs[0]), shape(outputs[0]));

#ifdef HAVE_OPENCL
        // Upload lookup tables only once; they are shape-dependent and
        // finalize() reruns whenever shapes change.
        if (uorder.empty())
        {
            // Convert size_t vectors to int, since the OpenCL kernel reads int32 buffers.
            std::vector<int> orderVec(_order.begin(), _order.end());;
            Mat morder(1, orderVec.size(), CV_32SC1, &orderVec[0]);

            std::vector<int> oldStrideVec(_oldStride.begin(), _oldStride.end());
            Mat mold_stride(1, _oldStride.size(), CV_32SC1, &oldStrideVec[0]);

            std::vector<int> newStrideVec(_newStride.begin(), _newStride.end());
            Mat mnew_stride(1, newStrideVec.size(), CV_32SC1, &newStrideVec[0]);

            morder.copyTo(uorder);
            mold_stride.copyTo(uold_stride);
            mnew_stride.copyTo(unew_stride);
        }
#endif
    }
    // Parallel 4D permutation kernel. Each worker handles a contiguous stripe of
    // output "rows" (all but the innermost output axis) and gathers input
    // elements through the permuted strides. Only valid for numAxes == 4 and
    // CV_32F data (esz is hard-coded to sizeof(float)).
    class PermuteInvoker : public ParallelLoopBody
    {
    public:
        const Mat* inp;
        Mat* out;
        const std::vector<size_t>* order;  // Output-axis -> input-axis mapping.
        int nstripes;

        static void run(const Mat& inp, Mat& out, const std::vector<size_t>& order, int nstripes)
        {
            PermuteInvoker p;
            p.inp = &inp;
            p.out = &out;
            p.order = &order;
            p.nstripes = nstripes;

            // Sanity check: each output dim equals the mapped input dim.
            CV_Assert( out.size[0] == inp.size[order[0]] &&
                      out.size[1] == inp.size[order[1]] &&
                      out.size[2] == inp.size[order[2]] &&
                      out.size[3] == inp.size[order[3]]);

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        PermuteInvoker() : inp(0), out(0), order(0), nstripes(0) {}

        void operator()(const Range& r) const CV_OVERRIDE
        {
            int n0 = out->size[0], n1 = out->size[1], n2 = out->size[2], n3 = out->size[3];

            // A "row" is one run of the innermost output axis; stripes split rows.
            size_t orows = (size_t)n0*n1*n2;
            size_t stripeSize = (orows + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, orows);

            const size_t esz = sizeof(float);
            size_t ostep0 = out->step[0]/esz, ostep1 = out->step[1]/esz, ostep2 = out->step[2]/esz;
            const size_t* ord = &order->at(0);
            // Input steps reordered so index k walks output axis k.
            size_t istep0 = inp->step[ord[0]]/esz, istep1 = inp->step[ord[1]]/esz,
            istep2 = inp->step[ord[2]]/esz, istep3 = inp->step[ord[3]]/esz;

            // Decompose the flat row index into (i0, i1, i2) output coordinates.
            size_t val = stripeStart;
            int i2 = (int)(val % n2);
            val /= n2;
            int i1 = (int)(val % n1);
            int i0 = (int)(val / n1);

            const float* inptr_orig = inp->ptr<float>();
            float* outptr_orig = out->ptr<float>();

            for( size_t ofs = stripeStart; ofs < stripeEnd; ofs++ )
            {
                const float* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2;
                float* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2;

                // Innermost axis: contiguous writes, strided reads.
                for( int i3 = 0; i3 < n3; i3++ )
                    outptr[i3] = inptr[i3*istep3];

                // Odometer-style increment of (i0, i1, i2).
                if( ++i2 >= n2 )
                {
                    i2 = 0;
                    if( ++i1 >= n1 )
                    {
                        i1 = 0;
                        if( ++i0 >= n0 )
                            break;
                    }
                }
            }
        }
    };
#ifdef HAVE_OPENCL
    // OpenCL path: one "permute" kernel launch per input blob, driven by the
    // order/stride tables uploaded in finalize(). Returns false to fall back to
    // the CPU path (e.g. identity permutation or kernel failure).
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        if (!_needsPermute)
            return false;

        // Kernel is compiled for either half or float element type.
        bool use_half = (inps.depth() == CV_16S);
        String opts = format("-DDtype=%s", use_half ? "half" : "float");
        for (size_t i = 0; i < inputs.size(); i++)
        {
            ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts);

            kernel.set(0, (int)_count);
            kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));
            kernel.set(2, ocl::KernelArg::PtrReadOnly(uorder));
            kernel.set(3, ocl::KernelArg::PtrReadOnly(uold_stride));
            kernel.set(4, ocl::KernelArg::PtrReadOnly(unew_stride));
            kernel.set(5, (int)_numAxes);
            kernel.set(6, ocl::KernelArg::PtrWriteOnly(outputs[i]));

            if (!kernel.run(1, &_count, NULL, false))
                return false;
        }

        return true;
    }
#endif
    // CPU forward pass. Identity order is a plain copy; 4D tensors use the
    // parallel PermuteInvoker; any other rank uses a generic stride-decoding
    // loop over every element.
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        // fp16 blobs are handled by converting to fp32 in the fallback.
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        size_t k, ninputs = inputs.size();
        if(!_needsPermute)
        {
            for (k = 0; k < ninputs; k++)
            {
                CV_Assert(outputs[k].total() == inputs[k].total());
                // Skip the copy when the output aliases the input.
                if (outputs[k].data != inputs[k].data)
                    inputs[k].copyTo(outputs[k]);
            }
        }
        else
        {
            size_t i, j, count = _count, numAxes = _numAxes;
            const size_t* newStride = &_newStride[0];
            const size_t* oldStride = &_oldStride[0];
            const size_t* order = &_order[0];

            for (k = 0; k < ninputs; k++)
            {
                const Mat& inp = inputs[k];
                Mat& out = outputs[k];

                // All inputs must share shape; strides were computed from inputs[0].
                CV_Assert(inp.dims == numAxes && inp.size == inputs[0].size);
                CV_Assert(out.dims == numAxes && out.size == outputs[0].size);

                CV_Assert(inp.isContinuous() && out.isContinuous());
                CV_Assert(inp.type() == CV_32F && out.type() == CV_32F);

                if( numAxes == 4 )
                {
                    int nstripes = getNumThreads();
                    PermuteInvoker::run(inp, out, _order, nstripes);
                }
                else
                {
                    const float *srcData = inp.ptr<float>();
                    float *dstData = out.ptr<float>();

                    for (i = 0; i < count; ++i)
                    {
                        // Decode output index i into per-axis coordinates and
                        // re-encode them with the permuted input strides.
                        size_t oldPosition = 0;
                        size_t newPosition = i;

                        for (j = 0; j < numAxes; ++j)
                        {
                            oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
                            newPosition %= newStride[j];
                        }
                        dstData[i] = srcData[oldPosition];
                    }
                }
            }
        }
    }
#ifdef HAVE_CUDA
    // CUDA backend: delegates the permutation to the cuda4dnn PermuteOp,
    // passing only the axis order; shapes come from the wrappers at runtime.
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::PermuteOp>(preferableTarget, std::move(context->stream), _order);
    }
#endif
    // Vulkan backend: wraps the axis order into an OpPermute node; returns an
    // empty node when Vulkan support is compiled out.
    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_VULKAN
        CV_Assert(!_order.empty());
        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPermute(_order));
        return Ptr<BackendNode>(new VkComBackendNode(input, op));
#endif // HAVE_VULKAN
        return Ptr<BackendNode>();
    }
#ifdef HAVE_INF_ENGINE
    // Inference Engine (NN Builder API) backend: maps directly to a Permute layer.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::PermuteLayer ieLayer(name);
        ieLayer.setOrder(_order);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE
#ifdef HAVE_DNN_NGRAPH
    // nGraph backend: a permutation is expressed as Transpose with the axis
    // order supplied as an i64 constant.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        auto tr_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
                       ngraph::Shape({_order.size()}), _order.data());
        auto transpose = std::make_shared<ngraph::op::Transpose>(ieInpNode, tr_axes);
        return Ptr<BackendNode>(new InfEngineNgraphNode(transpose));
    }
#endif  // HAVE_DNN_NGRAPH
    size_t _count;                          // Total number of elements per blob (cached by computeStrides).
    std::vector<size_t> _order;             // Output-axis -> input-axis permutation.

    std::vector<int> _oldDimensionSize;     // NOTE(review): appears unused in this implementation — candidate for removal, verify against full file.
    std::vector<int> _newDimensionSize;     // NOTE(review): appears unused in this implementation — candidate for removal, verify against full file.

    std::vector<size_t> _oldStride;         // Row-major strides of the input shape.
    std::vector<size_t> _newStride;         // Row-major strides of the output shape.
    bool _needsPermute;                     // False when _order is the identity.

#ifdef HAVE_OPENCL
    UMat uorder, uold_stride, unew_stride;  // Device-side int32 copies of the tables above.
#endif

    size_t _numAxes;                        // Rank of the tensors this layer permutes.
};
// Factory: constructs the permute layer implementation from serialized params.
Ptr<PermuteLayer> PermuteLayer::create(const LayerParams &params)
{
    return makePtr<PermuteLayerImpl>(params);
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,757 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#include <ngraph/op/experimental/layers/prior_box.hpp>
#include <ngraph/op/experimental/layers/prior_box_clustered.hpp>
#endif
#include "../op_vkcom.hpp"
#include <float.h>
#include <algorithm>
#include <cmath>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/prior_box.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer
{
public:
static bool getParameterDict(const LayerParams &params,
const std::string &parameterName,
DictValue& result)
{
if (!params.has(parameterName))
{
return false;
}
result = params.get(parameterName);
return true;
}
    // Reads element `idx` of the named parameter. When the parameter is absent:
    // raises StsBadArg if `required`, otherwise returns `defaultValue`.
    template<typename T>
    T getParameter(const LayerParams &params,
                   const std::string &parameterName,
                   const size_t &idx=0,
                   const bool required=true,
                   const T& defaultValue=T())
    {
        DictValue dictValue;
        bool success = getParameterDict(params, parameterName, dictValue);
        if(!success)
        {
            if(required)
            {
                // Build a descriptive error naming the layer and the parameter.
                std::string message = _layerName;
                message += " layer parameter does not contain ";
                message += parameterName;
                message += " parameter.";
                CV_Error(Error::StsBadArg, message);
            }
            else
            {
                return defaultValue;
            }
        }
        return dictValue.get<T>(idx);
    }
void getAspectRatios(const LayerParams &params)
{
DictValue aspectRatioParameter;
bool aspectRatioRetieved = getParameterDict(params, "aspect_ratio", aspectRatioParameter);
if (!aspectRatioRetieved)
return;
for (int i = 0; i < aspectRatioParameter.size(); ++i)
{
float aspectRatio = aspectRatioParameter.get<float>(i);
bool alreadyExists = fabs(aspectRatio - 1.f) < 1e-6f;
for (size_t j = 0; j < _aspectRatios.size() && !alreadyExists; ++j)
{
alreadyExists = fabs(aspectRatio - _aspectRatios[j]) < 1e-6;
}
if (!alreadyExists)
{
_aspectRatios.push_back(aspectRatio);
if (_flip)
{
_aspectRatios.push_back(1./aspectRatio);
}
}
}
}
static void getParams(const std::string& name, const LayerParams &params,
std::vector<float>* values)
{
DictValue dict;
if (getParameterDict(params, name, dict))
{
values->resize(dict.size());
for (int i = 0; i < dict.size(); ++i)
{
(*values)[i] = dict.get<float>(i);
}
}
else
values->clear();
}
    // Parses the mandatory "variance" parameter: either exactly 4 positive
    // values (one per box coordinate), a single positive value, or — when the
    // list is empty — the default 0.1.
    void getVariance(const LayerParams &params)
    {
        DictValue varianceParameter;
        bool varianceParameterRetrieved = getParameterDict(params, "variance", varianceParameter);
        CV_Assert(varianceParameterRetrieved);

        int varianceSize = varianceParameter.size();
        if (varianceSize > 1)
        {
            // Must and only provide 4 variance.
            CV_Assert(varianceSize == 4);

            for (int i = 0; i < varianceSize; ++i)
            {
                float variance = varianceParameter.get<float>(i);
                CV_Assert(variance > 0);
                _variance.push_back(variance);
            }
        }
        else
        {
            if (varianceSize == 1)
            {
                float variance = varianceParameter.get<float>(0);
                CV_Assert(variance > 0);
                _variance.push_back(variance);
            }
            else
            {
                // Set default to 0.1.
                _variance.push_back(0.1f);
            }
        }
    }
    // Parses all PriorBox parameters and precomputes the per-location box
    // widths/heights (_boxWidths/_boxHeights) either from explicit
    // width/height lists or from min/max sizes combined with aspect ratios.
    PriorBoxLayerImpl(const LayerParams &params)
    {
        setParamsFrom(params);
        _flip = getParameter<bool>(params, "flip", 0, false, true);
        _clip = getParameter<bool>(params, "clip", 0, false, true);
        _bboxesNormalized = getParameter<bool>(params, "normalized_bbox", 0, false, true);

        getParams("min_size", params, &_minSize);
        getAspectRatios(params);
        getVariance(params);

        if (params.has("max_size"))
        {
            getParams("max_size", params, &_maxSize);
            // max_size pairs with min_size element-wise and must dominate it.
            CV_Assert(_minSize.size() == _maxSize.size());
            for (int i = 0; i < _maxSize.size(); i++)
                CV_Assert(_minSize[i] < _maxSize[i]);
        }

        std::vector<float> widths, heights;
        getParams("width", params, &widths);
        getParams("height", params, &heights);
        _explicitSizes = !widths.empty();
        CV_Assert(widths.size() == heights.size());

        if (_explicitSizes)
        {
            // Explicit (clustered) sizes are mutually exclusive with
            // min/max-size + aspect-ratio generation.
            CV_Assert(_aspectRatios.empty());
            CV_Assert(!params.has("min_size"));
            CV_Assert(!params.has("max_size"));
            _boxWidths = widths;
            _boxHeights = heights;
        }
        else
        {
            CV_Assert(!_minSize.empty());
            for (int i = 0; i < _minSize.size(); ++i)
            {
                float minSize = _minSize[i];
                CV_Assert(minSize > 0);
                // First prior: square box of the min size.
                _boxWidths.push_back(minSize);
                _boxHeights.push_back(minSize);

                if (_maxSize.size() > 0)
                {
                    // Second prior: square box with size sqrt(min * max).
                    float size = sqrt(minSize * _maxSize[i]);
                    _boxWidths.push_back(size);
                    _boxHeights.push_back(size);
                }

                // rest of priors
                for (size_t r = 0; r < _aspectRatios.size(); ++r)
                {
                    float arSqrt = sqrt(_aspectRatios[r]);
                    _boxWidths.push_back(minSize * arSqrt);
                    _boxHeights.push_back(minSize / arSqrt);
                }
            }
        }
        CV_Assert(_boxWidths.size() == _boxHeights.size());
        _numPriors = _boxWidths.size();

        // Steps: either separate step_h/step_w, a single step for both, or 0
        // meaning "derive from image/layer size ratio in finalize()".
        if (params.has("step_h") || params.has("step_w")) {
            CV_Assert(!params.has("step"));
            _stepY = getParameter<float>(params, "step_h");
            CV_Assert(_stepY > 0.);
            _stepX = getParameter<float>(params, "step_w");
            CV_Assert(_stepX > 0.);
        } else if (params.has("step")) {
            const float step = getParameter<float>(params, "step");
            CV_Assert(step > 0);
            _stepY = step;
            _stepX = step;
        } else {
            _stepY = 0;
            _stepX = 0;
        }
        if (params.has("offset_h") || params.has("offset_w"))
        {
            CV_Assert_N(!params.has("offset"), params.has("offset_h"), params.has("offset_w"));
            getParams("offset_h", params, &_offsetsY);
            getParams("offset_w", params, &_offsetsX);
            CV_Assert(_offsetsX.size() == _offsetsY.size());
            // Multiple offsets multiply the number of priors per location.
            _numPriors *= std::max((size_t)1, 2 * (_offsetsX.size() - 1));
        }
        else
        {
            // Single centered offset (default 0.5 places boxes at cell centers).
            float offset = getParameter<float>(params, "offset", 0, false, 0.5);
            _offsetsX.assign(1, offset);
            _offsetsY.assign(1, offset);
        }
    }
    // Reports backend support; the nGraph path needs either explicit sizes or
    // equal X/Y steps, and the IE builder path is limited to simple size configs.
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
#ifdef HAVE_DNN_NGRAPH
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
            return _explicitSizes || _stepX == _stepY;
#endif
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && haveInfEngine() &&
               ( _explicitSizes || (_minSize.size() == 1 && _maxSize.size() <= 1)))
               || (backendId == DNN_BACKEND_VKCOM && haveVulkan());
    }
    // Output is a single blob of shape 1 x 2 x (H*W*numPriors*4): channel 0
    // stores prior coordinates, channel 1 stores their variances.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(!inputs.empty());

        int layerHeight = inputs[0][2];
        int layerWidth = inputs[0][3];

        // Since all images in a batch has same height and width, we only need to
        // generate one set of priors which can be shared across all images.
        size_t outNum = 1;
        // 2 channels. First channel stores the mean of each prior coordinate.
        // Second channel stores the variance of each prior coordinate.
        size_t outChannels = 2;

        outputs.resize(1, shape(outNum, outChannels,
                                layerHeight * layerWidth * _numPriors * 4));

        return false;
    }
    // Resolves auto steps (step == 0) once both the feature-map (inputs[0]) and
    // image (inputs[1]) shapes are known: step becomes image_size / layer_size.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);

        CV_CheckGT(inputs.size(), (size_t)1, "");
        CV_CheckEQ(inputs[0].dims, 4, ""); CV_CheckEQ(inputs[1].dims, 4, "");
        int layerWidth = inputs[0].size[3];
        int layerHeight = inputs[0].size[2];

        int imageWidth = inputs[1].size[3];
        int imageHeight = inputs[1].size[2];

        _stepY = _stepY == 0 ? (static_cast<float>(imageHeight) / layerHeight) : _stepY;
        _stepX = _stepX == 0 ? (static_cast<float>(imageWidth) / layerWidth) : _stepX;
    }
#ifdef HAVE_OPENCL
    // OpenCL path: generates priors with the "prior_box" kernel, optionally
    // clips them to [0, 1], then fills the variance channel. Returns false on
    // any kernel failure so the CPU path can take over.
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        bool use_half = (inps.depth() == CV_16S);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        int _layerWidth = inputs[0].size[3];
        int _layerHeight = inputs[0].size[2];

        int _imageWidth = inputs[1].size[3];
        int _imageHeight = inputs[1].size[2];

        // Upload the constant tables once; they depend only on layer params.
        if (umat_offsetsX.empty())
        {
            Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
            Mat offsetsY(1, _offsetsY.size(), CV_32FC1, &_offsetsY[0]);
            Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
            Mat widths(1, _boxWidths.size(), CV_32FC1, &_boxWidths[0]);
            Mat heights(1, _boxHeights.size(), CV_32FC1, &_boxHeights[0]);

            offsetsX.copyTo(umat_offsetsX);
            offsetsY.copyTo(umat_offsetsY);
            variance.copyTo(umat_variance);
            widths.copyTo(umat_widths);
            heights.copyTo(umat_heights);
        }

        String opts;
        if (use_half)
            opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4";
        else
            opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";

        size_t nthreads = _layerHeight * _layerWidth;

        ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);
        kernel.set(0, (int)nthreads);
        kernel.set(1, (float)_stepX);
        kernel.set(2, (float)_stepY);
        kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
        kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
        kernel.set(5, (int)_offsetsX.size());
        kernel.set(6, ocl::KernelArg::PtrReadOnly(umat_widths));
        kernel.set(7, ocl::KernelArg::PtrReadOnly(umat_heights));
        kernel.set(8, (int)_boxWidths.size());
        kernel.set(9, ocl::KernelArg::PtrWriteOnly(outputs[0]));
        kernel.set(10, (int)_layerHeight);
        kernel.set(11, (int)_layerWidth);
        kernel.set(12, (int)_imageHeight);
        kernel.set(13, (int)_imageWidth);
        // Fix: the launch result was previously ignored, unlike the sibling
        // kernels below — a failed launch would silently yield garbage priors.
        if (!kernel.run(1, &nthreads, NULL, false))
            return false;

        // clip the prior's coordinate such that it is within [0, 1]
        if (_clip)
        {
            ocl::Kernel kernel("clip", ocl::dnn::prior_box_oclsrc, opts);
            size_t nthreads = _layerHeight * _layerWidth * _numPriors * 4;
            if (!kernel.args((int)nthreads, ocl::KernelArg::PtrReadWrite(outputs[0]))
                .run(1, &nthreads, NULL, false))
                return false;
        }

        // set the variance.
        {
            ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
            int offset = total(shape(outputs[0]), 2);
            size_t nthreads = _layerHeight * _layerWidth * _numPriors;
            kernel.set(0, (int)nthreads);
            kernel.set(1, (int)offset);
            kernel.set(2, (int)_variance.size());
            kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
            kernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
            if (!kernel.run(1, &nthreads, NULL, false))
                return false;
        }
        return true;
    }
#endif
    // CPU forward pass: writes one prior box (4 coords) per (cell, box shape,
    // offset) combination into channel 0, optionally clips to [0, 1], then
    // fills channel 1 with the variances.
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(inputs.size() == 2);

        // inputs[0] is the feature map, inputs[1] the (preprocessed) image.
        int _layerWidth = inputs[0].size[3];
        int _layerHeight = inputs[0].size[2];

        int _imageWidth = inputs[1].size[3];
        int _imageHeight = inputs[1].size[2];

        float* outputPtr = outputs[0].ptr<float>();
        float _boxWidth, _boxHeight;
        for (size_t h = 0; h < _layerHeight; ++h)
        {
            for (size_t w = 0; w < _layerWidth; ++w)
            {
                for (size_t i = 0; i < _boxWidths.size(); ++i)
                {
                    _boxWidth = _boxWidths[i];
                    _boxHeight = _boxHeights[i];
                    for (int j = 0; j < _offsetsX.size(); ++j)
                    {
                        // Box center in image coordinates; addPrior advances
                        // the output pointer past the 4 written values.
                        float center_x = (w + _offsetsX[j]) * _stepX;
                        float center_y = (h + _offsetsY[j]) * _stepY;
                        outputPtr = addPrior(center_x, center_y, _boxWidth, _boxHeight, _imageWidth,
                                             _imageHeight, _bboxesNormalized, outputPtr);
                    }
                }
            }
        }
        // clip the prior's coordinate such that it is within [0, 1]
        if (_clip)
        {
            int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
            outputPtr = outputs[0].ptr<float>();
            for (size_t d = 0; d < _outChannelSize; ++d)
            {
                outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
            }
        }
        // set the variance.
        outputPtr = outputs[0].ptr<float>(0, 1);
        if(_variance.size() == 1)
        {
            // Single variance: broadcast it over the whole second channel.
            Mat secondChannel(1, outputs[0].size[2], CV_32F, outputPtr);
            secondChannel.setTo(Scalar::all(_variance[0]));
        }
        else
        {
            // Four variances: repeat the quadruple for every prior.
            int count = 0;
            for (size_t h = 0; h < _layerHeight; ++h)
            {
                for (size_t w = 0; w < _layerWidth; ++w)
                {
                    for (size_t i = 0; i < _numPriors; ++i)
                    {
                        for (int j = 0; j < 4; ++j)
                        {
                            outputPtr[count] = _variance[j];
                            ++count;
                        }
                    }
                }
            }
        }
    }
#ifdef HAVE_CUDA
    // CUDA backend: packs all precomputed parameters into a PriorBoxConfiguration
    // and delegates generation to the cuda4dnn PriorBoxOp.
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto feature_map_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
        auto feature_map_shape = feature_map_wrapper->getShape();

        auto image_wrapper = inputs[1].dynamicCast<CUDABackendWrapper>();
        auto image_shape = image_wrapper->getShape();

        PriorBoxConfiguration config;
        // rbegin()[0]/[1] = last/second-to-last dims, i.e. width/height.
        config.feature_map_width = feature_map_shape.rbegin()[0];
        config.feature_map_height = feature_map_shape.rbegin()[1];
        config.image_width = image_shape.rbegin()[0];
        config.image_height = image_shape.rbegin()[1];

        config.num_priors = _numPriors;
        config.box_widths = _boxWidths;
        config.box_heights = _boxHeights;
        config.offsets_x = _offsetsX;
        config.offsets_y = _offsetsY;
        config.stepX = _stepX;
        config.stepY = _stepY;

        config.variance = _variance;

        config.clip = _clip;
        config.normalize = _bboxesNormalized;

        return make_cuda_node<cuda4dnn::PriorBoxOp>(preferableTarget, std::move(context->stream), config);
    }
#endif
    // Vulkan backend: forwards the precomputed tables to OpPriorBox; returns an
    // empty node when Vulkan support is compiled out.
    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_VULKAN
        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPriorBox(_stepX, _stepY,
                                                                _clip, _numPriors,
                                                                _variance, _offsetsX,
                                                                _offsetsY, _boxWidths,
                                                                _boxHeights));
        return Ptr<BackendNode>(new VkComBackendNode(input, op));
#endif // HAVE_VULKAN
        return Ptr<BackendNode>();
    }
#ifdef HAVE_INF_ENGINE
    // Inference Engine (NN Builder API) backend: explicit sizes map to
    // PriorBoxClustered, otherwise to PriorBox with min/max sizes and aspect
    // ratios. Only a single shared X/Y offset is supported.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        if (_explicitSizes)
        {
            InferenceEngine::Builder::PriorBoxClusteredLayer ieLayer(name);
            ieLayer.setSteps({_stepY, _stepX});

            CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
            ieLayer.setOffset(_offsetsX[0]);

            ieLayer.setClip(_clip);
            ieLayer.setFlip(false);  // We already flipped aspect ratios.

            InferenceEngine::Builder::Layer l = ieLayer;
            CV_Assert_N(!_boxWidths.empty(), !_boxHeights.empty(), !_variance.empty());
            CV_Assert(_boxWidths.size() == _boxHeights.size());
            l.getParameters()["width"] = _boxWidths;
            l.getParameters()["height"] = _boxHeights;
            l.getParameters()["variance"] = _variance;
            return Ptr<BackendNode>(new InfEngineBackendNode(l));
        }
        else
        {
            InferenceEngine::Builder::PriorBoxLayer ieLayer(name);

            CV_Assert(!_explicitSizes);
            ieLayer.setMinSize(_minSize[0]);
            if (!_maxSize.empty())
                ieLayer.setMaxSize(_maxSize[0]);

            CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
            ieLayer.setOffset(_offsetsX[0]);

            ieLayer.setClip(_clip);
            ieLayer.setFlip(false);  // We already flipped aspect ratios.

            InferenceEngine::Builder::Layer l = ieLayer;
            // IE distinguishes a single shared "step" from separate step_h/step_w;
            // zero out whichever representation is unused.
            if (_stepX == _stepY)
            {
                l.getParameters()["step"] = _stepX;
                l.getParameters()["step_h"] = 0.0f;
                l.getParameters()["step_w"] = 0.0f;
            }
            else
            {
                l.getParameters()["step"] = 0.0f;
                l.getParameters()["step_h"] = _stepY;
                l.getParameters()["step_w"] = _stepX;
            }
            if (!_aspectRatios.empty())
            {
                l.getParameters()["aspect_ratio"] = _aspectRatios;
            }
            CV_Assert(!_variance.empty());
            l.getParameters()["variance"] = _variance;
            return Ptr<BackendNode>(new InfEngineBackendNode(l));
        }
    }
#endif  // HAVE_INF_ENGINE
#ifdef HAVE_DNN_NGRAPH
    // nGraph backend: slices the H,W dims out of the feature-map and image
    // shapes (dims 2..3) and feeds them to PriorBoxClustered (explicit sizes)
    // or PriorBox (generated sizes), then unsqueezes to add the batch dim.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert(nodes.size() == 2);
        auto layer = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        auto image = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
        auto layer_shape = std::make_shared<ngraph::op::ShapeOf>(layer);
        auto image_shape = std::make_shared<ngraph::op::ShapeOf>(image);

        // StridedSlice [2:4:1] extracts the (H, W) pair from an NCHW shape.
        auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{2});
        auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{4});
        auto strides      = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{1});
        auto slice_layer = std::make_shared<ngraph::op::v1::StridedSlice>(layer_shape,
                           lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
        auto slice_image = std::make_shared<ngraph::op::v1::StridedSlice>(image_shape,
                           lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});

        if (_explicitSizes)
        {
            CV_Assert_N(!_boxWidths.empty(), !_boxHeights.empty(), !_variance.empty());
            CV_Assert(_boxWidths.size() == _boxHeights.size());
            ngraph::op::PriorBoxClusteredAttrs attrs;
            attrs.widths = _boxWidths;
            attrs.heights = _boxHeights;
            attrs.clip = _clip;
            // Only a single shared X/Y offset is representable here.
            CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
            attrs.offset = _offsetsX[0];
            attrs.step_heights = _stepY;
            attrs.step_widths = _stepX;
            attrs.variances = _variance;

            auto priorBox = std::make_shared<ngraph::op::PriorBoxClustered>(slice_layer, slice_image, attrs);
            auto axis = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{0});
            auto unsqueeze = std::make_shared<ngraph::op::Unsqueeze>(priorBox, axis);
            return Ptr<BackendNode>(new InfEngineNgraphNode(unsqueeze));
        }
        else
        {
            ngraph::op::PriorBoxAttrs attrs;
            attrs.min_size = _minSize;
            attrs.max_size = _maxSize;
            // doesn't work with empty aspectRatio
            attrs.aspect_ratio = !_aspectRatios.empty()? _aspectRatios : std::vector<float>{1.0f};
            attrs.clip = _clip;
            attrs.flip = false;  // Ratios were already flipped in getAspectRatios().

            attrs.variance = _variance;
            CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
            attrs.offset = _offsetsX[0];

            attrs.step = _stepX;  // supportBackend() guarantees _stepX == _stepY here.
            attrs.scale_all_sizes = !_aspectRatios.empty();

            auto priorBox = std::make_shared<ngraph::op::PriorBox>(slice_layer, slice_image, attrs);
            auto axis = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{0});
            auto unsqueeze = std::make_shared<ngraph::op::Unsqueeze>(priorBox, axis);
            return Ptr<BackendNode>(new InfEngineNgraphNode(unsqueeze));
        }
    }
#endif  // HAVE_DNN_NGRAPH
// Estimates the layer cost: every spatial location of each input feature map
// produces _numPriors boxes with 4 coordinates each.
// Fixes: accumulate in int64 (the declared return type; plain `long` is only
// 32-bit on LLP64 targets) and use size_t for the index to avoid a
// signed/unsigned comparison against inputs.size().
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                       const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
    CV_UNUSED(outputs); // suppress unused variable warning
    int64 flops = 0;
    for (size_t i = 0; i < inputs.size(); i++)
    {
        flops += total(inputs[i], 2) * _numPriors * 4;
    }
    return flops;
}
private:
    std::vector<float> _minSize;      // box sizes for the non-clustered mode
    std::vector<float> _maxSize;
    float _stepX, _stepY;             // anchor grid step in pixels
    std::vector<float> _aspectRatios;
    std::vector<float> _variance;     // encoded-box variances written alongside priors
    std::vector<float> _offsetsX;     // sub-cell anchor center offsets
    std::vector<float> _offsetsY;
    // Precomputed final widths and heights based on aspect ratios or explicit sizes.
    std::vector<float> _boxWidths;
    std::vector<float> _boxHeights;
#ifdef HAVE_OPENCL
    // Device-side copies of the parameter vectors for the OpenCL path.
    UMat umat_offsetsX;
    UMat umat_offsetsY;
    UMat umat_widths;
    UMat umat_heights;
    UMat umat_variance;
#endif
    bool _flip;
    bool _clip;
    bool _explicitSizes;        // true when widths/heights were given explicitly
    bool _bboxesNormalized;     // output coordinates divided by image size
    size_t _numPriors;          // priors emitted per spatial location (see getFLOPS)
    static const size_t _numAxes = 4;
    static const std::string _layerName;
static float* addPrior(float center_x, float center_y, float width, float height,
float imgWidth, float imgHeight, bool normalized, float* dst)
{
if (normalized)
{
dst[0] = (center_x - width * 0.5f) / imgWidth; // xmin
dst[1] = (center_y - height * 0.5f) / imgHeight; // ymin
dst[2] = (center_x + width * 0.5f) / imgWidth; // xmax
dst[3] = (center_y + height * 0.5f) / imgHeight; // ymax
}
else
{
dst[0] = center_x - width * 0.5f; // xmin
dst[1] = center_y - height * 0.5f; // ymin
dst[2] = center_x + width * 0.5f - 1.0f; // xmax
dst[3] = center_y + height * 0.5f - 1.0f; // ymax
}
return dst + 4;
}
};
const std::string PriorBoxLayerImpl::_layerName = std::string("PriorBox");

// Public factory: wraps the concrete implementation behind the abstract type.
Ptr<PriorBoxLayer> PriorBoxLayer::create(const LayerParams &params)
{
    Ptr<PriorBoxLayer> layer(new PriorBoxLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,441 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#include <ngraph/op/experimental/layers/proposal.hpp>
#endif
namespace cv { namespace dnn {
// Region Proposal Network proposal layer (Faster R-CNN style), composed from
// three internal layers: a PriorBox layer that generates anchors, Permute
// layers that reorder scores/deltas from NCHW to NHWC, and a DetectionOutput
// layer that decodes boxes, ranks them by score and applies NMS.
class ProposalLayerImpl CV_FINAL : public ProposalLayer
{
public:
    ProposalLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        featStride = params.get<uint32_t>("feat_stride", 16);
        baseSize = params.get<uint32_t>("base_size", 16);
        // uint32_t minSize = params.get<uint32_t>("min_size", 16);
        keepTopBeforeNMS = params.get<uint32_t>("pre_nms_topn", 6000);
        keepTopAfterNMS = params.get<uint32_t>("post_nms_topn", 300);
        nmsThreshold = params.get<float>("nms_thresh", 0.7);
        ratios = params.get("ratio");
        scales = params.get("scale");
        {
            // Anchor generator: one box per (ratio, scale) pair at each feature
            // map cell, in absolute (non-normalized) pixel coordinates.
            LayerParams lp;
            lp.set("step", featStride);
            lp.set("flip", false);
            lp.set("clip", false);
            lp.set("normalized_bbox", false);
            lp.set("offset", 0.5 * baseSize / featStride);
            // Unused values.
            float variance[] = {0.1f, 0.1f, 0.2f, 0.2f};
            lp.set("variance", DictValue::arrayReal<float*>(&variance[0], 4));
            // Compute widths and heights explicitly.
            std::vector<float> widths, heights;
            widths.reserve(ratios.size() * scales.size());
            heights.reserve(ratios.size() * scales.size());
            for (int i = 0; i < ratios.size(); ++i)
            {
                float ratio = ratios.get<float>(i);
                for (int j = 0; j < scales.size(); ++j)
                {
                    float scale = scales.get<float>(j);
                    float width = std::floor(baseSize / sqrt(ratio) + 0.5f);
                    float height = std::floor(width * ratio + 0.5f);
                    widths.push_back(scale * width);
                    heights.push_back(scale * height);
                }
            }
            lp.set("width", DictValue::arrayReal<float*>(&widths[0], widths.size()));
            lp.set("height", DictValue::arrayReal<float*>(&heights[0], heights.size()));
            priorBoxLayer = PriorBoxLayer::create(lp);
        }
        {
            // NCHW -> NHWC permutation, shared by scores and bbox deltas.
            int order[] = {0, 2, 3, 1};
            LayerParams lp;
            lp.set("order", DictValue::arrayInt<int*>(&order[0], 4));
            deltasPermute = PermuteLayer::create(lp);
            scoresPermute = PermuteLayer::create(lp);
        }
        {
            // Box decoding + score sorting + NMS stage.
            LayerParams lp;
            lp.set("code_type", "CENTER_SIZE");
            lp.set("num_classes", 1);
            lp.set("share_location", true);
            lp.set("background_label_id", 1); // We won't pass background scores so set it out of range [0, num_classes)
            lp.set("variance_encoded_in_target", true);
            lp.set("keep_top_k", keepTopAfterNMS);
            lp.set("top_k", keepTopBeforeNMS);
            lp.set("nms_threshold", nmsThreshold);
            lp.set("normalized_bbox", false);
            lp.set("clip", true);
            detectionOutputLayer = DetectionOutputLayer::create(lp);
        }
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        // IE backends are supported except on MYRIAD targets.
        return backendId == DNN_BACKEND_OPENCV ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && preferableTarget != DNN_TARGET_MYRIAD);
    }

    // Computes output/internal shapes. Inputs: [scores, bboxDeltas, imInfo].
    // Outputs: Nx5 boxes (batch id + 4 coords) and Nx1 scores, N = post-NMS top-k.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        // We need to allocate the following blobs:
        // - output priors from PriorBoxLayer
        // - permuted priors
        // - permuted scores
        CV_Assert(inputs.size() == 3);
        const MatShape& scores = inputs[0];
        const MatShape& bboxDeltas = inputs[1];

        std::vector<MatShape> layerInputs, layerOutputs, layerInternals;

        // Prior boxes layer.
        layerInputs.assign(1, scores);
        priorBoxLayer->getMemoryShapes(layerInputs, 1, layerOutputs, layerInternals);
        CV_Assert(layerOutputs.size() == 1);
        CV_Assert(layerInternals.empty());
        internals.push_back(layerOutputs[0]);

        // Scores permute layer.
        CV_Assert(scores.size() == 4);
        MatShape objectScores = scores;
        CV_Assert((scores[1] & 1) == 0);  // Number of channels is even.
        objectScores[1] /= 2;  // only the object (non-background) half is permuted
        layerInputs.assign(1, objectScores);
        scoresPermute->getMemoryShapes(layerInputs, 1, layerOutputs, layerInternals);
        CV_Assert(layerOutputs.size() == 1);
        CV_Assert(layerInternals.empty());
        internals.push_back(layerOutputs[0]);

        // BBox predictions permute layer.
        layerInputs.assign(1, bboxDeltas);
        deltasPermute->getMemoryShapes(layerInputs, 1, layerOutputs, layerInternals);
        CV_Assert(layerOutputs.size() == 1);
        CV_Assert(layerInternals.empty());
        internals.push_back(layerOutputs[0]);

        // Detections layer.
        internals.push_back(shape(1, 1, keepTopAfterNMS, 7));

        outputs.resize(2);
        outputs[0] = shape(keepTopAfterNMS, 5);
        outputs[1] = shape(keepTopAfterNMS, 1);
        return false;
    }

    // Finalizes the internal permute layers once input shapes are known.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);
        std::vector<Mat> layerInputs;
        std::vector<Mat> layerOutputs;

        // Scores permute layer.
        Mat scores = getObjectScores(inputs[0]);
        layerInputs.assign(1, scores);
        layerOutputs.assign(1, Mat(shape(scores.size[0], scores.size[2],
                                         scores.size[3], scores.size[1]), CV_32FC1));
        scoresPermute->finalize(layerInputs, layerOutputs);

        // BBox predictions permute layer.
        const Mat& bboxDeltas = inputs[1];
        CV_Assert(bboxDeltas.dims == 4);
        layerInputs.assign(1, bboxDeltas);
        layerOutputs.assign(1, Mat(shape(bboxDeltas.size[0], bboxDeltas.size[2],
                                         bboxDeltas.size[3], bboxDeltas.size[1]), CV_32FC1));
        deltasPermute->finalize(layerInputs, layerOutputs);
    }

#ifdef HAVE_OPENCL
    // OpenCL forward path; mirrors forward() below with UMat buffers.
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

        if (inputs_.depth() == CV_16S)
            return false;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);
        internals_.getUMatVector(internals);

        CV_Assert(inputs.size() == 3);
        CV_Assert(internals.size() == 4);
        const UMat& scores = inputs[0];
        const UMat& bboxDeltas = inputs[1];
        const UMat& imInfo = inputs[2];
        UMat& priorBoxes = internals[0];
        UMat& permuttedScores = internals[1];
        UMat& permuttedDeltas = internals[2];
        UMat& detections = internals[3];

        CV_Assert(imInfo.total() >= 2);
        // We've chosen the smallest data type because we need just a shape from it.
        // imInfo holds (rows, cols); a dummy image of that size supplies the
        // shape to the prior-box and detection-output sub-layers.
        Mat szMat;
        imInfo.copyTo(szMat);
        int rows = (int)szMat.at<float>(0);
        int cols = (int)szMat.at<float>(1);
        umat_fakeImageBlob.create(shape(1, 1, rows, cols), CV_8UC1);
        umat_fakeImageBlob.setTo(0);

        // Generate prior boxes.
        std::vector<UMat> layerInputs(2), layerOutputs(1, priorBoxes);
        layerInputs[0] = scores;
        layerInputs[1] = umat_fakeImageBlob;
        priorBoxLayer->forward(layerInputs, layerOutputs, internals);

        // Permute scores.
        layerInputs.assign(1, getObjectScores(scores));
        layerOutputs.assign(1, permuttedScores);
        scoresPermute->forward(layerInputs, layerOutputs, internals);

        // Permute deltas.
        layerInputs.assign(1, bboxDeltas);
        layerOutputs.assign(1, permuttedDeltas);
        deltasPermute->forward(layerInputs, layerOutputs, internals);

        // Sort predictions by scores and apply NMS. DetectionOutputLayer allocates
        // output internally because of different number of objects after NMS.
        layerInputs.resize(4);
        layerInputs[0] = permuttedDeltas;
        layerInputs[1] = permuttedScores;
        layerInputs[2] = priorBoxes;
        layerInputs[3] = umat_fakeImageBlob;

        layerOutputs[0] = detections;
        detectionOutputLayer->forward(layerInputs, layerOutputs, internals);

        // DetectionOutputLayer produces 1x1xNx7 output where N might be less or
        // equal to keepTopAfterNMS. We fill the rest by zeros.
        const int numDets = layerOutputs[0].total() / 7;
        CV_Assert(numDets <= keepTopAfterNMS);

        MatShape s = shape(numDets, 7);
        layerOutputs[0] = layerOutputs[0].reshape(1, s.size(), &s[0]);

        // The boxes.
        UMat dst = outputs[0].rowRange(0, numDets);
        layerOutputs[0].colRange(3, 7).copyTo(dst.colRange(1, 5));
        dst.col(0).setTo(0);  // First column are batch ids. Keep it zeros too.

        // The scores.
        dst = outputs[1].rowRange(0, numDets);
        layerOutputs[0].col(2).copyTo(dst);

        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        CV_Assert(inputs.size() == 3);
        CV_Assert(internals.size() == 4);
        const Mat& scores = inputs[0];
        const Mat& bboxDeltas = inputs[1];
        const Mat& imInfo = inputs[2];
        Mat& priorBoxes = internals[0];
        Mat& permuttedScores = internals[1];
        Mat& permuttedDeltas = internals[2];
        Mat& detections = internals[3];

        CV_Assert(imInfo.total() >= 2);
        // We've chosen the smallest data type because we need just a shape from it.
        fakeImageBlob.create(shape(1, 1, imInfo.at<float>(0), imInfo.at<float>(1)), CV_8UC1);

        // Generate prior boxes.
        std::vector<Mat> layerInputs(2), layerOutputs(1, priorBoxes);
        layerInputs[0] = scores;
        layerInputs[1] = fakeImageBlob;
        priorBoxLayer->forward(layerInputs, layerOutputs, internals);

        // Permute scores.
        layerInputs.assign(1, getObjectScores(scores));
        layerOutputs.assign(1, permuttedScores);
        scoresPermute->forward(layerInputs, layerOutputs, internals);

        // Permute deltas.
        layerInputs.assign(1, bboxDeltas);
        layerOutputs.assign(1, permuttedDeltas);
        deltasPermute->forward(layerInputs, layerOutputs, internals);

        // Sort predictions by scores and apply NMS. DetectionOutputLayer allocates
        // output internally because of different number of objects after NMS.
        layerInputs.resize(4);
        layerInputs[0] = permuttedDeltas;
        layerInputs[1] = permuttedScores;
        layerInputs[2] = priorBoxes;
        layerInputs[3] = fakeImageBlob;

        layerOutputs[0] = detections;
        detectionOutputLayer->forward(layerInputs, layerOutputs, internals);

        // DetectionOutputLayer produces 1x1xNx7 output where N might be less or
        // equal to keepTopAfterNMS. We fill the rest by zeros.
        const int numDets = layerOutputs[0].total() / 7;
        CV_Assert(numDets <= keepTopAfterNMS);

        // The boxes.
        layerOutputs[0] = layerOutputs[0].reshape(1, numDets);
        Mat dst = outputs[0].rowRange(0, numDets);
        layerOutputs[0].colRange(3, 7).copyTo(dst.colRange(1, 5));
        dst.col(0).setTo(0);  // First column are batch ids. Keep it zeros too.

        // The scores.
        dst = outputs[1].rowRange(0, numDets);
        layerOutputs[0].col(2).copyTo(dst);
    }

#ifdef HAVE_INF_ENGINE
    // Maps this layer onto the Inference Engine builder Proposal primitive.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::ProposalLayer ieLayer(name);

        ieLayer.setBaseSize(baseSize);
        ieLayer.setFeatStride(featStride);
        ieLayer.setMinSize(16);
        ieLayer.setNMSThresh(nmsThreshold);
        ieLayer.setPostNMSTopN(keepTopAfterNMS);
        ieLayer.setPreNMSTopN(keepTopBeforeNMS);

        std::vector<float> scalesVec(scales.size());
        for (int i = 0; i < scales.size(); ++i)
            scalesVec[i] = scales.get<float>(i);
        ieLayer.setScale(scalesVec);

        std::vector<float> ratiosVec(ratios.size());
        for (int i = 0; i < ratios.size(); ++i)
            ratiosVec[i] = ratios.get<float>(i);
        ieLayer.setRatio(ratiosVec);

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    // Maps this layer onto the nGraph Proposal op. The 2D im_info input is
    // reshaped to 1D as the op expects.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert(nodes.size() == 3);
        ngraph::op::ProposalAttrs attr;
        attr.base_size     = baseSize;
        attr.nms_thresh    = nmsThreshold;
        attr.feat_stride   = featStride;
        attr.min_size      = 16;
        attr.pre_nms_topn  = keepTopBeforeNMS;
        attr.post_nms_topn = keepTopAfterNMS;

        std::vector<float> ratiosVec(ratios.size());
        for (int i = 0; i < ratios.size(); ++i)
            ratiosVec[i] = ratios.get<float>(i);
        attr.ratio = ratiosVec;

        std::vector<float> scalesVec(scales.size());
        for (int i = 0; i < scales.size(); ++i)
            scalesVec[i] = scales.get<float>(i);
        attr.scale = scalesVec;

        auto& class_probs  = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        auto& class_logits = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
        auto& image_shape  = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;

        CV_Assert_N(image_shape->get_shape().size() == 2, image_shape->get_shape().front() == 1);
        auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
                                                            ngraph::Shape{1},
                                                            std::vector<int64_t>{(int64_t)image_shape->get_shape().back()});
        auto reshape = std::make_shared<ngraph::op::v1::Reshape>(image_shape, shape, true);

        auto proposal = std::make_shared<ngraph::op::Proposal>(class_probs, class_logits, reshape, attr);
        return Ptr<BackendNode>(new InfEngineNgraphNode(proposal));
    }
#endif  // HAVE_DNN_NGRAPH

private:
    // A first half of channels are background scores. We need only a second one.
    static Mat getObjectScores(const Mat& m)
    {
        CV_Assert(m.dims == 4);
        CV_Assert(m.size[0] == 1);
        int channels = m.size[1];
        CV_Assert((channels & 1) == 0);
        return slice(m, Range::all(), Range(channels / 2, channels));
    }

#ifdef HAVE_OPENCL
    // UMat counterpart of getObjectScores: view of the second channel half.
    static UMat getObjectScores(const UMat& m)
    {
        CV_Assert(m.dims == 4);
        CV_Assert(m.size[0] == 1);
        int channels = m.size[1];
        CV_Assert((channels & 1) == 0);

        Range r = Range(channels / 2, channels);
        Range ranges[4] = { Range::all(), r, Range::all(), Range::all() };
        return m(&ranges[0]);
    }
#endif

    Ptr<PriorBoxLayer> priorBoxLayer;              // anchor generator
    Ptr<DetectionOutputLayer> detectionOutputLayer; // decode + NMS stage

    Ptr<PermuteLayer> deltasPermute;
    Ptr<PermuteLayer> scoresPermute;
    uint32_t keepTopBeforeNMS, keepTopAfterNMS, featStride, baseSize;
    Mat fakeImageBlob;   // dummy image carrying only the input image shape
    float nmsThreshold;
    DictValue ratios, scales;
#ifdef HAVE_OPENCL
    UMat umat_fakeImageBlob;
#endif
};
// Public factory: wraps the concrete implementation behind the abstract type.
Ptr<ProposalLayer> ProposalLayer::create(const LayerParams& params)
{
    Ptr<ProposalLayer> layer(new ProposalLayerImpl(params));
    return layer;
}
} // namespace dnn
} // namespace cv

View File

@@ -0,0 +1,550 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include <iostream>
#include <iterator>
#include <cmath>
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
template<typename Dtype>
static void tanh(const Mat &src, Mat &dst)
{
MatConstIterator_<Dtype> itSrc = src.begin<Dtype>();
MatIterator_<Dtype> itDst = dst.begin<Dtype>();
for (; itSrc != src.end<Dtype>(); itSrc++, itDst++)
*itDst = std::tanh(*itSrc);
}
//TODO: make utils method
// Type dispatcher: allocates dst to match src and applies the element-wise
// tanh for the supported floating point depths (CV_32F, CV_64F).
static void tanh(const Mat &src, Mat &dst)
{
    dst.create(src.dims, (const int*)src.size, src.type());
    const int depth = src.type();
    if (depth == CV_32F)
        tanh<float>(src, dst);
    else if (depth == CV_64F)
        tanh<double>(src, dst);
    else
        CV_Error(Error::StsUnsupportedFormat, "Function supports only floating point types");
}
// Element-wise logistic sigmoid: dst = 1 / (1 + exp(-src)).
// cv::exp allocates dst; the reciprocal is taken via pow(x, -1) so the
// whole computation stays in two whole-matrix operations.
static void sigmoid(const Mat &src, Mat &dst)
{
    cv::exp(-src, dst);
    cv::pow(1 + dst, -1, dst);
}
// CPU implementation of a (uni-directional) LSTM layer.
// Weights layout: blobs[0] = Wh (recurrent), blobs[1] = Wx (input), blobs[2] =
// bias, each with the 4 gates stacked along rows in (i, f, o, g) order;
// blobs[3..5] are optional peephole matrices.
class LSTMLayerImpl CV_FINAL : public LSTMLayer
{
    int numTimeStamps, numSamples;   // derived from input shape in finalize()
    bool allocated;

    MatShape outTailShape;  //shape of single output sample
    MatShape outTsShape;    //shape of N output samples

    bool useTimestampDim;     // first input axis is the time axis
    bool produceCellOutput;   // also emit cell state c_t as a second output
    float forgetBias, cellClip;
    bool useCellClip, usePeephole;
    bool reverse;  // If true, go in negative direction along the time axis

public:
    LSTMLayerImpl(const LayerParams& params)
        : numTimeStamps(0), numSamples(0)
    {
        setParamsFrom(params);

        if (!blobs.empty())
        {
            CV_Assert(blobs.size() >= 3);

            blobs[2] = blobs[2].reshape(1, 1);  // bias as a 1-row matrix

            const Mat& Wh = blobs[0];
            const Mat& Wx = blobs[1];
            const Mat& bias = blobs[2];
            CV_Assert(Wh.dims == 2 && Wx.dims == 2);
            CV_Assert(Wh.rows == Wx.rows);
            CV_Assert(Wh.rows == 4*Wh.cols);  // 4 gates stacked along rows
            CV_Assert(Wh.rows == (int)bias.total());
            CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());

            // Peephole weights.
            if (blobs.size() > 3)
            {
                CV_Assert(blobs.size() == 6);
                const int N = Wh.cols;
                for (int i = 3; i < 6; ++i)
                {
                    CV_Assert(blobs[i].rows == N && blobs[i].cols == N);
                    CV_Assert(blobs[i].type() == bias.type());
                }
            }
        }

        useTimestampDim = params.get<bool>("use_timestamp_dim", true);
        produceCellOutput = params.get<bool>("produce_cell_output", false);
        forgetBias = params.get<float>("forget_bias", 0.0f);
        cellClip = params.get<float>("cell_clip", 0.0f);
        useCellClip = params.get<bool>("use_cell_clip", false);
        usePeephole = params.get<bool>("use_peephole", false);
        reverse = params.get<bool>("reverse", false);

        allocated = false;
        outTailShape.clear();
    }

    void setUseTimstampsDim(bool use) CV_OVERRIDE
    {
        CV_Assert(!allocated);  // only configurable before finalize()
        useTimestampDim = use;
    }

    void setProduceCellOutput(bool produce) CV_OVERRIDE
    {
        CV_Assert(!allocated);
        produceCellOutput = produce;
    }

    void setOutShape(const MatShape &outTailShape_) CV_OVERRIDE
    {
        CV_Assert(!allocated || total(outTailShape) == total(outTailShape_));
        outTailShape = outTailShape_;
    }

    // Installs weight blobs; same shape contract as the constructor checks.
    void setWeights(const Mat &Wh, const Mat &Wx, const Mat &bias) CV_OVERRIDE
    {
        CV_Assert(Wh.dims == 2 && Wx.dims == 2);
        CV_Assert(Wh.rows == Wx.rows);
        CV_Assert(Wh.rows == 4*Wh.cols);
        CV_Assert(Wh.rows == (int)bias.total());
        CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());

        blobs.resize(3);
        blobs[0] = Mat(Wh.clone());
        blobs[1] = Mat(Wx.clone());
        blobs[2] = Mat(bias.clone()).reshape(1, 1);
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert((!usePeephole && blobs.size() == 3) || (usePeephole && blobs.size() == 6));
        CV_Assert(inputs.size() == 1);
        const MatShape& inp0 = inputs[0];

        const Mat &Wh = blobs[0], &Wx = blobs[1];
        int _numOut = Wh.size[1];
        int _numInp = Wx.size[1];
        MatShape outTailShape_(outTailShape), outResShape;

        if (!outTailShape_.empty())
            CV_Assert(total(outTailShape_) == _numOut);
        else
            outTailShape_.assign(1, _numOut);

        int _numSamples;
        if (useTimestampDim)
        {
            CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
            _numSamples = inp0[1];
            outResShape.push_back(inp0[0]);  // keep the time axis in the output
        }
        else
        {
            CV_Assert(inp0.size() >= 2 && total(inp0, 1) == _numInp);
            _numSamples = inp0[0];
        }

        outResShape.push_back(_numSamples);
        outResShape.insert(outResShape.end(), outTailShape_.begin(), outTailShape_.end());

        size_t noutputs = produceCellOutput ? 2 : 1;
        outputs.assign(noutputs, outResShape);

        internals.assign(1, shape(_numSamples, _numOut));  // hInternal
        internals.push_back(shape(_numSamples, _numOut));  // cInternal
        internals.push_back(shape(_numSamples, 1));        // dummyOnes
        internals.push_back(shape(_numSamples, 4*_numOut));// gates

        return false;
    }

    // Caches time/sample counts and the per-timestamp output shape.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> input;
        inputs_arr.getMatVector(input);

        CV_Assert((!usePeephole && blobs.size() == 3) || (usePeephole && blobs.size() == 6));
        CV_Assert(input.size() == 1);
        const Mat& inp0 = input[0];

        Mat &Wh = blobs[0], &Wx = blobs[1];
        int numOut = Wh.size[1];
        int numInp = Wx.size[1];

        if (!outTailShape.empty())
            CV_Assert(total(outTailShape) == numOut);
        else
            outTailShape.assign(1, numOut);

        if (useTimestampDim)
        {
            CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
            numTimeStamps = inp0.size[0];
            numSamples = inp0.size[1];
        }
        else
        {
            CV_Assert(inp0.dims >= 2 && (int)inp0.total(1) == numInp);
            numTimeStamps = 1;
            numSamples = inp0.size[0];
        }

        outTsShape.clear();
        outTsShape.push_back(numSamples);
        outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end());

        allocated = true;
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> input, output, internals;
        inputs_arr.getMatVector(input);
        outputs_arr.getMatVector(output);
        internals_arr.getMatVector(internals);

        const Mat &Wh = blobs[0];
        const Mat &Wx = blobs[1];
        const Mat &bias = blobs[2];

        int numOut = Wh.size[1];

        Mat hInternal = internals[0], cInternal = internals[1],
            dummyOnes = internals[2], gates = internals[3];
        // Initial hidden and cell states are zero; dummyOnes broadcasts bias.
        hInternal.setTo(0.);
        cInternal.setTo(0.);
        dummyOnes.setTo(1.);

        int numSamplesTotal = numTimeStamps*numSamples;
        Mat xTs = input[0].reshape(1, numSamplesTotal);

        Mat hOutTs = output[0].reshape(1, numSamplesTotal);
        Mat cOutTs = produceCellOutput ? output[1].reshape(1, numSamplesTotal) : Mat();

        // Iterate over timestamps, optionally in reverse (backward LSTM half).
        int tsStart, tsEnd, tsInc;
        if (reverse) {
            tsStart = numTimeStamps - 1;
            tsEnd = -1;
            tsInc = -1;
        }
        else {
            tsStart = 0;
            tsEnd = numTimeStamps;
            tsInc = 1;
        }
        for (int ts = tsStart; ts != tsEnd; ts += tsInc)
        {
            Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
            Mat xCurr = xTs.rowRange(curRowRange);

            gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T);      // Wx * x_t
            gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T);  //+Wh * h_{t-1}
            gemm(dummyOnes, bias, 1, gates, 1, gates);          //+b

            // Gate column blocks in (input, forget, output, candidate) order.
            Mat gateI = gates.colRange(0*numOut, 1*numOut);
            Mat gateF = gates.colRange(1*numOut, 2*numOut);
            Mat gateO = gates.colRange(2*numOut, 3*numOut);
            Mat gateG = gates.colRange(3*numOut, 4*numOut);

            if (forgetBias)
                add(gateF, forgetBias, gateF);

            if (usePeephole)
            {
                // Peephole connections feed c_{t-1} into the i and f gates;
                // the o gate gets c_t later, so it is sigmoided separately.
                Mat gatesIF = gates.colRange(0, 2*numOut);
                gemm(cInternal, blobs[3], 1, gateI, 1, gateI);
                gemm(cInternal, blobs[4], 1, gateF, 1, gateF);
                sigmoid(gatesIF, gatesIF);
            }
            else
            {
                Mat gatesIFO = gates.colRange(0, 3*numOut);
                sigmoid(gatesIFO, gatesIFO);
            }

            tanh(gateG, gateG);

            //compute c_t
            multiply(gateF, cInternal, gateF);  // f_t (*) c_{t-1}
            multiply(gateI, gateG, gateI);      // i_t (*) g_t
            add(gateF, gateI, cInternal);       // c_t = f_t (*) c_{t-1} + i_t (*) g_t

            if (useCellClip)
            {
                min(cInternal, cellClip, cInternal);
                max(cInternal, -cellClip, cInternal);
            }

            if (usePeephole)
            {
                gemm(cInternal, blobs[5], 1, gateO, 1, gateO);
                sigmoid(gateO, gateO);
            }

            //compute h_t
            tanh(cInternal, hInternal);
            multiply(gateO, hInternal, hInternal);

            //save results in output blobs
            hInternal.copyTo(hOutTs.rowRange(curRowRange));
            if (produceCellOutput)
                cInternal.copyTo(cOutTs.rowRange(curRowRange));
        }
    }
};
// Public factory: wraps the concrete implementation behind the abstract type.
Ptr<LSTMLayer> LSTMLayer::create(const LayerParams& params)
{
    Ptr<LSTMLayer> layer(new LSTMLayerImpl(params));
    return layer;
}
// Maps the (case-insensitive) input name "x" to index 0; -1 otherwise.
int LSTMLayer::inputNameToIndex(String inputName)
{
    return (toLowerCase(inputName) == "x") ? 0 : -1;
}
int LSTMLayer::outputNameToIndex(const String& outputName)
{
if (toLowerCase(outputName) == "h")
return 0;
else if (toLowerCase(outputName) == "c")
return 1;
return -1;
}
// CPU implementation of a vanilla (Elman-style) RNN with tanh activations:
//   h_t = tanh(W_hh * h_{t-1} + W_xh * x_t + b_h)
//   o_t = tanh(W_ho * h_t + b_o)
// Weights: blobs[0]=W_xh, blobs[1]=b_h, blobs[2]=W_hh, blobs[3]=W_ho,
// blobs[4]=b_o (see setWeights / finalize).
class RNNLayerImpl : public RNNLayer
{
    int numX, numH, numO;  // input, hidden and output feature sizes
    int numSamples, numTimestamps, numSamplesTotal;
    int dtype;
    Mat Whh, Wxh, bh;  // hidden-state update weights
    Mat Who, bo;       // output projection weights
    bool produceH;     // also emit hidden states as a second output

public:
    RNNLayerImpl(const LayerParams& params)
        : numX(0), numH(0), numO(0), numSamples(0), numTimestamps(0), numSamplesTotal(0), dtype(0)
    {
        setParamsFrom(params);
        type = "RNN";
        produceH = false;
    }

    void setProduceHiddenOutput(bool produce = false) CV_OVERRIDE
    {
        produceH = produce;
    }

    // Installs weight blobs after validating their mutual shape constraints.
    void setWeights(const Mat &W_xh, const Mat &b_h, const Mat &W_hh, const Mat &W_ho, const Mat &b_o) CV_OVERRIDE
    {
        CV_Assert(W_hh.dims == 2 && W_xh.dims == 2);
        CV_Assert(W_hh.size[0] == W_xh.size[0] && W_hh.size[0] == W_hh.size[1] && (int)b_h.total() == W_xh.size[0]);
        CV_Assert(W_ho.size[0] == (int)b_o.total());
        CV_Assert(W_ho.size[1] == W_hh.size[1]);

        blobs.resize(5);
        blobs[0] = Mat(W_xh.clone());
        blobs[1] = Mat(b_h.clone());
        blobs[2] = Mat(W_hh.clone());
        blobs[3] = Mat(W_ho.clone());
        blobs[4] = Mat(b_o.clone());
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() >= 1 && inputs.size() <= 2);

        Mat Who_ = blobs[3];
        Mat Wxh_ = blobs[0];

        int numTimestamps_ = inputs[0][0];
        int numSamples_ = inputs[0][1];

        int numO_ = Who_.rows;
        int numH_ = Wxh_.rows;

        outputs.clear();
        int dims[] = {numTimestamps_, numSamples_, numO_};
        outputs.push_back(shape(dims, 3));
        dims[2] = numH_;
        if (produceH)
            outputs.push_back(shape(dims, 3));

        internals.assign(2, shape(numSamples_, numH_));  // hCurr, hPrev
        internals.push_back(shape(numSamples_, 1));      // bias broadcast ones

        return false;
    }

    // Caches weight views and input geometry; reshapes biases to 1-row form.
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> input, outputs;
        inputs_arr.getMatVector(input);
        CV_Assert(input.size() >= 1 && input.size() <= 2);

        Wxh = blobs[0];
        bh  = blobs[1];
        Whh = blobs[2];
        Who = blobs[3];
        bo  = blobs[4];

        numH = Wxh.rows;
        numX = Wxh.cols;
        numO = Who.rows;

        const Mat& inp0 = input[0];

        CV_Assert(inp0.dims >= 2);
        CV_Assert(inp0.total(2) == numX);
        dtype = CV_32F;
        CV_Assert(inp0.type() == dtype);
        numTimestamps = inp0.size[0];
        numSamples = inp0.size[1];
        numSamplesTotal = numTimestamps * numSamples;

        bh = bh.reshape(1, 1);  //is 1 x numH Mat
        bo = bo.reshape(1, 1);  //is 1 x numO Mat
    }

    // (Re)allocates output blobs to the expected 3D shapes.
    void reshapeOutput(std::vector<Mat> &output)
    {
        output.resize(produceH ? 2 : 1);
        int sz0[] = { numTimestamps, numSamples, numO };
        output[0].create(3, sz0, dtype);
        if (produceH)
        {
            int sz1[] = { numTimestamps, numSamples, numH };
            output[1].create(3, sz1, dtype);
        }
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> input, output, internals;
        inputs_arr.getMatVector(input);
        outputs_arr.getMatVector(output);
        internals_arr.getMatVector(internals);

        // Flatten the time axis so each timestamp is a contiguous row block.
        Mat xTs = input[0].reshape(1, numSamplesTotal);
        Mat oTs = output[0].reshape(1, numSamplesTotal);
        Mat hTs = produceH ? output[1].reshape(1, numSamplesTotal) : Mat();
        Mat hCurr = internals[0];
        Mat hPrev = internals[1];
        Mat dummyBiasOnes = internals[2];

        hPrev.setTo(0.);         // zero initial hidden state
        dummyBiasOnes.setTo(1.); // broadcasts the 1-row biases via gemm

        for (int ts = 0; ts < numTimestamps; ts++)
        {
            Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples);
            Mat xCurr = xTs.rowRange(curRowRange);

            gemm(hPrev, Whh, 1, hCurr, 0, hCurr, GEMM_2_T);  // W_{hh} * h_{prev}
            gemm(xCurr, Wxh, 1, hCurr, 1, hCurr, GEMM_2_T);  //+W_{xh} * x_{curr}
            gemm(dummyBiasOnes, bh, 1, hCurr, 1, hCurr);     //+bh
            tanh(hCurr, hPrev);                              // h_t, reused as next hPrev

            Mat oCurr = oTs.rowRange(curRowRange);
            gemm(hPrev, Who, 1, oCurr, 0, oCurr, GEMM_2_T);  // W_{ho} * h_{prev}
            gemm(dummyBiasOnes, bo, 1, oCurr, 1, oCurr);     //+b_o
            tanh(oCurr, oCurr);

            if (produceH)
                hPrev.copyTo(hTs.rowRange(curRowRange));
        }
    }
};
// Public factory: wraps the concrete implementation behind the abstract type.
CV_EXPORTS_W Ptr<RNNLayer> RNNLayer::create(const LayerParams& params)
{
    Ptr<RNNLayer> layer(new RNNLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,422 @@
/*M ///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include "../nms.inl.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/region.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Region layer: final decoding stage of YOLO-style detectors. Takes the raw
// network output laid out as B x H x W x (anchors*(coords+1+classes)) — see
// the assert in getMemoryShapes — applies a logistic squash to the objectness
// score, softmax (YOLO v2) or per-class logistic (YOLO v3) to the class
// scores, decodes anchor-relative box geometry, and optionally runs
// per-class NMS on the decoded detections.
class RegionLayerImpl CV_FINAL : public RegionLayer
{
public:
    int coords, classes, anchors, classfix;
    float thresh, nmsThreshold;
    bool useSoftmax, useLogistic;
#ifdef HAVE_OPENCL
    UMat blob_umat;  // device-side cache of the anchor biases (blobs[0])
#endif

    RegionLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        CV_Assert(blobs.size() == 1);  // blobs[0] holds the anchor biases

        thresh = params.get<float>("thresh", 0.2);
        coords = params.get<int>("coords", 4);
        classes = params.get<int>("classes", 0);
        anchors = params.get<int>("anchors", 5);
        classfix = params.get<int>("classfix", 0);
        useSoftmax = params.get<bool>("softmax", false);
        useLogistic = params.get<bool>("logistic", false);
        nmsThreshold = params.get<float>("nms_threshold", 0.4);

        CV_Assert(nmsThreshold >= 0.);
        CV_Assert(coords == 4);   // only upright 4-coordinate boxes are supported
        CV_Assert(classes >= 1);
        CV_Assert(anchors >= 1);
        // at least one squash mode must be requested (the CUDA path below
        // additionally requires exactly one)
        CV_Assert(useLogistic || useSoftmax);
        if (params.get<bool>("softmax_tree", false))
            CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA;
    }

    // Output is a flat list of detections: one row of inputs[0][3]/anchors
    // values per grid cell and anchor. A leading batch dimension is kept
    // only when batch_size > 1.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() > 0);
        // channels == cell_size*anchors
        CV_Assert(inputs[0][3] == (1 + coords + classes)*anchors);
        int batch_size = inputs[0][0];
        if(batch_size > 1)
            outputs = std::vector<MatShape>(1, shape(batch_size, inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors));
        else
            outputs = std::vector<MatShape>(1, shape(inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors));
        return false;
    }

    // Sigmoid; used for objectness, box offsets and (YOLO v3) class scores.
    float logistic_activate(float x) { return 1.F / (1.F + exp(-x)); }

    // Numerically stable softmax over n scores (max-subtraction trick),
    // with temperature `temp`.
    void softmax_activate(const float* input, const int n, const float temp, float* output)
    {
        int i;
        float sum = 0;
        float largest = -FLT_MAX;
        for (i = 0; i < n; ++i) {
            if (input[i] > largest) largest = input[i];
        }
        for (i = 0; i < n; ++i) {
            float e = exp((input[i] - largest) / temp);
            sum += e;
            output[i] = e;
        }
        for (i = 0; i < n; ++i) {
            output[i] /= sum;
        }
    }

#ifdef HAVE_OPENCL
    // OpenCL path: logistic + softmax kernels on the device, then NMS on the
    // host. Bails out (returns false) for the YOLO v3 logistic mode and for
    // fp16 inputs so the caller falls back to the CPU path.
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        if (blob_umat.empty())
            blobs[0].copyTo(blob_umat);

        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        // TODO: implement a logistic activation to classification scores.
        if (useLogistic || inps.depth() == CV_16S)
            return false;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        CV_Assert(inputs.size() >= 1);
        int const cell_size = classes + coords + 1;

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            UMat& inpBlob = inputs[ii];
            UMat& outBlob = outputs[ii];

            int batch_size = inpBlob.size[0];
            int rows = inpBlob.size[1];
            int cols = inpBlob.size[2];

            // channels == cell_size*anchors, see l. 94
            int sample_size = cell_size*rows*cols*anchors;

            // objectness squash for every (cell, anchor)
            ocl::Kernel logistic_kernel("logistic_activ", ocl::dnn::region_oclsrc);
            size_t nanchors = rows*cols*anchors*batch_size;
            logistic_kernel.set(0, (int)nanchors);
            logistic_kernel.set(1, ocl::KernelArg::PtrReadOnly(inpBlob));
            logistic_kernel.set(2, (int)cell_size);
            logistic_kernel.set(3, ocl::KernelArg::PtrWriteOnly(outBlob));
            logistic_kernel.run(1, &nanchors, NULL, false);

            if (useSoftmax)
            {
                // Yolo v2
                // softmax activation for Probability, for each grid cell (X x Y x Anchor-index)
                ocl::Kernel softmax_kernel("softmax_activ", ocl::dnn::region_oclsrc);
                size_t nanchors = rows*cols*anchors*batch_size;
                softmax_kernel.set(0, (int)nanchors);
                softmax_kernel.set(1, ocl::KernelArg::PtrReadOnly(inpBlob));
                softmax_kernel.set(2, ocl::KernelArg::PtrReadOnly(blob_umat));
                softmax_kernel.set(3, (int)cell_size);
                softmax_kernel.set(4, (int)classes);
                softmax_kernel.set(5, (int)classfix);
                softmax_kernel.set(6, (int)rows);
                softmax_kernel.set(7, (int)cols);
                softmax_kernel.set(8, (int)anchors);
                softmax_kernel.set(9, (float)thresh);
                softmax_kernel.set(10, ocl::KernelArg::PtrWriteOnly(outBlob));
                if (!softmax_kernel.run(1, &nanchors, NULL, false))
                    return false;
            }

            if (nmsThreshold > 0) {
                // NMS runs on the host over the mapped output buffer
                Mat mat = outBlob.getMat(ACCESS_WRITE);
                float *dstData = mat.ptr<float>();
                for (int b = 0; b < batch_size; ++b)
                    do_nms_sort(dstData + b*sample_size, rows*cols*anchors, thresh, nmsThreshold);
            }
        }
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            // fp16 inputs: use the generic convert-and-retry fallback
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        CV_Assert(inputs.size() >= 1);
        CV_Assert(outputs.size() == 1);
        int const cell_size = classes + coords + 1;
        const float* biasData = blobs[0].ptr<float>();

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            Mat &inpBlob = inputs[ii];
            Mat &outBlob = outputs[ii];

            int batch_size = inpBlob.size[0];
            int rows = inpBlob.size[1];
            int cols = inpBlob.size[2];

            // address length for one image in batch, both for input and output
            int sample_size = cell_size*rows*cols*anchors;

            // assert that the comment above is true
            CV_Assert(sample_size*batch_size == inpBlob.total());
            CV_Assert(sample_size*batch_size == outBlob.total());

            CV_Assert(inputs.size() < 2 || inputs[1].dims == 4);
            // normalization reference for box sizes; an optional second input
            // supplies it, otherwise this layer's own grid size is used
            int hNorm = inputs.size() > 1 ? inputs[1].size[2] : rows;
            int wNorm = inputs.size() > 1 ? inputs[1].size[3] : cols;

            const float *srcData = inpBlob.ptr<float>();
            float *dstData = outBlob.ptr<float>();

            // logistic activation for t0, for each grid cell (X x Y x Anchor-index)
            for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
                int index = cell_size*i;
                float x = srcData[index + 4];
                dstData[index + 4] = logistic_activate(x); // logistic activation
            }

            if (useSoftmax) {  // Yolo v2
                // softmax over the class scores of each (cell, anchor)
                for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
                    int index = cell_size*i;
                    softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
                }
            }
            else if (useLogistic) {  // Yolo v3
                // independent sigmoid per class score
                for (int i = 0; i < batch_size*rows*cols*anchors; ++i){
                    int index = cell_size*i;
                    const float* input = srcData + index + 5;
                    float* output = dstData + index + 5;
                    for (int c = 0; c < classes; ++c)
                        output[c] = logistic_activate(input[c]);
                }
            }

            // decode box geometry and gate class scores by objectness
            for (int b = 0; b < batch_size; ++b)
                for (int x = 0; x < cols; ++x)
                    for(int y = 0; y < rows; ++y)
                        for (int a = 0; a < anchors; ++a) {
                            // relative start address for image b within the batch data
                            int index_sample_offset = sample_size*b;
                            int index = (y*cols + x)*anchors + a; // index for each grid-cell & anchor
                            int p_index = index_sample_offset + index * cell_size + 4;
                            float scale = dstData[p_index];
                            if (classfix == -1 && scale < .5) scale = 0; // if(t0 < 0.5) t0 = 0;
                            int box_index = index_sample_offset + index * cell_size;

                            // center in [0,1] grid-relative coords; size scaled by the anchor bias
                            dstData[box_index + 0] = (x + logistic_activate(srcData[box_index + 0])) / cols;
                            dstData[box_index + 1] = (y + logistic_activate(srcData[box_index + 1])) / rows;
                            dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / wNorm;
                            dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / hNorm;

                            int class_index = index_sample_offset + index * cell_size + 5;
                            for (int j = 0; j < classes; ++j) {
                                float prob = scale*dstData[class_index + j]; // prob = IoU(box, object) = t0 * class-probability
                                dstData[class_index + j] = (prob > thresh) ? prob : 0; // if (IoU < threshold) IoU = 0;
                            }
                        }

            if (nmsThreshold > 0) {
                for (int b = 0; b < batch_size; ++b){
                    do_nms_sort(dstData+b*sample_size, rows*cols*anchors, thresh, nmsThreshold);
                }
            }
        }
    }

    // Per-class NMS over `total` decoded detections stored in-place in
    // `detections`; suppressed class scores are zeroed, surviving ones
    // are written back.
    void do_nms_sort(float *detections, int total, float score_thresh, float nms_thresh)
    {
        std::vector<Rect2d> boxes(total);
        std::vector<float> scores(total);

        // convert center/size boxes to top-left-corner rectangles for NMSBoxes
        for (int i = 0; i < total; ++i)
        {
            Rect2d &b = boxes[i];
            int box_index = i * (classes + coords + 1);
            b.width = detections[box_index + 2];
            b.height = detections[box_index + 3];
            b.x = detections[box_index + 0] - b.width / 2;
            b.y = detections[box_index + 1] - b.height / 2;
        }

        std::vector<int> indices;
        for (int k = 0; k < classes; ++k)
        {
            // extract the scores of class k and clear them in the output
            for (int i = 0; i < total; ++i)
            {
                int box_index = i * (classes + coords + 1);
                int class_index = box_index + 5;
                scores[i] = detections[class_index + k];
                detections[class_index + k] = 0;
            }
            NMSBoxes(boxes, scores, score_thresh, nms_thresh, indices);
            // restore only the scores of the boxes NMS kept
            for (int i = 0, n = indices.size(); i < n; ++i)
            {
                int box_index = indices[i] * (classes + coords + 1);
                int class_index = box_index + 5;
                detections[class_index + k] = scores[indices[i]];
            }
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        if (coords != 4)
            CV_Error(Error::StsNotImplemented, "Only upright rectangular boxes are supported in RegionLayer.");

        // normalization reference: own grid, or the optional second input
        std::size_t height_norm, width_norm;
        if (inputs.size() == 1)
        {
            auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
            auto input_shape = input_wrapper->getShape();
            height_norm = input_shape[1];
            width_norm = input_shape[2];
        }
        else
        {
            auto input_wrapper = inputs[1].dynamicCast<CUDABackendWrapper>();
            auto input_shape = input_wrapper->getShape();
            CV_Assert(input_shape.size() == 4);
            height_norm = input_shape[2];
            width_norm = input_shape[3];
        }

        cuda4dnn::SquashMethod squash_method;
        if(useLogistic)
            squash_method = cuda4dnn::SquashMethod::SIGMOID;
        else if (useSoftmax)
            squash_method = cuda4dnn::SquashMethod::SOFTMAX;

        /* exactly one must be true */
        CV_Assert((useLogistic || useSoftmax) && !(useLogistic && useSoftmax));

        cuda4dnn::RegionConfiguration<float> config;
        config.squash_method = squash_method;
        config.classes = classes;
        config.boxes_per_cell = anchors;

        config.height_norm = height_norm;
        config.width_norm = width_norm;

        // mirrors the `classfix == -1 && scale < .5` gating of the CPU path
        config.object_prob_cutoff = (classfix == -1) ? 0.5 : 0.0;
        config.class_prob_cutoff = thresh;

        config.nms_iou_threshold = nmsThreshold;

        return make_cuda_node<cuda4dnn::RegionOp>(preferableTarget, std::move(context->stream), blobs[0], config);
    }
#endif

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        int64 flops = 0;
        // rough estimate: ~60 ops per input element
        for(int i = 0; i < inputs.size(); i++)
        {
            flops += 60*total(inputs[i]);
        }
        return flops;
    }
};
// Factory: builds the default implementation of the YOLO region layer.
Ptr<RegionLayer> RegionLayer::create(const LayerParams& params)
{
    Ptr<RegionLayer> layer(new RegionLayerImpl(params));
    return layer;
}
} // namespace dnn
} // namespace cv

View File

@@ -0,0 +1,251 @@
/*M ///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#include <ngraph/op/experimental/layers/reorg_yolo.hpp>
#endif
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/dnn/all_layers.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/reorg.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Reorg layer (YOLO): moves spatial blocks of reorgStride x reorgStride
// pixels into the channel dimension, turning an N x C x H x W input into
// N x (C*s*s) x (H/s) x (W/s). The data movement is delegated to an
// internal PermuteLayer operating on reshaped views of the tensors.
class ReorgLayerImpl CV_FINAL : public ReorgLayer
{
    int reorgStride;  // spatial block size `s`
public:
    ReorgLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);

        reorgStride = params.get<int>("reorg_stride", 2);
        CV_Assert(reorgStride > 0);
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() > 0);
        outputs = std::vector<MatShape>(inputs.size(), shape(
            inputs[0][0],
            inputs[0][1] * reorgStride * reorgStride,
            inputs[0][2] / reorgStride,
            inputs[0][3] / reorgStride));
        // reject inputs whose spatial size is not divisible by the stride
        CV_Assert(outputs[0][0] > 0 && outputs[0][1] > 0 && outputs[0][2] > 0 && outputs[0][3] > 0);
        CV_Assert(total(outputs[0]) == total(inputs[0]));
        return false;
    }

    // Precompute the reshape geometry and configure the internal permute
    // layer that performs the actual reordering.
    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        Mat inp = inputs[0];
        Mat out = outputs[0];
        int batchSize = inp.size[0];

        LayerParams permParams;
        if (batchSize == 1)
        {
            // 4-D view: [(C*H)/(s*s), s, W, s] permuted by this order
            int order[] = {1, 3, 0, 2};
            permParams.set("order", DictValue::arrayInt(&order[0], 4));

            permuteInpShape.resize(4);
            permuteInpShape[0] = inp.size[1] * inp.size[2] / (reorgStride * reorgStride); // (channels*height)/(r*r)
            permuteInpShape[1] = reorgStride;
            permuteInpShape[2] = inp.size[3]; // width
            permuteInpShape[3] = reorgStride;

            permuteOutShape.resize(4);
            for (int i = 0; i < 4; ++i)
                permuteOutShape[i] = permuteInpShape[order[i]];
        }
        else
        {
            // with a batch dimension a 5-D permutation is needed
            int order[] = {0, 2, 4, 1, 3};
            permParams.set("order", DictValue::arrayInt(&order[0], 5));

            permuteInpShape.resize(5);
            permuteInpShape[0] = batchSize;
            permuteInpShape[1] = inp.size[1] * inp.size[2] / (reorgStride * reorgStride); // (channels*height)/(r*r)
            permuteInpShape[2] = reorgStride;
            permuteInpShape[3] = inp.size[3]; // width
            permuteInpShape[4] = reorgStride;

            permuteOutShape.resize(5);
            for (int i = 0; i < 5; ++i)
                permuteOutShape[i] = permuteInpShape[order[i]];
        }
        permute = PermuteLayer::create(permParams);
        std::vector<Mat> permuteInputs(1, inp.reshape(1, permuteInpShape));
        std::vector<Mat> permuteOutputs(1, out.reshape(1, permuteOutShape));
        permute->finalize(permuteInputs, permuteOutputs);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
    }

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        // run the permutation configured in finalize() on reshaped views
        inputs[0] = inputs[0].reshape(1, permuteInpShape.size(), &permuteInpShape[0]);
        outputs[0] = outputs[0].reshape(1, permuteOutShape.size(), &permuteOutShape[0]);
        permute->preferableTarget = preferableTarget;
        permute->forward(inputs, outputs, internals);
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            // fp16 inputs: use the generic convert-and-retry fallback
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        // the permutation configured in finalize() does all the work
        inputs[0] = inputs[0].reshape(1, permuteInpShape);
        outputs[0] = outputs[0].reshape(1, permuteOutShape);
        permute->forward(inputs, outputs, internals_arr);
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::ReorgOp>(preferableTarget, std::move(context->stream), reorgStride);
    }
#endif

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::ReorgYoloLayer ieLayer(name);
        ieLayer.setStride(reorgStride);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        auto reorg = std::make_shared<ngraph::op::ReorgYolo>(ieInpNode, ngraph::Strides{(size_t)reorgStride});
        return Ptr<BackendNode>(new InfEngineNgraphNode(reorg));
    }
#endif  // HAVE_DNN_NGRAPH

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        int64 flops = 0;
        // rough estimate: ~21 ops per input element
        for(int i = 0; i < inputs.size(); i++)
        {
            flops += 21*total(inputs[i]);
        }
        return flops;
    }

private:
    Ptr<PermuteLayer> permute;                          // performs the actual reordering
    std::vector<int> permuteInpShape, permuteOutShape;  // reshape geometry set in finalize()
};
// Factory: builds the permute-based reorg layer implementation.
Ptr<ReorgLayer> ReorgLayer::create(const LayerParams& params)
{
    Ptr<ReorgLayer> layer(new ReorgLayerImpl(params));
    return layer;
}
} // namespace dnn
} // namespace cv

View File

@@ -0,0 +1,318 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/reshape.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Computes the destination shape produced by applying a Caffe-style reshape
// mask `maskShape` to `srcShape` over the axis sub-range `srcRange`.
// Mask entry semantics:
//   > 0   explicit output dimension;
//   == 0  copy the corresponding source dimension;
//   == -1 infer this dimension from the total element count (at most one).
// When every mask value is positive, the source sub-range that the mask
// replaces is relocated by scanning from the end of `srcRange` until the
// product of source dims equals the mask's total.
static void computeShapeByReshapeMask(const MatShape &srcShape,
                                      const MatShape &maskShape,
                                      Range srcRange /*= Range::all()*/,
                                      MatShape& dstShape)
{
    int srcShapeSize = (int)srcShape.size();
    int maskShapeSize = (int)maskShape.size();

    if (srcRange == Range::all())
        srcRange = Range(0, srcShapeSize);
    else
    {
        // clamp the start into the shape and resolve an open-ended range
        int sz = srcRange.size();
        srcRange.start = clamp(srcRange.start, srcShapeSize);
        srcRange.end = srcRange.end == INT_MAX ? srcShapeSize : srcRange.start + sz;
    }

    bool explicitMask = !maskShape.empty();  // All mask values are positive.
    for (int i = 0, n = maskShape.size(); i < n && explicitMask; ++i)
    {
        explicitMask = maskShape[i] > 0;
    }
    // Working range of source shape is a range where area(src) == area(mask).
    if (explicitMask)
    {
        int maskTotal = total(maskShape);
        // Go from the end of mask until we collect required total.
        bool matched = false;
        for (int i = srcRange.end - 1; i >= srcRange.start; --i)
        {
            if (matched)
            {
                // extend the matched range leftwards while the total is preserved
                if (total(srcShape, i, srcRange.end) != maskTotal)
                {
                    srcRange.start = i + 1;
                    break;
                }
                else if (i == 0)
                {
                    srcRange.start = 0;
                    break;
                }
            }
            else
            {
                matched = total(srcShape, i, srcRange.end) == maskTotal;
            }
        }
        // absorb any leading unit dims still needed to reach maskTotal
        while (total(srcShape, srcRange.start, srcRange.end) != maskTotal && srcRange.start > 0)
        {
            srcRange.start -= 1;
        }
        CV_Assert(total(srcShape, srcRange.start, srcRange.end) == maskTotal);
    }

    CV_Assert(0 <= srcRange.start && srcRange.start <= srcRange.end && srcRange.end <= srcShapeSize);
    int dstShapeSize = srcShapeSize - srcRange.size() + maskShapeSize;
    dstShape.resize(dstShapeSize);

    // copy the untouched leading and trailing source dimensions
    std::copy(srcShape.begin(), srcShape.begin() + srcRange.start, dstShape.begin());
    std::copy(srcShape.begin() + srcRange.end, srcShape.begin() + srcShapeSize, dstShape.begin() + srcRange.start + maskShapeSize);

    int inferDim = -1;  // index of the single -1 (inferred) dim, if any
    for (int i = 0; i < maskShapeSize; i++)
    {
        if (maskShape[i] > 0)
        {
            dstShape[srcRange.start + i] = maskShape[i];
        }
        else if (maskShape[i] == 0)
        {
            if (srcRange.start + i >= srcShapeSize)
                CV_Error(Error::StsBadArg, format("Copy dim[%d] (which has zero size) is out of the source shape bounds", srcRange.start + i));
            dstShape[srcRange.start + i] = srcShape[srcRange.start + i];
        }
        else if (maskShape[i] == -1)
        {
            if (inferDim != -1)
                CV_Error(Error::StsAssert, "Duplicate of inferred dim (which is denoted by -1)");
            inferDim = srcRange.start + i;
            dstShape[inferDim] = 1;  // placeholder so dstTotal below is well-defined
        }
        else
            CV_Error(Error::StsBadArg, "maskShape[i] >= -1");
    }

    size_t srcTotal = total(srcShape);
    size_t dstTotal = total(dstShape);
    CV_Assert(dstTotal != 0);

    if (inferDim != -1)
    {
        // fill the inferred dim so that element counts match exactly
        if (srcTotal % dstTotal != 0)
            CV_Error(Error::StsBackTrace, "Can't infer a dim denoted by -1");

        dstShape[inferDim] = (int)(srcTotal / dstTotal);
    }
    else
    {
        CV_Assert(srcTotal == dstTotal);
    }
}
// Reshape layer: changes tensor dimensions without moving data when the
// output buffer aliases the input; otherwise copies the reshaped view.
// The target shape comes either from the "dim"/"axis"/"num_axes" params
// (resolved by computeShapeByReshapeMask) or from a second input blob.
class ReshapeLayerImpl CV_FINAL : public ReshapeLayer
{
public:
    ReshapeLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        int axis = params.get<int>("axis", 0);
        int numAxes = params.get<int>("num_axes", -1);
        CV_Assert(numAxes >= -1);
        // source-axis range the "dim" mask applies to; num_axes == -1 means
        // "to the end" (resolved later against the actual rank)
        newShapeRange = (numAxes == -1) ? Range(axis, INT_MAX) : Range(axis, axis + numAxes);

        newShapeDesc.clear();
        if (params.has("dim"))
        {
            const DictValue &paramShape = params.get("dim");
            int i, dims = paramShape.size();
            newShapeDesc.resize(dims);
            for (i = 0; i < dims; i++)
                newShapeDesc[i] = paramShape.get<int>(i);
        }
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine());
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        if (inputs.size() == 1 || inputs.size() == requiredOutputs)
        {
            outputs.clear();
            for (size_t i = 0; i < inputs.size(); i++)
            {
                outputs.push_back(MatShape());
                computeShapeByReshapeMask(inputs[i], newShapeDesc, newShapeRange, outputs.back());
            }
        }
        else
        {
            // the second input's shape is itself the target shape
            CV_Assert_N(inputs.size() == 2, total(inputs[0]) == total(inputs[1]));
            outputs.assign(1, inputs[1]);
        }
        return true;
    }

    void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> outputs;
        outputs_arr.getMatVector(outputs);
        CV_Assert(!outputs.empty());
        // cache the resolved output shapes for forward() and the backends
        outShapes.resize(outputs.size());
        for (int i = 0; i < outputs.size(); ++i)
            outShapes[i] = shape(outputs[i]);
    }

    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        for (size_t i = 0; i < outputs.size(); i++)
        {
            UMat srcBlob = inputs[i];
            void *src_handle = inputs[i].handle(ACCESS_READ);
            void *dst_handle = outputs[i].handle(ACCESS_WRITE);
            // copy only when input and output do not share the same buffer
            if (src_handle != dst_handle)
            {
                UMat umat = srcBlob.reshape(1, (int)outShapes[i].size(), &outShapes[i][0]);
                umat.copyTo(outputs[i]);
            }
        }
        outs.assign(outputs);
        return true;
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        for (size_t i = 0; i < outputs.size(); i++)
        {
            Mat srcBlob = inputs[i];
            // aliasing buffers make the reshape a no-op; otherwise copy the view
            if (outputs[i].data != srcBlob.data)
                srcBlob.reshape(1, shape(outputs[i])).copyTo(outputs[i]);
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
    }
#endif

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::Builder::ReshapeLayer ieLayer(name);
        CV_Assert(outShapes.size() == 1);
        ieLayer.setDims(outShapes[0]);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert(outShapes.size() == 1);
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        std::vector<int64_t> out(outShapes[0].begin(), outShapes[0].end());
        auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
                                                            ngraph::Shape{out.size()}, out.data());
        auto reshape = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, shape, true);
        return Ptr<BackendNode>(new InfEngineNgraphNode(reshape));
    }
#endif  // HAVE_DNN_NGRAPH

private:
    std::vector<MatShape> outShapes;  // resolved output shapes, set in finalize()
};
// Factory: builds the default reshape layer implementation.
Ptr<ReshapeLayer> ReshapeLayer::create(const LayerParams& params)
{
    Ptr<ReshapeLayer> layer(new ReshapeLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,361 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/imgproc.hpp>
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#include <ngraph/op/experimental/layers/interpolate.hpp>
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/resize.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
class ResizeLayerImpl : public ResizeLayer
{
public:
ResizeLayerImpl(const LayerParams& params) : zoomFactorWidth(0), zoomFactorHeight(0), scaleWidth(0), scaleHeight(0)
{
setParamsFrom(params);
outWidth = params.get<float>("width", 0);
outHeight = params.get<float>("height", 0);
if (params.has("zoom_factor"))
{
CV_Assert(!params.has("zoom_factor_x") && !params.has("zoom_factor_y"));
zoomFactorWidth = zoomFactorHeight = params.get<int>("zoom_factor");
}
else if (params.has("zoom_factor_x") || params.has("zoom_factor_y"))
{
CV_Assert(params.has("zoom_factor_x") && params.has("zoom_factor_y"));
zoomFactorWidth = params.get<int>("zoom_factor_x");
zoomFactorHeight = params.get<int>("zoom_factor_y");
}
interpolation = params.get<String>("interpolation");
CV_Assert(interpolation == "nearest" || interpolation == "bilinear");
alignCorners = params.get<bool>("align_corners", false);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert_N(inputs.size() == 1, inputs[0].size() == 4);
outputs.resize(1, inputs[0]);
outputs[0][2] = outHeight > 0 ? outHeight : (outputs[0][2] * zoomFactorHeight);
outputs[0][3] = outWidth > 0 ? outWidth : (outputs[0][3] * zoomFactorWidth);
// We can work in-place (do nothing) if input shape == output shape.
return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (backendId == DNN_BACKEND_CUDA)
return interpolation == "nearest" || interpolation == "bilinear";
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
return (interpolation == "nearest" && scaleWidth == scaleHeight) ||
(interpolation == "bilinear");
}
#endif
return backendId == DNN_BACKEND_OPENCV;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
if (!outWidth && !outHeight)
{
outHeight = outputs[0].size[2];
outWidth = outputs[0].size[3];
}
if (alignCorners && outHeight > 1)
scaleHeight = static_cast<float>(inputs[0].size[2] - 1) / (outHeight - 1);
else
scaleHeight = static_cast<float>(inputs[0].size[2]) / outHeight;
if (alignCorners && outWidth > 1)
scaleWidth = static_cast<float>(inputs[0].size[3] - 1) / (outWidth - 1);
else
scaleWidth = static_cast<float>(inputs[0].size[3]) / outWidth;
}
// Performs the resize on CPU. Nearest mode delegates to cv::resize per plane;
// bilinear mode is interpolated by hand over all N*C planes at once.
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());
    // FP16 blobs are processed by the generic fallback (converts to FP32 and back).
    if (inputs_arr.depth() == CV_16S)
    {
        forward_fallback(inputs_arr, outputs_arr, internals_arr);
        return;
    }
    std::vector<Mat> inputs, outputs, internals;
    inputs_arr.getMatVector(inputs);
    outputs_arr.getMatVector(outputs);
    internals_arr.getMatVector(internals);
    // In-place no-op: shapes match, so input data is already the output (see getMemoryShapes).
    if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3])
        return;
    Mat& inp = inputs[0];
    Mat& out = outputs[0];
    if (interpolation == "nearest")
    {
        // Resize each (n, ch) spatial plane independently.
        for (size_t n = 0; n < inputs[0].size[0]; ++n)
        {
            for (size_t ch = 0; ch < inputs[0].size[1]; ++ch)
            {
                resize(getPlane(inp, n, ch), getPlane(out, n, ch),
                       Size(outWidth, outHeight), 0, 0, INTER_NEAREST);
            }
        }
    }
    else if (interpolation == "bilinear")
    {
        const int inpHeight = inp.size[2];
        const int inpWidth = inp.size[3];
        const int inpSpatialSize = inpHeight * inpWidth;
        const int outSpatialSize = outHeight * outWidth;
        const int numPlanes = inp.size[0] * inp.size[1];
        // Continuity is required: the inner loop strides across planes by raw
        // pointer arithmetic (inpSpatialSize / outSpatialSize element jumps).
        CV_Assert_N(inp.isContinuous(), out.isContinuous());
        Mat inpPlanes = inp.reshape(1, numPlanes * inpHeight);
        Mat outPlanes = out.reshape(1, numPlanes * outHeight);
        for (int y = 0; y < outHeight; ++y)
        {
            // Source coordinate; no half-pixel offset is applied here.
            float input_y = y * scaleHeight;
            int y0 = static_cast<int>(input_y);
            const float* inpData_row0 = inpPlanes.ptr<float>(y0);
            const float* inpData_row1 = inpPlanes.ptr<float>(std::min(y0 + 1, inpHeight - 1));
            for (int x = 0; x < outWidth; ++x)
            {
                float input_x = x * scaleWidth;
                int x0 = static_cast<int>(input_x);
                int x1 = std::min(x0 + 1, inpWidth - 1);
                float* outData = outPlanes.ptr<float>(y, x);
                const float* inpData_row0_c = inpData_row0;
                const float* inpData_row1_c = inpData_row1;
                // Standard bilinear blend of the 4 neighbours, repeated for every plane.
                for (int c = 0; c < numPlanes; ++c)
                {
                    *outData = inpData_row0_c[x0] +
                        (input_y - y0) * (inpData_row1_c[x0] - inpData_row0_c[x0]) +
                        (input_x - x0) * (inpData_row0_c[x1] - inpData_row0_c[x0] +
                        (input_y - y0) * (inpData_row1_c[x1] - inpData_row0_c[x1] - inpData_row1_c[x0] + inpData_row0_c[x0]));
                    // Advance to the same (y, x) location in the next plane.
                    inpData_row0_c += inpSpatialSize;
                    inpData_row1_c += inpSpatialSize;
                    outData += outSpatialSize;
                }
            }
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown interpolation: " + interpolation);
}
#ifdef HAVE_CUDA
    // Builds the CUDA backend node: maps the interpolation string onto the
    // cuda4dnn enum and forwards the precomputed scale factors.
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        cuda4dnn::InterpolationType itype;
        if (interpolation == "nearest")
            itype = InterpolationType::NEAREST_NEIGHBOUR;
        else if (interpolation == "bilinear")
            itype = InterpolationType::BILINEAR;
        else
            CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer.");
        return make_cuda_node<cuda4dnn::ResizeOp>(preferableTarget, std::move(context->stream), itype, scaleHeight, scaleWidth);
    }
#endif
    // Builds the Inference Engine (NN Builder) node: "Resample" for nearest
    // (single uniform factor only), "Interp" for bilinear.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
#ifdef HAVE_INF_ENGINE
        InferenceEngine::Builder::Layer ieLayer(name);
        ieLayer.setName(name);
        if (interpolation == "nearest")
        {
            ieLayer.setType("Resample");
            ieLayer.getParameters()["type"] = std::string("caffe.ResampleParameter.NEAREST");
            ieLayer.getParameters()["antialias"] = false;
            // Resample takes one scalar factor, so anisotropic scaling is unsupported.
            if (scaleWidth != scaleHeight)
                CV_Error(Error::StsNotImplemented, "resample with sw != sh");
            // scaleWidth is input/output, IE expects output/input — hence the inverse.
            ieLayer.getParameters()["factor"] = 1.0f / scaleWidth;
        }
        else if (interpolation == "bilinear")
        {
            ieLayer.setType("Interp");
            ieLayer.getParameters()["pad_beg"] = 0;
            ieLayer.getParameters()["pad_end"] = 0;
            ieLayer.getParameters()["align_corners"] = false;
        }
        else
            CV_Error(Error::StsNotImplemented, "Unsupported interpolation: " + interpolation);
        ieLayer.getParameters()["width"] = outWidth;
        ieLayer.getParameters()["height"] = outHeight;
        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
#ifdef HAVE_DNN_NGRAPH
    // Builds the nGraph Interpolate node over the spatial axes {2, 3} with the
    // explicit target size (outHeight, outWidth).
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        ngraph::op::InterpolateAttrs attrs;
        attrs.pads_begin.push_back(0);
        attrs.pads_end.push_back(0);
        attrs.axes = ngraph::AxisSet{2, 3};
        attrs.align_corners = false;
        if (interpolation == "nearest") {
            attrs.mode = "nearest";
            attrs.antialias = false;
        } else if (interpolation == "bilinear") {
            attrs.mode = "linear";
        } else {
            CV_Error(Error::StsNotImplemented, "Unsupported interpolation: " + interpolation);
        }
        // Output spatial size passed as a constant second input.
        std::vector<int64_t> shape = {outHeight, outWidth};
        auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
        auto interp = std::make_shared<ngraph::op::Interpolate>(ieInpNode, out_shape, attrs);
        return Ptr<BackendNode>(new InfEngineNgraphNode(interp));
    }
#endif  // HAVE_DNN_NGRAPH
protected:
    // Target spatial size; zero means "derive from zoom factors in finalize()".
    int outWidth, outHeight, zoomFactorWidth, zoomFactorHeight;
    String interpolation;          // "nearest" or "bilinear"
    float scaleWidth, scaleHeight; // input/output size ratios, computed in finalize()
    bool alignCorners;             // corner pixels of input and output coincide
};
Ptr<ResizeLayer> ResizeLayer::create(const LayerParams& params)
{
    // Factory entry point: hide the concrete implementation behind the public interface.
    Ptr<ResizeLayer> layer(new ResizeLayerImpl(params));
    return layer;
}
// Caffe-style Interp layer: bilinear resize whose zoom-factor based output size
// is 1 + factor*(size - 1) and whose scale is always computed align-corners style.
class InterpLayerImpl CV_FINAL : public ResizeLayerImpl
{
public:
    InterpLayerImpl(const LayerParams& params) : ResizeLayerImpl(params) {}

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert_N(inputs.size() == 1, inputs[0].size() == 4);
        outputs.resize(1, inputs[0]);
        // Caffe Interp formula: output = 1 + factor * (input - 1).
        outputs[0][2] = outHeight > 0 ? outHeight : (1 + zoomFactorHeight * (outputs[0][2] - 1));
        outputs[0][3] = outWidth > 0 ? outWidth : (1 + zoomFactorWidth * (outputs[0][3] - 1));
        // We can work in-place (do nothing) if input shape == output shape.
        return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
#ifdef HAVE_INF_ENGINE
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019
            || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
            return true;
#endif
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA;
    }

    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        if (!outWidth && !outHeight)
        {
            outHeight = outputs[0].size[2];
            outWidth = outputs[0].size[3];
        }
        int inpHeight = inputs[0].size[2];
        int inpWidth = inputs[0].size[3];
        // Align-corners scale: (in-1)/(out-1); zero for degenerate 1-pixel outputs.
        scaleHeight = (outHeight > 1) ? (static_cast<float>(inpHeight - 1) / (outHeight - 1)) : 0.f;
        scaleWidth = (outWidth > 1) ? (static_cast<float>(inpWidth - 1) / (outWidth - 1)) : 0.f;
    }

#ifdef HAVE_INF_ENGINE
    // IE NN Builder node: always an "Interp" layer with an explicit output size.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::Layer ieLayer(name);
        ieLayer.setName(name);
        ieLayer.setType("Interp");
        ieLayer.getParameters()["pad_beg"] = 0;
        ieLayer.getParameters()["pad_end"] = 0;
        ieLayer.getParameters()["width"] = outWidth;
        ieLayer.getParameters()["height"] = outHeight;
        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    // nGraph Interpolate node in "linear" mode over spatial axes {2, 3}.
    // NOTE(review): attrs.align_corners is left at its default here, unlike the
    // base class which sets it explicitly — confirm this matches the intent.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        ngraph::op::InterpolateAttrs attrs;
        attrs.pads_begin.push_back(0);
        attrs.pads_end.push_back(0);
        attrs.axes = ngraph::AxisSet{2, 3};
        attrs.mode = "linear";
        std::vector<int64_t> shape = {outHeight, outWidth};
        auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
        auto interp = std::make_shared<ngraph::op::Interpolate>(ieInpNode, out_shape, attrs);
        return Ptr<BackendNode>(new InfEngineNgraphNode(interp));
    }
#endif  // HAVE_DNN_NGRAPH
};
Ptr<Layer> InterpLayer::create(const LayerParams& params)
{
    // Interp is realized as a bilinear resize layer with Caffe-style output
    // size computation (see InterpLayerImpl::getMemoryShapes).
    LayerParams lp(params);
    lp.set("interpolation", "bilinear");
    Ptr<Layer> layer(new InterpLayerImpl(lp));
    return layer;
}
} // namespace dnn
} // namespace cv

View File

@@ -0,0 +1,325 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2016, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
/*
Implementation of Scale layer.
*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/scale_shift.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Scale layer: per-channel (or per-slice) affine transform out = in * w + b.
// Weights/bias either come from trainable blobs or, when blobs is empty,
// the weights are provided as a second runtime input.
class ScaleLayerImpl CV_FINAL : public ScaleLayer
{
public:
    ScaleLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        hasBias = params.get<bool>("bias_term", false);
        axis = params.get<int>("axis", 1);
        // hasWeights is resolved in finalize() once the blob count is known.
        hasWeights = false;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        // Output shape always matches the first input.
        outputs.assign(1, inputs[0]);
        return true;
    }

    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);
        // Two blobs => weights + bias; one blob => weights unless bias_term is set.
        hasWeights = blobs.size() == 2 || (blobs.size() == 1 && !hasBias);
        CV_Assert((inputs.size() == 2 && blobs.empty()) || blobs.size() == (int)hasWeights + (int)hasBias);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               backendId == DNN_BACKEND_HALIDE ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && axis == 1);
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
        // FP16 blobs go through the generic FP32 fallback.
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        CV_Assert_N(outputs.size() == 1, !blobs.empty() || inputs.size() == 2);
        Mat &inpBlob = inputs[0];
        Mat &outBlob = outputs[0];
        // There is a mode when we multiply a first blob by a second one
        // instead of trainable weights.
        Mat weights = blobs.empty() ? inputs[1] : (hasWeights ? blobs[0] : Mat());
        Mat bias = hasBias ? blobs.back().reshape(1, 1) : Mat();
        if (!weights.empty())
            weights = weights.reshape(1, 1);
        MatShape inpShape = shape(inpBlob);
        const int numWeights = !weights.empty() ? weights.total() : bias.total();
        CV_Assert(numWeights != 0);
        if (hasWeights && hasBias)
            CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs");
        // Find the contiguous axis range [axis, endAxis) whose total equals numWeights;
        // the weights broadcast over everything before and after that range.
        int endAxis;
        for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis)
        {
            if (total(inpShape, axis, endAxis) == numWeights)
                break;
        }
        CV_Assert(total(inpShape, axis, endAxis) == numWeights);
        CV_Assert(!hasBias || numWeights == bias.total());
        CV_CheckTypeEQ(inpBlob.type(), CV_32FC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_32FC1, "");
        int numSlices = total(inpShape, 0, axis);
        float* inpData = (float*)inpBlob.data;
        float* outData = (float*)outBlob.data;
        if (endAxis != inpBlob.dims)
        {
            // Trailing spatial dims remain: apply one (w, b) pair per inner slice
            // via convertTo (out = in * w + b).
            float* weightsData = !weights.empty() ? (float*)weights.data : 0;
            float* biasesData = hasBias ? (float*)bias.data : 0;
            int spatialSize = total(inpShape, endAxis); // spatialSize != 1
            for (int i = 0; i < numSlices; ++i)
            {
                for (int j = 0; j < numWeights; ++j)
                {
                    float w = weightsData ? weightsData[j] : 1;
                    float b = biasesData ? biasesData[j] : 0;
                    Mat inpSlice(1, spatialSize, CV_32F, inpData);
                    Mat outSlice(1, spatialSize, CV_32F, outData);
                    inpSlice.convertTo(outSlice, CV_32F, w, b);
                    inpData += spatialSize;
                    outData += spatialSize;
                }
            }
        }
        else
        {
            // Weights cover all trailing dims: element-wise multiply/add per slice.
            for (int i = 0; i < numSlices; ++i)
            {
                Mat inpSlice(1, numWeights, CV_32F, inpData);
                Mat outSlice(1, numWeights, CV_32F, outData);
                if (!weights.empty())
                {
                    multiply(inpSlice, weights, outSlice);
                    if (hasBias)
                        add(outSlice, bias, outSlice);
                }
                else if (hasBias)
                    add(inpSlice, bias, outSlice);
                inpData += numWeights;
                outData += numWeights;
            }
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        CV_Assert(!blobs.empty() || inputs.size() == 2);
        cv::Mat weightsMat = hasWeights ? blobs[0] : Mat();
        /* if the weights are provided, bias will be in blobs[1]; otherwise, it will be in blobs[0]
         * in either case, it is at the end of the blobs vector => bias = blobs.back()
         */
        cv::Mat biasMat = hasBias ? blobs.back() : Mat();
        return make_cuda_node<cuda4dnn::ScaleShiftOp>(preferableTarget, std::move(context->stream), axis, weightsMat, biasMat);
    }
#endif

    // Fuse this layer into a preceding Halide node when possible.
    virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
    {
        switch (node->backendId)
        {
            case DNN_BACKEND_HALIDE:
            {
#ifdef HAVE_HALIDE
                auto base = node.dynamicCast<HalideBackendNode>();
                Halide::Func& input = base->funcs.back();
                Halide::Var x("x"), y("y"), c("c"), n("n");
                Halide::Func top = attachHalide(input(x, y, c, n));
                return Ptr<BackendNode>(new HalideBackendNode(base, top));
#endif  // HAVE_HALIDE
                break;
            }
        }
        return Ptr<BackendNode>();
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> input = halideBuffer(inputs[0]);
        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = attachHalide(input(x, y, c, n));
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_HALIDE
    // attachHalide can work both with Halide::Buffer and Halide::Func. In the
    // second case it will be a fusion.
    Halide::Func attachHalide(const Halide::Expr& input)
    {
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Var x("x"), y("y"), c("c"), n("n");
        const int numChannels = blobs[0].total();
        Halide::Expr topExpr = input;
        if (hasWeights)
        {
            auto weights = wrapToHalideBuffer(blobs[0], {numChannels});
            topExpr *= weights(c);
        }
        if (hasBias)
        {
            auto bias = wrapToHalideBuffer(blobs.back(), {numChannels});
            topExpr += bias(c);
        }
        top(x, y, c, n) = topExpr;
        return top;
    }
#endif  // HAVE_HALIDE

#ifdef HAVE_INF_ENGINE
    // IE ScaleShift node; when weights are absent they are filled with ones so
    // only the bias takes effect.
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::Layer l = InferenceEngine::Builder::ScaleShiftLayer(name);
        CV_Assert(!blobs.empty());
        const size_t numChannels = blobs[0].total();
        if (hasWeights)
        {
            addConstantData("weights", wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C), l);
        }
        else
        {
            auto weights = InferenceEngine::make_shared_blob<float>({
                               InferenceEngine::Precision::FP32, {(size_t)numChannels},
                               InferenceEngine::Layout::C
                           });
            weights->allocate();
            float* buf = weights->buffer().as<float*>();
            std::fill(buf, buf + numChannels, 1);
            addConstantData("weights", weights, l);
        }
        if (hasBias)
            addConstantData("biases", wrapToInfEngineBlob(blobs.back(), {numChannels}, InferenceEngine::Layout::C), l);
        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    // nGraph: Multiply + Add with NUMPY broadcasting; missing weights/bias are
    // replaced by constant ones/zeros.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert(!blobs.empty());
        const size_t numChannels = blobs[0].total();
        auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
        shape[1] = numChannels;
        auto weight = hasWeights ?
                      std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
                                                             ngraph::Shape(shape), blobs[0].data) :
                      std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
                                                             ngraph::Shape(shape), std::vector<float>(numChannels, 1).data());
        auto bias = hasBias ?
                    std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
                                                           ngraph::Shape(shape), blobs.back().data) :
                    std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
                                                           ngraph::Shape(shape), std::vector<float>(numChannels, 0).data());
        auto scale_node = std::make_shared<ngraph::op::v1::Multiply>(ieInpNode, weight, ngraph::op::AutoBroadcastType::NUMPY);
        auto scale_shift = std::make_shared<ngraph::op::v1::Add>(scale_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
        return Ptr<BackendNode>(new InfEngineNgraphNode(scale_shift));
    }
#endif  // HAVE_DNN_NGRAPH

    // Expose scale/shift blobs so the layer can be fused into convolutions.
    void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
    {
        scale = hasWeights ? blobs[0] : Mat();
        shift = hasBias ? blobs.back() : Mat();
    }

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        long flops = 0;
        // One multiply + one add per element.
        for(int i = 0; i < inputs.size(); i++)
        {
            flops += 2*total(inputs[i]);
        }
        return flops;
    }

private:
    bool hasWeights;  // true when blobs[0] holds multiplicative weights (set in finalize)
};
Ptr<ScaleLayer> ScaleLayer::create(const LayerParams& params)
{
    // Factory entry point for the public ScaleLayer interface.
    Ptr<ScaleLayer> layer(new ScaleLayerImpl(params));
    return layer;
}
Ptr<Layer> ShiftLayer::create(const LayerParams& params)
{
    // A Shift layer is a Scale layer with only a bias term, applied from axis 0.
    LayerParams lp;
    lp.name = params.name;
    lp.type = "Scale";
    lp.blobs = params.blobs;
    lp.set("axis", 0);
    lp.set("bias_term", true);
    Ptr<ScaleLayer> layer(new ScaleLayerImpl(lp));
    return layer;
}
} // namespace dnn
} // namespace cv

View File

@@ -0,0 +1,161 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/shuffle_channel.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
// ShuffleChannel: reorders channels by viewing C as (group, C/group), swapping
// the two factors, and flattening back. Implemented via an internal Permute
// layer over a reshaped 4D view.
class ShuffleChannelLayerImpl CV_FINAL : public ShuffleChannelLayer
{
public:
    ShuffleChannelLayerImpl(const LayerParams& params)
    {
        group = params.get<int>("group", 1);
        setParamsFrom(params);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1 && inputs[0].size() == 4);
        // The channel count must split evenly into the groups.
        CV_Assert(inputs[0][1] % group == 0);
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        // group == 1 is an identity transform, so it may run in-place.
        return group == 1;
    }

    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        if (group != 1)
        {
            std::vector<Mat> inputs, outputs;
            inputs_arr.getMatVector(inputs);
            outputs_arr.getMatVector(outputs);
            // Build a Permute layer that swaps axes 1 and 2 of the
            // (N, group, C/group, H*W) view of the input.
            LayerParams lp;
            float order[] = {0, 2, 1, 3};
            lp.set("order", DictValue::arrayInt(&order[0], 4));
            permute = PermuteLayer::create(lp);
            const Mat& inp = inputs[0];
            const Mat& out = outputs[0];
            permuteInpShape.resize(4);
            permuteInpShape[0] = inp.size[0];
            permuteInpShape[1] = group;
            permuteInpShape[2] = inp.size[1] / group;
            permuteInpShape[3] = inp.size[2]*inp.size[3]; // H and W flattened together
            permuteOutShape.resize(4);
            permuteOutShape[0] = permuteInpShape[0];
            permuteOutShape[1] = permuteInpShape[2];
            permuteOutShape[2] = permuteInpShape[1];
            permuteOutShape[3] = permuteInpShape[3];
            std::vector<Mat> permuteInputs(1, inp.reshape(1, permuteInpShape));
            std::vector<Mat> permuteOutputs(1, out.reshape(1, permuteOutShape));
            permute->finalize(permuteInputs, permuteOutputs);
        }
    }

#ifdef HAVE_OPENCL
    // OpenCL path: same reshape + permute scheme on UMats; plain copy when group == 1.
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);
        if (inputs[0].u != outputs[0].u)
        {
            if (!permute.empty())
            {
                inputs[0] = inputs[0].reshape(1, permuteInpShape.size(), &permuteInpShape[0]);
                outputs[0] = outputs[0].reshape(1, permuteOutShape.size(), &permuteOutShape[0]);
                permute->preferableTarget = preferableTarget;
                permute->forward(inputs, outputs, internals);
            }
            else
                inputs[0].copyTo(outputs[0]);
        }
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }
        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);
        Mat inp = inputs[0];
        Mat out = outputs[0];
        // In-place (same buffer) means group == 1: nothing to do.
        if (inp.data != out.data)
        {
            if (!permute.empty())
            {
                inp = inp.reshape(1, permuteInpShape);
                out = out.reshape(1, permuteOutShape);
                std::vector<Mat> permuteInputs(1, inp);
                std::vector<Mat> permuteOutputs(1, out);
                permute->forward(permuteInputs, permuteOutputs, internals);
            }
            else
                inp.copyTo(out);
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::ShuffleChannelOp>(preferableTarget, std::move(context->stream), group);
    }
#endif

private:
    Ptr<PermuteLayer> permute;                        // axis-swap worker, built in finalize() when group > 1
    std::vector<int> permuteInpShape, permuteOutShape; // 4D views fed to `permute`
};
Ptr<Layer> ShuffleChannelLayer::create(const LayerParams& params)
{
    // Factory entry point for the public ShuffleChannel interface.
    Ptr<Layer> layer(new ShuffleChannelLayerImpl(params));
    return layer;
}
} // namespace dnn
} // namespace cv

View File

@@ -0,0 +1,473 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/slice.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class SliceLayerImpl : public SliceLayer
{
public:
SliceLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
axis = params.get<int>("axis", 1);
num_split = params.get<int>("num_split", 0);
if (params.has("slice_point"))
{
CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
const DictValue &indicesValue = params.get("slice_point");
sliceRanges.resize(indicesValue.size() + 1,
std::vector<Range>(axis + 1, Range::all()));
int prevSlice = 0;
for (int i = 0; i < indicesValue.size(); ++i)
{
sliceRanges[i][axis].start = prevSlice;
sliceRanges[i][axis].end = indicesValue.get<int>(i);
prevSlice = sliceRanges[i][axis].end;
}
sliceRanges.back()[axis].start = prevSlice;
}
else if (params.has("begin"))
{
CV_Assert(params.has("size") ^ params.has("end"));
const DictValue &begins = params.get("begin");
const DictValue &sizesOrEnds = params.has("size") ? params.get("size") : params.get("end");
CV_Assert(begins.size() == sizesOrEnds.size());
sliceRanges.resize(1);
sliceRanges[0].resize(begins.size(), Range::all());
for (int i = 0; i < begins.size(); ++i)
{
int start = begins.get<int>(i);
int sizeOrEnd = sizesOrEnds.get<int>(i); // It may be negative to reverse indexation.
CV_Assert(start >= 0);
sliceRanges[0][i].start = start;
if (params.has("size"))
{
int size = sizeOrEnd;
CV_Assert(size == -1 || size > 0); // -1 value means range [start, axis_size).
sliceRanges[0][i].end = size > 0 ? (start + size) : -1; // We'll finalize a negative value later.
}
else
{
int end = sizeOrEnd;
CV_Assert(end < 0 || end > start); // End index is excluded.
sliceRanges[0][i].end = end; // We'll finalize a negative value later.
}
}
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) &&
#ifdef HAVE_INF_ENGINE
INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) &&
#endif
sliceRanges.size() == 1 && sliceRanges[0].size() == 4);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
MatShape inpShape = inputs[0];
if (!sliceRanges.empty())
{
outputs.resize(sliceRanges.size(), inpShape);
for (int i = 0; i < outputs.size(); ++i)
{
CV_Assert(sliceRanges[i].size() <= inpShape.size());
for (int j = 0; j < sliceRanges[i].size(); ++j)
{
outputs[i][j] = clamp(sliceRanges[i][j], inpShape[j]).size();
}
}
}
else // Divide input blob on equal parts by axis.
{
CV_Assert(0 <= axis && axis < inpShape.size());
int splits = num_split ? num_split : requiredOutputs;
CV_Assert(splits > 0 && inpShape[axis] % splits == 0);
inpShape[axis] /= splits;
outputs.resize(splits, inpShape);
}
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(inputs.size() == 1);
const MatSize& inpShape = inputs[0].size;
if (sliceRanges.empty())
{
// Divide input blob on equal parts by axis.
int outAxisSize = inpShape[axis] / outputs.size();
sliceRanges.resize(outputs.size(),
std::vector<Range>(axis + 1, Range::all()));
int prevSlice = 0;
for (int i = 0; i < outputs.size(); ++i)
{
sliceRanges[i][axis].start = prevSlice;
sliceRanges[i][axis].end = sliceRanges[i][axis].start + outAxisSize;
prevSlice = sliceRanges[i][axis].end;
}
}
else
CV_Assert(outputs.size() == sliceRanges.size());
for (int i = 0; i < outputs.size(); ++i)
{
CV_Assert(sliceRanges[i].size() <= inpShape.dims());
// Fill the rest of ranges.
for (int j = sliceRanges[i].size(); j < inpShape.dims(); ++j)
{
sliceRanges[i].push_back(Range::all());
}
// Clamp.
for (int j = 0; j < sliceRanges[i].size(); ++j)
{
sliceRanges[i][j] = clamp(sliceRanges[i][j], inpShape[j]);
}
}
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
if (inputs[0].dims < 4 || (total(shape(outputs[0]), 0, 2) % 4 != 0) ||
(total(shape(outputs[0]), 2) % 4 != 0))
return false;
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
const UMat& inpMat = inputs[0];
for (size_t i = 0; i < outputs.size(); i++)
{
int groups = outputs[i].size[0];
int channels = outputs[i].size[1];
int rows = outputs[i].size[2];
int cols = outputs[i].size[3];
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
size_t local[] = { 128 };
size_t global[] = { (size_t)groups * channels / 4 * local[0] };
int idx = 0;
kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inpMat));
kernel.set(idx++, (int)(inpMat.size[2] * inpMat.size[3]));
kernel.set(idx++, (int)(rows * cols));
kernel.set(idx++, (int)inpMat.size[3]);
kernel.set(idx++, (int)cols);
kernel.set(idx++, (int)sliceRanges[i][2].start);
kernel.set(idx++, (int)sliceRanges[i][3].start);
kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i]));
bool ret = kernel.run(1, global, local, false);
if (!ret)
return false;
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
const Mat& inpMat = inputs[0];
CV_Assert(outputs.size() == sliceRanges.size());
for (size_t i = 0; i < outputs.size(); i++)
{
inpMat(sliceRanges[i]).copyTo(outputs[i]);
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
std::vector<std::vector<std::size_t>> offsets;
for (const auto& ranges : sliceRanges)
{
std::vector<std::size_t> offsets_i;
for (const auto& range : ranges)
offsets_i.push_back(range.start);
offsets.push_back(std::move(offsets_i));
}
return make_cuda_node<cuda4dnn::SliceOp>(preferableTarget, std::move(context->stream), std::move(offsets));
}
#endif
#ifdef HAVE_INF_ENGINE
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
    // Map this slice onto an Inference Engine "Crop" layer (NN Builder API).
    // Only a single output slice is supported here, and at most two inputs
    // (data plus an optional reference blob that provides the target shape).
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        CV_Assert_N(sliceRanges.size() == 1, inputs.size() <= 2);
        std::vector<size_t> axes, offsets, dims;
        int from, to, step;
        int numDims = sliceRanges[0].size();
        // The order in which axes are listed differs by target: MYRIAD walks
        // them in ascending order from `axis`, other targets in descending
        // order down to `axis`.  NOTE(review): presumably a plugin-specific
        // requirement of the Crop primitive — confirm against the IE plugins.
        if (preferableTarget == DNN_TARGET_MYRIAD)
        {
            from = axis;
            to = numDims;
            step = 1;
        }
        else
        {
            from = numDims - 1;
            to = axis - 1;
            step = -1;
        }
        // For every sliced axis record its index, the crop start offset and
        // the cropped extent (Range::size() == end - start).
        for (int i = from; i != to; i += step)
        {
            axes.push_back(i);
            offsets.push_back(sliceRanges[0][i].start);
            dims.push_back(sliceRanges[0][i].size());
        }
        InferenceEngine::Builder::Layer ieLayer(name);
        ieLayer.setName(name);
        ieLayer.setType("Crop");
        ieLayer.getParameters()["axis"] = axes;
        ieLayer.getParameters()["dim"] = dims;
        ieLayer.getParameters()["offset"] = offsets;
        // Crop always declares two input ports; the second carries the shape
        // reference even when the network provides only one real input.
        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(2));
        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
        if (inputs.size() != 2)
        {
            // No reference blob was connected: synthesize a constant blob of
            // the output shape so the second Crop port is still satisfied.
            std::vector<size_t> outShape(numDims);
            for (int i = 0; i < numDims; ++i)
                outShape[i] = sliceRanges[0][i].size();

            ieLayer.getInputPorts()[1].setParameter("type", "weights");

            auto shapeSource = InferenceEngine::make_shared_blob<float>({
                                    InferenceEngine::Precision::FP32, outShape,
                                    InferenceEngine::Layout::ANY
                               });
            shapeSource->allocate();
            addConstantData("weights", shapeSource, ieLayer);
        }
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif
#endif
#ifdef HAVE_DNN_NGRAPH
    // Express the slice as an nGraph StridedSlice with unit strides:
    // lower bounds come from Range::start, upper bounds from Range::end.
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert_N(nodes.size() <= 2);
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        CV_Assert(sliceRanges[0].size() == ieInpNode->get_shape().size());

        const size_t numAxes = sliceRanges[0].size();
        std::vector<int64_t> begins, ends;
        begins.reserve(numAxes);
        ends.reserve(numAxes);
        for (size_t i = 0; i < numAxes; ++i)
        {
            begins.push_back(sliceRanges[0][i].start);
            ends.push_back(sliceRanges[0][i].end);
        }

        auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
                            ngraph::Shape{begins.size()}, begins.data());
        auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
                            ngraph::Shape{ends.size()}, ends.data());
        auto strides = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
                       ngraph::Shape{ends.size()}, std::vector<int64_t>(numAxes, 1));

        // Empty begin/end masks: every bound is taken literally.
        auto slice = std::make_shared<ngraph::op::v1::StridedSlice>(ieInpNode,
                     lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});

        return Ptr<BackendNode>(new InfEngineNgraphNode(slice));
    }
#endif  // HAVE_DNN_NGRAPH
};
// Caffe-style Crop layer implemented on top of Slice: the first input is
// cropped so that, from `axis` onwards, its dimensions match those of the
// second (reference) input. Optional per-axis offsets choose where the
// crop window starts inside the data blob.
class CropLayerImpl CV_FINAL : public SliceLayerImpl
{
public:
    CropLayerImpl(const LayerParams& params) : SliceLayerImpl(LayerParams())
    {
        setParamsFrom(params);
        axis = params.get<int>("axis", 2);

        // "offset" may hold one value (applied to all cropped axes) or one
        // value per cropped axis; validation happens in finalize().
        const DictValue *paramOffset = params.ptr("offset");
        if (paramOffset)
        {
            const int numOffsets = paramOffset->size();
            for (int i = 0; i < numOffsets; i++)
                offset.push_back(paramOffset->get<int>(i));
        }
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 2);

        // Keep the leading dims of the data input; copy the trailing dims
        // (axis and beyond) from the reference input.
        MatShape outShape = inputs[0];
        const int firstAxis = clamp(axis, outShape);
        for (int d = firstAxis; d < outShape.size(); d++)
            outShape[d] = inputs[1][d];

        outputs.resize(1, outShape);
        return false;
    }

    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);
        CV_Assert(2 == inputs.size());

        const Mat &srcBlob = inputs[0];
        const Mat &refBlob = inputs[1];
        const int numDims = srcBlob.dims;
        const int startAxis = clamp(axis, numDims);

        // Expand the user-supplied offsets to one start coordinate per axis.
        std::vector<int> startCoords(numDims, 0);
        if (offset.size() == 1)
        {
            // A single offset applies uniformly to every cropped axis.
            for (int d = startAxis; d < numDims; d++)
                startCoords[d] = offset[0];
        }
        else if (offset.size() > 1)
        {
            if ((int)offset.size() != numDims - startAxis)
                CV_Error(Error::StsBadArg, "number of offset values specified must be "
                                           "equal to the number of dimensions following axis.");

            for (int d = startAxis; d < numDims; d++)
                startCoords[d] = offset[d - startAxis];
        }

        // Translate the crop into per-axis ranges consumed by SliceLayerImpl.
        sliceRanges.resize(1);
        sliceRanges[0].resize(numDims);
        for (int d = 0; d < numDims; d++)
        {
            if (d < startAxis)
            {
                // Leading axes are kept whole.
                sliceRanges[0][d] = Range(0, srcBlob.size[d]);
                continue;
            }
            // Cropped axes take the reference blob's extent, shifted by the offset.
            if (startCoords[d] < 0 || startCoords[d] + refBlob.size[d] > srcBlob.size[d])
                CV_Error(Error::StsBadArg, "invalid crop parameters or blob sizes");
            sliceRanges[0][d] = Range(startCoords[d], startCoords[d] + refBlob.size[d]);
        }
    }

private:
    std::vector<int> offset;  // raw "offset" values from LayerParams
};
// Factory: build the default Slice layer implementation.
Ptr<SliceLayer> SliceLayer::create(const LayerParams& params)
{
    Ptr<SliceLayer> layer(new SliceLayerImpl(params));
    return layer;
}
// Factory: build the Crop layer (a Slice specialization driven by a reference input).
Ptr<Layer> CropLayer::create(const LayerParams& params)
{
    Ptr<Layer> layer(new CropLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,396 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#include <algorithm>
#include <stdlib.h>
using std::max;
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/softmax.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Softmax (optionally log-softmax) over a chosen axis.
// The input is viewed as [outer, channels, inner] around `axisRaw`; the
// normalization runs over `channels` independently for each (outer, inner)
// position. Fix over the previous revision: per-slice element offsets in the
// CPU path were computed as `int` from size_t operands, which narrows and can
// overflow for tensors with more than INT_MAX elements — they are size_t now.
class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
{
public:

    SoftMaxLayerImpl(const LayerParams& params)
    {
        axisRaw = params.get<int>("axis", 1);             // axis to normalize over (default: channels)
        logSoftMax = params.get<bool>("log_softmax", false);
        setParamsFrom(params);
    }

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNSoftmax<float> > softmaxOp;  // cached OpenCL primitive, built lazily in forward_ocl
#endif

    // Output shape equals the input shape; one internal buffer with the
    // softmax axis collapsed to 1 holds the per-slice max/sum values.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        MatShape shape = inputs[0];
        int cAxis = clamp(axisRaw, shape.size());
        shape[cAxis] = 1;
        internals.assign(1, shape);
        return inplace;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        // Halide only supports the canonical channel axis; the IE backends
        // have no log-softmax primitive here.
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && axisRaw == 1) ||
               ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine() && !logSoftMax) ||
               (backendId == DNN_BACKEND_VKCOM && haveVulkan());
    }

#ifdef HAVE_OPENCL
    virtual void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) CV_OVERRIDE
    {
        // Shapes may have changed; force the primitive to be rebuilt.
        softmaxOp.release();
    }

    // OpenCL path: try the ocl4dnn primitive first, then fall back to the
    // generic four-kernel pipeline (max, subtract, sum, divide/log).
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

        bool use_half = (inputs_.depth() == CV_16S);
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);
        internals_.getUMatVector(internals);

        UMat& src = inputs[0];
        UMat& dstMat = outputs[0];
        int axis = clamp(axisRaw, src.dims);

        if (softmaxOp.empty())
        {
            OCL4DNNSoftmaxConfig config;

            config.in_shape = shape(inputs[0]);
            config.axis = axis;
            config.channels = inputs[0].size[axis];
            config.logsoftmax = logSoftMax;
            config.use_half = use_half;

            softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
        }

        if (softmaxOp->Forward(src, dstMat))
            return true;

        UMat& bufMat = internals[0];
        MatShape s = shape(src);
        size_t outerSize = total(s, 0, axis);
        size_t channels = src.size[axis];
        size_t innerSize = total(s, axis + 1);

        String buildOpts = format("-DT=%s", use_half ? "half" : "float");
        ocl::Kernel kmax, ksub, ksum, kdiv;

        if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
            return false;

        if (!ksub.create("kernel_channel_subtract", ocl::dnn::softmax_oclsrc, buildOpts))
            return false;

        if (!ksum.create("kernel_channel_sum", ocl::dnn::softmax_oclsrc, buildOpts))
            return false;

        // Only the final kernel needs the LOG_SOFTMAX specialization.
        if (logSoftMax) buildOpts += " -DLOG_SOFTMAX ";
        if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
            return false;

        size_t bufSize = internals[0].total();
        size_t totalSize = src.total();

        size_t internal_globalSize[1] = { bufSize };
        size_t total_globalSize[1] = { totalSize };

        kmax.args((int)outerSize, (int)channels, (int)innerSize,
                  ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat));
        if (!kmax.run(1, internal_globalSize, NULL, false))
            return false;

        ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
                  ocl::KernelArg::PtrReadOnly(bufMat),
                  ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat));
        if (!ksub.run(1, total_globalSize, NULL, false))
            return false;

        ksum.args((int)outerSize, (int)channels, (int)innerSize,
                  ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
        if (!ksum.run(1, internal_globalSize, NULL, false))
            return false;

        kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
                  ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
        if (!kdiv.run(1, total_globalSize, NULL, false))
            return false;
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        // FP16 tensors are handled by converting through the generic fallback.
        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        const Mat &src = inputs[0];
        Mat &dst = outputs[0];

        int axis = clamp(axisRaw, src.dims);
        size_t outerSize = src.total(0, axis), channels = src.size[axis],
               innerSize = src.total(axis + 1);

        CV_Assert(src.type() == CV_32F);
        CV_Assert(src.isContinuous() && dst.isContinuous());

        const float *srcPtr = src.ptr<float>();
        float *dstPtr = dst.ptr<float>();
        float *bufPtr = internals[0].ptr<float>();

        size_t outerStep = src.total(axis);       // elements per outer slice
        size_t cnStep = src.total(axis + 1);      // elements per channel within a slice

        // Compute max along axis (numerical stability: softmax is shift-invariant).
        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
        {
            size_t srcOffset = outerDim * outerStep;
            size_t bufOffset = outerDim * cnStep;

            memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float));

            for (size_t cnDim = 1; cnDim < channels; cnDim++)
            {
                for (size_t i = 0; i < innerSize; i++)
                    bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]);
            }
        }

        // Subtract the per-slice max.
        // NOTE: offsets are size_t — previously narrowed to int, overflowing
        // for tensors with more than INT_MAX elements.
        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
        {
            size_t srcOffset = outerDim * outerStep;
            size_t bufOffset = outerDim * cnStep;

            for (size_t cnDim = 0; cnDim < channels; cnDim++)
            {
                const size_t offset = srcOffset + cnDim * cnStep;
                for (size_t i = 0; i < innerSize; i++)
                    dstPtr[offset + i] = srcPtr[offset + i] - bufPtr[bufOffset + i];
            }
        }

        cv::exp(dst, dst);

        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
        {
            size_t srcOffset = outerDim * outerStep;
            size_t bufOffset = outerDim * cnStep;

            // Sum exp along axis.
            for (size_t i = 0; i < innerSize; i++)
                bufPtr[bufOffset + i] = 0.f;

            for (size_t cnDim = 0; cnDim < channels; cnDim++)
            {
                const size_t offset = srcOffset + cnDim * cnStep;
                for (size_t i = 0; i < innerSize; i++)
                    bufPtr[bufOffset + i] += dstPtr[offset + i];
            }

            // Divide by the computed sum.
            for (size_t cnDim = 0; cnDim < channels; cnDim++)
            {
                const size_t offset = srcOffset + cnDim * cnStep;
                for (size_t i = 0; i < innerSize; i++)
                    dstPtr[offset + i] /= bufPtr[bufOffset + i];
            }

            if (logSoftMax)
            {
                for (size_t cnDim = 0; cnDim < channels; cnDim++)
                {
                    const size_t offset = srcOffset + cnDim * cnStep;
                    for (size_t i = 0; i < innerSize; i++)
                        dstPtr[offset + i] = log(dstPtr[offset + i]);
                }
            }
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
        auto channel_axis = clamp(axisRaw, input_wrapper->getRank());
        return make_cuda_node<cuda4dnn::SoftmaxOp>(preferableTarget, std::move(context->cudnn_handle), channel_axis, logSoftMax);
    }
#endif

    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_VULKAN
        vkcom::Tensor in = VkComTensor(inputs[0]);
        int cAxis = clamp(axisRaw, in.dimNum());
        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpSoftmax(cAxis, logSoftMax));
        return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
#endif  // HAVE_VULKAN
        return Ptr<BackendNode>();
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        // Only 1x1 spatial inputs are supported (i.e. fully-connected heads).
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        int inW, inH, inC, inN;
        getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);

        if (inW != 1 || inH != 1)
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for SoftMax with spatial size "
                     "more than 1x1 is not implemented");

        // top(x, y, c, n) = exp(input(x, y, c, n)) / sum(exp(input(x, y, c, n)))
        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Func expInput("expInput");
        Halide::RDom r(0, inW, 0, inH, 0, inC);
        expInput(x, y, c, n) = exp(inputBuffer(x, y, c, n));
        Halide::Expr globalSum = sum(expInput(r.x, r.y, r.z, n));
        top(x, y, c, n) = expInput(x, y, c, n) / globalSum;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

        InferenceEngine::Builder::SoftMaxLayer ieLayer(name);
        ieLayer.setAxis(clamp(axisRaw, input->getDims().size()));

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
    }
#endif  // HAVE_INF_ENGINE

#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        int axis = clamp(axisRaw, ieInpNode->get_shape().size());
        auto softmax = std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis);
        return Ptr<BackendNode>(new InfEngineNgraphNode(softmax));
    }
#endif  // HAVE_DNN_NGRAPH

    // Rough cost model: max, subtract, exp+sum, divide — four passes per element.
    int64 getFLOPS(const std::vector<MatShape> &inputs,
                   const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        int64 flops = 0;

        for (size_t i = 0; i < inputs.size(); i++)
        {
            flops += 4*total(inputs[i]);
        }

        return flops;
    }

    int axisRaw;  // raw (possibly negative) axis from LayerParams
};
// Factory: build the default Softmax layer implementation.
Ptr<SoftmaxLayer> SoftmaxLayer::create(const LayerParams& params)
{
    Ptr<SoftmaxLayer> layer(new SoftMaxLayerImpl(params));
    return layer;
}
}
}

View File

@@ -0,0 +1,128 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include "layers_common.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/split.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
// Split layer: duplicates its single input into every connected output blob.
class SplitLayerImpl CV_FINAL : public SplitLayer
{
public:
    SplitLayerImpl(const LayerParams &params)
    {
        setParamsFrom(params);

        // "top_count" pins the number of outputs explicitly; without it the
        // count is derived later from the number of requested outputs.
        //TODO: maybe "top_count" param is useless because it can be determined by output connections number
        outputsCount = -1;
        if (params.has("top_count"))
        {
            outputsCount = params.get<int>("top_count");
            CV_Assert(outputsCount >= 0);
        }
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1);

        // Every output mirrors the input shape; at least one output is produced.
        const int numOutputs = (outputsCount >= 0) ? outputsCount : requiredOutputs;
        Layer::getMemoryShapes(inputs, std::max(1, numOutputs), outputs, internals);
        return false;
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        // Copy the single input into every output.
        const Mat& src = inputs[0];
        for (Mat& dst : outputs)
        {
            CV_Assert(src.total() == dst.total());
            src.copyTo(dst);
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::SplitOp>(preferableTarget, std::move(context->stream));
    }
#endif
};
// Factory: build the default Split layer implementation.
Ptr<SplitLayer> SplitLayer::create(const LayerParams& params)
{
    Ptr<SplitLayer> layer(new SplitLayerImpl(params));
    return layer;
}
}
}